From 253e9719863b67a8f7a3265c54029f181ec76901 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 30 Oct 2024 13:20:26 -0400
Subject: [PATCH 001/134] Merging and batch processing improvements

---
 python/sv_merger.py | 69 ++++++++++++++++++++++++++++++++-------------
 src/contextsv.cpp   |  9 +++---
 src/sv_caller.cpp   | 42 ++++++++++++++++-----------
 src/sv_data.cpp     | 28 ++++++++++++++----
 4 files changed, 101 insertions(+), 47 deletions(-)

diff --git a/python/sv_merger.py b/python/sv_merger.py
index 2d0027b1..56c0ae26 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -89,6 +89,12 @@ def update_support(record, cluster_size):
 
     return record
 
+def weighted_score(read_support, hmm_score, sv_len, weight_hmm, weight_sv_len):
+    """
+    Calculate a weighted score from read support and HMM score (sv_len and weight_sv_len are accepted but currently unused).
+    """
+    return (1 - weight_hmm) * read_support + weight_hmm * hmm_score
+
 def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
     """
     Cluster SV breakpoints using HDBSCAN.
@@ -131,22 +137,24 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
     cluster_labels = []
 
     # dbscan = DBSCAN(eps=30000, min_samples=3)
-    dbscan = HDBSCAN(min_cluster_size=cluster_size_min, min_samples=3)
+    logging.info("Clustering %d SV breakpoints with parameters: min_cluster_size=%d", len(breakpoints), cluster_size_min)
+    dbscan = HDBSCAN(min_cluster_size=cluster_size_min, min_samples=2)
     if len(breakpoints) > 0:
         logging.info("Clustering %d SV breakpoints...", len(breakpoints))
         cluster_labels = dbscan.fit_predict(breakpoints)
 
         logging.info("Label counts: %d", len(np.unique(cluster_labels)))
 
-    # Set all 0 values to NaN
-    hmm_scores[hmm_scores == 0] = np.nan
-
     # Merge SVs with the same label
     unique_labels = np.unique(cluster_labels)
+    logging.info("Unique labels: %s", unique_labels)
+
     for label in unique_labels:
 
         # Skip label -1 (outliers)
-        if label == -1:
+        # if label == -1:
+        # Skip label -1 (outliers) only if there are no other clusters
+        if label == -1 and len(unique_labels) > 1:
             # # Print the positions if any are within a certain range
             # pos_min = 180915940
             # pos_max = 180950356
@@ -171,9 +179,10 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
         idx = cluster_labels == label
 
         # Get HMM and read support values for the cluster
-        max_score_idx = 0  # Default to the first SV in the cluster
+        # max_score_idx = 0  # Default to the first SV in the cluster
         cluster_hmm_scores = np.array(hmm_scores[idx])
         cluster_depth_scores = np.array(sv_support[idx])
+        cluster_sv_lengths = np.array(breakpoints[idx][:, 1] - breakpoints[idx][:, 0] + 1)
         max_hmm = None
         max_support = None
         max_hmm_idx = None
@@ -189,20 +198,40 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
             max_support_idx = np.argmax(cluster_depth_scores)
             max_support = cluster_depth_scores[max_support_idx]
 
-        # For deletions, choose the SV with the highest HMM score if available
-        if sv_type == 'DEL':
-            if max_hmm is not None:
-                max_score_idx = max_hmm_idx
-            elif max_support is not None:
-                max_score_idx = max_support_idx
-
-        # For insertions and duplications, choose the SV with the highest read
-        # support if available
-        elif sv_type == 'INS/DUP':
-            if max_support is not None:
-                max_score_idx = max_support_idx
-            elif max_hmm is not None:
-                max_score_idx = max_hmm_idx
+        # Use a weighted approach to choose the best SV based on HMM and
+        # support. NOTE: a single fixed HMM weight is used for all SV types;
+        # the type-specific weighting (deletions favor HMM scores, insertions
+        # and duplications favor read support) is commented out below.
+        # hmm_weight = 0.7 if sv_type == 'DEL' else 0.3
+        hmm_weight = 0.4
+        sv_len_weight = 0.4
+        max_score_idx = 0  # Default to the first SV in the cluster
+        max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], cluster_sv_lengths[max_score_idx], hmm_weight, sv_len_weight)
+        for k, hmm_loglh in enumerate(cluster_hmm_scores):
+            sv_len = cluster_sv_lengths[k] / 1000  # Normalize SV length to kilobases
+            read_support = cluster_depth_scores[k]
+            score = weighted_score(read_support, hmm_loglh, sv_len, hmm_weight, sv_len_weight)
+            if score > max_score:
+                max_score = score
+                max_score_idx = k
+
+        # Get the VCF record with the highest weighted score
+        max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :]
+
+        # # For deletions, choose the SV with the highest HMM score if available
+        # if sv_type == 'DEL':
+        #     if max_hmm is not None:
+        #         max_score_idx = max_hmm_idx
+        #     elif max_support is not None:
+        #         max_score_idx = max_support_idx
+
+        # # For insertions and duplications, choose the SV with the highest read
+        # # support if available
+        # elif sv_type == 'INS/DUP':
+        #     if max_support is not None:
+        #         max_score_idx = max_support_idx
+        #     elif max_hmm is not None:
+        #         max_score_idx = max_hmm_idx
 
         # Get the VCF record with the highest depth score
         max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :]
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 47d68054..0a502881 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -32,7 +32,7 @@ int ContextSV::run()
     SVData sv_calls = sv_caller.run();
 
     // Print the total number of SVs called
-    std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl;
+    // std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl;
 
     // Write SV calls to file
     std::string output_dir = this->input_data->getOutputDir();
@@ -40,9 +40,10 @@ int ContextSV::run()
     sv_calls.saveToVCF(ref_genome, output_dir);
 
     // Format and print the time taken to call SVs
-    auto end_sv = std::chrono::high_resolution_clock::now();
-    std::string elapsed_time = getElapsedTime(start_sv, end_sv);
-    std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. Time taken (h:m:s) = " << elapsed_time << std::endl;
+    // auto end_sv = std::chrono::high_resolution_clock::now();
+    // std::string elapsed_time = getElapsedTime(start_sv, end_sv);
+    std::cout << "SV calling complete." << std::endl;
+    // std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. Time taken (h:m:s) = " << elapsed_time << std::endl;
 
     return 0;
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 179a5360..76011de9 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -411,12 +411,13 @@ SVData SVCaller::run()
     }
     int chr_count = chromosomes.size();
 
-    // Loop through each region and detect SVs
+    // Loop through each region and detect SVs (Note: The main loop is
+    // single-threaded)
     std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
+    int chunk_count = 100;  // Number of chunks to split the chromosome into
     int region_count = 0;
     auto start1 = std::chrono::high_resolution_clock::now();
     SVData sv_calls;
-    int chunk_count = 10000;  // Number of chunks to split the chromosome into
     int min_cnv_length = this->input_data->getMinCNVLength();
     for (const auto& chr : chromosomes) {
         std::cout << "Running SV detection for chromosome " << chr << "..." << std::endl;
@@ -433,10 +434,14 @@ SVData SVCaller::run()
             // Use one chunk for the region
             std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
             region_chunks.push_back(chunk);
+            std::cout << "Using specified region " << chunk << "..." << std::endl;
             
         } else {
             int chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
-            int chunk_size = chr_len / chunk_count;
+            std::cout << "Chromosome length: " << chr_len << std::endl;
+            std::cout << "Chunk count: " << chunk_count << std::endl;
+            int chunk_size = std::ceil((double)chr_len / chunk_count);
+            std::cout << "Chunk size: " << chunk_size << std::endl;
             for (int i = 0; i < chunk_count; i++) {
                 int start = i * chunk_size + 1;  // 1-based
                 int end = start + chunk_size;
@@ -446,6 +451,7 @@ SVData SVCaller::run()
                 std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end);
                 region_chunks.push_back(chunk);
             }
+            std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks." << std::endl;
         }
 
         // Load chromosome data for copy number predictions
@@ -494,8 +500,10 @@ SVData SVCaller::run()
         // std::cout << "Extracted aligments for " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl;
     }
 
-    auto end1 = std::chrono::high_resolution_clock::now();
-    std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl;
+    // auto end1 = std::chrono::high_resolution_clock::now();
+    std::cout << "SV calling completed." << std::endl;
+    // int total_sv_calls = sv_calls.totalCalls();
+    // std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl;
 
     return sv_calls;
 }
@@ -582,12 +590,12 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
 
                 // Use the gap ends as the SV endpoints
-                if (primary_start - supp_end >= min_cnv_length) {
-                    SVCandidate sv_candidate(supp_end+1, primary_start+1, ".");
-                    std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPINNER_A");
-                    sv_list.push_back(sv_pair);
-                    sv_count++;
-                }
+                // if (primary_start - supp_end >= min_cnv_length) {
+                //     SVCandidate sv_candidate(supp_end+1, primary_start+1, ".");
+                //     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPINNER_A");
+                //     sv_list.push_back(sv_pair);
+                //     sv_count++;
+                // }
 
                 // Also use the alignment ends as the SV endpoints
                 if (primary_end - supp_start >= min_cnv_length) {
@@ -608,12 +616,12 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
 
                 // Use the gap ends as the SV endpoints
-                if (supp_start - primary_end >= min_cnv_length) {
-                    SVCandidate sv_candidate(primary_end+1, supp_start+1, ".");
-                    std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPINNER_B");
-                    sv_list.push_back(sv_pair);
-                    sv_count++;
-                }
+                // if (supp_start - primary_end >= min_cnv_length) {
+                //     SVCandidate sv_candidate(primary_end+1, supp_start+1, ".");
+                //     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPINNER_B");
+                //     sv_list.push_back(sv_pair);
+                //     sv_count++;
+                // }
 
                 // Also use the alignment ends as the SV endpoints
                 if (supp_end - primary_start >= min_cnv_length) {
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 96e5a2fd..37ff32c3 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -133,12 +133,31 @@ int SVData::getClippedBaseSupport(std::string chr, int64_t pos, int64_t end)
 void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
 {
     // Create a VCF writer
+    std::cout << "Creating VCF writer..." << std::endl;
     std::string output_vcf = output_dir + "/output.vcf";
     VcfWriter vcf_writer(output_vcf);
+    std::cout << "Writing VCF file to " << output_vcf << std::endl;
 
     // Set the sample name
     std::string sample_name = "SAMPLE";
 
+    std::cout << "Getting reference genome filepath..." << std::endl;
+    try {
+        std::string ref_fp = ref_genome.getFilepath();
+        std::cout << "Reference genome filepath: " << ref_fp << std::endl;
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return;
+    }
+
+    std::cout << "Getting reference genome header..." << std::endl;
+    try {
+        ref_genome.getContigHeader();
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return;
+    }
+
     // Set the header lines
     std::vector<std::string> header_lines = {
         std::string("##reference=") + ref_genome.getFilepath(),
@@ -159,6 +178,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     };
 
     // Write the header lines
+    std::cout << "Writing VCF header..." << std::endl;
     vcf_writer.writeHeader(header_lines);
 
     // Save the SV calls
@@ -251,12 +271,6 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                     alt_allele = "<DUP>";
 
                     // Set the repeat type as an interspersed duplication
-                    repeat_type = "INTERSPERSED";
-                } else if (sv_type == TANDUP) {
-                    // Use a symbolic allele for tandem duplications
-                    alt_allele = "<DUP>";
-
-                    // Set the repeat type
                     repeat_type = "TANDEM";
                 }
             }
@@ -301,10 +315,12 @@ std::set<std::string> SVData::getChromosomes()
 
 int SVData::totalCalls()
 {
+    std::cout << "Calculating total SV calls..." << std::endl;
     int sv_calls = 0;
     for (auto const& sv_call : this->sv_calls) {
         sv_calls += sv_call.second.size();
     }
+    std::cout << "Total SV calls: " << sv_calls << std::endl;
 
     return sv_calls;
 }

From e7211ed6ad84f2c0d31aa6666958a7638d489bf1 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 31 Oct 2024 14:48:57 -0400
Subject: [PATCH 002/134] Add inversion detection

---
 include/cnv_caller.h  |  6 ++---
 include/sv_caller.h   |  4 +--
 include/sv_types.h    |  4 +--
 src/cnv_caller.cpp    | 32 ++++++++++++++++-------
 src/sv_caller.cpp     | 59 ++++++++++++++++++++++++++++++++++---------
 src/sv_data.cpp       |  6 ++---
 tests/test_general.py |  2 +-
 7 files changed, 81 insertions(+), 32 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index c913c24b..7b52e05e 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -83,8 +83,8 @@ class CNVCaller {
             {0, sv_types::UNKNOWN},
             {1, sv_types::DEL},
             {2, sv_types::DEL},
-            {3, sv_types::UNKNOWN},
-            {4, sv_types::UNKNOWN},
+            {3, sv_types::NEUTRAL},
+            {4, sv_types::NEUTRAL},
             {5, sv_types::DUP},
             {6, sv_types::DUP}
         };
@@ -125,7 +125,7 @@ class CNVCaller {
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
 
-        void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector<std::pair<SVCandidate, std::string>>& sv_list, std::string chr);
+        void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector<std::pair<SVCandidate, std::string>>& sv_list, std::string chr, bool inversion);
 
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
diff --git a/include/sv_caller.h b/include/sv_caller.h
index c461d101..ed11d08f 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -17,8 +17,8 @@
 /// @endcond
 
 // SV candidate alignment data (chr, start, end, sequence, query start, query
-// end, mismatch map)
-using AlignmentData   = std::tuple<std::string, int64_t, int64_t, std::string, int32_t, int32_t, std::unordered_map<int, int>>;
+// end, mismatch map, strand)
+using AlignmentData   = std::tuple<std::string, int64_t, int64_t, std::string, int32_t, int32_t, std::unordered_map<int, int>, bool>;
 using AlignmentVector = std::vector<AlignmentData>;
 
 // Query map (query name, alignment vector)
diff --git a/include/sv_types.h b/include/sv_types.h
index 7e002777..af82ffec 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -16,11 +16,11 @@ namespace sv_types {
     static const int INV = 2;
     static const int INS = 3;
     static const int BND = 4;
-    static const int TANDUP = 5;  // Tandem duplication
+    static const int NEUTRAL = 5;  // Neutral copy number with unknown type
     static const int UNKNOWN = -1;
 
     // Define SVTypeString for SV types
-    static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "DUP"};
+    static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT"};
 
     // Create a struct for storing SV information
     struct SVInfo {
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index ca0588a2..255b1ba6 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -35,6 +35,10 @@ using namespace sv_types;
 std::pair<std::vector<int>, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp_data)
 {
     int data_count = (int) snp_data.pos.size();
+    if (data_count == 0)
+    {
+        throw std::runtime_error("Error: No SNP data found for Viterbi algorithm.");
+    }
     std::lock_guard<std::mutex> lock(this->hmm_mtx);  // Lock the mutex for the HMM
     std::pair<std::vector<int>, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb);
     return state_sequence;
@@ -47,12 +51,6 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
     bool snps_found = false;
     int window_size = this->input_data->getWindowSize();
 
-    // std::cout << "Querying SNPs for region " << chr << ":" << start_pos <<
-    // "-" << end_pos << "..." << std::endl;
-    // TEST
-    if (start_pos == 43593639 && end_pos == 43608172) {
-        printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
-    }
     // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
     for (int64_t i = start_pos; i <= end_pos; i += window_size)
     {
@@ -117,7 +115,7 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
     return std::make_pair(snp_data, snps_found);
 }
 
-void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<std::pair<SVCandidate, std::string>> &sv_list, std::string chr)
+void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<std::pair<SVCandidate, std::string>> &sv_list, std::string chr, bool inversion)
 {
     // Throw an error if there are more than two SV candidates
     if (sv_list.size() > 2) {
@@ -155,6 +153,21 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<
         aln_type += "_NOSNPS";
     }
 
+    // Update the SV type if inversion is detected and the best CNV type is
+    // copy neutral
+    if (inversion && (best_cnv_type == sv_types::NEUTRAL))
+    {
+        best_cnv_type = sv_types::INV;
+        printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
+    }
+
+    // Throw an error if the best SV candidate still has start and end
+    // positions of 0 (i.e., the dummy call was used)
+    if (std::get<0>(best_sv_candidate) == 0 && std::get<1>(best_sv_candidate) == 0)
+    {
+        throw std::runtime_error("Error: No valid SV type found for copy number prediction.");
+    }
+
     // Add the SV call to the main SV data
     sv_calls.add(chr, start_pos, end_pos, best_cnv_type, ".", aln_type, best_genotype, best_likelihood);
 }
@@ -204,7 +217,7 @@ std::tuple<int, double, int, std::string, bool> CNVCaller::runCopyNumberPredicti
 
         // Query the SNP region for the SV candidate
         std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov);
-        SNPData sv_snps = snp_call.first;
+        SNPData& sv_snps = snp_call.first;
         bool sv_snps_found = snp_call.second;
 
         // Run the Viterbi algorithm
@@ -264,7 +277,8 @@ std::tuple<int, double, int, std::string, bool> CNVCaller::runCopyNumberPredicti
     // Save the SV calls as a TSV file if enabled
     int64_t sv_start_pos = std::get<0>(best_pos);
     int64_t sv_end_pos = std::get<1>(best_pos);
-    if (this->input_data->getSaveCNVData() && predicted_cnv_type != sv_types::UNKNOWN && (sv_end_pos - sv_start_pos) > 10000)
+    bool copy_number_change = (predicted_cnv_type != sv_types::UNKNOWN && predicted_cnv_type != sv_types::NEUTRAL);
+    if (this->input_data->getSaveCNVData() && copy_number_change && (sv_end_pos - sv_start_pos) > 10000)
     {
         std::string cnv_type_str = SVTypeString[predicted_cnv_type];
         std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) sv_start_pos) + "-" + std::to_string((int) sv_end_pos) + "_SPLITALN.tsv";
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 76011de9..f800d1c3 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -89,6 +89,9 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
                 int64_t start = bam1->core.pos;
                 int64_t end = bam_endpos(bam1);  // This is the first position after the alignment
 
+                // Get the strand
+                bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
+
                 // Call SVs directly from the CIGAR string
                 std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
                 std::unordered_map<int, int> match_map = std::get<0>(query_info);
@@ -96,7 +99,7 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
                 int32_t query_end = std::get<2>(query_info);
 
                 // Add the primary alignment to the map
-                AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map);
+                AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand);
                 primary_alignments[qname] = std::move(alignment);
 
             // Process supplementary alignments
@@ -107,6 +110,9 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
                 int32_t start = bam1->core.pos;
                 int32_t end = bam_endpos(bam1);
 
+                // Get the strand
+                bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
+
                 // Get CIGAR string information, but don't call SVs
                 std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
                 const std::unordered_map<int, int>& match_map = std::get<0>(query_info);
@@ -114,7 +120,7 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
                 int32_t query_end = std::get<2>(query_info);
 
                 // Add the supplementary alignment to the map
-                AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map));
+                AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand);
                 supplementary_alignments[qname].emplace_back(alignment);
 
                 // If Read ID == 8873acc1-eb84-415d-8557-a32a8f52ccee, print the
@@ -438,10 +444,10 @@ SVData SVCaller::run()
             
         } else {
             int chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
-            std::cout << "Chromosome length: " << chr_len << std::endl;
-            std::cout << "Chunk count: " << chunk_count << std::endl;
+            // std::cout << "Chromosome length: " << chr_len << std::endl;
+            // std::cout << "Chunk count: " << chunk_count << std::endl;
             int chunk_size = std::ceil((double)chr_len / chunk_count);
-            std::cout << "Chunk size: " << chunk_size << std::endl;
+            // std::cout << "Chunk size: " << chunk_size << std::endl;
             for (int i = 0; i < chunk_count; i++) {
                 int start = i * chunk_size + 1;  // 1-based
                 int end = start + chunk_size;
@@ -451,7 +457,7 @@ SVData SVCaller::run()
                 std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end);
                 region_chunks.push_back(chunk);
             }
-            std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks." << std::endl;
+            std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks of size " << chunk_size << "..." << std::endl;
         }
 
         // Load chromosome data for copy number predictions
@@ -524,6 +530,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         int32_t primary_query_start = std::get<4>(primary_alignment);
         int32_t primary_query_end = std::get<5>(primary_alignment);
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
+        bool primary_strand = std::get<7>(primary_alignment);
 
         // Loop through the supplementary alignments and find gaps and overlaps
         AlignmentVector supp_alignments = supp_map[qname];
@@ -543,9 +550,9 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             int32_t supp_query_start = std::get<4>(supp_alignment);
             int32_t supp_query_end = std::get<5>(supp_alignment);
             std::unordered_map<int, int> supp_match_map = std::get<6>(supp_alignment);
+            bool supp_strand = std::get<7>(supp_alignment);
 
-            // Determine if there is overlap between the primary and
-            // supplementary query sequences
+            // Resolve overlaps between the primary and supplementary query sequences
             int32_t overlap_start = std::max(primary_query_start, supp_query_start);
             int32_t overlap_end = std::min(primary_query_end, supp_query_end);
             int32_t overlap_length = overlap_end - overlap_start;
@@ -582,8 +589,36 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 }
             }
 
-            // Gap analysis (deletion or duplication)
-            if (supp_start < primary_start && supp_end < primary_start) {
+            // [1] Inversion detection from primary and supplementary alignments
+            // on opposite strands
+            if (primary_strand != supp_strand) {
+                // std::cout << "Inversion detected for read " << qname << std::endl;
+                // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl;
+                // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl;
+
+                std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
+
+                // Use the supplementary alignment coordinates as the SV
+                // endpoints
+                if (supp_end - supp_start >= min_cnv_length) {
+                    SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
+                    std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "INVERSION");
+                    sv_list.push_back(sv_pair);
+                    sv_count++;
+                    // SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
+                    // std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "INVERSION");
+                    // sv_list.push_back(sv_pair);
+                    // sv_count++;
+                }
+
+                // Determine which SV to keep based on HMM prediction likelihood
+                if (sv_list.size() > 0) {
+                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, true);
+                }
+            }
+
+            // [2] CNV detection based on primary and supplementary alignment boundaries
+            else if (supp_start < primary_start && supp_end < primary_start) {
 
                 // Gap with supplementary before primary:
                 // [supp_start] [supp_end] -- [primary_start] [primary_end]
@@ -607,7 +642,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
                 // Determine which SV to keep based on HMM prediction likelihood
                 if (sv_list.size() > 0) {
-                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr);
+                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false);
                 }
                 
             } else if (supp_start > primary_end && supp_end > primary_end) {
@@ -633,7 +668,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
                 // Determine which SV to keep based on HMM prediction likelihood
                 if (sv_list.size() > 0) {
-                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr);
+                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false);
                 }
             }
         }
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 37ff32c3..2ade74c6 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -50,7 +50,7 @@ int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::s
         // For insertions and duplications, the SV length is the length of the
         // inserted sequence, not including the insertion position
         int sv_length = 0;
-        if (sv_type == INS || sv_type == DUP || sv_type == TANDUP) {
+        if (sv_type == INS || sv_type == DUP) {
             sv_length = end - start;
         } else {
             // For deletions, the SV length is the length of the deletion
@@ -216,7 +216,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             int64_t end = std::get<1>(candidate);
 
             // If the SV type is unknown, skip it
-            if (sv_type == UNKNOWN) {
+            if (sv_type == UNKNOWN || sv_type == NEUTRAL) {
                 skip_count += 1;
                 continue;
             }
@@ -247,7 +247,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                 pos = preceding_pos;
 
             // Duplications and insertions
-            } else if (sv_type == INS || sv_type == DUP || sv_type == TANDUP) {
+            } else if (sv_type == INS || sv_type == DUP) {
                 // Use the preceding base as the reference allele
                 int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
diff --git a/tests/test_general.py b/tests/test_general.py
index 499805c8..9083eb6a 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 21
+        assert len(f.readlines()) == 23
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.

From 71e375e36baefd2672a20743b9da3ac511528e0f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 09:13:32 -0400
Subject: [PATCH 003/134] Fix VCF writer error and add INVDUP

---
 .gitignore           |  3 +++
 include/sv_data.h    | 16 ++++++++--------
 include/sv_types.h   | 43 +++++++++++++++++++++++++++++++++----------
 include/vcf_writer.h |  4 +---
 src/cnv_caller.cpp   | 16 ++++++++++++----
 src/contextsv.cpp    |  2 ++
 src/sv_caller.cpp    | 18 ++++++++++++++++++
 src/sv_data.cpp      | 19 +++++++++++++------
 src/vcf_writer.cpp   | 20 +++++++++-----------
 9 files changed, 99 insertions(+), 42 deletions(-)

diff --git a/.gitignore b/.gitignore
index 50575520..c627421d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -84,3 +84,6 @@ data/hg19ToHg38.over.chain.gz
 # Test images
 python/dbscan_clustering*.png
 python/dist_plots
+
+# Temporary files
+lib/.nfs*
diff --git a/include/sv_data.h b/include/sv_data.h
index 414d2eda..44a45590 100644
--- a/include/sv_data.h
+++ b/include/sv_data.h
@@ -24,14 +24,14 @@ class SVData {
         std::map<std::pair<std::string, int64_t>, int> clipped_base_support;
 
         // SV type to string map for VCF output
-        std::map<int, std::string> sv_type_map = {
-            {0, "DEL"},
-            {1, "DUP"},
-            {2, "INV"},
-            {3, "INS"},
-            {4, "BND"},
-            {5, "DUP"}
-        };
+        // std::map<int, std::string> sv_type_map = {
+        //     {0, "DEL"},
+        //     {1, "DUP"},
+        //     {2, "INV"},
+        //     {3, "INS"},
+        //     {4, "BND"},
+        //     {5, "DUP"}
+        // };
         
     public:
         SVData() {};
diff --git a/include/sv_types.h b/include/sv_types.h
index af82ffec..2d4573bf 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -10,17 +10,40 @@
 /// @endcond
 
 namespace sv_types {
+
     // Define constants for SV types
-    static const int DEL = 0;
-    static const int DUP = 1;
-    static const int INV = 2;
-    static const int INS = 3;
-    static const int BND = 4;
-    static const int NEUTRAL = 5;  // Neutral copy number with unknown type
-    static const int UNKNOWN = -1;
-
-    // Define SVTypeString for SV types
-    static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT"};
+    enum SVType {
+        UNKNOWN = -1,
+        DEL = 0,
+        DUP = 1,
+        INV = 2,
+        INS = 3,
+        BND = 4,
+        NEUTRAL = 5,  // Neutral copy number with unknown type
+        INV_DUP = 6  // Inversion duplication
+    };
+
+    // Mapping of SV types to strings
+    const std::unordered_map<int, std::string> SVTypeString = {
+        {DEL, "DEL"},
+        {DUP, "DUP"},
+        {INV, "INV"},
+        {INS, "INS"},
+        {BND, "BND"},
+        {NEUTRAL, "NEUT"},
+        {INV_DUP, "INVDUP"}
+    };
+    // static const int UNKNOWN = -1;
+    // static const int DEL = 0;
+    // static const int DUP = 1;
+    // static const int INV = 2;
+    // static const int INS = 3;
+    // static const int BND = 4;
+    // static const int NEUTRAL = 5;  // Neutral copy number with unknown type
+    // static const int INV_DUP = 6;  // Inversion duplication
+
+    // // Define SVTypeString for SV types (for VCF output)
+    // static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT", "INVDUP"};
 
     // Create a struct for storing SV information
     struct SVInfo {
diff --git a/include/vcf_writer.h b/include/vcf_writer.h
index 800df144..e395ea37 100644
--- a/include/vcf_writer.h
+++ b/include/vcf_writer.h
@@ -8,6 +8,7 @@ class VcfWriter {
 public:
     // Constructor
     VcfWriter(const std::string& filename);
+    ~VcfWriter();
     void writeHeader(const std::vector<std::string>& headerLines);
     void writeRecord(const std::string& chrom, int pos, const std::string& id,
                      const std::string& ref, const std::string& alt,
@@ -15,9 +16,6 @@ class VcfWriter {
                      const std::string& info, const std::string& format,
                      const std::vector<std::string>& samples);
 
-    // Close the VCF file
-    void close();
-
 private:
     std::ofstream file_stream;
 };
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 255b1ba6..c47bda5e 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -154,11 +154,19 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<
     }
 
     // Update the SV type if inversion is detected and the best CNV type is
-    // copy neutral
-    if (inversion && (best_cnv_type == sv_types::NEUTRAL))
+    // copy neutral or duplication
+    if (inversion) // && (best_cnv_type == sv_types::NEUTRAL))
     {
-        best_cnv_type = sv_types::INV;
-        printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
+        if (best_cnv_type == sv_types::NEUTRAL)
+        {
+            best_cnv_type = sv_types::INV;
+        } else if (best_cnv_type == sv_types::DUP)
+        {
+            best_cnv_type = sv_types::INV_DUP;
+            printMessage("INVDUP detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
+        }
+        // best_cnv_type = sv_types::INV;
+        // printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
     }
 
     // If the dummy call was used, then throw an error if the best SV type
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 0a502881..5553dacc 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -43,6 +43,8 @@ int ContextSV::run()
     // auto end_sv = std::chrono::high_resolution_clock::now();
     // std::string elapsed_time = getElapsedTime(start_sv, end_sv);
     std::cout << "SV calling complete." << std::endl;
+    // int sv_count = sv_calls.totalCalls();
+    // std::cout << "Found " << sv_count << " total SVs." << std::endl;
     // std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. Time taken (h:m:s) = " << elapsed_time << std::endl;
 
     return 0;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index f800d1c3..c79f8ddb 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -589,6 +589,24 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 }
             }
 
+            // TODO:
+            // if (find_complex_events)
+            // # Calculate likelihood for entire coordinate
+            // likelihood_entire = hmm_model.predict_likelihood(entire_coordinate)
+
+            // # Split coordinates into smaller sections and calculate likelihoods
+            // subsections = split_coordinates(entire_coordinate)
+            // likelihoods_subsections = [hmm_model.predict_likelihood(sub) for sub in subsections]
+
+            // # Determine best likelihood from subsections
+            // best_likelihood_split = max(likelihoods_subsections)
+
+            // # Compare and decide
+            // if likelihood_entire > best_likelihood_split:
+            //     best_choice = "entire coordinate"
+            // else:
+            //     best_choice = "split coordinates"
+
             // [1] Inversion detection from primary and supplementary alignments
             // on opposite strands
             if (primary_strand != supp_strand) {
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 2ade74c6..3ea742f3 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -135,6 +135,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     // Create a VCF writer
     std::cout << "Creating VCF writer..." << std::endl;
     std::string output_vcf = output_dir + "/output.vcf";
+    std::cout << "Writing VCF file to " << output_vcf << std::endl;
     VcfWriter vcf_writer(output_vcf);
     std::cout << "Writing VCF file to " << output_vcf << std::endl;
 
@@ -184,8 +185,8 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     // Save the SV calls
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
     std::string sv_method = "CONTEXTSVv0.1";
-    int num_sv_calls = this->totalCalls();
     int skip_count = 0;
+    int total_count = 0;
     std::set<std::string> chrs = this->getChromosomes();
     for (auto const& chr : chrs) {
         if (this->sv_calls.find(chr) == this->sv_calls.end()) {
@@ -219,6 +220,8 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             if (sv_type == UNKNOWN || sv_type == NEUTRAL) {
                 skip_count += 1;
                 continue;
+            } else {
+                total_count += 1;
             }
 
             // Process by SV type
@@ -277,7 +280,8 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
 
             // Create the VCF parameter strings
             int clipped_base_support = this->getClippedBaseSupport(chr, pos, end);
-            std::string sv_type_str = this->sv_type_map[sv_type];
+            // std::string sv_type_str = this->sv_type_map[sv_type];
+            std::string sv_type_str = sv_types::SVTypeString[sv_type];
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
                 ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
@@ -293,10 +297,13 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     }
 
     // Print the number of SV calls skipped
-    std::cout << "Skipped " << skip_count << " of " << num_sv_calls << " SV calls because the SV type is unknown" << std::endl;
+    std::cout << "Finished writing VCF file." << std::endl;
+    // int num_sv_calls = this->totalCalls();
+    // std::cout << "Skipped " << skip_count << " of " << num_sv_calls << " SV calls because the SV type is unknown" << std::endl;
+    // std::cout << "Finished writing VCF file with " << num_sv_calls - skip_count << " SV calls" << std::endl;
 
     // Close the output stream
-    vcf_writer.close();
+    // vcf_writer.close();
 }
 
 std::map<SVCandidate, SVInfo>& SVData::getChromosomeSVs(std::string chr)
@@ -315,12 +322,12 @@ std::set<std::string> SVData::getChromosomes()
 
 int SVData::totalCalls()
 {
-    std::cout << "Calculating total SV calls..." << std::endl;
+    // std::cout << "Calculating total SV calls..." << std::endl;
     int sv_calls = 0;
     for (auto const& sv_call : this->sv_calls) {
         sv_calls += sv_call.second.size();
     }
-    std::cout << "Total SV calls: " << sv_calls << std::endl;
+    // std::cout << "Total SV calls: " << sv_calls << std::endl;
 
     return sv_calls;
 }
diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp
index eaf41ea5..8c93a36f 100644
--- a/src/vcf_writer.cpp
+++ b/src/vcf_writer.cpp
@@ -7,17 +7,21 @@
 
 VcfWriter::VcfWriter(const std::string &filename)
 {
-    // Remove the file if it already exists
-    std::remove(filename.c_str());
-
-    // Open the VCF file
-    this->file_stream.open(filename);
+    // Open the VCF file, overwrite if it already exists
+    this->file_stream.open(filename, std::ios::out);
     if (!this->file_stream.is_open()) {
         std::cerr << "Error: Unable to open " << filename << std::endl;
         exit(1);
     }
 }
 
+VcfWriter::~VcfWriter()
+{
+    if (this->file_stream.is_open()) {
+        this->file_stream.close();
+    }
+}
+
 void VcfWriter::writeHeader(const std::vector<std::string> &headerLines)
 {
     // Add the file format
@@ -55,9 +59,3 @@ void VcfWriter::writeRecord(const std::string &chrom, int pos, const std::string
     // Write a record to the VCF file
     this->file_stream << chrom << "\t" << pos << "\t" << id << "\t" << ref << "\t" << alt << "\t" << qual << "\t" << filter << "\t" << info << "\t" << format << "\t" << samples[0] << std::endl;
 }
-
-void VcfWriter::close()
-{
-    // Close the VCF file
-    this->file_stream.close();
-}

From e4df9d16fa854080d54981f289db4171a2f918c8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 13:02:13 -0400
Subject: [PATCH 004/134] Update SV types

---
 include/cnv_caller.h | 15 ++--------
 include/sv_data.h    |  4 +--
 include/sv_types.h   | 65 +++++++++++++++++++++++++++-----------------
 src/cnv_caller.cpp   | 47 ++++++++++++++------------------
 src/contextsv.cpp    | 29 ++++----------------
 src/sv_caller.cpp    | 20 ++------------
 src/sv_data.cpp      | 51 ++++++++++++----------------------
 7 files changed, 90 insertions(+), 141 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 7b52e05e..23a70640 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -78,17 +78,6 @@ class CNVCaller {
             {6, "1/1"}
         };
 
-        // Define a map of CNV types by HMM predicted state (0=No predicted state)
-        std ::map<int, int> cnv_type_map = {
-            {0, sv_types::UNKNOWN},
-            {1, sv_types::DEL},
-            {2, sv_types::DEL},
-            {3, sv_types::NEUTRAL},
-            {4, sv_types::NEUTRAL},
-            {5, sv_types::DUP},
-            {6, sv_types::DUP}
-        };
-
         void updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
         std::pair<std::vector<int>, double> runViterbi(CHMM hmm, SNPData &snp_data);
@@ -99,7 +88,7 @@ class CNVCaller {
         // Run copy number prediction for a chunk of SV candidates from CIGAR strings
         void runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, std::vector<SVCandidate> sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map);
 
-        void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
+        void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
 
         void updateDPValue(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int dp_value);
 
@@ -120,7 +109,7 @@ class CNVCaller {
 
         // Run copy number prediction for a pair of SV candidates, and add only
         // the SV candidate with the highest likelihood
-        std::tuple<int, double, int, std::string, bool> runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two);
+        std::tuple<int, double, SVType, std::string, bool> runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
diff --git a/include/sv_data.h b/include/sv_data.h
index 44a45590..548d6513 100644
--- a/include/sv_data.h
+++ b/include/sv_data.h
@@ -1,7 +1,6 @@
 #ifndef SV_DATA_H
 #define SV_DATA_H
 
-#include "fasta_query.h"  // For querying the reference genome
 
 /// @cond
 #include <string>
@@ -10,6 +9,7 @@
 #include <mutex>
 
 #include "sv_types.h"
+#include "fasta_query.h"
 /// @endcond
 
 // Include the SV types namespace
@@ -36,7 +36,7 @@ class SVData {
     public:
         SVData() {};
 
-        int add(std::string chr, int64_t start, int64_t end, int sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
+        int add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
 
         void concatenate(const SVData& sv_data);
 
diff --git a/include/sv_types.h b/include/sv_types.h
index 2d4573bf..97b49185 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -12,7 +12,7 @@
 namespace sv_types {
 
     // Define constants for SV types
-    enum SVType {
+    enum class SVType {
         UNKNOWN = -1,
         DEL = 0,
         DUP = 1,
@@ -24,15 +24,38 @@ namespace sv_types {
     };
 
     // Mapping of SV types to strings
-    const std::unordered_map<int, std::string> SVTypeString = {
-        {DEL, "DEL"},
-        {DUP, "DUP"},
-        {INV, "INV"},
-        {INS, "INS"},
-        {BND, "BND"},
-        {NEUTRAL, "NEUT"},
-        {INV_DUP, "INVDUP"}
+    const std::unordered_map<SVType, std::string> SVTypeString = {
+        {SVType::UNKNOWN, "UNKNOWN"},
+        {SVType::DEL, "DEL"},
+        {SVType::DUP, "DUP"},
+        {SVType::INV, "INV"},
+        {SVType::INS, "INS"},
+        {SVType::BND, "BND"},
+        {SVType::NEUTRAL, "NEUTRAL"},
+        {SVType::INV_DUP, "INV_DUP"}
     };
+
+    // Mapping of 6 copy number states to SV types
+    const std::unordered_map<int, SVType> CNVTypeMap = {
+        {0, SVType::UNKNOWN},
+        {1, SVType::DEL},
+        {2, SVType::DEL},
+        {3, SVType::NEUTRAL},
+        {4, SVType::NEUTRAL},
+        {5, SVType::DUP},
+        {6, SVType::DUP}
+    };
+
+    // Function to get the SV type string
+    inline std::string getSVTypeString(SVType sv_type) {
+        return SVTypeString.at(sv_type);
+    }
+
+    // Function to get the SV type from the CNV state
+    inline SVType getSVTypeFromCNState(int cn_state) {
+        return CNVTypeMap.at(cn_state);
+    }
+
     // static const int UNKNOWN = -1;
     // static const int DEL = 0;
     // static const int DUP = 1;
@@ -47,7 +70,7 @@ namespace sv_types {
 
     // Create a struct for storing SV information
     struct SVInfo {
-        int sv_type;
+        SVType sv_type;
         int read_support;  // Number of reads supporting the SV breakpoints
         int read_depth;  // Read depth at the SV start position
         std::set<std::string> data_type;  // Alignment type used to call the SV
@@ -55,25 +78,17 @@ namespace sv_types {
         std::string genotype = "./.";  // Default genotype (no call)
         double hmm_likelihood = 0.0;  // HMM likelihood score for the state sequence
 
-        SVInfo() :
-            sv_type(-1), read_support(0), read_depth(0), data_type({}), sv_length(0), genotype("./."), hmm_likelihood(0.0){}
+        SVInfo() = default;
+        // SVInfo() :
+        //     sv_type(-1), read_support(0), read_depth(0), data_type({}), sv_length(0), genotype("./."), hmm_likelihood(0.0){}
             
-        SVInfo(int sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) :
+        SVInfo(SVType sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) :
             sv_type(sv_type), read_support(read_support), read_depth(read_depth), data_type({data_type}), sv_length(sv_length), genotype(genotype), hmm_likelihood(hmm_likelihood) {}
     };
 
-    // SV (start, end, alt_allele)
-    using SVCandidate = std::tuple<int64_t, int64_t, std::string>;
-    
-    // Chromosome to SV candidate to read depth map
-    using SVDepthMap = std::unordered_map<std::string, std::map<SVCandidate, SVInfo>>;
-
-    // Define a map for storing copy number calls by SV candidate
-    using SVCopyNumberMap = std::map<SVCandidate, std::tuple<int, std::string, std::string>>;
-
-    // Create a type for storing SV update information from copy number caller
-    // (SVCandidate, SV type, genotype, data type)
-    using SVUpdate = std::tuple<SVCandidate, int, std::string, std::string>;
+    // Type definition for SV-related structures
+    using SVCandidate = std::tuple<int64_t, int64_t, std::string>;  // SV (start, end, alt_allele)
+    using SVDepthMap = std::unordered_map<std::string, std::map<SVCandidate, SVInfo>>;  // Chromosome -> SV candidate -> SV info
 }
 
 #endif // SV_TYPES_H
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c47bda5e..35793d47 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -132,7 +132,7 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<
     // candidate with the highest likelihood
     SVCandidate& sv_one = sv_list[0].first;
     SVCandidate& sv_two = sv_list[1].first;
-    std::tuple<int, double, int, std::string, bool> cnv_prediction = this->runCopyNumberPredictionPair(chr, sv_one, sv_two);
+    std::tuple<int, double, SVType, std::string, bool> cnv_prediction = this->runCopyNumberPredictionPair(chr, sv_one, sv_two);
 
     // Get the SV info
     int best_index = std::get<0>(cnv_prediction);
@@ -143,7 +143,7 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<
 
     // Get the prediction data
     double best_likelihood = std::get<1>(cnv_prediction);
-    int best_cnv_type = std::get<2>(cnv_prediction);
+    SVType best_cnv_type = std::get<2>(cnv_prediction);
     std::string best_genotype = std::get<3>(cnv_prediction);
     bool snps_found = std::get<4>(cnv_prediction);
     if (snps_found)
@@ -157,12 +157,12 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<
     // copy neutral or duplication
     if (inversion) // && (best_cnv_type == sv_types::NEUTRAL))
     {
-        if (best_cnv_type == sv_types::NEUTRAL)
+        if (best_cnv_type == SVType::NEUTRAL)
         {
-            best_cnv_type = sv_types::INV;
-        } else if (best_cnv_type == sv_types::DUP)
+            best_cnv_type = SVType::INV;
+        } else if (best_cnv_type == SVType::DUP)
         {
-            best_cnv_type = sv_types::INV_DUP;
+            best_cnv_type = SVType::INV_DUP;
             printMessage("INVDUP detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
         }
         // best_cnv_type = sv_types::INV;
@@ -180,7 +180,7 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<
     sv_calls.add(chr, start_pos, end_pos, best_cnv_type, ".", aln_type, best_genotype, best_likelihood);
 }
 
-std::tuple<int, double, int, std::string, bool> CNVCaller::runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two)
+std::tuple<int, double, SVType, std::string, bool> CNVCaller::runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two)
 {
     // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl;
     double best_likelihood = 0.0;
@@ -197,7 +197,7 @@ std::tuple<int, double, int, std::string, bool> CNVCaller::runCopyNumberPredicti
     // calculateDepthsForSNPRegion(chr, region_start_pos, region_end_pos, pos_depth_map);
 
     int current_index = 0;
-    int predicted_cnv_type = sv_types::UNKNOWN;
+    SVType predicted_cnv_type = SVType::UNKNOWN;
     std::string genotype = "./.";
     for (const auto& sv_call : {sv_one, sv_two})
     {
@@ -262,7 +262,7 @@ std::tuple<int, double, int, std::string, bool> CNVCaller::runCopyNumberPredicti
         int state_count = (int) sv_states.size();
         if ((double) max_count / (double) state_count > pct_threshold)
         {
-            predicted_cnv_type = cnv_type_map[max_state];
+            predicted_cnv_type = getSVTypeFromCNState(max_state);
             genotype = cnv_genotype_map[max_state];
         }
 
@@ -285,10 +285,10 @@ std::tuple<int, double, int, std::string, bool> CNVCaller::runCopyNumberPredicti
     // Save the SV calls as a TSV file if enabled
     int64_t sv_start_pos = std::get<0>(best_pos);
     int64_t sv_end_pos = std::get<1>(best_pos);
-    bool copy_number_change = (predicted_cnv_type != sv_types::UNKNOWN && predicted_cnv_type != sv_types::NEUTRAL);
+    bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
     if (this->input_data->getSaveCNVData() && copy_number_change && (sv_end_pos - sv_start_pos) > 10000)
     {
-        std::string cnv_type_str = SVTypeString[predicted_cnv_type];
+        std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
         std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) sv_start_pos) + "-" + std::to_string((int) sv_end_pos) + "_SPLITALN.tsv";
         std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl;
         this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood);
@@ -323,14 +323,7 @@ SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map<SVCand
         return snp_data;
     }
 
-    // Get read depths for the SV candidate region
-    // int64_t first_pos = std::get<0>(sv_candidates.begin()->first);
-    // int64_t last_pos = std::get<1>(sv_candidates.rbegin()->first);
-    // std::unordered_map<uint64_t, int> pos_depth_map;
-    // calculateDepthsForSNPRegion(chr, first_pos, last_pos, pos_depth_map);
-    
-    // Run copy number prediction for the SV candidates
-    // Loop through each SV candidate and predict the copy number state
+   
     printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
 
     // Create a map with counts for each CNV type
@@ -457,7 +450,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
         }
 
         // Update the SV calls with the CNV type and genotype
-        int cnv_type = cnv_type_map[max_state];
+        SVType cnv_type = getSVTypeFromCNState(max_state);
         std::string genotype = cnv_genotype_map[max_state];
 
         // Determine the SV calling method used to call the SV
@@ -475,14 +468,14 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
         // known, and the length is greater than 10 kb
-        int updated_sv_type = sv_candidates[sv_call].sv_type;
-        if (this->input_data->getSaveCNVData() && updated_sv_type != sv_types::UNKNOWN && (end_pos - start_pos) > 10000)
+        SVType updated_sv_type = sv_candidates[sv_call].sv_type;
+        if (this->input_data->getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000)
         {
             // Add the state sequence to the SNP data (avoid copying the data)
             sv_snps.state_sequence = std::move(state_sequence);
 
             // Save the SV calls as a TSV file
-            std::string cnv_type_str = SVTypeString[updated_sv_type];
+            std::string cnv_type_str = getSVTypeString(updated_sv_type);
             std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv";
             // std::cout << "Saving SV CIGAR copy number predictions to " <<
             // sv_filename << std::endl;
@@ -492,16 +485,16 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
     }
 }
 
-void CNVCaller::updateSVCopyNumber(std::map<SVCandidate, SVInfo> &sv_candidates, SVCandidate key, int sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood)
+void CNVCaller::updateSVCopyNumber(std::map<SVCandidate, SVInfo> &sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood)
 {
     // Update SV data from the HMM copy number prediction
     // Lock the SV candidate map
     std::lock_guard<std::mutex> lock(this->sv_candidates_mtx);
 
     // Update the SV type if the update is not unknown, and if the types don't
-    // conflict (To avoid overwriting CIGAR-based SV calls with SNP-based calls)
-    int current_sv_type = sv_candidates[key].sv_type;
-    if ((sv_type_update != sv_types::UNKNOWN) && ((current_sv_type == sv_type_update) || (current_sv_type == sv_types::UNKNOWN)))
+    // conflict (To avoid overwriting previous calls)
+    SVType current_sv_type = sv_candidates[key].sv_type;
+    if ((sv_type_update != SVType::UNKNOWN) && ((current_sv_type == sv_type_update) || (current_sv_type == SVType::UNKNOWN)))
     {
         sv_candidates[key].sv_type = sv_type_update;  // Update the SV type
         sv_candidates[key].data_type.insert(data_type);  // Update the data type
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 5553dacc..c0d4acd7 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -20,32 +20,13 @@ ContextSV::ContextSV(InputData& input_data)
 // Entry point
 int ContextSV::run()
 {
-    // Start the program's timer
-    auto start_sv = std::chrono::high_resolution_clock::now();
-
-    // Get the reference genome
-    FASTAQuery ref_genome = this->input_data->getRefGenome();
-
-    // Call SVs from long read alignments:
-    std::cout << "Running alignment-based SV calling..." << std::endl;
-    SVCaller sv_caller(*this->input_data);
-    SVData sv_calls = sv_caller.run();
-
-    // Print the total number of SVs called
-    // std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl;
-
-    // Write SV calls to file
-    std::string output_dir = this->input_data->getOutputDir();
+    FASTAQuery ref_genome = this->input_data->getRefGenome();  // Load the reference genome
+    SVCaller sv_caller(*this->input_data);  // Create an SV caller object
+    SVData sv_calls = sv_caller.run();  // Run the SV caller
+    std::string output_dir = this->input_data->getOutputDir();  // Get the output directory
     std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl;
-    sv_calls.saveToVCF(ref_genome, output_dir);
-
-    // Format and print the time taken to call SVs
-    // auto end_sv = std::chrono::high_resolution_clock::now();
-    // std::string elapsed_time = getElapsedTime(start_sv, end_sv);
+    sv_calls.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
     std::cout << "SV calling complete." << std::endl;
-    // int sv_count = sv_calls.totalCalls();
-    // std::cout << "Found " << sv_count << " total SVs." << std::endl;
-    // std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. Time taken (h:m:s) = " << elapsed_time << std::endl;
 
     return 0;
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index c79f8ddb..926ae663 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -297,9 +297,9 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
                 // Lock the SV calls object and add the insertion
                 std::lock_guard<std::mutex> lock(this->sv_mtx);
                 if (is_duplication) {
-                    sv_calls.add(chr, ref_pos, ref_end, DUP, ins_seq_str, "CIGARDUP", "./.", 0.0);
+                    sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", 0.0);
                 } else {
-                    sv_calls.add(chr, ref_pos, ref_end, INS, ins_seq_str, "CIGARINS", "./.", 0.0);
+                    sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", 0.0);
                 }
             }
 
@@ -315,7 +315,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
                 // Lock the SV calls object and add the deletion
                 // std::lock_guard<std::mutex> lock(this->sv_mtx);
-                sv_calls.add(chr, ref_pos, ref_end, DEL, ".", "CIGARDEL", "./.", 0.0);
+                sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0);
             }
 
         // Check if the CIGAR operation is a clipped base
@@ -422,7 +422,6 @@ SVData SVCaller::run()
     std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
     int chunk_count = 100;  // Number of chunks to split the chromosome into
     int region_count = 0;
-    auto start1 = std::chrono::high_resolution_clock::now();
     SVData sv_calls;
     int min_cnv_length = this->input_data->getMinCNVLength();
     for (const auto& chr : chromosomes) {
@@ -430,8 +429,6 @@ SVData SVCaller::run()
 
         // Split the chromosome into chunks
         std::vector<std::string> region_chunks;
-
-        // Get the region start and end positions
         if (this->input_data->isRegionSet()) {
             std::pair<int32_t, int32_t> region = this->input_data->getRegion();
             int region_start = region.first;
@@ -444,10 +441,7 @@ SVData SVCaller::run()
             
         } else {
             int chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
-            // std::cout << "Chromosome length: " << chr_len << std::endl;
-            // std::cout << "Chunk count: " << chunk_count << std::endl;
             int chunk_size = std::ceil((double)chr_len / chunk_count);
-            // std::cout << "Chunk size: " << chunk_size << std::endl;
             for (int i = 0; i < chunk_count; i++) {
                 int start = i * chunk_size + 1;  // 1-based
                 int end = start + chunk_size;
@@ -464,12 +458,10 @@ SVData SVCaller::run()
         std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
         CNVCaller cnv_caller(*this->input_data);
         cnv_caller.loadChromosomeData(chr);
-        // std::cout << "Loaded chromosome data for copy number predictions." << std::endl;
 
         // Process each chunk one at a time
         std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
         for (const auto& sub_region : region_chunks) {
-            // Detect SVs from the sub-region
             // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl;
             RegionData region_data = this->detectSVsFromRegion(sub_region);
             SVData& sv_calls_region = std::get<0>(region_data);
@@ -488,8 +480,6 @@ SVData SVCaller::run()
                 std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
                 cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, min_cnv_length);
             }
-            // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
-            // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, min_cnv_length);
 
             // Run split-read SV detection in a single thread, combined with
             // copy number variant predictions
@@ -503,13 +493,9 @@ SVData SVCaller::run()
         // Increment the region count
         region_count++;
         std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl;
-        // std::cout << "Extracted aligments for " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl;
     }
 
-    // auto end1 = std::chrono::high_resolution_clock::now();
     std::cout << "SV calling completed." << std::endl;
-    // int total_sv_calls = sv_calls.totalCalls();
-    // std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl;
 
     return sv_calls;
 }
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 3ea742f3..055f2ebe 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -8,7 +8,7 @@
 /// @endcond
 
 
-int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
+int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
 {
     // Check if the alternate allele contains ambiguous bases
     const std::unordered_set<char> ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'};
@@ -26,7 +26,7 @@ int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::s
         sv_info.read_support += 1;
 
         // Update the SV type if it is unknown
-        if (sv_info.sv_type == UNKNOWN) {
+        if (sv_info.sv_type == SVType::UNKNOWN) {
             sv_info.sv_type = sv_type;
         }
 
@@ -47,14 +47,11 @@ int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::s
 
     // Otherwise, add the SV candidate to the map
     } else {
-        // For insertions and duplications, the SV length is the length of the
-        // inserted sequence, not including the insertion position
-        int sv_length = 0;
-        if (sv_type == INS || sv_type == DUP) {
-            sv_length = end - start;
-        } else {
-            // For deletions, the SV length is the length of the deletion
-            sv_length = end - start + 1;
+        int sv_length = end - start;
+
+        // For deletions, the SV length is the length of the deletion, including the start position
+        if (sv_type == SVType::DEL) {
+            sv_length++;
         }
 
         // Create a new SVInfo object (SV type, alignment support, read depth, data type, SV length, genotype)
@@ -198,7 +195,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             // Get the SV candidate and SV info
             SVCandidate candidate = sv_call.first;
             SVInfo info = sv_call.second;
-            int sv_type = info.sv_type;
+            SVType sv_type = info.sv_type;
             int read_support = info.read_support;
             int read_depth = info.read_depth;
             int sv_length = info.sv_length;
@@ -217,7 +214,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             int64_t end = std::get<1>(candidate);
 
             // If the SV type is unknown, skip it
-            if (sv_type == UNKNOWN || sv_type == NEUTRAL) {
+            if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
                 skip_count += 1;
                 continue;
             } else {
@@ -230,7 +227,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             std::string repeat_type = "NA";
 
             // Deletion
-            if (sv_type == DEL) {
+            if (sv_type == SVType::DEL) {
                 // Get the deleted sequence from the reference genome, also including the preceding base
                 int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, end);
@@ -239,7 +236,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                 if (ref_allele != "") {
                     alt_allele = ref_allele.at(0);
                 } else {
-                    alt_allele = "<DEL>";  // Use symbolic allele for imprecise deletions
+                    alt_allele = "<DEL>";  // Symbolic allele
                     std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << pos << "-" << end << std::endl;
                 }
 
@@ -249,18 +246,16 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                 // Update the position
                 pos = preceding_pos;
 
-            // Duplications and insertions
-            } else if (sv_type == INS || sv_type == DUP) {
+            // Other types (duplications, insertions, inversions)
+            } else {
                 // Use the preceding base as the reference allele
                 int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
 
                 // Format novel insertions
-                if (sv_type == INS) {
+                if (sv_type == SVType::INS) {
                     // Use the insertion sequence as the alternate allele
                     alt_allele = std::get<2>(candidate);
-
-                    // Insert the reference base into the alternate allele
                     alt_allele.insert(0, ref_allele);
 
                     // Update the position
@@ -269,19 +264,15 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                     // Update the end position to the start position to change from
                     // query to reference coordinates for insertions
                     end = pos;
-                } else if (sv_type == DUP) {
-                    // Use a symbolic allele for duplications
-                    alt_allele = "<DUP>";
-
-                    // Set the repeat type as an interspersed duplication
+                } else if (sv_type == SVType::DUP) {                    
+                    alt_allele = "<DUP>";  // Symbolic allele
                     repeat_type = "TANDEM";
                 }
             }
 
             // Create the VCF parameter strings
             int clipped_base_support = this->getClippedBaseSupport(chr, pos, end);
-            // std::string sv_type_str = this->sv_type_map[sv_type];
-            std::string sv_type_str = sv_types::SVTypeString[sv_type];
+            std::string sv_type_str = getSVTypeString(sv_type);
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
                 ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
@@ -297,13 +288,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     }
 
     // Print the number of SV calls skipped
-    std::cout << "Finished writing VCF file." << std::endl;
-    // int num_sv_calls = this->totalCalls();
-    // std::cout << "Skipped " << skip_count << " of " << num_sv_calls << " SV calls because the SV type is unknown" << std::endl;
-    // std::cout << "Finished writing VCF file with " << num_sv_calls - skip_count << " SV calls" << std::endl;
-
-    // Close the output stream
-    // vcf_writer.close();
+    std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
 }
 
 std::map<SVCandidate, SVInfo>& SVData::getChromosomeSVs(std::string chr)

From 6fb4b72c6f10b10d3b1e58ecaa059bf5bc1289b2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 13:29:17 -0400
Subject: [PATCH 005/134] Update docs

---
 src/sv_caller.cpp | 82 ++++++++++-------------------------------------
 src/sv_data.cpp   | 27 ++++------------
 2 files changed, 24 insertions(+), 85 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 926ae663..4ce5e940 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -189,31 +189,12 @@ SVCaller::SVCaller(InputData &input_data)
 
 std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary)
 {
-    // Get the chromosome
-    std::string chr = header->target_name[alignment->core.tid];
-
-    // Get the position of the alignment in the reference genome
-    int32_t pos = alignment->core.pos;
-
-    // Get the CIGAR string
-    uint32_t* cigar = bam_get_cigar(alignment);
-
-    // Get the CIGAR length
+    std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
+    int32_t pos = alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
+    uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
-
-    // Track the query position
     int query_pos = 0;
-
-    // Loop through the CIGAR string (0-based) and detect insertions and deletions in
-    // reference coordinates (1-based)
-    // POS is the leftmost position of where the alignment maps to the reference:
-    // https://genome.sph.umich.edu/wiki/SAM
-    // std::vector<std::thread> threads;
-    // std::vector<SVData> sv_calls_vec;
-
-    // Create a map of query position to match/mismatch (1/0) for calculating
-    // the mismatch rate at alignment overlaps
-    std::unordered_map<int, int> query_match_map;
+    std::unordered_map<int, int> query_match_map;  // Query position to match/mismatch (1/0) map
 
     // Loop through the CIGAR string, process operations, detect SVs (primary
     // only), update clipped base support, calculate sequence identity for
@@ -226,24 +207,14 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     bool first_op = false;  // First alignment operation for the query
     for (int i = 0; i < cigar_len; i++) {
 
-        // Get the CIGAR operation
-        int op = bam_cigar_op(cigar[i]);
-
-        // Get the CIGAR operation length
-        int op_len = bam_cigar_oplen(cigar[i]);
+        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
+        int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
         
-        // Check if the CIGAR operation is an insertion
+        // Process the CIGAR operation
         if (op == BAM_CINS && is_primary) {
-
-            // Add the SV if greater than the minimum SV size
             if (op_len >= this->min_sv_size) {
 
                 // Get the sequence of the insertion from the query
-                // std::string ins_seq_str = "";
-                // uint8_t* seq_ptr = bam_get_seq(alignment);
-                // for (int j = 0; j < op_len; j++) {
-                //     ins_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
-                // }
                 std::string ins_seq_str(op_len, ' ');
                 for (int j = 0; j < op_len; j++) {
                     ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
@@ -293,9 +264,6 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
                 // Add to SV calls (1-based) with the appropriate SV type
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-
-                // Lock the SV calls object and add the insertion
-                std::lock_guard<std::mutex> lock(this->sv_mtx);
                 if (is_duplication) {
                     sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", 0.0);
                 } else {
@@ -307,23 +275,17 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
         } else if (op == BAM_CDEL && is_primary) {
 
             // Add the SV if greater than the minimum SV size
-            if (op_len >= this->min_sv_size) {
-                
-                // Add the deletion to the SV calls (1-based)
+            if (op_len >= this->min_sv_size)
+            {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-
-                // Lock the SV calls object and add the deletion
-                // std::lock_guard<std::mutex> lock(this->sv_mtx);
-                sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0);
+                sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0);  // Add the deletion
             }
 
         // Check if the CIGAR operation is a clipped base
         } else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) {
 
-            // Update the clipped base support
-            // std::lock_guard<std::mutex> lock(this->sv_mtx);
-            sv_calls.updateClippedBaseSupport(chr, pos);
+            sv_calls.updateClippedBaseSupport(chr, pos);  // Update clipped base support
 
             // Update the query alignment start position
             if (!first_op) {
@@ -394,21 +356,14 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
         }
     }
 
-    // Update the query end position
-    query_end = query_pos;
+    query_end = query_pos;  // Last alignment position in the query
 
-    // Return the mismatch map and the query start and end positions
     return std::tuple<std::unordered_map<int, int>, int32_t, int32_t>(query_match_map, query_start, query_end);
 }
 
-// Detect SVs from split read alignments (primary and supplementary) and
-// directly from the CIGAR string
 SVData SVCaller::run()
 {
-    // Open the BAM file
-    std::string bam_filepath = this->input_data->getLongReadBam();
-
-    // Get the region data
+    // Get the chromosomes to process
     std::vector<std::string> chromosomes;
     if (this->input_data->getChromosome() != "") {
         chromosomes.push_back(this->input_data->getChromosome());
@@ -417,8 +372,7 @@ SVData SVCaller::run()
     }
     int chr_count = chromosomes.size();
 
-    // Loop through each region and detect SVs (Note: The main loop is
-    // single-threaded)
+    // Loop through each region and detect SVs in chunks
     std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
     int chunk_count = 100;  // Number of chunks to split the chromosome into
     int region_count = 0;
@@ -430,11 +384,11 @@ SVData SVCaller::run()
         // Split the chromosome into chunks
         std::vector<std::string> region_chunks;
         if (this->input_data->isRegionSet()) {
+
+            // Use one chunk for the specified region
             std::pair<int32_t, int32_t> region = this->input_data->getRegion();
             int region_start = region.first;
             int region_end = region.second;
-
-            // Use one chunk for the region
             std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
             region_chunks.push_back(chunk);
             std::cout << "Using specified region " << chunk << "..." << std::endl;
@@ -485,9 +439,7 @@ SVData SVCaller::run()
             // copy number variant predictions
             std::cout << "Detecting copy number variants from split reads..." << std::endl;
             this->detectSVsFromSplitReads(sv_calls_region, primary_map, supp_map, cnv_caller);
-
-            // Add the SV calls to the main SV calls object
-            sv_calls.concatenate(sv_calls_region);
+            sv_calls.concatenate(sv_calls_region);  // Add the calls to the main set
         }
 
         // Increment the region count
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 055f2ebe..797ef704 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -21,7 +21,8 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std
     // Check if the SV candidate already exists in the map
     SVCandidate candidate(start, end, alt_allele);
     if (this->sv_calls[chr].find(candidate) != this->sv_calls[chr].end()) {
-        // Update the alignment-based support count (+1)
+        
+        // Update the alignment-based support count
         SVInfo& sv_info = this->sv_calls[chr][candidate];
         sv_info.read_support += 1;
 
@@ -39,9 +40,7 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std
         if ((sv_info.hmm_likelihood == 0.0) || (hmm_likelihood > sv_info.hmm_likelihood)) {
             sv_info.hmm_likelihood = hmm_likelihood;
         }
-
-        // Add the alignment type used to call the SV
-        sv_info.data_type.insert(data_type);
+        sv_info.data_type.insert(data_type);  // Add the alignment type to the set
 
         return 0;  // SV call already exists
 
@@ -54,11 +53,8 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std
             sv_length++;
         }
 
-        // Create a new SVInfo object (SV type, alignment support, read depth, data type, SV length, genotype)
         SVInfo sv_info(sv_type, 1, 0, data_type, sv_length, genotype, hmm_likelihood);
-
-        // Add the SV candidate to the map
-        this->sv_calls[chr][candidate] = sv_info;
+        this->sv_calls[chr][candidate] = sv_info;  // Add the SV candidate to the map
 
         return 1;  // SV call added
     }
@@ -88,10 +84,8 @@ void SVData::updateClippedBaseSupport(std::string chr, int64_t pos)
     // Update clipped base support
     std::pair<std::string, int64_t> key(chr, pos);
     if (this->clipped_base_support.find(key) != this->clipped_base_support.end()) {
-        // Update the depth
         this->clipped_base_support[key] += 1;
     } else {
-        // Add the depth
         this->clipped_base_support[key] = 1;
     }
 }
@@ -175,11 +169,9 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
         "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">"
     };
 
-    // Write the header lines
     std::cout << "Writing VCF header..." << std::endl;
     vcf_writer.writeHeader(header_lines);
 
-    // Save the SV calls
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
     std::string sv_method = "CONTEXTSVv0.1";
     int skip_count = 0;
@@ -240,11 +232,9 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                     std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << pos << "-" << end << std::endl;
                 }
 
-                // Make the SV length negative
-                sv_length = -1 * sv_length;
+                sv_length = -1 * sv_length;  // Negative length for deletions
 
-                // Update the position
-                pos = preceding_pos;
+                pos = preceding_pos;  // Update the position to the preceding base
 
             // Other types (duplications, insertions, inversions)
             } else {
@@ -258,8 +248,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
                     alt_allele = std::get<2>(candidate);
                     alt_allele.insert(0, ref_allele);
 
-                    // Update the position
-                    pos = preceding_pos;
+                    pos = preceding_pos;  // Update the position to the preceding base
 
                     // Update the end position to the start position to change from
                     // query to reference coordinates for insertions
@@ -307,12 +296,10 @@ std::set<std::string> SVData::getChromosomes()
 
 int SVData::totalCalls()
 {
-    // std::cout << "Calculating total SV calls..." << std::endl;
     int sv_calls = 0;
     for (auto const& sv_call : this->sv_calls) {
         sv_calls += sv_call.second.size();
     }
-    // std::cout << "Total SV calls: " << sv_calls << std::endl;
 
     return sv_calls;
 }

From a38ada7a5e08c2f69199a60d9a85b2feddb1c572 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 13:33:05 -0400
Subject: [PATCH 006/134] Update docs

---
 src/sv_caller.cpp | 51 +++++++++--------------------------------------
 1 file changed, 9 insertions(+), 42 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 4ce5e940..0a511965 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -34,36 +34,34 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 
 RegionData SVCaller::detectSVsFromRegion(std::string region)
 {
-    SVData sv_calls;
+    // Open the BAM file
     std::string bam_filepath = this->input_data->getLongReadBam();
-
-    // Open the BAM file in a thread-safe manner
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (fp_in == NULL) {
         std::cerr << "ERROR: failed to open " << bam_filepath << std::endl;
         exit(1);
     }
 
-    // Get the header in a thread-safe manner
+    // Load the header for the BAM file
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
     if (bamHdr == NULL) {
         std::cerr << "ERROR: failed to read header for " << bam_filepath << std::endl;
         exit(1);
     }
 
-    // Get the index in a thread-safe manner
+    // Load the index for the BAM file
     hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
     if (idx == NULL) {
         std::cerr << "ERROR: failed to load index for " << bam_filepath << std::endl;
         exit(1);
     }
 
-    // Create a read and iterator for the region in a thread-safe manner
+    // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
 
-    // Loop through the alignments
-    // Create a map of primary and supplementary alignments by QNAME (query template name)
+    // Main loop to process the alignments
+    SVData sv_calls;
     int num_alignments = 0;
     PrimaryMap primary_alignments;
     SuppMap supplementary_alignments;
@@ -78,18 +76,15 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
             // Do nothing
 
         } else {
-            // Get the QNAME (query template name) for associating split reads
-            std::string qname = bam_get_qname(bam1);
+            std::string qname = bam_get_qname(bam1);  // Query template name
 
             // Process primary alignments
             if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
 
-                // Get the primary alignment chromosome, start, end, and depth
+                // Get the primary alignment information
                 std::string chr = bamHdr->target_name[bam1->core.tid];
                 int64_t start = bam1->core.pos;
                 int64_t end = bam_endpos(bam1);  // This is the first position after the alignment
-
-                // Get the strand
                 bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
                 // Call SVs directly from the CIGAR string
@@ -105,12 +100,10 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
             // Process supplementary alignments
             } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
 
-                // Add the supplementary alignment to the map
+                // Get the supplementary alignment information
                 std::string chr = bamHdr->target_name[bam1->core.tid];
                 int32_t start = bam1->core.pos;
                 int32_t end = bam_endpos(bam1);
-
-                // Get the strand
                 bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
                 // Get CIGAR string information, but don't call SVs
@@ -122,49 +115,24 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
                 // Add the supplementary alignment to the map
                 AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand);
                 supplementary_alignments[qname].emplace_back(alignment);
-
-                // If Read ID == 8873acc1-eb84-415d-8557-a32a8f52ccee, print the
-                // alignment
-                // if (qname == "8873acc1-eb84-415d-8557-a32a8f52ccee") {
-                //     std::cout << "Supplementary alignment: " << chr << ":" << start << "-" << end << std::endl;
-                //     std::cout << "Query start: " << query_start << ", Query end: " << query_end << std::endl;
-                //     std::cout << "Match map: ";
-                //     for (const auto& entry : match_map) {
-                //         std::cout << entry.first << ":" << entry.second << " ";
-                //     }
-                //     std::cout << std::endl;
-                // }
             }
         }
 
-        // Increment the number of alignment records processed
         num_alignments++;
     }
 
-    // Destroy the iterator
     hts_itr_destroy(itr);
-
-    // Destroy the read
     bam_destroy1(bam1);
-
-    // Close the BAM file
     sam_close(fp_in);
-
-    // Destroy the header
     bam_hdr_destroy(bamHdr);
-
-    // Destroy the index
     hts_idx_destroy(idx);
 
     // Return the SV calls and the primary and supplementary alignments
-    // return std::make_tuple(sv_calls, primary_alignments,
-    // supplementary_alignments);
     return std::make_tuple(std::move(sv_calls), std::move(primary_alignments), std::move(supplementary_alignments));
 }
 
 double SVCaller::calculateMismatchRate(std::unordered_map<int, int> &match_map, int32_t start, int32_t end)
 {
-    // Calculate the mismatch rate
     int match_count = 0;
     int mismatch_count = 0;
     for (int i = start; i <= end; i++) {
@@ -178,7 +146,6 @@ double SVCaller::calculateMismatchRate(std::unordered_map<int, int> &match_map,
     }
     double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count);
 
-    // Return the mismatch rate
     return mismatch_rate;
 }
 

From 1cf3a127902ee19d3b5352c9ed6d434dbab79822 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 13:41:47 -0400
Subject: [PATCH 007/134] Remove unused code and update docs

---
 include/cnv_caller.h |   5 +-
 src/cnv_caller.cpp   | 161 ++-----------------------------------------
 src/sv_caller.cpp    |  32 +--------
 3 files changed, 11 insertions(+), 187 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 23a70640..a281219f 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -119,9 +119,6 @@ class CNVCaller {
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
 
-        // Calculate read depths for a region
-        void calculateDepthsForSNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, std::unordered_map<uint64_t, int>& pos_depth_map);
-
         // Calculate the log2 ratio for a region given the read depths and mean
         // chromosome coverage
         double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov);
@@ -133,7 +130,7 @@ class CNVCaller {
         // of population frequencies for each SNP location
         void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);
 
-        // Save a TSV with B-allele frequencies, log 2 ratios, and copy number predictions
+        // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood);
 };
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 35793d47..ed71237a 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -189,13 +189,6 @@ std::tuple<int, double, SVType, std::string, bool> CNVCaller::runCopyNumberPredi
     int best_index = 0;
     std::pair<int64_t, int64_t> best_pos;
     SNPData best_snp_data;
-
-    // Get read depths for the SV candidate region
-    // int64_t region_start_pos = std::min(std::get<0>(sv_one), std::get<0>(sv_two));
-    // int64_t region_end_pos = std::max(std::get<1>(sv_one), std::get<1>(sv_two));
-    // std::unordered_map<uint64_t, int> pos_depth_map;
-    // calculateDepthsForSNPRegion(chr, region_start_pos, region_end_pos, pos_depth_map);
-
     int current_index = 0;
     SVType predicted_cnv_type = SVType::UNKNOWN;
     std::string genotype = "./.";
@@ -377,31 +370,17 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
     {
         // Get the SV candidate
         const SVCandidate& candidate = sv_call;
-
-        // Get the start and end positions of the SV call
         int64_t start_pos = std::get<0>(candidate);
         int64_t end_pos = std::get<1>(candidate);
 
-        // // [TEST] Skip if not in the following list of SVs
-        // std::vector<std::string> sv_list = {"chr19:53013528-53051102", "chr1:43593639-43617165", "chr6:35786784-35799012", "chr1:152787870-152798352", "chr17:41265461-41275765", "chr5:180950357-181003515"};
-        // std::string sv_key = chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos);
-        // if (std::find(sv_list.begin(), sv_list.end(), sv_key) == sv_list.end())
-        // {
-        //     continue;
-        // }
-
         // Get the depth at the start position. This is used as the FORMAT/DP
         // value in the VCF file
         int dp_value = pos_depth_map[start_pos];
         this->updateDPValue(sv_candidates, sv_call, dp_value);
 
-        // Loop through the SV region, calculate the log2 ratios, and run the
-        // Viterbi algorithm to predict the copy number states
-
-        // We will run the Viterbi algorithm on SNPs in the SV region +/- 1/2
-        // the SV length
+        // Loop through the SV region +/- 1/2 SV length and run copy number
+        // predictions
         int64_t sv_half_length = (end_pos - start_pos) / 2.0;
-        // std::cout << "SV half length: " << sv_half_length << std::endl;
         int64_t query_start = std::max((int64_t) 1, start_pos - sv_half_length);
         int64_t query_end = end_pos + sv_half_length;
 
@@ -514,10 +493,7 @@ void CNVCaller::updateSVCopyNumber(std::map<SVCandidate, SVInfo> &sv_candidates,
 
 void CNVCaller::updateDPValue(std::map<SVCandidate,SVInfo>& sv_candidates, SVCandidate key, int dp_value)
 {
-    // Lock the SV candidate map
     std::lock_guard<std::mutex> lock(this->sv_candidates_mtx);
-
-    // Update the DP value
     sv_candidates[key].read_depth = dp_value;
 }
 
@@ -540,8 +516,6 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, int64
 
         // Add the region chunk to the vector
         region_chunks.push_back(chr + ":" + std::to_string(chunk_start) + "-" + std::to_string(chunk_end));
-
-        // Update the chunk start
         chunk_start = chunk_end + 1;
     }
 
@@ -558,7 +532,6 @@ std::vector<std::vector<SVCandidate>> CNVCaller::splitSVCandidatesIntoChunks(std
     std::vector<SVCandidate> current_sv_chunk;
     for (auto const& sv_call : sv_candidates)
     {
-        // Add the SV candidate to the current chunk
         current_sv_chunk.push_back(sv_call.first);
 
         // If the current chunk size is reached, then add the chunk to the
@@ -587,23 +560,19 @@ CNVCaller::CNVCaller(InputData &input_data)
 
 void CNVCaller::loadChromosomeData(std::string chr)
 {
-    // Read the HMM from file
     std::string hmm_filepath = this->input_data->getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     this->hmm = ReadCHMM(hmm_filepath.c_str());
 
-    // Calculate the mean chromosome coverage and generate the position-depth map
     printMessage("Calculating mean chromosome coverage for " + chr + "...");
     mean_chr_cov = calculateMeanChromosomeCoverage(chr);
     printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
     this->mean_chr_cov = mean_chr_cov;
 
-    // Read the SNP positions and B-allele frequency values from the VCF file
     std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl;
     std::string snp_filepath = this->input_data->getSNPFilepath();
     readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info);
 
-    // Get the population frequencies for each SNP
     std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl;
     getSNPPopulationFrequencies(chr, this->snp_info);
     std::cout << "Finished loading chromosome data for " << chr << std::endl;
@@ -685,107 +654,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
     return mean_chr_cov;
 }
 
-void CNVCaller::calculateDepthsForSNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, std::unordered_map<uint64_t, int>& pos_depth_map)
-{
-    std::cout << "Calculating read depths for SV region " << chr << ":" << start_pos << "-" << end_pos << "..." << std::endl;
-
-    // // If extending the CNV regions, then extend the SV region by window size *
-    // // N. Otherwise, log2 ratios will be zero due to missing read depth data
-    // // before/after the first/last SV positions
-    // if (this->input_data->getSaveCNVData())
-    // {
-    //     int extend_factor = 100;
-    //     int window_size = this->input_data->getWindowSize();
-    //     start_pos = std::max((int64_t) 1, start_pos - (window_size * extend_factor));
-    //     end_pos = end_pos + (window_size * extend_factor);
-    // }
-
-    // // Split the region into equal parts for each thread if the region is larger
-    // // than 100 kb
-    // int num_threads = this->input_data->getThreadCount();
-    // std::vector<std::string> region_chunks;
-    // int64_t region_size = end_pos - start_pos;
-    // if (region_size < 100000)
-    // {
-    //     region_chunks.push_back(chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos));
-    // } else {
-    //     region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, num_threads);
-    // }
-
-    // // Loop through each region chunk and get the mean chromosome coverage in
-    // // parallel
-    // std::string input_filepath = this->input_data->getShortReadBam();
-    // std::vector<std::future<std::unordered_map<uint64_t, int>>> futures;
-    // for (const auto& region_chunk : region_chunks)
-    // {
-    //     // Create a lambda function to get the mean chromosome coverage for the
-    //     // region chunk
-    //     auto get_pos_depth_map = [region_chunk, input_filepath]() -> std::unordered_map<uint64_t, int>
-    //     {
-    //         // Run samtools depth on the entire region, and print positions and
-    //         // depths (not chromosome)
-    //         const int cmd_size = 256;
-    //         char cmd[cmd_size];
-    //         snprintf(cmd, cmd_size,
-    //             "samtools depth -r %s %s | awk '{print $2, $3}'",
-    //             region_chunk.c_str(), input_filepath.c_str());
-
-    //         // Open a pipe to read the output of the command
-    //         FILE *fp = popen(cmd, "r");
-    //         if (fp == NULL)
-    //         {
-    //             std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl;
-    //             exit(EXIT_FAILURE);
-    //         }
-
-    //         // Create a map of positions and depths
-    //         std::unordered_map<uint64_t, int> pos_depth_map;
-    //         const int line_size = 1024;
-    //         char line[line_size];
-    //         while (fgets(line, line_size, fp) != NULL)
-    //         {
-    //             // Parse the line
-    //             uint64_t pos;
-    //             int depth;
-    //             if (sscanf(line, "%ld%d", &pos, &depth) == 2)
-    //             {
-    //                 // Add the position and depth to the map
-    //                 pos_depth_map[pos] = depth;
-    //             } else {
-    //                 // No reads
-    //             }
-    //         }
-
-    //         // Close the pipe
-    //         pclose(fp);
-
-    //         return pos_depth_map;
-    //     };
-
-    //     // Create a future for the thread
-    //     std::future<std::unordered_map<uint64_t, int>> future = std::async(std::launch::async, get_pos_depth_map);
-
-    //     // Add the future to the vector
-    //     futures.push_back(std::move(future));
-    // }
-
-    // // Loop through the futures and get the results
-    // int current_chunk = 0;
-    // for (auto& future : futures)
-    // {
-    //     current_chunk++;
-    //     future.wait();
-    //     std::unordered_map<uint64_t, int> result = std::move(future.get());
-
-    //     // Merge the position depth maps
-    //     this->mergePosDepthMaps(pos_depth_map, result);
-    //     if (this->input_data->getVerbose())
-    //     {
-    //         printMessage("Completed region chunk " + std::to_string(current_chunk) + " of " + std::to_string(region_chunks.size()) + "...");
-    //     }
-    // }
-}
-
 void CNVCaller::mergePosDepthMaps(std::unordered_map<uint32_t, int>& main_map, std::unordered_map<uint32_t, int>& map_update)
 {
     // Merge the second depth map into the first
@@ -857,16 +725,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
         std::cerr << "ERROR: " << index_error << std::endl;
         exit(1);
     }
-
-    // Close the pipe
-    pclose(index_fp);
+    pclose(index_fp);  // Close the process
 
     // Filter variants by depth, quality, and region
     if (this->input_data->getVerbose()) {
         std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl;
     }
 
-    // // Check if a region was specified by the user
+    // Check if a region was specified by the user
     std::string region_str = chr;
     if (this->input_data->isRegionSet())
     {
@@ -947,8 +813,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
         snp_info.insertSNPAlleleFrequency(chr, pos, baf);
     }
 
-    // Close the pipe
-    pclose(fp);
+    pclose(fp);  // Close the process
 
     if (this->input_data->getVerbose()) {
         std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl;
@@ -1033,12 +898,6 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
             // Run bcftools query to get the population frequencies for the
             // chromosome within the SNP region, filtering for SNPS only,
             // and within the MIN-MAX range of frequencies.
-            // TODO: Update to use ethnicity-specific population frequencies
-            // Example from gnomAD:
-            // ##INFO=<ID=AF_asj,Number=A,Type=Float,Description="Alternate
-            // allele frequency in samples of Ashkenazi Jewish ancestry">
-            // std::string ethnicity_suffix = "_asj";  // Ashkenazi Jewish
-            // (leave empty for all populations)
             std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB);
             std::string cmd = \
                 "bcftools query -r " + region_chunk + " -f '%POS\t%" + AF_key + "\n' -i '" + filter_criteria + "' " + pfb_filepath + " 2>/dev/null";
@@ -1065,8 +924,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
                 double pfb;
                 if (sscanf(line, "%d%lf", &pos, &pfb) == 2)
                 {
-                    // Add the position and population frequency to the map
-                    pos_pfb_map[pos] = pfb;
+                    pos_pfb_map[pos] = pfb;  // Add the position and population frequency to the map
                 }
             }
             pclose(fp);
@@ -1083,10 +941,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
     int pfb_count = 0;
     for (auto& future : futures)
     {
-        // Wait for the future to finish
         future.wait();
-
-        // Get the result from the future
         std::unordered_map<int, double> result = std::move(future.get());
 
         // Loop through the result and add to SNPInfo
@@ -1096,12 +951,10 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
             int pos = pair.first;
             double pfb = pair.second;
 
-            // Lock the SNPInfo mutex
+            // Add the population frequency to the SNPInfo
             this->snp_data_mtx.lock();
             snp_info.insertSNPPopulationFrequency(chr_snp, pos, pfb);
             this->snp_data_mtx.unlock();
-
-            // Increment the population frequency count
             pfb_count++;
 
             // [TEST] Print 15 values
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 0a511965..f20ddf69 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -25,10 +25,7 @@
 
 int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 {
-    // Read the next alignment
     int ret = sam_itr_next(fp_in, itr, bam1);
-
-    // Return the result of reading the next alignment
     return ret;
 }
 
@@ -409,7 +406,6 @@ SVData SVCaller::run()
             sv_calls.concatenate(sv_calls_region);  // Add the calls to the main set
         }
 
-        // Increment the region count
         region_count++;
         std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl;
     }
@@ -441,12 +437,10 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         AlignmentVector supp_alignments = supp_map[qname];
         for (const auto& supp_alignment : supp_alignments) {
 
-            // Get the supplementary alignment chromosome
-            std::string supp_chr = std::get<0>(supp_alignment);
-
             // Skip supplementary alignments that are on a different chromosome
             // for now (TODO: Use for identifying trans-chromosomal SVs such as
             // translocations)
+            std::string supp_chr = std::get<0>(supp_alignment);
             if (primary_chr != supp_chr) {
                 continue;
             }
@@ -528,10 +522,6 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "INVERSION");
                     sv_list.push_back(sv_pair);
                     sv_count++;
-                    // SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
-                    // std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "INVERSION");
-                    // sv_list.push_back(sv_pair);
-                    // sv_count++;
                 }
 
                 // Determine which SV to keep based on HMM prediction likelihood
@@ -547,15 +537,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 // [supp_start] [supp_end] -- [primary_start] [primary_end]
                 std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
 
-                // Use the gap ends as the SV endpoints
-                // if (primary_start - supp_end >= min_cnv_length) {
-                //     SVCandidate sv_candidate(supp_end+1, primary_start+1, ".");
-                //     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPINNER_A");
-                //     sv_list.push_back(sv_pair);
-                //     sv_count++;
-                // }
-
-                // Also use the alignment ends as the SV endpoints
+                // Use the alignment ends as the SV endpoints
                 if (primary_end - supp_start >= min_cnv_length) {
                     SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
                     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPOUTER_A");
@@ -573,15 +555,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 // [primary_start] [primary_end] -- [supp_start] [supp_end]
                 std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
 
-                // Use the gap ends as the SV endpoints
-                // if (supp_start - primary_end >= min_cnv_length) {
-                //     SVCandidate sv_candidate(primary_end+1, supp_start+1, ".");
-                //     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPINNER_B");
-                //     sv_list.push_back(sv_pair);
-                //     sv_count++;
-                // }
-
-                // Also use the alignment ends as the SV endpoints
+                // Use the alignment ends as the SV endpoints
                 if (supp_end - primary_start >= min_cnv_length) {
                     SVCandidate sv_candidate(primary_start+1, supp_end+1, ".");
                     std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPOUTER_B");

From 181f6d1b5e490ca96acf4d9e22d5ee5a9aca7fd2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 14:40:34 -0400
Subject: [PATCH 008/134] simplify predictions

---
 include/cnv_caller.h  |   8 +-
 src/cnv_caller.cpp    | 207 +++++++++++-------------------------------
 src/sv_caller.cpp     | 104 ++++++++++++++-------
 tests/test_general.py |   2 +-
 4 files changed, 129 insertions(+), 192 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index a281219f..86f0abf9 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -107,15 +107,13 @@ class CNVCaller {
         // Load file data for a chromosome (SNP positions, BAF values, and PFB values)
         void loadChromosomeData(std::string chr);
 
-        // Run copy number prediction for a pair of SV candidates, and add only
-        // the SV candidate with the highest likelihood
-        std::tuple<int, double, SVType, std::string, bool> runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two);
+        // Run copy number prediction for a single SV candidate, returning the
+        // likelihood, predicted CNV type, genotype, and whether SNPs were found
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, SVCandidate& sv_candidate);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
 
-        void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector<std::pair<SVCandidate, std::string>>& sv_list, std::string chr, bool inversion);
-
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index ed71237a..9e575771 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -115,181 +115,80 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
     return std::make_pair(snp_data, snps_found);
 }
 
-void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector<std::pair<SVCandidate, std::string>> &sv_list, std::string chr, bool inversion)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, SVCandidate& candidate)
 {
-    // Throw an error if there are more than two SV candidates
-    if (sv_list.size() > 2) {
-        throw std::runtime_error("Error: More than two SV candidates found for copy number prediction comparisons.");
-    }
-
-    // Add a dummy call to the SV list if there is only one SV candidate
-    if (sv_list.size() == 1) {
-        SVCandidate dummy(0, 0, ".");
-        sv_list.push_back(std::make_pair(dummy, "."));
-    }
-    
-    // Run copy number prediction for the SV pair and add only the SV
-    // candidate with the highest likelihood
-    SVCandidate& sv_one = sv_list[0].first;
-    SVCandidate& sv_two = sv_list[1].first;
-    std::tuple<int, double, SVType, std::string, bool> cnv_prediction = this->runCopyNumberPredictionPair(chr, sv_one, sv_two);
-
-    // Get the SV info
-    int best_index = std::get<0>(cnv_prediction);
-    SVCandidate& best_sv_candidate = sv_list[best_index].first;
-    int64_t start_pos = std::get<0>(best_sv_candidate);
-    int64_t end_pos = std::get<1>(best_sv_candidate);
-    std::string aln_type = sv_list[best_index].second;
-
-    // Get the prediction data
-    double best_likelihood = std::get<1>(cnv_prediction);
-    SVType best_cnv_type = std::get<2>(cnv_prediction);
-    std::string best_genotype = std::get<3>(cnv_prediction);
-    bool snps_found = std::get<4>(cnv_prediction);
-    if (snps_found)
-    {
-        aln_type += "_SNPS";
-    } else {
-        aln_type += "_NOSNPS";
-    }
-
-    // Update the SV type if inversion is detected and the best CNV type is
-    // copy neutral or duplication
-    if (inversion) // && (best_cnv_type == sv_types::NEUTRAL))
-    {
-        if (best_cnv_type == SVType::NEUTRAL)
-        {
-            best_cnv_type = SVType::INV;
-        } else if (best_cnv_type == SVType::DUP)
+    // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl;
+     // Get the start and end positions of the SV call
+    int64_t start_pos = std::get<0>(candidate);
+    int64_t end_pos = std::get<1>(candidate);
+
+    // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
+    // the SV length
+    int64_t sv_length = (end_pos - start_pos) / 2.0;
+    int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length);
+    int64_t snp_end_pos = end_pos + sv_length;
+
+    // Query the SNP region for the SV candidate
+    std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov);
+    SNPData& sv_snps = snp_call.first;
+    bool sv_snps_found = snp_call.second;
+
+    // Run the Viterbi algorithm
+    std::pair<std::vector<int>, double> prediction = runViterbi(this->hmm, sv_snps);
+    std::vector<int>& state_sequence = prediction.first;
+    double likelihood = prediction.second;
+
+    // Get all the states in the SV region
+    std::vector<int> sv_states;
+    for (size_t i = 0; i < state_sequence.size(); i++)
+    {
+        if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos)
         {
-            best_cnv_type = SVType::INV_DUP;
-            printMessage("INVDUP detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
+            sv_states.push_back(state_sequence[i]);
         }
-        // best_cnv_type = sv_types::INV;
-        // printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
     }
 
-    // If the dummy call was used, then throw an error if the best SV type
-    // is unknown
-    if (std::get<0>(best_sv_candidate) == 0 && std::get<1>(best_sv_candidate) == 0)
+    // Determine if there is a majority state within the SV region and if it
+    // is greater than 75%
+    double pct_threshold = 0.75;
+    int max_state = 0;
+    int max_count = 0;
+    for (int i = 0; i < 6; i++)
     {
-        throw std::runtime_error("Error: No valid SV type found for copy number prediction.");
+        int state_count = std::count(sv_states.begin(), sv_states.end(), i+1);
+        if (state_count > max_count)
+        {
+            max_state = i+1;
+            max_count = state_count;
+        }
     }
-
-    // Add the SV call to the main SV data
-    sv_calls.add(chr, start_pos, end_pos, best_cnv_type, ".", aln_type, best_genotype, best_likelihood);
-}
-
-std::tuple<int, double, SVType, std::string, bool> CNVCaller::runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two)
-{
-    // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl;
-    double best_likelihood = 0.0;
-    bool best_likelihood_set = false;
-    bool snps_found = false;
-    int best_index = 0;
-    std::pair<int64_t, int64_t> best_pos;
-    SNPData best_snp_data;
-    int current_index = 0;
+    
+    // Update SV type and genotype based on the majority state
     SVType predicted_cnv_type = SVType::UNKNOWN;
     std::string genotype = "./.";
-    for (const auto& sv_call : {sv_one, sv_two})
+    int state_count = (int) sv_states.size();
+    if ((double) max_count / (double) state_count > pct_threshold)
     {
-        // Get the SV candidate
-        const SVCandidate& candidate = sv_call;
-
-        // Get the start and end positions of the SV call
-        int64_t start_pos = std::get<0>(candidate);
-        int64_t end_pos = std::get<1>(candidate);
-
-        // Skip if the start position equals zero (dummy call)
-        if (start_pos == 0) {
-            continue;
-        }
-
-        // Get the depth at the start position, which is used as the FORMAT/DP
-        // value
-        // int dp_value = pos_depth_map[start_pos];
-
-        // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
-        // the SV length
-        int64_t sv_length = (end_pos - start_pos) / 2.0;
-        int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length);
-        int64_t snp_end_pos = end_pos + sv_length;
-
-        // Query the SNP region for the SV candidate
-        std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov);
-        SNPData& sv_snps = snp_call.first;
-        bool sv_snps_found = snp_call.second;
-
-        // Run the Viterbi algorithm
-        std::pair<std::vector<int>, double> prediction = runViterbi(this->hmm, sv_snps);
-        std::vector<int>& state_sequence = prediction.first;
-        double likelihood = prediction.second;
-
-        // Get all the states in the SV region
-        std::vector<int> sv_states;
-        for (size_t i = 0; i < state_sequence.size(); i++)
-        {
-            if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos)
-            {
-                sv_states.push_back(state_sequence[i]);
-            }
-        }
-
-        // Determine if there is a majority state within the SV region and if it
-        // is greater than 75%
-        double pct_threshold = 0.75;
-        int max_state = 0;
-        int max_count = 0;
-        for (int i = 0; i < 6; i++)
-        {
-            int state_count = std::count(sv_states.begin(), sv_states.end(), i+1);
-            if (state_count > max_count)
-            {
-                max_state = i+1;
-                max_count = state_count;
-            }
-        }
-        
-        // Update SV type and genotype based on the majority state
-        int state_count = (int) sv_states.size();
-        if ((double) max_count / (double) state_count > pct_threshold)
-        {
-            predicted_cnv_type = getSVTypeFromCNState(max_state);
-            genotype = cnv_genotype_map[max_state];
-        }
-
-        // Update the best SV call based on the likelihood
-        if (!best_likelihood_set || (likelihood > best_likelihood))
-        {
-            best_likelihood = likelihood;
-            best_likelihood_set = true;
-            snps_found = sv_snps_found;
-            best_index = current_index;
-
-            // Add the state sequence to the SNP data (avoid copying the data)
-            sv_snps.state_sequence = std::move(state_sequence);
-            best_snp_data = std::move(sv_snps);
-            best_pos = std::make_pair(start_pos, end_pos);
-        }
-        current_index++;
+        predicted_cnv_type = getSVTypeFromCNState(max_state);
+        genotype = cnv_genotype_map[max_state];
     }
+    sv_snps.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
 
     // Save the SV calls as a TSV file if enabled
-    int64_t sv_start_pos = std::get<0>(best_pos);
-    int64_t sv_end_pos = std::get<1>(best_pos);
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
-    if (this->input_data->getSaveCNVData() && copy_number_change && (sv_end_pos - sv_start_pos) > 10000)
+    if (this->input_data->getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000)
     {
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
-        std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) sv_start_pos) + "-" + std::to_string((int) sv_end_pos) + "_SPLITALN.tsv";
+        std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
         std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl;
-        this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood);
+        this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
+        // this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood);
     }
 
-    return std::make_tuple(best_index, best_likelihood, predicted_cnv_type, genotype, snps_found);
+    return std::make_tuple(likelihood, predicted_cnv_type, genotype, sv_snps_found);
 }
 
+
 SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo> &sv_candidates, int min_length)
 {
     SNPInfo& snp_info = this->snp_info;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index f20ddf69..6521fd77 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -16,6 +16,7 @@
 #include <chrono>
 #include <future>
 #include <cmath>
+#include <algorithm>
 
 #include "utils.h"
 #include "sv_types.h"
@@ -433,6 +434,11 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
         bool primary_strand = std::get<7>(primary_alignment);
 
+        // Sort the supplementary alignments by chr, start, and end
+        std::sort(supp_map[qname].begin(), supp_map[qname].end(), [](const AlignmentData& a, const AlignmentData& b) {
+            return std::get<0>(a) < std::get<0>(b) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) < std::get<1>(b)) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) == std::get<1>(b) && std::get<2>(a) < std::get<2>(b));
+        });
+
         // Loop through the supplementary alignments and find gaps and overlaps
         AlignmentVector supp_alignments = supp_map[qname];
         for (const auto& supp_alignment : supp_alignments) {
@@ -497,7 +503,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             // subsections = split_coordinates(entire_coordinate)
             // likelihoods_subsections = [hmm_model.predict_likelihood(sub) for sub in subsections]
 
-            // # Determine best likelihood from subsections
+            // # Determine best (or worst?) likelihood from subsections (also print all likelihoods for each component)
             // best_likelihood_split = max(likelihoods_subsections)
 
             // # Compare and decide
@@ -505,6 +511,10 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             //     best_choice = "entire coordinate"
             // else:
             //     best_choice = "split coordinates"
+            bool find_complex_events = true;
+            if (find_complex_events) {
+                // std::cout << "Complex event detection not implemented yet" << std::endl;
+            }
 
             // [1] Inversion detection from primary and supplementary alignments
             // on opposite strands
@@ -512,21 +522,34 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 // std::cout << "Inversion detected for read " << qname << std::endl;
                 // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl;
                 // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl;
-
-                std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
-
-                // Use the supplementary alignment coordinates as the SV
-                // endpoints
                 if (supp_end - supp_start >= min_cnv_length) {
                     SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
-                    std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "INVERSION");
-                    sv_list.push_back(sv_pair);
-                    sv_count++;
-                }
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                    double likelihood = std::get<0>(result);
+                    SVType cnv_type = std::get<1>(result);
+                    std::string genotype = std::get<2>(result);
+                    bool snps_found = std::get<3>(result);
+                    std::string aln_type = "LOG2";
+                    if (snps_found) {
+                        aln_type += "_SNPS";
+                    } else {
+                        aln_type += "_NOSNPS";
+                    }
 
-                // Determine which SV to keep based on HMM prediction likelihood
-                if (sv_list.size() > 0) {
-                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, true);
+                    // Update the SV type for inversions
+                    if (cnv_type == SVType::NEUTRAL) {
+                        cnv_type = SVType::INV;
+                    } else if (cnv_type == SVType::DUP) {
+                        cnv_type = SVType::INV_DUP;
+                    } else {
+                        cnv_type = SVType::UNKNOWN;
+                    }
+                    
+                    // Add the SV call to the main SV data if not unknown
+                    if (cnv_type != SVType::UNKNOWN) {
+                        sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
+                    }
+                    sv_count++;
                 }
             }
 
@@ -535,37 +558,54 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
                 // Gap with supplementary before primary:
                 // [supp_start] [supp_end] -- [primary_start] [primary_end]
-                std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
-
-                // Use the alignment ends as the SV endpoints
                 if (primary_end - supp_start >= min_cnv_length) {
                     SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
-                    std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPOUTER_A");
-                    sv_list.push_back(sv_pair);
-                    sv_count++;
-                }
 
-                // Determine which SV to keep based on HMM prediction likelihood
-                if (sv_list.size() > 0) {
-                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false);
+                    // Run copy number prediction for the SV candidate
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                    double likelihood = std::get<0>(result);
+                    SVType cnv_type = std::get<1>(result);
+                    std::string genotype = std::get<2>(result);
+                    bool snps_found = std::get<3>(result);
+                    std::string aln_type = "GAPOUTER_A";
+                    if (snps_found) {
+                        aln_type += "_SNPS";
+                    } else {
+                        aln_type += "_NOSNPS";
+                    }
+
+                    // Add the SV call to the main SV data if not unknown
+                    if (cnv_type != SVType::UNKNOWN) {
+                        sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood);
+                    }
+                    sv_count++;
                 }
                 
             } else if (supp_start > primary_end && supp_end > primary_end) {
                 // Gap with supplementary after primary:
                 // [primary_start] [primary_end] -- [supp_start] [supp_end]
-                std::vector<std::pair<SVCandidate, std::string>> sv_list;  // SV candidate and alignment type
 
-                // Use the alignment ends as the SV endpoints
                 if (supp_end - primary_start >= min_cnv_length) {
                     SVCandidate sv_candidate(primary_start+1, supp_end+1, ".");
-                    std::pair<SVCandidate, std::string> sv_pair(sv_candidate, "GAPOUTER_B");
-                    sv_list.push_back(sv_pair);
-                    sv_count++;
-                }
 
-                // Determine which SV to keep based on HMM prediction likelihood
-                if (sv_list.size() > 0) {
-                    cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false);
+                    // Run copy number prediction for the SV candidate
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                    double likelihood = std::get<0>(result);
+                    SVType cnv_type = std::get<1>(result);
+                    std::string genotype = std::get<2>(result);
+                    bool snps_found = std::get<3>(result);
+                    std::string aln_type = "GAPOUTER_B";
+                    if (snps_found) {
+                        aln_type += "_SNPS";
+                    } else {
+                        aln_type += "_NOSNPS";
+                    }
+
+                    // Add the SV call to the main SV data if not unknown
+                    if (cnv_type != SVType::UNKNOWN) {
+                        sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
+                    }
+                    sv_count++;
                 }
             }
         }
diff --git a/tests/test_general.py b/tests/test_general.py
index 9083eb6a..8f5d4006 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 23
+        assert len(f.readlines()) == 22
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.

From fa4f744bef6f8a9fb0ae4e9803e8f29fcad962b9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Nov 2024 16:02:32 -0400
Subject: [PATCH 009/134] Simplify split sv calling

---
 src/sv_caller.cpp | 313 ++++++++++++++++++++++------------------------
 1 file changed, 149 insertions(+), 164 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 6521fd77..3a3d3723 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -261,18 +261,15 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
         // Update match/mismatch query map
         if (op == BAM_CEQUAL) {
-            // match_count += op_len;
             for (int j = 0; j < op_len; j++) {
                 query_match_map[query_pos + j] = 1;
             }
         } else if (op == BAM_CDIFF) {
-            // mismatch_count += op_len;
             for (int j = 0; j < op_len; j++) {
                 query_match_map[query_pos + j] = 0;
             }
         } else if (op == BAM_CMATCH) {
-            // Compare read and reference sequences
-            // Get the sequence from the query
+            // Get the read sequence
             uint8_t* seq_ptr = bam_get_seq(alignment);
             std::string cmatch_seq_str = "";
             for (int j = 0; j < op_len; j++) {
@@ -433,180 +430,168 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         int32_t primary_query_end = std::get<5>(primary_alignment);
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
         bool primary_strand = std::get<7>(primary_alignment);
+        if (supp_map.find(qname) == supp_map.end()) {
+            continue;
+        }
 
-        // Sort the supplementary alignments by chr, start, and end
-        std::sort(supp_map[qname].begin(), supp_map[qname].end(), [](const AlignmentData& a, const AlignmentData& b) {
-            return std::get<0>(a) < std::get<0>(b) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) < std::get<1>(b)) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) == std::get<1>(b) && std::get<2>(a) < std::get<2>(b));
-        });
-
-        // Loop through the supplementary alignments and find gaps and overlaps
-        AlignmentVector supp_alignments = supp_map[qname];
-        for (const auto& supp_alignment : supp_alignments) {
-
-            // Skip supplementary alignments that are on a different chromosome
-            // for now (TODO: Use for identifying trans-chromosomal SVs such as
-            // translocations)
-            std::string supp_chr = std::get<0>(supp_alignment);
-            if (primary_chr != supp_chr) {
-                continue;
-            }
-            int32_t supp_start = std::get<1>(supp_alignment);
-            int32_t supp_end = std::get<2>(supp_alignment);
-            int32_t supp_query_start = std::get<4>(supp_alignment);
-            int32_t supp_query_end = std::get<5>(supp_alignment);
-            std::unordered_map<int, int> supp_match_map = std::get<6>(supp_alignment);
-            bool supp_strand = std::get<7>(supp_alignment);
-
-            // Resolve overlaps between the primary and supplementary query sequences
-            int32_t overlap_start = std::max(primary_query_start, supp_query_start);
-            int32_t overlap_end = std::min(primary_query_end, supp_query_end);
-            int32_t overlap_length = overlap_end - overlap_start;
-            if (overlap_length > 0) {
-                // std::cout << "Overlap detected for read " << qname << std::endl;
-                // std::cout << "Primary read position: " << primary_query_start << "-" << primary_query_end << std::endl;
-                // std::cout << "Supplementary read position: " << supp_query_start << "-" << supp_query_end << std::endl;
-                // std::cout << "Overlap range: " << overlap_start << "-" << overlap_end << std::endl;
-                // std::cout << "Overlap length: " << overlap_length << std::endl;
-                // std::cout << "Primary reference position: " << primary_start << "-" << primary_end << std::endl;
-                // std::cout << "Supplementary reference position: " << supp_start << "-" << supp_end << std::endl;
-
-                // Calculate the mismatch rate for each alignment at the overlap
-                double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
-                double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
-                // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
-                // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
-
-                // Trim the overlap from the alignment with the higher mismatch
-                // rate
-                if (primary_mismatch_rate > supp_mismatch_rate) {
-                    if (overlap_start == primary_query_start) {
-                        primary_start += overlap_length;
-                    } else if (overlap_end == primary_query_end) {
-                        primary_end -= overlap_length;
-                    }
-
-                } else {
-                    if (overlap_start == supp_query_start) {
-                        supp_start += overlap_length;
-                    } else if (overlap_end == supp_query_end) {
-                        supp_end -= overlap_length;
-                    }
+        // Find the largest alignment on the primary chromosome
+        AlignmentData supp_alignment = supp_map[qname][0];
+        int32_t largest_supp_length = 0;
+        auto largest_supp_it = supp_map[qname].end();
+        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();) {
+            const auto& supp_chr = std::get<0>(*it);
+            if (supp_chr != primary_chr) {
+                it = supp_map[qname].erase(it);
+            } else {
+                int32_t supp_length = std::get<2>(*it) - std::get<1>(*it);
+                if (supp_length > largest_supp_length) {
+                    largest_supp_length = supp_length;
+                    largest_supp_it = it;
                 }
+                ++it;
             }
+        }
+        if (largest_supp_it == supp_map[qname].end()) {
+            continue;  // No primary chromosome alignments
+        }
+        supp_alignment = *largest_supp_it;
+       
+        // Run SV detection from the primary and supplementary alignment
+        std::string supp_chr = std::get<0>(supp_alignment);
+        int32_t supp_start = std::get<1>(supp_alignment);
+        int32_t supp_end = std::get<2>(supp_alignment);
+        int32_t supp_query_start = std::get<4>(supp_alignment);
+        int32_t supp_query_end = std::get<5>(supp_alignment);
+        std::unordered_map<int, int> supp_match_map = std::get<6>(supp_alignment);
+        bool supp_strand = std::get<7>(supp_alignment);
+
+        // Resolve overlaps between the primary and supplementary query sequences
+        int32_t overlap_start = std::max(primary_query_start, supp_query_start);
+        int32_t overlap_end = std::min(primary_query_end, supp_query_end);
+        int32_t overlap_length = overlap_end - overlap_start;
+        if (overlap_length > 0) {
+            // std::cout << "Overlap detected for read " << qname << std::endl;
+            // std::cout << "Primary read position: " << primary_query_start << "-" << primary_query_end << std::endl;
+            // std::cout << "Supplementary read position: " << supp_query_start << "-" << supp_query_end << std::endl;
+            // std::cout << "Overlap range: " << overlap_start << "-" << overlap_end << std::endl;
+            // std::cout << "Overlap length: " << overlap_length << std::endl;
+            // std::cout << "Primary reference position: " << primary_start << "-" << primary_end << std::endl;
+            // std::cout << "Supplementary reference position: " << supp_start << "-" << supp_end << std::endl;
+
+            // Calculate the mismatch rate for each alignment at the overlap
+            double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
+            double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
+            // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
+            // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
+
+            // Trim the overlap from the alignment with the higher mismatch
+            // rate
+            if (primary_mismatch_rate > supp_mismatch_rate) {
+                if (overlap_start == primary_query_start) {
+                    primary_start += overlap_length;
+                } else if (overlap_end == primary_query_end) {
+                    primary_end -= overlap_length;
+                }
 
-            // TODO:
-            // if (find_complex_events)
-            // # Calculate likelihood for entire coordinate
-            // likelihood_entire = hmm_model.predict_likelihood(entire_coordinate)
-
-            // # Split coordinates into smaller sections and calculate likelihoods
-            // subsections = split_coordinates(entire_coordinate)
-            // likelihoods_subsections = [hmm_model.predict_likelihood(sub) for sub in subsections]
-
-            // # Determine best (or worst?) likelihood from subsections (also print all likelihoods for each component)
-            // best_likelihood_split = max(likelihoods_subsections)
-
-            // # Compare and decide
-            // if likelihood_entire > best_likelihood_split:
-            //     best_choice = "entire coordinate"
-            // else:
-            //     best_choice = "split coordinates"
-            bool find_complex_events = true;
-            if (find_complex_events) {
-                // std::cout << "Complex event detection not implemented yet" << std::endl;
+            } else {
+                if (overlap_start == supp_query_start) {
+                    supp_start += overlap_length;
+                } else if (overlap_end == supp_query_end) {
+                    supp_end -= overlap_length;
+                }
             }
+        }
 
-            // [1] Inversion detection from primary and supplementary alignments
-            // on opposite strands
-            if (primary_strand != supp_strand) {
-                // std::cout << "Inversion detected for read " << qname << std::endl;
-                // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl;
-                // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl;
-                if (supp_end - supp_start >= min_cnv_length) {
-                    SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
-                    double likelihood = std::get<0>(result);
-                    SVType cnv_type = std::get<1>(result);
-                    std::string genotype = std::get<2>(result);
-                    bool snps_found = std::get<3>(result);
-                    std::string aln_type = "LOG2";
-                    if (snps_found) {
-                        aln_type += "_SNPS";
-                    } else {
-                        aln_type += "_NOSNPS";
-                    }
+        // [1] Inversion detection from primary and supplementary alignments
+        // on opposite strands
+        if (primary_strand != supp_strand) {
+            // std::cout << "Inversion detected for read " << qname << std::endl;
+            // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl;
+            // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl;
+            if (supp_end - supp_start >= min_cnv_length) {
+                SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                double likelihood = std::get<0>(result);
+                SVType cnv_type = std::get<1>(result);
+                std::string genotype = std::get<2>(result);
+                bool snps_found = std::get<3>(result);
+                std::string aln_type = "LOG2";
+                if (snps_found) {
+                    aln_type += "_SNPS";
+                } else {
+                    aln_type += "_NOSNPS";
+                }
 
-                    // Update the SV type for inversions
-                    if (cnv_type == SVType::NEUTRAL) {
-                        cnv_type = SVType::INV;
-                    } else if (cnv_type == SVType::DUP) {
-                        cnv_type = SVType::INV_DUP;
-                    } else {
-                        cnv_type = SVType::UNKNOWN;
-                    }
-                    
-                    // Add the SV call to the main SV data if not unknown
-                    if (cnv_type != SVType::UNKNOWN) {
-                        sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
-                    }
-                    sv_count++;
+                // Update the SV type for inversions
+                if (cnv_type == SVType::NEUTRAL) {
+                    cnv_type = SVType::INV;
+                } else if (cnv_type == SVType::DUP) {
+                    cnv_type = SVType::INV_DUP;
+                } else {
+                    cnv_type = SVType::UNKNOWN;
+                }
+                
+                // Add the SV call to the main SV data if not unknown
+                if (cnv_type != SVType::UNKNOWN) {
+                    sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
                 }
+                sv_count++;
             }
+        }
 
-            // [2] CNV detection based on primary and supplementary alignment boundaries
-            else if (supp_start < primary_start && supp_end < primary_start) {
-
-                // Gap with supplementary before primary:
-                // [supp_start] [supp_end] -- [primary_start] [primary_end]
-                if (primary_end - supp_start >= min_cnv_length) {
-                    SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
-
-                    // Run copy number prediction for the SV candidate
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
-                    double likelihood = std::get<0>(result);
-                    SVType cnv_type = std::get<1>(result);
-                    std::string genotype = std::get<2>(result);
-                    bool snps_found = std::get<3>(result);
-                    std::string aln_type = "GAPOUTER_A";
-                    if (snps_found) {
-                        aln_type += "_SNPS";
-                    } else {
-                        aln_type += "_NOSNPS";
-                    }
+        // [2] CNV detection based on primary and supplementary alignment boundaries
+        else if (supp_start < primary_start && supp_end < primary_start) {
+
+            // Gap with supplementary before primary:
+            // [supp_start] [supp_end] -- [primary_start] [primary_end]
+            if (primary_end - supp_start >= min_cnv_length) {
+                SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
+
+                // Run copy number prediction for the SV candidate
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                double likelihood = std::get<0>(result);
+                SVType cnv_type = std::get<1>(result);
+                std::string genotype = std::get<2>(result);
+                bool snps_found = std::get<3>(result);
+                std::string aln_type = "GAPOUTER_A";
+                if (snps_found) {
+                    aln_type += "_SNPS";
+                } else {
+                    aln_type += "_NOSNPS";
+                }
 
-                    // Add the SV call to the main SV data if not unknown
-                    if (cnv_type != SVType::UNKNOWN) {
-                        sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood);
-                    }
-                    sv_count++;
+                // Add the SV call to the main SV data if not unknown
+                if (cnv_type != SVType::UNKNOWN) {
+                    sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood);
+                }
+                sv_count++;
+            }
+            
+        } else if (supp_start > primary_end && supp_end > primary_end) {
+            // Gap with supplementary after primary:
+            // [primary_start] [primary_end] -- [supp_start] [supp_end]
+
+            if (supp_end - primary_start >= min_cnv_length) {
+                SVCandidate sv_candidate(primary_start+1, supp_end+1, ".");
+
+                // Run copy number prediction for the SV candidate
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                double likelihood = std::get<0>(result);
+                SVType cnv_type = std::get<1>(result);
+                std::string genotype = std::get<2>(result);
+                bool snps_found = std::get<3>(result);
+                std::string aln_type = "GAPOUTER_B";
+                if (snps_found) {
+                    aln_type += "_SNPS";
+                } else {
+                    aln_type += "_NOSNPS";
                 }
-                
-            } else if (supp_start > primary_end && supp_end > primary_end) {
-                // Gap with supplementary after primary:
-                // [primary_start] [primary_end] -- [supp_start] [supp_end]
-
-                if (supp_end - primary_start >= min_cnv_length) {
-                    SVCandidate sv_candidate(primary_start+1, supp_end+1, ".");
-
-                    // Run copy number prediction for the SV candidate
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
-                    double likelihood = std::get<0>(result);
-                    SVType cnv_type = std::get<1>(result);
-                    std::string genotype = std::get<2>(result);
-                    bool snps_found = std::get<3>(result);
-                    std::string aln_type = "GAPOUTER_B";
-                    if (snps_found) {
-                        aln_type += "_SNPS";
-                    } else {
-                        aln_type += "_NOSNPS";
-                    }
 
-                    // Add the SV call to the main SV data if not unknown
-                    if (cnv_type != SVType::UNKNOWN) {
-                        sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
-                    }
-                    sv_count++;
+                // Add the SV call to the main SV data if not unknown
+                if (cnv_type != SVType::UNKNOWN) {
+                    sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
                 }
+                sv_count++;
             }
         }
     }

From 956e7e51fcc28967a612b468dbe7dcf606256e1c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 4 Nov 2024 12:59:45 -0500
Subject: [PATCH 010/134] Fix GT error and reduce fps

---
 include/cnv_caller.h  |   4 +-
 include/sv_types.h    |  21 +--
 python/sv_merger.py   |   7 +-
 src/cnv_caller.cpp    |   3 +-
 src/sv_caller.cpp     | 367 ++++++++++++++++++++++++++++++------------
 src/sv_data.cpp       |   6 +
 tests/test_general.py |   2 +-
 7 files changed, 286 insertions(+), 124 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 86f0abf9..af211fc8 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -61,7 +61,7 @@ class CNVCaller {
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
         // Each of the 6 state predictions corresponds to a copy number state
         // (0=No predicted state)
-        // 0: 1/1 (Normal diploid: no copy number change, GT: 1/1)
+        // 0: Unknown (No predicted state)
         // 1: 0/0 (Two copy loss: homozygous deletion, GT: 0/0)
         // 2: 1/0 (One copy loss: heterozygous deletion, GT: 0/1)
         // 3: 1/1 (Normal diploid: no copy number change, GT: 1/1)
@@ -69,7 +69,7 @@ class CNVCaller {
         // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1)
         // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1)
         std ::map<int, std::string> cnv_genotype_map = {
-            {0, "1/1"},
+            {0, "./."},
             {1, "0/0"},
             {2, "0/1"},
             {3, "1/1"},
diff --git a/include/sv_types.h b/include/sv_types.h
index 97b49185..335a4033 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -20,7 +20,8 @@ namespace sv_types {
         INS = 3,
         BND = 4,
         NEUTRAL = 5,  // Neutral copy number with unknown type
-        INV_DUP = 6  // Inversion duplication
+        INV_DUP = 6,  // Inversion duplication
+        COMPLEX = 7  // Complex SV
     };
 
     // Mapping of SV types to strings
@@ -32,7 +33,8 @@ namespace sv_types {
         {SVType::INS, "INS"},
         {SVType::BND, "BND"},
         {SVType::NEUTRAL, "NEUTRAL"},
-        {SVType::INV_DUP, "INV_DUP"}
+        {SVType::INV_DUP, "INVDUP"},
+        {SVType::COMPLEX, "COMPLEX"}
     };
 
     // Mapping of 6 copy number states to SV types
@@ -56,18 +58,6 @@ namespace sv_types {
         return CNVTypeMap.at(cn_state);
     }
 
-    // static const int UNKNOWN = -1;
-    // static const int DEL = 0;
-    // static const int DUP = 1;
-    // static const int INV = 2;
-    // static const int INS = 3;
-    // static const int BND = 4;
-    // static const int NEUTRAL = 5;  // Neutral copy number with unknown type
-    // static const int INV_DUP = 6;  // Inversion duplication
-
-    // // Define SVTypeString for SV types (for VCF output)
-    // static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT", "INVDUP"};
-
     // Create a struct for storing SV information
     struct SVInfo {
         SVType sv_type;
@@ -79,9 +69,6 @@ namespace sv_types {
         double hmm_likelihood = 0.0;  // HMM likelihood score for the state sequence
 
         SVInfo() = default;
-        // SVInfo() :
-        //     sv_type(-1), read_support(0), read_depth(0), data_type({}), sv_length(0), genotype("./."), hmm_likelihood(0.0){}
-            
         SVInfo(SVType sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) :
             sv_type(sv_type), read_support(read_support), read_depth(read_depth), data_type({data_type}), sv_length(sv_length), genotype(genotype), hmm_likelihood(hmm_likelihood) {}
     };
diff --git a/python/sv_merger.py b/python/sv_merger.py
index 56c0ae26..d2c59977 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -137,6 +137,10 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
     cluster_labels = []
 
     # dbscan = DBSCAN(eps=30000, min_samples=3)
+    
+    if len(breakpoints) == 1:
+        return merged_records
+    
     logging.info("Clustering %d SV breakpoints with parameters: min_cluster_size=%d", len(breakpoints), cluster_size_min)
     dbscan = HDBSCAN(min_cluster_size=cluster_size_min, min_samples=2)
     if len(breakpoints) > 0:
@@ -144,6 +148,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
         cluster_labels = dbscan.fit_predict(breakpoints)
 
         logging.info("Label counts: %d", len(np.unique(cluster_labels)))
+       
 
     # Merge SVs with the same label
     unique_labels = np.unique(cluster_labels)
@@ -421,4 +426,4 @@ def sv_merger(vcf_file_path, cluster_size_min=3, suffix='.merged'):
 
     # DBSCAN 
     sv_merger(vcf_file_path, cluster_size_min=cluster_size_min, suffix=suffix)
-    
\ No newline at end of file
+    
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 9e575771..e052f45f 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -182,9 +182,8 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
         std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
         std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl;
         this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
-        // this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood);
     }
-
+    
     return std::make_tuple(likelihood, predicted_cnv_type, genotype, sv_snps_found);
 }
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3a3d3723..53f20ffb 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -434,121 +434,286 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             continue;
         }
 
-        // Find the largest alignment on the primary chromosome
-        AlignmentData supp_alignment = supp_map[qname][0];
-        int32_t largest_supp_length = 0;
-        auto largest_supp_it = supp_map[qname].end();
+        // Resolve overlaps between the primary and supplementary query
+        // sequences
+        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
+            std::string supp_chr = std::get<0>(*it);
+            // int32_t supp_start = std::get<1>(*it);
+            // int32_t supp_end = std::get<2>(*it);
+            int32_t supp_query_start = std::get<4>(*it);
+            int32_t supp_query_end = std::get<5>(*it);
+            std::unordered_map<int, int> supp_match_map = std::get<6>(*it);
+            bool supp_strand = std::get<7>(*it);
+
+            // Resolve overlaps between the primary and supplementary query
+            // sequences
+            int32_t overlap_start = std::max(primary_query_start, supp_query_start);
+            int32_t overlap_end = std::min(primary_query_end, supp_query_end);
+            int32_t overlap_length = overlap_end - overlap_start;
+            if (overlap_length > 0) {
+
+                // Calculate the mismatch rate for each alignment at the overlap
+                double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
+                double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
+                // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
+                // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
+
+                // Trim the overlap from the alignment with the higher mismatch
+                // rate
+                if (primary_mismatch_rate > supp_mismatch_rate) {
+                    if (overlap_start == primary_query_start) {
+                        primary_start += overlap_length;
+                    } else if (overlap_end == primary_query_end) {
+                        primary_end -= overlap_length;
+                    }
+
+                } else {
+                    if (overlap_start == supp_query_start) {
+                        // supp_start += overlap_length;
+                        // Update the value in the supp map
+                        std::get<1>(*it) += overlap_length;
+                    } else if (overlap_end == supp_query_end) {
+                        // supp_end -= overlap_length;
+                        // Update the value in the supp map
+                        std::get<2>(*it) -= overlap_length;
+                    }
+                }
+            }
+        }
+
+        // Remove supplementary alignments that are not on the same chromosome
+        // as the primary alignment
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();) {
-            const auto& supp_chr = std::get<0>(*it);
-            if (supp_chr != primary_chr) {
+            if (std::get<0>(*it) != primary_chr) {
                 it = supp_map[qname].erase(it);
             } else {
-                int32_t supp_length = std::get<2>(*it) - std::get<1>(*it);
-                if (supp_length > largest_supp_length) {
-                    largest_supp_length = supp_length;
-                    largest_supp_it = it;
-                }
                 ++it;
             }
         }
-        if (largest_supp_it == supp_map[qname].end()) {
-            continue;  // No primary chromosome alignments
-        }
-        supp_alignment = *largest_supp_it;
-       
-        // Run SV detection from the primary and supplementary alignment
-        std::string supp_chr = std::get<0>(supp_alignment);
-        int32_t supp_start = std::get<1>(supp_alignment);
-        int32_t supp_end = std::get<2>(supp_alignment);
-        int32_t supp_query_start = std::get<4>(supp_alignment);
-        int32_t supp_query_end = std::get<5>(supp_alignment);
-        std::unordered_map<int, int> supp_match_map = std::get<6>(supp_alignment);
-        bool supp_strand = std::get<7>(supp_alignment);
-
-        // Resolve overlaps between the primary and supplementary query sequences
-        int32_t overlap_start = std::max(primary_query_start, supp_query_start);
-        int32_t overlap_end = std::min(primary_query_end, supp_query_end);
-        int32_t overlap_length = overlap_end - overlap_start;
-        if (overlap_length > 0) {
-            // std::cout << "Overlap detected for read " << qname << std::endl;
-            // std::cout << "Primary read position: " << primary_query_start << "-" << primary_query_end << std::endl;
-            // std::cout << "Supplementary read position: " << supp_query_start << "-" << supp_query_end << std::endl;
-            // std::cout << "Overlap range: " << overlap_start << "-" << overlap_end << std::endl;
-            // std::cout << "Overlap length: " << overlap_length << std::endl;
-            // std::cout << "Primary reference position: " << primary_start << "-" << primary_end << std::endl;
-            // std::cout << "Supplementary reference position: " << supp_start << "-" << supp_end << std::endl;
-
-            // Calculate the mismatch rate for each alignment at the overlap
-            double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
-            double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
-            // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
-            // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
-
-            // Trim the overlap from the alignment with the higher mismatch
-            // rate
-            if (primary_mismatch_rate > supp_mismatch_rate) {
-                if (overlap_start == primary_query_start) {
-                    primary_start += overlap_length;
-                } else if (overlap_end == primary_query_end) {
-                    primary_end -= overlap_length;
-                }
 
-            } else {
-                if (overlap_start == supp_query_start) {
-                    supp_start += overlap_length;
-                } else if (overlap_end == supp_query_end) {
-                    supp_end -= overlap_length;
-                }
+        // Loop through the supplementary alignments, find the largest
+        // supplementary alignment, and the closest non-overlapping
+        // supplementary alignment to the primary alignment
+        AlignmentData largest_supp_alignment = supp_map[qname][0];
+        AlignmentData closest_supp_alignment = supp_map[qname][0];
+        int32_t largest_supp_length = 0;
+        int32_t closest_supp_distance = std::numeric_limits<int32_t>::max();
+        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
+            const auto& supp_chr = std::get<0>(*it);
+            int32_t supp_start = std::get<1>(*it);
+            int32_t supp_end = std::get<2>(*it);
+            int32_t supp_length = supp_end - supp_start;
+            int32_t supp_distance = std::numeric_limits<int32_t>::max();
+            if (supp_start > primary_end) {
+                supp_distance = supp_start - primary_end;
+            } else if (supp_end < primary_start) {
+                supp_distance = primary_start - supp_end;
+            }
+            if (supp_length > largest_supp_length) {
+                largest_supp_length = supp_length;
+                largest_supp_alignment = *it;
+            }
+            if (supp_distance < closest_supp_distance) {
+                closest_supp_distance = supp_distance;
+                closest_supp_alignment = *it;
             }
         }
 
-        // [1] Inversion detection from primary and supplementary alignments
-        // on opposite strands
-        if (primary_strand != supp_strand) {
-            // std::cout << "Inversion detected for read " << qname << std::endl;
-            // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl;
-            // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl;
-            if (supp_end - supp_start >= min_cnv_length) {
-                SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
-                double likelihood = std::get<0>(result);
-                SVType cnv_type = std::get<1>(result);
-                std::string genotype = std::get<2>(result);
-                bool snps_found = std::get<3>(result);
-                std::string aln_type = "LOG2";
-                if (snps_found) {
-                    aln_type += "_SNPS";
-                } else {
-                    aln_type += "_NOSNPS";
-                }
+        // Find if there are any reverse strand alignments between the primary
+        // and supplementary alignment
+        bool complex_sv_found = false;
+        int32_t largest_supp_start = std::get<1>(largest_supp_alignment);
+        int32_t largest_supp_end = std::get<2>(largest_supp_alignment);
+        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
+            if (std::get<7>(*it) != std::get<7>(primary_alignment)) {  // Check if the strands are different
+                // Check if it is between the primary and supplementary
+                // alignment
+                int32_t rev_supp_start = std::get<1>(*it);
+                int32_t rev_supp_end = std::get<2>(*it);
+                if ((rev_supp_start > primary_end && rev_supp_end < largest_supp_start) || (rev_supp_start > largest_supp_end && rev_supp_end < primary_start)) {
+                    // [primary_end] -- [supp_reverse] -- [supp_start]
+                    // Or: [supp_end] -- [supp_reverse] -- [primary_start]
+
+                    // Detect CNVs at the primary alignment
+                    SVType primary_type = SVType::UNKNOWN;
+                    if (primary_end - primary_start >= this->input_data->getMinCNVLength()) {
+                        SVCandidate sv_candidate(primary_start+1, primary_end+1, ".");
+                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                        double primary_likelihood = std::get<0>(result);
+                        primary_type = std::get<1>(result);
+
+                        // Break if prediction is unknown
+                        if (primary_type == SVType::UNKNOWN) {
+                            continue;
+                        }
+                    }
 
-                // Update the SV type for inversions
-                if (cnv_type == SVType::NEUTRAL) {
-                    cnv_type = SVType::INV;
-                } else if (cnv_type == SVType::DUP) {
-                    cnv_type = SVType::INV_DUP;
-                } else {
-                    cnv_type = SVType::UNKNOWN;
-                }
-                
-                // Add the SV call to the main SV data if not unknown
-                if (cnv_type != SVType::UNKNOWN) {
-                    sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
+                    // Detect CNVs at the largest supplementary alignment
+                    SVType largest_supp_type = SVType::UNKNOWN;
+                    if (largest_supp_end - largest_supp_start >= this->input_data->getMinCNVLength()) {
+                        SVCandidate sv_candidate(largest_supp_start+1, largest_supp_end+1, ".");
+                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                        double largest_supp_likelihood = std::get<0>(result);
+                        largest_supp_type = std::get<1>(result);
+                        
+                        // Break if prediction is unknown
+                        if (largest_supp_type == SVType::UNKNOWN) {
+                            continue;
+                        }
+                    }
+
+                    // Predict between the primary and largest supplementary
+                    // alignment
+                    int32_t left_start = std::min(primary_end, largest_supp_end);
+                    int32_t right_end = std::max(primary_start, largest_supp_start);
+
+                    // Detect CNVs between the left and reverse supplementary
+                    // alignment
+                    SVType left_type = SVType::UNKNOWN;
+                    // if (rev_supp_start - primary_end >=
+                    // this->input_data->getMinCNVLength()) {
+                    if (rev_supp_start - left_start >= this->input_data->getMinCNVLength()) {
+                        // SVCandidate sv_candidate(primary_end+1,
+                        // rev_supp_start+1, ".");
+                        SVCandidate sv_candidate(left_start+1, rev_supp_start+1, ".");
+                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                        double left_likelihood = std::get<0>(result);
+                        left_type = std::get<1>(result);
+
+                        // Break if prediction is unknown
+                        if (left_type == SVType::UNKNOWN) {
+                            continue;
+                        }
+                    }
+
+                    // Detect CNVs at the reverse alignment
+                    SVType rev_type = SVType::UNKNOWN;
+                    if (rev_supp_end - rev_supp_start >= this->input_data->getMinCNVLength()) {
+                        SVCandidate sv_candidate(rev_supp_start+1, rev_supp_end+1, ".");
+                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                        double inv_likelihood = std::get<0>(result);
+                        rev_type = std::get<1>(result);
+                        if (rev_type == SVType::NEUTRAL) {
+                            rev_type = SVType::INV;
+                        } else if (rev_type == SVType::DUP) {
+                            rev_type = SVType::INV_DUP;
+                        }
+
+                        // Break if prediction is unknown
+                        if (rev_type == SVType::UNKNOWN) {
+                            continue;
+                        }
+                    }
+
+                    // Detect CNVs between the reverse supplementary and the
+                    // supplementary alignment (right side)
+                    SVType right_type = SVType::UNKNOWN;
+                    // if (supp_start - rev_supp_end >=
+                    // this->input_data->getMinCNVLength()) {
+                    if (right_end - rev_supp_end >= this->input_data->getMinCNVLength()) {
+                        SVCandidate sv_candidate(rev_supp_end+1, right_end+1, ".");
+                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                        double right_likelihood = std::get<0>(result);
+                        right_type = std::get<1>(result);
+
+                        // Break if prediction is unknown
+                        if (right_type == SVType::UNKNOWN) {
+                            continue;
+                        }
+                    }
+
+                    // Resolve the SV type and coordinates
+                    std::string sv_type_str = "";
+                    int32_t sv_start_pos = left_start;
+                    int32_t sv_end_pos = left_start;
+
+                    // Alignment predictions
+                    if (primary_start < left_start) {
+                        if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) {
+                            sv_type_str += getSVTypeString(primary_type) + "+";
+                            sv_end_pos = primary_end;
+                        } else {
+                            sv_start_pos = primary_start;
+                        }
+                    } else {
+                        if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) {
+                            sv_type_str += getSVTypeString(largest_supp_type) + "+";
+                            sv_end_pos = largest_supp_end;
+                        } else {
+                            sv_start_pos = largest_supp_start;
+                        }
+                    }
+
+                    // Between-alignments predictions
+                    if (left_type != SVType::NEUTRAL && left_type != SVType::UNKNOWN) {
+                        sv_type_str += getSVTypeString(left_type) + "+";
+                        sv_end_pos = rev_supp_start;
+                    } else {
+                        sv_start_pos = rev_supp_start;
+                    }
+
+                    if (rev_type != SVType::NEUTRAL && rev_type != SVType::UNKNOWN) {
+                        sv_type_str += getSVTypeString(rev_type) + "+";
+                        sv_end_pos = rev_supp_end;
+                    } else {
+                        sv_start_pos = rev_supp_end;
+                    }
+
+                    if (right_type != SVType::NEUTRAL && right_type != SVType::UNKNOWN) {
+                        sv_end_pos = right_end;
+                        sv_type_str += getSVTypeString(right_type) + "+";
+                    }
+
+                    // Alignments predictions
+                    if (primary_end > right_end) {
+                        if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) {
+                            sv_type_str += getSVTypeString(primary_type) + "+";
+                            sv_end_pos = primary_end;
+                        } else {
+                            sv_start_pos = primary_start;
+                        }
+                    } else {
+                        if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) {
+                            sv_type_str += getSVTypeString(largest_supp_type) + "+";
+                            sv_end_pos = largest_supp_end;
+                        } else {
+                            sv_start_pos = largest_supp_start;
+                        }
+                    }
+
+                    if (sv_type_str != "") {
+                        sv_type_str.pop_back();  // Remove the last '+'
+
+                        // Add the complex SV
+                        complex_sv_found = true;
+                        std::cout << "Complex SV detected of type " << sv_type_str << " at positions " << primary_chr << ":" << left_start << "-" << right_end << std::endl;
+                        sv_count++;
+
+                        // Add the complex SV
+                        sv_calls.add(primary_chr, sv_start_pos+2, sv_end_pos+1, SVType::COMPLEX, ".", "COMPLEX", "./.", 0.0);
+                    }
                 }
-                sv_count++;
             }
         }
 
-        // [2] CNV detection based on primary and supplementary alignment boundaries
-        else if (supp_start < primary_start && supp_end < primary_start) {
+        if (complex_sv_found) {
+            continue;  // Continue to the next alignment
+        }
+
+        // [2] CNV detection based on primary and largest supplementary
+        // alignment boundaries
+        // else if (largest_supp_start < primary_start && largest_supp_end <
+        // primary_start) {
+        std::string largest_supp_chr = std::get<0>(largest_supp_alignment);
+        if (largest_supp_start < primary_start && largest_supp_end < primary_start) {
 
             // Gap with supplementary before primary:
             // [supp_start] [supp_end] -- [primary_start] [primary_end]
-            if (primary_end - supp_start >= min_cnv_length) {
-                SVCandidate sv_candidate(supp_start+1, primary_end+1, ".");
+            if (primary_end - largest_supp_start >= min_cnv_length) {
+                SVCandidate sv_candidate(largest_supp_start+1, primary_end+1, ".");
 
                 // Run copy number prediction for the SV candidate
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate);
                 double likelihood = std::get<0>(result);
                 SVType cnv_type = std::get<1>(result);
                 std::string genotype = std::get<2>(result);
@@ -562,20 +727,20 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
                 // Add the SV call to the main SV data if not unknown
                 if (cnv_type != SVType::UNKNOWN) {
-                    sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood);
+                    sv_calls.add(largest_supp_chr, largest_supp_start+1, primary_end+1, cnv_type, ".", aln_type, genotype, likelihood);
                 }
                 sv_count++;
             }
             
-        } else if (supp_start > primary_end && supp_end > primary_end) {
+        } else if (largest_supp_start > primary_end && largest_supp_end > primary_end) {
             // Gap with supplementary after primary:
             // [primary_start] [primary_end] -- [supp_start] [supp_end]
 
-            if (supp_end - primary_start >= min_cnv_length) {
-                SVCandidate sv_candidate(primary_start+1, supp_end+1, ".");
+            if (largest_supp_end - primary_start >= min_cnv_length) {
+                SVCandidate sv_candidate(primary_start+1, largest_supp_end+1, ".");
 
                 // Run copy number prediction for the SV candidate
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate);
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate);
                 double likelihood = std::get<0>(result);
                 SVType cnv_type = std::get<1>(result);
                 std::string genotype = std::get<2>(result);
@@ -589,7 +754,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
                 // Add the SV call to the main SV data if not unknown
                 if (cnv_type != SVType::UNKNOWN) {
-                    sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood);
+                    sv_calls.add(largest_supp_chr, primary_start+1, largest_supp_end+1, cnv_type, ".", aln_type, genotype, likelihood);
                 }
                 sv_count++;
             }
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 797ef704..18ea8056 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -10,6 +10,12 @@
 
 int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
 {
+    // Throw an error if the genotype is not valid
+    if (genotype != "./." && genotype != "0/0" && genotype != "0/1" && genotype != "1/1") {
+        std::cerr << "Error: Invalid genotype " << genotype << std::endl;
+        return -1;
+    }
+
     // Check if the alternate allele contains ambiguous bases
     const std::unordered_set<char> ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'};
     for (char c : alt_allele) {
diff --git a/tests/test_general.py b/tests/test_general.py
index 8f5d4006..499805c8 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 22
+        assert len(f.readlines()) == 21
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.

From e88f5629b838511fe429331c656683194aecdb3b Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 11 Nov 2024 16:22:13 -0500
Subject: [PATCH 011/134] Fix alt allele and likelihood comparisons

---
 include/sv_data.h     |   2 +-
 include/sv_types.h    |   2 +-
 src/khmm.cpp          |   6 +-
 src/sv_caller.cpp     | 339 ++++++++++++++----------------------------
 src/sv_data.cpp       |   9 +-
 tests/test_general.py |   2 +-
 6 files changed, 128 insertions(+), 232 deletions(-)

diff --git a/include/sv_data.h b/include/sv_data.h
index 548d6513..c321bf64 100644
--- a/include/sv_data.h
+++ b/include/sv_data.h
@@ -36,7 +36,7 @@ class SVData {
     public:
         SVData() {};
 
-        int add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
+        int add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
 
         void concatenate(const SVData& sv_data);
 
diff --git a/include/sv_types.h b/include/sv_types.h
index 335a4033..f58e6f7b 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -74,7 +74,7 @@ namespace sv_types {
     };
 
     // Type definition for SV-related structures
-    using SVCandidate = std::tuple<int64_t, int64_t, std::string>;  // SV (start, end, alt_allele)
+    using SVCandidate = std::tuple<int32_t, int32_t, std::string>;  // SV (start, end, alt_allele)
     using SVDepthMap = std::unordered_map<std::string, std::map<SVCandidate, SVInfo>>;  // Chromosome -> SV candidate -> SV info
 }
 
diff --git a/src/khmm.cpp b/src/khmm.cpp
index a5d553be..375325a1 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -378,8 +378,12 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	free_dmatrix(biot, 1, hmm.N, 1, T);
 	free_dmatrix(A1, 1, hmm.N, 1, hmm.N);
 
+	// Normalize the log likelihood by the sample size
+	double min_prob_normalized = min_prob / (double)T;
+
 	// Return the state sequence and its likelihood
-	return std::make_pair(q, min_prob);
+	// return std::make_pair(q, min_prob);
+	return std::make_pair(q, min_prob_normalized);
 }
 
 CHMM ReadCHMM(const char *filename)
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 53f20ffb..12c6d5f4 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -491,6 +491,17 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             }
         }
 
+        // Run copy number variant predictions on the primary alignment
+        SVType primary_type = SVType::UNKNOWN;
+        double primary_log_likelihood = std::numeric_limits<double>::lowest();
+        if (primary_end - primary_start >= min_cnv_length) {
+            SVCandidate sv_candidate(primary_start+1, primary_end+1, ".");
+            std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+            primary_log_likelihood = std::get<0>(result);
+            // primary_log_likelihood /= (double)(primary_end - primary_start);  // Normalize the log likelihood by the length
+            primary_type = std::get<1>(result);
+        }
+
         // Loop through the supplementary alignments, find the largest
         // supplementary alignment, and the closest non-overlapping
         // supplementary alignment to the primary alignment
@@ -498,6 +509,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         AlignmentData closest_supp_alignment = supp_map[qname][0];
         int32_t largest_supp_length = 0;
         int32_t closest_supp_distance = std::numeric_limits<int32_t>::max();
+        int32_t closest_supp_length = 0;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
             const auto& supp_chr = std::get<0>(*it);
             int32_t supp_start = std::get<1>(*it);
@@ -514,249 +526,124 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                 largest_supp_alignment = *it;
             }
             if (supp_distance < closest_supp_distance) {
-                closest_supp_distance = supp_distance;
+                closest_supp_length = supp_length;
                 closest_supp_alignment = *it;
+                closest_supp_distance = supp_distance;
             }
         }
 
-        // Find if there are any reverse strand alignments between the primary
-        // and supplementary alignment
-        bool complex_sv_found = false;
-        int32_t largest_supp_start = std::get<1>(largest_supp_alignment);
-        int32_t largest_supp_end = std::get<2>(largest_supp_alignment);
-        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            if (std::get<7>(*it) != std::get<7>(primary_alignment)) {  // Check if the strands are different
-                // Check if it is between the primary and supplementary
-                // alignment
-                int32_t rev_supp_start = std::get<1>(*it);
-                int32_t rev_supp_end = std::get<2>(*it);
-                if ((rev_supp_start > primary_end && rev_supp_end < largest_supp_start) || (rev_supp_start > largest_supp_end && rev_supp_end < primary_start)) {
-                    // [primary_end] -- [supp_reverse] -- [supp_start]
-                    // Or: [supp_end] -- [supp_reverse] -- [primary_start]
-
-                    // Detect CNVs at the primary alignment
-                    SVType primary_type = SVType::UNKNOWN;
-                    if (primary_end - primary_start >= this->input_data->getMinCNVLength()) {
-                        SVCandidate sv_candidate(primary_start+1, primary_end+1, ".");
-                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                        double primary_likelihood = std::get<0>(result);
-                        primary_type = std::get<1>(result);
-
-                        // Break if prediction is unknown
-                        if (primary_type == SVType::UNKNOWN) {
-                            continue;
-                        }
-                    }
-
-                    // Detect CNVs at the largest supplementary alignment
-                    SVType largest_supp_type = SVType::UNKNOWN;
-                    if (largest_supp_end - largest_supp_start >= this->input_data->getMinCNVLength()) {
-                        SVCandidate sv_candidate(largest_supp_start+1, largest_supp_end+1, ".");
-                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                        double largest_supp_likelihood = std::get<0>(result);
-                        largest_supp_type = std::get<1>(result);
-                        
-                        // Break if prediction is unknown
-                        if (largest_supp_type == SVType::UNKNOWN) {
-                            continue;
-                        }
-                    }
-
-                    // Predict between the primary and largest supplementary
-                    // alignment
-                    int32_t left_start = std::min(primary_end, largest_supp_end);
-                    int32_t right_end = std::max(primary_start, largest_supp_start);
-
-                    // Detect CNVs between the left and reverse supplementary
-                    // alignment
-                    SVType left_type = SVType::UNKNOWN;
-                    // if (rev_supp_start - primary_end >=
-                    // this->input_data->getMinCNVLength()) {
-                    if (rev_supp_start - left_start >= this->input_data->getMinCNVLength()) {
-                        // SVCandidate sv_candidate(primary_end+1,
-                        // rev_supp_start+1, ".");
-                        SVCandidate sv_candidate(left_start+1, rev_supp_start+1, ".");
-                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                        double left_likelihood = std::get<0>(result);
-                        left_type = std::get<1>(result);
-
-                        // Break if prediction is unknown
-                        if (left_type == SVType::UNKNOWN) {
-                            continue;
-                        }
-                    }
-
-                    // Detect CNVs at the reverse alignment
-                    SVType rev_type = SVType::UNKNOWN;
-                    if (rev_supp_end - rev_supp_start >= this->input_data->getMinCNVLength()) {
-                        SVCandidate sv_candidate(rev_supp_start+1, rev_supp_end+1, ".");
-                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                        double inv_likelihood = std::get<0>(result);
-                        rev_type = std::get<1>(result);
-                        if (rev_type == SVType::NEUTRAL) {
-                            rev_type = SVType::INV;
-                        } else if (rev_type == SVType::DUP) {
-                            rev_type = SVType::INV_DUP;
-                        }
-
-                        // Break if prediction is unknown
-                        if (rev_type == SVType::UNKNOWN) {
-                            continue;
-                        }
-                    }
-
-                    // Detect CNVs between the reverse supplementary and the
-                    // supplementary alignment (right side)
-                    SVType right_type = SVType::UNKNOWN;
-                    // if (supp_start - rev_supp_end >=
-                    // this->input_data->getMinCNVLength()) {
-                    if (right_end - rev_supp_end >= this->input_data->getMinCNVLength()) {
-                        SVCandidate sv_candidate(rev_supp_end+1, right_end+1, ".");
-                        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                        double right_likelihood = std::get<0>(result);
-                        right_type = std::get<1>(result);
-
-                        // Break if prediction is unknown
-                        if (right_type == SVType::UNKNOWN) {
-                            continue;
-                        }
-                    }
-
-                    // Resolve the SV type and coordinates
-                    std::string sv_type_str = "";
-                    int32_t sv_start_pos = left_start;
-                    int32_t sv_end_pos = left_start;
-
-                    // Alignment predictions
-                    if (primary_start < left_start) {
-                        if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) {
-                            sv_type_str += getSVTypeString(primary_type) + "+";
-                            sv_end_pos = primary_end;
-                        } else {
-                            sv_start_pos = primary_start;
-                        }
-                    } else {
-                        if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) {
-                            sv_type_str += getSVTypeString(largest_supp_type) + "+";
-                            sv_end_pos = largest_supp_end;
-                        } else {
-                            sv_start_pos = largest_supp_start;
-                        }
-                    }
-
-                    // Between-alignments predictions
-                    if (left_type != SVType::NEUTRAL && left_type != SVType::UNKNOWN) {
-                        sv_type_str += getSVTypeString(left_type) + "+";
-                        sv_end_pos = rev_supp_start;
-                    } else {
-                        sv_start_pos = rev_supp_start;
-                    }
-
-                    if (rev_type != SVType::NEUTRAL && rev_type != SVType::UNKNOWN) {
-                        sv_type_str += getSVTypeString(rev_type) + "+";
-                        sv_end_pos = rev_supp_end;
-                    } else {
-                        sv_start_pos = rev_supp_end;
-                    }
-
-                    if (right_type != SVType::NEUTRAL && right_type != SVType::UNKNOWN) {
-                        sv_end_pos = right_end;
-                        sv_type_str += getSVTypeString(right_type) + "+";
-                    }
-
-                    // Alignments predictions
-                    if (primary_end > right_end) {
-                        if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) {
-                            sv_type_str += getSVTypeString(primary_type) + "+";
-                            sv_end_pos = primary_end;
-                        } else {
-                            sv_start_pos = primary_start;
-                        }
-                    } else {
-                        if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) {
-                            sv_type_str += getSVTypeString(largest_supp_type) + "+";
-                            sv_end_pos = largest_supp_end;
-                        } else {
-                            sv_start_pos = largest_supp_start;
-                        }
-                    }
+        // Run copy number variant predictions on the largest supplementary
+        // alignment
+        double largest_supp_log_likelihood = std::numeric_limits<double>::lowest();
+        SVType largest_supp_type = SVType::UNKNOWN;
+        if (largest_supp_length >= min_cnv_length) {
+            SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, ".");
+            std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+            largest_supp_log_likelihood = std::get<0>(result);
+            // largest_supp_log_likelihood /= (double)largest_supp_length;  // Normalize the log likelihood by the length
+            largest_supp_type = std::get<1>(result);
+        }
 
-                    if (sv_type_str != "") {
-                        sv_type_str.pop_back();  // Remove the last '+'
+        // Run copy number variant predictions on the closest non-overlapping
+        // supplementary alignment (if not the same as the largest)
+        double closest_supp_log_likelihood = std::numeric_limits<double>::lowest();
+        SVType closest_supp_type = SVType::UNKNOWN;
+        if (largest_supp_alignment != closest_supp_alignment) {
+            if (closest_supp_length >= min_cnv_length) {
+                SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, ".");
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                closest_supp_log_likelihood = std::get<0>(result);
+                // closest_supp_log_likelihood /= (double)closest_supp_length;  // Normalize the log likelihood by the length
+                closest_supp_type = std::get<1>(result);
+            }
+        }
 
-                        // Add the complex SV
-                        complex_sv_found = true;
-                        std::cout << "Complex SV detected of type " << sv_type_str << " at positions " << primary_chr << ":" << left_start << "-" << right_end << std::endl;
-                        sv_count++;
+        // Loop through all the supplementary alignments and find the highest
+        // likelihood prediction
+        double best_supp_log_likelihood = std::numeric_limits<double>::lowest();
+        SVType best_supp_type = SVType::UNKNOWN;
+        std::pair<int32_t, int32_t> best_supp_candidate;
+        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
+            int32_t supp_start = std::get<1>(*it);
+            int32_t supp_end = std::get<2>(*it);
 
-                        // Add the complex SV
-                        sv_calls.add(primary_chr, sv_start_pos+2, sv_end_pos+1, SVType::COMPLEX, ".", "COMPLEX", "./.", 0.0);
-                    }
+            // Create the SV candidate as the boundary of the supplementary
+            // and primary alignment
+            // int32_t sv_start = std::min(primary_start, std::get<1>(*it));
+            // int32_t sv_end = std::max(primary_end, std::get<2>(*it));
+            int32_t sv_start = std::min(primary_start, supp_start);
+            int32_t sv_end = std::max(primary_end, supp_end);
+            SVCandidate sv_candidate(sv_start+1, sv_end+1, ".");
+
+            // Determine if the strand is the same as the primary alignment
+            bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment);
+
+            // SVCandidate sv_candidate(std::get<1>(*it)+1, std::get<2>(*it)+1, ".");
+            std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+            double supp_likelihood = std::get<0>(result);
+            SVType supp_type = std::get<1>(result);
+
+            // If opposite strand, set the type to INV or INV_DUP
+            if (!same_strand) {
+                if (supp_type == SVType::NEUTRAL) {
+                    supp_type = SVType::INV;
+                } else if (supp_type == SVType::DUP) {
+                    supp_type = SVType::INV_DUP;
                 }
             }
-        }
 
-        if (complex_sv_found) {
-            continue;  // Continue to the next alignment
+            if (supp_type != SVType::UNKNOWN && supp_likelihood > best_supp_log_likelihood) {
+                best_supp_log_likelihood = supp_likelihood;
+                // best_supp_log_likelihood /= (double)(sv_end - sv_start);  // Normalize the log likelihood by the length
+                best_supp_type = supp_type;
+                best_supp_candidate = std::make_pair(supp_start, supp_end);
+            }
         }
 
-        // [2] CNV detection based on primary and largest supplementary
-        // alignment boundaries
-        // else if (largest_supp_start < primary_start && largest_supp_end <
-        // primary_start) {
-        std::string largest_supp_chr = std::get<0>(largest_supp_alignment);
-        if (largest_supp_start < primary_start && largest_supp_end < primary_start) {
-
-            // Gap with supplementary before primary:
-            // [supp_start] [supp_end] -- [primary_start] [primary_end]
-            if (primary_end - largest_supp_start >= min_cnv_length) {
-                SVCandidate sv_candidate(largest_supp_start+1, primary_end+1, ".");
-
-                // Run copy number prediction for the SV candidate
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate);
-                double likelihood = std::get<0>(result);
-                SVType cnv_type = std::get<1>(result);
-                std::string genotype = std::get<2>(result);
-                bool snps_found = std::get<3>(result);
-                std::string aln_type = "GAPOUTER_A";
-                if (snps_found) {
-                    aln_type += "_SNPS";
-                } else {
-                    aln_type += "_NOSNPS";
+        // Add the SV call with the highest likelihood prediction
+        if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) {
+            int32_t sv_start = best_supp_candidate.first;
+            int32_t sv_end = best_supp_candidate.second;
+            sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_supp_log_likelihood);
+            sv_count++;
+        } else {
+            // Resolve complex SVs
+            // Simplest case: Largest supplementary is also the closest
+            if (largest_supp_alignment == closest_supp_alignment) {
+                // [primary] -- [supp_start] -- [supp_end]
+                // Determine if opposite strands
+                bool opposite_strands = std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment);
+
+                // Determine if the supplementary alignment is an inversion
+                if (opposite_strands) {
+                    if (largest_supp_type == SVType::NEUTRAL) {
+                        largest_supp_type = SVType::INV;
+                    } else if (largest_supp_type == SVType::DUP) {
+                        largest_supp_type = SVType::INV_DUP;
+                    }
                 }
 
-                // Add the SV call to the main SV data if not unknown
-                if (cnv_type != SVType::UNKNOWN) {
-                    sv_calls.add(largest_supp_chr, largest_supp_start+1, primary_end+1, cnv_type, ".", aln_type, genotype, likelihood);
-                }
-                sv_count++;
-            }
-            
-        } else if (largest_supp_start > primary_end && largest_supp_end > primary_end) {
-            // Gap with supplementary after primary:
-            // [primary_start] [primary_end] -- [supp_start] [supp_end]
-
-            if (largest_supp_end - primary_start >= min_cnv_length) {
-                SVCandidate sv_candidate(primary_start+1, largest_supp_end+1, ".");
-
-                // Run copy number prediction for the SV candidate
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate);
-                double likelihood = std::get<0>(result);
-                SVType cnv_type = std::get<1>(result);
-                std::string genotype = std::get<2>(result);
-                bool snps_found = std::get<3>(result);
-                std::string aln_type = "GAPOUTER_B";
-                if (snps_found) {
-                    aln_type += "_SNPS";
+                // Get the SV type strings
+                std::string primary_type_str = getSVTypeString(primary_type);
+                std::string supp_type_str = getSVTypeString(largest_supp_type);
+
+                // Determine the order of the primary and supplementary
+                // alignment to resolve the SV
+                if (std::get<1>(largest_supp_alignment) < primary_start) {
+                    // [supp_start] -- [supp_end] -- [primary]
+                    std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str;
+
+                    // Add the complex SV call
+                    sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
+                    sv_count++;
                 } else {
-                    aln_type += "_NOSNPS";
-                }
+                    // [primary] -- [supp_start] -- [supp_end]
+                    std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str;
 
-                // Add the SV call to the main SV data if not unknown
-                if (cnv_type != SVType::UNKNOWN) {
-                    sv_calls.add(largest_supp_chr, primary_start+1, largest_supp_end+1, cnv_type, ".", aln_type, genotype, likelihood);
+                    // Add the complex SV call
+                    sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
+                    sv_count++;
                 }
-                sv_count++;
             }
         }
     }
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 18ea8056..e9b925dc 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -8,7 +8,7 @@
 /// @endcond
 
 
-int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
+int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
 {
     // Throw an error if the genotype is not valid
     if (genotype != "./." && genotype != "0/0" && genotype != "0/1" && genotype != "1/1") {
@@ -16,9 +16,14 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std
         return -1;
     }
 
+    // Trim the alternate allele if it is too long
+    if (alt_allele.length() > 100) {
+        alt_allele = alt_allele.substr(0, 100);
+    }
+
     // Check if the alternate allele contains ambiguous bases
     const std::unordered_set<char> ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'};
-    for (char c : alt_allele) {
+    for (char &c : alt_allele) {
         if (ambiguous_bases.count(c) > 0) {
             c = 'N';
         }
diff --git a/tests/test_general.py b/tests/test_general.py
index 499805c8..52d3fc84 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 21
+        assert len(f.readlines()) == 25
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.

From 41f83826e7bfabb0089ce4422235be43b92347e3 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 12 Nov 2024 17:59:47 -0500
Subject: [PATCH 012/134] Fix CN length filter error

---
 include/sv_data.h            | 10 -----
 include/vcf_writer.h         | 14 ++++++-
 python/plot_distributions.py |  6 ++-
 src/cnv_caller.cpp           | 64 ++++++++++++++++++++----------
 src/input_data.cpp           |  3 +-
 src/sv_caller.cpp            | 76 ++++++++++++++++++++++++++++++++++--
 src/sv_data.cpp              |  3 --
 src/vcf_writer.cpp           | 30 +++++++++++---
 tests/test_general.py        |  2 +-
 9 files changed, 161 insertions(+), 47 deletions(-)

diff --git a/include/sv_data.h b/include/sv_data.h
index c321bf64..f4ed6e25 100644
--- a/include/sv_data.h
+++ b/include/sv_data.h
@@ -22,16 +22,6 @@ class SVData {
 
         // Map of clipped base support by position (chr, pos) : depth
         std::map<std::pair<std::string, int64_t>, int> clipped_base_support;
-
-        // SV type to string map for VCF output
-        // std::map<int, std::string> sv_type_map = {
-        //     {0, "DEL"},
-        //     {1, "DUP"},
-        //     {2, "INV"},
-        //     {3, "INS"},
-        //     {4, "BND"},
-        //     {5, "DUP"}
-        // };
         
     public:
         SVData() {};
diff --git a/include/vcf_writer.h b/include/vcf_writer.h
index e395ea37..a6f7707c 100644
--- a/include/vcf_writer.h
+++ b/include/vcf_writer.h
@@ -1,3 +1,6 @@
+#ifndef VCF_WRITER_H
+#define VCF_WRITER_H
+
 /// @cond
 #include <string>
 #include <vector>
@@ -6,9 +9,14 @@
 
 class VcfWriter {
 public:
-    // Constructor
-    VcfWriter(const std::string& filename);
+    explicit VcfWriter(const std::string& filename);
+    // VcfWriter(const std::string& filename);
     ~VcfWriter();
+
+    // Delete copy constructor and assignment operator
+    VcfWriter(const VcfWriter&) = delete;
+    VcfWriter& operator=(const VcfWriter&) = delete;
+
     void writeHeader(const std::vector<std::string>& headerLines);
     void writeRecord(const std::string& chrom, int pos, const std::string& id,
                      const std::string& ref, const std::string& alt,
@@ -19,3 +27,5 @@ class VcfWriter {
 private:
     std::ofstream file_stream;
 };
+
+#endif  // VCF_WRITER_H
diff --git a/python/plot_distributions.py b/python/plot_distributions.py
index 1f684ed8..7db7cb2a 100644
--- a/python/plot_distributions.py
+++ b/python/plot_distributions.py
@@ -90,10 +90,12 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
 
     # Create a dictionary of SV types and their corresponding colors.
     # From: https://davidmathlogic.com/colorblind/
-    sv_colors = {'DEL': '#D81B60', 'DUP': '#1E88E5', 'INV': '#FFC107', 'INS': '#004D40'}
+    # sv_colors = {'DEL': '#D81B60', 'DUP': '#1E88E5', 'INV': '#FFC107', 'INS': '#004D40'}
+    # WONG colors
+    sv_colors = {'DEL': '#E69F00', 'DUP': '#56B4E9', 'INV': '#009E73', 'INS': '#F0E442', 'INVDUP': '#D55E00', 'COMPLEX': '#CC79A7'}
 
     # Create a dictionary of SV types and their corresponding labels
-    sv_labels = {'DEL': 'Deletion', 'DUP': 'Duplication', 'INV': 'Inversion', 'INS': 'Insertion'}
+    sv_labels = {'DEL': 'Deletion', 'DUP': 'Duplication', 'INV': 'Inversion', 'INS': 'Insertion', 'INVDUP': 'Inverted Duplication', 'COMPLEX': 'Complex'}
 
     # Get the list of SV types and sort them in the order of the labels
     sv_types = sorted(sv_sizes.keys(), key=lambda x: sv_labels[x])
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index e052f45f..7d07f0ea 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -197,22 +197,22 @@ SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map<SVCand
     SNPData snp_data;
 
     // Filter the SV candidates by length
-    std::map<SVCandidate, SVInfo> filtered_sv_candidates;
-    for (const auto& sv_call : sv_candidates)
-    {
-        int64_t start_pos = std::get<0>(sv_call.first);
-        int64_t end_pos = std::get<1>(sv_call.first);
-        if ((end_pos - start_pos) >= min_length)
-        {
-            filtered_sv_candidates[sv_call.first] = sv_call.second;
-        }
-    }
-    sv_candidates = std::move(filtered_sv_candidates);
-    int sv_count = (int) sv_candidates.size();
-    if (sv_count == 0)
-    {
-        return snp_data;
-    }
+    // std::map<SVCandidate, SVInfo> filtered_sv_candidates;
+    // for (const auto& sv_call : sv_candidates)
+    // {
+    //     int64_t start_pos = std::get<0>(sv_call.first);
+    //     int64_t end_pos = std::get<1>(sv_call.first);
+    //     if ((end_pos - start_pos) >= min_length)
+    //     {
+    //         filtered_sv_candidates[sv_call.first] = sv_call.second;
+    //     }
+    // }
+    // sv_candidates = std::move(filtered_sv_candidates);
+    // int sv_count = (int) sv_candidates.size();
+    // if (sv_count == 0)
+    // {
+    //     return snp_data;
+    // }
 
    
     printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
@@ -271,6 +271,12 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
         int64_t start_pos = std::get<0>(candidate);
         int64_t end_pos = std::get<1>(candidate);
 
+        // Skip if not the minimum length for CNV predictions
+        if ((end_pos - start_pos) < this->input_data->getMinCNVLength())
+        {
+            continue;
+        }
+
         // Get the depth at the start position. This is used as the FORMAT/DP
         // value in the VCF file
         int dp_value = pos_depth_map[start_pos];
@@ -479,8 +485,16 @@ void CNVCaller::loadChromosomeData(std::string chr)
 // Calculate the mean chromosome coverage
 double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
 {
-    // Split the chromosome into equal parts for each thread
+
+    // Use a maximum of 8 threads to avoid overloading the system with too many
+    // parallel processes
     int num_threads = this->input_data->getThreadCount();
+    if (num_threads > 8)
+    {
+        num_threads = 8;
+    }
+
+    // Split the chromosome into equal parts for each thread
     uint32_t chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
     std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, 1, chr_len, num_threads);
 
@@ -780,12 +794,19 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
         }
     }
 
+    // Use a maximum of 8 threads to avoid overloading the system with too many
+    // processes
+    int num_threads = this->input_data->getThreadCount();
+    if (num_threads > 8)
+    {
+        num_threads = 8;
+    }
+
     // Split region into chunks and get the population frequencies in parallel
     std::cout << "SNP range for chromosome " << chr << ": " << snp_start << "-" << snp_end << std::endl;
-    int num_threads = this->input_data->getThreadCount();
     std::vector<std::string> region_chunks = splitRegionIntoChunks(chr_gnomad, snp_start, snp_end, num_threads);
     std::unordered_map<int, double> pos_pfb_map;
-    std::vector<std::thread> threads;
+    // std::vector<std::thread> threads;
     std::vector<std::future<std::unordered_map<int, double>>> futures;
     for (const auto& region_chunk : region_chunks)
     {
@@ -800,7 +821,8 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
             std::string cmd = \
                 "bcftools query -r " + region_chunk + " -f '%POS\t%" + AF_key + "\n' -i '" + filter_criteria + "' " + pfb_filepath + " 2>/dev/null";
 
-            std::cout << "Command: " << cmd << std::endl;
+            // std::cout << "Command: " << cmd << std::endl;
+            printMessage("Running command: " + cmd);
 
             // Open a pipe to read the output of the command
             FILE *fp = popen(cmd.c_str(), "r");
@@ -812,6 +834,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
 
             // Loop through the BCFTOOLS output and populate the map of population
             // frequencies
+            // printMessage("Parsing population frequencies for chromosome " + chr + "...");
             std::unordered_map<int, double> pos_pfb_map;
             const int line_size = 256;
             char line[line_size];
@@ -826,6 +849,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
                 }
             }
             pclose(fp);
+            // printMessage("Finished parsing population frequencies for chromosome " + chr + "...");
 
             return pos_pfb_map;
         };
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 85e4f8d1..8867d0ff 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -197,9 +197,10 @@ void InputData::setRegion(std::string region)
             // Set the region
             this->start_end = std::make_pair(start, end);
             this->region_set = true;
+
+            std::cout << "Region set to " << this->chr << ":" << start << "-" << end << std::endl;
         }
     }
-    std::cout << "Region set to " << this->start_end.first << "-" << this->start_end.second << std::endl;
 }
 
 std::pair<int32_t, int32_t> InputData::getRegion()
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 12c6d5f4..590c8db4 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -429,7 +429,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         int32_t primary_query_start = std::get<4>(primary_alignment);
         int32_t primary_query_end = std::get<5>(primary_alignment);
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
-        bool primary_strand = std::get<7>(primary_alignment);
+        // bool primary_strand = std::get<7>(primary_alignment);
         if (supp_map.find(qname) == supp_map.end()) {
             continue;
         }
@@ -443,7 +443,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             int32_t supp_query_start = std::get<4>(*it);
             int32_t supp_query_end = std::get<5>(*it);
             std::unordered_map<int, int> supp_match_map = std::get<6>(*it);
-            bool supp_strand = std::get<7>(*it);
+            // bool supp_strand = std::get<7>(*it);
 
             // Resolve overlaps between the primary and supplementary query
             // sequences
@@ -511,7 +511,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         int32_t closest_supp_distance = std::numeric_limits<int32_t>::max();
         int32_t closest_supp_length = 0;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            const auto& supp_chr = std::get<0>(*it);
+            // const auto& supp_chr = std::get<0>(*it);
             int32_t supp_start = std::get<1>(*it);
             int32_t supp_end = std::get<2>(*it);
             int32_t supp_length = supp_end - supp_start;
@@ -644,6 +644,76 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                     sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
                     sv_count++;
                 }
+            } else {
+                // Resolve complex SVs with multiple supplementary alignments
+                // Determine the order of the primary and supplementary
+                // alignments
+                // [primary] -- [closest_supp] -- [largest_supp]
+                // [closest_supp] -- [primary] -- [largest_supp]
+                // [largest_supp] -- [closest_supp] -- [primary]
+                // [largest_supp] -- [primary] -- [closest_supp]
+                // Only consider case 1 for efficiency:
+                if (primary_end < std::get<1>(closest_supp_alignment) && std::get<2>(closest_supp_alignment) < std::get<1>(largest_supp_alignment)) {
+                    // [primary] -- [closest_supp] -- [largest_supp]
+                    // Determine if the closest supplementary alignment is an
+                    // inversion
+                    if (std::get<7>(closest_supp_alignment) != std::get<7>(primary_alignment)) {
+                        if (closest_supp_type == SVType::NEUTRAL) {
+                            closest_supp_type = SVType::INV;
+                        } else if (closest_supp_type == SVType::DUP) {
+                            closest_supp_type = SVType::INV_DUP;
+                        }
+                    }
+
+                    // Run copy number variant predictions on the region between
+                    // the closest supplementary alignment and the largest
+                    // supplementary alignment
+                    SVCandidate sv_candidate(std::get<2>(closest_supp_alignment)+1, std::get<1>(largest_supp_alignment)+1, ".");
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                    // double complex_log_likelihood = std::get<0>(result);
+                    SVType complex_type = std::get<1>(result);
+
+                    // if (std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment)) {
+                    //     if (largest_supp_type == SVType::NEUTRAL) {
+                    //         largest_supp_type = SVType::INV;
+                    //     } else if (largest_supp_type == SVType::DUP) {
+                    //         largest_supp_type = SVType::INV_DUP;
+                    //     }
+                    // }
+
+                    std::string primary_type_str = getSVTypeString(primary_type);
+                    std::string closest_supp_type_str = getSVTypeString(closest_supp_type);
+                    // std::string largest_supp_type_str = getSVTypeString(largest_supp_type);
+                    // std::string complex_sv_type_str = primary_type_str + "+" + closest_supp_type_str;
+
+
+                    // Combine the types if equal and not unknown/neutral
+                    std::string complex_sv_type_str = "";
+                    if (primary_type != SVType::UNKNOWN && primary_type != SVType::NEUTRAL) {
+                        complex_sv_type_str += primary_type_str;
+                    }
+                    if (closest_supp_type != primary_type && closest_supp_type != SVType::UNKNOWN && closest_supp_type != SVType::NEUTRAL) {
+                        if (complex_sv_type_str != "") {
+                            complex_sv_type_str += "+";
+                        }
+                        complex_sv_type_str += closest_supp_type_str;
+                    }
+                    if (complex_type != closest_supp_type && complex_type != primary_type && complex_type != SVType::UNKNOWN && complex_type != SVType::NEUTRAL) {
+                        if (complex_sv_type_str != "") {
+                            complex_sv_type_str += "+";
+                        }
+                        complex_sv_type_str += getSVTypeString(complex_type);
+                    }
+
+                    // Add the complex SV call if not empty
+                    if (complex_sv_type_str != "") {
+                        std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl;
+                        sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
+                        sv_count++;
+                    }
+                }
+
+                
             }
         }
     }
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index e9b925dc..600ce14c 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -139,9 +139,6 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     std::string output_vcf = output_dir + "/output.vcf";
     std::cout << "Writing VCF file to " << output_vcf << std::endl;
     VcfWriter vcf_writer(output_vcf);
-    std::cout << "Writing VCF file to " << output_vcf << std::endl;
-
-    // Set the sample name
     std::string sample_name = "SAMPLE";
 
     std::cout << "Getting reference genome filepath..." << std::endl;
diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp
index 8c93a36f..7cf2108d 100644
--- a/src/vcf_writer.cpp
+++ b/src/vcf_writer.cpp
@@ -3,16 +3,36 @@
 /// @cond
 #include <iostream>
 #include <fstream>
+#include <stdexcept>
 /// @endcond
 
 VcfWriter::VcfWriter(const std::string &filename)
 {
-    // Open the VCF file, overwrite if it already exists
-    this->file_stream.open(filename, std::ios::out);
-    if (!this->file_stream.is_open()) {
-        std::cerr << "Error: Unable to open " << filename << std::endl;
-        exit(1);
+    try {
+        this->file_stream.exceptions(std::ofstream::failbit | std::ofstream::badbit);  // Enable exceptions
+        this->file_stream.open(filename, std::ios::out | std::ios::trunc);  // Open the file for writing
+    } catch (const std::ofstream::failure &e) {
+        std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl;
+        exit(EXIT_FAILURE);
     }
+    // // Open the VCF file, overwrite if it already exists
+    // try {
+    //     this->file_stream.open(filename, std::ios::out);
+    //     if (!this->file_stream.is_open()) {
+    //         std::cerr << "Error: Unable to open " << filename << std::endl;
+    //         exit(1);
+    //     }
+    // } catch (std::exception &e) {
+    //     std::cerr << "Error: " << e.what() << std::endl;
+    //     exit(1);
+    // }
+
+    // // this->file_stream.open(filename, std::ios::out);
+    // if (!this->file_stream.is_open()) {
+    //     std::cerr << "Error: Unable to open " << filename << std::endl;
+    //     exit(1);
+    // }
+    // std::cout << "Opened " << filename << " for writing" << std::endl;
 }
 
 VcfWriter::~VcfWriter()
diff --git a/tests/test_general.py b/tests/test_general.py
index 52d3fc84..92776b11 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 25
+        assert len(f.readlines()) == 41
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.

From a19c21ae989789173be2b14a3438f1854713f5ce Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 13 Nov 2024 16:10:10 -0500
Subject: [PATCH 013/134] Venn diagram plots and vcf writer error

---
 include/vcf_writer.h |  2 +-
 python/plot_venn.py  | 47 ++++++++++++++++++++++++++++++++++++++++++++
 python/sv_merger.py  |  2 +-
 src/sv_caller.cpp    | 19 +++++++++++++-----
 src/vcf_writer.cpp   | 16 ++++++++++-----
 5 files changed, 74 insertions(+), 12 deletions(-)
 create mode 100644 python/plot_venn.py

diff --git a/include/vcf_writer.h b/include/vcf_writer.h
index a6f7707c..a9bd5931 100644
--- a/include/vcf_writer.h
+++ b/include/vcf_writer.h
@@ -9,7 +9,7 @@
 
 class VcfWriter {
 public:
-    explicit VcfWriter(const std::string& filename);
+    explicit VcfWriter(std::string filename);
     // VcfWriter(const std::string& filename);
     ~VcfWriter();
 
diff --git a/python/plot_venn.py b/python/plot_venn.py
new file mode 100644
index 00000000..8e5e73bd
--- /dev/null
+++ b/python/plot_venn.py
@@ -0,0 +1,47 @@
+# from matplotlib_venn import venn3
+from matplotlib_venn import venn2
+import argparse
+
+import matplotlib.pyplot as plt
+
+def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB):
+    plt.figure(figsize=(8, 8))
+
+    print('AB:', AB)
+    print('Ab:', Ab)
+    print('aB:', aB)
+
+    # Create scaled subsets for the venn diagram
+    scaling_factor = 1000
+    scaled_AB = AB / scaling_factor
+    scaled_Ab = Ab / scaling_factor
+    scaled_aB = aB / scaling_factor
+
+    # Create a venn diagram scaled to the number of elements in each set
+    # venn = venn2(subsets=(AB, Ab, aB), set_labels=(title_Ab, title_aB))
+    venn = venn2(subsets=(scaled_Ab, scaled_aB, scaled_AB), set_labels=(title_Ab, title_aB))
+
+    # Update the labels to reflect the actual counts
+    venn.get_label_by_id('10').set_text(str(Ab))
+    venn.get_label_by_id('01').set_text(str(aB))
+    venn.get_label_by_id('11').set_text(str(AB))
+
+    # Update the title
+    plt.title("ContextSV and " + title_aB.capitalize() + " Venn Diagram (All SV types)")
+    plt.savefig(output)
+    plt.close()
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Generate a Venn diagram.')
+    parser.add_argument('-a', type=int, required=True, help='Shared count')
+    parser.add_argument('-b', type=int, required=True, help='False positive count')
+    parser.add_argument('-c', type=int, required=True, help='False negative count')
+    parser.add_argument('-o', '--output', type=str, required=True, help='Output file path')
+    parser.add_argument('-a_title', type=str, required=True, help='Title for set A')
+    parser.add_argument('-b_title', type=str, required=True, help='Title for set B')
+    parser.add_argument('-c_title', type=str, required=True, help='Title for set C')
+
+    args = parser.parse_args()
+
+    plot_venn(args.a, args.b, args.c, args.output, args.a_title, args.b_title, args.c_title)
+    print(f'Venn diagram saved to {args.output}')
diff --git a/python/sv_merger.py b/python/sv_merger.py
index d2c59977..fa2a3a30 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -152,7 +152,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
 
     # Merge SVs with the same label
     unique_labels = np.unique(cluster_labels)
-    logging.info("Unique labels: %s", unique_labels)
+    # logging.info("Unique labels: %s", unique_labels)
 
     for label in unique_labels:
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 590c8db4..d591792b 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -332,12 +332,17 @@ SVData SVCaller::run()
     } else {
         chromosomes = this->input_data->getRefGenomeChromosomes();
     }
-    int chr_count = chromosomes.size();
+
+    // [TEST] Only process the last N chromosomes
+    // last_n = 10;
+    // chromosomes = std::vector<std::string>(chromosomes.end()-last_n, chromosomes.end());
+    // chromosomes = std::vector<std::string>(chromosomes.end()-3, chromosomes.end());
 
     // Loop through each region and detect SVs in chunks
+    int chr_count = chromosomes.size();
+    int current_chr = 0;
     std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
     int chunk_count = 100;  // Number of chunks to split the chromosome into
-    int region_count = 0;
     SVData sv_calls;
     int min_cnv_length = this->input_data->getMinCNVLength();
     for (const auto& chr : chromosomes) {
@@ -377,6 +382,8 @@ SVData SVCaller::run()
 
         // Process each chunk one at a time
         std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
+        int region_count = region_chunks.size();
+        int current_region = 0;
         for (const auto& sub_region : region_chunks) {
             // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl;
             RegionData region_data = this->detectSVsFromRegion(sub_region);
@@ -385,7 +392,7 @@ SVData SVCaller::run()
             SuppMap& supp_map = std::get<2>(region_data);
             int region_sv_count = sv_calls_region.totalCalls();
             if (region_sv_count > 0) {
-                std::cout << "Detected " << region_sv_count << " SVs from " << sub_region << "..." << std::endl;
+                std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl;
             }
 
             // Run copy number variant predictions on the SVs detected from the
@@ -402,11 +409,13 @@ SVData SVCaller::run()
             std::cout << "Detecting copy number variants from split reads..." << std::endl;
             this->detectSVsFromSplitReads(sv_calls_region, primary_map, supp_map, cnv_caller);
             sv_calls.concatenate(sv_calls_region);  // Add the calls to the main set
+            std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl;
         }
 
-        region_count++;
-        std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl;
+        std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." << std::endl;
+        // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl;
     }
+    
 
     std::cout << "SV calling completed." << std::endl;
 
diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp
index 7cf2108d..0d5f60e7 100644
--- a/src/vcf_writer.cpp
+++ b/src/vcf_writer.cpp
@@ -6,15 +6,21 @@
 #include <stdexcept>
 /// @endcond
 
-VcfWriter::VcfWriter(const std::string &filename)
+VcfWriter::VcfWriter(std::string filename)
 {
     try {
+    	std::cout << "Opening file..." << std::endl;
         this->file_stream.exceptions(std::ofstream::failbit | std::ofstream::badbit);  // Enable exceptions
         this->file_stream.open(filename, std::ios::out | std::ios::trunc);  // Open the file for writing
-    } catch (const std::ofstream::failure &e) {
-        std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl;
-        exit(EXIT_FAILURE);
-    }
+    	std::cout << "File opened." << std::endl;
+     } catch (std::exception &e) {
+         std::cerr << "Error: " << e.what() << std::endl;
+         exit(EXIT_FAILURE);
+     }
+    //} catch (const std::ofstream::failure &e) {
+    //    std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl;
+    //    exit(EXIT_FAILURE);
+    //}
     // // Open the VCF file, overwrite if it already exists
     // try {
     //     this->file_stream.open(filename, std::ios::out);

From 5dc01cb569ed0ba8844783891157fb876c811c06 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 14 Nov 2024 13:19:30 -0500
Subject: [PATCH 014/134] Fix vcf writer error

---
 include/vcf_writer.h | 31 ----------------
 python/plot_venn.py  |  2 +-
 src/sv_data.cpp      | 42 +++++++++++++++++++--
 src/vcf_writer.cpp   | 87 --------------------------------------------
 4 files changed, 39 insertions(+), 123 deletions(-)
 delete mode 100644 include/vcf_writer.h
 delete mode 100644 src/vcf_writer.cpp

diff --git a/include/vcf_writer.h b/include/vcf_writer.h
deleted file mode 100644
index a9bd5931..00000000
--- a/include/vcf_writer.h
+++ /dev/null
@@ -1,31 +0,0 @@
-#ifndef VCF_WRITER_H
-#define VCF_WRITER_H
-
-/// @cond
-#include <string>
-#include <vector>
-#include <fstream>
-/// @endcond
-
-class VcfWriter {
-public:
-    explicit VcfWriter(std::string filename);
-    // VcfWriter(const std::string& filename);
-    ~VcfWriter();
-
-    // Delete copy constructor and assignment operator
-    VcfWriter(const VcfWriter&) = delete;
-    VcfWriter& operator=(const VcfWriter&) = delete;
-
-    void writeHeader(const std::vector<std::string>& headerLines);
-    void writeRecord(const std::string& chrom, int pos, const std::string& id,
-                     const std::string& ref, const std::string& alt,
-                     const std::string& qual, const std::string& filter,
-                     const std::string& info, const std::string& format,
-                     const std::vector<std::string>& samples);
-
-private:
-    std::ofstream file_stream;
-};
-
-#endif  // VCF_WRITER_H
diff --git a/python/plot_venn.py b/python/plot_venn.py
index 8e5e73bd..eb7e8e78 100644
--- a/python/plot_venn.py
+++ b/python/plot_venn.py
@@ -27,7 +27,7 @@ def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB):
     venn.get_label_by_id('11').set_text(str(AB))
 
     # Update the title
-    plt.title("ContextSV and " + title_aB.capitalize() + " Venn Diagram (All SV types)")
+    plt.title("contextsv and " + title_aB + " venn diagram (all SV types)")
     plt.savefig(output)
     plt.close()
 
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 600ce14c..085e737e 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -1,5 +1,4 @@
 #include "sv_data.h"
-#include "vcf_writer.h"
 
 /// @cond
 #include <unordered_set>
@@ -138,7 +137,10 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     std::cout << "Creating VCF writer..." << std::endl;
     std::string output_vcf = output_dir + "/output.vcf";
     std::cout << "Writing VCF file to " << output_vcf << std::endl;
-    VcfWriter vcf_writer(output_vcf);
+	std::ofstream vcf_stream(output_vcf);
+    if (!vcf_stream.is_open()) {
+        throw std::runtime_error("Failed to open VCF file for writing.");
+    }
     std::string sample_name = "SAMPLE";
 
     std::cout << "Getting reference genome filepath..." << std::endl;
@@ -178,7 +180,35 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
     };
 
     std::cout << "Writing VCF header..." << std::endl;
-    vcf_writer.writeHeader(header_lines);
+
+    // Add the file format
+    std::string file_format = "##fileformat=VCFv4.2";
+    vcf_stream << file_format << std::endl;
+
+    // Add date and time
+    time_t rawtime;
+    struct tm * timeinfo;
+    char buffer[80];
+    time (&rawtime);
+    timeinfo = localtime(&rawtime);
+    strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo);
+    vcf_stream << "##fileDate=" << buffer << std::endl;
+
+    // Add source
+    std::string source = "##source=ContexSV";
+    vcf_stream << source << std::endl;
+
+    // Loop over the header metadata lines
+    for (const auto &line : header_lines) {
+        vcf_stream << line << std::endl;
+    }
+
+    // Add the header line
+    std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE";
+    vcf_stream << header_line << std::endl;
+
+    // Flush the stream to ensure that the header is written
+    //this->file_stream.flush();
 
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
     std::string sv_method = "CONTEXTSVv0.1";
@@ -280,7 +310,11 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
             std::vector<std::string> samples = {sample_str};
 
             // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES)
-            vcf_writer.writeRecord(chr, pos, ".", ref_allele, alt_allele, ".", "PASS", info_str, format_str, samples);
+            vcf_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
+            if (total_count % 1000 == 0)
+            {
+            	std::cout << "Wrote SV at " << chr << ": " << pos << ", total=" << total_count << std::endl;
+        	}
         }
     }
 
diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp
deleted file mode 100644
index 0d5f60e7..00000000
--- a/src/vcf_writer.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-#include "vcf_writer.h"
-
-/// @cond
-#include <iostream>
-#include <fstream>
-#include <stdexcept>
-/// @endcond
-
-VcfWriter::VcfWriter(std::string filename)
-{
-    try {
-    	std::cout << "Opening file..." << std::endl;
-        this->file_stream.exceptions(std::ofstream::failbit | std::ofstream::badbit);  // Enable exceptions
-        this->file_stream.open(filename, std::ios::out | std::ios::trunc);  // Open the file for writing
-    	std::cout << "File opened." << std::endl;
-     } catch (std::exception &e) {
-         std::cerr << "Error: " << e.what() << std::endl;
-         exit(EXIT_FAILURE);
-     }
-    //} catch (const std::ofstream::failure &e) {
-    //    std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl;
-    //    exit(EXIT_FAILURE);
-    //}
-    // // Open the VCF file, overwrite if it already exists
-    // try {
-    //     this->file_stream.open(filename, std::ios::out);
-    //     if (!this->file_stream.is_open()) {
-    //         std::cerr << "Error: Unable to open " << filename << std::endl;
-    //         exit(1);
-    //     }
-    // } catch (std::exception &e) {
-    //     std::cerr << "Error: " << e.what() << std::endl;
-    //     exit(1);
-    // }
-
-    // // this->file_stream.open(filename, std::ios::out);
-    // if (!this->file_stream.is_open()) {
-    //     std::cerr << "Error: Unable to open " << filename << std::endl;
-    //     exit(1);
-    // }
-    // std::cout << "Opened " << filename << " for writing" << std::endl;
-}
-
-VcfWriter::~VcfWriter()
-{
-    if (this->file_stream.is_open()) {
-        this->file_stream.close();
-    }
-}
-
-void VcfWriter::writeHeader(const std::vector<std::string> &headerLines)
-{
-    // Add the file format
-    std::string file_format = "##fileformat=VCFv4.2";
-    this->file_stream << file_format << std::endl;
-
-    // Add date and time
-    time_t rawtime;
-    struct tm * timeinfo;
-    char buffer[80];
-    time (&rawtime);
-    timeinfo = localtime(&rawtime);
-    strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo);
-    file_stream << "##fileDate=" << buffer << std::endl;
-
-    // Add source
-    std::string source = "##source=ContexSV";
-    this->file_stream << source << std::endl;
-
-    // Loop over the header metadata lines
-    for (auto &line : headerLines) {
-        this->file_stream << line << std::endl;
-    }
-
-    // Add the header line
-    std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE";
-    this->file_stream << header_line << std::endl;
-
-    // Flush the stream to ensure that the header is written
-    this->file_stream.flush();
-}
-
-void VcfWriter::writeRecord(const std::string &chrom, int pos, const std::string &id, const std::string &ref, const std::string &alt, const std::string &qual, const std::string &filter, const std::string &info, const std::string &format, const std::vector<std::string> &samples)
-{
-    // Write a record to the VCF file
-    this->file_stream << chrom << "\t" << pos << "\t" << id << "\t" << ref << "\t" << alt << "\t" << qual << "\t" << filter << "\t" << info << "\t" << format << "\t" << samples[0] << std::endl;
-}

From b9b489044e033b4fb12ac390bfcb98e5b2044c14 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 15 Nov 2024 11:22:12 -0500
Subject: [PATCH 015/134] Fix overlap errors

---
 include/cnv_caller.h  |   2 +-
 python/sv_merger.py   |   8 +-
 src/cnv_caller.cpp    |  31 +---
 src/khmm.cpp          |  13 +-
 src/sv_caller.cpp     | 384 ++++++++++++++++++++++++++++++++----------
 tests/test_general.py |   4 +-
 6 files changed, 315 insertions(+), 127 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index af211fc8..80c69f89 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -109,7 +109,7 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, SVCandidate& sv_candidate);
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
diff --git a/python/sv_merger.py b/python/sv_merger.py
index fa2a3a30..172cba6e 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -89,7 +89,7 @@ def update_support(record, cluster_size):
 
     return record
 
-def weighted_score(read_support, hmm_score, sv_len, weight_hmm, weight_sv_len):
+def weighted_score(read_support, hmm_score, weight_hmm):
     """
     Calculate a weighted score based on read support and HMM score.
     """
@@ -209,13 +209,11 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
         # support.
         # hmm_weight = 0.7 if sv_type == 'DEL' else 0.3
         hmm_weight = 0.4
-        sv_len_weight = 0.4
         max_score_idx = 0  # Default to the first SV in the cluster
-        max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], cluster_sv_lengths[max_score_idx], hmm_weight, sv_len_weight)
+        max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], hmm_weight)
         for k, hmm_loglh in enumerate(cluster_hmm_scores):
-            sv_len = cluster_sv_lengths[k] / 1000  # Normalize SV length to kilobases
             read_support = cluster_depth_scores[k]
-            score = weighted_score(read_support, hmm_loglh, sv_len, hmm_weight, sv_len_weight)
+            score = weighted_score(read_support, hmm_loglh, hmm_weight)
             if score > max_score:
                 max_score = score
                 max_score_idx = k
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 7d07f0ea..ceaeb5ec 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -115,9 +115,8 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
     return std::make_pair(snp_data, snps_found);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, SVCandidate& candidate)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate)
 {
-    // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl;
      // Get the start and end positions of the SV call
     int64_t start_pos = std::get<0>(candidate);
     int64_t end_pos = std::get<1>(candidate);
@@ -127,6 +126,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     int64_t sv_length = (end_pos - start_pos) / 2.0;
     int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length);
     int64_t snp_end_pos = end_pos + sv_length;
+    // printMessage("Running copy number prediction for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " with SNP region " + chr + ":" + std::to_string(snp_start_pos) + "-" + std::to_string(snp_end_pos) + "...");
 
     // Query the SNP region for the SV candidate
     std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov);
@@ -153,12 +153,15 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     double pct_threshold = 0.75;
     int max_state = 0;
     int max_count = 0;
-    for (int i = 0; i < 6; i++)
+
+    // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6
+    for (int i = 0; i < 6; i += 2)
     {
-        int state_count = std::count(sv_states.begin(), sv_states.end(), i+1);
+        // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6
+        int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2);
         if (state_count > max_count)
         {
-            max_state = i+1;
+            max_state = i+1;  // Set the state to the first state in the pair (sequence remains intact)
             max_count = state_count;
         }
     }
@@ -196,24 +199,6 @@ SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map<SVCand
     double mean_chr_cov = this->mean_chr_cov;
     SNPData snp_data;
 
-    // Filter the SV candidates by length
-    // std::map<SVCandidate, SVInfo> filtered_sv_candidates;
-    // for (const auto& sv_call : sv_candidates)
-    // {
-    //     int64_t start_pos = std::get<0>(sv_call.first);
-    //     int64_t end_pos = std::get<1>(sv_call.first);
-    //     if ((end_pos - start_pos) >= min_length)
-    //     {
-    //         filtered_sv_candidates[sv_call.first] = sv_call.second;
-    //     }
-    // }
-    // sv_candidates = std::move(filtered_sv_candidates);
-    // int sv_count = (int) sv_candidates.size();
-    // if (sv_count == 0)
-    // {
-    //     return snp_data;
-    // }
-
    
     printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
 
diff --git a/src/khmm.cpp b/src/khmm.cpp
index 375325a1..3b3bffa6 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -339,12 +339,12 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	// of the state sequence ending in state i at time T, along with observing
 	// the sequence O1, O2.
 	q[T] = 1;
-	double min_prob = -VITHUGE;
+	double final_lh = -VITHUGE;
 	for (i = 1; i <= hmm.N; i++)
 	{
-		if (delta[T][i] > min_prob)
+		if (delta[T][i] > final_lh)
 		{
-			min_prob = delta[T][i];
+			final_lh = delta[T][i];
 			q[T] = i;
 		}
 	}
@@ -378,12 +378,7 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	free_dmatrix(biot, 1, hmm.N, 1, T);
 	free_dmatrix(A1, 1, hmm.N, 1, hmm.N);
 
-	// Normalize the log likelihood by the sample size
-	double min_prob_normalized = min_prob / (double)T;
-
-	// Return the state sequence and its likelihood
-	// return std::make_pair(q, min_prob);
-	return std::make_pair(q, min_prob_normalized);
+	return std::make_pair(q, final_lh);
 }
 
 CHMM ReadCHMM(const char *filename)
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index d591792b..3cc41aed 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -170,6 +170,8 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     int32_t query_start = 0;  // First alignment position in the query
     int32_t query_end = 0;    // Last alignment position in the query
     bool first_op = false;  // First alignment operation for the query
+    double default_lh = std::numeric_limits<double>::lowest();  // Default likelihood
+    // double default_lh = std::numeric_limits<double>::quiet_NaN();  // Default likelihood
     for (int i = 0; i < cigar_len; i++) {
 
         int op = bam_cigar_op(cigar[i]);  // CIGAR operation
@@ -230,9 +232,9 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 if (is_duplication) {
-                    sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", 0.0);
+                    sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", default_lh);
                 } else {
-                    sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", 0.0);
+                    sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh);
                 }
             }
 
@@ -244,7 +246,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
             {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0);  // Add the deletion
+                sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", default_lh);  // Add to SV calls (1-based)
             }
 
         // Check if the CIGAR operation is a clipped base
@@ -443,52 +445,49 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             continue;
         }
 
-        // Resolve overlaps between the primary and supplementary query
-        // sequences
-        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            std::string supp_chr = std::get<0>(*it);
-            // int32_t supp_start = std::get<1>(*it);
-            // int32_t supp_end = std::get<2>(*it);
-            int32_t supp_query_start = std::get<4>(*it);
-            int32_t supp_query_end = std::get<5>(*it);
-            std::unordered_map<int, int> supp_match_map = std::get<6>(*it);
-            // bool supp_strand = std::get<7>(*it);
-
-            // Resolve overlaps between the primary and supplementary query
-            // sequences
-            int32_t overlap_start = std::max(primary_query_start, supp_query_start);
-            int32_t overlap_end = std::min(primary_query_end, supp_query_end);
-            int32_t overlap_length = overlap_end - overlap_start;
-            if (overlap_length > 0) {
-
-                // Calculate the mismatch rate for each alignment at the overlap
-                double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
-                double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
-                // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
-                // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
-
-                // Trim the overlap from the alignment with the higher mismatch
-                // rate
-                if (primary_mismatch_rate > supp_mismatch_rate) {
-                    if (overlap_start == primary_query_start) {
-                        primary_start += overlap_length;
-                    } else if (overlap_end == primary_query_end) {
-                        primary_end -= overlap_length;
-                    }
-
-                } else {
-                    if (overlap_start == supp_query_start) {
-                        // supp_start += overlap_length;
-                        // Update the value in the supp map
-                        std::get<1>(*it) += overlap_length;
-                    } else if (overlap_end == supp_query_end) {
-                        // supp_end -= overlap_length;
-                        // Update the value in the supp map
-                        std::get<2>(*it) -= overlap_length;
-                    }
-                }
-            }
-        }
+        // // Resolve overlaps between the primary and supplementary query
+        // // sequences
+        // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
+        //     std::string supp_chr = std::get<0>(*it);
+        //     // int32_t supp_start = std::get<1>(*it);
+        //     // int32_t supp_end = std::get<2>(*it);
+        //     int32_t supp_query_start = std::get<4>(*it);
+        //     int32_t supp_query_end = std::get<5>(*it);
+        //     std::unordered_map<int, int> supp_match_map = std::get<6>(*it);
+        //     // bool supp_strand = std::get<7>(*it);
+
+        //     // Resolve overlaps between the primary and supplementary query
+        //     // sequences
+        //     if (primary_query_start < supp_query_end && primary_query_end > supp_query_start || supp_query_start < primary_query_end && supp_query_end > primary_query_start) {
+
+        //         // Calculate the mismatch rate for each alignment at the overlap
+        //         double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
+        //         double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
+        //         // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
+        //         // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
+
+        //         // Trim the overlap from the alignment with the higher mismatch
+        //         // rate
+        //         if (primary_mismatch_rate > supp_mismatch_rate) {
+        //             if (overlap_start == primary_query_start) {
+        //                 primary_start += overlap_length;
+        //             } else if (overlap_end == primary_query_end) {
+        //                 primary_end -= overlap_length;
+        //             }
+
+        //         } else {
+        //             if (overlap_start == supp_query_start) {
+        //                 // supp_start += overlap_length;
+        //                 // Update the value in the supp map
+        //                 std::get<1>(*it) += overlap_length;
+        //             } else if (overlap_end == supp_query_end) {
+        //                 // supp_end -= overlap_length;
+        //                 // Update the value in the supp map
+        //                 std::get<2>(*it) -= overlap_length;
+        //             }
+        //         }
+        //     }
+        // }
 
         // Remove supplementary alignments that are not on the same chromosome
         // as the primary alignment
@@ -502,11 +501,12 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
         // Run copy number variant predictions on the primary alignment
         SVType primary_type = SVType::UNKNOWN;
-        double primary_log_likelihood = std::numeric_limits<double>::lowest();
+        double primary_lh = std::numeric_limits<double>::lowest();
+        int32_t primary_lh_t = 0;
         if (primary_end - primary_start >= min_cnv_length) {
             SVCandidate sv_candidate(primary_start+1, primary_end+1, ".");
             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-            primary_log_likelihood = std::get<0>(result);
+            primary_lh = std::get<0>(result);
             // primary_log_likelihood /= (double)(primary_end - primary_start);  // Normalize the log likelihood by the length
             primary_type = std::get<1>(result);
         }
@@ -523,7 +523,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             // const auto& supp_chr = std::get<0>(*it);
             int32_t supp_start = std::get<1>(*it);
             int32_t supp_end = std::get<2>(*it);
-            int32_t supp_length = supp_end - supp_start;
+            int32_t supp_length = supp_end - supp_start + 1;
             int32_t supp_distance = std::numeric_limits<int32_t>::max();
             if (supp_start > primary_end) {
                 supp_distance = supp_start - primary_end;
@@ -543,80 +543,288 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
         // Run copy number variant predictions on the largest supplementary
         // alignment
-        double largest_supp_log_likelihood = std::numeric_limits<double>::lowest();
+        double largest_supp_lh = std::numeric_limits<double>::lowest();
         SVType largest_supp_type = SVType::UNKNOWN;
+        int largest_supp_lh_t = 0;
         if (largest_supp_length >= min_cnv_length) {
             SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, ".");
             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-            largest_supp_log_likelihood = std::get<0>(result);
+            largest_supp_lh = std::get<0>(result);
             // largest_supp_log_likelihood /= (double)largest_supp_length;  // Normalize the log likelihood by the length
             largest_supp_type = std::get<1>(result);
         }
 
         // Run copy number variant predictions on the closest non-overlapping
         // supplementary alignment (if not the same as the largest)
-        double closest_supp_log_likelihood = std::numeric_limits<double>::lowest();
+        double closest_supp_lh = std::numeric_limits<double>::lowest();
         SVType closest_supp_type = SVType::UNKNOWN;
+        int closest_supp_lh_t = 0;
         if (largest_supp_alignment != closest_supp_alignment) {
             if (closest_supp_length >= min_cnv_length) {
                 SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, ".");
                 std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                closest_supp_log_likelihood = std::get<0>(result);
+                closest_supp_lh = std::get<0>(result);
                 // closest_supp_log_likelihood /= (double)closest_supp_length;  // Normalize the log likelihood by the length
                 closest_supp_type = std::get<1>(result);
+                int32_t closest_supp_start = std::get<1>(closest_supp_alignment);
+                int32_t closest_supp_end = std::get<2>(closest_supp_alignment);
             }
         }
 
+        // Define constants representing read scenarios used for SV detection
+        const int NOCALL = -1;  // Default
+        const int PRIM_SUPP_BD = 0;  // Primary and supplementary boundary
+        const int PRIM_SUPP_GAP = 1;  // Primary and supplementary gap
+        const int SUPP_PRIM_BD = 2;  // Supplementary and primary boundary
+        const int SUPP_PRIM_GAP = 3;  // Supplementary and primary gap
+
         // Loop through all the supplementary alignments and find the highest
         // likelihood prediction
-        double best_supp_log_likelihood = std::numeric_limits<double>::lowest();
+        double best_split_aln_lh = std::numeric_limits<double>::lowest();
+        double best_split_aln_lh_norm = std::numeric_limits<double>::lowest();
+        // int best_split_aln_length = 0;
         SVType best_supp_type = SVType::UNKNOWN;
         std::pair<int32_t, int32_t> best_supp_candidate;
+        AlignmentData& best_split_alignment = supp_map[qname][0];
+        int best_scenario = NOCALL;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
             int32_t supp_start = std::get<1>(*it);
             int32_t supp_end = std::get<2>(*it);
+            bool primary_before_supp = primary_start < supp_start;
+
+            // Create the SV candidate as the boundary of the primary and
+            // supplementary alignments
+            SVCandidate split_boundary;
+            SVCandidate split_gap;
+            bool invalid_gap = false;
+            if (primary_before_supp) {
+                split_boundary = SVCandidate(primary_start+1, supp_end+1, ".");
+
+                // Check for an invalid gap (overlap)
+                if (primary_end >= supp_start) {
+                    invalid_gap = true;
+                } else {
+                    split_gap = SVCandidate(primary_end+1, supp_start+1, ".");
+                }
+                // split_gap = SVCandidate(primary_end+1, supp_start+1, ".");
 
-            // Create the SV candidate as the boundary of the supplementary
-            // and primary alignment
-            // int32_t sv_start = std::min(primary_start, std::get<1>(*it));
-            // int32_t sv_end = std::max(primary_end, std::get<2>(*it));
-            int32_t sv_start = std::min(primary_start, supp_start);
-            int32_t sv_end = std::max(primary_end, supp_end);
-            SVCandidate sv_candidate(sv_start+1, sv_end+1, ".");
+            } else {
+                split_boundary = SVCandidate(supp_start+1, primary_end+1, ".");
 
-            // Determine if the strand is the same as the primary alignment
-            bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment);
+                // Check for an invalid gap (overlap)
+                if (supp_end >= primary_start) {
+                    invalid_gap = true;
+                } else {
+                    split_gap = SVCandidate(supp_end+1, primary_start+1, ".");
+                }
+            }
 
-            // SVCandidate sv_candidate(std::get<1>(*it)+1, std::get<2>(*it)+1, ".");
-            std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-            double supp_likelihood = std::get<0>(result);
-            SVType supp_type = std::get<1>(result);
+            // Create a vector of the two SV candidates, don't add the gap if
+            // it is an overlap, or if either SV is less than the minimum CNV
+            // length
+            std::vector<SVCandidate> sv_candidates;
+            if (!invalid_gap && std::get<1>(split_gap) - std::get<0>(split_gap) >= min_cnv_length) {
+                sv_candidates.push_back(split_gap);
+            }
+            if (std::get<1>(split_boundary) - std::get<0>(split_boundary) >= min_cnv_length) {
+                sv_candidates.push_back(split_boundary);
+            }
+
+            // Continue if no SV candidates
+            if (sv_candidates.size() == 0) {
+                continue;
+            }
+
+            // Run copy number variant predictions on both, and keep the
+            // prediction with the highest normalized log likelihood
+            double chosen_lh_norm = std::numeric_limits<double>::lowest();
+            SVType chosen_type = SVType::UNKNOWN;
+            std::pair<int32_t, int32_t> chosen_candidate;
+            std::string chosen_candidate_str = "BOUNDARY";
+            int split_scenario = NOCALL;
+            for (const auto& sv_candidate : sv_candidates) {
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                double current_lh = std::get<0>(result);
+                SVType current_type = std::get<1>(result);
+
+                // Normalize the log likelihood by the state sequence length
+                double current_lh_norm = current_lh;// / (double)T;
+                // if (sv_candidate == split_boundary) {
+                //     std::cout << "Boundary candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl;
+                // } else if (sv_candidate == split_gap) {
+                //     std::cout << "Gap candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl;
+                // }
+
+                // Update the current SV candidate if the likelihood is higher
+                if (current_type != SVType::UNKNOWN && current_lh_norm > chosen_lh_norm) {
+                    chosen_lh_norm = current_lh_norm;
+                    chosen_type = current_type;
+                    chosen_candidate = std::make_pair(std::get<0>(sv_candidate), std::get<1>(sv_candidate));
+
+                    // Update the candidate string
+                    if (sv_candidate == split_boundary) {
+                        chosen_candidate_str = "BOUNDARY";
+                        if (primary_before_supp) {
+                            split_scenario = PRIM_SUPP_BD;
+                        } else {
+                            split_scenario = SUPP_PRIM_BD;
+                        }
+                    } else if (sv_candidate == split_gap) {
+                        chosen_candidate_str = "GAP";
+                        if (primary_before_supp) {
+                            split_scenario = PRIM_SUPP_GAP;
+                        } else {
+                            split_scenario = SUPP_PRIM_GAP;
+                        }
+                    }
+                    // std::cout << "Updated candidate: " << chosen_candidate_str << " with likelihood: " << current_lh_norm << std::endl;
+                } else if (current_type == SVType::UNKNOWN) {
+                    // std::cerr << "ERROR: Unknown SV type" << std::endl;
+                    // exit(1);
+                }
+            }
+
+            // std::cout << "Chosen candidate: " << chosen_candidate_str << std::endl;
+
+            // Continue if unknown SV type
+            if (chosen_type == SVType::UNKNOWN) {
+                std::cerr << "ERROR: Unknown SV type" << std::endl;
+                continue;
+            }
 
             // If opposite strand, set the type to INV or INV_DUP
+            bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment);
             if (!same_strand) {
-                if (supp_type == SVType::NEUTRAL) {
-                    supp_type = SVType::INV;
-                } else if (supp_type == SVType::DUP) {
-                    supp_type = SVType::INV_DUP;
+                if (chosen_type == SVType::NEUTRAL) {
+                    chosen_type = SVType::INV;
+                } else if (chosen_type == SVType::DUP) {
+                    chosen_type = SVType::INV_DUP;
                 }
             }
 
-            if (supp_type != SVType::UNKNOWN && supp_likelihood > best_supp_log_likelihood) {
-                best_supp_log_likelihood = supp_likelihood;
-                // best_supp_log_likelihood /= (double)(sv_end - sv_start);  // Normalize the log likelihood by the length
-                best_supp_type = supp_type;
-                best_supp_candidate = std::make_pair(supp_start, supp_end);
+            if (chosen_lh_norm > best_split_aln_lh_norm) {
+                // best_supp_log_likelihood = supp_likelihood;
+                // best_supp_log_likelihood /= (double)(sv_end - sv_start);  //
+                // Normalize the log likelihood by the length
+                // best_split_aln_lh = split_aln_lh;
+                best_split_aln_lh_norm = chosen_lh_norm;
+                // best_split_aln_length = split_aln_length;
+                best_supp_type = chosen_type;
+                best_supp_candidate = chosen_candidate;
+                best_split_alignment = *it;
+                best_scenario = split_scenario;
+            } else if (chosen_lh_norm <= best_split_aln_lh_norm) {
+                // std::cerr << "ERROR: split_aln_lh_norm is less than or equal to best_split_aln_lh_norm" << std::endl;
+                // exit(1);
             }
         }
 
+        // If the likelihood is equal to the lowest value, print an error
+        if (best_split_aln_lh_norm == std::numeric_limits<double>::lowest()) {
+            // std::cerr << "ERROR: best_supp_log_likelihood is the lowest value" << std::endl;
+            // exit(1);
+        }
+
+        // Print the likelihoods
+        // std::cout << "Primary log likelihood: " << primary_lh << std::endl;
+        // std::cout << "Largest supplementary log likelihood: " << largest_supp_lh << std::endl;
+        // std::cout << "Closest supplementary log likelihood: " << closest_supp_lh << std::endl;
+        // // std::cout << "Best split alignment log likelihood: " << best_split_aln_lh << std::endl;
+        // std::cout << "Best split alignment log likelihood (normalized): " << best_split_aln_lh_norm << std::endl;
+        // std::cout << "Best scenario: " << best_scenario << std::endl;
+
         // Add the SV call with the highest likelihood prediction
-        if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) {
+        // 
+        // Determine the normalized log likelihood for the combined alignments
+        // by summing and normalizing the log likelihoods by the length
+        double complex_lh = 0.0;
+        double complex_lh_norm = 0.0;
+        if (largest_supp_alignment == closest_supp_alignment) {
+            int32_t complex_t = primary_lh_t + largest_supp_lh_t;
+            complex_lh = primary_lh + largest_supp_lh;
+            complex_lh_norm = complex_lh;// / complex_t;
+        } else {
+            int32_t complex_t = primary_lh_t + largest_supp_lh_t + closest_supp_lh_t;
+            complex_lh = primary_lh + largest_supp_lh + closest_supp_lh;
+            complex_lh_norm = complex_lh;// / complex_t;
+        }
+        // std::cout << "Complex log likelihood (normalized): " << complex_lh_norm << std::endl;
+
+        // Compare the best split alignment likelihood to the complex likelihood
+        // if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) {
+        if (best_split_aln_lh_norm > complex_lh_norm) {
             int32_t sv_start = best_supp_candidate.first;
             int32_t sv_end = best_supp_candidate.second;
-            sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_supp_log_likelihood);
+
+            // Print an error and continue if the end is less than the start
+            if (sv_end < sv_start) {
+                std::cerr << "ERROR: SV end is less than the start: " << sv_start << " - " << sv_end << ", SV type: " << getSVTypeString(best_supp_type) << std::endl;
+                continue;
+            }
+
+            // Resolve overlaps between the primary and supplementary query
+            // sequences for deletions (not usually an issue for other types)
+            if (best_supp_type == SVType::DEL) {
+                AlignmentData& best_supp_alignment = best_split_alignment;
+                int32_t supp_start = std::get<1>(best_supp_alignment);
+                int32_t supp_end = std::get<2>(best_supp_alignment);
+                int32_t supp_query_start = std::get<4>(best_supp_alignment);
+                int32_t supp_query_end = std::get<5>(best_supp_alignment);
+                std::unordered_map<int, int> supp_match_map = std::get<6>(best_supp_alignment);
+
+                // Resolve overlaps between the primary and supplementary query
+                // sequences
+                // int32_t overlap_start = std::max(primary_query_start, supp_query_start);
+                // int32_t overlap_end = std::min(primary_query_end, supp_query_end);
+                // int32_t overlap_length = overlap_end - overlap_start;
+                bool gap_present = primary_query_end < supp_query_start || supp_query_end < primary_query_start;
+                if (!gap_present) {
+                    int32_t overlap_start = std::max(primary_query_start, supp_query_start);
+                    int32_t overlap_end = std::min(primary_query_end, supp_query_end);
+                    int32_t overlap_length = overlap_end - overlap_start;
+
+                    // Calculate the mismatch rate for each alignment at the overlap
+                    double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end);
+                    double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end);
+                    // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
+                    // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
+
+                    // Trim the overlap from the alignment with the higher mismatch
+                    // rate
+                    if (primary_mismatch_rate > supp_mismatch_rate) {
+
+                        // Handle each scenario
+                        if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) {
+                            // Primary is first, incorporate the overlap into
+                            // the beginning of the deletion
+                            sv_start -= overlap_length;
+                        } else if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) {
+                            // Primary is last, incorporate the overlap into
+                            // the end of the deletion
+                            sv_end += overlap_length;
+                        }
+                    } else {
+
+                        // Handle each scenario
+                        if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) {
+                            // Supplementary is first, incorporate the overlap into
+                            // the beginning of the deletion
+                            sv_start -= overlap_length;
+                        } else if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) {
+                            // Supplementary is last, incorporate the overlap into
+                            // the end of the deletion
+                            sv_end += overlap_length;
+                        }
+                    }
+                }
+            }
+
+            // Add the best split alignment as the SV call
+            sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_split_aln_lh_norm);
             sv_count++;
         } else {
             // Resolve complex SVs
+
             // Simplest case: Largest supplementary is also the closest
             if (largest_supp_alignment == closest_supp_alignment) {
                 // [primary] -- [supp_start] -- [supp_end]
@@ -643,14 +851,14 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                     std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str;
 
                     // Add the complex SV call
-                    sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
+                    sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
                     sv_count++;
                 } else {
                     // [primary] -- [supp_start] -- [supp_end]
                     std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str;
 
                     // Add the complex SV call
-                    sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
+                    sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
                     sv_count++;
                 }
             } else {
@@ -697,32 +905,34 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
 
                     // Combine the types if equal and not unknown/neutral
+                    std::cout << "Resolving complex SVs..." << std::endl;
                     std::string complex_sv_type_str = "";
                     if (primary_type != SVType::UNKNOWN && primary_type != SVType::NEUTRAL) {
                         complex_sv_type_str += primary_type_str;
+                        std::cout << "[1] Updated to type: " << complex_sv_type_str << std::endl;
                     }
                     if (closest_supp_type != primary_type && closest_supp_type != SVType::UNKNOWN && closest_supp_type != SVType::NEUTRAL) {
                         if (complex_sv_type_str != "") {
                             complex_sv_type_str += "+";
                         }
                         complex_sv_type_str += closest_supp_type_str;
+                        std::cout << "[2] Updated to type: " << complex_sv_type_str << std::endl;
                     }
                     if (complex_type != closest_supp_type && complex_type != primary_type && complex_type != SVType::UNKNOWN && complex_type != SVType::NEUTRAL) {
                         if (complex_sv_type_str != "") {
                             complex_sv_type_str += "+";
                         }
                         complex_sv_type_str += getSVTypeString(complex_type);
+                        std::cout << "[3] Updated to type: " << complex_sv_type_str << std::endl;
                     }
 
                     // Add the complex SV call if not empty
                     if (complex_sv_type_str != "") {
                         std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl;
-                        sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0);
+                        sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
                         sv_count++;
                     }
-                }
-
-                
+                }                
             }
         }
     }
diff --git a/tests/test_general.py b/tests/test_general.py
index 92776b11..dbca30b9 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -78,11 +78,11 @@ def test_run():
                 fields = line.strip().split('\t')
                 assert fields[0] == "21"
                 assert fields[1] == "14458394"
-                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1341;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=0.000000"
+                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1341;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000"
             elif i == header_line + 2:
                 fields = line.strip().split('\t')
                 assert fields[0] == "21"
                 assert fields[1] == "14458394"
-                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=0.000000"
+                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000"
                 break
             
\ No newline at end of file

From 9665e8ef2a2293b62e10454cd01864c43e73cfbe Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 16 Nov 2024 14:33:12 -0500
Subject: [PATCH 016/134] Create a cpp module for memory tests

---
 .gitignore      |  1 +
 Makefile        | 19 +++++++++++--------
 Makefile-cpp    | 43 +++++++++++++++++++++++++++++++++++++++++++
 Makefile-python | 15 +++++++++++++++
 setup.py        |  3 +++
 5 files changed, 73 insertions(+), 8 deletions(-)
 create mode 100644 Makefile-cpp
 create mode 100644 Makefile-python

diff --git a/.gitignore b/.gitignore
index c627421d..939d32b9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -67,6 +67,7 @@ python/dbscan
 python/agglo
 linktoscripts
 tests/data
+tests/cpp_module_out
 
 # Population allele frequency filepaths
 data/gnomadv2_filepaths.txt
diff --git a/Makefile b/Makefile
index b6f7ddab..6b0170ae 100644
--- a/Makefile
+++ b/Makefile
@@ -1,11 +1,14 @@
-INCL_DIR := $(CURDIR)/include
-SRC_DIR := $(CURDIR)/src
-LIB_DIR := $(CURDIR)/lib
+# Top-Level Makefile
 
+.PHONY: all python cpp clean
 
-all:
-	# Generate the SWIG wrapper (C++ -> Python)
-	swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i
+# Targets for the sub-makefiles ('all' is kept as an alias of 'python' so that 'make all' still works)
+all python:
+	$(MAKE) -f Makefile-python
 
-	# Compile the SWIG wrapper using setuptools
-	python3 setup.py build_ext --build-lib $(LIB_DIR)
+cpp:
+	$(MAKE) -f Makefile-cpp
+
+clean:
+	$(MAKE) -f Makefile-python clean
+	$(MAKE) -f Makefile-cpp clean
diff --git a/Makefile-cpp b/Makefile-cpp
new file mode 100644
index 00000000..cf76256b
--- /dev/null
+++ b/Makefile-cpp
@@ -0,0 +1,43 @@
+# Builds the standalone C++ executable (build/cpp_module) from src/*.cpp
+INCL_DIR := $(CURDIR)/include
+SRC_DIR := $(CURDIR)/src
+BUILD_DIR := $(CURDIR)/build
+LIB_DIR := $(CURDIR)/lib
+
+# Conda environment directories
+CONDA_PREFIX := $(shell echo $$CONDA_PREFIX)
+CONDA_INCL_DIR := $(CONDA_PREFIX)/include
+CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
+
+# Compiler and flags (-pthread: the sources use std::async, which needs thread support when compiling and linking)
+CXX := g++
+CXXFLAGS := -std=c++11 -pthread -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
+# The rpath entry lets the executable locate the conda shared libraries at run time
+LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath,$(CONDA_LIB_DIR)
+
+# Link with htslib (libhts.a or libhts.so)
+LDLIBS := -lhts
+
+# Sources and output (the SWIG wrapper is excluded from the sources)
+SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp))
+OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES))
+TARGET := $(BUILD_DIR)/cpp_module
+
+.PHONY: all clean
+
+# Default target
+all: $(TARGET)
+
+# Link the executable
+$(TARGET): $(OBJECTS)
+	@mkdir -p $(BUILD_DIR)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS)
+
+# Compile source files
+$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp
+	@mkdir -p $(BUILD_DIR)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# Clean the build directory
+clean:
+	rm -rf $(BUILD_DIR)
\ No newline at end of file
diff --git a/Makefile-python b/Makefile-python
new file mode 100644
index 00000000..361ba11b
--- /dev/null
+++ b/Makefile-python
@@ -0,0 +1,15 @@
+INCL_DIR := $(CURDIR)/include
+SRC_DIR := $(CURDIR)/src
+LIB_DIR := $(CURDIR)/lib
+
+.PHONY: all clean
+all:
+	# Generate the SWIG wrapper (C++ -> Python)
+	swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i
+
+	# Compile the SWIG wrapper using setuptools
+	python3 setup.py build_ext --build-lib $(LIB_DIR)
+
+# Remove the built shared objects and contextsv.py from the lib directory
+clean:
+	rm -rf $(LIB_DIR)/*.so $(LIB_DIR)/contextsv.py
\ No newline at end of file
diff --git a/setup.py b/setup.py
index c57fb30f..5b6d9ef2 100644
--- a/setup.py
+++ b/setup.py
@@ -28,7 +28,10 @@
 
 # Set the project dependencies
 SRC_DIR = "src"
+# Collect the extension sources; main.cpp (the standalone C++ entry point) must not be compiled into the Python module
 SRC_FILES = glob.glob(os.path.join(SRC_DIR, "*.cpp"))
+SRC_FILES = [f for f in SRC_FILES if os.path.basename(f) != "main.cpp"]  # Exact file-name match; a substring test would also drop e.g. domain.cpp
+
 INCLUDE_DIR = "include"
 INCLUDE_FILES = glob.glob(os.path.join(INCLUDE_DIR, "*.h"))
 

From a50530886f4916a4e7435d337bd7998e93297a35 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 16 Nov 2024 20:13:15 -0500
Subject: [PATCH 017/134] Fix all memory leaks

---
 .gitignore             |   1 +
 include/khmm.h         |  51 ++---
 include/sv_caller.h    |   1 -
 src/cnv_caller.cpp     |  74 ++++++--
 src/contextsv.cpp      |   1 +
 src/input_data.cpp     |  14 +-
 src/khmm.cpp           | 409 +++++++++++++++++++++++++----------------
 src/main.cpp           |  55 ++++++
 src/sv_data.cpp        |  24 ++-
 src/swig_interface.cpp |   4 -
 10 files changed, 414 insertions(+), 220 deletions(-)
 create mode 100644 src/main.cpp

diff --git a/.gitignore b/.gitignore
index 939d32b9..5b1177ee 100644
--- a/.gitignore
+++ b/.gitignore
@@ -88,3 +88,4 @@ python/dist_plots
 
 # Temporary files
 lib/.nfs*
+valgrind.log
diff --git a/include/khmm.h b/include/khmm.h
index 2f7ebe14..8f86e7a4 100644
--- a/include/khmm.h
+++ b/include/khmm.h
@@ -10,28 +10,26 @@
 #include <map>
 /// @endcond
 
-typedef struct {
-	int N;			/* number of states;  Q={1,2,...,N} */
-	int M; 			/* number of observation symbols; V={1,2,...,M}*/
-	double **A;		/* A[1..N][1..N]. a[i][j] is the transition prob
-			   	of going from state i at time t to state j
-			   	at time t+1 */
-	double **B;		/* B[1..N][1..M]. b[j][k] is the probability of
-			   	of observing symbol k in state j */
-	double *pi;		/* pi[1..N] pi[i] is the initial state distribution. */
-	double *B1_mean;	/* B1_mean[1..N] mean of a continuous Gaussian distribution for state 1 through N*/
-	double *B1_sd;		/*B1_sd standard deviation of B1 values, which is the same for all states*/
-	double B1_uf;		/*B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model */
-	double *B2_mean;	/* B2_mean[1..4] is the average of B_allele_freq*/
-	double *B2_sd;		/* B2_sd[1..4] is the standard deviation of four B_allele_freq, B2_sd[5] is specially for state1, where B is modelled as a wide normal distribution */
-	double B2_uf;		/* B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model */
-	
-	int NP_flag;		/*flag of 1 and 0 to indicate whether Non-Polymorhpic marker information is contained with HMM file*/
-	double *B3_mean;	/* B3_mean[1..N] mean of non-polymorphic probe for state 1 through N*/
-	double *B3_sd;		/* B3_sd[1..4] is the standard deviation of B3 values*/
-	double B3_uf;		/* B3_uniform_fraction: */
-	int dist;		/* new parameter to facilitate CNV calling from resequencing data (2009 April) */
-} CHMM;
+// HMM parameters. Vectors own their storage and are 0-based (state s is stored at index s-1). Scalars are default-initialized because ReadCHMM does not assign all of them.
+struct CHMM
+{
+	int N = 0;	// Number of states
+	int M = 0; 	// Number of observation symbols
+	std::vector<std::vector<double>> A;  // Transition probability matrix (N x N)
+	std::vector<std::vector<double>> B;  // Emission probability matrix (N x M)
+	std::vector<double> pi;  // Initial state distribution (N entries)
+	std::vector<double> B1_mean;  // Mean of a continuous Gaussian distribution for each of the N states
+	std::vector<double> B1_sd;  // Standard deviation of B1 values for each of the N states
+	double B1_uf = 0.0;  // B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model
+	std::vector<double> B2_mean;  // Averages of B_allele_freq (5 entries; the last one is specific to state 1)
+	std::vector<double> B2_sd;  // Standard deviations of B_allele_freq (5 entries; the last one is specific to state 1, modelled as a wide normal distribution)
+	double B2_uf = 0.0;  // B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model
+	int NP_flag = 0;  // 1 if non-polymorphic marker (B3) parameters are present; stays 0 because ReadCHMM does not read them
+	std::vector<double> B3_mean;  // Mean of non-polymorphic probes per state (left empty by ReadCHMM)
+	std::vector<double> B3_sd;  // Standard deviation of B3 values (left empty by ReadCHMM)
+	double B3_uf = 0.0;  // B3_uniform_fraction
+	int dist = 1;  // Distance parameter; 1 was the previous fallback when the HMM file had no DIST section
+};
 
 
 /************************************
@@ -39,10 +37,13 @@ typedef struct {
 ************************************/
 
 /// Read an HMM from a file
-CHMM ReadCHMM (const char *filename);
+CHMM ReadCHMM (const std::string filename);
 
-// /// Free the memory allocated for an HMM
-// void FreeCHMM(CHMM *phmm);
+// Read a matrix
+std::vector<std::vector<double>> readMatrix(std::ifstream& file, int rows, int cols);
+
+// Read a vector
+std::vector<double> readVector(std::ifstream& file, int size);
 
 /// Run the main HMM algorithm
 std::pair<std::vector<int>, double> testVit_CHMM(CHMM hmm, int T, std::vector<double>& O1, std::vector<double>& O2, std::vector<double>& pfb);
diff --git a/include/sv_caller.h b/include/sv_caller.h
index ed11d08f..0a94b254 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -31,7 +31,6 @@ class SVCaller {
         int min_sv_size = 50;       // Minimum SV size to be considered
         int min_mapq = 20;          // Minimum mapping quality to be considered
         InputData* input_data;
-        std::mutex sv_mtx;  // Mutex for locking the SV data
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index ceaeb5ec..951775c6 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -481,7 +481,17 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
 
     // Split the chromosome into equal parts for each thread
     uint32_t chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
+    if (chr_len == 0)
+    {
+    	printError("ERROR: Chromosome length is zero for: " + chr);
+        return 0.0;
+    }
     std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, 1, chr_len, num_threads);
+    if (region_chunks.empty())
+    {
+        printError("ERROR: Failed to split chromosome into regions.");
+        return 0.0;
+    }
 
     // Calculate the mean chromosome coverage in parallel
     uint32_t pos_count = 0;
@@ -496,23 +506,22 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
         {
             // Run samtools depth on the entire region, and print positions and
             // depths (not chromosome)
-            const int cmd_size = 256;
-            char cmd[cmd_size];
-            snprintf(cmd, cmd_size,\
+            size_t cmd_size = input_filepath.size() + 256;
+            std::vector<char> cmd(cmd_size);
+            snprintf(cmd.data(), cmd_size,\
                 "samtools depth -r %s %s | awk '{print $2, $3}'",\
                 region_chunk.c_str(), input_filepath.c_str());
 
             // Open a pipe to read the output of the command
-            FILE *fp = popen(cmd, "r");
+            FILE *fp = popen(cmd.data(), "r");
             if (fp == NULL)
             {
-                printError("ERROR: Could not open pipe for command: " + std::string(cmd));
-                exit(EXIT_FAILURE);
+                throw std::runtime_error("ERROR: Could not open pipe for command: " + std::string(cmd.data()));
             }
 
             // Parse the outputs (position and depth)
             std::unordered_map<uint32_t, int> pos_depth_map;
-            const int line_size = 256;
+            const int line_size = 1024;
             char line[line_size];
             uint32_t pos;
             int depth;
@@ -527,26 +536,53 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
                     cum_depth += depth;
                 }
             }
-            pclose(fp);  // Close the process
+            
+            // Check if pclose fails
+            if (pclose(fp) == -1)
+            {
+                throw std::runtime_error("ERROR: Failed to close pipe for command: " + std::string(cmd.data()));
+            }
+            //pclose(fp);  // Close the process
 
             return std::make_tuple(pos_count, cum_depth, pos_depth_map);
         };
-        std::future<std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>>> future = std::async(std::launch::async, get_mean_chr_cov);
-        futures.push_back(std::move(future));
+        
+        futures.emplace_back(std::async(std::launch::async, get_mean_chr_cov));
+        //std::future<std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>>> future = std::async(std::launch::async, get_mean_chr_cov);
+        //futures.push_back(std::move(future));
     }
 
-    // Loop through the futures and get the results
+    // Thread-safe map merging (using mutex)
+    std::mutex merge_mutex;
     for (auto& future : futures)
     {
-        future.wait();
-        std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>> result = std::move(future.get());
-
-        // Update the position count, cumulative depth, and merge the position-depth maps
-        pos_count += std::get<0>(result);
-        cum_depth += std::get<1>(result);
-        this->mergePosDepthMaps(this->pos_depth_map, std::get<2>(result));
+        try
+        {
+            future.wait();
+            auto result = std::move(future.get());
+
+            // Safely merge results
+            std::lock_guard<std::mutex> lock(merge_mutex);
+            pos_count += std::get<0>(result);
+            cum_depth += std::get<1>(result);
+            this->mergePosDepthMaps(this->pos_depth_map, std::get<2>(result));
+        }
+        catch (const std::exception& ex)
+        {
+            printError("ERROR: Exception in thread execution - " + std::string(ex.what()));
+            return 0.0;
+        }
     }
-    double mean_chr_cov = (double) cum_depth / (double) pos_count;
+    
+    // Validate and calculate mean chromosome coverage
+    if (pos_count == 0)
+    {
+        printError("ERROR: No positions found in chromosome coverage calculation.");
+        return 0.0;
+    }
+    
+    double mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
+
 
     return mean_chr_cov;
 }
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index c0d4acd7..4f35c052 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -24,6 +24,7 @@ int ContextSV::run()
     SVCaller sv_caller(*this->input_data);  // Create an SV caller object
     SVData sv_calls = sv_caller.run();  // Run the SV caller
     std::string output_dir = this->input_data->getOutputDir();  // Get the output directory
+    
     std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl;
     sv_calls.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
     std::cout << "SV calling complete." << std::endl;
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 8867d0ff..99a4cade 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -42,7 +42,7 @@ void InputData::setShortReadBam(std::string filepath)
     this->short_read_bam = filepath;
 
     // Check if empty string
-    if (filepath == "")
+    if (filepath.empty())
     {
         return;
         
@@ -51,8 +51,9 @@ void InputData::setShortReadBam(std::string filepath)
         FILE *fp = fopen(filepath.c_str(), "r");
         if (fp == NULL)
         {
-            std::cerr << "Short read BAM file does not exist: " << filepath << std::endl;
-            exit(1);
+            throw std::runtime_error("Short read BAM file does not exist: " + filepath);
+        } else {
+            fclose(fp);
         }
     }
 }
@@ -67,7 +68,7 @@ void InputData::setLongReadBam(std::string filepath)
     this->long_read_bam = filepath;
 
     // Check if empty string
-    if (filepath == "")
+    if (filepath.empty())
     {
         return;
         
@@ -76,8 +77,9 @@ void InputData::setLongReadBam(std::string filepath)
         FILE *fp = fopen(filepath.c_str(), "r");
         if (fp == NULL)
         {
-            std::cerr << "Long read BAM file does not exist: " << filepath << std::endl;
-            exit(1);
+            throw std::runtime_error("Long read BAM file does not exist: " + filepath);
+        } else {
+            fclose(fp);
         }
     }
 }
diff --git a/src/khmm.cpp b/src/khmm.cpp
index 3b3bffa6..bdb6eb8b 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -3,10 +3,13 @@
 
 /// @cond
 #include <iostream>
+#include <fstream>
 #include <sstream>
 #include <vector>
 #include <map>
 #include <iomanip>
+#include <stdexcept>
+#include <limits>
 /// @endcond
 
 #define STATE_CHANGE 100000.0 /*this is the expected changes (D value) in the transition matrix*/
@@ -50,30 +53,53 @@ std::pair<std::vector<int>, double> testVit_CHMM(CHMM hmm, int T, std::vector<do
 	return state_sequence;
 }
 
-double b1iot(int state, double *mean, double *sd, double uf, double o)
+// Log emission probability of observation o in `state` (1-based): log(uf + (1 - uf) * pdf_normal(o, mean, sd)).
+double b1iot(int state, const std::vector<double>& mean, const std::vector<double>& sd, double uf, double o)
 {
-	if (o < mean[1])
+	// mean and sd are taken by const reference so that a call no longer copies
+	// both vectors; the function only reads them.
+	//
+	// The vectors use 0-based indexing, so the parameters for `state` are stored
+	// at index state-1.
+
+	// Clamp observations below the first state's mean to that mean
+	if (o < mean[0])
 	{
-		o = mean[1];
+		o = mean[0];
 	}
-	double p = uf + ((1 - uf) * pdf_normal(o, mean[state], sd[state]));
+	double p = uf + ((1 - uf) * pdf_normal(o, mean[state-1], sd[state-1]));
 
 	return log(p);
 }
 
-double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b)
+// double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b)
+double b2iot(int state, const std::vector<double> mean, const std::vector<double> sd, double uf, double pfb, double b)
 {
+	// double p = 0;
+	// double mean0 = mean[1];  // mean[1] = 0
+	// double mean25 = mean[2];  // mean[2] = 0.25
+	// double mean33 = mean[3];  // mean[3] = 0.33
+	// double mean50 = mean[4];  // mean[4] = 0.5
+	// double mean50_state1 = mean[5];  // mean[5] = 0.5
+	// double sd0 = sd[1];  // sd[1] = 0
+	// double sd25 = sd[2];  // sd[2] = 0.25
+	// double sd33 = sd[3];  // sd[3] = 0.33
+	// double sd50 = sd[4];  // sd[4] = 0.5
+	// double sd50_state1 = sd[5];  // sd[5] = 0.5
+	// p = uf;  // UF = previous alpha (transition probability)
+
+	// Get the values (0-based indexing)
 	double p = 0;
-	double mean0 = mean[1];  // mean[1] = 0
-	double mean25 = mean[2];  // mean[2] = 0.25
-	double mean33 = mean[3];  // mean[3] = 0.33
-	double mean50 = mean[4];  // mean[4] = 0.5
-	double mean50_state1 = mean[5];  // mean[5] = 0.5
-	double sd0 = sd[1];  // sd[1] = 0
-	double sd25 = sd[2];  // sd[2] = 0.25
-	double sd33 = sd[3];  // sd[3] = 0.33
-	double sd50 = sd[4];  // sd[4] = 0.5
-	double sd50_state1 = sd[5];  // sd[5] = 0.5
+	double mean0 = mean[0];  // mean[0] = 0
+	double mean25 = mean[1];  // mean[1] = 0.25
+	double mean33 = mean[2];  // mean[2] = 0.33
+	double mean50 = mean[3];  // mean[3] = 0.5
+	double mean50_state1 = mean[4];  // mean[4] = 0.5
+	double sd0 = sd[0];  // sd[0] = 0
+	double sd25 = sd[1];  // sd[1] = 0.25
+	double sd33 = sd[2];  // sd[2] = 0.33
+	double sd50 = sd[3];  // sd[3] = 0.5
+	double sd50_state1 = sd[4];  // sd[4] = 0.5
 	p = uf;  // UF = previous alpha (transition probability)
 
 	// PDF normal is the transition probability distrubution a_ij (initialized
@@ -247,7 +273,9 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	{
 		for (j = 1; j <= hmm.N; j++)
 		{
-			A1[i][j] = hmm.A[i][j];
+			// A1[i][j] = hmm.A[i][j];
+			// Update for 0-based indexing
+			A1[i][j] = hmm.A[i-1][j-1];
 		}
 	}
 
@@ -257,9 +285,15 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	// Threshold any zero values to avoid calculation issues.
 	for (i = 1; i <= hmm.N; i++)
 	{
-		if (hmm.pi[i] == 0)
-			hmm.pi[i] = 1e-9; /*eliminate problems with zero probability*/
-		hmm.pi[i] = log(hmm.pi[i]);  // Convert to log probability due to underflow
+		// if (hmm.pi[i] == 0)
+		// 	hmm.pi[i] = 1e-9; /*eliminate problems with zero probability*/
+		// hmm.pi[i] = log(hmm.pi[i]);  // Convert to log probability due to underflow
+
+		// Update to 0-based indexing
+		if (hmm.pi[i-1] == 0) {
+			hmm.pi[i-1] = 1e-9; /*eliminate problems with zero probability*/
+		}
+		hmm.pi[i-1] = log(hmm.pi[i-1]);  // Convert to log probability due to underflow
 	}
 
 	// Biot is the NxT matrix of state observation likelihoods.
@@ -302,7 +336,12 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	/* 1. Initialization  */
 	for (i = 1; i <= hmm.N; i++)
 	{
-		delta[1][i] = hmm.pi[i] + biot[i][1];  // Initialize the delta matrix (log probability) to the initial state distribution + the emission probability
+		// delta[1][i] = hmm.pi[i] + biot[i][1];  // Initialize the delta matrix
+		// (log probability) to the initial state distribution + the emission
+		// probability
+		
+		// Update to 0-based indexing
+		delta[1][i] = hmm.pi[i-1] + biot[i][1];  // Initialize the delta matrix
 		psi[1][i] = 0;  // Initialize the psi matrix (state sequence) to 0 (no state)
 	}
 
@@ -372,7 +411,9 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 
 	for (i = 1; i <= hmm.N; i++)
 	{ /*recover the HMM model as original*/
-		hmm.pi[i] = exp(hmm.pi[i]);
+		// hmm.pi[i] = exp(hmm.pi[i]);
+		// Update to 0-based indexing
+		hmm.pi[i-1] = exp(hmm.pi[i-1]);
 	}
 
 	free_dmatrix(biot, 1, hmm.N, 1, T);
@@ -381,155 +422,209 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	return std::make_pair(q, final_lh);
 }
 
-CHMM ReadCHMM(const char *filename)
+CHMM ReadCHMM(const std::string filename)
 {
-	FILE *fp;
+	std::ifstream file(filename);
+	if (!file.is_open())
+	{
+		throw std::runtime_error("Error opening file");
+	}
 	CHMM hmm;
-	int i, j, k;
 
-	fp = fopen(filename, "r");
-	if (!fp)
-		fprintf(stderr, "Error: cannot read from HMM file %s\n", filename);
 
-	if (fscanf(fp, "M=%d\n", &(hmm.M)) == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read M annotation from HMM file");
-	if (fscanf(fp, "N=%d\n", &(hmm.N)) == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read N annotation from HMM file");
+	// Read M
+	std::string line;
+	std::getline(file, line);
+	if (sscanf(line.c_str(), "M=%d", &hmm.M) != 1)
+	{
+		throw std::runtime_error("Error reading M");
+	}
 
-	if (fscanf(fp, "A:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read A annotation from HMM file");
-	hmm.A = (double **)dmatrix(1, hmm.N, 1, hmm.N);
-	for (i = 1; i <= hmm.N; i++)
+	// Read N
+	std::getline(file, line);
+	if (sscanf(line.c_str(), "N=%d", &hmm.N) != 1)
 	{
-		for (j = 1; j <= hmm.N; j++)
-		{
-			if (fscanf(fp, "%lf", &(hmm.A[i][j])) == EOF)
-				fprintf(stderr, "khmm::ReadCHMM: cannot read A matrix from HMM file");
-		}
-		if (fscanf(fp, "\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
+		throw std::runtime_error("Error reading N");
 	}
 
-	if (fscanf(fp, "B:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B annotation from HMM file");
-	hmm.B = (double **)dmatrix(1, hmm.N, 1, hmm.M);
-	for (j = 1; j <= hmm.N; j++)
+	// Read A
+	std::getline(file, line);
+	if (line != "A:")
 	{
-		for (k = 1; k <= hmm.M; k++)
-		{
-			if (fscanf(fp, "%lf", &(hmm.B[j][k])) == EOF)
-				fprintf(stderr, "khmm::ReadCHMM: cannot read B matrix from HMM file");
-		}
-		if (fscanf(fp, "\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
+		throw std::runtime_error("Error reading A");
+	}
+	hmm.A = readMatrix(file, hmm.N, hmm.N);
+	if (hmm.A.size() != (size_t)hmm.N || hmm.A[0].size() != (size_t)hmm.N)
+	{
+		throw std::runtime_error("Error reading A");
 	}
 
-	if (fscanf(fp, "pi:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read PI annotation from HMM file");
-	hmm.pi = (double *)dvector(1, hmm.N);
-	for (i = 1; i <= hmm.N; i++)
+	// Print A
+	// std::cout << "A: " << std::endl;
+	// for (int i = 0; i < hmm.N; i++)
+	// {
+	// 	for (int j = 0; j < hmm.N; j++)
+	// 	{
+	// 		std::cout << std::setprecision(10) << hmm.A[i][j] << " ";
+	// 	}
+	// 	std::cout << std::endl;
+	// }
+
+	// Read B
+	std::getline(file, line);
+	if (line != "B:")
+	{
+		throw std::runtime_error("Error reading B");
+	}
+	hmm.B = readMatrix(file, hmm.N, hmm.M);
+	if (hmm.B.size() != (size_t)hmm.N || hmm.B[0].size() != (size_t)hmm.M)
 	{
-		if (fscanf(fp, "%lf", &(hmm.pi[i])) == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read PI vector from HMM file");
-		if (hmm.pi[i] < 1e-6)
-			hmm.pi[i] = 1e-6;
+		throw std::runtime_error("Error reading B");
 	}
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
 
-	if (fscanf(fp, "B1_mean:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B1_mean annotation from HMM file");
-	hmm.B1_mean = (double *)dvector(1, hmm.N);
-	for (i = 1; i <= hmm.N; i++)
-		if (fscanf(fp, "%lf", &(hmm.B1_mean[i])) == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B1_mean vector from HMM file");
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-
-	if (fscanf(fp, "B1_sd:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B1_sd annotation from HMM file");
-	hmm.B1_sd = (double *)dvector(1, hmm.N);
-	for (i = 1; i <= hmm.N; i++)
-		if (fscanf(fp, "%lf", &(hmm.B1_sd[i])) == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B1_sd from HMM file");
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-
-	if (fscanf(fp, "B1_uf:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B1_uf annotation from HMM file");
-	if (fscanf(fp, "%lf", &(hmm.B1_uf)) == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B1_uf from HMM file");
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-
-	if (fscanf(fp, "B2_mean:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B2_mean annotation from HMM file");
-	hmm.B2_mean = (double *)dvector(1, 5);
-	for (i = 1; i <= 5; i++)
-		if (fscanf(fp, "%lf", &(hmm.B2_mean[i])) == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B2_mean from HMM file");
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-
-	if (fscanf(fp, "B2_sd:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B2_sd annotation from HMM file");
-	hmm.B2_sd = (double *)dvector(1, 5);
-	for (i = 1; i <= 5; i++)
-		if (fscanf(fp, "%lf", &(hmm.B2_sd[i])) == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B2_sd from HMM file");
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-
-	if (fscanf(fp, "B2_uf:\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B2_uf annotation from HMM file");
-	if (fscanf(fp, "%lf", &(hmm.B2_uf)) == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read B2_uf from HMM file");
-	if (fscanf(fp, "\n") == EOF)
-		fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-
-	if (fscanf(fp, "B3_mean:\n") != EOF)
-	{
-		hmm.NP_flag = 1;
-		hmm.B3_mean = (double *)dvector(1, hmm.N);
-		for (i = 1; i <= hmm.N; i++)
-			if (fscanf(fp, "%lf", &(hmm.B3_mean[i])) == EOF)
-				fprintf(stderr, "khmm::ReadCHMM: cannot read B3_mean from HMM file");
-		if (fscanf(fp, "\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-		if (fscanf(fp, "B3_sd:\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B3_sd annotation from HMM file");
-		hmm.B3_sd = (double *)dvector(1, hmm.N);
-		for (i = 1; i <= hmm.N; i++)
-			if (fscanf(fp, "%lf", &(hmm.B3_sd[i])) == EOF)
-				fprintf(stderr, "khmm::ReadCHMM: cannot read B3_sd from HMM file");
-		if (fscanf(fp, "\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-		if (fscanf(fp, "B3_uf:\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B3_uf annotation from HMM file");
-		if (fscanf(fp, "%lf", &(hmm.B3_uf)) == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read B3_uf from HMM file");
-		if (fscanf(fp, "\n") == EOF)
-			fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file");
-	}
-	else
-	{
-		hmm.NP_flag = 0;
-	}
-
-	if (fscanf(fp, "DIST:\n") != EOF)
-	{
-		if (fscanf(fp, "%d", &(hmm.dist)) == EOF)
-			fprintf(stderr, "khmm:ReadCHMM: cannot read DIST from HMM file");
-	}
-	else
-	{
-		// hmm.dist = STATE_CHANGE;
-		//  snp_dist is the default distance between two SNPs in the same state
-		//  (not used in this implementation)
-		//  Set it to 1 to disable the distance model
-		hmm.dist = 1;
-	}
-
-	fclose(fp);
+	// Read pi
+	std::getline(file, line);
+	if (line != "pi:")
+	{
+		throw std::runtime_error("Error reading pi");
+	}
+	hmm.pi = readVector(file, hmm.N);
+	if (hmm.pi.size() != (size_t)hmm.N)
+	{
+		throw std::runtime_error("Error reading pi");
+	}
+
+	// Print pi
+	// std::cout << "pi: ";
+	// for (int i = 0; i < hmm.N; i++)
+	// {
+	// 	std::cout << std::setprecision(10) << hmm.pi[i] << " ";
+	// }
+
+	// Read B1_mean
+	std::getline(file, line);
+	if (line != "B1_mean:")
+	{
+		throw std::runtime_error("Error reading B1_mean");
+	}
+	hmm.B1_mean = readVector(file, hmm.N);
+	if (hmm.B1_mean.size() != (size_t)hmm.N)
+	{
+		throw std::runtime_error("Error reading B1_mean");
+	}
+
+	// Print B1_mean
+	// std::cout << "B1_mean: ";
+	// for (int i = 0; i < hmm.N; i++)
+	// {
+	// 	std::cout << std::setprecision(10) << hmm.B1_mean[i] << " ";
+	// }
+
+	// Read B1_sd
+	std::getline(file, line);
+	if (line != "B1_sd:")
+	{
+		throw std::runtime_error("Error reading B1_sd");
+	}
+	hmm.B1_sd = readVector(file, hmm.N);
+	if (hmm.B1_sd.size() != (size_t)hmm.N)
+	{
+		throw std::runtime_error("Error reading B1_sd");
+	}
+
+	// Print B1_sd
+	// std::cout << "B1_sd: ";
+	// for (int i = 0; i < hmm.N; i++)
+	// {
+	// 	std::cout << std::setprecision(10) << hmm.B1_sd[i] << " ";
+	// }
+
+	// Read B1_uf
+	std::getline(file, line);
+	if (line != "B1_uf:")
+	{
+		throw std::runtime_error("Error reading B1_uf");
+	}
+	std::getline(file, line);
+	try {
+		hmm.B1_uf = std::stod(line);
+	} catch (const std::invalid_argument& e) {
+		throw std::runtime_error("Error reading B1_uf");
+	}
+
+	// Print B1_uf
+	// std::cout << "B1_uf: " << std::setprecision(10) << hmm.B1_uf << std::endl;
+
+	// Read B2_mean
+	std::getline(file, line);
+	if (line != "B2_mean:")
+	{
+		throw std::runtime_error("Error reading B2_mean");
+	}
+	hmm.B2_mean = readVector(file, 5);
+	if (hmm.B2_mean.size() != (size_t)5)
+	{
+		throw std::runtime_error("Error reading B2_mean");
+	}
+
+	// Read B2_sd
+	std::getline(file, line);
+	if (line != "B2_sd:")
+	{
+		throw std::runtime_error("Error reading B2_sd");
+	}
+	hmm.B2_sd = readVector(file, 5);
+	if (hmm.B2_sd.size() != (size_t)5)
+	{
+		throw std::runtime_error("Error reading B2_sd");
+	}
+
+	// Read B2_uf
+	std::getline(file, line);
+	if (line != "B2_uf:")
+	{
+		throw std::runtime_error("Error reading B2_uf");
+	}
+	std::getline(file, line);
+	try {
+		hmm.B2_uf = std::stod(line);
+	} catch (const std::invalid_argument& e) {
+		throw std::runtime_error("Error reading B2_uf");
+	}
+
+	// Print B2_uf
+	// std::cout << "B2_uf: " << std::setprecision(10) << hmm.B2_uf << std::endl;
+
 	return hmm;
 }
+
+std::vector<std::vector<double>> readMatrix(std::ifstream &file, int rows, int cols)
+{
+	// Read rows x cols whitespace-separated values in row-major order, then skip
+	// whatever remains of the current line so the next getline starts on a fresh line.
+	// Throws std::runtime_error if any value cannot be extracted.
+	std::vector<std::vector<double>> values(rows, std::vector<double>(cols));
+	for (std::vector<double> &row : values)
+		for (double &cell : row)
+			if (!(file >> cell))
+			{
+				throw std::runtime_error("Error reading matrix");
+			}
+
+	file.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+	return values;
+}
+
+std::vector<double> readVector(std::ifstream &file, int size)
+{
+	// Read `size` whitespace-separated values, then skip the rest of the current
+	// line. Throws std::runtime_error if any value cannot be extracted.
+	std::vector<double> values(size);
+	for (double &value : values)
+		if (!(file >> value))
+		{
+			throw std::runtime_error("Error reading vector");
+		}
+	file.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
+	return values;
+}
diff --git a/src/main.cpp b/src/main.cpp
new file mode 100644
index 00000000..e1ac4c12
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,55 @@
+
+#include "swig_interface.h"
+#include "input_data.h"
+
+/// @cond DOXYGEN_IGNORE
+#include <iostream>
+#include <string>
+/// @endcond
+
+// Placeholder for ContextSV library includes
+// #include "ContextSV.h"
+
+// Development driver: logs the inputs, fills an InputData (chromosome and region are fixed placeholders) and runs ContextSV.
+void runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1) {
+    // Placeholder for setting up input data and running ContextSV
+    std::cout << "Running ContextSV with the following files:" << std::endl;
+    std::cout << "BAM file: " << bamFile << std::endl;
+    std::cout << "Reference file: " << refFile << std::endl;
+    std::cout << "VCF file: " << vcfFile << std::endl;
+    std::cout << "Output directory: " << outputDir << std::endl;
+
+    // Set up input data. The same BAM path is registered as both the short-read and the long-read input.
+    InputData input_data;
+    input_data.setShortReadBam(bamFile);
+    input_data.setLongReadBam(bamFile);
+    input_data.setRefGenome(refFile);
+    input_data.setSNPFilepath(vcfFile);
+    input_data.setChromosome("21");
+    input_data.setRegion("14486099-14515105");
+    input_data.setAlleleFreqFilepaths("");
+    input_data.setHMMFilepath("");
+    input_data.setOutputDir(outputDir);
+    input_data.saveCNVData(true);
+    input_data.setThreadCount(threadCount);  // caller-supplied value (defaults to 1)
+
+    // Run ContextSV
+    run(input_data);
+}
+
+int main(int argc, char* argv[]) {
+    if (argc != 6) {
+        std::cerr << "Usage: " << argv[0] << " <bam_file> <ref_file> <vcf_file> <output_dir> <thread_count>" << std::endl;
+        return 1;
+    }
+
+    // Validate <thread_count> first: std::stoi throws on non-numeric or overflowing text (at most 9 digits always fits an int).
+    const std::string threadArg = argv[5];
+    if (threadArg.empty() || threadArg.size() > 9 || threadArg.find_first_not_of("0123456789") != std::string::npos) {
+        std::cerr << "Error: invalid thread count: " << threadArg << std::endl;
+        return 1;
+    }
+
+    runContextSV(argv[1], argv[2], argv[3], argv[4], std::stoi(threadArg));
+    return 0;
+}
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index 085e737e..a0611c8d 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -6,7 +6,6 @@
 #include <fstream>
 /// @endcond
 
-
 int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
 {
     // Throw an error if the genotype is not valid
@@ -72,19 +71,28 @@ int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std
 
 void SVData::concatenate(const SVData &sv_data)
 {
+    if (sv_data.sv_calls.empty()) {  // Nothing to merge: report on stderr and leave this object unchanged
+        std::cerr << "Error: SVData object is empty." << std::endl;
+        return;
+    }
+
     // Iterate over the chromosomes in the other SVData object
     for (auto const& chr_sv_calls : sv_data.sv_calls) {
-        std::string chr = chr_sv_calls.first;
+        const auto &chr = chr_sv_calls.first;
+        // Look up this chromosome's call map once (operator[] creates it if absent - assuming sv_calls is a standard map)
+        auto &current_chr_calls = this->sv_calls[chr];
 
         // Iterate over the SV calls in the other SVData object
         for (auto const& sv_call : chr_sv_calls.second) {
 
-            // Add the SV call to the map of candidate locations. Since the region
-            // is unique (per chromosome), there is no need to check if the SV
-            // candidate already exists in the map.
-            SVCandidate candidate = sv_call.first;  // (start, end, alt_allele)
-            SVInfo info = sv_call.second;  // (sv_type, read_support, data_type, sv_length)
-            this->sv_calls[chr][candidate] = info;
+            // Insert the SV call; emplace leaves an existing entry untouched, so a duplicate key keeps the value already stored
+            std::pair<std::map<SVCandidate, SVInfo>::iterator, bool> result = current_chr_calls.emplace(sv_call);
+            bool inserted = result.second;
+
+            // Print a warning to stderr (nothing is thrown) if the SV candidate already exists
+            if (!inserted) {
+                std::cerr << "Warning: SV candidate already exists in the map." << std::endl;
+            }
         }
     }
 }
diff --git a/src/swig_interface.cpp b/src/swig_interface.cpp
index 87334fec..8d2e7a42 100644
--- a/src/swig_interface.cpp
+++ b/src/swig_interface.cpp
@@ -1,7 +1,3 @@
-//
-// Created by jperdomo on 1/8/2023.
-//
-
 #include "swig_interface.h"
 #include "contextsv.h"
 

From 7cac258e88c11252f80227852202e9f33ffde4d7 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 20 Nov 2024 11:47:58 -0500
Subject: [PATCH 018/134] Improve memory management

---
 Makefile-cpp          |   2 +-
 include/cnv_caller.h  |  10 +-
 include/contextsv.h   |   4 +-
 include/fasta_query.h |  12 +-
 include/input_data.h  |   6 +-
 include/sv_caller.h   |  19 +-
 include/sv_data.h     |   2 +-
 include/sv_object.h   |  37 ++++
 python/sv_merger.py   |   5 +-
 src/cnv_caller.cpp    | 206 +++++++++--------
 src/contextsv.cpp     |  18 +-
 src/fasta_query.cpp   |  23 +-
 src/input_data.cpp    |   2 +-
 src/main.cpp          |  43 +++-
 src/snp_info.cpp      |   2 +-
 src/sv_caller.cpp     | 505 +++++++++++++++++++++++++++++++++---------
 src/sv_data.cpp       |   2 +-
 src/sv_object.cpp     | 126 +++++++++++
 18 files changed, 769 insertions(+), 255 deletions(-)
 create mode 100644 include/sv_object.h
 create mode 100644 src/sv_object.cpp

diff --git a/Makefile-cpp b/Makefile-cpp
index cf76256b..e6ba7d30 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -11,7 +11,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 
 # Compiler and Flags
 CXX := g++
-CXXFLAGS := -std=c++11 -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
+CXXFLAGS := -std=c++14 -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 
 # Link htslib
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 80c69f89..22a14cc9 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -9,6 +9,7 @@
 #include "cnv_data.h"
 #include "sv_data.h"
 #include "sv_types.h"
+#include "sv_object.h"
 
 /// @cond
 #include <string>
@@ -47,7 +48,7 @@ struct SNPData {
 // CNVCaller: Detect CNVs and return the state sequence by SNP position
 class CNVCaller {
     private:
-        InputData* input_data;
+        InputData& input_data;
         mutable std::mutex sv_candidates_mtx; // SV candidate map mutex
         mutable std::mutex snp_data_mtx;  // SNP data mutex
         mutable std::mutex hmm_mtx;  // HMM mutex
@@ -86,7 +87,7 @@ class CNVCaller {
         std::pair<SNPData, bool> querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
 
         // Run copy number prediction for a chunk of SV candidates from CIGAR strings
-        void runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, std::vector<SVCandidate> sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map);
+        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map);
 
         void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
 
@@ -102,7 +103,7 @@ class CNVCaller {
         void mergePosDepthMaps(std::unordered_map<uint32_t, int>& main_map, std::unordered_map<uint32_t, int>& map_update);
 
     public:
-        CNVCaller(InputData& input_data);
+        explicit CNVCaller(InputData& input_data);
 
         // Load file data for a chromosome (SNP positions, BAF values, and PFB values)
         void loadChromosomeData(std::string chr);
@@ -112,7 +113,8 @@ class CNVCaller {
         std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
+        // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
+        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length);
 
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
diff --git a/include/contextsv.h b/include/contextsv.h
index b2a5d6e3..56a82a54 100644
--- a/include/contextsv.h
+++ b/include/contextsv.h
@@ -13,10 +13,10 @@
 
 class ContextSV {
 	private:
-		InputData* input_data;
+		InputData& input_data;
 
 	public:
-		ContextSV(InputData& input_data);
+		explicit ContextSV(InputData& input_data);
 
 		// Entry point
 		int run();
diff --git a/include/fasta_query.h b/include/fasta_query.h
index 558728bf..ffa88d8a 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -1,4 +1,4 @@
-// FASTAQuery: A class for querying a FASTA file.
+// ReferenceGenome: A class for querying a reference genome FASTA file.
 
 #ifndef FASTA_QUERY_H
 #define FASTA_QUERY_H
@@ -10,7 +10,7 @@
 #include <vector>
 /// @endcond
 
-class FASTAQuery {
+class ReferenceGenome {
     private:
         std::string fasta_filepath;
         std::vector<std::string> chromosomes;
@@ -18,17 +18,17 @@ class FASTAQuery {
 
     public:
         int setFilepath(std::string fasta_filepath);
-        std::string getFilepath();
-        std::string query(std::string chr, int64_t pos_start, int64_t pos_end);
+        std::string getFilepath() const;
+        std::string query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
 
         // Get the chromosome contig lengths in VCF header format
-        std::string getContigHeader();
+        std::string getContigHeader() const;
 
         // Get the list of chromosomes, used for whole genome analysis
         std::vector<std::string> getChromosomes();
 
         // Get the length of a chromosome
-        int64_t getChromosomeLength(std::string chr);
+        uint32_t getChromosomeLength(std::string chr);
 };
 
 #endif // FASTA_QUERY_H
diff --git a/include/input_data.h b/include/input_data.h
index 1042d664..0a74125f 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -38,8 +38,8 @@ class InputData {
         // Set the filepath to the reference genome FASTA file.
 		void setRefGenome(std::string fasta_filepath);
 
-        // Return a reference to the FASTAQuery object.
-        const FASTAQuery& getRefGenome() const;
+        // Return a reference to the ReferenceGenome object.
+        const ReferenceGenome& getRefGenome() const;
         // FASTAQuery getRefGenome();
 
         // Query the reference genome for a sequence.
@@ -111,7 +111,7 @@ class InputData {
         std::string snp_vcf_filepath;
         std::string ethnicity;
         std::unordered_map<std::string, std::string> pfb_filepaths;  // Map of population frequency VCF filepaths by chromosome
-        FASTAQuery fasta_query;
+        ReferenceGenome fasta_query;
         std::string output_dir;
         int window_size;
         int min_cnv_length;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 0a94b254..371cc53d 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -7,6 +7,8 @@
 #include "input_data.h"
 #include "cnv_data.h"
 #include "sv_data.h"
+#include "sv_object.h"
+#include "fasta_query.h"
 
 #include <htslib/sam.h>
 
@@ -24,38 +26,41 @@ using AlignmentVector = std::vector<AlignmentData>;
 // Query map (query name, alignment vector)
 using PrimaryMap = std::unordered_map<std::string, AlignmentData>;
 using SuppMap = std::unordered_map<std::string, AlignmentVector>;
-using RegionData = std::tuple<SVData, PrimaryMap, SuppMap>;
+// using RegionData = std::tuple<SVData, PrimaryMap, SuppMap>;
 
 class SVCaller {
     private:
         int min_sv_size = 50;       // Minimum SV size to be considered
         int min_mapq = 20;          // Minimum mapping quality to be considered
-        InputData* input_data;
+        InputData& input_data;
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        std::tuple<std::unordered_map<int, int>, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary);
+        std::tuple<std::unordered_map<int, int>, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, bool is_primary);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
-        RegionData detectSVsFromRegion(std::string region);
+        // RegionData detectSVsFromRegion(std::string region);
+        std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> detectCIGARSVs(std::string region);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller);
+        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
         double calculateMismatchRate(std::unordered_map<int, int>& mismatch_map, int32_t start, int32_t end);
 
+        void saveToVCF(const std::unordered_map<std::string, std::set<SVCall>>& sv_calls, const ReferenceGenome& ref_genome);
+
     public:
-        SVCaller(InputData& input_data);
+        explicit SVCaller(InputData& input_data);
 
         // Detect SVs and predict SV type from long read alignments and CNV calls
-        SVData run();
+        std::unordered_map<std::string, std::set<SVCall>> run();
 };
 
 #endif // SV_CALLER_H
diff --git a/include/sv_data.h b/include/sv_data.h
index f4ed6e25..fef815ed 100644
--- a/include/sv_data.h
+++ b/include/sv_data.h
@@ -35,7 +35,7 @@ class SVData {
 
         int getClippedBaseSupport(std::string chr, int64_t pos, int64_t end);
         
-        void saveToVCF(FASTAQuery& ref_genome, std::string output_dir);
+        void saveToVCF(ReferenceGenome& ref_genome, std::string output_dir);
 
         std::map<SVCandidate, SVInfo>& getChromosomeSVs(std::string chr);
 
diff --git a/include/sv_object.h b/include/sv_object.h
new file mode 100644
index 00000000..3128dfe1
--- /dev/null
+++ b/include/sv_object.h
@@ -0,0 +1,37 @@
+#ifndef SV_OBJECT_H
+#define SV_OBJECT_H
+
+#include <vector>
+#include <memory>
+#include <string>
+#include <set>
+#include <stdexcept>
+
+// One structural variant (SV) call: a start/end pair plus its annotation fields.
+struct SVCall {
+    uint32_t start;                  // No default; always set by the constructor
+    uint32_t end;                    // No default; always set by the constructor
+    std::string sv_type{"NA"};
+    std::string alt_allele{"."};
+    std::string data_type{"NA"};
+    std::string genotype{"./."};
+    double hmm_likelihood{0.0};
+    int support{0};
+
+    // Sets every field from the argument of the same name (the only constructor, so the defaults above are never used as-is)
+    SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support)
+        : start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support) {}
+
+    // Ordering used when SVCall is stored in a std::set; declared here, defined elsewhere
+    bool operator<(const SVCall& other) const;
+};
+
+void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
+
+std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count);
+
+uint32_t getSVCount(const std::set<SVCall>& sv_calls);
+
+void concatenateSVCalls(std::set<SVCall>& sv_calls, const std::set<SVCall>& sv_calls_update);
+
+#endif // SV_OBJECT_H
diff --git a/python/sv_merger.py b/python/sv_merger.py
index 172cba6e..4254e0ad 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -130,8 +130,9 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
     # Get the HMM likelihood scores
     hmm_scores = vcf_df['INFO'].str.extract(r'HMM=(-?\d+\.?\d*)', expand=False).astype(float)
 
-    # Set all 0 values to NaN
-    hmm_scores[hmm_scores == 0] = np.nan
+    # Set all 0 values to a low negative value
+    hmm_scores[hmm_scores == 0] = -1e-100
+    # hmm_scores[hmm_scores == 0] = np.nan
 
     # Cluster SV breakpoints using HDBSCAN
     cluster_labels = []
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 951775c6..55e363ae 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -31,6 +31,11 @@
 
 using namespace sv_types;
 
+CNVCaller::CNVCaller(InputData &input_data)
+    : input_data(input_data)  // Store the caller's InputData in the input_data member; nothing else is set up here
+{
+}
+
 // Function to call the Viterbi algorithm for the CHMM
 std::pair<std::vector<int>, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp_data)
 {
@@ -39,7 +44,7 @@ std::pair<std::vector<int>, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp
     {
         throw std::runtime_error("Error: No SNP data found for Viterbi algorithm.");
     }
-    std::lock_guard<std::mutex> lock(this->hmm_mtx);  // Lock the mutex for the HMM
+    // std::lock_guard<std::mutex> lock(this->hmm_mtx);  // Lock the mutex for the HMM
     std::pair<std::vector<int>, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb);
     return state_sequence;
 }
@@ -49,7 +54,7 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
 {
     SNPData snp_data;
     bool snps_found = false;
-    int window_size = this->input_data->getWindowSize();
+    int window_size = this->input_data.getWindowSize();
 
     // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
     for (int64_t i = start_pos; i <= end_pos; i += window_size)
@@ -61,9 +66,9 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
 
         // Get the SNP info for the window
         // std::cout << "Querying SNPs for window " << chr << ":" << window_start << "-" << window_end << "..." << std::endl;
-        this->snp_data_mtx.lock();
+        // this->snp_data_mtx.lock();
         std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> window_snps = snp_info.querySNPs(chr, window_start, window_end);
-        this->snp_data_mtx.unlock();
+        // this->snp_data_mtx.unlock();
         std::vector<int64_t>& snp_window_pos = std::get<0>(window_snps);  // SNP positions
         std::vector<double>& snp_window_bafs = std::get<1>(window_snps);  // B-allele frequencies
         std::vector<double>& snp_window_pfbs = std::get<2>(window_snps);  // Population frequencies of the B allele
@@ -134,6 +139,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     bool sv_snps_found = snp_call.second;
 
     // Run the Viterbi algorithm
+    // printMessage("[TEST] Running Viterbi algorithm for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
     std::pair<std::vector<int>, double> prediction = runViterbi(this->hmm, sv_snps);
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
@@ -179,10 +185,10 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Save the SV calls as a TSV file if enabled
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
-    if (this->input_data->getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000)
+    if (this->input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000)
     {
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
-        std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
+        std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
         std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl;
         this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
     }
@@ -191,54 +197,58 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo> &sv_candidates, int min_length)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length)
 {
-    SNPInfo& snp_info = this->snp_info;
     CHMM& hmm = this->hmm;
-    int window_size = this->input_data->getWindowSize();
-    double mean_chr_cov = this->mean_chr_cov;
-    SNPData snp_data;
-
-   
+    int window_size = this->input_data.getWindowSize();
+    double mean_chr_cov = this->mean_chr_cov;  
     printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
 
     // Create a map with counts for each CNV type
-    std::map<int, int> cnv_type_counts;
-    for (int i = 0; i < 6; i++)
-    {
-        cnv_type_counts[i] = 0;
-    }
-
-    // Split the SV candidates into chunks for each thread
-    int chunk_count = this->input_data->getThreadCount();
-    std::vector<std::vector<SVCandidate>> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count);
-
-    // Loop through each SV chunk and run the copy number prediction in parallel
-    std::vector<std::future<SNPData>> futures;
-    for (const auto& sv_chunk : sv_chunks)
-    {
-        // Run the copy number prediction for the SV chunk
-        std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_candidates), sv_chunk, std::ref(snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map));
-    }
-
-    // Get the SNP data for each SV chunk
-    int current_chunk = 0;
-    for (auto& future : futures)
-    {
-        current_chunk++;
-        SNPData chunk_snp_data = std::move(future.get());
-        if (this->input_data->getVerbose())
-        {
-            printMessage("Finished processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + "...");
-        }
-    }
+    // std::map<int, int> cnv_type_counts;
+    // for (int i = 0; i < 6; i++)
+    // {
+    //     cnv_type_counts[i] = 0;
+    // }
+
+    runCIGARCopyNumberPredictionChunk(chr, sv_candidates, this->snp_info, hmm, window_size, mean_chr_cov, this->pos_depth_map);
+    // // Split the SV candidates into chunks for each thread
+    // int chunk_count = this->input_data.getThreadCount();
+    // // std::vector<std::vector<SVCandidate>> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count);
+    // std::vector<std::set<SVCall>> sv_chunks = splitSVsIntoChunks(sv_candidates, chunk_count);
+
+    // // Loop through each SV chunk and run the copy number prediction in parallel
+    // // std::vector<std::future<SNPData>> futures;
+    // std::vector<std::future<void>> futures;
+    // for (auto& sv_chunk : sv_chunks)
+    // {
+    //     // Run the copy number prediction for the SV chunk
+    //     futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)));
+    //     // futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)));
+    //     // std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, sv_chunk, std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map));
+    // }
+
+    // // Wait for all the futures to finish
+    // int current_chunk = 0;
+    // for (auto& future : futures)
+    // {
+    //     current_chunk++;
+    //     try {
+    //         future.wait();
+    //         // SNPData chunk_snp_data = std::move(future.get());
+    //         if (this->input_data.getVerbose())
+    //         {
+    //             printMessage("Finished processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + "...");
+    //         }
+    //     } catch (const std::exception& e) {
+    //         printError("Error processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + ": " + e.what());
+    //     }
+    // }
 
     printMessage("Finished predicting copy number states for chromosome " + chr + "...");
-
-    return snp_data;
 }
 
-void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, std::vector<SVCandidate> sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map)
+void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map)
 {
     // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "...");
     // Map with counts for each CNV type
@@ -249,23 +259,25 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
     }
     
     // Loop through each SV candidate and predict the copy number state
-    for (const auto& sv_call : sv_chunk)
+    for (auto& sv_call : sv_chunk)
     {
         // Get the SV candidate
-        const SVCandidate& candidate = sv_call;
-        int64_t start_pos = std::get<0>(candidate);
-        int64_t end_pos = std::get<1>(candidate);
+        // const SVCandidate& candidate = sv_call;
+        // int64_t start_pos = std::get<0>(candidate);
+        // int64_t end_pos = std::get<1>(candidate);
+        uint32_t start_pos = sv_call.start;
+        uint32_t end_pos = sv_call.end;
 
         // Skip if not the minimum length for CNV predictions
-        if ((end_pos - start_pos) < this->input_data->getMinCNVLength())
+        if ((int)(end_pos - start_pos) < this->input_data.getMinCNVLength())
         {
             continue;
         }
 
         // Get the depth at the start position. This is used as the FORMAT/DP
         // value in the VCF file
-        int dp_value = pos_depth_map[start_pos];
-        this->updateDPValue(sv_candidates, sv_call, dp_value);
+        // int dp_value = pos_depth_map[start_pos];
+        // this->updateDPValue(sv_candidates, sv_call, dp_value);
 
         // Loop through the SV region +/- 1/2 SV length and run copy number
         // predictions
@@ -273,19 +285,21 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
         int64_t query_start = std::max((int64_t) 1, start_pos - sv_half_length);
         int64_t query_end = end_pos + sv_half_length;
 
-        // printMessage("Querying SNPs for SV " + chr + ":" +
-        // std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) +
-        // "...");
+        // printMessage("Querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", qstart = " + std::to_string(query_start) + ", qend = " + std::to_string(query_end));
         std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, query_start, query_end, snp_info, pos_depth_map, mean_chr_cov);
+        // printMessage("Finished querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         SNPData& sv_snps = snp_call.first;
         bool snps_found = snp_call.second;
 
         // Run the Viterbi algorithm
+        // printMessage("[TEST2] Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
         std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
         std::vector<int>& state_sequence = prediction.first;
         double likelihood = prediction.second;
+        // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
 
         // Get all the states in the SV region
+        // printMessage("Getting states for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
         std::vector<int> sv_states;
         for (size_t i = 0; i < state_sequence.size(); i++)
         {
@@ -318,7 +332,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
         }
 
         // Update the SV calls with the CNV type and genotype
-        SVType cnv_type = getSVTypeFromCNState(max_state);
+        SVType updated_sv_type = getSVTypeFromCNState(max_state);
         std::string genotype = cnv_genotype_map[max_state];
 
         // Determine the SV calling method used to call the SV
@@ -331,20 +345,31 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map<SVCa
             data_type = "Log2CNV";
         }
 
-        // Update the SV copy number data
-        this->updateSVCopyNumber(sv_candidates, sv_call, cnv_type, data_type, genotype, likelihood);
+        // Update the SV copy number data if not unknown
+        // printMessage("Updating SV copy number data for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
+        if (updated_sv_type != SVType::UNKNOWN)
+        {
+            std::string sv_type_str = getSVTypeString(updated_sv_type);
+            addSVCall(sv_chunk, sv_call.start, sv_call.end, sv_type_str, ".", data_type, genotype, likelihood);
+            // std::string sv_type_str = getSVTypeString(updated_sv_type);
+            // sv_call.sv_type = sv_type_str;
+            // sv_call.data_type += "," + data_type;
+            // sv_call.genotype = genotype;
+            // sv_call.hmm_likelihood = likelihood;
+        }
+        // this->updateSVCopyNumber(sv_candidates, sv_call, cnv_type, data_type, genotype, likelihood);
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
         // known, and the length is greater than 10 kb
-        SVType updated_sv_type = sv_candidates[sv_call].sv_type;
-        if (this->input_data->getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000)
+        // SVType updated_sv_type = sv_candidates[sv_call].sv_type;
+        if (this->input_data.getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000)
         {
             // Add the state sequence to the SNP data (avoid copying the data)
             sv_snps.state_sequence = std::move(state_sequence);
 
             // Save the SV calls as a TSV file
             std::string cnv_type_str = getSVTypeString(updated_sv_type);
-            std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv";
+            std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv";
             // std::cout << "Saving SV CIGAR copy number predictions to " <<
             // sv_filename << std::endl;
             printMessage("Saving SV CIGAR copy number predictions to " + sv_filename);
@@ -442,14 +467,9 @@ std::vector<std::vector<SVCandidate>> CNVCaller::splitSVCandidatesIntoChunks(std
     return sv_chunks;
 }
 
-CNVCaller::CNVCaller(InputData &input_data)
-{
-    this->input_data = &input_data;
-}
-
 void CNVCaller::loadChromosomeData(std::string chr)
 {
-    std::string hmm_filepath = this->input_data->getHMMFilepath();
+    std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     this->hmm = ReadCHMM(hmm_filepath.c_str());
 
@@ -459,7 +479,7 @@ void CNVCaller::loadChromosomeData(std::string chr)
     this->mean_chr_cov = mean_chr_cov;
 
     std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl;
-    std::string snp_filepath = this->input_data->getSNPFilepath();
+    std::string snp_filepath = this->input_data.getSNPFilepath();
     readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info);
 
     std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl;
@@ -471,16 +491,21 @@ void CNVCaller::loadChromosomeData(std::string chr)
 double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
 {
 
+	bool test=true;
+	if (test) {
+		return 30.0;
+	}
+
     // Use a maximum of 8 threads to avoid overloading the system with too many
     // parallel processes
-    int num_threads = this->input_data->getThreadCount();
+    int num_threads = this->input_data.getThreadCount();
     if (num_threads > 8)
     {
         num_threads = 8;
     }
 
     // Split the chromosome into equal parts for each thread
-    uint32_t chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
+    uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
     if (chr_len == 0)
     {
     	printError("ERROR: Chromosome length is zero for: " + chr);
@@ -497,7 +522,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
     uint32_t pos_count = 0;
     uint64_t cum_depth = 0;
     std::vector<std::future<std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>>>> futures;
-    std::string input_filepath = this->input_data->getShortReadBam();
+    std::string input_filepath = this->input_data.getShortReadBam();
     for (const auto& region_chunk : region_chunks)
     {
         // Create a lambda function to get the mean chromosome coverage for the
@@ -638,7 +663,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
     // Check that the SNP file is sorted by running bcftools index and reading
     // the error output
     std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error";
-    if (this->input_data->getVerbose()) {
+    if (this->input_data.getVerbose()) {
         std::cout << "Command: " << index_cmd << std::endl;
     }
 
@@ -661,33 +686,33 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
     pclose(index_fp);  // Close the process
 
     // Filter variants by depth, quality, and region
-    if (this->input_data->getVerbose()) {
+    if (this->input_data.getVerbose()) {
         std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl;
     }
 
     // Check if a region was specified by the user
     std::string region_str = chr;
-    if (this->input_data->isRegionSet())
+    if (this->input_data.isRegionSet())
     {
-        std::pair<int32_t, int32_t> region = this->input_data->getRegion();
+        std::pair<int32_t, int32_t> region = this->input_data.getRegion();
         region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second);
     }
 
-    std::string filtered_snp_vcf_filepath = this->input_data->getOutputDir() + "/filtered_snps.vcf";
+    std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf";
     std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
-    if (this->input_data->getVerbose()) {
+    if (this->input_data.getVerbose()) {
         std::cout << "Filtering SNPs by depth and quality..." << std::endl;
         std::cout << "Command: " << cmd << std::endl;
     }
     system(cmd.c_str());
     
-    if (this->input_data->getVerbose()) {
+    if (this->input_data.getVerbose()) {
         std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl;
     }
 
     // Extract B-allele frequency data from the VCF file and sort by chromosome
     // and position
-    if (this->input_data->getVerbose()) {
+    if (this->input_data.getVerbose()) {
         std::cout << "Extracting B-allele frequency data from filtered SNPs..." << std::endl;
     }
     cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null";
@@ -748,7 +773,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
 
     pclose(fp);  // Close the process
 
-    if (this->input_data->getVerbose()) {
+    if (this->input_data.getVerbose()) {
         std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl;
     }
 }
@@ -756,7 +781,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
 void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
 {
     // Get the population frequency file for the chromosome
-    std::string pfb_filepath = this->input_data->getAlleleFreqFilepath(chr);
+    std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
     if (pfb_filepath == "")
     {
         std::cout << "No population frequency file provided for chromosome " << chr << std::endl;
@@ -765,9 +790,9 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
 
     // Determine the ethnicity-specific allele frequency key
     std::string AF_key = "AF";
-    if (this->input_data->getEthnicity() != "")
+    if (this->input_data.getEthnicity() != "")
     {
-        AF_key += "_" + this->input_data->getEthnicity();
+        AF_key += "_" + this->input_data.getEthnicity();
     }
 
     // Check if the filepath uses the 'chr' prefix notations based on the
@@ -804,10 +829,10 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
     std::pair<int64_t, int64_t> snp_range = snp_info.getSNPRange(chr);
     int64_t snp_start = snp_range.first;
     int64_t snp_end = snp_range.second;
-    if (this->input_data->isRegionSet())
+    if (this->input_data.isRegionSet())
     {
         // Get the user-defined region
-        std::pair<int32_t, int32_t> region = this->input_data->getRegion();
+        std::pair<int32_t, int32_t> region = this->input_data.getRegion();
         if (snp_start < region.first) {
             snp_start = region.first;
         } else if (snp_end > region.second) {
@@ -817,7 +842,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
 
     // Use a maximum of 8 threads to avoid overloading the system with too many
     // processes
-    int num_threads = this->input_data->getThreadCount();
+    int num_threads = this->input_data.getThreadCount();
     if (num_threads > 8)
     {
         num_threads = 8;
@@ -876,8 +901,9 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
         };
 
         // Create a future for the thread
-        std::future<std::unordered_map<int, double>> future = std::async(std::launch::async, get_pfb);
-        futures.push_back(std::move(future));
+        futures.emplace_back(std::async(std::launch::async, get_pfb));
+        // std::future<std::unordered_map<int, double>> future = std::async(std::launch::async, get_pfb);
+        // futures.push_back(std::move(future));
     }
 
     // Loop through the futures and get the results
@@ -895,9 +921,9 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
             double pfb = pair.second;
 
             // Add the population frequency to the SNPInfo
-            this->snp_data_mtx.lock();
+            // this->snp_data_mtx.lock();
             snp_info.insertSNPPopulationFrequency(chr_snp, pos, pfb);
-            this->snp_data_mtx.unlock();
+            // this->snp_data_mtx.unlock();
             pfb_count++;
 
             // [TEST] Print 15 values
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 4f35c052..1e22b650 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -13,20 +13,22 @@
 /// @endcond
 
 ContextSV::ContextSV(InputData& input_data)
+    : input_data(input_data)  // Initialize the input data
 {
-    this->input_data = &input_data;
 }
 
-// Entry point
 int ContextSV::run()
 {
-    FASTAQuery ref_genome = this->input_data->getRefGenome();  // Load the reference genome
-    SVCaller sv_caller(*this->input_data);  // Create an SV caller object
-    SVData sv_calls = sv_caller.run();  // Run the SV caller
-    std::string output_dir = this->input_data->getOutputDir();  // Get the output directory
+    ReferenceGenome ref_genome = this->input_data.getRefGenome();  // Load the reference genome
+    SVCaller sv_caller(this->input_data);  // Create an SV caller object
+    // SVCaller sv_caller(*this->input_data);  // Create an SV caller object
+    // SVData sv_calls = sv_caller.run();  // Run the SV caller
+    std::unordered_map<std::string, std::set<SVCall>> sv_calls = sv_caller.run();  // Run the SV caller
+    // std::string output_dir = this->input_data->getOutputDir();  // Get the output directory
     
-    std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl;
-    sv_calls.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
+    // std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl;
+    // sv_caller.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
+    // sv_calls.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
     std::cout << "SV calling complete." << std::endl;
 
     return 0;
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index e1bc9bea..ee220d1c 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -12,7 +12,7 @@
 /// @endcond
 
 
-int FASTAQuery::setFilepath(std::string fasta_filepath)
+int ReferenceGenome::setFilepath(std::string fasta_filepath)
 {
     if (fasta_filepath == "")
     {
@@ -92,13 +92,13 @@ int FASTAQuery::setFilepath(std::string fasta_filepath)
     return 0;
 }
 
-std::string FASTAQuery::getFilepath()
+std::string ReferenceGenome::getFilepath() const
 {
     return this->fasta_filepath;
 }
 
 // Function to get the reference sequence at a given position range
-std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_end)
+std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
 {    
     // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
     pos_start--;
@@ -110,15 +110,17 @@ std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_en
     {
         return "";
     }
-    if (pos_end >= (int64_t)this->chr_to_seq[chr].length())
+    // if (pos_end >= (uint32_t)this->chr_to_seq[chr].length())
+    if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length())
     {
         return "";
     }
 
-    int64_t length = pos_end - pos_start + 1;
+    uint32_t length = pos_end - pos_start + 1;
     
     // Get the sequence
-    const std::string& sequence = this->chr_to_seq[chr];
+    const std::string& sequence = this->chr_to_seq.at(chr);
+    // const std::string& sequence = this->chr_to_seq[chr];
 
     // Get the substring
     // std::string subsequence = sequence.substr(pos_start, length);
@@ -133,7 +135,7 @@ std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_en
 }
 
 // Function to get the chromosome contig lengths in VCF header format
-std::string FASTAQuery::getContigHeader()
+std::string ReferenceGenome::getContigHeader() const
 {
     std::string contig_header = "";
 
@@ -149,7 +151,8 @@ std::string FASTAQuery::getContigHeader()
     for (auto const& chr : chromosomes)
     {
         // Add the contig header line
-        contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq[chr].length()) + ">\n";
+        contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq.at(chr).length()) + ">\n";
+        // contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq[chr].length()) + ">\n";
     }
 
     // Remove the last newline character
@@ -158,12 +161,12 @@ std::string FASTAQuery::getContigHeader()
     return contig_header;
 }
 
-std::vector<std::string> FASTAQuery::getChromosomes()
+std::vector<std::string> ReferenceGenome::getChromosomes()
 {
     return this->chromosomes;
 }
 
-int64_t FASTAQuery::getChromosomeLength(std::string chr)
+uint32_t ReferenceGenome::getChromosomeLength(std::string chr)
 {
     return this->chr_to_seq[chr].length();
 }
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 99a4cade..186e4617 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -90,7 +90,7 @@ void InputData::setRefGenome(std::string fasta_filepath)
     this->fasta_query.setFilepath(fasta_filepath);
 }
 
-const FASTAQuery &InputData::getRefGenome() const
+const ReferenceGenome& InputData::getRefGenome() const
 {
     return this->fasta_query;
 }
diff --git a/src/main.cpp b/src/main.cpp
index e1ac4c12..558e2493 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -10,12 +10,14 @@
 // Placeholder for ContextSV library includes
 // #include "ContextSV.h"
 
-void runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1) {
+void runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1, const std::string& hmmFile = "", int windowSize = 2500, int minCNV = 2500, const std::string& eth = "", const std::string& pfbFile = "")
+{
     // Placeholder for setting up input data and running ContextSV
     std::cout << "Running ContextSV with the following files:" << std::endl;
     std::cout << "BAM file: " << bamFile << std::endl;
     std::cout << "Reference file: " << refFile << std::endl;
     std::cout << "VCF file: " << vcfFile << std::endl;
+    std::cout << "Thread count: " << threadCount << std::endl;
     std::cout << "Output directory: " << outputDir << std::endl;
 
     // Set up input data
@@ -24,21 +26,23 @@ void runContextSV(const std::string& bamFile, const std::string& refFile, const
     input_data.setLongReadBam(bamFile);
     input_data.setRefGenome(refFile);
     input_data.setSNPFilepath(vcfFile);
-    input_data.setChromosome("21");
-    input_data.setRegion("14486099-14515105");
-    input_data.setThreadCount(1);
-    input_data.setAlleleFreqFilepaths("");
-    input_data.setHMMFilepath("");
+    //input_data.setChromosome("21");
+    //input_data.setRegion("14486099-14515105");
+    input_data.setThreadCount(threadCount);
+    input_data.setAlleleFreqFilepaths(pfbFile);
+    input_data.setHMMFilepath(hmmFile);
     input_data.setOutputDir(outputDir);
-    input_data.saveCNVData(true);
+    input_data.saveCNVData(false);
     input_data.setThreadCount(threadCount);
+    input_data.setWindowSize(windowSize);
+    input_data.setMinCNVLength(minCNV);
 
     // Run ContextSV
     run(input_data);
 }
 
 int main(int argc, char* argv[]) {
-    if (argc != 6) {
+    if (argc < 6) {
         std::cerr << "Usage: " << argv[0] << " <bam_file> <ref_file> <vcf_file> <output_dir> <thread_count>" << std::endl;
         return 1;
     }
@@ -48,8 +52,29 @@ int main(int argc, char* argv[]) {
     std::string vcfFile = argv[3];
     std::string outputDir = argv[4];
     int threadCount = std::stoi(argv[5]);
+
+    std::string hmmFile = "";
+    int windowSize = 2500;
+    int minCNV = 2500;
+    std::string eth = "";
+    std::string pfbFile = "";
+    if (argc == 11) {
+        hmmFile = argv[6];
+        windowSize = std::stoi(argv[7]);
+        minCNV = std::stoi(argv[8]);
+        eth = argv[9];
+        pfbFile = argv[10];
+    }
+    
+    runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, hmmFile, windowSize, minCNV, eth, pfbFile);
+    
+    //std::string hmmFile = argv[6];
+    //int windowSize = std::stoi(argv[7]);
+    //int minCNV = std::stoi(argv[8]);
+    //std::string eth = argv[9];
+    //std::string pfbFile = argv[10];
     
-    runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount);
+    //runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, "", 2500, 2500, "", "");
 
     return 0;
 }
diff --git a/src/snp_info.cpp b/src/snp_info.cpp
index 90045402..36efeb4b 100644
--- a/src/snp_info.cpp
+++ b/src/snp_info.cpp
@@ -52,7 +52,7 @@ void SNPInfo::insertSNPPopulationFrequency(std::string chr, int64_t pos, double
 std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> SNPInfo::querySNPs(std::string chr, int64_t start, int64_t end)
 {
     // Lock the mutex for reading SNP information
-    std::lock_guard<std::mutex> lock(this->snp_info_mtx);
+    // std::lock_guard<std::mutex> lock(this->snp_info_mtx);
 
     chr = removeChrPrefix(chr);
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3cc41aed..73b6cfea 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -17,6 +17,7 @@
 #include <future>
 #include <cmath>
 #include <algorithm>
+#include <fstream>
 
 #include "utils.h"
 #include "sv_types.h"
@@ -24,16 +25,22 @@
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
 
+SVCaller::SVCaller(InputData &input_data)
+    : input_data(input_data)  // Initialize the input data
+{
+}
+
 int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 {
     int ret = sam_itr_next(fp_in, itr, bam1);
     return ret;
 }
 
-RegionData SVCaller::detectSVsFromRegion(std::string region)
+// RegionData SVCaller::detectSVsFromRegion(std::string region)
+std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(std::string region)
 {
     // Open the BAM file
-    std::string bam_filepath = this->input_data->getLongReadBam();
+    std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (fp_in == NULL) {
         std::cerr << "ERROR: failed to open " << bam_filepath << std::endl;
@@ -42,78 +49,87 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
 
     // Load the header for the BAM file
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
-    if (bamHdr == NULL) {
-        std::cerr << "ERROR: failed to read header for " << bam_filepath << std::endl;
-        exit(1);
+    if (!bamHdr) {
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to read header for " + bam_filepath);
     }
 
     // Load the index for the BAM file
     hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
-    if (idx == NULL) {
-        std::cerr << "ERROR: failed to load index for " << bam_filepath << std::endl;
-        exit(1);
+    if (!idx) {
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
     }
 
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
+    if (!bam1) {
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to initialize BAM record");
+    }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
+    if (!itr) {
+        bam_destroy1(bam1);
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to query region " + region);
+    }
 
     // Main loop to process the alignments
-    SVData sv_calls;
+    // SVData sv_calls;
+    std::set<SVCall> sv_calls;
     int num_alignments = 0;
     PrimaryMap primary_alignments;
     SuppMap supplementary_alignments;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
-        // Skip secondary and unmapped alignments, duplicates, and QC failures
-        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL) {
-            // Do nothing
-
-        // Skip alignments with low mapping quality
-        } else if (bam1->core.qual < this->min_mapq) {
-            // Do nothing
-
-        } else {
-            std::string qname = bam_get_qname(bam1);  // Query template name
-
-            // Process primary alignments
-            if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-
-                // Get the primary alignment information
-                std::string chr = bamHdr->target_name[bam1->core.tid];
-                int64_t start = bam1->core.pos;
-                int64_t end = bam_endpos(bam1);  // This is the first position after the alignment
-                bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
-
-                // Call SVs directly from the CIGAR string
-                std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
-                std::unordered_map<int, int> match_map = std::get<0>(query_info);
-                int32_t query_start = std::get<1>(query_info);
-                int32_t query_end = std::get<2>(query_info);
-
-                // Add the primary alignment to the map
-                AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand);
-                primary_alignments[qname] = std::move(alignment);
-
-            // Process supplementary alignments
-            } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-
-                // Get the supplementary alignment information
-                std::string chr = bamHdr->target_name[bam1->core.tid];
-                int32_t start = bam1->core.pos;
-                int32_t end = bam_endpos(bam1);
-                bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
-
-                // Get CIGAR string information, but don't call SVs
-                std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
-                const std::unordered_map<int, int>& match_map = std::get<0>(query_info);
-                int32_t query_start = std::get<1>(query_info);
-                int32_t query_end = std::get<2>(query_info);
-
-                // Add the supplementary alignment to the map
-                AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand);
-                supplementary_alignments[qname].emplace_back(alignment);
-            }
+        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
+        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
+            continue;
+        }
+        const std::string qname = bam_get_qname(bam1);  // Query template name
+
+        // Process primary alignments
+        if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+
+            // Get the primary alignment information
+            std::string chr = bamHdr->target_name[bam1->core.tid];
+            int64_t start = bam1->core.pos;
+            int64_t end = bam_endpos(bam1);  // This is the first position after the alignment
+            bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
+
+            // Call SVs directly from the CIGAR string
+            std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
+            std::unordered_map<int, int> match_map = std::get<0>(query_info);
+            int32_t query_start = std::get<1>(query_info);
+            int32_t query_end = std::get<2>(query_info);
+
+            // Add the primary alignment to the map
+            AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand);
+            primary_alignments[qname] = alignment;
+
+        // Process supplementary alignments
+        } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
+
+            // Get the supplementary alignment information
+            std::string chr = bamHdr->target_name[bam1->core.tid];
+            int32_t start = bam1->core.pos;
+            int32_t end = bam_endpos(bam1);
+            bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
+
+            // Get CIGAR string information, but don't call SVs
+            std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
+            const std::unordered_map<int, int>& match_map = std::get<0>(query_info);
+            int32_t query_start = std::get<1>(query_info);
+            int32_t query_end = std::get<2>(query_info);
+
+            // Add the supplementary alignment to the map
+            AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand);
+            supplementary_alignments[qname].emplace_back(alignment);
         }
 
         num_alignments++;
@@ -121,12 +137,11 @@ RegionData SVCaller::detectSVsFromRegion(std::string region)
 
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-    sam_close(fp_in);
-    bam_hdr_destroy(bamHdr);
     hts_idx_destroy(idx);
+    bam_hdr_destroy(bamHdr);
+    sam_close(fp_in);
 
-    // Return the SV calls and the primary and supplementary alignments
-    return std::make_tuple(std::move(sv_calls), std::move(primary_alignments), std::move(supplementary_alignments));
+    return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
 }
 
 double SVCaller::calculateMismatchRate(std::unordered_map<int, int> &match_map, int32_t start, int32_t end)
@@ -147,12 +162,7 @@ double SVCaller::calculateMismatchRate(std::unordered_map<int, int> &match_map,
     return mismatch_rate;
 }
 
-SVCaller::SVCaller(InputData &input_data)
-{
-    this->input_data = &input_data;
-}
-
-std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary)
+std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, bool is_primary)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     int32_t pos = alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
@@ -170,7 +180,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     int32_t query_start = 0;  // First alignment position in the query
     int32_t query_end = 0;    // Last alignment position in the query
     bool first_op = false;  // First alignment operation for the query
-    double default_lh = std::numeric_limits<double>::lowest();  // Default likelihood
+    double default_lh = 0.0;
     // double default_lh = std::numeric_limits<double>::quiet_NaN();  // Default likelihood
     for (int i = 0; i < cigar_len; i++) {
 
@@ -205,7 +215,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
                     // Get the string for the window (1-based coordinates)
                     ins_ref_pos = j + 1;
-                    std::string window_str = this->input_data->queryRefGenome(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
+                    std::string window_str = this->input_data.queryRefGenome(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
 
                     // Continue if the window string is empty (out-of-range)
                     if (window_str == "") {
@@ -228,13 +238,31 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
                     }
                 }
 
+                // Determine whether to use a symbolic allele (>50bp) or the
+                // actual sequence
+                if (op_len > 50) {
+                    ins_seq_str = "<INS>";
+                } else {
+                    ins_seq_str = ins_seq_str;
+                }
+
                 // Add to SV calls (1-based) with the appropriate SV type
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 if (is_duplication) {
-                    sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", default_lh);
+                    // sv_calls.add(chr, ref_pos, ref_end, SVType::DUP,
+                    // ins_seq_str, "CIGARDUP", "./.", default_lh);
+                    //printMessage("[TEST] FOUND CIGAR DUP");
+                    // sv_calls.insert(SVCall{(uint32_t)ref_pos,
+                    // (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.",
+                    // default_lh});
+                    addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh);
                 } else {
-                    sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh);
+                    // sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh);
+                    // sv_calls.insert(SVCall{(uint32_t)ref_pos,
+                    // (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.",
+                    // default_lh});
+                    addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh);
                 }
             }
 
@@ -246,13 +274,17 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
             {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", default_lh);  // Add to SV calls (1-based)
+                // sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".",
+                // "CIGARDEL", "./.", default_lh);  // Add to SV calls (1-based)
+                // sv_calls.insert(SVCall{(uint32_t)ref_pos, (uint32_t)ref_end,
+                // "DEL", ".", "CIGARDEL", "./.", default_lh});
+                addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh);
             }
 
         // Check if the CIGAR operation is a clipped base
         } else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) {
 
-            sv_calls.updateClippedBaseSupport(chr, pos);  // Update clipped base support
+            // sv_calls.updateClippedBaseSupport(chr, pos);  // Update clipped base support
 
             // Update the query alignment start position
             if (!first_op) {
@@ -280,7 +312,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
             // Get the corresponding reference sequence
             int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-            std::string cmatch_ref_str = this->input_data->queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
+            std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
 
             // Check that the two sequence lengths are equal
             if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
@@ -325,37 +357,42 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     return std::tuple<std::unordered_map<int, int>, int32_t, int32_t>(query_match_map, query_start, query_end);
 }
 
-SVData SVCaller::run()
+std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
 {
     // Get the chromosomes to process
     std::vector<std::string> chromosomes;
-    if (this->input_data->getChromosome() != "") {
-        chromosomes.push_back(this->input_data->getChromosome());
+    if (this->input_data.getChromosome() != "") {
+        chromosomes.push_back(this->input_data.getChromosome());
     } else {
-        chromosomes = this->input_data->getRefGenomeChromosomes();
+        chromosomes = this->input_data.getRefGenomeChromosomes();
     }
 
     // [TEST] Only process the last N chromosomes
-    // last_n = 10;
+    // int last_n = 3;
     // chromosomes = std::vector<std::string>(chromosomes.end()-last_n, chromosomes.end());
-    // chromosomes = std::vector<std::string>(chromosomes.end()-3, chromosomes.end());
+    // std::cout << "[DEBUG] Running last " << last_n << " chromosomes" << std::endl;
+    // //chromosomes = std::vector<std::string>(chromosomes.end()-3, chromosomes.end());
 
     // Loop through each region and detect SVs in chunks
     int chr_count = chromosomes.size();
     int current_chr = 0;
     std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
     int chunk_count = 100;  // Number of chunks to split the chromosome into
-    SVData sv_calls;
-    int min_cnv_length = this->input_data->getMinCNVLength();
+    // SVData sv_calls;
+    // std::vector<std::map<SVCandidate, SVInfo>> sv_calls;
+    // std::unordered_map<std::string, std::map<uint32_t, uint32_t>> sv_calls;
+    uint32_t total_sv_count = 0;
+    std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
+    int min_cnv_length = this->input_data.getMinCNVLength();
     for (const auto& chr : chromosomes) {
         std::cout << "Running SV detection for chromosome " << chr << "..." << std::endl;
 
         // Split the chromosome into chunks
         std::vector<std::string> region_chunks;
-        if (this->input_data->isRegionSet()) {
+        if (this->input_data.isRegionSet()) {
 
             // Use one chunk for the specified region
-            std::pair<int32_t, int32_t> region = this->input_data->getRegion();
+            std::pair<int32_t, int32_t> region = this->input_data.getRegion();
             int region_start = region.first;
             int region_end = region.second;
             std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
@@ -363,7 +400,7 @@ SVData SVCaller::run()
             std::cout << "Using specified region " << chunk << "..." << std::endl;
             
         } else {
-            int chr_len = this->input_data->getRefGenomeChromosomeLength(chr);
+            int chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
             int chunk_size = std::ceil((double)chr_len / chunk_count);
             for (int i = 0; i < chunk_count; i++) {
                 int start = i * chunk_size + 1;  // 1-based
@@ -379,58 +416,86 @@ SVData SVCaller::run()
 
         // Load chromosome data for copy number predictions
         std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
-        CNVCaller cnv_caller(*this->input_data);
+        CNVCaller cnv_caller(this->input_data);
         cnv_caller.loadChromosomeData(chr);
 
         // Process each chunk one at a time
         std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
         int region_count = region_chunks.size();
         int current_region = 0;
+        std::set<SVCall> combined_sv_calls;
         for (const auto& sub_region : region_chunks) {
             // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl;
-            RegionData region_data = this->detectSVsFromRegion(sub_region);
-            SVData& sv_calls_region = std::get<0>(region_data);
+            std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region);
+            std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
             PrimaryMap& primary_map = std::get<1>(region_data);
             SuppMap& supp_map = std::get<2>(region_data);
-            int region_sv_count = sv_calls_region.totalCalls();
-            if (region_sv_count > 0) {
-                std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl;
-            }
+            // SVData& subregion_sv_calls = std::get<0>(region_data);
+            // PrimaryMap& primary_map = std::get<1>(region_data);
+            // SuppMap& supp_map = std::get<2>(region_data);
+            // int region_sv_count = subregion_sv_calls.totalCalls();
+            // if (region_sv_count > 0) {
+            //     std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl;
+            // }
+            // int region_sv_count = subregion_sv_calls.count();
+            int region_sv_count = getSVCount(subregion_sv_calls);
+            printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
             // Run copy number variant predictions on the SVs detected from the
             // CIGAR string, using a minimum CNV length threshold
             // std::cout << "Detecting copy number variants from CIGAR string SVs..." << std::endl;
-            std::map<SVCandidate, SVInfo>& cigar_svs = sv_calls_region.getChromosomeSVs(chr);
-            if (cigar_svs.size() > 0) {
+            // std::map<SVCandidate, SVInfo>& cigar_svs = subregion_sv_calls.getChromosomeSVs(chr);
+            // if (cigar_svs.size() > 0) {
+            if (region_sv_count > 0) {
                 std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
-                cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, min_cnv_length);
+                // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs,
+                // min_cnv_length);
+                cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length);
             }
 
-            // Run split-read SV detection in a single thread, combined with
-            // copy number variant predictions
+            // Run split-read SV and copy number variant predictions
             std::cout << "Detecting copy number variants from split reads..." << std::endl;
-            this->detectSVsFromSplitReads(sv_calls_region, primary_map, supp_map, cnv_caller);
-            sv_calls.concatenate(sv_calls_region);  // Add the calls to the main set
+            this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller);
+            // sv_calls.concatenate(subregion_sv_calls);  // Add the calls to the
+            // main set
+            // sv_calls.emplace_back(subregion_sv_calls);
+
+            // Combine the SV calls from the current region
+            std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
+            concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
             std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl;
+
+            // [TEST] Break after the first region
+            // std::cout << "[DEBUG] Breaking after the first region" << std::endl;
+            // break;
         }
 
         std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." << std::endl;
+        int chr_sv_count = getSVCount(combined_sv_calls);
+        whole_genome_sv_calls[chr] = combined_sv_calls;
+        std::cout << "Total SVs detected for chromosome " << chr << ": " << chr_sv_count << std::endl;
+        total_sv_count += chr_sv_count;
+        std::cout << "Cumulative total SVs: " << total_sv_count << std::endl;
         // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl;
     }
-    
+
+    // SVData sv_calls_combined;
+    // for (const auto& subregion_sv_calls : sv_calls) {
+    //     sv_calls_combined.concatenate(subregion_sv_calls);
+    // }
 
     std::cout << "SV calling completed." << std::endl;
 
-    return sv_calls;
+    return whole_genome_sv_calls;
 }
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller)
+void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller)
 {
     // Find split-read SV evidence
     int sv_count = 0;
-    int min_cnv_length = this->input_data->getMinCNVLength();
+    int min_cnv_length = this->input_data.getMinCNVLength();
     for (const auto& entry : primary_map) {
         std::string qname = entry.first;
         AlignmentData primary_alignment = entry.second;
@@ -688,7 +753,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
 
             // Continue if unknown SV type
             if (chosen_type == SVType::UNKNOWN) {
-                std::cerr << "ERROR: Unknown SV type" << std::endl;
+                // std::cerr << "ERROR: Unknown SV type" << std::endl;
                 continue;
             }
 
@@ -820,7 +885,9 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
             }
 
             // Add the best split alignment as the SV call
-            sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_split_aln_lh_norm);
+            // sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".",
+            // "SPLITREAD", "./.", best_split_aln_lh_norm);
+            std::string sv_type_str = getSVTypeString(best_supp_type);
             sv_count++;
         } else {
             // Resolve complex SVs
@@ -851,14 +918,18 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                     std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str;
 
                     // Add the complex SV call
-                    sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
+                    addSVCall(sv_calls, (uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm);
+                    // sv_calls.insert(SVCall{(uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm});
+                    // sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
                     sv_count++;
                 } else {
                     // [primary] -- [supp_start] -- [supp_end]
                     std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str;
 
                     // Add the complex SV call
-                    sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
+                    addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm);
+                    // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm});
+                    // sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
                     sv_count++;
                 }
             } else {
@@ -929,7 +1000,11 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
                     // Add the complex SV call if not empty
                     if (complex_sv_type_str != "") {
                         std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl;
-                        sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
+                        // sv_calls.add(primary_chr, primary_start,
+                        // std::get<2>(largest_supp_alignment), SVType::COMPLEX,
+                        // ".", complex_sv_type_str, "./.", complex_lh_norm);
+                        // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm});
+                        addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm);
                         sv_count++;
                     }
                 }                
@@ -942,3 +1017,215 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map
         std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl;
     }
 }
+
+/// Write the VCF meta-information lines and the column header line.
+///
+/// @param vcf_stream    Stream the header is written to.
+/// @param ref_fp        Reference genome filepath for the `##reference` line.
+/// @param contig_header Contig header text obtained from the reference genome.
+static void writeVCFHeader(std::ostream& vcf_stream, const std::string& ref_fp, const std::string& contig_header)
+{
+    // Add the file format
+    vcf_stream << "##fileformat=VCFv4.2" << '\n';
+
+    // Add the date
+    time_t rawtime;
+    char buffer[80];
+    time (&rawtime);
+    struct tm * timeinfo = localtime(&rawtime);
+    strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo);
+    vcf_stream << "##fileDate=" << buffer << '\n';
+
+    // Add source
+    vcf_stream << "##source=ContextSV" << '\n';
+
+    // Add the reference, contig, INFO, FILTER, and FORMAT metadata lines
+    const std::vector<std::string> header_lines = {
+        std::string("##reference=") + ref_fp,
+        contig_header,
+        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
+        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
+        "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
+        "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
+        "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Alignment type used to call the structural variant\">",
+        "##INFO=<ID=CLIPSUP,Number=1,Type=Integer,Description=\"Clipped base support at the start and end positions\">",
+        "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
+        "##INFO=<ID=REPTYPE,Number=1,Type=String,Description=\"Repeat type\">",
+        "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
+        "##FILTER=<ID=PASS,Description=\"All filters passed\">",
+        "##FILTER=<ID=LowQual,Description=\"Low quality\">",
+        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
+        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">"
+    };
+    for (const auto &line : header_lines) {
+        vcf_stream << line << '\n';
+    }
+
+    // Add the column header line
+    vcf_stream << "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE" << '\n';
+}
+
+/// Write all SV calls to `<output dir>/output.vcf` in VCFv4.2 format.
+///
+/// Calls whose type is "UNKNOWN" or "NEUTRAL" are counted as skipped and are
+/// not written. Deletions are written with the deleted sequence plus the
+/// preceding base as REF and the preceding base as ALT; every other type uses
+/// the preceding base as REF, and insertions are re-anchored so that POS and
+/// END both equal the preceding base position. When the reference query
+/// returns no sequence, REF falls back to "N" so that the column is never
+/// empty.
+///
+/// @param sv_calls   SV calls to write, keyed by chromosome name.
+/// @param ref_genome Reference genome used for the header metadata and for
+///                   the REF/ALT anchor bases.
+/// @throws std::runtime_error if the output file cannot be opened.
+///
+/// If looking up the reference filepath or contig header throws, the error
+/// is logged to stderr and the function returns without writing anything
+/// to the file.
+void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall> >& sv_calls, const ReferenceGenome& ref_genome)
+{
+    std::cout << "Creating VCF writer..." << std::endl;
+    std::string output_vcf = this->input_data.getOutputDir() + "/output.vcf";
+    std::cout << "Writing VCF file to " << output_vcf << std::endl;
+    std::ofstream vcf_stream(output_vcf);
+    if (!vcf_stream.is_open()) {
+        throw std::runtime_error("Failed to open VCF file for writing.");
+    }
+
+    // Look up the reference metadata once and reuse it for the header
+    std::cout << "Getting reference genome filepath..." << std::endl;
+    std::string ref_fp;
+    try {
+        ref_fp = ref_genome.getFilepath();
+        std::cout << "Reference genome filepath: " << ref_fp << std::endl;
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return;
+    }
+
+    std::cout << "Getting reference genome header..." << std::endl;
+    std::string contig_header;
+    try {
+        contig_header = ref_genome.getContigHeader();
+    } catch (const std::exception& e) {
+        std::cerr << "Error: " << e.what() << std::endl;
+        return;
+    }
+
+    std::cout << "Writing VCF header..." << std::endl;
+    writeVCFHeader(vcf_stream, ref_fp, contig_header);
+
+    // Write one record per SV call, chromosome by chromosome. Records end
+    // with '\n' rather than std::endl so that the stream is not flushed after
+    // every line.
+    std::cout << "Saving SV calls to " << output_vcf << std::endl;
+    std::string sv_method = "CONTEXTSVv0.1";
+    int skip_count = 0;
+    int total_count = 0;
+    for (const auto& pair : sv_calls) {
+        std::string chr = pair.first;
+        const std::set<SVCall>& chr_sv_calls = pair.second;
+        std::cout << "Saving SV calls for " << chr << "..." << std::endl;
+        for (const auto& sv_call : chr_sv_calls) {
+            // If the SV type is unknown, skip it
+            std::string sv_type_str = sv_call.sv_type;
+            if (sv_type_str == "UNKNOWN" || sv_type_str == "NEUTRAL") {
+                skip_count += 1;
+                continue;
+            }
+            total_count += 1;
+
+            // Get the SV call information
+            uint32_t start = sv_call.start;
+            uint32_t end = sv_call.end;
+            std::string genotype = sv_call.genotype;
+            std::string data_type_str = sv_call.data_type;
+            std::string alt_allele = sv_call.alt_allele;
+            double hmm_likelihood = sv_call.hmm_likelihood;
+            int read_support = sv_call.support;
+            // Read depth is not populated for these calls, so DP is always 0
+            int read_depth = 0;
+
+            // SVLEN is the distance from start to end; deletions also count the
+            // end position itself
+            int sv_length = end - start;
+            if (sv_type_str == "DEL") {
+                sv_length++;
+            }
+
+            // Process by SV type
+            std::string ref_allele = ".";
+            std::string repeat_type = "NA";
+
+            // Deletion
+            if (sv_type_str == "DEL") {
+                // Get the deleted sequence from the reference genome, also including the preceding base
+                int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                ref_allele = ref_genome.query(chr, preceding_pos, end);
+
+                // Use the preceding base as the alternate allele
+                if (ref_allele != "") {
+                    alt_allele = ref_allele.at(0);
+                } else {
+                    // REF must not be empty in a VCF record, so fall back to an
+                    // unknown base alongside the symbolic allele
+                    ref_allele = "N";
+                    alt_allele = "<DEL>";  // Symbolic allele
+                    std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << start << "-" << end << std::endl;
+                }
+
+                sv_length = -1 * sv_length;  // Negative length for deletions
+
+                start = preceding_pos;  // Update the position to the preceding base
+
+            // Other types (duplications, insertions, inversions)
+            } else {
+                // Use the preceding base as the reference allele
+                int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
+                if (ref_allele == "") {
+                    ref_allele = "N";  // REF must not be empty in a VCF record
+                }
+
+                // Format novel insertions
+                if (sv_type_str == "INS") {
+                    // Check if in symbolic form
+                    if (alt_allele != "<INS>") {
+                        // Use the insertion sequence as the alternate allele
+                        alt_allele.insert(0, ref_allele);
+                    }
+                    start = preceding_pos;  // Update the position to the preceding base
+
+                    // Update the end position to the start position to change from
+                    // query to reference coordinates for insertions
+                    end = start;
+                }
+            }
+
+            // Create the VCF parameter strings
+            // Clipped base support is not tracked for these calls, so CLIPSUP is always 0
+            int clipped_base_support = 0;
+            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
+                ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
+                ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
+                ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood);
+
+            std::string format_str = "GT:DP";
+            std::string sample_str = genotype + ":" + std::to_string(read_depth);
+
+            // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES)
+            vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele
+                       << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << sample_str << '\n';
+            // Report progress every 1000 written calls
+            if (total_count % 1000 == 0)
+            {
+                std::cout << "Wrote SV at " << chr << ": " << start << ", total=" << total_count << std::endl;
+            }
+        }
+    }
+
+    // Print the number of SV calls skipped
+    std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
+}
+
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
index a0611c8d..d2dfd605 100644
--- a/src/sv_data.cpp
+++ b/src/sv_data.cpp
@@ -139,7 +139,7 @@ int SVData::getClippedBaseSupport(std::string chr, int64_t pos, int64_t end)
     return clipped_base_support;
 }
 
-void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir)
+void SVData::saveToVCF(ReferenceGenome& ref_genome, std::string output_dir)
 {
     // Create a VCF writer
     std::cout << "Creating VCF writer..." << std::endl;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
new file mode 100644
index 00000000..15aefdfd
--- /dev/null
+++ b/src/sv_object.cpp
@@ -0,0 +1,126 @@
+#include "sv_object.h"
+#include "sv_object.h"
+#include <algorithm>
+#include <tuple>
+#include <memory>
+#include <cmath>
+#include <stdexcept>
+#include <iostream>
+
+// Order SV calls lexicographically by start, end, SV type, alternate allele, data type, genotype, and HMM likelihood (support is not compared).
+bool SVCall::operator<(const SVCall & other) const {
+    const auto sort_key = [](const SVCall& call) { return std::tie(call.start, call.end, call.sv_type, call.alt_allele, call.data_type, call.genotype, call.hmm_likelihood); };
+    return sort_key(*this) < sort_key(other);
+}
+
+/// Add an SV call to the set, or merge it into the call(s) already stored for
+/// the same start and end position.
+///
+/// - No stored call has this start/end: the call is inserted with a support
+///   of 1.
+/// - Every stored call with this start/end has a lower HMM likelihood: each is
+///   replaced by the new call, with the new data type appended to the stored
+///   one (comma-separated) and the support incremented by one.
+/// - Otherwise the set is left unchanged.
+///
+/// Stored calls are matched on start and end only; the SV type is not compared.
+///
+/// @throws std::runtime_error if `sv_type` is "UNKNOWN" or if start >= end.
+void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
+{
+    // Throw an error if unknown SV type
+    if (sv_type == "UNKNOWN") {
+        throw std::runtime_error("ERROR: Cannot add unknown SV type");
+    }
+
+    if (start >= end) {
+        throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
+    }
+
+    // The set is ordered by start and end first (see SVCall::operator<), so all
+    // calls sharing this start/end are adjacent. A probe carrying the smallest
+    // possible value for every remaining compared field finds the first of
+    // them without scanning the whole set.
+    const SVCall probe{start, end, std::string(), std::string(), std::string(), std::string(), -HUGE_VAL, 0};
+    const auto first_match = sv_calls.lower_bound(probe);
+    auto last_match = first_match;
+    while (last_match != sv_calls.end() && last_match->start == start && last_match->end == end)
+    {
+        // Decide before modifying anything: returning after some calls had
+        // already been erased would silently drop them from the set.
+        if (!(hmm_likelihood > last_match->hmm_likelihood))
+        {
+            // Return if no update is needed
+            return;
+        }
+        ++last_match;
+    }
+
+    // Add the SV call if it does not exist
+    if (first_match == last_match)
+    {
+        sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
+        return;
+    }
+
+    // Set elements cannot be modified in place, so build the replacements
+    // first, then erase the stored calls and insert the replacements.
+    std::vector<SVCall> updates;
+    for (auto it = first_match; it != last_match; ++it)
+    {
+        std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
+        updates.push_back(SVCall{start, end, sv_type, alt_allele, it->data_type + "," + data_type, genotype, hmm_likelihood, it->support + 1});
+    }
+    sv_calls.erase(first_match, last_match);
+
+    std::cout << "[DEBUG] Adding updates" << std::endl;
+    for (const auto& update : updates)
+    {
+        sv_calls.insert(update);
+    }
+    std::cout << "[DEBUG] Added updates" << std::endl;
+}
+
+/// Split a set of SV calls into consecutive chunks of near-equal size.
+///
+/// @param sv_calls    SV calls to split; they are visited in set order.
+/// @param chunk_count Requested number of chunks. Values below 1 are treated
+///                    as 1.
+/// @return Chunks holding ceil(size / chunk_count) calls each, except that the
+///         last chunk may be smaller. An empty input yields no chunks.
+std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count)
+{
+    // Guard against a zero or negative chunk count, which would otherwise
+    // divide by zero when computing the chunk size
+    const size_t chunk_total = (size_t) std::max(chunk_count, 1);
+    const size_t sv_count = sv_calls.size();
+    const size_t chunk_size = (sv_count + chunk_total - 1) / chunk_total;  // Ceiling division
+
+    std::vector<std::set<SVCall>> sv_chunks;
+    sv_chunks.reserve(std::min(chunk_total, sv_count));
+
+    // Fill each chunk in place. The calls arrive in sorted order, so each one
+    // belongs at the end of the chunk being filled, which the hint exploits.
+    for (const auto& sv_call : sv_calls)
+    {
+        // Start a new chunk once the current one is full
+        if (sv_chunks.empty() || sv_chunks.back().size() == chunk_size)
+        {
+            sv_chunks.emplace_back();
+        }
+        sv_chunks.back().insert(sv_chunks.back().end(), sv_call);
+    }
+
+    return sv_chunks;
+}
+
+// Number of SV calls held in the set, converted to a 32-bit unsigned count.
+uint32_t getSVCount(const std::set<SVCall>& sv_calls) {
+    return static_cast<uint32_t>(sv_calls.size());
+}
+
+// Merge every call from `source` into `target`; a call equivalent to one already in `target` is not inserted again.
+void concatenateSVCalls(std::set<SVCall> &target, const std::set<SVCall> &source) {
+    for (const SVCall& call : source)
+        target.insert(call);
+}

From d5d7dcdc0e0c9116679005a0c1092deabb884b01 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 23 Nov 2024 12:52:30 -0500
Subject: [PATCH 019/134] Add merging and efficiency updates

---
 include/cnv_caller.h  |  17 +-
 include/input_data.h  |   2 +-
 include/snp_info.h    |  14 +-
 include/sv_caller.h   |   4 +-
 include/sv_object.h   |   2 +
 include/utils.h       |   2 +
 python/sv_merger.py   |   2 +-
 src/cnv_caller.cpp    | 511 ++++++++++++++++++++++--------------------
 src/input_data.cpp    |   2 +-
 src/snp_info.cpp      |  44 ++--
 src/sv_caller.cpp     | 122 ++++++----
 src/sv_object.cpp     | 124 ++++++++--
 src/utils.cpp         |  11 +-
 tests/test_general.py |   4 +-
 14 files changed, 513 insertions(+), 348 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 22a14cc9..858c7454 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -27,7 +27,7 @@ using namespace sv_types;
 // SNP data is a struct containing vectors used in predicting copy number
 // states. It is sorted by SNP position.
 struct SNPData {
-    std::vector<int64_t> pos;
+    std::vector<uint32_t> pos;
     std::vector<double> pfb;
     std::vector<double> baf;
     std::vector<double> log2_cov;
@@ -57,6 +57,9 @@ class CNVCaller {
         SNPInfo snp_info;
         double mean_chr_cov = 0.0;
         std::unordered_map<uint32_t, int> pos_depth_map;
+        std::unordered_map<uint32_t, double> snp_baf_map;
+        std::set<uint32_t> snp_baf_keys;
+        std::unordered_map<uint32_t, double> snp_pfb_map;
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -79,22 +82,24 @@ class CNVCaller {
             {6, "1/1"}
         };
 
-        void updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp);
+        void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
         std::pair<std::vector<int>, double> runViterbi(CHMM hmm, SNPData &snp_data);
 
         // Query a region for SNPs and return the SNP data
-        std::pair<SNPData, bool> querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
+        std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
+
+        std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> querySNPs(std::string chr, uint32_t start, uint32_t end);
 
         // Run copy number prediction for a chunk of SV candidates from CIGAR strings
-        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map);
+        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov);
 
         void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
 
         void updateDPValue(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int dp_value);
 
         // Split a region into chunks for parallel processing
-        std::vector<std::string> splitRegionIntoChunks(std::string chr, int64_t start_pos, int64_t end_pos, int chunk_count);
+        std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count);
 
         // Split SV candidates into chunks for parallel processing
         std::vector<std::vector<SVCandidate>> splitSVCandidatesIntoChunks(std::map<SVCandidate, SVInfo>& sv_candidates, int chunk_count);
@@ -131,7 +136,7 @@ class CNVCaller {
         void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
-        void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood);
+        void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood);
 };
 
 #endif // CNV_CALLER_H
diff --git a/include/input_data.h b/include/input_data.h
index 0a74125f..718b5264 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -43,7 +43,7 @@ class InputData {
         // FASTAQuery getRefGenome();
 
         // Query the reference genome for a sequence.
-        std::string queryRefGenome(std::string chr, int64_t pos_start, int64_t pos_end);
+        std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
 
         // Get the chromosomes in the reference genome.
         std::vector<std::string> getRefGenomeChromosomes();
diff --git a/include/snp_info.h b/include/snp_info.h
index 0b57a629..51278951 100644
--- a/include/snp_info.h
+++ b/include/snp_info.h
@@ -11,30 +11,30 @@
 // Define the comparator for the binary search tree by SNP position (first
 // element of tuple)
 struct SNPCompare {
-    bool operator()(const std::tuple<int64_t, double>& a, const std::tuple<int64_t, double>& b) const {
+    bool operator()(const std::tuple<uint32_t, double>& a, const std::tuple<uint32_t, double>& b) const {
         return std::get<0>(a) < std::get<0>(b);
     }
 };
 
 // Define the data structure for SNP frequencies sorted by position
-using BST = std::set<std::tuple<int64_t, double>, SNPCompare>;
+using BST = std::set<std::tuple<uint32_t, double>, SNPCompare>;
 
 class SNPInfo {
 public:
     SNPInfo() {}
 
     // Insert a SNP into the map with its position and B-allele frequency
-    void insertSNPAlleleFrequency(std::string chr, int64_t pos, double baf);
+    void insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf);
 
     // Insert a SNP into the map with its position and population frequency of
     // the B allele
-    void insertSNPPopulationFrequency(std::string chr, int64_t pos, double pfb);
+    void insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb);
     
     // Query SNPs within a range (start, end) and return their BAF and PFB values
-    std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> querySNPs(std::string chr, int64_t start, int64_t end);
+    std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> querySNPs(std::string chr, uint32_t start, uint32_t end);
 
     // Get the range of SNP positions for a given chromosome
-    std::pair<int64_t, int64_t> getSNPRange(std::string chr);
+    std::pair<uint32_t, uint32_t> getSNPRange(std::string chr);
 
 
 private:
@@ -45,7 +45,7 @@ class SNPInfo {
     std::unordered_map<std::string, BST> snp_baf_map;
 
     // Define the map of chromosome to SNP population frequency
-    std::unordered_map<std::string, std::unordered_map<int64_t, double>> snp_pfb_map;
+    std::unordered_map<std::string, std::unordered_map<uint32_t, double>> snp_pfb_map;
 };
 
 #endif // SNP_INFO_H
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 371cc53d..184248d7 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -41,7 +41,7 @@ class SVCaller {
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> detectCIGARSVs(std::string region);
+        std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
@@ -54,7 +54,7 @@ class SVCaller {
         // sequence
         double calculateMismatchRate(std::unordered_map<int, int>& mismatch_map, int32_t start, int32_t end);
 
-        void saveToVCF(const std::unordered_map<std::string, std::set<SVCall>>& sv_calls, const ReferenceGenome& ref_genome);
+        void saveToVCF(const std::unordered_map<std::string, std::set<SVCall>>& sv_calls);
 
     public:
         explicit SVCaller(InputData& input_data);
diff --git a/include/sv_object.h b/include/sv_object.h
index 3128dfe1..fb52691e 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -28,6 +28,8 @@ struct SVCall {
 
 void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
 
+void mergeSVs(std::set<SVCall>& sv_calls);
+
 std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count);
 
 uint32_t getSVCount(const std::set<SVCall>& sv_calls);
diff --git a/include/utils.h b/include/utils.h
index 41efb411..4ec19138 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -23,4 +23,6 @@ void printError(std::string message);
 
 std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start, std::chrono::high_resolution_clock::time_point end);
 
+std::string removeChrPrefix(std::string chr);
+
 #endif // UTILS_H
diff --git a/python/sv_merger.py b/python/sv_merger.py
index 4254e0ad..78733f6d 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -153,7 +153,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
 
     # Merge SVs with the same label
     unique_labels = np.unique(cluster_labels)
-    # logging.info("Unique labels: %s", unique_labels)
+    logging.info("Unique labels: %s", unique_labels)
 
     for label in unique_labels:
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 55e363ae..e8428ec5 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -50,26 +50,29 @@ std::pair<std::vector<int>, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp
 }
 
 // Function to obtain SNP information for a region
-std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo& snp_info, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov)
+std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo& snp_info, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov)
 {
     SNPData snp_data;
     bool snps_found = false;
-    int window_size = this->input_data.getWindowSize();
+    uint32_t window_size = (uint32_t)this->input_data.getWindowSize();
 
     // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
-    for (int64_t i = start_pos; i <= end_pos; i += window_size)
+    for (uint32_t i = start_pos; i <= end_pos; i += window_size)
     {
         // Run a sliding non-overlapping window of size window_size across
         // the SV region and calculate the log2 ratio for each window
-        int64_t window_start = i;
-        int64_t window_end = std::min(i + window_size - 1, end_pos);
+        uint32_t window_start = i;
+        uint32_t window_end = std::min(i + window_size - 1, end_pos);
 
         // Get the SNP info for the window
         // std::cout << "Querying SNPs for window " << chr << ":" << window_start << "-" << window_end << "..." << std::endl;
         // this->snp_data_mtx.lock();
-        std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> window_snps = snp_info.querySNPs(chr, window_start, window_end);
+        // std::tuple<std::vector<uint32_t>, std::vector<double>,
+        // std::vector<double>> window_snps = snp_info.querySNPs(chr,
+        // window_start, window_end);
+        std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> window_snps = this->querySNPs(chr, window_start, window_end);
         // this->snp_data_mtx.unlock();
-        std::vector<int64_t>& snp_window_pos = std::get<0>(window_snps);  // SNP positions
+        std::vector<uint32_t>& snp_window_pos = std::get<0>(window_snps);  // SNP positions
         std::vector<double>& snp_window_bafs = std::get<1>(window_snps);  // B-allele frequencies
         std::vector<double>& snp_window_pfbs = std::get<2>(window_snps);  // Population frequencies of the B allele
 
@@ -93,8 +96,8 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
             snps_found = true;
 
             // Loop through the SNPs and calculate the log2 ratios
-            int64_t bin_start = window_start;
-            int64_t bin_end = 0;
+            uint32_t bin_start = window_start;
+            uint32_t bin_end = 0;
             for (int j = 0; j < snp_count; j++)
             {
                 // SNP bin starts at 1/2 the distance between the previous SNP
@@ -104,7 +107,7 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
                 // between the first SNP and the next SNP, and for the last
                 // SNP, the bin starts at 1/2 the distance between the previous
                 // SNP and the last SNP and ends at the window end.
-                int64_t snp_pos = snp_window_pos[j];
+                uint32_t snp_pos = snp_window_pos[j];
                 bin_end = snp_pos + (j == snp_count-1 ? (window_end - snp_pos) / 2 : (snp_window_pos[j+1] - snp_pos) / 2);
 
                 // Calculate the log2 ratio for the SNP bin
@@ -123,14 +126,17 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, int64_t star
 std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate)
 {
      // Get the start and end positions of the SV call
-    int64_t start_pos = std::get<0>(candidate);
-    int64_t end_pos = std::get<1>(candidate);
+    uint32_t start_pos = std::get<0>(candidate);
+    uint32_t end_pos = std::get<1>(candidate);
 
     // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
     // the SV length
-    int64_t sv_length = (end_pos - start_pos) / 2.0;
-    int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length);
-    int64_t snp_end_pos = end_pos + sv_length;
+    uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
+    // uint32_t snp_start_pos = std::max((uint32_t)1, start_pos - sv_length);
+    // Prevent underflow (start_pos - sv_length) if start_pos < sv_length
+    uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
+    uint32_t snp_end_pos = end_pos + sv_half_length;
+    // std::cout << "CNP for " << chr << ":" << start_pos << "-" << end_pos << "(" << snp_start_pos << ", " << snp_end_pos << ")" << std::endl;
     // printMessage("Running copy number prediction for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " with SNP region " + chr + ":" + std::to_string(snp_start_pos) + "-" + std::to_string(snp_end_pos) + "...");
 
     // Query the SNP region for the SV candidate
@@ -138,6 +144,13 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     SNPData& sv_snps = snp_call.first;
     bool sv_snps_found = snp_call.second;
 
+	/*
+    if (sv_snps.pos.size() == 0) {
+    	std::cerr << "ERROR [2]: No windows for SV " << chr << ":" << std::to_string((int)start_pos) << "-" << std::to_string((int)end_pos) << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
+    	continue;
+    }
+    */
+
     // Run the Viterbi algorithm
     // printMessage("[TEST] Running Viterbi algorithm for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
     std::pair<std::vector<int>, double> prediction = runViterbi(this->hmm, sv_snps);
@@ -211,7 +224,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &
     //     cnv_type_counts[i] = 0;
     // }
 
-    runCIGARCopyNumberPredictionChunk(chr, sv_candidates, this->snp_info, hmm, window_size, mean_chr_cov, this->pos_depth_map);
+    runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov);
     // // Split the SV candidates into chunks for each thread
     // int chunk_count = this->input_data.getThreadCount();
     // // std::vector<std::vector<SVCandidate>> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count);
@@ -248,7 +261,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &
     printMessage("Finished predicting copy number states for chromosome " + chr + "...");
 }
 
-void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map<uint32_t, int>& pos_depth_map)
+void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov)
 {
     // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "...");
     // Map with counts for each CNV type
@@ -261,19 +274,29 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
     // Loop through each SV candidate and predict the copy number state
     for (auto& sv_call : sv_chunk)
     {
+
         // Get the SV candidate
         // const SVCandidate& candidate = sv_call;
         // int64_t start_pos = std::get<0>(candidate);
         // int64_t end_pos = std::get<1>(candidate);
         uint32_t start_pos = sv_call.start;
         uint32_t end_pos = sv_call.end;
+        
+        // Error if start > end
+        if (start_pos >= end_pos)
+        {
+        	std::cerr << "Position error for CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl;
+        	continue;
+        }
 
         // Skip if not the minimum length for CNV predictions
-        if ((int)(end_pos - start_pos) < this->input_data.getMinCNVLength())
+        if ((end_pos - start_pos) < (uint32_t)this->input_data.getMinCNVLength())
         {
             continue;
         }
 
+    	std::cout << "CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl;
+
         // Get the depth at the start position. This is used as the FORMAT/DP
         // value in the VCF file
         // int dp_value = pos_depth_map[start_pos];
@@ -281,18 +304,26 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
 
         // Loop through the SV region +/- 1/2 SV length and run copy number
         // predictions
-        int64_t sv_half_length = (end_pos - start_pos) / 2.0;
-        int64_t query_start = std::max((int64_t) 1, start_pos - sv_half_length);
-        int64_t query_end = end_pos + sv_half_length;
+        uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
+        uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
+        uint32_t snp_end_pos = end_pos + sv_half_length;
+        // std::cout << "CIGAR sv_half_length:" << sv_half_length << std::endl;
+        // std::cout << "CIGAR SV query at " << chr << ":" << query_start << "-" << query_end << std::endl;
 
         // printMessage("Querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", qstart = " + std::to_string(query_start) + ", qend = " + std::to_string(query_end));
-        std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, query_start, query_end, snp_info, pos_depth_map, mean_chr_cov);
+        std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, snp_info, this->pos_depth_map, mean_chr_cov);
         // printMessage("Finished querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         SNPData& sv_snps = snp_call.first;
         bool snps_found = snp_call.second;
 
         // Run the Viterbi algorithm
         // printMessage("[TEST2] Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
+        
+        if (sv_snps.pos.size() == 0) {
+        	std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
+        	continue;
+        }
+        
         std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
         std::vector<int>& state_sequence = prediction.first;
         double likelihood = prediction.second;
@@ -411,14 +442,14 @@ void CNVCaller::updateDPValue(std::map<SVCandidate,SVInfo>& sv_candidates, SVCan
     sv_candidates[key].read_depth = dp_value;
 }
 
-std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, int64_t start_pos, int64_t end_pos, int chunk_count)
+std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count)
 {
     // Split the region into chunks
     std::vector<std::string> region_chunks;
-    int64_t region_length = end_pos - start_pos + 1;
-    int64_t chunk_size = std::ceil((double) region_length / (double) chunk_count);
-    int64_t chunk_start = start_pos;
-    int64_t chunk_end = 0;
+    uint32_t region_length = end_pos - start_pos + 1;
+    uint32_t chunk_size = std::ceil((double) region_length / (double) chunk_count);
+    uint32_t chunk_start = start_pos;
+    uint32_t chunk_end = 0;
     for (int i = 0; i < chunk_count; i++)
     {
         chunk_end = chunk_start + chunk_size - 1;
@@ -474,9 +505,9 @@ void CNVCaller::loadChromosomeData(std::string chr)
     this->hmm = ReadCHMM(hmm_filepath.c_str());
 
     printMessage("Calculating mean chromosome coverage for " + chr + "...");
-    mean_chr_cov = calculateMeanChromosomeCoverage(chr);
+    this->mean_chr_cov = calculateMeanChromosomeCoverage(chr);
+    //this->mean_chr_cov = 30.0;
     printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
-    this->mean_chr_cov = mean_chr_cov;
 
     std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl;
     std::string snp_filepath = this->input_data.getSNPFilepath();
@@ -490,124 +521,109 @@ void CNVCaller::loadChromosomeData(std::string chr)
 // Calculate the mean chromosome coverage
 double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
 {
+    // Open the BAM file
+    std::string bam_filepath = this->input_data.getShortReadBam();
+    samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
+    if (!bam_file)
+    {
+        throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath);
+    }
 
-	bool test=true;
-	if (test) {
-		return 30.0;
-	}
+    // Read the header
+    bam_hdr_t *bam_header = sam_hdr_read(bam_file);
+    if (!bam_header)
+    {
+        sam_close(bam_file);
+        throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath);
+    }
 
-    // Use a maximum of 8 threads to avoid overloading the system with too many
-    // parallel processes
-    int num_threads = this->input_data.getThreadCount();
-    if (num_threads > 8)
+    // Load the index
+    hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str());
+    if (!bam_index)
     {
-        num_threads = 8;
+        bam_hdr_destroy(bam_header);
+        sam_close(bam_file);
+        throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath);
     }
 
-    // Split the chromosome into equal parts for each thread
-    uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
-    if (chr_len == 0)
+    // Create an iterator for the chromosome
+    hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str());
+    if (!bam_iter)
     {
-    	printError("ERROR: Chromosome length is zero for: " + chr);
-        return 0.0;
+        hts_idx_destroy(bam_index);
+        bam_hdr_destroy(bam_header);
+        sam_close(bam_file);
+        throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr);
     }
-    std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, 1, chr_len, num_threads);
-    if (region_chunks.empty())
+
+    // Initialize the record
+    bam1_t *bam_record = bam_init1();
+    if (!bam_record)
     {
-        printError("ERROR: Failed to split chromosome into regions.");
-        return 0.0;
+        hts_itr_destroy(bam_iter);
+        hts_idx_destroy(bam_index);
+        bam_hdr_destroy(bam_header);
+        sam_close(bam_file);
+        throw std::runtime_error("ERROR: Could not initialize BAM record.");
     }
 
-    // Calculate the mean chromosome coverage in parallel
-    uint32_t pos_count = 0;
-    uint64_t cum_depth = 0;
-    std::vector<std::future<std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>>>> futures;
-    std::string input_filepath = this->input_data.getShortReadBam();
-    for (const auto& region_chunk : region_chunks)
+    // Iterate through the chromosome and update the depth map
+    std::unordered_map<uint32_t, int> chr_pos_depth_map;
+    while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
     {
-        // Create a lambda function to get the mean chromosome coverage for the
-        // region chunk
-        auto get_mean_chr_cov = [region_chunk, input_filepath]() -> std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>>
+        
+        // Parse the CIGAR string to get the depth (match, sequence match, and
+        // mismatch)
+        // uint32_t depth = 0;
+        uint32_t pos = bam_record->core.pos + 1;  // 0-based to 1-based
+        uint32_t ref_pos = pos;
+        uint32_t cigar_len = bam_record->core.n_cigar;
+        uint32_t *cigar = bam_get_cigar(bam_record);
+        for (uint32_t i = 0; i < cigar_len; i++)
         {
-            // Run samtools depth on the entire region, and print positions and
-            // depths (not chromosome)
-            size_t cmd_size = input_filepath.size() + 256;
-            std::vector<char> cmd(cmd_size);
-            snprintf(cmd.data(), cmd_size,\
-                "samtools depth -r %s %s | awk '{print $2, $3}'",\
-                region_chunk.c_str(), input_filepath.c_str());
-
-            // Open a pipe to read the output of the command
-            FILE *fp = popen(cmd.data(), "r");
-            if (fp == NULL)
+            uint32_t op = bam_cigar_op(cigar[i]);
+            uint32_t op_len = bam_cigar_oplen(cigar[i]);
+            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF)
             {
-                throw std::runtime_error("ERROR: Could not open pipe for command: " + std::string(cmd.data()));
-            }
-
-            // Parse the outputs (position and depth)
-            std::unordered_map<uint32_t, int> pos_depth_map;
-            const int line_size = 1024;
-            char line[line_size];
-            uint32_t pos;
-            int depth;
-            uint32_t pos_count = 0;
-            uint64_t cum_depth = 0;
-            while (fgets(line, line_size, fp) != NULL)
-            {
-                if (sscanf(line, "%u%d", &pos, &depth) == 2)
+                // Update the depth for each position in the alignment
+                for (uint32_t j = 0; j < op_len; j++)
                 {
-                    pos_depth_map[pos] = depth;
-                    pos_count++;
-                    cum_depth += depth;
+                    chr_pos_depth_map[ref_pos + j]++;
                 }
             }
             
-            // Check if pclose fails
-            if (pclose(fp) == -1)
-            {
-                throw std::runtime_error("ERROR: Failed to close pipe for command: " + std::string(cmd.data()));
+            // Update the reference coordinate based on the CIGAR operation
+            // https://samtools.github.io/hts-specs/SAMv1.pdf
+            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                ref_pos += op_len;
+            } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
+                // Do nothing
+            } else {
+                throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
             }
-            //pclose(fp);  // Close the process
-
-            return std::make_tuple(pos_count, cum_depth, pos_depth_map);
-        };
-        
-        futures.emplace_back(std::async(std::launch::async, get_mean_chr_cov));
-        //std::future<std::tuple<uint32_t, uint32_t, std::unordered_map<uint32_t, int>>> future = std::async(std::launch::async, get_mean_chr_cov);
-        //futures.push_back(std::move(future));
-    }
-
-    // Thread-safe map merging (using mutex)
-    std::mutex merge_mutex;
-    for (auto& future : futures)
-    {
-        try
-        {
-            future.wait();
-            auto result = std::move(future.get());
-
-            // Safely merge results
-            std::lock_guard<std::mutex> lock(merge_mutex);
-            pos_count += std::get<0>(result);
-            cum_depth += std::get<1>(result);
-            this->mergePosDepthMaps(this->pos_depth_map, std::get<2>(result));
-        }
-        catch (const std::exception& ex)
-        {
-            printError("ERROR: Exception in thread execution - " + std::string(ex.what()));
-            return 0.0;
         }
     }
-    
-    // Validate and calculate mean chromosome coverage
-    if (pos_count == 0)
+
+    // Clean up
+    bam_destroy1(bam_record);
+    hts_itr_destroy(bam_iter);
+    hts_idx_destroy(bam_index);
+    bam_hdr_destroy(bam_header);
+    sam_close(bam_file);
+
+    // Calculate the mean chromosome coverage
+    uint64_t cum_depth = 0;
+    uint32_t pos_count = 0;
+    for (auto& pos_depth : chr_pos_depth_map)
     {
-        printError("ERROR: No positions found in chromosome coverage calculation.");
-        return 0.0;
+        cum_depth += pos_depth.second;
+        pos_count++;
     }
-    
-    double mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
 
+    double mean_chr_cov = (double) cum_depth / (double) pos_count;
+
+    // Update the position depth map
+    this->pos_depth_map = std::move(chr_pos_depth_map);
 
     return mean_chr_cov;
 }
@@ -615,10 +631,9 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
 void CNVCaller::mergePosDepthMaps(std::unordered_map<uint32_t, int>& main_map, std::unordered_map<uint32_t, int>& map_update)
 {
     // Merge the second depth map into the first
-    main_map.reserve(main_map.size() + map_update.size());
     for (auto& pos_depth : map_update)
     {
-        main_map[pos_depth.first] = std::move(pos_depth.second);
+        main_map[pos_depth.first] = pos_depth.second;
     }
 }
 
@@ -660,6 +675,7 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::
 
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info)
 {
+
     // Check that the SNP file is sorted by running bcftools index and reading
     // the error output
     std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error";
@@ -699,7 +715,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
     }
 
     std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf";
-    std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
+    int thread_count = this->input_data.getThreadCount();
+    // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
+    std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
     if (this->input_data.getVerbose()) {
         std::cout << "Filtering SNPs by depth and quality..." << std::endl;
         std::cout << "Command: " << cmd << std::endl;
@@ -725,13 +743,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
 
     // Read the reference and alternate allele depths from the VCF file
     std::string alt_allele = "";  // Alternate allele
-    uint64_t pos = 0;
+    uint32_t pos = 0;
     int ref_ad = 0;
     int alt_ad = 0;
-    const int line_size = 256;
+    const int line_size = 1024;
     char line[line_size];  // Line buffer
     std::vector<int64_t> locations;
     std::vector<double> bafs;
+    std::string chr_no_prefix = removeChrPrefix(chr);
     while (fgets(line, line_size, fp) != NULL)
     {
         // Parse the line
@@ -742,7 +761,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
             // Get the position from column 2
             if (col == 0)
             {
-                pos = atoi(tok);
+                pos = (uint32_t)atoi(tok);
             }
 
             // Get the AD for the reference allele from column 3
@@ -768,7 +787,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath,
 
         // Add a new location and BAF value to the chromosome's SNP data
         // (population frequency and log2 ratio will be added later)
-        snp_info.insertSNPAlleleFrequency(chr, pos, baf);
+        // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf);
+        this->snp_baf_map[pos] = baf;
+        this->snp_baf_keys.insert(pos);
     }
 
     pclose(fp);  // Close the process
@@ -787,7 +808,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
         std::cout << "No population frequency file provided for chromosome " << chr << std::endl;
         return;
     }
-
+    
     // Determine the ethnicity-specific allele frequency key
     std::string AF_key = "AF";
     if (this->input_data.getEthnicity() != "")
@@ -796,12 +817,11 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
     }
 
     // Check if the filepath uses the 'chr' prefix notations based on the
-    // chromosome name (e.g., *.chr1.vcf.gz vs *.1.vcf.gz)
+    // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
     std::string chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
     std::string chr_prefix = "chr";
     if (pfb_filepath.find(chr_prefix) == std::string::npos)
     {
-        // gnomaAD does not use the 'chr' prefix
         // Remove the 'chr' prefix from the chromosome name
         if (chr_gnomad.find(chr_prefix) != std::string::npos)
         {
@@ -817,125 +837,68 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
 
     // Remove the 'chr' prefix from the chromosome name for SNP data. All
     // SNP data in this program does not use the 'chr' prefix
-    std::string chr_snp = chr;
-    if (chr_snp.find(chr_prefix) != std::string::npos)
-    {
-        chr_snp = chr_snp.substr(chr_prefix.length());
-    }
-    std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
-
-    // Get the start and end SNP positions for the chromosome (1-based
-    // index)
-    std::pair<int64_t, int64_t> snp_range = snp_info.getSNPRange(chr);
-    int64_t snp_start = snp_range.first;
-    int64_t snp_end = snp_range.second;
-    if (this->input_data.isRegionSet())
-    {
-        // Get the user-defined region
-        std::pair<int32_t, int32_t> region = this->input_data.getRegion();
-        if (snp_start < region.first) {
-            snp_start = region.first;
-        } else if (snp_end > region.second) {
-            snp_end = region.second;
-        }
-    }
+    std::string chr_no_prefix = removeChrPrefix(chr);
 
-    // Use a maximum of 8 threads to avoid overloading the system with too many
-    // processes
-    int num_threads = this->input_data.getThreadCount();
-    if (num_threads > 8)
-    {
-        num_threads = 8;
-    }
+    std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
+    int thread_count = this->input_data.getThreadCount();
+
+    // Run bcftools query to get the population frequencies for the
+    // chromosome within the SNP region, filtering for SNPS only,
+    // and within the MIN-MAX range of frequencies.
+    std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf";
+    std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB);
+    std::string cmd = \
+        "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null";
+        
+    // printMessage("Running command: " + cmd);
+    std::cout << "Running command: " << cmd << std::endl;
 
-    // Split region into chunks and get the population frequencies in parallel
-    std::cout << "SNP range for chromosome " << chr << ": " << snp_start << "-" << snp_end << std::endl;
-    std::vector<std::string> region_chunks = splitRegionIntoChunks(chr_gnomad, snp_start, snp_end, num_threads);
-    std::unordered_map<int, double> pos_pfb_map;
-    // std::vector<std::thread> threads;
-    std::vector<std::future<std::unordered_map<int, double>>> futures;
-    for (const auto& region_chunk : region_chunks)
+    // Open a pipe to read the output of the command
+    FILE *fp = popen(cmd.c_str(), "r");
+    if (fp == NULL)
     {
-        // Create a lambda function to get the population frequencies for the
-        // region chunk
-        auto get_pfb = [region_chunk, pfb_filepath, AF_key]() -> std::unordered_map<int, double>
-        {
-            // Run bcftools query to get the population frequencies for the
-            // chromosome within the SNP region, filtering for SNPS only,
-            // and within the MIN-MAX range of frequencies.
-            std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB);
-            std::string cmd = \
-                "bcftools query -r " + region_chunk + " -f '%POS\t%" + AF_key + "\n' -i '" + filter_criteria + "' " + pfb_filepath + " 2>/dev/null";
-
-            // std::cout << "Command: " << cmd << std::endl;
-            printMessage("Running command: " + cmd);
-
-            // Open a pipe to read the output of the command
-            FILE *fp = popen(cmd.c_str(), "r");
-            if (fp == NULL)
-            {
-                std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl;
-                exit(1);
-            }
-
-            // Loop through the BCFTOOLS output and populate the map of population
-            // frequencies
-            // printMessage("Parsing population frequencies for chromosome " + chr + "...");
-            std::unordered_map<int, double> pos_pfb_map;
-            const int line_size = 256;
-            char line[line_size];
-            while (fgets(line, line_size, fp) != NULL)
-            {
-                // Parse the line
-                int pos;
-                double pfb;
-                if (sscanf(line, "%d%lf", &pos, &pfb) == 2)
-                {
-                    pos_pfb_map[pos] = pfb;  // Add the position and population frequency to the map
-                }
-            }
-            pclose(fp);
-            // printMessage("Finished parsing population frequencies for chromosome " + chr + "...");
-
-            return pos_pfb_map;
-        };
-
-        // Create a future for the thread
-        futures.emplace_back(std::async(std::launch::async, get_pfb));
-        // std::future<std::unordered_map<int, double>> future = std::async(std::launch::async, get_pfb);
-        // futures.push_back(std::move(future));
+        throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd);
     }
 
-    // Loop through the futures and get the results
-    int pfb_count = 0;
-    for (auto& future : futures)
+    // Loop through the BCFTOOLS output and populate the map of population
+    // frequencies
+    // printMessage("Parsing population frequencies for chromosome " + chr +
+    // "...");
+    std::cout << "Parsing population frequencies for chromosome " << chr << "..." << std::endl;
+    // const int line_size = 256;
+    // char line[line_size];
+    int print_count = 0;
+    // while (fgets(line, line_size, fp) != NULL)
+    char line[2048];
+    while (fgets(line, sizeof(line), fp) != NULL)
     {
-        future.wait();
-        std::unordered_map<int, double> result = std::move(future.get());
-
-        // Loop through the result and add to SNPInfo
-        // printMessage("Adding population frequencies to SNPInfo...");
-        for (auto& pair : result)
-        {
-            int pos = pair.first;
-            double pfb = pair.second;
-
-            // Add the population frequency to the SNPInfo
-            // this->snp_data_mtx.lock();
-            snp_info.insertSNPPopulationFrequency(chr_snp, pos, pfb);
-            // this->snp_data_mtx.unlock();
-            pfb_count++;
-
-            // [TEST] Print 15 values
-            if (pfb_count < 15)
+        std::istringstream iss(line);
+        // Parse the line
+        int pos;
+        double pfb;
+        // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2)
+        if (iss >> pos >> pfb){
+            // Store every parsed record; doing this inside the print guard
+            // below would keep only the first 10 frequencies in the map.
+            this->snp_pfb_map[pos] = pfb;
+
+            // Print the first 10 population frequencies
+            if (print_count < 10)
             {
-                printMessage("Population frequency for " + chr + ":" + std::to_string(pos) + " = " + std::to_string(pfb));
+                std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+                // printMessage("Population frequency for " + chr + ":" +
+                // std::to_string(pos) + " = " + std::to_string(pfb));
+                // Debug echo only; print_count caps the console output
+                print_count++;
             }
         }
     }
+    pclose(fp);
+    std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl;
+    // printMessage("Finished parsing population frequencies for chromosome " + chr + "...");
 }
 
-void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood)
+void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood)
 {
     // Open the TSV file for writing
     std::ofstream tsv_file(filepath);
@@ -988,7 +951,7 @@ void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, s
     for (int i = 0; i < snp_count; i++)
     {
         // Get the SNP data
-        int64_t pos        = snp_data.pos[i];
+        uint32_t pos        = snp_data.pos[i];
         bool    is_snp     = snp_data.is_snp[i];
         double  pfb        = snp_data.pfb[i];
         double  baf        = snp_data.baf[i];
@@ -1017,7 +980,7 @@ void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, s
     tsv_file.close();
 }
 
-void CNVCaller::updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp)
+void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp)
 {
     // Update the SNP data
     snp_data.pos.emplace_back(pos);
@@ -1026,3 +989,65 @@ void CNVCaller::updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double
     snp_data.log2_cov.emplace_back(log2_cov);
     snp_data.is_snp.emplace_back(is_snp);
 }
+
+std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end)
+{
+    // Collect the SNPs whose positions fall within [start, end] and return
+    // three parallel vectors: positions, B-allele frequencies (BAF) and
+    // population frequencies (PFB). Positions are emitted in the iteration
+    // order of snp_baf_keys between lower_bound(start) and upper_bound(end).
+    //
+    // chr   : chromosome name; normalised below, but the lookups in this
+    //         function are keyed by position only.
+    // start : first position of the query range (inclusive).
+    // end   : last position of the query range (inclusive).
+    //
+    // A SNP with no entry in snp_pfb_map is reported with pfb_default.
+
+    // Lock the mutex for reading SNP information
+    // std::lock_guard<std::mutex> lock(this->snp_info_mtx);
+
+    chr = removeChrPrefix(chr);
+
+    std::vector<double> bafs;
+    std::vector<double> pfbs;
+    std::vector<uint32_t> pos;
+    const double pfb_default = 0.5;
+
+    // An inverted range would place lower_bound(start) after
+    // upper_bound(end), and the loop below would then run off the end of
+    // the key container. Treat it as an empty query instead.
+    if (start > end)
+    {
+        return std::make_tuple(pos, bafs, pfbs);
+    }
+
+    // Locate the first key >= start and the first key > end
+    auto snp_start = this->snp_baf_keys.lower_bound(start);
+    auto snp_end = this->snp_baf_keys.upper_bound(end);
+
+    // No SNP at or after the start position
+    if (snp_start == this->snp_baf_keys.end())
+    {
+        return std::make_tuple(pos, bafs, pfbs);
+    }
+
+    for (auto it = snp_start; it != snp_end; ++it)
+    {
+        const uint32_t snp_pos = *it;
+        pos.push_back(snp_pos);
+        bafs.push_back(this->snp_baf_map[snp_pos]);
+
+        // Look the PFB up once; fall back to the default when the position
+        // has no population frequency recorded
+        auto pfb_it = this->snp_pfb_map.find(snp_pos);
+        if (pfb_it != this->snp_pfb_map.end())
+        {
+            pfbs.push_back(pfb_it->second);
+        } else {
+            pfbs.push_back(pfb_default);
+        }
+    }
+
+    return std::make_tuple(pos, bafs, pfbs);
+}
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 186e4617..572ed92a 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -95,7 +95,7 @@ const ReferenceGenome& InputData::getRefGenome() const
     return this->fasta_query;
 }
 
-std::string InputData::queryRefGenome(std::string chr, int64_t pos_start, int64_t pos_end)
+std::string InputData::queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
 {
     return this->fasta_query.query(chr, pos_start, pos_end);
 }
diff --git a/src/snp_info.cpp b/src/snp_info.cpp
index 36efeb4b..1dc7b4a7 100644
--- a/src/snp_info.cpp
+++ b/src/snp_info.cpp
@@ -1,4 +1,5 @@
 #include "snp_info.h"
+#include "utils.h"
 
 /// @cond
 #include <string>
@@ -11,45 +12,36 @@
 #define MIN_PFB 0.01
 
 
-// Function to remove the 'chr' prefix from chromosome names
-std::string removeChrPrefix(std::string chr)
+void SNPInfo::insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf)
 {
-    if (chr.find("chr") != std::string::npos) {
-        return chr.substr(3);
-    }
-    return chr;
-}
-
-void SNPInfo::insertSNPAlleleFrequency(std::string chr, int64_t pos, double baf)
-{
-    chr = removeChrPrefix(chr);
+    // chr = removeChrPrefix(chr);
 
     // Add the chromosome to the SNP B-allele frequency map if it does not exist
-    if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) {
-        this->snp_baf_map[chr] = BST();
-    }
+    // if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) {
+    //     this->snp_baf_map[chr] = BST();
+    // }
 
     // Insert the SNP into the map with its position and B-allele frequency
     // using a binary search tree to keep the SNP positions sorted
     this->snp_baf_map[chr].insert({pos, baf});
 }
 
-void SNPInfo::insertSNPPopulationFrequency(std::string chr, int64_t pos, double pfb)
+void SNPInfo::insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb)
 {
-    chr = removeChrPrefix(chr);
+    // chr = removeChrPrefix(chr);
 
     // Add the chromosome to the SNP population frequency map if it does not
     // exist
-    if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) {
-        this->snp_pfb_map[chr] = std::unordered_map<int64_t, double>();
-    }
+    // if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) {
+    //     this->snp_pfb_map[chr] = std::unordered_map<uint32_t, double>();
+    // }
 
     // Insert the SNP into the map with its position and population frequency of
     // the B allele
     this->snp_pfb_map[chr][pos] = pfb;
 }
 
-std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> SNPInfo::querySNPs(std::string chr, int64_t start, int64_t end)
+std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> SNPInfo::querySNPs(std::string chr, uint32_t start, uint32_t end)
 {
     // Lock the mutex for reading SNP information
     // std::lock_guard<std::mutex> lock(this->snp_info_mtx);
@@ -57,13 +49,13 @@ std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> SNPIn
     chr = removeChrPrefix(chr);
 
     // Create an ordered map of SNP positions to BAF and PFB values
-    std::map<int64_t, std::tuple<double, double>> snp_map;
+    std::map<uint32_t, std::tuple<double, double>> snp_map;
 
     // Query SNPs within a range (start, end) and return their BAF and PFB
     // values as separate vectors
     std::vector<double> bafs;
     std::vector<double> pfbs;
-    std::vector<int64_t> pos;
+    std::vector<uint32_t> pos;
     
     // Check if the chromosome exists in the B-allele frequency map
     if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) {
@@ -91,7 +83,7 @@ std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> SNPIn
     // Query the PFBs for all SNP positions with PFB data
     auto& pfb_map = this->snp_pfb_map[chr];
     for (size_t i = 0; i < pos.size(); i++) {
-        int64_t snp_pos = pos[i];
+        uint32_t snp_pos = pos[i];
         if (pfb_map.find(snp_pos) != pfb_map.end()) {
             pfbs[i] = pfb_map[snp_pos];
         }
@@ -100,13 +92,13 @@ std::tuple<std::vector<int64_t>, std::vector<double>, std::vector<double>> SNPIn
     return std::make_tuple(pos, bafs, pfbs);
 }
 
-std::pair<int64_t, int64_t> SNPInfo::getSNPRange(std::string chr)
+std::pair<uint32_t, uint32_t> SNPInfo::getSNPRange(std::string chr)
 {
     chr = removeChrPrefix(chr);
 
     // Get the range of SNP positions for a given chromosome
-    int64_t start = 0;
-    int64_t end = 0;
+    uint32_t start = 0;
+    uint32_t end = 0;
     if (this->snp_baf_map.find(chr) != this->snp_baf_map.end()) {
         auto& baf_bst = this->snp_baf_map[chr];
         start = std::get<0>(*baf_bst.begin());
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 73b6cfea..cd44585e 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -37,30 +37,30 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 }
 
 // RegionData SVCaller::detectSVsFromRegion(std::string region)
-std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(std::string region)
+std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region)
 {
-    // Open the BAM file
-    std::string bam_filepath = this->input_data.getLongReadBam();
-    samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
-    if (fp_in == NULL) {
-        std::cerr << "ERROR: failed to open " << bam_filepath << std::endl;
-        exit(1);
-    }
+    // // Open the BAM file
+    // std::string bam_filepath = this->input_data.getLongReadBam();
+    // samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
+    // if (fp_in == NULL) {
+    //     std::cerr << "ERROR: failed to open " << bam_filepath << std::endl;
+    //     exit(1);
+    // }
 
-    // Load the header for the BAM file
-    bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
-    if (!bamHdr) {
-        sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to read header for " + bam_filepath);
-    }
+    // // Load the header for the BAM file
+    // bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
+    // if (!bamHdr) {
+    //     sam_close(fp_in);
+    //     throw std::runtime_error("ERROR: failed to read header for " + bam_filepath);
+    // }
 
-    // Load the index for the BAM file
-    hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
-    if (!idx) {
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
-    }
+    // // Load the index for the BAM file
+    // hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
+    // if (!idx) {
+    //     bam_hdr_destroy(bamHdr);
+    //     sam_close(fp_in);
+    //     throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
+    // }
 
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -135,11 +135,15 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(std::
         num_alignments++;
     }
 
+    // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-    hts_idx_destroy(idx);
-    bam_hdr_destroy(bamHdr);
-    sam_close(fp_in);
+    
+    // hts_itr_destroy(itr);
+    // bam_destroy1(bam1);
+    // hts_idx_destroy(idx);
+    // bam_hdr_destroy(bamHdr);
+    // sam_close(fp_in);
 
     return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
 }
@@ -181,7 +185,6 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     int32_t query_end = 0;    // Last alignment position in the query
     bool first_op = false;  // First alignment operation for the query
     double default_lh = 0.0;
-    // double default_lh = std::numeric_limits<double>::quiet_NaN();  // Default likelihood
     for (int i = 0; i < cigar_len; i++) {
 
         int op = bam_cigar_op(cigar[i]);  // CIGAR operation
@@ -373,6 +376,28 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     // std::cout << "[DEBUG] Running last " << last_n << " chromosomes" << std::endl;
     // //chromosomes = std::vector<std::string>(chromosomes.end()-3, chromosomes.end());
 
+    // Open the BAM file
+    std::string bam_filepath = this->input_data.getLongReadBam();
+    samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
+    if (!fp_in) {
+        throw std::runtime_error("ERROR: failed to open " + bam_filepath);
+    }
+
+    // Load the header for the BAM file
+    bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
+    if (!bamHdr) {
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to read header for " + bam_filepath);
+    }
+
+    // Load the index for the BAM file
+    hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
+    if (!idx) {
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
+    }
+
     // Loop through each region and detect SVs in chunks
     int chr_count = chromosomes.size();
     int current_chr = 0;
@@ -426,7 +451,8 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         std::set<SVCall> combined_sv_calls;
         for (const auto& sub_region : region_chunks) {
             // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl;
-            std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region);
+            // std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region);
+            std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region);
             std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
             PrimaryMap& primary_map = std::get<1>(region_data);
             SuppMap& supp_map = std::get<2>(region_data);
@@ -460,6 +486,10 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
             // main set
             // sv_calls.emplace_back(subregion_sv_calls);
 
+            // Merge the SV calls from the current region
+            std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
+            mergeSVs(subregion_sv_calls);
+
             // Combine the SV calls from the current region
             std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
             concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
@@ -479,12 +509,19 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl;
     }
 
+    // Clean up the BAM file, header, and index
+    hts_idx_destroy(idx);
+    bam_hdr_destroy(bamHdr);
+    sam_close(fp_in);
+
     // SVData sv_calls_combined;
     // for (const auto& subregion_sv_calls : sv_calls) {
     //     sv_calls_combined.concatenate(subregion_sv_calls);
     // }
 
-    std::cout << "SV calling completed." << std::endl;
+    // Save to VCF
+    std::cout << "Saving SVs to VCF..." << std::endl;
+    this->saveToVCF(whole_genome_sv_calls);
 
     return whole_genome_sv_calls;
 }
@@ -570,6 +607,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         int32_t primary_lh_t = 0;
         if (primary_end - primary_start >= min_cnv_length) {
             SVCandidate sv_candidate(primary_start+1, primary_end+1, ".");
+            // std::cout << "TEST5" << std::endl;
             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
             primary_lh = std::get<0>(result);
             // primary_log_likelihood /= (double)(primary_end - primary_start);  // Normalize the log likelihood by the length
@@ -613,6 +651,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         int largest_supp_lh_t = 0;
         if (largest_supp_length >= min_cnv_length) {
             SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, ".");
+            // std::cout << "TEST1" << std::endl;
             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
             largest_supp_lh = std::get<0>(result);
             // largest_supp_log_likelihood /= (double)largest_supp_length;  // Normalize the log likelihood by the length
@@ -627,6 +666,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         if (largest_supp_alignment != closest_supp_alignment) {
             if (closest_supp_length >= min_cnv_length) {
                 SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, ".");
+                // std::cout << "TEST2" << std::endl;
                 std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
                 closest_supp_lh = std::get<0>(result);
                 // closest_supp_log_likelihood /= (double)closest_supp_length;  // Normalize the log likelihood by the length
@@ -708,6 +748,8 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
             std::string chosen_candidate_str = "BOUNDARY";
             int split_scenario = NOCALL;
             for (const auto& sv_candidate : sv_candidates) {
+            	// std::cout << "TEST3: primary = " << primary_start << ", " << primary_end << " supp = " << supp_start << ", " << supp_end << std::endl;
+            	// std::cout << "Position: " << std::get<0>(sv_candidate) << ", " << std::get<1>(sv_candidate) << std::endl;
                 std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
                 double current_lh = std::get<0>(result);
                 SVType current_type = std::get<1>(result);
@@ -957,6 +999,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
                     // the closest supplementary alignment and the largest
                     // supplementary alignment
                     SVCandidate sv_candidate(std::get<2>(closest_supp_alignment)+1, std::get<1>(largest_supp_alignment)+1, ".");
+                    // std::cout << "TEST4" << std::endl;
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
                     // double complex_log_likelihood = std::get<0>(result);
                     SVType complex_type = std::get<1>(result);
@@ -1018,7 +1061,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
     }
 }
 
-void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall> >& sv_calls, const ReferenceGenome& ref_genome)
+void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall> >& sv_calls)
 {
     std::cout << "Creating VCF writer..." << std::endl;
     // std::string output_vcf = output_dir + "/output.vcf";
@@ -1032,25 +1075,19 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
 
     std::cout << "Getting reference genome filepath..." << std::endl;
     try {
-        std::string ref_fp = ref_genome.getFilepath();
+        std::string ref_fp = this->input_data.getRefGenome().getFilepath();
         std::cout << "Reference genome filepath: " << ref_fp << std::endl;
     } catch (const std::exception& e) {
         std::cerr << "Error: " << e.what() << std::endl;
         return;
     }
 
-    std::cout << "Getting reference genome header..." << std::endl;
-    try {
-        ref_genome.getContigHeader();
-    } catch (const std::exception& e) {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return;
-    }
-
     // Set the header lines
+    std::cout << "Getting reference genome header..." << std::endl;
+    const std::string contig_header = this->input_data.getRefGenome().getContigHeader();
     std::vector<std::string> header_lines = {
-        std::string("##reference=") + ref_genome.getFilepath(),
-        ref_genome.getContigHeader(),
+        std::string("##reference=") + this->input_data.getRefGenome().getFilepath(),  // reference line carries the FASTA path
+        contig_header,  // contig lines are their own header entry, not part of ##reference
         "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
         "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
@@ -1166,7 +1203,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
             if (sv_type_str == "DEL") {
                 // Get the deleted sequence from the reference genome, also including the preceding base
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                ref_allele = ref_genome.query(chr, preceding_pos, end);
+                // ref_allele = ref_genome.query(chr, preceding_pos, end);
+                ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, end);
 
                 // Use the preceding base as the alternate allele 
                 if (ref_allele != "") {
@@ -1184,7 +1222,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
             } else {
                 // Use the preceding base as the reference allele
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
+                // ref_allele = ref_genome.query(chr, preceding_pos,
+                // preceding_pos);
+                ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, preceding_pos);
 
                 // Format novel insertions
                 if (sv_type_str == "INS") {
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 15aefdfd..b2c5827f 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -9,8 +9,8 @@
 
 bool SVCall::operator<(const SVCall & other) const
 {
-    return std::tie(start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood) <
-           std::tie(other.start, other.end, other.sv_type, other.alt_allele, other.data_type, other.genotype, other.hmm_likelihood);
+    // Order calls by start position, breaking ties on end position only
+    return std::tie(start, end) < std::tie(other.start, other.end);
 }
 
 void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
@@ -27,29 +27,41 @@ void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::st
     // If the SV call already exists (start and end position), then update all information if the
     // likelihood is higher
     // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
-    std::vector<SVCall> updates;
+    SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1};
+    
+    sv_calls.insert(new_sv_call);
+    
+    /*
+    bool exists = false;
     bool print_out = false;
     for (auto it = sv_calls.begin(); it != sv_calls.end();)
     {
         if (it->start == start && it->end == end)
         {
+            exists = true;
             if (hmm_likelihood > it->hmm_likelihood)
             {
-                std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
+                //std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
                 print_out = true;
                 // Update the data type and support
-                std::string new_data_type = it->data_type + "," + data_type;
-                int new_support = it->support + 1;
+                // std::string new_data_type = it->data_type + "," + data_type;
+                // int new_support = it->support + 1;
+                new_sv_call.data_type = it->data_type + "," + data_type;
+                new_sv_call.support = it->support + 1;
+                //higher_lh = true;
 
-                updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support});
+                // updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support});
 
                 // Erase and re-insert the SV call
-                // Erase the current iterator and safely insert the new SV call
-                // sv_calls.erase(it);
-                it = sv_calls.erase(it);  // Erase and get the next iterator
+                // Erase the current iterator and safely insert the new SV calls
+                std::cout << "Erasing iterator." << std::endl;
+                sv_calls.erase(it);
+                std::cout << "Iterator erased." << std::endl;
+                break;
+                //it = sv_calls.erase(it);  // Erase and get the next iterator
                 // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support});
             } else {
-                // Return if no update is needed
+                // End if the SV exists but is lower lh
                 return;
             }
         } else {
@@ -62,23 +74,35 @@ void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::st
     {
         std::cout << "[DEBUG] Adding updates" << std::endl;
     }
-    
-    // Insert the updates
-    for (const auto& update : updates)
+
+    // Update the SV call if it does not exist, or if the likelihood is higher
+    // than the existing call
+    if (print_out)
     {
-        sv_calls.insert(update);
+        std::cout << "[DEBUG] Inserting call" << std::endl;
     }
-
+    sv_calls.insert(new_sv_call);
     if (print_out)
     {
-        std::cout << "[DEBUG] Added updates" << std::endl;
+        std::cout << "[DEBUG] Call inserted" << std::endl;
     }
+    // Insert the updates
+    // for (const auto& update : updates)
+    // {
+    //     sv_calls.insert(update);
+    // }
+
+    // if (print_out)
+    // {
+    //     std::cout << "[DEBUG] Added updates" << std::endl;
+    // }
 
 
     // Add the SV call if it does not exist
     // std::cout << "[TEST2] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
     // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
     // std::cout << "[TEST3] Added SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
+    */
 }
 
 std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count)
@@ -124,3 +148,69 @@ void concatenateSVCalls(std::set<SVCall> &target, const std::set<SVCall> &source
     // Efficiently concatenate two sets of SV calls
     target.insert(source.begin(), source.end());
 }
+
+void mergeSVs(std::set<SVCall>& sv_calls) {  // Collapses runs of overlapping SV calls into one representative call each, rewriting sv_calls in place
+    if (sv_calls.size() < 2) {
+        return;
+    }
+
+    // Walk the set in its sorted order, carrying a running representative (current_merge) for the current group of overlapping calls
+    int initial_size = sv_calls.size();
+    std::vector<SVCall> merged_sv_calls;
+    auto it = sv_calls.begin();
+    SVCall current_merge = *it++;
+
+    for (; it != sv_calls.end(); ++it) {
+        const SVCall& next = *it;
+
+        // Length of the intersection of the two intervals (0 when they do not intersect)
+        uint32_t overlap_start = std::max(current_merge.start, next.start);
+        uint32_t overlap_end = std::min(current_merge.end, next.end);
+        uint32_t overlap_length = (overlap_start < overlap_end) ? overlap_end - overlap_start : 0;
+
+        uint32_t current_length = current_merge.end - current_merge.start;
+        uint32_t next_length = next.end - next.start;
+
+        // Overlap as a fraction of each call's own length; either fraction reaching 0.5 puts the two calls in the same group.
+        // NOTE(review): a call with start == end gives a zero divisor here (NaN under IEEE floating point, so the >= test is false) - confirm this is intended
+        double overlap_pct_current = static_cast<double>(overlap_length) / current_length;
+        double overlap_pct_next = static_cast<double>(overlap_length) / next_length;
+
+        if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) {
+            // Same group: keep exactly one of the two calls as the representative (coordinates are not combined)
+            if (next.hmm_likelihood != 0.0) {
+                // next carries a likelihood: it replaces the representative only when
+                // its likelihood is strictly higher
+                if (next.hmm_likelihood > current_merge.hmm_likelihood) {
+                    current_merge = next;
+                }
+            } else {
+                // next has no likelihood (0.0, CIGAR only): it can win only when the representative
+                // also has 0.0, and then only by being strictly longer
+                if (next.hmm_likelihood == current_merge.hmm_likelihood) {
+                    if (next_length > current_length) {
+                        current_merge = next;
+                    }
+                }
+                // When the representative has a non-zero likelihood and next does not,
+                // the representative is kept regardless of length (an unconditional length
+                // comparison was tried here previously and dropped)
+            }
+        } else {
+            // Both overlap fractions are below 50%: emit the representative and start a new group at next
+            merged_sv_calls.push_back(current_merge);
+            current_merge = next;
+        }
+    }
+
+    // Emit the representative of the final group
+    merged_sv_calls.push_back(current_merge);
+
+    // Replace the contents of the caller's set with the representatives
+    sv_calls.clear();
+    for (const auto& sv_call : merged_sv_calls) {
+        sv_calls.insert(sv_call);
+    }
+    int updated_size = sv_calls.size();
+    std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
+}
diff --git a/src/utils.cpp b/src/utils.cpp
index 62088fe2..db083f97 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -99,4 +99,13 @@ std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start,
     int seconds = elapsed.count() - (hours * 3600) - (minutes * 60);
     std::string elapsed_time = std::to_string(hours) + ":" + std::to_string(minutes) + ":" + std::to_string(seconds);
     return elapsed_time;
-}
\ No newline at end of file
+}
+
+// Strip a leading "chr" from a chromosome name (e.g. "chr1" -> "1"); a name that does not start with "chr" is returned unchanged
+std::string removeChrPrefix(std::string chr)
+{
+    if (chr.compare(0, 3, "chr") == 0) {  // Match at offset 0 only: a "chr" found mid-name must not trigger substr(3), which always drops the first three characters
+        return chr.substr(3);
+    }
+    return chr;
+}
diff --git a/tests/test_general.py b/tests/test_general.py
index dbca30b9..1d81fe96 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,11 +64,11 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 41
+        assert len(f.readlines()) == 32
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.
-    header_line = 18
+    header_line = 17
     with open(output_file, 'r', encoding='utf-8') as f:
         for i, line in enumerate(f):
             if i == header_line:

From 7d36424f278d0537ffc85189df8ba0947bcd9ec5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 23 Nov 2024 14:59:04 -0500
Subject: [PATCH 020/134] Update pfb to htslib

---
 .gitignore           |   1 +
 include/cnv_caller.h |   9 ++-
 src/cnv_caller.cpp   | 182 +++++++++++++++++++++++++++++++++----------
 src/sv_caller.cpp    |   7 ++
 4 files changed, 153 insertions(+), 46 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5b1177ee..8284049d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -73,6 +73,7 @@ tests/cpp_module_out
 data/gnomadv2_filepaths.txt
 data/gnomadv3_filepaths.txt
 data/gnomadv4_filepaths.txt
+data/gnomadv4_filepaths_ssd.txt
 data/gnomadv4_hg19_filepaths.txt
 
 # Training data
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 858c7454..f518f055 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -56,10 +56,11 @@ class CNVCaller {
         SNPData snp_data;
         SNPInfo snp_info;
         double mean_chr_cov = 0.0;
-        std::unordered_map<uint32_t, int> pos_depth_map;
-        std::unordered_map<uint32_t, double> snp_baf_map;
-        std::set<uint32_t> snp_baf_keys;
-        std::unordered_map<uint32_t, double> snp_pfb_map;
+        std::unordered_map<uint32_t, int> pos_depth_map;  // Read depth map
+        std::unordered_map<uint32_t, double> snp_baf_map;  // SNP B-allele frequency map
+        // std::set<uint32_t> snp_alt_map;  // SNP B-allele map
+        std::set<uint32_t> snp_baf_keys;  // SNP positions for BAF values
+        std::unordered_map<uint32_t, double> snp_pfb_map;  // SNP population frequency map
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index e8428ec5..de19d28f 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -3,6 +3,9 @@
 
 #include <htslib/sam.h>
 
+#include <htslib/vcf.h>
+#include <htslib/hts.h>
+
 /// @cond
 #include <iostream>
 #include <sstream>
@@ -295,7 +298,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
             continue;
         }
 
-    	std::cout << "CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl;
+    	// std::cout << "CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl;
 
         // Get the depth at the start position. This is used as the FORMAT/DP
         // value in the VCF file
@@ -529,6 +532,9 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
         throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath);
     }
 
+    // Enable multi-threading
+    hts_set_threads(bam_file, this->input_data.getThreadCount());
+
     // Read the header
     bam_hdr_t *bam_header = sam_hdr_read(bam_file);
     if (!bam_header)
@@ -842,58 +848,150 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
     std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
     int thread_count = this->input_data.getThreadCount();
 
-    // Run bcftools query to get the population frequencies for the
-    // chromosome within the SNP region, filtering for SNPS only,
-    // and within the MIN-MAX range of frequencies.
-    std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf";
-    std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB);
-    std::string cmd = \
-        "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null";
-        
-    // printMessage("Running command: " + cmd);
-    std::cout << "Running command: " << cmd << std::endl;
+    // Open the population frequency file
+    std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;
+    htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r");
+    if (!pfb_file)
+    {
+        throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath);
+    }
 
-    // Open a pipe to read the output of the command
-    FILE *fp = popen(cmd.c_str(), "r");
-    if (fp == NULL)
+    // Enable multi-threading
+    std::cout << "Setting number of threads to " << thread_count << std::endl;
+    hts_set_threads(pfb_file, thread_count);
+
+    // Read the header
+    std::cout << "Reading header from population frequency file..." << std::endl;
+    bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file);
+    if (!pfb_header)
     {
-        throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd);
+        bcf_close(pfb_file);
+        throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath);
     }
 
-    // Loop through the BCFTOOLS output and populate the map of population
-    // frequencies
-    // printMessage("Parsing population frequencies for chromosome " + chr +
-    // "...");
-    std::cout << "Parsing population frequencies for chromosome " << chr << "..." << std::endl;
-    // const int line_size = 256;
-    // char line[line_size];
+    // Set up the record
+    std::cout << "Initializing BCF record..." << std::endl;
+    bcf1_t *pfb_record = bcf_init();
+    if (!pfb_record)
+    {
+        bcf_hdr_destroy(pfb_header);
+        bcf_close(pfb_file);
+        throw std::runtime_error("ERROR: Could not initialize BCF record.");
+    }
+
+    // Read the population frequencies for the chromosome
+    std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl;
     int print_count = 0;
-    // while (fgets(line, line_size, fp) != NULL)
-    char line[2048];
-    while (fgets(line, sizeof(line), fp) != NULL)
+    while (bcf_read(pfb_file, pfb_header, pfb_record) == 0)
     {
-        std::istringstream iss(line);
-        // Parse the line
-        int pos;
-        double pfb;
-        // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2)
-        if (iss >> pos >> pfb){
-            // pos_pfb_map[pos] = pfb;  // Add the position and population
-            // frequency to the map
-            // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
-
-            // Print the first 10 population frequencies
-            if (print_count < 10)
+        // Get the chromosome and position
+        // std::cout << "Reading record..." << std::endl;
+        // std::string record_chr = bcf_hdr_id2name(pfb_header, pfb_record->rid);
+        uint32_t pos = pfb_record->pos + 1;  // 0-based to 1-based
+
+        // Skip if not a SNP, or if the position is not in the BAF map
+        // if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.find(pos) == this->snp_baf_keys.end())
+        if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0)
+        {
+            continue;
+        }
+
+        // Get the population frequency for the SNP
+        // std::cout << "Getting population frequency..." << std::endl;
+        // double pfb = 0.0;
+        // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb, NULL);
+        // if (pfb_status < 0)
+        // {
+        //     continue;
+        // }
+        float *pfb_f = NULL;
+        int count = 0;
+        int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+        if (pfb_status < 0 || count == 0)
+        {
+            std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
+            continue;
+        }
+        double pfb = (double) pfb_f[0];
+        free(pfb_f);
+
+        // Continue if the population frequency is outside the threshold
+        if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+        {
+            continue;
+        }
+
+        // Add the population frequency to the SNP data
+        // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
+        if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end())
+        {
+            this->snp_pfb_map[pos] = pfb;
+        } else {
+            // Keep the larger population frequency
+            if (pfb > this->snp_pfb_map[pos])
             {
-                std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-                // printMessage("Population frequency for " + chr + ":" +
-                // std::to_string(pos) + " = " + std::to_string(pfb));
                 this->snp_pfb_map[pos] = pfb;
-                print_count++;
             }
         }
+        if (print_count < 10)
+        {
+            std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+            print_count++;
+        }
     }
-    pclose(fp);
+
+    // // Run bcftools query to get the population frequencies for the
+    // // chromosome within the SNP region, filtering for SNPS only,
+    // // and within the MIN-MAX range of frequencies.
+    // std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf";
+    // std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB);
+    // std::string cmd = \
+    //     "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null";
+        
+    // // printMessage("Running command: " + cmd);
+    // std::cout << "Running command: " << cmd << std::endl;
+
+    // // Open a pipe to read the output of the command
+    // FILE *fp = popen(cmd.c_str(), "r");
+    // if (fp == NULL)
+    // {
+    //     throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd);
+    // }
+
+    // // Loop through the BCFTOOLS output and populate the map of population
+    // // frequencies
+    // // printMessage("Parsing population frequencies for chromosome " + chr +
+    // // "...");
+    // std::cout << "Parsing population frequencies for chromosome " << chr << "..." << std::endl;
+    // // const int line_size = 256;
+    // // char line[line_size];
+    // int print_count = 0;
+    // // while (fgets(line, line_size, fp) != NULL)
+    // char line[2048];
+    // while (fgets(line, sizeof(line), fp) != NULL)
+    // {
+    //     std::istringstream iss(line);
+    //     // Parse the line
+    //     int pos;
+    //     double pfb;
+    //     // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2)
+    //     if (iss >> pos >> pfb){
+    //         // pos_pfb_map[pos] = pfb;  // Add the position and population
+    //         // frequency to the map
+    //         // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
+
+    //         // Print the first 10 population frequencies
+    //         if (print_count < 10)
+    //         {
+    //             std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+    //             // printMessage("Population frequency for " + chr + ":" +
+    //             // std::to_string(pos) + " = " + std::to_string(pfb));
+    //             this->snp_pfb_map[pos] = pfb;
+    //             print_count++;
+    //         }
+    //     }
+    // }
+    // pclose(fp);
     std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl;
     // printMessage("Finished parsing population frequencies for chromosome " + chr + "...");
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index cd44585e..228a93a2 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -383,6 +383,13 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         throw std::runtime_error("ERROR: failed to open " + bam_filepath);
     }
 
+    // Enable multi-threading
+    int num_threads = this->input_data.getThreadCount();
+    if (num_threads > 1) {
+        std::cout << "Running SV detection with " << num_threads << " thread(s)..." << std::endl;
+    }
+    hts_set_threads(fp_in, num_threads);
+
     // Load the header for the BAM file
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
     if (!bamHdr) {

From ba19aeb24ef8214f34047a39aa3f39ce9615ac44 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 01:58:15 -0500
Subject: [PATCH 021/134] update reading snp vcf to htslib

---
 include/cnv_caller.h |   15 +-
 python/sv_merger.py  |    2 +-
 src/cnv_caller.cpp   | 1124 ++++++++++++++++++++++++++++++++----------
 src/sv_object.cpp    |   12 +-
 4 files changed, 876 insertions(+), 277 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index f518f055..e8d3d3b8 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -57,10 +57,10 @@ class CNVCaller {
         SNPInfo snp_info;
         double mean_chr_cov = 0.0;
         std::unordered_map<uint32_t, int> pos_depth_map;  // Read depth map
-        std::unordered_map<uint32_t, double> snp_baf_map;  // SNP B-allele frequency map
+        // std::unordered_map<uint32_t, double> snp_baf_map;  // SNP B-allele frequency map
         // std::set<uint32_t> snp_alt_map;  // SNP B-allele map
-        std::set<uint32_t> snp_baf_keys;  // SNP positions for BAF values
-        std::unordered_map<uint32_t, double> snp_pfb_map;  // SNP population frequency map
+        // std::set<uint32_t> snp_baf_keys;  // SNP positions for BAF values
+        // std::unordered_map<uint32_t, double> snp_pfb_map;  // SNP population frequency map
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -90,7 +90,7 @@ class CNVCaller {
         // Query a region for SNPs and return the SNP data
         std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
 
-        std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> querySNPs(std::string chr, uint32_t start, uint32_t end);
+        void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb);
 
         // Run copy number prediction for a chunk of SV candidates from CIGAR strings
         void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov);
@@ -130,11 +130,14 @@ class CNVCaller {
         double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov);
 
         // Read SNP positions and BAF values from the VCF file of SNP calls
-        void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info);
+        // void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info);
 
         // Read SNP population frequencies from the PFB file and return a vector
         // of population frequencies for each SNP location
-        void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);
+        // void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);
+
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf);
+        void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map);
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood);
diff --git a/python/sv_merger.py b/python/sv_merger.py
index 78733f6d..b2d1491a 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -153,7 +153,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
 
     # Merge SVs with the same label
     unique_labels = np.unique(cluster_labels)
-    logging.info("Unique labels: %s", unique_labels)
+    #logging.info("Unique labels: %s", unique_labels)
 
     for label in unique_labels:
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index de19d28f..4fcc1604 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -5,6 +5,7 @@
 
 #include <htslib/vcf.h>
 #include <htslib/hts.h>
+#include <htslib/synced_bcf_reader.h>
 
 /// @cond
 #include <iostream>
@@ -59,7 +60,18 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     bool snps_found = false;
     uint32_t window_size = (uint32_t)this->input_data.getWindowSize();
 
-    // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
+    // Query the SNPs for the entire region
+    std::set<uint32_t> snp_pos;
+    std::unordered_map<uint32_t, double> snp_baf;
+    std::unordered_map<uint32_t, double> snp_pfb;
+    this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb);
+    // std::pair<std::vector<uint32_t>, std::vector<double>, std::vector<double>> snp_query = this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb);
+    // std::vector<uint32_t>& snp_pos = std::get<0>(snp_query);
+    // std::vector<double>& snp_pfb = std::get<1>(snp_query);
+    // std::vector<double>& snp_baf = std::get<2>(snp_query);
+
+    // Loop through the range of the SV region and query the SNPs in a sliding
+    // window, then calculate the log2 ratio for each window
     for (uint32_t i = start_pos; i <= end_pos; i += window_size)
     {
         // Run a sliding non-overlapping window of size window_size across
@@ -68,27 +80,25 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
         uint32_t window_end = std::min(i + window_size - 1, end_pos);
 
         // Get the SNP info for the window
-        // std::cout << "Querying SNPs for window " << chr << ":" << window_start << "-" << window_end << "..." << std::endl;
-        // this->snp_data_mtx.lock();
-        // std::tuple<std::vector<uint32_t>, std::vector<double>,
-        // std::vector<double>> window_snps = snp_info.querySNPs(chr,
-        // window_start, window_end);
-        std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> window_snps = this->querySNPs(chr, window_start, window_end);
-        // this->snp_data_mtx.unlock();
-        std::vector<uint32_t>& snp_window_pos = std::get<0>(window_snps);  // SNP positions
-        std::vector<double>& snp_window_bafs = std::get<1>(window_snps);  // B-allele frequencies
-        std::vector<double>& snp_window_pfbs = std::get<2>(window_snps);  // Population frequencies of the B allele
+        std::vector<uint32_t> snp_window_pos;
+        std::vector<double> snp_window_bafs;
+        std::vector<double> snp_window_pfbs;
+        auto it_start = snp_pos.lower_bound(window_start);
+        auto it_end = snp_pos.upper_bound(window_end);
+        for (auto it = it_start; it != it_end; it++)
+        {
+            snp_window_pos.push_back(*it);
+            snp_window_bafs.push_back(snp_baf[*it]);
+            snp_window_pfbs.push_back(snp_pfb[*it]);
+        }
 
         // Loop though the SNP positions and calculate the log2 ratio for
         // the window up to the SNP, then calculate the log2 ratio centered
         // at the SNP, and finally calculate the log2 ratio for the window
         // after the SNP, and continue until the end of the window
-        std::vector<double> window_log2_ratios;
-        int snp_count = (int) snp_window_pos.size();
-
-        // If there are no SNPs in the window, then use the default BAF and
-        // PFB values, and the coverage log2 ratio
-        if (snp_count == 0)
+        // (If there are no SNPs in the window, then use the default BAF and
+        // PFB values, and the coverage log2 ratio)
+        if (snp_window_pos.size() == 0)
         {
             double window_log2_ratio = calculateLog2Ratio(window_start, window_end, pos_depth_map, mean_chr_cov);
             double pfb_default = 0.5;
@@ -99,23 +109,18 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
             snps_found = true;
 
             // Loop through the SNPs and calculate the log2 ratios
-            uint32_t bin_start = window_start;
-            uint32_t bin_end = 0;
-            for (int j = 0; j < snp_count; j++)
+            // uint32_t bin_start = window_start;
+            // uint32_t bin_end = 0;
+            for (int j = 0; j < (int) snp_window_pos.size(); j++)
             {
-                // SNP bin starts at 1/2 the distance between the previous SNP
-                // and the current SNP, and ends at 1/2 the distance between
-                // the current SNP and the next SNP. For the first SNP, the
-                // bin starts at the window start and ends at 1/2 the distance
-                // between the first SNP and the next SNP, and for the last
-                // SNP, the bin starts at 1/2 the distance between the previous
-                // SNP and the last SNP and ends at the window end.
-                uint32_t snp_pos = snp_window_pos[j];
-                bin_end = snp_pos + (j == snp_count-1 ? (window_end - snp_pos) / 2 : (snp_window_pos[j+1] - snp_pos) / 2);
+                // Just use a window centered at the SNP position
+                uint32_t bin_start = snp_window_pos[j] - window_size / 2;
+                uint32_t bin_end = snp_window_pos[j] + window_size / 2;
 
                 // Calculate the log2 ratio for the SNP bin
                 double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov);
-                this->updateSNPData(snp_data, snp_pos, snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
+                this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
+                // this->updateSNPData(snp_data, snp_pos, snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
 
                 // Update the previous bin start
                 bin_start = bin_end + 1;
@@ -512,13 +517,13 @@ void CNVCaller::loadChromosomeData(std::string chr)
     //this->mean_chr_cov = 30.0;
     printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
 
-    std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl;
-    std::string snp_filepath = this->input_data.getSNPFilepath();
-    readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info);
+    // std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl;
+    // std::string snp_filepath = this->input_data.getSNPFilepath();
+    // readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info);
 
-    std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl;
-    getSNPPopulationFrequencies(chr, this->snp_info);
-    std::cout << "Finished loading chromosome data for " << chr << std::endl;
+    // std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl;
+    // getSNPPopulationFrequencies(chr, this->snp_info);
+    // std::cout << "Finished loading chromosome data for " << chr << std::endl;
 }
 
 // Calculate the mean chromosome coverage
@@ -679,133 +684,563 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::
     return window_log2_ratio;
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info)
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf)
 {
-
-    // Check that the SNP file is sorted by running bcftools index and reading
-    // the error output
-    std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error";
-    if (this->input_data.getVerbose()) {
-        std::cout << "Command: " << index_cmd << std::endl;
-    }
-
-    // Open a pipe to read the output of the command
-    FILE *index_fp = popen(index_cmd.c_str(), "r");
-    if (index_fp == NULL)
+    // Get the SNP file path
+    std::string snp_filepath = this->input_data.getSNPFilepath();
+    if (snp_filepath.empty())
     {
-        std::cerr << "ERROR: Could not open pipe for command: " << index_cmd << std::endl;
-        exit(1);
+        throw std::runtime_error("ERROR: SNP file path is empty.");
     }
 
-    // Read the output of the command
-    const int error_size = 256;
-    char index_error[error_size];
-    while (fgets(index_error, error_size, index_fp) != NULL)
+    // Initialize the synced reader
+    bcf_srs_t *snp_reader = bcf_sr_init();
+    if (!snp_reader)
     {
-        std::cerr << "ERROR: " << index_error << std::endl;
-        exit(1);
-    }
-    pclose(index_fp);  // Close the process
-
-    // Filter variants by depth, quality, and region
-    if (this->input_data.getVerbose()) {
-        std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl;
+        throw std::runtime_error("ERROR: Could not initialize SNP reader.");
     }
 
-    // Check if a region was specified by the user
-    std::string region_str = chr;
-    if (this->input_data.isRegionSet())
+    // Open the SNP file and read its header (bcf_hdr_read is only called on a valid handle)
+    htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r");
+    bcf_hdr_t *snp_header = snp_file ? bcf_hdr_read(snp_file) : NULL;
+    if (!snp_header)
     {
-        std::pair<int32_t, int32_t> region = this->input_data.getRegion();
-        region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second);
+        bcf_sr_destroy(snp_reader);
+        if (snp_file) { bcf_close(snp_file); }
+        throw std::runtime_error("ERROR: Could not open SNP file or read its header: " + snp_filepath);
     }
 
-    std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf";
+    // Set multi-threading
     int thread_count = this->input_data.getThreadCount();
-    // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
-    std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
-    if (this->input_data.getVerbose()) {
-        std::cout << "Filtering SNPs by depth and quality..." << std::endl;
-        std::cout << "Command: " << cmd << std::endl;
-    }
-    system(cmd.c_str());
-    
-    if (this->input_data.getVerbose()) {
-        std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl;
-    }
+    bcf_sr_set_threads(snp_reader, thread_count);
 
-    // Extract B-allele frequency data from the VCF file and sort by chromosome
-    // and position
-    if (this->input_data.getVerbose()) {
-        std::cout << "Extracting B-allele frequency data from filtered SNPs..." << std::endl;
+    // Enable index usage
+    snp_reader->require_index = 1;
+
+    // Add the SNP file to the reader
+    if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
+    {
+        bcf_sr_destroy(snp_reader);
+        bcf_hdr_destroy(snp_header);
+        bcf_close(snp_file);
+        throw std::runtime_error("ERROR: Could not add SNP file to reader: " + snp_filepath);
     }
-    cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null";
-    FILE *fp = popen(cmd.c_str(), "r");
-    if (fp == NULL)
+
+    // Set the region
+    std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
     {
-        std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl;
-        exit(1);
+        bcf_sr_destroy(snp_reader);
+        bcf_hdr_destroy(snp_header);
+        bcf_close(snp_file);
+        throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str);
     }
 
-    // Read the reference and alternate allele depths from the VCF file
-    std::string alt_allele = "";  // Alternate allele
-    uint32_t pos = 0;
-    int ref_ad = 0;
-    int alt_ad = 0;
-    const int line_size = 1024;
-    char line[line_size];  // Line buffer
-    std::vector<int64_t> locations;
-    std::vector<double> bafs;
-    std::string chr_no_prefix = removeChrPrefix(chr);
-    while (fgets(line, line_size, fp) != NULL)
+    std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
+    int print_count = 0;
+    while (bcf_sr_next_line(snp_reader) >= 0)
     {
-        // Parse the line
-        char *tok = strtok(line, ",");  // Tokenize the line
-        int col = 0;  // Column index
-        while (tok != NULL)
+        bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0);
+        if (snp_record)
         {
-            // Get the position from column 2
-            if (col == 0)
+            uint32_t pos = (uint32_t)snp_record->pos + 1;
+
+            // Skip if not a SNP
+            if (!bcf_is_snp(snp_record))
             {
-                pos = (uint32_t)atoi(tok);
+                continue;
             }
 
-            // Get the AD for the reference allele from column 3
-            else if (col == 1)
+            // Get the QUAL value and report it when the record has none
+            float qual = snp_record->qual;
+            if (bcf_float_is_missing(qual))
+            {
+                std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl;
+            }
+            // Skip unless QUAL > 30; written negated so a missing (NaN) QUAL is skipped as well
+            if (!(qual > 30))
             {
-                ref_ad = atoi(tok);
+                continue;
             }
 
-            // Get the AD for the non-reference allele from column 4
-            else if (col == 2)
+            // Extract DP from FORMAT field
+            int32_t *dp = 0;
+            int dp_count = 0;
+            // The return value is the number of values written to dp (negative on
+            // error); dp_count only receives the capacity of the buffer.
+            int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count);
+            if (dp_ret < 0)
             {
-                alt_ad = atoi(tok);
+                std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl;
+            }
+            // Skip if any reported depth is not greater than 10. The result is kept
+            // in a flag because a continue placed inside the for loop below would
+            // only advance that inner loop instead of skipping the SNP. dp_ret is
+            // negative when DP could not be read, so the loop is then not entered.
+            bool low_depth = false;
+            for (int i = 0; i < dp_ret; i++)
+            {
+                if (dp[i] <= 10)
+                {
+                    low_depth = true;
+                    break;
+                }
+            }
+            // Release the htslib-allocated buffer before the early exit
+            free(dp);
+            if (low_depth)
+            {
+                continue;
+            }
+            
+            // Skip if the SNP does not pass the filter
+            if (bcf_has_filter(snp_header, snp_record, const_cast<char*>("PASS")) != 1)
+            {
+                continue;
             }
 
-            // Move to the next token
-            tok = strtok(NULL, ",");
-            col++;
-        }
+            // Extract AD from FORMAT field
+            // int32_t ad[2] = {0, 0};
+            int32_t *ad = 0;
+            int ad_count = 0;
+            // int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD",
+            // &ad, &ad_count);
+            int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
+
+            // Skip if AD value is missing
+            if (ad_ret < 0)
+            {
+                std::cerr << "ERROR: AD value is missing for SNP at " << chr << ":" << pos << std::endl;
+                continue;
+            }
 
-        // Calculate the B-allele frequency (BAF) as the ratio of the alternate
-        // allele depth to the total depth (reference + alternate)
-        double baf = (double) alt_ad / (double) (ref_ad + alt_ad);
+            // Calculate the B-allele frequency (BAF) as ALT / (REF + ALT) from the
+            // first two AD values. ad_ret is the number of values written to ad;
+            // a depth that was not provided stays at 0.
+            double baf = 0.0;
+            double ad0 = 0.0;
+            double ad1 = 0.0;
+            if (ad_ret > 0)
+            {
+                ad0 = (double) ad[0];
+            }
+            if (ad_ret > 1)
+            {
+                ad1 = (double) ad[1];
+            }
+            free(ad);
+
+            // Floating-point division never throws, so a zero total depth has to
+            // be rejected explicitly (it would yield NaN). Negative values are
+            // htslib's missing/vector-end markers and are rejected as well.
+            if (ad0 < 0.0 || ad1 < 0.0 || (ad0 + ad1) == 0.0)
+            {
+                std::cerr << "ERROR: Could not calculate BAF for SNP at " << chr << ":" << pos << std::endl;
+                continue;
+            }
+            baf = ad1 / (ad0 + ad1);
+
+            // Insert the SNP position and BAF into the maps
+            snp_pos.insert(pos);
+            snp_baf[pos] = baf;
 
-        // Add a new location and BAF value to the chromosome's SNP data
-        // (population frequency and log2 ratio will be added later)
-        // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf);
-        this->snp_baf_map[pos] = baf;
-        this->snp_baf_keys.insert(pos);
+            // Print the SNP position and BAF
+            if (print_count < 10)
+            {
+                std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl;
+                print_count++;
+            }
+        }
     }
 
-    pclose(fp);  // Close the process
+    // Clean up
+    std::cout << "Cleaning up SNP reader..." << std::endl;
+    bcf_sr_destroy(snp_reader);
+    bcf_hdr_destroy(snp_header);
+    bcf_close(snp_file);
+
+    // std::cout << "Opening SNP file: " << snp_filepath << std::endl;
+    // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r");
+    // if (!snp_file)
+    // {
+    //     throw std::runtime_error("ERROR: Could not open SNP file: " + snp_filepath);
+    // }
+
+    // // Enable multi-threading
+    // hts_set_threads(snp_file, thread_count);
+
+    // // Read the header
+    // bcf_hdr_t *snp_header = bcf_hdr_read(snp_file);
+    // if (!snp_header)
+    // {
+    //     bcf_close(snp_file);
+    //     throw std::runtime_error("ERROR: Could not read header from SNP file: " + snp_filepath);
+    // }
+
+    // // Load the index
+    // hts_idx_t *snp_index = bcf_index_load(snp_filepath.c_str());
+    // if (!snp_index)
+    // {
+    //     bcf_hdr_destroy(snp_header);
+    //     bcf_close(snp_file);
+    //     throw std::runtime_error("ERROR: Could not load index for SNP file: " + snp_filepath);
+    // }
+
+    // // Construct the region string
+    // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    // hts_itr_t *snp_iter = bcf_itr_querys(snp_index, snp_header, region_str.c_str());
+    // if (!snp_iter)
+    // {
+    //     hts_idx_destroy(snp_index);
+    //     bcf_hdr_destroy(snp_header);
+    //     bcf_close(snp_file);
+    //     throw std::runtime_error("ERROR: Could not create iterator for SNP region: " + region_str);
+    // }
+
+    // // Set up the record
+    // bcf1_t *snp_record = bcf_init();
+    // if (!snp_record)
+    // {
+    //     bcf_hdr_destroy(snp_header);
+    //     bcf_close(snp_file);
+    //     throw std::runtime_error("ERROR: Could not initialize SNP record.");
+    // }
+
+    // // Read the SNPs in the chromosome region
+    // int print_count = 0;
+    // while (bcf_itr_next(snp_file, snp_iter, snp_record) >= 0)
+    // {
+    //     // Get the position and B-allele frequency (BAF) from the SNP record
+    //     uint32_t pos = snp_record->pos + 1;  // 0-based to 1-based
+
+    //     // Get QUAL, DP, and AD values
+    //     float qual = snp_record->qual;
+    //     if (bcf_float_is_missing(qual))
+    //     {
+    //         std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl;
+    //     }
+    //     // Skip if quality is less than 30
+    //     if (qual <= 30)
+    //     {
+    //         continue;
+    //     }
+
+    //     // Get FILTER status
+    //     int pass_id = bcf_hdr_id2int(snp_header, BCF_DT_ID, "PASS");
+    //     if (pass_id == -1)
+    //     {
+    //         std::cerr << "ERROR: Could not get PASS ID for SNP at " << chr << ":" << pos << std::endl;
+    //     }
+    //     std::string pass_filter = "PASS";
+    //     if (bcf_has_filter(snp_header, snp_record, const_cast<char*>(pass_filter.c_str())) != 1)
+    //     {
+    //         // Skip if the SNP does not pass the filter
+    //         continue;
+    //     }
+
+    //     // Extract DP from INFO field
+    //     int32_t dp = 0;
+    //     int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP", &dp, &dp_count);
+    //     if (dp_count != 1)
+    //     {
+    //         std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl;
+    //     }
+    //     // Skip if depth is not greater than 10
+    //     if (dp <= 10)
+    //     {
+    //         continue;
+    //     }
+
+    //     // Skip if not a SNP
+    //     if (!bcf_is_snp(snp_record))
+    //     {
+    //         continue;
+    //     }
+
+    //     // Extract AD from FORMAT field
+    //     int32_t ad[2] = {0, 0};
+    //     int ad_count = 0;
+    //     int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
+    //     // if (ad_count != 2)
+    //     // {
+    //     //     std::cerr << "ERROR: Could not get AD value for SNP at " << chr << ":" << pos << std::endl;
+    //     // }
+
+    //     // Calculate the BAF
+    //     if (ad_ret > 0 && ad_count > 0)
+    //     {
+    //         double baf = (double) ad[1] / (double) (ad[0] + ad[1]);
+    //         snp_pos.insert(pos);
+    //         snp_baf[pos] = baf;
+
+    //         // Print the SNP position and BAF
+    //         if (print_count < 10)
+    //         {
+    //             std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << std::endl;
+    //             print_count++;
+    //         }
+    //     }
+    // }
+
+    // // Clean up
+    // bcf_destroy(snp_record);
+    // hts_itr_destroy(snp_iter);
+    // hts_idx_destroy(snp_index);
+    // bcf_hdr_destroy(snp_header);
+    // bcf_close(snp_file);
+
+    // // Check that the SNP file is sorted by running bcftools index and reading
+    // // the error output
+    // std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error";
+    // if (this->input_data.getVerbose()) {
+    //     std::cout << "Command: " << index_cmd << std::endl;
+    // }
+
+    // // Open a pipe to read the output of the command
+    // FILE *index_fp = popen(index_cmd.c_str(), "r");
+    // if (index_fp == NULL)
+    // {
+    //     std::cerr << "ERROR: Could not open pipe for command: " << index_cmd << std::endl;
+    //     exit(1);
+    // }
+
+    // // Read the output of the command
+    // const int error_size = 256;
+    // char index_error[error_size];
+    // while (fgets(index_error, error_size, index_fp) != NULL)
+    // {
+    //     std::cerr << "ERROR: " << index_error << std::endl;
+    //     exit(1);
+    // }
+    // pclose(index_fp);  // Close the process
+
+    // // Filter variants by depth, quality, and region
+    // if (this->input_data.getVerbose()) {
+    //     std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl;
+    // }
+
+    // // Check if a region was specified by the user
+    // std::string region_str = chr;
+    // if (this->input_data.isRegionSet())
+    // {
+    //     std::pair<int32_t, int32_t> region = this->input_data.getRegion();
+    //     region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second);
+    // }
+
+    // std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf";
+    // int thread_count = this->input_data.getThreadCount();
+    // // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
+    // std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
+    // if (this->input_data.getVerbose()) {
+    //     std::cout << "Filtering SNPs by depth and quality..." << std::endl;
+    //     std::cout << "Command: " << cmd << std::endl;
+    // }
+    // system(cmd.c_str());
+    
+    // if (this->input_data.getVerbose()) {
+    //     std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl;
+    // }
+
+    // // Extract B-allele frequency data from the VCF file and sort by chromosome
+    // // and position
+    // if (this->input_data.getVerbose()) {
+    //     std::cout << "Extracting B-allele frequency data from filtered SNPs..." << std::endl;
+    // }
+    // cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null";
+    // FILE *fp = popen(cmd.c_str(), "r");
+    // if (fp == NULL)
+    // {
+    //     std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl;
+    //     exit(1);
+    // }
+
+    // // Read the reference and alternate allele depths from the VCF file
+    // std::string alt_allele = "";  // Alternate allele
+    // uint32_t pos = 0;
+    // int ref_ad = 0;
+    // int alt_ad = 0;
+    // const int line_size = 1024;
+    // char line[line_size];  // Line buffer
+    // std::vector<int64_t> locations;
+    // std::vector<double> bafs;
+    // std::string chr_no_prefix = removeChrPrefix(chr);
+    // while (fgets(line, line_size, fp) != NULL)
+    // {
+    //     // Parse the line
+    //     char *tok = strtok(line, ",");  // Tokenize the line
+    //     int col = 0;  // Column index
+    //     while (tok != NULL)
+    //     {
+    //         // Get the position from column 2
+    //         if (col == 0)
+    //         {
+    //             pos = (uint32_t)atoi(tok);
+    //         }
+
+    //         // Get the AD for the reference allele from column 3
+    //         else if (col == 1)
+    //         {
+    //             ref_ad = atoi(tok);
+    //         }
+
+    //         // Get the AD for the non-reference allele from column 4
+    //         else if (col == 2)
+    //         {
+    //             alt_ad = atoi(tok);
+    //         }
+
+    //         // Move to the next token
+    //         tok = strtok(NULL, ",");
+    //         col++;
+    //     }
+
+    //     // Calculate the B-allele frequency (BAF) as the ratio of the alternate
+    //     // allele depth to the total depth (reference + alternate)
+    //     double baf = (double) alt_ad / (double) (ref_ad + alt_ad);
+
+    //     // Add a new location and BAF value to the chromosome's SNP data
+    //     // (population frequency and log2 ratio will be added later)
+    //     // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf);
+    //     this->snp_baf_map[pos] = baf;
+    //     this->snp_baf_keys.insert(pos);
+    // }
+
+    // pclose(fp);  // Close the process
 
     if (this->input_data.getVerbose()) {
         std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl;
     }
 }
 
-void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
+// void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
+// {
+//     // Get the population frequency file for the chromosome
+//     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
+//     if (pfb_filepath.empty())
+//     {
+//         std::cout << "No population frequency file provided for chromosome " << chr << std::endl;
+//         return;
+//     }
+    
+//     // Determine the ethnicity-specific allele frequency key
+//     std::string AF_key = "AF";
+//     if (this->input_data.getEthnicity() != "")
+//     {
+//         AF_key += "_" + this->input_data.getEthnicity();
+//     }
+
+//     // Check if the filepath uses the 'chr' prefix notations based on the
+//     // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
+//     std::string chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
+//     std::string chr_prefix = "chr";
+//     if (pfb_filepath.find(chr_prefix) == std::string::npos)
+//     {
+//         // Remove the 'chr' prefix from the chromosome name
+//         if (chr_gnomad.find(chr_prefix) != std::string::npos)
+//         {
+//             chr_gnomad = chr_gnomad.substr(chr_prefix.length());
+//         }
+//     } else {
+//         // Add the 'chr' prefix to the chromosome name
+//         if (chr_gnomad.find(chr_prefix) == std::string::npos)
+//         {
+//             chr_gnomad = chr_prefix + chr;
+//         }
+//     }
+
+//     // Remove the 'chr' prefix from the chromosome name for SNP data. All
+//     // SNP data in this program does not use the 'chr' prefix
+//     std::string chr_no_prefix = removeChrPrefix(chr);
+
+//     std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
+//     int thread_count = this->input_data.getThreadCount();
+
+//     // Open the population frequency file
+//     std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;
+//     htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r");
+//     if (!pfb_file)
+//     {
+//         throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath);
+//     }
+
+//     // Enable multi-threading
+//     std::cout << "Setting number of threads to " << thread_count << std::endl;
+//     hts_set_threads(pfb_file, thread_count);
+
+//     // Read the header
+//     std::cout << "Reading header from population frequency file..." << std::endl;
+//     bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file);
+//     if (!pfb_header)
+//     {
+//         bcf_close(pfb_file);
+//         throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath);
+//     }
+
+//     // Set up the record
+//     std::cout << "Initializing BCF record..." << std::endl;
+//     bcf1_t *pfb_record = bcf_init();
+//     if (!pfb_record)
+//     {
+//         bcf_hdr_destroy(pfb_header);
+//         bcf_close(pfb_file);
+//         throw std::runtime_error("ERROR: Could not initialize BCF record.");
+//     }
+
+//     // Read the population frequencies for the chromosome
+//     std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl;
+//     int print_count = 0;
+//     while (bcf_read(pfb_file, pfb_header, pfb_record) == 0)
+//     {
+//         // Get the chromosome and position
+//         // std::cout << "Reading record..." << std::endl;
+//         uint32_t pos = pfb_record->pos + 1;  // 0-based to 1-based
+
+//         // Skip if not a SNP, or if the position is not in the BAF map
+//         if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0)
+//         {
+//             continue;
+//         }
+
+//         // Get the population frequency for the SNP
+//         float *pfb_f = NULL;
+//         int count = 0;
+//         int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+//         if (pfb_status < 0 || count == 0)
+//         {
+//             std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
+//             continue;
+//         }
+//         double pfb = (double) pfb_f[0];
+//         free(pfb_f);
+
+//         // Continue if the population frequency is outside the threshold
+//         if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+//         {
+//             continue;
+//         }
+
+//         // Add the population frequency to the SNP data
+//         // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
+//         if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end())
+//         {
+//             this->snp_pfb_map[pos] = pfb;
+//         } else {
+//             // Keep the larger population frequency
+//             if (pfb > this->snp_pfb_map[pos])
+//             {
+//                 this->snp_pfb_map[pos] = pfb;
+//             }
+//         }
+//         if (print_count < 10)
+//         {
+//             std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+//             print_count++;
+//         }
+//     }
+//     std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl;
+// }
+
+void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map)
 {
     // Get the population frequency file for the chromosome
     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
@@ -848,152 +1283,278 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
     std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
     int thread_count = this->input_data.getThreadCount();
 
-    // Open the population frequency file
-    std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;
-    htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r");
-    if (!pfb_file)
+    // Initialize the synced reader
+    bcf_srs_t *pfb_reader = bcf_sr_init();
+    if (!pfb_reader)
     {
-        throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath);
+        throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath);
     }
 
-    // Enable multi-threading
-    std::cout << "Setting number of threads to " << thread_count << std::endl;
-    hts_set_threads(pfb_file, thread_count);
+    // Set multi-threading
+    bcf_sr_set_threads(pfb_reader, thread_count);
 
-    // Read the header
-    std::cout << "Reading header from population frequency file..." << std::endl;
-    bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file);
-    if (!pfb_header)
+    // Enable index usage
+    pfb_reader->require_index = 1;
+
+    // Add the population frequency file to the synced reader
+    if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
     {
-        bcf_close(pfb_file);
-        throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath);
+        bcf_sr_destroy(pfb_reader);
+        throw std::runtime_error("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath);
     }
 
-    // Set up the record
-    std::cout << "Initializing BCF record..." << std::endl;
-    bcf1_t *pfb_record = bcf_init();
-    if (!pfb_record)
+    // Set the region for the synced reader
+    std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0)
     {
-        bcf_hdr_destroy(pfb_header);
-        bcf_close(pfb_file);
-        throw std::runtime_error("ERROR: Could not initialize BCF record.");
+        bcf_sr_destroy(pfb_reader);
+        throw std::runtime_error("ERROR: Could not set region for synced reader: " + region_str);
+    } else {
+        std::cout << "Successfully set region for synced reader: " << region_str << std::endl;
     }
 
-    // Read the population frequencies for the chromosome
-    std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl;
-    int print_count = 0;
-    while (bcf_read(pfb_file, pfb_header, pfb_record) == 0)
+    // Iterate through the records in the population frequency file
+    // bcf1_t *pfb_record = bcf_init();
+    // if (!pfb_record)
+    // {
+    //     bcf_sr_destroy(pfb_reader);
+    //     throw std::runtime_error("ERROR: Could not initialize BCF record for population frequency file: " + pfb_filepath);
+    // }
+
+    int test_count = 0;
+    std::cout << "Iterating through records..." << std::endl;
+    while (bcf_sr_next_line(pfb_reader) >= 0)
     {
-        // Get the chromosome and position
         // std::cout << "Reading record..." << std::endl;
-        // std::string record_chr = bcf_hdr_id2name(pfb_header, pfb_record->rid);
-        uint32_t pos = pfb_record->pos + 1;  // 0-based to 1-based
-
-        // Skip if not a SNP, or if the position is not in the BAF map
-        // if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.find(pos) == this->snp_baf_keys.end())
-        if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0)
+        // pfb_record = bcf_sr_get_line(pfb_reader, 0);
+        bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
+        // Do something with the record
+        if (pfb_record)
         {
-            continue;
-        }
+            // Skip if not a SNP
+            if (!bcf_is_snp(pfb_record))
+            {
+                // std::cout << "Skipping non-SNP at " << chr << ":" << pfb_record->pos << std::endl;
+                continue;
+            }
 
-        // Get the population frequency for the SNP
-        // std::cout << "Getting population frequency..." << std::endl;
-        // double pfb = 0.0;
-        // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb, NULL);
-        // if (pfb_status < 0)
-        // {
-        //     continue;
-        // }
-        float *pfb_f = NULL;
-        int count = 0;
-        int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-        if (pfb_status < 0 || count == 0)
-        {
-            std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
-            continue;
-        }
-        double pfb = (double) pfb_f[0];
-        free(pfb_f);
+            uint32_t pos = (uint32_t) pfb_record->pos + 1;  // 0-based to 1-based
 
-        // Continue if the population frequency is outside the threshold
-        if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-        {
-            continue;
-        }
+            // Get the population frequency for the SNP
+            float *pfb_f = NULL;
+            int count = 0;
+            int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+            if (pfb_status < 0 || count == 0)
+            {
+                // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
+                continue;
+            }
+            double pfb = (double) pfb_f[0];
+            free(pfb_f);
 
-        // Add the population frequency to the SNP data
-        // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
-        if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end())
-        {
-            this->snp_pfb_map[pos] = pfb;
-        } else {
-            // Keep the larger population frequency
-            if (pfb > this->snp_pfb_map[pos])
+            // Continue if the population frequency is outside the threshold
+            if (pfb <= MIN_PFB || pfb >= MAX_PFB)
             {
-                this->snp_pfb_map[pos] = pfb;
+                continue;
+            }
+
+            // Add the population frequency to the SNP data
+            if (snp_pfb_map.find(pos) == snp_pfb_map.end())
+            {
+                snp_pfb_map[pos] = pfb;
+            } else {
+                // Keep the larger population frequency
+                if (pfb > snp_pfb_map[pos])
+                {
+                    snp_pfb_map[pos] = pfb;
+                }
+            }
+
+            if (test_count < 10)
+            {
+                std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+                test_count++;
             }
         }
-        if (print_count < 10)
-        {
-            std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-            print_count++;
-        }
+            // std::cout << "Record: " << pfb_record->pos << std::endl;
+            // std::cout << "QUAL: " << pfb_record->qual << std::endl;
+
+            //     // Skip if not a SNP
+            //     if (!bcf_is_snp(pfb_record))
+            //     {
+            //         std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl;
+            //         continue;
+            //     }
+
+            //     // Get the population frequency for the SNP
+            //     float *pfb_f = NULL;
+            //     int count = 0;
+            //     int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+            //     if (pfb_status < 0 || count == 0)
+            //     {
+            //         std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
+            //         continue;
+            //     }
+            //     double pfb = (double) pfb_f[0];
+            //     free(pfb_f);
+
+            //     // Continue if the population frequency is outside the threshold
+            //     if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+            //     {
+            //         continue;
+            //     }
+
+            //     // Add the population frequency to the SNP data
+            //     // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
+            //     if (snp_pfb_map.find(pos) == snp_pfb_map.end())
+            //     {
+            //         snp_pfb_map[pos] = pfb;
+            //     } else {
+            //         // Keep the larger population frequency
+            //         if (pfb > snp_pfb_map[pos])
+            //         {
+            //             snp_pfb_map[pos] = pfb;
+            //         }
+            //     }
+            //     if (print_count < 10)
+            //     {
+            //         std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+            //         print_count++;
+            //     }        }
+    }
+    if (pfb_reader->errnum)
+    {
+        std::cerr << "ERROR: " <<bcf_sr_strerror(pfb_reader->errnum) << std::endl;
     }
 
-    // // Run bcftools query to get the population frequencies for the
-    // // chromosome within the SNP region, filtering for SNPS only,
-    // // and within the MIN-MAX range of frequencies.
-    // std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf";
-    // std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB);
-    // std::string cmd = \
-    //     "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null";
-        
-    // // printMessage("Running command: " + cmd);
-    // std::cout << "Running command: " << cmd << std::endl;
+    // std::cout << "Test count: " << test_count << std::endl;
 
-    // // Open a pipe to read the output of the command
-    // FILE *fp = popen(cmd.c_str(), "r");
-    // if (fp == NULL)
+    // Clean up
+    // bcf_destroy(pfb_record);
+    bcf_sr_destroy(pfb_reader);
+    std::cout << "Finished reading population frequencies for SV region" << std::endl;
+
+
+    // // Open the population frequency file
+    // std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;
+    // htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r");
+    // if (!pfb_file)
+    // {
+    //     throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath);
+    // }
+
+    // // Enable multi-threading
+    // std::cout << "Setting number of threads to " << thread_count << std::endl;
+    // hts_set_threads(pfb_file, thread_count);
+
+    // // Read the header
+    // std::cout << "Reading header from population frequency file..." << std::endl;
+    // bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file);
+    // if (!pfb_header)
+    // {
+    //     bcf_close(pfb_file);
+    //     throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath);
+    // }
+
+    // // Load the index
+    // hts_idx_t *pfb_index = bcf_index_load(pfb_filepath.c_str());
+    // if (!pfb_index)
+    // {
+    //     bcf_hdr_destroy(pfb_header);
+    //     bcf_close(pfb_file);
+    //     throw std::runtime_error("ERROR: Could not load index for population frequency file: " + pfb_filepath);
+    // }
+
+    // // Construct the region string
+    // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    // hts_itr_t *pfb_iter = bcf_itr_querys(pfb_index, pfb_header, region_str.c_str());
+    // if (!pfb_iter)
+    // {
+    //     // Try using the other chromosome notation
+    //     std::string alt_region_str = "chr" + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    //     pfb_iter = bcf_itr_querys(pfb_index, pfb_header, alt_region_str.c_str());
+    //     if (!pfb_iter)
+    //     {
+    //         hts_idx_destroy(pfb_index);
+    //         bcf_hdr_destroy(pfb_header);
+    //         bcf_close(pfb_file);
+    //         throw std::runtime_error("ERROR: Could not create iterator for region: " + alt_region_str);
+    //     } else {
+    //         region_str = alt_region_str;
+    //         std::cout << "Successfully created iterator for region: " << region_str << std::endl;
+    //     }
+    //     // hts_idx_destroy(pfb_index);
+    //     // bcf_hdr_destroy(pfb_header);
+    //     // bcf_close(pfb_file);
+    //     // throw std::runtime_error("ERROR: Could not create iterator for region: " + region_str);
+    // }
+
+    // // Set up the record
+    // std::cout << "Initializing BCF record..." << std::endl;
+    // bcf1_t *pfb_record = bcf_init();
+    // if (!pfb_record)
     // {
-    //     throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd);
+    //     bcf_hdr_destroy(pfb_header);
+    //     bcf_close(pfb_file);
+    //     throw std::runtime_error("ERROR: Could not initialize BCF record.");
     // }
 
-    // // Loop through the BCFTOOLS output and populate the map of population
-    // // frequencies
-    // // printMessage("Parsing population frequencies for chromosome " + chr +
-    // // "...");
-    // std::cout << "Parsing population frequencies for chromosome " << chr << "..." << std::endl;
-    // // const int line_size = 256;
-    // // char line[line_size];
+    // // Read the population frequencies for the region
+    // std::cout << "[TEST] Reading population frequencies for region " << region_str << " (AF_key = " << AF_key << ")..." << std::endl;
     // int print_count = 0;
-    // // while (fgets(line, line_size, fp) != NULL)
-    // char line[2048];
-    // while (fgets(line, sizeof(line), fp) != NULL)
+    // int test_count = 0;
+    // while (bcf_itr_next(pfb_file, pfb_iter, pfb_record) >= 0)
     // {
-    //     std::istringstream iss(line);
-    //     // Parse the line
-    //     int pos;
-    //     double pfb;
-    //     // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2)
-    //     if (iss >> pos >> pfb){
-    //         // pos_pfb_map[pos] = pfb;  // Add the position and population
-    //         // frequency to the map
-    //         // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
-
-    //         // Print the first 10 population frequencies
-    //         if (print_count < 10)
+    //     test_count++;
+    //     // Get the chromosome and position
+    //     // std::cout << "Reading record..." << std::endl;
+    //     uint32_t pos = pfb_record->pos + 1;  // 0-based to 1-based
+
+    //     // Skip if not a SNP
+    //     if (!bcf_is_snp(pfb_record))
+    //     {
+    //         std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl;
+    //         continue;
+    //     }
+
+    //     // Get the population frequency for the SNP
+    //     float *pfb_f = NULL;
+    //     int count = 0;
+    //     int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+    //     if (pfb_status < 0 || count == 0)
+    //     {
+    //         std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
+    //         continue;
+    //     }
+    //     double pfb = (double) pfb_f[0];
+    //     free(pfb_f);
+
+    //     // Continue if the population frequency is outside the threshold
+    //     if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+    //     {
+    //         continue;
+    //     }
+
+    //     // Add the population frequency to the SNP data
+    //     // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
+    //     if (snp_pfb_map.find(pos) == snp_pfb_map.end())
+    //     {
+    //         snp_pfb_map[pos] = pfb;
+    //     } else {
+    //         // Keep the larger population frequency
+    //         if (pfb > snp_pfb_map[pos])
     //         {
-    //             std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-    //             // printMessage("Population frequency for " + chr + ":" +
-    //             // std::to_string(pos) + " = " + std::to_string(pfb));
-    //             this->snp_pfb_map[pos] = pfb;
-    //             print_count++;
+    //             snp_pfb_map[pos] = pfb;
     //         }
     //     }
+    //     if (print_count < 10)
+    //     {
+    //         std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+    //         print_count++;
+    //     }
     // }
-    // pclose(fp);
-    std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl;
-    // printMessage("Finished parsing population frequencies for chromosome " + chr + "...");
+    // std::cout << "Finished reading population frequencies for region " << region_str << std::endl;
+    // std::cout << "Test count: " << test_count << std::endl;
 }
 
 void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood)
@@ -1088,11 +1649,9 @@ void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, doubl
     snp_data.is_snp.emplace_back(is_snp);
 }
 
-std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end)
+void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb)
 {
-    // Lock the mutex for reading SNP information
-    // std::lock_guard<std::mutex> lock(this->snp_info_mtx);
-
+    std::string snp_chr = chr;
     chr = removeChrPrefix(chr);
 
     // Create an ordered map of SNP positions to BAF and PFB values
@@ -1100,35 +1659,72 @@ std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> CNVC
 
     // Query SNPs within a range (start, end) and return their BAF and PFB
     // values as separate vectors
-    std::vector<double> bafs;
-    std::vector<double> pfbs;
-    std::vector<uint32_t> pos;
+    // std::vector<double> bafs;
+    // std::vector<double> pfbs;
+    // std::vector<uint32_t> pos;
     double pfb_default = 0.5;
 
+    // Read the SNP data from the VCF file
+    this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf);
+
     // Query the SNPs within the range and return their BAFs and corresponding
     // positions
-    auto snp_start = this->snp_baf_keys.lower_bound(start);
-    auto snp_end = this->snp_baf_keys.upper_bound(end);
+    // auto snp_start = this->snp_baf_keys.lower_bound(start);
+    // auto snp_end = this->snp_baf_keys.upper_bound(end);
+    // if (snp_start == this->snp_baf_keys.end())
+    // {
+    //     // return std::make_tuple(pos, bafs, pfbs);
+    //     return;
+    // }
 
-    if (snp_start == this->snp_baf_keys.end())
-    {
-        return std::make_tuple(pos, bafs, pfbs);
-    }
+    // Query the population frequencies for the SNPs
+    std::unordered_map<uint32_t, double> pfb_map;
+    this->readSNPPopulationFrequencies(chr, start, end, pfb_map);
 
-    for (auto it = snp_start; it != snp_end; it++)
+    // Filter out the SNP population frequencies that are not in the SNP
+    // position set
+    // std::unordered_map<uint32_t, double> snp_pfb;
+    for (auto& pos : snp_pos)
     {
-        uint32_t snp_pos = *it;
-        pos.push_back(snp_pos);
-        bafs.push_back(this->snp_baf_map[snp_pos]);
-
-        // Get the PFB value for the SNP
-        if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end())
+        if (pfb_map.find(pos) != pfb_map.end())
         {
-            pfbs.push_back(this->snp_pfb_map[snp_pos]);
+            snp_pfb[pos] = pfb_map[pos];
         } else {
-            pfbs.push_back(pfb_default);
+            snp_pfb[pos] = pfb_default;
         }
     }
+
+    // // Get the PFB values for the SNPs from the keys
+    // // Create the PFB vector using the SNP positions (loop through snp_pos,
+    // // query the pfb_map, and push the value to the vector)
+    // for (size_t i = 0; i < snp_pos.size(); i++)
+    // {
+    //     uint32_t snp_pos = snp_pos[i];
+    //     double pfb = pfb_default;
+    //     if (pfb_map.find(snp_pos) != pfb_map.end())
+    //     {
+    //         pfb = pfb_map[snp_pos];
+    //     } else {
+    //         pfb = pfb_default;
+    //     }
+    //     snp_pfb.push_back(pfb);
+    // }
+
+    // // Get the PFB values for the SNPs from the keys
+    // for (auto it = snp_start; it != snp_end; it++)
+    // {
+    //     uint32_t snp_pos = *it;
+    //     pos.push_back(snp_pos);
+    //     bafs.push_back(this->snp_baf_map[snp_pos]);
+
+    //     // Get the PFB value for the SNP
+    //     if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end())
+    //     {
+    //         pfbs.push_back(this->snp_pfb_map[snp_pos]);
+    //     } else {
+    //         pfbs.push_back(pfb_default);
+    //     }
+    // }
     // auto& baf_bst = this->snp_baf_map[chr];
     // auto baf_start = baf_bst.lower_bound({start, 0.0});
     // auto baf_end = baf_bst.upper_bound({end, 0.0});
@@ -1147,5 +1743,5 @@ std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> CNVC
     //     }
     // }
     
-    return std::make_tuple(pos, bafs, pfbs);
+    // return std::make_tuple(pos, bafs, pfbs);
 }
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index b2c5827f..327f3fd2 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -171,12 +171,12 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
         uint32_t current_length = current_merge.end - current_merge.start;
         uint32_t next_length = next.end - next.start;
 
-        // Merge the SV calls if the overlap is at least 50% of the current or
-        // next SV call
-        double overlap_pct_current = static_cast<double>(overlap_length) / current_length;
-        double overlap_pct_next = static_cast<double>(overlap_length) / next_length;
+        // Merge the SV calls if the overlap is > 0
+        //double overlap_pct_current = static_cast<double>(overlap_length) / current_length;
+        //double overlap_pct_next = static_cast<double>(overlap_length) / next_length;
 
-        if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) {
+        //if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) {
+        if (overlap_length > 0) {
             // Merge the SV calls based on the likelihood
             if (next.hmm_likelihood != 0.0) {
                 // Update the likelihood if the next SV call has a likelihood
@@ -197,7 +197,7 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
                 // }
             }
         } else {
-            // No overlap: Save the SV and continue
+            // No overlap: Save the previous SV and continue
             merged_sv_calls.push_back(current_merge);
             current_merge = next;
         }

From 53473af46d40f052b31ad263682b810223c183e4 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 14:11:42 -0500
Subject: [PATCH 022/134] Fix reading AFs

---
 src/cnv_caller.cpp | 155 +++++++++++++++++++++++----------------------
 1 file changed, 81 insertions(+), 74 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 4fcc1604..f2602d88 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -700,14 +700,12 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         throw std::runtime_error("ERROR: Could not initialize SNP reader.");
     }
 
-    // Read the SNP header
-    htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r");
-    bcf_hdr_t *snp_header = bcf_hdr_read(snp_file);
-    if (!snp_header)
+    // Set the region
+    std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
     {
         bcf_sr_destroy(snp_reader);
-        bcf_close(snp_file);
-        throw std::runtime_error("ERROR: Could not initialize SNP header.");
+        throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str);
     }
 
     // Set multi-threading
@@ -721,30 +719,48 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
     {
         bcf_sr_destroy(snp_reader);
-        bcf_hdr_destroy(snp_header);
-        bcf_close(snp_file);
         throw std::runtime_error("ERROR: Could not add SNP file to reader: " + snp_filepath);
     }
 
-    // Set the region
-    std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
+    // Get the header
+    bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0);
+    if (!snp_header)
     {
         bcf_sr_destroy(snp_reader);
-        bcf_hdr_destroy(snp_header);
-        bcf_close(snp_file);
-        throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str);
+        throw std::runtime_error("ERROR: Could not get header for SNP reader.");
     }
 
     std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
     int print_count = 0;
-    while (bcf_sr_next_line(snp_reader) >= 0)
+    int record_count = 0;
+    int duplicate_count = 0;
+    uint32_t last_pos = 0;
+    while (bcf_sr_next_line(snp_reader) > 0)
     {
+        if (!bcf_sr_has_line(snp_reader, 0))
+        {
+            continue;
+        }
         bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0);
         if (snp_record)
         {
+            record_count++;
             uint32_t pos = (uint32_t)snp_record->pos + 1;
 
+            // Skip if 3 or more duplicate positions found
+            // if (pos == last_pos)
+            // {
+            //     duplicate_count++;
+            //     if (duplicate_count >= 10)
+            //     {
+            //         std::cerr << "ERROR: 3 or more duplicate positions found in SNP file at " << chr << ":" << pos << std::endl;
+            //         break;
+            //     }
+            // } else {
+            //     duplicate_count = 0;
+            // }
+            // last_pos = pos;
+
             // Skip if not a SNP
             if (!bcf_is_snp(snp_record))
             {
@@ -755,7 +771,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             float qual = snp_record->qual;
             if (bcf_float_is_missing(qual))
             {
-                std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl;
+                // std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl;
             }
             // Skip if quality is less than 30
             if (qual <= 30)
@@ -766,33 +782,28 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             // Extract DP from FORMAT field
             int32_t *dp = 0;
             int dp_count = 0;
-            // int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP",
-            // &dp, &dp_count);
             int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count);
+            bool dp_skip = false;
             if (dp_ret < 0)
             {
-                std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl;
+                // std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl;
             } else {
                 // Skip if depth is not greater than 10
                 for (int i = 0; i < dp_count; i++)
                 {
                     if (dp[i] <= 10)
                     {
-                        continue;
+                        dp_skip = true;
+                        break;
                     }
                 }
-                // if (dp <= 10)
-                // {
-                //     continue;
-                // }
             }
             free(dp);
-            // // Skip if depth is not greater than 10
-            // if (dp <= 10)
-            // {
-            //     continue;
-            // }
-            
+            if (dp_skip)
+            {
+                continue;
+            }
+
             // Skip if the SNP does not pass the filter
             if (bcf_has_filter(snp_header, snp_record, const_cast<char*>("PASS")) != 1)
             {
@@ -800,24 +811,20 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             }
 
             // Extract AD from FORMAT field
-            // int32_t ad[2] = {0, 0};
             int32_t *ad = 0;
             int ad_count = 0;
-            // int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD",
-            // &ad, &ad_count);
             int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
 
             // Skip if AD value is missing
             if (ad_ret < 0)
             {
-                std::cerr << "ERROR: AD value is missing for SNP at " << chr << ":" << pos << std::endl;
-                continue;
+                // std::cerr << "ERROR: AD value is missing for SNP at " << chr
+                // << ":" << pos << std::endl;
+                throw std::runtime_error("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos));
             }
 
             // Calculate the B-allele frequency (BAF)
             double baf = 0.0;
-            // double ad0 = (double) ad[0];
-            // double ad1 = (double) ad[1];
             double ad0 = 0.0;
             double ad1 = 0.0;
             for (int i = 0; i < ad_count; i++)
@@ -830,34 +837,29 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 }
             }
             free(ad);
-            try {
-                // std::cout << "AD[0]: " << ad0 << ", AD[1]: " << ad1 << std::endl;
-                baf = ad1 / (ad0 + ad1);
-                // std::cout << "AD[0]: " << ad[0] << ", AD[1]: " << ad[1] << std::endl;
-                // baf = (double) ad[1] / (double) (ad[0] + ad[1]);
-            } catch (const std::exception& e) {
-                std::cerr << "ERROR: Could not calculate BAF for SNP at " << chr << ":" << pos << std::endl;
-                continue;
-            }
+            baf = ad1 / (ad0 + ad1);
 
             // Insert the SNP position and BAF into the maps
             snp_pos.insert(pos);
             snp_baf[pos] = baf;
 
             // Print the SNP position and BAF
-            if (print_count < 10)
-            {
-                std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl;
-                print_count++;
-            }
+            // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl;
+            // print_count++;
+            // if (print_count < 10)
+            // {
+            //     std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl;
+            //     print_count++;
+            // }
         }
     }
 
+    std::cout << "[TEST] SNP record count: " << record_count << std::endl;
+
     // Clean up
     std::cout << "Cleaning up SNP reader..." << std::endl;
     bcf_sr_destroy(snp_reader);
-    bcf_hdr_destroy(snp_header);
-    bcf_close(snp_file);
+    std::cout << "Finished reading SNP allele frequencies for chromosome " << chr << std::endl;
 
     // std::cout << "Opening SNP file: " << snp_filepath << std::endl;
     // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r");
@@ -1290,6 +1292,14 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath);
     }
 
+    // Set the region for the synced reader
+    std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0)
+    {
+        bcf_sr_destroy(pfb_reader);
+        throw std::runtime_error("ERROR: Could not set region for synced reader: " + region_str);
+    }
+
     // Set multi-threading
     bcf_sr_set_threads(pfb_reader, thread_count);
 
@@ -1303,34 +1313,30 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         throw std::runtime_error("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath);
     }
 
-    // Set the region for the synced reader
-    std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0)
+    // Get the header
+    bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
+    if (!pfb_header)
     {
         bcf_sr_destroy(pfb_reader);
-        throw std::runtime_error("ERROR: Could not set region for synced reader: " + region_str);
-    } else {
-        std::cout << "Successfully set region for synced reader: " << region_str << std::endl;
+        throw std::runtime_error("ERROR: Could not get header for population frequency file: " + pfb_filepath);
     }
 
-    // Iterate through the records in the population frequency file
-    // bcf1_t *pfb_record = bcf_init();
-    // if (!pfb_record)
-    // {
-    //     bcf_sr_destroy(pfb_reader);
-    //     throw std::runtime_error("ERROR: Could not initialize BCF record for population frequency file: " + pfb_filepath);
-    // }
-
     int test_count = 0;
-    std::cout << "Iterating through records..." << std::endl;
-    while (bcf_sr_next_line(pfb_reader) >= 0)
+    int record_count = 0;
+    std::cout << "Iterating through records for region " << region_str << "..." << std::endl;
+    while (bcf_sr_next_line(pfb_reader) > 0)
     {
+        if (!bcf_sr_has_line(pfb_reader, 0))
+        {
+            continue;
+        }
         // std::cout << "Reading record..." << std::endl;
         // pfb_record = bcf_sr_get_line(pfb_reader, 0);
         bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
         // Do something with the record
         if (pfb_record)
         {
+            record_count++;
             // Skip if not a SNP
             if (!bcf_is_snp(pfb_record))
             {
@@ -1370,11 +1376,11 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
                 }
             }
 
-            if (test_count < 10)
-            {
-                std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-                test_count++;
-            }
+            // if (test_count < 10)
+            // {
+            //     std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
+            //     test_count++;
+            // }
         }
             // std::cout << "Record: " << pfb_record->pos << std::endl;
             // std::cout << "QUAL: " << pfb_record->qual << std::endl;
@@ -1428,6 +1434,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     }
 
     // std::cout << "Test count: " << test_count << std::endl;
+    std::cout << "Record count: " << record_count << std::endl;
 
     // Clean up
     // bcf_destroy(pfb_record);

From 4b83e62b514d6b1bb5dcfc179a571027b92105fa Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 15:50:46 -0500
Subject: [PATCH 023/134] Fix mean chr cov

---
 python/cnv_plots.py |  2 +-
 src/cnv_caller.cpp  | 45 +++++++++++++++++++++++++++++++--------------
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/python/cnv_plots.py b/python/cnv_plots.py
index ec9ba842..67c831c6 100644
--- a/python/cnv_plots.py
+++ b/python/cnv_plots.py
@@ -76,7 +76,7 @@ def run(cnv_data_file, output_html):
             line = f.readline().strip()
             if '=' in line:
                 key, value = line.split("=")
-                log.info("Metadata: %s=%s", key, value)
+                # log.info("Metadata: %s=%s", key, value)
                 value = value.strip()
                 metadata[key] = value
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index f2602d88..0c6c432a 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -74,6 +74,7 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     // window, then calculate the log2 ratio for each window
     for (uint32_t i = start_pos; i <= end_pos; i += window_size)
     {
+        // std::cout << "Querying SNP region for " << chr << ":" << i << "-" << std::min(i + window_size - 1, end_pos) << std::endl;
         // Run a sliding non-overlapping window of size window_size across
         // the SV region and calculate the log2 ratio for each window
         uint32_t window_start = i;
@@ -98,6 +99,8 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
         // after the SNP, and continue until the end of the window
         // (If there are no SNPs in the window, then use the default BAF and
         // PFB values, and the coverage log2 ratio)
+
+        // If no SNPs, then calculate the log2 ratio for the window
         if (snp_window_pos.size() == 0)
         {
             double window_log2_ratio = calculateLog2Ratio(window_start, window_end, pos_depth_map, mean_chr_cov);
@@ -117,6 +120,20 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
                 uint32_t bin_start = snp_window_pos[j] - window_size / 2;
                 uint32_t bin_end = snp_window_pos[j] + window_size / 2;
 
+                // Trim the bin start and end to 1/2 the distance from the
+                // neighboring SNPs (or the start/end of the window)
+                if (j > 0)
+                {
+                    bin_start = std::max(bin_start, (snp_window_pos[j-1] + snp_window_pos[j]) / 2);
+                }
+
+                if (j < (int) snp_window_pos.size() - 1)
+                {
+                    bin_end = std::min(bin_end, (snp_window_pos[j] + snp_window_pos[j+1]) / 2);
+                }
+                // std::cout << "bin_start: " << bin_start << std::endl;
+                // std::cout << "bin_end: " << bin_end << std::endl;
+
                 // Calculate the log2 ratio for the SNP bin
                 double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov);
                 this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
@@ -582,6 +599,11 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
     std::unordered_map<uint32_t, int> chr_pos_depth_map;
     while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
     {
+        // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
+        if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP)
+        {
+            continue;
+        }
         
         // Parse the CIGAR string to get the depth (match, sequence match, and
         // mismatch)
@@ -622,11 +644,18 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
     bam_hdr_destroy(bam_header);
     sam_close(bam_file);
 
-    // Calculate the mean chromosome coverage
+    // Calculate the mean chromosome coverage for positions with non-zero depth
     uint64_t cum_depth = 0;
     uint32_t pos_count = 0;
     for (auto& pos_depth : chr_pos_depth_map)
     {
+        // if (pos_depth.second > 0)
+        // {
+        //     cum_depth += pos_depth.second;
+        //     pos_count++;
+        // } else {
+        //     std::cout << "Zero depth at position " << pos_depth.first << std::endl;
+        // }
         cum_depth += pos_depth.second;
         pos_count++;
     }
@@ -730,7 +759,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         throw std::runtime_error("ERROR: Could not get header for SNP reader.");
     }
 
-    std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
+    // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
     int print_count = 0;
     int record_count = 0;
     int duplicate_count = 0;
@@ -854,12 +883,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         }
     }
 
-    std::cout << "[TEST] SNP record count: " << record_count << std::endl;
-
     // Clean up
-    std::cout << "Cleaning up SNP reader..." << std::endl;
     bcf_sr_destroy(snp_reader);
-    std::cout << "Finished reading SNP allele frequencies for chromosome " << chr << std::endl;
 
     // std::cout << "Opening SNP file: " << snp_filepath << std::endl;
     // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r");
@@ -1281,8 +1306,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     // Remove the 'chr' prefix from the chromosome name for SNP data. All
     // SNP data in this program does not use the 'chr' prefix
     std::string chr_no_prefix = removeChrPrefix(chr);
-
-    std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
     int thread_count = this->input_data.getThreadCount();
 
     // Initialize the synced reader
@@ -1323,7 +1346,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
 
     int test_count = 0;
     int record_count = 0;
-    std::cout << "Iterating through records for region " << region_str << "..." << std::endl;
     while (bcf_sr_next_line(pfb_reader) > 0)
     {
         if (!bcf_sr_has_line(pfb_reader, 0))
@@ -1433,14 +1455,9 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         std::cerr << "ERROR: " <<bcf_sr_strerror(pfb_reader->errnum) << std::endl;
     }
 
-    // std::cout << "Test count: " << test_count << std::endl;
-    std::cout << "Record count: " << record_count << std::endl;
-
     // Clean up
     // bcf_destroy(pfb_record);
     bcf_sr_destroy(pfb_reader);
-    std::cout << "Finished reading population frequencies for SV region" << std::endl;
-
 
     // // Open the population frequency file
     // std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;

From 302598e806fcb6001b2448348555a18b6e8159e0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 15:57:52 -0500
Subject: [PATCH 024/134] Update gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8284049d..f2253893 100644
--- a/.gitignore
+++ b/.gitignore
@@ -75,6 +75,7 @@ data/gnomadv3_filepaths.txt
 data/gnomadv4_filepaths.txt
 data/gnomadv4_filepaths_ssd.txt
 data/gnomadv4_hg19_filepaths.txt
+data/gnomadv4_hg19_filepaths_ssd.txt
 
 # Training data
 data/sv_scoring_dataset/

From 9d3164c03aa758221f1fcb78adffe7ee17360cf8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 16:03:29 -0500
Subject: [PATCH 025/134] Update test

---
 tests/test_general.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test_general.py b/tests/test_general.py
index 1d81fe96..689dda55 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 32
+        assert len(f.readlines()) == 30
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.
@@ -78,11 +78,11 @@ def test_run():
                 fields = line.strip().split('\t')
                 assert fields[0] == "21"
                 assert fields[1] == "14458394"
-                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1341;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000"
+                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS;CLIPSUP=0;REPTYPE=NA;HMM=0.000000"
             elif i == header_line + 2:
                 fields = line.strip().split('\t')
                 assert fields[0] == "21"
-                assert fields[1] == "14458394"
-                assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000"
+                assert fields[1] == "14469910"
+                assert fields[7] == "END=14470078;SVTYPE=DEL;SVLEN=-168;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARDEL;CLIPSUP=0;REPTYPE=NA;HMM=0.000000"
                 break
             
\ No newline at end of file

From 2077f2636cc4d463bed44b1b1d15b566da073286 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 22:57:14 -0500
Subject: [PATCH 026/134] Fix trimming

---
 include/sv_caller.h |   2 +
 src/sv_caller.cpp   | 621 ++++++++++----------------------------------
 2 files changed, 138 insertions(+), 485 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 184248d7..cc9f2630 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -56,6 +56,8 @@ class SVCaller {
 
         void saveToVCF(const std::unordered_map<std::string, std::set<SVCall>>& sv_calls);
 
+        void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment);
+
     public:
         explicit SVCaller(InputData& input_data);
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 228a93a2..72d61b14 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -463,6 +463,8 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
             std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
             PrimaryMap& primary_map = std::get<1>(region_data);
             SuppMap& supp_map = std::get<2>(region_data);
+            std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
+            mergeSVs(subregion_sv_calls);
             // SVData& subregion_sv_calls = std::get<0>(region_data);
             // PrimaryMap& primary_map = std::get<1>(region_data);
             // SuppMap& supp_map = std::get<2>(region_data);
@@ -550,515 +552,107 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         int32_t primary_query_end = std::get<5>(primary_alignment);
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
         // bool primary_strand = std::get<7>(primary_alignment);
+
+        // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
             continue;
         }
 
-        // // Resolve overlaps between the primary and supplementary query
-        // // sequences
-        // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-        //     std::string supp_chr = std::get<0>(*it);
-        //     // int32_t supp_start = std::get<1>(*it);
-        //     // int32_t supp_end = std::get<2>(*it);
-        //     int32_t supp_query_start = std::get<4>(*it);
-        //     int32_t supp_query_end = std::get<5>(*it);
-        //     std::unordered_map<int, int> supp_match_map = std::get<6>(*it);
-        //     // bool supp_strand = std::get<7>(*it);
-
-        //     // Resolve overlaps between the primary and supplementary query
-        //     // sequences
-        //     if (primary_query_start < supp_query_end && primary_query_end > supp_query_start || supp_query_start < primary_query_end && supp_query_end > primary_query_start) {
-
-        //         // Calculate the mismatch rate for each alignment at the overlap
-        //         double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1);
-        //         double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1);
-        //         // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
-        //         // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
-
-        //         // Trim the overlap from the alignment with the higher mismatch
-        //         // rate
-        //         if (primary_mismatch_rate > supp_mismatch_rate) {
-        //             if (overlap_start == primary_query_start) {
-        //                 primary_start += overlap_length;
-        //             } else if (overlap_end == primary_query_end) {
-        //                 primary_end -= overlap_length;
-        //             }
-
-        //         } else {
-        //             if (overlap_start == supp_query_start) {
-        //                 // supp_start += overlap_length;
-        //                 // Update the value in the supp map
-        //                 std::get<1>(*it) += overlap_length;
-        //             } else if (overlap_end == supp_query_end) {
-        //                 // supp_end -= overlap_length;
-        //                 // Update the value in the supp map
-        //                 std::get<2>(*it) -= overlap_length;
-        //             }
-        //         }
-        //     }
-        // }
-
-        // Remove supplementary alignments that are not on the same chromosome
-        // as the primary alignment
-        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();) {
-            if (std::get<0>(*it) != primary_chr) {
-                it = supp_map[qname].erase(it);
-            } else {
-                ++it;
-            }
-        }
-
-        // Run copy number variant predictions on the primary alignment
-        SVType primary_type = SVType::UNKNOWN;
-        double primary_lh = std::numeric_limits<double>::lowest();
-        int32_t primary_lh_t = 0;
-        if (primary_end - primary_start >= min_cnv_length) {
-            SVCandidate sv_candidate(primary_start+1, primary_end+1, ".");
-            // std::cout << "TEST5" << std::endl;
-            std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-            primary_lh = std::get<0>(result);
-            // primary_log_likelihood /= (double)(primary_end - primary_start);  // Normalize the log likelihood by the length
-            primary_type = std::get<1>(result);
-        }
-
-        // Loop through the supplementary alignments, find the largest
-        // supplementary alignment, and the closest non-overlapping
-        // supplementary alignment to the primary alignment
+        // Find the largest supplementary alignment, and also identify inversions
         AlignmentData largest_supp_alignment = supp_map[qname][0];
-        AlignmentData closest_supp_alignment = supp_map[qname][0];
         int32_t largest_supp_length = 0;
-        int32_t closest_supp_distance = std::numeric_limits<int32_t>::max();
-        int32_t closest_supp_length = 0;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            // const auto& supp_chr = std::get<0>(*it);
+            const auto& supp_chr = std::get<0>(*it);
+            if (primary_chr != supp_chr) {
+                continue;  // Skip supplementary alignments on different chromosomes
+            }
             int32_t supp_start = std::get<1>(*it);
             int32_t supp_end = std::get<2>(*it);
             int32_t supp_length = supp_end - supp_start + 1;
-            int32_t supp_distance = std::numeric_limits<int32_t>::max();
-            if (supp_start > primary_end) {
-                supp_distance = supp_start - primary_end;
-            } else if (supp_end < primary_start) {
-                supp_distance = primary_start - supp_end;
-            }
             if (supp_length > largest_supp_length) {
                 largest_supp_length = supp_length;
                 largest_supp_alignment = *it;
             }
-            if (supp_distance < closest_supp_distance) {
-                closest_supp_length = supp_length;
-                closest_supp_alignment = *it;
-                closest_supp_distance = supp_distance;
-            }
-        }
-
-        // Run copy number variant predictions on the largest supplementary
-        // alignment
-        double largest_supp_lh = std::numeric_limits<double>::lowest();
-        SVType largest_supp_type = SVType::UNKNOWN;
-        int largest_supp_lh_t = 0;
-        if (largest_supp_length >= min_cnv_length) {
-            SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, ".");
-            // std::cout << "TEST1" << std::endl;
-            std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-            largest_supp_lh = std::get<0>(result);
-            // largest_supp_log_likelihood /= (double)largest_supp_length;  // Normalize the log likelihood by the length
-            largest_supp_type = std::get<1>(result);
-        }
-
-        // Run copy number variant predictions on the closest non-overlapping
-        // supplementary alignment (if not the same as the largest)
-        double closest_supp_lh = std::numeric_limits<double>::lowest();
-        SVType closest_supp_type = SVType::UNKNOWN;
-        int closest_supp_lh_t = 0;
-        if (largest_supp_alignment != closest_supp_alignment) {
-            if (closest_supp_length >= min_cnv_length) {
-                SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, ".");
-                // std::cout << "TEST2" << std::endl;
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                closest_supp_lh = std::get<0>(result);
-                // closest_supp_log_likelihood /= (double)closest_supp_length;  // Normalize the log likelihood by the length
-                closest_supp_type = std::get<1>(result);
-                int32_t closest_supp_start = std::get<1>(closest_supp_alignment);
-                int32_t closest_supp_end = std::get<2>(closest_supp_alignment);
-            }
-        }
-
-        // Define constants representing read scenarios used for SV detection
-        const int NOCALL = -1;  // Default
-        const int PRIM_SUPP_BD = 0;  // Primary and supplementary boundary
-        const int PRIM_SUPP_GAP = 1;  // Primary and supplementary gap
-        const int SUPP_PRIM_BD = 2;  // Supplementary and primary boundary
-        const int SUPP_PRIM_GAP = 3;  // Supplementary and primary gap
-
-        // Loop through all the supplementary alignments and find the highest
-        // likelihood prediction
-        double best_split_aln_lh = std::numeric_limits<double>::lowest();
-        double best_split_aln_lh_norm = std::numeric_limits<double>::lowest();
-        // int best_split_aln_length = 0;
-        SVType best_supp_type = SVType::UNKNOWN;
-        std::pair<int32_t, int32_t> best_supp_candidate;
-        AlignmentData& best_split_alignment = supp_map[qname][0];
-        int best_scenario = NOCALL;
-        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            int32_t supp_start = std::get<1>(*it);
-            int32_t supp_end = std::get<2>(*it);
-            bool primary_before_supp = primary_start < supp_start;
-
-            // Create the SV candidate as the boundary of the primary and
-            // supplementary alignments
-            SVCandidate split_boundary;
-            SVCandidate split_gap;
-            bool invalid_gap = false;
-            if (primary_before_supp) {
-                split_boundary = SVCandidate(primary_start+1, supp_end+1, ".");
-
-                // Check for an invalid gap (overlap)
-                if (primary_end >= supp_start) {
-                    invalid_gap = true;
-                } else {
-                    split_gap = SVCandidate(primary_end+1, supp_start+1, ".");
-                }
-                // split_gap = SVCandidate(primary_end+1, supp_start+1, ".");
-
-            } else {
-                split_boundary = SVCandidate(supp_start+1, primary_end+1, ".");
-
-                // Check for an invalid gap (overlap)
-                if (supp_end >= primary_start) {
-                    invalid_gap = true;
-                } else {
-                    split_gap = SVCandidate(supp_end+1, primary_start+1, ".");
-                }
-            }
 
-            // Create a vector of the two SV candidates, don't add the gap if
-            // it is an overlap, or if either SV is less than the minimum CNV
-            // length
-            std::vector<SVCandidate> sv_candidates;
-            if (!invalid_gap && std::get<1>(split_gap) - std::get<0>(split_gap) >= min_cnv_length) {
-                sv_candidates.push_back(split_gap);
-            }
-            if (std::get<1>(split_boundary) - std::get<0>(split_boundary) >= min_cnv_length) {
-                sv_candidates.push_back(split_boundary);
-            }
-
-            // Continue if no SV candidates
-            if (sv_candidates.size() == 0) {
-                continue;
-            }
-
-            // Run copy number variant predictions on both, and keep the
-            // prediction with the highest normalized log likelihood
-            double chosen_lh_norm = std::numeric_limits<double>::lowest();
-            SVType chosen_type = SVType::UNKNOWN;
-            std::pair<int32_t, int32_t> chosen_candidate;
-            std::string chosen_candidate_str = "BOUNDARY";
-            int split_scenario = NOCALL;
-            for (const auto& sv_candidate : sv_candidates) {
-            	// std::cout << "TEST3: primary = " << primary_start << ", " << primary_end << " supp = " << supp_start << ", " << supp_end << std::endl;
-            	// std::cout << "Position: " << std::get<0>(sv_candidate) << ", " << std::get<1>(sv_candidate) << std::endl;
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                double current_lh = std::get<0>(result);
-                SVType current_type = std::get<1>(result);
-
-                // Normalize the log likelihood by the state sequence length
-                double current_lh_norm = current_lh;// / (double)T;
-                // if (sv_candidate == split_boundary) {
-                //     std::cout << "Boundary candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl;
-                // } else if (sv_candidate == split_gap) {
-                //     std::cout << "Gap candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl;
-                // }
-
-                // Update the current SV candidate if the likelihood is higher
-                if (current_type != SVType::UNKNOWN && current_lh_norm > chosen_lh_norm) {
-                    chosen_lh_norm = current_lh_norm;
-                    chosen_type = current_type;
-                    chosen_candidate = std::make_pair(std::get<0>(sv_candidate), std::get<1>(sv_candidate));
-
-                    // Update the candidate string
-                    if (sv_candidate == split_boundary) {
-                        chosen_candidate_str = "BOUNDARY";
-                        if (primary_before_supp) {
-                            split_scenario = PRIM_SUPP_BD;
-                        } else {
-                            split_scenario = SUPP_PRIM_BD;
-                        }
-                    } else if (sv_candidate == split_gap) {
-                        chosen_candidate_str = "GAP";
-                        if (primary_before_supp) {
-                            split_scenario = PRIM_SUPP_GAP;
-                        } else {
-                            split_scenario = SUPP_PRIM_GAP;
-                        }
+            // Inversion detection
+            bool is_opposite_strand = std::get<7>(primary_alignment) != std::get<7>(*it);
+            if (is_opposite_strand) {
+                if (supp_length >= min_cnv_length) {
+                    SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                    double supp_lh = std::get<0>(result);
+                    SVType supp_type = std::get<1>(result);
+                    if (supp_type == SVType::NEUTRAL) {
+                        addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "HMM", "./.", supp_lh);
+                        sv_count++;
+                    } else if (supp_type == SVType::DUP) {
+                        addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INVDUP", ".", "HMM", "./.", supp_lh);
+                        sv_count++;
                     }
-                    // std::cout << "Updated candidate: " << chosen_candidate_str << " with likelihood: " << current_lh_norm << std::endl;
-                } else if (current_type == SVType::UNKNOWN) {
-                    // std::cerr << "ERROR: Unknown SV type" << std::endl;
-                    // exit(1);
-                }
-            }
-
-            // std::cout << "Chosen candidate: " << chosen_candidate_str << std::endl;
-
-            // Continue if unknown SV type
-            if (chosen_type == SVType::UNKNOWN) {
-                // std::cerr << "ERROR: Unknown SV type" << std::endl;
-                continue;
-            }
-
-            // If opposite strand, set the type to INV or INV_DUP
-            bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment);
-            if (!same_strand) {
-                if (chosen_type == SVType::NEUTRAL) {
-                    chosen_type = SVType::INV;
-                } else if (chosen_type == SVType::DUP) {
-                    chosen_type = SVType::INV_DUP;
+                } else {
+                    // Add the inversion without running copy number predictions
+                    // (too small for predictions)
+                    addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "REV", "./.", 0.0);
+                    sv_count++;
                 }
             }
-
-            if (chosen_lh_norm > best_split_aln_lh_norm) {
-                // best_supp_log_likelihood = supp_likelihood;
-                // best_supp_log_likelihood /= (double)(sv_end - sv_start);  //
-                // Normalize the log likelihood by the length
-                // best_split_aln_lh = split_aln_lh;
-                best_split_aln_lh_norm = chosen_lh_norm;
-                // best_split_aln_length = split_aln_length;
-                best_supp_type = chosen_type;
-                best_supp_candidate = chosen_candidate;
-                best_split_alignment = *it;
-                best_scenario = split_scenario;
-            } else if (chosen_lh_norm <= best_split_aln_lh_norm) {
-                // std::cerr << "ERROR: split_aln_lh_norm is less than or equal to best_split_aln_lh_norm" << std::endl;
-                // exit(1);
-            }
-        }
-
-        // If the likelihood is equal to the lowest value, print an error
-        if (best_split_aln_lh_norm == std::numeric_limits<double>::lowest()) {
-            // std::cerr << "ERROR: best_supp_log_likelihood is the lowest value" << std::endl;
-            // exit(1);
         }
 
-        // Print the likelihoods
-        // std::cout << "Primary log likelihood: " << primary_lh << std::endl;
-        // std::cout << "Largest supplementary log likelihood: " << largest_supp_lh << std::endl;
-        // std::cout << "Closest supplementary log likelihood: " << closest_supp_lh << std::endl;
-        // // std::cout << "Best split alignment log likelihood: " << best_split_aln_lh << std::endl;
-        // std::cout << "Best split alignment log likelihood (normalized): " << best_split_aln_lh_norm << std::endl;
-        // std::cout << "Best scenario: " << best_scenario << std::endl;
-
-        // Add the SV call with the highest likelihood prediction
-        // 
-        // Determine the normalized log likelihood for the combined alignments
-        // by summing and normalizing the log likelihoods by the length
-        double complex_lh = 0.0;
-        double complex_lh_norm = 0.0;
-        if (largest_supp_alignment == closest_supp_alignment) {
-            int32_t complex_t = primary_lh_t + largest_supp_lh_t;
-            complex_lh = primary_lh + largest_supp_lh;
-            complex_lh_norm = complex_lh;// / complex_t;
+        // Trim overlapping alignments
+        int32_t supp_start = std::get<1>(largest_supp_alignment);
+        int32_t supp_end = std::get<2>(largest_supp_alignment);
+        bool primary_before_supp = primary_start < supp_start;
+        trimOverlappingAlignments(primary_alignment, largest_supp_alignment);
+
+        // Create the SV candidate using both alignments
+        supp_start = std::get<1>(largest_supp_alignment);
+        supp_end = std::get<2>(largest_supp_alignment);
+        primary_start = std::get<1>(primary_alignment);
+        primary_end = std::get<2>(primary_alignment);
+        SVCandidate split_boundary;
+        SVCandidate split_gap;
+        bool gap_exists = false;
+        int32_t boundary_left, boundary_right, gap_left, gap_right;
+        if (primary_before_supp) {
+            boundary_left = primary_start+1;
+            boundary_right = supp_end+1;
+            gap_left = primary_end+1;
+            gap_right = supp_start+1;
+            gap_exists = primary_end < supp_start;
         } else {
-            int32_t complex_t = primary_lh_t + largest_supp_lh_t + closest_supp_lh_t;
-            complex_lh = primary_lh + largest_supp_lh + closest_supp_lh;
-            complex_lh_norm = complex_lh;// / complex_t;
+            boundary_left = supp_start+1;
+            boundary_right = primary_end+1;
+            gap_left = supp_end+1;
+            gap_right = primary_start+1;
+            gap_exists = supp_end < primary_start;
         }
-        // std::cout << "Complex log likelihood (normalized): " << complex_lh_norm << std::endl;
-
-        // Compare the best split alignment likelihood to the complex likelihood
-        // if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) {
-        if (best_split_aln_lh_norm > complex_lh_norm) {
-            int32_t sv_start = best_supp_candidate.first;
-            int32_t sv_end = best_supp_candidate.second;
-
-            // Print an error and continue if the end is less than the start
-            if (sv_end < sv_start) {
-                std::cerr << "ERROR: SV end is less than the start: " << sv_start << " - " << sv_end << ", SV type: " << getSVTypeString(best_supp_type) << std::endl;
-                continue;
-            }
-
-            // Resolve overlaps between the primary and supplementary query
-            // sequences for deletions (not usually an issue for other types)
-            if (best_supp_type == SVType::DEL) {
-                AlignmentData& best_supp_alignment = best_split_alignment;
-                int32_t supp_start = std::get<1>(best_supp_alignment);
-                int32_t supp_end = std::get<2>(best_supp_alignment);
-                int32_t supp_query_start = std::get<4>(best_supp_alignment);
-                int32_t supp_query_end = std::get<5>(best_supp_alignment);
-                std::unordered_map<int, int> supp_match_map = std::get<6>(best_supp_alignment);
-
-                // Resolve overlaps between the primary and supplementary query
-                // sequences
-                // int32_t overlap_start = std::max(primary_query_start, supp_query_start);
-                // int32_t overlap_end = std::min(primary_query_end, supp_query_end);
-                // int32_t overlap_length = overlap_end - overlap_start;
-                bool gap_present = primary_query_end < supp_query_start || supp_query_end < primary_query_start;
-                if (!gap_present) {
-                    int32_t overlap_start = std::max(primary_query_start, supp_query_start);
-                    int32_t overlap_end = std::min(primary_query_end, supp_query_end);
-                    int32_t overlap_length = overlap_end - overlap_start;
-
-                    // Calculate the mismatch rate for each alignment at the overlap
-                    double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end);
-                    double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end);
-                    // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl;
-                    // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl;
-
-                    // Trim the overlap from the alignment with the higher mismatch
-                    // rate
-                    if (primary_mismatch_rate > supp_mismatch_rate) {
-
-                        // Handle each scenario
-                        if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) {
-                            // Primary is first, incorporate the overlap into
-                            // the beginning of the deletion
-                            sv_start -= overlap_length;
-                        } else if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) {
-                            // Primary is last, incorporate the overlap into
-                            // the end of the deletion
-                            sv_end += overlap_length;
-                        }
-                    } else {
-
-                        // Handle each scenario
-                        if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) {
-                            // Supplementary is first, incorporate the overlap into
-                            // the beginning of the deletion
-                            sv_start -= overlap_length;
-                        } else if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) {
-                            // Supplementary is last, incorporate the overlap into
-                            // the end of the deletion
-                            sv_end += overlap_length;
-                        }
-                    }
-                }
-            }
-
-            // Add the best split alignment as the SV call
-            // sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".",
-            // "SPLITREAD", "./.", best_split_aln_lh_norm);
-            std::string sv_type_str = getSVTypeString(best_supp_type);
-            sv_count++;
-        } else {
-            // Resolve complex SVs
-
-            // Simplest case: Largest supplementary is also the closest
-            if (largest_supp_alignment == closest_supp_alignment) {
-                // [primary] -- [supp_start] -- [supp_end]
-                // Determine if opposite strands
-                bool opposite_strands = std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment);
-
-                // Determine if the supplementary alignment is an inversion
-                if (opposite_strands) {
-                    if (largest_supp_type == SVType::NEUTRAL) {
-                        largest_supp_type = SVType::INV;
-                    } else if (largest_supp_type == SVType::DUP) {
-                        largest_supp_type = SVType::INV_DUP;
-                    }
-                }
-
-                // Get the SV type strings
-                std::string primary_type_str = getSVTypeString(primary_type);
-                std::string supp_type_str = getSVTypeString(largest_supp_type);
-
-                // Determine the order of the primary and supplementary
-                // alignment to resolve the SV
-                if (std::get<1>(largest_supp_alignment) < primary_start) {
-                    // [supp_start] -- [supp_end] -- [primary]
-                    std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str;
-
-                    // Add the complex SV call
-                    addSVCall(sv_calls, (uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm);
-                    // sv_calls.insert(SVCall{(uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm});
-                    // sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
-                    sv_count++;
-                } else {
-                    // [primary] -- [supp_start] -- [supp_end]
-                    std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str;
-
-                    // Add the complex SV call
-                    addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm);
-                    // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm});
-                    // sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm);
-                    sv_count++;
-                }
+        
+        // Run copy number variant predictions on the boundary
+        split_boundary = SVCandidate(boundary_left, boundary_right, ".");
+        std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary);
+        double bd_lh = std::get<0>(bd_result);
+        SVType bd_type = std::get<1>(bd_result);
+
+        // Run copy number variant predictions on the gap if it exists
+        if (gap_exists) {
+            split_gap = SVCandidate(gap_left, gap_right, ".");
+            std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap);
+            double gap_lh = std::get<0>(gap_result);
+            SVType gap_type = std::get<1>(gap_result);
+
+            // If higher likelihood than the boundary, add the gap as the SV call
+            if (gap_lh > bd_lh) {
+                addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), "GAP", ".", "GAP", "./.", gap_lh);
+                sv_count++;
             } else {
-                // Resolve complex SVs with multiple supplementary alignments
-                // Determine the order of the primary and supplementary
-                // alignments
-                // [primary] -- [closest_supp] -- [largest_supp]
-                // [closest_supp] -- [primary] -- [largest_supp]
-                // [largest_supp] -- [closest_supp] -- [primary]
-                // [largest_supp] -- [primary] -- [closest_supp]
-                // Only consider case 1 for efficiency:
-                if (primary_end < std::get<1>(closest_supp_alignment) && std::get<2>(closest_supp_alignment) < std::get<1>(largest_supp_alignment)) {
-                    // [primary] -- [closest_supp] -- [largest_supp]
-                    // Determine if the closest supplementary alignment is an
-                    // inversion
-                    if (std::get<7>(closest_supp_alignment) != std::get<7>(primary_alignment)) {
-                        if (closest_supp_type == SVType::NEUTRAL) {
-                            closest_supp_type = SVType::INV;
-                        } else if (closest_supp_type == SVType::DUP) {
-                            closest_supp_type = SVType::INV_DUP;
-                        }
-                    }
-
-                    // Run copy number variant predictions on the region between
-                    // the closest supplementary alignment and the largest
-                    // supplementary alignment
-                    SVCandidate sv_candidate(std::get<2>(closest_supp_alignment)+1, std::get<1>(largest_supp_alignment)+1, ".");
-                    // std::cout << "TEST4" << std::endl;
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
-                    // double complex_log_likelihood = std::get<0>(result);
-                    SVType complex_type = std::get<1>(result);
-
-                    // if (std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment)) {
-                    //     if (largest_supp_type == SVType::NEUTRAL) {
-                    //         largest_supp_type = SVType::INV;
-                    //     } else if (largest_supp_type == SVType::DUP) {
-                    //         largest_supp_type = SVType::INV_DUP;
-                    //     }
-                    // }
-
-                    std::string primary_type_str = getSVTypeString(primary_type);
-                    std::string closest_supp_type_str = getSVTypeString(closest_supp_type);
-                    // std::string largest_supp_type_str = getSVTypeString(largest_supp_type);
-                    // std::string complex_sv_type_str = primary_type_str + "+" + closest_supp_type_str;
-
-
-                    // Combine the types if equal and not unknown/neutral
-                    std::cout << "Resolving complex SVs..." << std::endl;
-                    std::string complex_sv_type_str = "";
-                    if (primary_type != SVType::UNKNOWN && primary_type != SVType::NEUTRAL) {
-                        complex_sv_type_str += primary_type_str;
-                        std::cout << "[1] Updated to type: " << complex_sv_type_str << std::endl;
-                    }
-                    if (closest_supp_type != primary_type && closest_supp_type != SVType::UNKNOWN && closest_supp_type != SVType::NEUTRAL) {
-                        if (complex_sv_type_str != "") {
-                            complex_sv_type_str += "+";
-                        }
-                        complex_sv_type_str += closest_supp_type_str;
-                        std::cout << "[2] Updated to type: " << complex_sv_type_str << std::endl;
-                    }
-                    if (complex_type != closest_supp_type && complex_type != primary_type && complex_type != SVType::UNKNOWN && complex_type != SVType::NEUTRAL) {
-                        if (complex_sv_type_str != "") {
-                            complex_sv_type_str += "+";
-                        }
-                        complex_sv_type_str += getSVTypeString(complex_type);
-                        std::cout << "[3] Updated to type: " << complex_sv_type_str << std::endl;
-                    }
-
-                    // Add the complex SV call if not empty
-                    if (complex_sv_type_str != "") {
-                        std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl;
-                        // sv_calls.add(primary_chr, primary_start,
-                        // std::get<2>(largest_supp_alignment), SVType::COMPLEX,
-                        // ".", complex_sv_type_str, "./.", complex_lh_norm);
-                        // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm});
-                        addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm);
-                        sv_count++;
-                    }
-                }                
+                // Add the boundary as the SV call
+                addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh);
+                sv_count++;
             }
+        } else {
+            // Add the boundary as the SV call
+            addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh);
+            sv_count++;
         }
     }
 
@@ -1276,3 +870,60 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
     std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
 }
 
+void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment)
+{
+    // Trim a query-level overlap between the primary and supplementary alignments in place. Reads the query
+    // start/end (tuple fields 4/5), match maps (field 6), and alignment start/end (fields 1/2) of each alignment
+    int32_t primary_query_start = std::get<4>(primary_alignment);
+    int32_t primary_query_end = std::get<5>(primary_alignment);
+    int32_t supp_query_start = std::get<4>(supp_alignment);
+    int32_t supp_query_end = std::get<5>(supp_alignment);
+    std::unordered_map<int, int>& primary_match_map = std::get<6>(primary_alignment);
+    std::unordered_map<int, int>& supp_match_map = std::get<6>(supp_alignment);
+    int32_t primary_alignment_start = std::get<1>(primary_alignment);
+    int32_t primary_alignment_end = std::get<2>(primary_alignment);
+    int32_t supp_alignment_start = std::get<1>(supp_alignment);
+    int32_t supp_alignment_end = std::get<2>(supp_alignment);
+
+    // Determine which alignment comes first in the query, then check if the alignments overlap
+    bool primary_before_supp = primary_query_start < supp_query_start;
+    if (primary_before_supp) {
+        // Primary before supplementary in the query
+        if (primary_query_end >= supp_query_start) {
+            // Calculate the mismatch rates at the overlapping region. NOTE(review): overlap_length is in query positions but is applied to the alignment start/end in both branches - confirm
+            double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, supp_query_start, primary_query_end);
+            double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, supp_query_start, primary_query_end);
+            int32_t overlap_length = primary_query_end - supp_query_start + 1;
+
+            // Trim the alignment with the higher mismatch rate (the supplementary alignment is trimmed on a tie)
+            if (primary_mismatch_rate > supp_mismatch_rate) {
+                // Trim the end of the primary alignment
+                std::get<2>(primary_alignment) = primary_alignment_end - overlap_length;
+                std::cout << "Trimming primary alignment" << std::endl;
+            } else {
+                // Trim the beginning of the supplementary alignment
+                std::get<1>(supp_alignment) = supp_alignment_start + overlap_length;
+                std::cout << "Trimming supplementary alignment" << std::endl;
+            }
+        }
+    } else {
+        // Supplementary before primary in the query
+        if (supp_query_end >= primary_query_start) {
+            // Calculate the mismatch rates at the overlapping region
+            double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end);
+            double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, primary_query_start, supp_query_end);
+            int32_t overlap_length = supp_query_end - primary_query_start + 1;
+
+            // Trim the alignment with the higher mismatch rate (the primary alignment is trimmed on a tie)
+            if (supp_mismatch_rate > primary_mismatch_rate) {
+                // Trim the end of the supplementary alignment
+                std::get<2>(supp_alignment) = supp_alignment_end - overlap_length;
+                std::cout << "Trimming supplementary alignment" << std::endl;
+            } else {
+                // Trim the beginning of the primary alignment
+                std::get<1>(primary_alignment) = primary_alignment_start + overlap_length;
+                std::cout << "Trimming primary alignment" << std::endl;
+            }
+        }
+    }
+}

From 942ce746af3a040f6ca984ed6a0d68b237f5b0ba Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 22:58:36 -0500
Subject: [PATCH 027/134] Reduce debug outputs

---
 src/sv_caller.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 72d61b14..e1ae71d6 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -899,11 +899,9 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
             if (primary_mismatch_rate > supp_mismatch_rate) {
                 // Trim the end of the primary alignment
                 std::get<2>(primary_alignment) = primary_alignment_end - overlap_length;
-                std::cout << "Trimming primary alignment" << std::endl;
             } else {
                 // Trim the beginning of the supplementary alignment
                 std::get<1>(supp_alignment) = supp_alignment_start + overlap_length;
-                std::cout << "Trimming supplementary alignment" << std::endl;
             }
         }
     } else {
@@ -918,11 +916,9 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
             if (supp_mismatch_rate > primary_mismatch_rate) {
                 // Trim the end of the supplementary alignment
                 std::get<2>(supp_alignment) = supp_alignment_end - overlap_length;
-                std::cout << "Trimming supplementary alignment" << std::endl;
             } else {
                 // Trim the beginning of the primary alignment
                 std::get<1>(primary_alignment) = primary_alignment_start + overlap_length;
-                std::cout << "Trimming primary alignment" << std::endl;
             }
         }
     }

From f7a17e263b18550827c42ec7f769b8a8a4b333a9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 25 Nov 2024 23:21:39 -0500
Subject: [PATCH 028/134] Update test

---
 tests/test_general.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_general.py b/tests/test_general.py
index 689dda55..ac7d5d8d 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -64,7 +64,7 @@ def test_run():
 
     # Check that the VCF file has the correct number of lines.
     with open(output_file, 'r', encoding='utf-8') as f:
-        assert len(f.readlines()) == 30
+        assert len(f.readlines()) == 22
 
     # Check that the VCF file has the correct header, and the correct
     # VCF CHROM, POS, and INFO fields in the next 2 lines.
@@ -82,7 +82,7 @@ def test_run():
             elif i == header_line + 2:
                 fields = line.strip().split('\t')
                 assert fields[0] == "21"
-                assert fields[1] == "14469910"
-                assert fields[7] == "END=14470078;SVTYPE=DEL;SVLEN=-168;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARDEL;CLIPSUP=0;REPTYPE=NA;HMM=0.000000"
+                assert fields[1] == "14502888"
+                assert fields[7] == "END=14502953;SVTYPE=BOUNDARY;SVLEN=65;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=BOUNDARY;CLIPSUP=0;REPTYPE=NA;HMM=-4.606171"
                 break
             
\ No newline at end of file

From bb4d2c4c905e1a201ef8bda077adb317493bfa7b Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 26 Nov 2024 13:24:08 -0500
Subject: [PATCH 029/134] fix warnings

---
 src/cnv_caller.cpp |  4 ----
 src/sv_caller.cpp  | 50 ++++++++++++++++++++++++----------------------
 src/sv_object.cpp  | 13 +++++++-----
 3 files changed, 34 insertions(+), 33 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 0c6c432a..3bc3f39e 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -760,10 +760,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
 
     // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
-    int print_count = 0;
     int record_count = 0;
-    int duplicate_count = 0;
-    uint32_t last_pos = 0;
     while (bcf_sr_next_line(snp_reader) > 0)
     {
         if (!bcf_sr_has_line(snp_reader, 0))
@@ -1344,7 +1341,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         throw std::runtime_error("ERROR: Could not get header for population frequency file: " + pfb_filepath);
     }
 
-    int test_count = 0;
     int record_count = 0;
     while (bcf_sr_next_line(pfb_reader) > 0)
     {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index e1ae71d6..3761325d 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -548,8 +548,8 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         std::string primary_chr = std::get<0>(primary_alignment);
         int32_t primary_start = std::get<1>(primary_alignment);
         int32_t primary_end = std::get<2>(primary_alignment);
-        int32_t primary_query_start = std::get<4>(primary_alignment);
-        int32_t primary_query_end = std::get<5>(primary_alignment);
+        // int32_t primary_query_start = std::get<4>(primary_alignment);
+        // int32_t primary_query_end = std::get<5>(primary_alignment);
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
         // bool primary_strand = std::get<7>(primary_alignment);
 
@@ -627,32 +627,34 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
             gap_exists = supp_end < primary_start;
         }
         
-        // Run copy number variant predictions on the boundary
-        split_boundary = SVCandidate(boundary_left, boundary_right, ".");
-        std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary);
-        double bd_lh = std::get<0>(bd_result);
-        SVType bd_type = std::get<1>(bd_result);
-
-        // Run copy number variant predictions on the gap if it exists
-        if (gap_exists) {
-            split_gap = SVCandidate(gap_left, gap_right, ".");
-            std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap);
-            double gap_lh = std::get<0>(gap_result);
-            SVType gap_type = std::get<1>(gap_result);
-
-            // If higher likelihood than the boundary, add the gap as the SV call
-            if (gap_lh > bd_lh) {
-                addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), "GAP", ".", "GAP", "./.", gap_lh);
-                sv_count++;
+        // Run copy number variant predictions on the boundary if large enough
+        if (boundary_right - boundary_left >= min_cnv_length) {
+            split_boundary = SVCandidate(boundary_left, boundary_right, ".");
+            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary);
+            double bd_lh = std::get<0>(bd_result);
+            SVType bd_type = std::get<1>(bd_result);
+
+            // Run copy number variant predictions on the gap if it exists
+            if (gap_exists && gap_right - gap_left >= min_cnv_length) {
+                split_gap = SVCandidate(gap_left, gap_right, ".");
+                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap);
+                double gap_lh = std::get<0>(gap_result);
+                SVType gap_type = std::get<1>(gap_result);
+
+                // If higher likelihood than the boundary, add the gap as the SV call
+                if (gap_lh > bd_lh) {
+                    addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh);
+                    sv_count++;
+                } else {
+                    // Add the boundary as the SV call
+                    addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
+                    sv_count++;
+                }
             } else {
                 // Add the boundary as the SV call
-                addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh);
+                addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
                 sv_count++;
             }
-        } else {
-            // Add the boundary as the SV call
-            addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh);
-            sv_count++;
         }
     }
 
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 327f3fd2..c28d6b21 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -15,9 +15,9 @@ bool SVCall::operator<(const SVCall & other) const
 
 void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
 {
-    // Throw an error if unknown SV type
+    // Ignore unknown SV types
     if (sv_type == "UNKNOWN") {
-        throw std::runtime_error("ERROR: Cannot add unknown SV type");
+        return;
     }
     
     if (start >= end) {
@@ -26,10 +26,13 @@ void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::st
 
     // If the SV call already exists (start and end position), then update all information if the
     // likelihood is higher
-    // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
-    SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1};
+    // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " <<
+    // sv_type << " " << alt_allele << " " << data_type << " " << genotype << "
+    // " << hmm_likelihood << std::endl;
+    sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
+    // SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1};
     
-    sv_calls.insert(new_sv_call);
+    // sv_calls.insert(new_sv_call);
     
     /*
     bool exists = false;

From f241af5c3f7368b9648f542735ba816ddc37525c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 26 Nov 2024 13:38:20 -0500
Subject: [PATCH 030/134] Read hmm once only

---
 include/cnv_caller.h |  6 ++--
 include/sv_caller.h  |  2 +-
 src/cnv_caller.cpp   | 66 ++++----------------------------------------
 src/sv_caller.cpp    | 18 ++++++++----
 4 files changed, 22 insertions(+), 70 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index e8d3d3b8..9f16c5f6 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -52,7 +52,7 @@ class CNVCaller {
         mutable std::mutex sv_candidates_mtx; // SV candidate map mutex
         mutable std::mutex snp_data_mtx;  // SNP data mutex
         mutable std::mutex hmm_mtx;  // HMM mutex
-        CHMM hmm;
+        // CHMM hmm;
         SNPData snp_data;
         SNPInfo snp_info;
         double mean_chr_cov = 0.0;
@@ -116,11 +116,11 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate);
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, CHMM hmm);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
-        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length);
+        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length, CHMM hmm);
 
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
diff --git a/include/sv_caller.h b/include/sv_caller.h
index cc9f2630..dee58dea 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -47,7 +47,7 @@ class SVCaller {
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller);
+        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 3bc3f39e..c6525c1d 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -148,7 +148,7 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     return std::make_pair(snp_data, snps_found);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, CHMM hmm)
 {
      // Get the start and end positions of the SV call
     uint32_t start_pos = std::get<0>(candidate);
@@ -157,28 +157,16 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
     // the SV length
     uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-    // uint32_t snp_start_pos = std::max((uint32_t)1, start_pos - sv_length);
-    // Prevent underflow (start_pos - sv_length) if start_pos < sv_length
     uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
     uint32_t snp_end_pos = end_pos + sv_half_length;
-    // std::cout << "CNP for " << chr << ":" << start_pos << "-" << end_pos << "(" << snp_start_pos << ", " << snp_end_pos << ")" << std::endl;
-    // printMessage("Running copy number prediction for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " with SNP region " + chr + ":" + std::to_string(snp_start_pos) + "-" + std::to_string(snp_end_pos) + "...");
 
     // Query the SNP region for the SV candidate
     std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov);
     SNPData& sv_snps = snp_call.first;
     bool sv_snps_found = snp_call.second;
 
-	/*
-    if (sv_snps.pos.size() == 0) {
-    	std::cerr << "ERROR [2]: No windows for SV " << chr << ":" << std::to_string((int)start_pos) << "-" << std::to_string((int)end_pos) << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
-    	continue;
-    }
-    */
-
     // Run the Viterbi algorithm
-    // printMessage("[TEST] Running Viterbi algorithm for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "...");
-    std::pair<std::vector<int>, double> prediction = runViterbi(this->hmm, sv_snps);
+    std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
 
@@ -235,54 +223,12 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length, CHMM hmm)
 {
-    CHMM& hmm = this->hmm;
     int window_size = this->input_data.getWindowSize();
     double mean_chr_cov = this->mean_chr_cov;  
     printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
-
-    // Create a map with counts for each CNV type
-    // std::map<int, int> cnv_type_counts;
-    // for (int i = 0; i < 6; i++)
-    // {
-    //     cnv_type_counts[i] = 0;
-    // }
-
     runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov);
-    // // Split the SV candidates into chunks for each thread
-    // int chunk_count = this->input_data.getThreadCount();
-    // // std::vector<std::vector<SVCandidate>> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count);
-    // std::vector<std::set<SVCall>> sv_chunks = splitSVsIntoChunks(sv_candidates, chunk_count);
-
-    // // Loop through each SV chunk and run the copy number prediction in parallel
-    // // std::vector<std::future<SNPData>> futures;
-    // std::vector<std::future<void>> futures;
-    // for (auto& sv_chunk : sv_chunks)
-    // {
-    //     // Run the copy number prediction for the SV chunk
-    //     futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)));
-    //     // futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)));
-    //     // std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, sv_chunk, std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map));
-    // }
-
-    // // Wait for all the futures to finish
-    // int current_chunk = 0;
-    // for (auto& future : futures)
-    // {
-    //     current_chunk++;
-    //     try {
-    //         future.wait();
-    //         // SNPData chunk_snp_data = std::move(future.get());
-    //         if (this->input_data.getVerbose())
-    //         {
-    //             printMessage("Finished processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + "...");
-    //         }
-    //     } catch (const std::exception& e) {
-    //         printError("Error processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + ": " + e.what());
-    //     }
-    // }
-
     printMessage("Finished predicting copy number states for chromosome " + chr + "...");
 }
 
@@ -525,9 +471,9 @@ std::vector<std::vector<SVCandidate>> CNVCaller::splitSVCandidatesIntoChunks(std
 
 void CNVCaller::loadChromosomeData(std::string chr)
 {
-    std::string hmm_filepath = this->input_data.getHMMFilepath();
-    std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
-    this->hmm = ReadCHMM(hmm_filepath.c_str());
+    // std::string hmm_filepath = this->input_data.getHMMFilepath();
+    // std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
+    // this->hmm = ReadCHMM(hmm_filepath.c_str());
 
     printMessage("Calculating mean chromosome coverage for " + chr + "...");
     this->mean_chr_cov = calculateMeanChromosomeCoverage(chr);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3761325d..b40ecaed 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -383,6 +383,12 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         throw std::runtime_error("ERROR: failed to open " + bam_filepath);
     }
 
+    // Read the HMM from the file
+    std::string hmm_filepath = this->input_data.getHMMFilepath();
+    std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
+    CHMM hmm = ReadCHMM(hmm_filepath.c_str());
+    // this->hmm = ReadCHMM(hmm_filepath.c_str());
+
     // Enable multi-threading
     int num_threads = this->input_data.getThreadCount();
     if (num_threads > 1) {
@@ -485,12 +491,12 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
                 std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
                 // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs,
                 // min_cnv_length);
-                cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length);
+                cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm);
             }
 
             // Run split-read SV and copy number variant predictions
             std::cout << "Detecting copy number variants from split reads..." << std::endl;
-            this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller);
+            this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm);
             // sv_calls.concatenate(subregion_sv_calls);  // Add the calls to the
             // main set
             // sv_calls.emplace_back(subregion_sv_calls);
@@ -537,7 +543,7 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller)
+void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm)
 {
     // Find split-read SV evidence
     int sv_count = 0;
@@ -579,7 +585,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
             if (is_opposite_strand) {
                 if (supp_length >= min_cnv_length) {
                     SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate);
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate, hmm);
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
                     if (supp_type == SVType::NEUTRAL) {
@@ -630,14 +636,14 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         // Run copy number variant predictions on the boundary if large enough
         if (boundary_right - boundary_left >= min_cnv_length) {
             split_boundary = SVCandidate(boundary_left, boundary_right, ".");
-            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary);
+            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary, hmm);
             double bd_lh = std::get<0>(bd_result);
             SVType bd_type = std::get<1>(bd_result);
 
             // Run copy number variant predictions on the gap if it exists
             if (gap_exists && gap_right - gap_left >= min_cnv_length) {
                 split_gap = SVCandidate(gap_left, gap_right, ".");
-                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap);
+                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap, hmm);
                 double gap_lh = std::get<0>(gap_result);
                 SVType gap_type = std::get<1>(gap_result);
 

From 39602fe09d3ab10cad02c2667aafe616f7e93087 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 26 Nov 2024 13:41:14 -0500
Subject: [PATCH 031/134] remove some comments

---
 src/sv_caller.cpp | 41 -----------------------------------------
 1 file changed, 41 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b40ecaed..7335148e 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -370,12 +370,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         chromosomes = this->input_data.getRefGenomeChromosomes();
     }
 
-    // [TEST] Only process the last N chromosomes
-    // int last_n = 3;
-    // chromosomes = std::vector<std::string>(chromosomes.end()-last_n, chromosomes.end());
-    // std::cout << "[DEBUG] Running last " << last_n << " chromosomes" << std::endl;
-    // //chromosomes = std::vector<std::string>(chromosomes.end()-3, chromosomes.end());
-
     // Open the BAM file
     std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
@@ -387,7 +381,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     CHMM hmm = ReadCHMM(hmm_filepath.c_str());
-    // this->hmm = ReadCHMM(hmm_filepath.c_str());
 
     // Enable multi-threading
     int num_threads = this->input_data.getThreadCount();
@@ -416,9 +409,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     int current_chr = 0;
     std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
     int chunk_count = 100;  // Number of chunks to split the chromosome into
-    // SVData sv_calls;
-    // std::vector<std::map<SVCandidate, SVInfo>> sv_calls;
-    // std::unordered_map<std::string, std::map<uint32_t, uint32_t>> sv_calls;
     uint32_t total_sv_count = 0;
     std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
     int min_cnv_length = this->input_data.getMinCNVLength();
@@ -463,43 +453,25 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         int current_region = 0;
         std::set<SVCall> combined_sv_calls;
         for (const auto& sub_region : region_chunks) {
-            // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl;
-            // std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region);
             std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region);
             std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
             PrimaryMap& primary_map = std::get<1>(region_data);
             SuppMap& supp_map = std::get<2>(region_data);
             std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
             mergeSVs(subregion_sv_calls);
-            // SVData& subregion_sv_calls = std::get<0>(region_data);
-            // PrimaryMap& primary_map = std::get<1>(region_data);
-            // SuppMap& supp_map = std::get<2>(region_data);
-            // int region_sv_count = subregion_sv_calls.totalCalls();
-            // if (region_sv_count > 0) {
-            //     std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl;
-            // }
-            // int region_sv_count = subregion_sv_calls.count();
             int region_sv_count = getSVCount(subregion_sv_calls);
             printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
             // Run copy number variant predictions on the SVs detected from the
             // CIGAR string, using a minimum CNV length threshold
-            // std::cout << "Detecting copy number variants from CIGAR string SVs..." << std::endl;
-            // std::map<SVCandidate, SVInfo>& cigar_svs = subregion_sv_calls.getChromosomeSVs(chr);
-            // if (cigar_svs.size() > 0) {
             if (region_sv_count > 0) {
                 std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
-                // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs,
-                // min_cnv_length);
                 cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm);
             }
 
             // Run split-read SV and copy number variant predictions
             std::cout << "Detecting copy number variants from split reads..." << std::endl;
             this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm);
-            // sv_calls.concatenate(subregion_sv_calls);  // Add the calls to the
-            // main set
-            // sv_calls.emplace_back(subregion_sv_calls);
 
             // Merge the SV calls from the current region
             std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
@@ -509,10 +481,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
             std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
             concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
             std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl;
-
-            // [TEST] Break after the first region
-            // std::cout << "[DEBUG] Breaking after the first region" << std::endl;
-            // break;
         }
 
         std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." << std::endl;
@@ -521,7 +489,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         std::cout << "Total SVs detected for chromosome " << chr << ": " << chr_sv_count << std::endl;
         total_sv_count += chr_sv_count;
         std::cout << "Cumulative total SVs: " << total_sv_count << std::endl;
-        // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl;
     }
 
     // Clean up the BAM file, header, and index
@@ -529,11 +496,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     bam_hdr_destroy(bamHdr);
     sam_close(fp_in);
 
-    // SVData sv_calls_combined;
-    // for (const auto& subregion_sv_calls : sv_calls) {
-    //     sv_calls_combined.concatenate(subregion_sv_calls);
-    // }
-
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
     this->saveToVCF(whole_genome_sv_calls);
@@ -554,10 +516,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         std::string primary_chr = std::get<0>(primary_alignment);
         int32_t primary_start = std::get<1>(primary_alignment);
         int32_t primary_end = std::get<2>(primary_alignment);
-        // int32_t primary_query_start = std::get<4>(primary_alignment);
-        // int32_t primary_query_end = std::get<5>(primary_alignment);
         std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
-        // bool primary_strand = std::get<7>(primary_alignment);
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {

From 1884f675357e119d30b9eb3d21e1be1baa026505 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 26 Nov 2024 17:32:54 -0500
Subject: [PATCH 032/134] Fix multithreading

---
 include/sv_caller.h |   2 +
 setup.py            |   2 +-
 src/cnv_caller.cpp  | 713 +-------------------------------------------
 src/sv_caller.cpp   | 352 +++++++++++-----------
 4 files changed, 199 insertions(+), 870 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index dee58dea..ec89aa8a 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -38,6 +38,8 @@ class SVCaller {
         // mismatch rate, and the start and end positions of the query sequence
         std::tuple<std::unordered_map<int, int>, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, bool is_primary);
 
+        void processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length);
+
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
diff --git a/setup.py b/setup.py
index 5b6d9ef2..ce0c428f 100644
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,7 @@
     name="_" + NAME,
     sources=SRC_FILES,
     include_dirs=[INCLUDE_DIR, conda_include_dir],
-    extra_compile_args=["-std=c++11"],
+    extra_compile_args=["-std=c++14"],
     language="c++",
     libraries=["hts"],
     library_dirs=[conda_lib_dir]
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c6525c1d..fd7a6c7c 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -74,7 +74,6 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     // window, then calculate the log2 ratio for each window
     for (uint32_t i = start_pos; i <= end_pos; i += window_size)
     {
-        // std::cout << "Querying SNP region for " << chr << ":" << i << "-" << std::min(i + window_size - 1, end_pos) << std::endl;
         // Run a sliding non-overlapping window of size window_size across
         // the SV region and calculate the log2 ratio for each window
         uint32_t window_start = i;
@@ -131,13 +130,10 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
                 {
                     bin_end = std::min(bin_end, (snp_window_pos[j] + snp_window_pos[j+1]) / 2);
                 }
-                // std::cout << "bin_start: " << bin_start << std::endl;
-                // std::cout << "bin_end: " << bin_end << std::endl;
 
                 // Calculate the log2 ratio for the SNP bin
                 double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov);
                 this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
-                // this->updateSNPData(snp_data, snp_pos, snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
 
                 // Update the previous bin start
                 bin_start = bin_end + 1;
@@ -215,7 +211,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     {
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
         std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
-        std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl;
+        printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "...");
         this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
     }
     
@@ -247,9 +243,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
     {
 
         // Get the SV candidate
-        // const SVCandidate& candidate = sv_call;
-        // int64_t start_pos = std::get<0>(candidate);
-        // int64_t end_pos = std::get<1>(candidate);
         uint32_t start_pos = sv_call.start;
         uint32_t end_pos = sv_call.end;
         
@@ -266,8 +259,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
             continue;
         }
 
-    	// std::cout << "CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl;
-
         // Get the depth at the start position. This is used as the FORMAT/DP
         // value in the VCF file
         // int dp_value = pos_depth_map[start_pos];
@@ -278,18 +269,11 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
         uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
         uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
         uint32_t snp_end_pos = end_pos + sv_half_length;
-        // std::cout << "CIGAR sv_half_length:" << sv_half_length << std::endl;
-        // std::cout << "CIGAR SV query at " << chr << ":" << query_start << "-" << query_end << std::endl;
-
-        // printMessage("Querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", qstart = " + std::to_string(query_start) + ", qend = " + std::to_string(query_end));
         std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, snp_info, this->pos_depth_map, mean_chr_cov);
-        // printMessage("Finished querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         SNPData& sv_snps = snp_call.first;
         bool snps_found = snp_call.second;
 
-        // Run the Viterbi algorithm
-        // printMessage("[TEST2] Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
-        
+        // Run the Viterbi algorithm        
         if (sv_snps.pos.size() == 0) {
         	std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
         	continue;
@@ -353,13 +337,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
         {
             std::string sv_type_str = getSVTypeString(updated_sv_type);
             addSVCall(sv_chunk, sv_call.start, sv_call.end, sv_type_str, ".", data_type, genotype, likelihood);
-            // std::string sv_type_str = getSVTypeString(updated_sv_type);
-            // sv_call.sv_type = sv_type_str;
-            // sv_call.data_type += "," + data_type;
-            // sv_call.genotype = genotype;
-            // sv_call.hmm_likelihood = likelihood;
         }
-        // this->updateSVCopyNumber(sv_candidates, sv_call, cnv_type, data_type, genotype, likelihood);
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
         // known, and the length is greater than 10 kb
@@ -372,8 +350,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
             // Save the SV calls as a TSV file
             std::string cnv_type_str = getSVTypeString(updated_sv_type);
             std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv";
-            // std::cout << "Saving SV CIGAR copy number predictions to " <<
-            // sv_filename << std::endl;
             printMessage("Saving SV CIGAR copy number predictions to " + sv_filename);
             this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
         }
@@ -471,22 +447,9 @@ std::vector<std::vector<SVCandidate>> CNVCaller::splitSVCandidatesIntoChunks(std
 
 void CNVCaller::loadChromosomeData(std::string chr)
 {
-    // std::string hmm_filepath = this->input_data.getHMMFilepath();
-    // std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
-    // this->hmm = ReadCHMM(hmm_filepath.c_str());
-
     printMessage("Calculating mean chromosome coverage for " + chr + "...");
     this->mean_chr_cov = calculateMeanChromosomeCoverage(chr);
-    //this->mean_chr_cov = 30.0;
     printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
-
-    // std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl;
-    // std::string snp_filepath = this->input_data.getSNPFilepath();
-    // readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info);
-
-    // std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl;
-    // getSNPPopulationFrequencies(chr, this->snp_info);
-    // std::cout << "Finished loading chromosome data for " << chr << std::endl;
 }
 
 // Calculate the mean chromosome coverage
@@ -501,7 +464,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
     }
 
     // Enable multi-threading
-    hts_set_threads(bam_file, this->input_data.getThreadCount());
+    // hts_set_threads(bam_file, this->input_data.getThreadCount());
 
     // Read the header
     bam_hdr_t *bam_header = sam_hdr_read(bam_file);
@@ -595,13 +558,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
     uint32_t pos_count = 0;
     for (auto& pos_depth : chr_pos_depth_map)
     {
-        // if (pos_depth.second > 0)
-        // {
-        //     cum_depth += pos_depth.second;
-        //     pos_count++;
-        // } else {
-        //     std::cout << "Zero depth at position " << pos_depth.first << std::endl;
-        // }
         cum_depth += pos_depth.second;
         pos_count++;
     }
@@ -684,8 +640,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
 
     // Set multi-threading
-    int thread_count = this->input_data.getThreadCount();
-    bcf_sr_set_threads(snp_reader, thread_count);
+    // int thread_count = this->input_data.getThreadCount();
+    // bcf_sr_set_threads(snp_reader, thread_count);
 
     // Enable index usage
     snp_reader->require_index = 1;
@@ -719,20 +675,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             record_count++;
             uint32_t pos = (uint32_t)snp_record->pos + 1;
 
-            // Skip if 3 or more duplicate positions found
-            // if (pos == last_pos)
-            // {
-            //     duplicate_count++;
-            //     if (duplicate_count >= 10)
-            //     {
-            //         std::cerr << "ERROR: 3 or more duplicate positions found in SNP file at " << chr << ":" << pos << std::endl;
-            //         break;
-            //     }
-            // } else {
-            //     duplicate_count = 0;
-            // }
-            // last_pos = pos;
-
             // Skip if not a SNP
             if (!bcf_is_snp(snp_record))
             {
@@ -814,409 +756,20 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             // Insert the SNP position and BAF into the maps
             snp_pos.insert(pos);
             snp_baf[pos] = baf;
-
-            // Print the SNP position and BAF
-            // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl;
-            // print_count++;
-            // if (print_count < 10)
-            // {
-            //     std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl;
-            //     print_count++;
-            // }
         }
     }
 
     // Clean up
     bcf_sr_destroy(snp_reader);
-
-    // std::cout << "Opening SNP file: " << snp_filepath << std::endl;
-    // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r");
-    // if (!snp_file)
-    // {
-    //     throw std::runtime_error("ERROR: Could not open SNP file: " + snp_filepath);
-    // }
-
-    // // Enable multi-threading
-    // hts_set_threads(snp_file, thread_count);
-
-    // // Read the header
-    // bcf_hdr_t *snp_header = bcf_hdr_read(snp_file);
-    // if (!snp_header)
-    // {
-    //     bcf_close(snp_file);
-    //     throw std::runtime_error("ERROR: Could not read header from SNP file: " + snp_filepath);
-    // }
-
-    // // Load the index
-    // hts_idx_t *snp_index = bcf_index_load(snp_filepath.c_str());
-    // if (!snp_index)
-    // {
-    //     bcf_hdr_destroy(snp_header);
-    //     bcf_close(snp_file);
-    //     throw std::runtime_error("ERROR: Could not load index for SNP file: " + snp_filepath);
-    // }
-
-    // // Construct the region string
-    // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    // hts_itr_t *snp_iter = bcf_itr_querys(snp_index, snp_header, region_str.c_str());
-    // if (!snp_iter)
-    // {
-    //     hts_idx_destroy(snp_index);
-    //     bcf_hdr_destroy(snp_header);
-    //     bcf_close(snp_file);
-    //     throw std::runtime_error("ERROR: Could not create iterator for SNP region: " + region_str);
-    // }
-
-    // // Set up the record
-    // bcf1_t *snp_record = bcf_init();
-    // if (!snp_record)
-    // {
-    //     bcf_hdr_destroy(snp_header);
-    //     bcf_close(snp_file);
-    //     throw std::runtime_error("ERROR: Could not initialize SNP record.");
-    // }
-
-    // // Read the SNPs in the chromosome region
-    // int print_count = 0;
-    // while (bcf_itr_next(snp_file, snp_iter, snp_record) >= 0)
-    // {
-    //     // Get the position and B-allele frequency (BAF) from the SNP record
-    //     uint32_t pos = snp_record->pos + 1;  // 0-based to 1-based
-
-    //     // Get QUAL, DP, and AD values
-    //     float qual = snp_record->qual;
-    //     if (bcf_float_is_missing(qual))
-    //     {
-    //         std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl;
-    //     }
-    //     // Skip if quality is less than 30
-    //     if (qual <= 30)
-    //     {
-    //         continue;
-    //     }
-
-    //     // Get FILTER status
-    //     int pass_id = bcf_hdr_id2int(snp_header, BCF_DT_ID, "PASS");
-    //     if (pass_id == -1)
-    //     {
-    //         std::cerr << "ERROR: Could not get PASS ID for SNP at " << chr << ":" << pos << std::endl;
-    //     }
-    //     std::string pass_filter = "PASS";
-    //     if (bcf_has_filter(snp_header, snp_record, const_cast<char*>(pass_filter.c_str())) != 1)
-    //     {
-    //         // Skip if the SNP does not pass the filter
-    //         continue;
-    //     }
-
-    //     // Extract DP from INFO field
-    //     int32_t dp = 0;
-    //     int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP", &dp, &dp_count);
-    //     if (dp_count != 1)
-    //     {
-    //         std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl;
-    //     }
-    //     // Skip if depth is not greater than 10
-    //     if (dp <= 10)
-    //     {
-    //         continue;
-    //     }
-
-    //     // Skip if not a SNP
-    //     if (!bcf_is_snp(snp_record))
-    //     {
-    //         continue;
-    //     }
-
-    //     // Extract AD from FORMAT field
-    //     int32_t ad[2] = {0, 0};
-    //     int ad_count = 0;
-    //     int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
-    //     // if (ad_count != 2)
-    //     // {
-    //     //     std::cerr << "ERROR: Could not get AD value for SNP at " << chr << ":" << pos << std::endl;
-    //     // }
-
-    //     // Calculate the BAF
-    //     if (ad_ret > 0 && ad_count > 0)
-    //     {
-    //         double baf = (double) ad[1] / (double) (ad[0] + ad[1]);
-    //         snp_pos.insert(pos);
-    //         snp_baf[pos] = baf;
-
-    //         // Print the SNP position and BAF
-    //         if (print_count < 10)
-    //         {
-    //             std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << std::endl;
-    //             print_count++;
-    //         }
-    //     }
-    // }
-
-    // // Clean up
-    // bcf_destroy(snp_record);
-    // hts_itr_destroy(snp_iter);
-    // hts_idx_destroy(snp_index);
-    // bcf_hdr_destroy(snp_header);
-    // bcf_close(snp_file);
-
-    // // Check that the SNP file is sorted by running bcftools index and reading
-    // // the error output
-    // std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error";
-    // if (this->input_data.getVerbose()) {
-    //     std::cout << "Command: " << index_cmd << std::endl;
-    // }
-
-    // // Open a pipe to read the output of the command
-    // FILE *index_fp = popen(index_cmd.c_str(), "r");
-    // if (index_fp == NULL)
-    // {
-    //     std::cerr << "ERROR: Could not open pipe for command: " << index_cmd << std::endl;
-    //     exit(1);
-    // }
-
-    // // Read the output of the command
-    // const int error_size = 256;
-    // char index_error[error_size];
-    // while (fgets(index_error, error_size, index_fp) != NULL)
-    // {
-    //     std::cerr << "ERROR: " << index_error << std::endl;
-    //     exit(1);
-    // }
-    // pclose(index_fp);  // Close the process
-
-    // // Filter variants by depth, quality, and region
-    // if (this->input_data.getVerbose()) {
-    //     std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl;
-    // }
-
-    // // Check if a region was specified by the user
-    // std::string region_str = chr;
-    // if (this->input_data.isRegionSet())
-    // {
-    //     std::pair<int32_t, int32_t> region = this->input_data.getRegion();
-    //     region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second);
-    // }
-
-    // std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf";
-    // int thread_count = this->input_data.getThreadCount();
-    // // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
-    // std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath;
-    // if (this->input_data.getVerbose()) {
-    //     std::cout << "Filtering SNPs by depth and quality..." << std::endl;
-    //     std::cout << "Command: " << cmd << std::endl;
-    // }
-    // system(cmd.c_str());
-    
-    // if (this->input_data.getVerbose()) {
-    //     std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl;
-    // }
-
-    // // Extract B-allele frequency data from the VCF file and sort by chromosome
-    // // and position
-    // if (this->input_data.getVerbose()) {
-    //     std::cout << "Extracting B-allele frequency data from filtered SNPs..." << std::endl;
-    // }
-    // cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null";
-    // FILE *fp = popen(cmd.c_str(), "r");
-    // if (fp == NULL)
-    // {
-    //     std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl;
-    //     exit(1);
-    // }
-
-    // // Read the reference and alternate allele depths from the VCF file
-    // std::string alt_allele = "";  // Alternate allele
-    // uint32_t pos = 0;
-    // int ref_ad = 0;
-    // int alt_ad = 0;
-    // const int line_size = 1024;
-    // char line[line_size];  // Line buffer
-    // std::vector<int64_t> locations;
-    // std::vector<double> bafs;
-    // std::string chr_no_prefix = removeChrPrefix(chr);
-    // while (fgets(line, line_size, fp) != NULL)
-    // {
-    //     // Parse the line
-    //     char *tok = strtok(line, ",");  // Tokenize the line
-    //     int col = 0;  // Column index
-    //     while (tok != NULL)
-    //     {
-    //         // Get the position from column 2
-    //         if (col == 0)
-    //         {
-    //             pos = (uint32_t)atoi(tok);
-    //         }
-
-    //         // Get the AD for the reference allele from column 3
-    //         else if (col == 1)
-    //         {
-    //             ref_ad = atoi(tok);
-    //         }
-
-    //         // Get the AD for the non-reference allele from column 4
-    //         else if (col == 2)
-    //         {
-    //             alt_ad = atoi(tok);
-    //         }
-
-    //         // Move to the next token
-    //         tok = strtok(NULL, ",");
-    //         col++;
-    //     }
-
-    //     // Calculate the B-allele frequency (BAF) as the ratio of the alternate
-    //     // allele depth to the total depth (reference + alternate)
-    //     double baf = (double) alt_ad / (double) (ref_ad + alt_ad);
-
-    //     // Add a new location and BAF value to the chromosome's SNP data
-    //     // (population frequency and log2 ratio will be added later)
-    //     // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf);
-    //     this->snp_baf_map[pos] = baf;
-    //     this->snp_baf_keys.insert(pos);
-    // }
-
-    // pclose(fp);  // Close the process
-
-    if (this->input_data.getVerbose()) {
-        std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl;
-    }
 }
 
-// void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info)
-// {
-//     // Get the population frequency file for the chromosome
-//     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
-//     if (pfb_filepath.empty())
-//     {
-//         std::cout << "No population frequency file provided for chromosome " << chr << std::endl;
-//         return;
-//     }
-    
-//     // Determine the ethnicity-specific allele frequency key
-//     std::string AF_key = "AF";
-//     if (this->input_data.getEthnicity() != "")
-//     {
-//         AF_key += "_" + this->input_data.getEthnicity();
-//     }
-
-//     // Check if the filepath uses the 'chr' prefix notations based on the
-//     // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
-//     std::string chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
-//     std::string chr_prefix = "chr";
-//     if (pfb_filepath.find(chr_prefix) == std::string::npos)
-//     {
-//         // Remove the 'chr' prefix from the chromosome name
-//         if (chr_gnomad.find(chr_prefix) != std::string::npos)
-//         {
-//             chr_gnomad = chr_gnomad.substr(chr_prefix.length());
-//         }
-//     } else {
-//         // Add the 'chr' prefix to the chromosome name
-//         if (chr_gnomad.find(chr_prefix) == std::string::npos)
-//         {
-//             chr_gnomad = chr_prefix + chr;
-//         }
-//     }
-
-//     // Remove the 'chr' prefix from the chromosome name for SNP data. All
-//     // SNP data in this program does not use the 'chr' prefix
-//     std::string chr_no_prefix = removeChrPrefix(chr);
-
-//     std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl;
-//     int thread_count = this->input_data.getThreadCount();
-
-//     // Open the population frequency file
-//     std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;
-//     htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r");
-//     if (!pfb_file)
-//     {
-//         throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath);
-//     }
-
-//     // Enable multi-threading
-//     std::cout << "Setting number of threads to " << thread_count << std::endl;
-//     hts_set_threads(pfb_file, thread_count);
-
-//     // Read the header
-//     std::cout << "Reading header from population frequency file..." << std::endl;
-//     bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file);
-//     if (!pfb_header)
-//     {
-//         bcf_close(pfb_file);
-//         throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath);
-//     }
-
-//     // Set up the record
-//     std::cout << "Initializing BCF record..." << std::endl;
-//     bcf1_t *pfb_record = bcf_init();
-//     if (!pfb_record)
-//     {
-//         bcf_hdr_destroy(pfb_header);
-//         bcf_close(pfb_file);
-//         throw std::runtime_error("ERROR: Could not initialize BCF record.");
-//     }
-
-//     // Read the population frequencies for the chromosome
-//     std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl;
-//     int print_count = 0;
-//     while (bcf_read(pfb_file, pfb_header, pfb_record) == 0)
-//     {
-//         // Get the chromosome and position
-//         // std::cout << "Reading record..." << std::endl;
-//         uint32_t pos = pfb_record->pos + 1;  // 0-based to 1-based
-
-//         // Skip if not a SNP, or if the position is not in the BAF map
-//         if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0)
-//         {
-//             continue;
-//         }
-
-//         // Get the population frequency for the SNP
-//         float *pfb_f = NULL;
-//         int count = 0;
-//         int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-//         if (pfb_status < 0 || count == 0)
-//         {
-//             std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
-//             continue;
-//         }
-//         double pfb = (double) pfb_f[0];
-//         free(pfb_f);
-
-//         // Continue if the population frequency is outside the threshold
-//         if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-//         {
-//             continue;
-//         }
-
-//         // Add the population frequency to the SNP data
-//         // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
-//         if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end())
-//         {
-//             this->snp_pfb_map[pos] = pfb;
-//         } else {
-//             // Keep the larger population frequency
-//             if (pfb > this->snp_pfb_map[pos])
-//             {
-//                 this->snp_pfb_map[pos] = pfb;
-//             }
-//         }
-//         if (print_count < 10)
-//         {
-//             std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-//             print_count++;
-//         }
-//     }
-//     std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl;
-// }
-
 void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map)
 {
     // Get the population frequency file for the chromosome
     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
     if (pfb_filepath == "")
     {
-        std::cout << "No population frequency file provided for chromosome " << chr << std::endl;
+        printError("No population frequency file provided for chromosome " + chr);
         return;
     }
     
@@ -1249,7 +802,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     // Remove the 'chr' prefix from the chromosome name for SNP data. All
     // SNP data in this program does not use the 'chr' prefix
     std::string chr_no_prefix = removeChrPrefix(chr);
-    int thread_count = this->input_data.getThreadCount();
+    // int thread_count = this->input_data.getThreadCount();
 
     // Initialize the synced reader
     bcf_srs_t *pfb_reader = bcf_sr_init();
@@ -1267,7 +820,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     }
 
     // Set multi-threading
-    bcf_sr_set_threads(pfb_reader, thread_count);
+    // bcf_sr_set_threads(pfb_reader, thread_count);
 
     // Enable index usage
     pfb_reader->require_index = 1;
@@ -1294,7 +847,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         {
             continue;
         }
-        // std::cout << "Reading record..." << std::endl;
         // pfb_record = bcf_sr_get_line(pfb_reader, 0);
         bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
         // Do something with the record
@@ -1304,7 +856,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
             // Skip if not a SNP
             if (!bcf_is_snp(pfb_record))
             {
-                // std::cout << "Skipping non-SNP at " << chr << ":" << pfb_record->pos << std::endl;
                 continue;
             }
 
@@ -1316,7 +867,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
             int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
             if (pfb_status < 0 || count == 0)
             {
-                // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
                 continue;
             }
             double pfb = (double) pfb_f[0];
@@ -1339,58 +889,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
                     snp_pfb_map[pos] = pfb;
                 }
             }
-
-            // if (test_count < 10)
-            // {
-            //     std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-            //     test_count++;
-            // }
         }
-            // std::cout << "Record: " << pfb_record->pos << std::endl;
-            // std::cout << "QUAL: " << pfb_record->qual << std::endl;
-
-            //     // Skip if not a SNP
-            //     if (!bcf_is_snp(pfb_record))
-            //     {
-            //         std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl;
-            //         continue;
-            //     }
-
-            //     // Get the population frequency for the SNP
-            //     float *pfb_f = NULL;
-            //     int count = 0;
-            //     int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-            //     if (pfb_status < 0 || count == 0)
-            //     {
-            //         std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
-            //         continue;
-            //     }
-            //     double pfb = (double) pfb_f[0];
-            //     free(pfb_f);
-
-            //     // Continue if the population frequency is outside the threshold
-            //     if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-            //     {
-            //         continue;
-            //     }
-
-            //     // Add the population frequency to the SNP data
-            //     // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
-            //     if (snp_pfb_map.find(pos) == snp_pfb_map.end())
-            //     {
-            //         snp_pfb_map[pos] = pfb;
-            //     } else {
-            //         // Keep the larger population frequency
-            //         if (pfb > snp_pfb_map[pos])
-            //         {
-            //             snp_pfb_map[pos] = pfb;
-            //         }
-            //     }
-            //     if (print_count < 10)
-            //     {
-            //         std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-            //         print_count++;
-            //     }        }
     }
     if (pfb_reader->errnum)
     {
@@ -1398,129 +897,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     }
 
     // Clean up
-    // bcf_destroy(pfb_record);
     bcf_sr_destroy(pfb_reader);
-
-    // // Open the population frequency file
-    // std::cout << "Opening population frequency file: " << pfb_filepath << std::endl;
-    // htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r");
-    // if (!pfb_file)
-    // {
-    //     throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath);
-    // }
-
-    // // Enable multi-threading
-    // std::cout << "Setting number of threads to " << thread_count << std::endl;
-    // hts_set_threads(pfb_file, thread_count);
-
-    // // Read the header
-    // std::cout << "Reading header from population frequency file..." << std::endl;
-    // bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file);
-    // if (!pfb_header)
-    // {
-    //     bcf_close(pfb_file);
-    //     throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath);
-    // }
-
-    // // Load the index
-    // hts_idx_t *pfb_index = bcf_index_load(pfb_filepath.c_str());
-    // if (!pfb_index)
-    // {
-    //     bcf_hdr_destroy(pfb_header);
-    //     bcf_close(pfb_file);
-    //     throw std::runtime_error("ERROR: Could not load index for population frequency file: " + pfb_filepath);
-    // }
-
-    // // Construct the region string
-    // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    // hts_itr_t *pfb_iter = bcf_itr_querys(pfb_index, pfb_header, region_str.c_str());
-    // if (!pfb_iter)
-    // {
-    //     // Try using the other chromosome notation
-    //     std::string alt_region_str = "chr" + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    //     pfb_iter = bcf_itr_querys(pfb_index, pfb_header, alt_region_str.c_str());
-    //     if (!pfb_iter)
-    //     {
-    //         hts_idx_destroy(pfb_index);
-    //         bcf_hdr_destroy(pfb_header);
-    //         bcf_close(pfb_file);
-    //         throw std::runtime_error("ERROR: Could not create iterator for region: " + alt_region_str);
-    //     } else {
-    //         region_str = alt_region_str;
-    //         std::cout << "Successfully created iterator for region: " << region_str << std::endl;
-    //     }
-    //     // hts_idx_destroy(pfb_index);
-    //     // bcf_hdr_destroy(pfb_header);
-    //     // bcf_close(pfb_file);
-    //     // throw std::runtime_error("ERROR: Could not create iterator for region: " + region_str);
-    // }
-
-    // // Set up the record
-    // std::cout << "Initializing BCF record..." << std::endl;
-    // bcf1_t *pfb_record = bcf_init();
-    // if (!pfb_record)
-    // {
-    //     bcf_hdr_destroy(pfb_header);
-    //     bcf_close(pfb_file);
-    //     throw std::runtime_error("ERROR: Could not initialize BCF record.");
-    // }
-
-    // // Read the population frequencies for the region
-    // std::cout << "[TEST] Reading population frequencies for region " << region_str << " (AF_key = " << AF_key << ")..." << std::endl;
-    // int print_count = 0;
-    // int test_count = 0;
-    // while (bcf_itr_next(pfb_file, pfb_iter, pfb_record) >= 0)
-    // {
-    //     test_count++;
-    //     // Get the chromosome and position
-    //     // std::cout << "Reading record..." << std::endl;
-    //     uint32_t pos = pfb_record->pos + 1;  // 0-based to 1-based
-
-    //     // Skip if not a SNP
-    //     if (!bcf_is_snp(pfb_record))
-    //     {
-    //         std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl;
-    //         continue;
-    //     }
-
-    //     // Get the population frequency for the SNP
-    //     float *pfb_f = NULL;
-    //     int count = 0;
-    //     int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-    //     if (pfb_status < 0 || count == 0)
-    //     {
-    //         std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl;
-    //         continue;
-    //     }
-    //     double pfb = (double) pfb_f[0];
-    //     free(pfb_f);
-
-    //     // Continue if the population frequency is outside the threshold
-    //     if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-    //     {
-    //         continue;
-    //     }
-
-    //     // Add the population frequency to the SNP data
-    //     // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb);
-    //     if (snp_pfb_map.find(pos) == snp_pfb_map.end())
-    //     {
-    //         snp_pfb_map[pos] = pfb;
-    //     } else {
-    //         // Keep the larger population frequency
-    //         if (pfb > snp_pfb_map[pos])
-    //         {
-    //             snp_pfb_map[pos] = pfb;
-    //         }
-    //     }
-    //     if (print_count < 10)
-    //     {
-    //         std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl;
-    //         print_count++;
-    //     }
-    // }
-    // std::cout << "Finished reading population frequencies for region " << region_str << std::endl;
-    // std::cout << "Test count: " << test_count << std::endl;
 }
 
 void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood)
@@ -1620,36 +997,17 @@ void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::se
     std::string snp_chr = chr;
     chr = removeChrPrefix(chr);
 
-    // Create an ordered map of SNP positions to BAF and PFB values
+    // Query the SNP allele frequencies for the SNPs
     std::map<uint32_t, std::tuple<double, double>> snp_map;
-
-    // Query SNPs within a range (start, end) and return their BAF and PFB
-    // values as separate vectors
-    // std::vector<double> bafs;
-    // std::vector<double> pfbs;
-    // std::vector<uint32_t> pos;
-    double pfb_default = 0.5;
-
-    // Read the SNP data from the VCF file
     this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf);
 
-    // Query the SNPs within the range and return their BAFs and corresponding
-    // positions
-    // auto snp_start = this->snp_baf_keys.lower_bound(start);
-    // auto snp_end = this->snp_baf_keys.upper_bound(end);
-    // if (snp_start == this->snp_baf_keys.end())
-    // {
-    //     // return std::make_tuple(pos, bafs, pfbs);
-    //     return;
-    // }
-
     // Query the population frequencies for the SNPs
     std::unordered_map<uint32_t, double> pfb_map;
     this->readSNPPopulationFrequencies(chr, start, end, pfb_map);
 
     // Filter out the SNP population frequencies that are not in the SNP
     // position set
-    // std::unordered_map<uint32_t, double> snp_pfb;
+    double pfb_default = 0.5;
     for (auto& pos : snp_pos)
     {
         if (pfb_map.find(pos) != pfb_map.end())
@@ -1659,55 +1017,4 @@ void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::se
             snp_pfb[pos] = pfb_default;
         }
     }
-
-    // // Get the PFB values for the SNPs from the keys
-    // // Create the PFB vector using the SNP positions (loop through snp_pos,
-    // // query the pfb_map, and push the value to the vector)
-    // for (size_t i = 0; i < snp_pos.size(); i++)
-    // {
-    //     uint32_t snp_pos = snp_pos[i];
-    //     double pfb = pfb_default;
-    //     if (pfb_map.find(snp_pos) != pfb_map.end())
-    //     {
-    //         pfb = pfb_map[snp_pos];
-    //     } else {
-    //         pfb = pfb_default;
-    //     }
-    //     snp_pfb.push_back(pfb);
-    // }
-
-    // // Get the PFB values for the SNPs from the keys
-    // for (auto it = snp_start; it != snp_end; it++)
-    // {
-    //     uint32_t snp_pos = *it;
-    //     pos.push_back(snp_pos);
-    //     bafs.push_back(this->snp_baf_map[snp_pos]);
-
-    //     // Get the PFB value for the SNP
-    //     if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end())
-    //     {
-    //         pfbs.push_back(this->snp_pfb_map[snp_pos]);
-    //     } else {
-    //         pfbs.push_back(pfb_default);
-    //     }
-    // }
-    // auto& baf_bst = this->snp_baf_map[chr];
-    // auto baf_start = baf_bst.lower_bound({start, 0.0});
-    // auto baf_end = baf_bst.upper_bound({end, 0.0});
-    // for (auto it = baf_start; it != baf_end; it++) {
-    //     bafs.push_back(std::get<1>(*it));
-    //     pos.push_back(std::get<0>(*it));
-    // }
-
-
-
-    // auto& pfb_map = this->snp_pfb_map[chr];
-    // for (size_t i = 0; i < pos.size(); i++) {
-    //     uint32_t snp_pos = pos[i];
-    //     if (pfb_map.find(snp_pos) != pfb_map.end()) {
-    //         pfbs[i] = pfb_map[snp_pos];
-    //     }
-    // }
-    
-    // return std::make_tuple(pos, bafs, pfbs);
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 7335148e..aa32dda2 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -39,29 +39,6 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 // RegionData SVCaller::detectSVsFromRegion(std::string region)
 std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region)
 {
-    // // Open the BAM file
-    // std::string bam_filepath = this->input_data.getLongReadBam();
-    // samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
-    // if (fp_in == NULL) {
-    //     std::cerr << "ERROR: failed to open " << bam_filepath << std::endl;
-    //     exit(1);
-    // }
-
-    // // Load the header for the BAM file
-    // bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
-    // if (!bamHdr) {
-    //     sam_close(fp_in);
-    //     throw std::runtime_error("ERROR: failed to read header for " + bam_filepath);
-    // }
-
-    // // Load the index for the BAM file
-    // hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
-    // if (!idx) {
-    //     bam_hdr_destroy(bamHdr);
-    //     sam_close(fp_in);
-    //     throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
-    // }
-
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
@@ -80,7 +57,6 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi
     }
 
     // Main loop to process the alignments
-    // SVData sv_calls;
     std::set<SVCall> sv_calls;
     int num_alignments = 0;
     PrimaryMap primary_alignments;
@@ -138,12 +114,6 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-    
-    // hts_itr_destroy(itr);
-    // bam_destroy1(bam1);
-    // hts_idx_destroy(idx);
-    // bam_hdr_destroy(bamHdr);
-    // sam_close(fp_in);
 
     return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
 }
@@ -204,11 +174,6 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
                 // for sequence identity between the insertion and the
                 // reference genome (duplications are typically >= 90%)
 
-                // Loop from the leftmost position of the insertion (pos-op_len)
-                // to the rightmost position of the insertion (pos+op_len-1) and
-                // calculate the sequence identity at each window of the
-                // insertion length to identify potential duplications.
-
                 // Loop through the reference sequence and calculate the
                 // sequence identity +/- insertion length from the insertion
                 // position.
@@ -253,18 +218,8 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 if (is_duplication) {
-                    // sv_calls.add(chr, ref_pos, ref_end, SVType::DUP,
-                    // ins_seq_str, "CIGARDUP", "./.", default_lh);
-                    //printMessage("[TEST] FOUND CIGAR DUP");
-                    // sv_calls.insert(SVCall{(uint32_t)ref_pos,
-                    // (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.",
-                    // default_lh});
                     addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh);
                 } else {
-                    // sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh);
-                    // sv_calls.insert(SVCall{(uint32_t)ref_pos,
-                    // (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.",
-                    // default_lh});
                     addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh);
                 }
             }
@@ -277,10 +232,6 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
             {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                // sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".",
-                // "CIGARDEL", "./.", default_lh);  // Add to SV calls (1-based)
-                // sv_calls.insert(SVCall{(uint32_t)ref_pos, (uint32_t)ref_end,
-                // "DEL", ".", "CIGARDEL", "./.", default_lh});
                 addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh);
             }
 
@@ -319,8 +270,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
             // Check that the two sequence lengths are equal
             if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
-                std::cerr << "ERROR: Sequence lengths do not match" << std::endl;
-                exit(1);
+                throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
             }
 
             // Compare the two sequences and update the mismatch map
@@ -340,8 +290,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
         } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
             // Do nothing
         } else {
-            std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl;
-            exit(1);
+            throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
         }
 
         // Update the query position based on the CIGAR operation (M, I, S, H)
@@ -350,8 +299,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
         } else if (op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
             // Do nothing
         } else {
-            std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl;
-            exit(1);
+            throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
         }
     }
 
@@ -360,43 +308,22 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     return std::tuple<std::unordered_map<int, int>, int32_t, int32_t>(query_match_map, query_start, query_end);
 }
 
-std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
+void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length)
 {
-    // Get the chromosomes to process
-    std::vector<std::string> chromosomes;
-    if (this->input_data.getChromosome() != "") {
-        chromosomes.push_back(this->input_data.getChromosome());
-    } else {
-        chromosomes = this->input_data.getRefGenomeChromosomes();
-    }
-
     // Open the BAM file
-    std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (!fp_in) {
         throw std::runtime_error("ERROR: failed to open " + bam_filepath);
     }
 
-    // Read the HMM from the file
-    std::string hmm_filepath = this->input_data.getHMMFilepath();
-    std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
-    CHMM hmm = ReadCHMM(hmm_filepath.c_str());
-
-    // Enable multi-threading
-    int num_threads = this->input_data.getThreadCount();
-    if (num_threads > 1) {
-        std::cout << "Running SV detection with " << num_threads << " thread(s)..." << std::endl;
-    }
-    hts_set_threads(fp_in, num_threads);
-
-    // Load the header for the BAM file
+    // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
     if (!bamHdr) {
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to read header for " + bam_filepath);
+        throw std::runtime_error("ERROR: failed to read header from " + bam_filepath);
     }
 
-    // Load the index for the BAM file
+    // Load the index
     hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
     if (!idx) {
         bam_hdr_destroy(bamHdr);
@@ -404,97 +331,196 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
     }
 
-    // Loop through each region and detect SVs in chunks
-    int chr_count = chromosomes.size();
-    int current_chr = 0;
-    std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
-    int chunk_count = 100;  // Number of chunks to split the chromosome into
-    uint32_t total_sv_count = 0;
-    std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
-    int min_cnv_length = this->input_data.getMinCNVLength();
-    for (const auto& chr : chromosomes) {
-        std::cout << "Running SV detection for chromosome " << chr << "..." << std::endl;
-
-        // Split the chromosome into chunks
-        std::vector<std::string> region_chunks;
-        if (this->input_data.isRegionSet()) {
-
-            // Use one chunk for the specified region
-            std::pair<int32_t, int32_t> region = this->input_data.getRegion();
-            int region_start = region.first;
-            int region_end = region.second;
-            std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
-            region_chunks.push_back(chunk);
-            std::cout << "Using specified region " << chunk << "..." << std::endl;
-            
-        } else {
-            int chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
-            int chunk_size = std::ceil((double)chr_len / chunk_count);
-            for (int i = 0; i < chunk_count; i++) {
-                int start = i * chunk_size + 1;  // 1-based
-                int end = start + chunk_size;
-                if (i == chunk_count - 1) {
-                    end = chr_len;
-                }
-                std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end);
-                region_chunks.push_back(chunk);
+    // Split the chromosome into chunks for memory efficiency
+    std::vector<std::string> region_chunks;
+    int chunk_count = 100;
+    if (this->input_data.isRegionSet()) {
+
+        // Use one chunk for the specified region
+        std::pair<int32_t, int32_t> region = this->input_data.getRegion();
+        int region_start = region.first;
+        int region_end = region.second;
+        std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
+        region_chunks.push_back(chunk);
+        // std::cout << "Using specified region " << chunk << "..." << std::endl;
+        
+    } else {
+        int chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
+        int chunk_size = std::ceil((double)chr_len / chunk_count);
+        for (int i = 0; i < chunk_count; i++) {
+            int start = i * chunk_size + 1;  // 1-based
+            int end = start + chunk_size;
+            if (i == chunk_count - 1) {
+                end = chr_len;
             }
-            std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks of size " << chunk_size << "..." << std::endl;
+            std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end);
+            region_chunks.push_back(chunk);
         }
+        printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "...");
+    }
 
-        // Load chromosome data for copy number predictions
-        std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
-        CNVCaller cnv_caller(this->input_data);
-        cnv_caller.loadChromosomeData(chr);
-
-        // Process each chunk one at a time
-        std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
-        int region_count = region_chunks.size();
-        int current_region = 0;
-        std::set<SVCall> combined_sv_calls;
-        for (const auto& sub_region : region_chunks) {
-            std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region);
-            std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
-            PrimaryMap& primary_map = std::get<1>(region_data);
-            SuppMap& supp_map = std::get<2>(region_data);
-            std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
-            mergeSVs(subregion_sv_calls);
-            int region_sv_count = getSVCount(subregion_sv_calls);
-            printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
-
-            // Run copy number variant predictions on the SVs detected from the
-            // CIGAR string, using a minimum CNV length threshold
-            if (region_sv_count > 0) {
-                std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
-                cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm);
-            }
+    // Load chromosome data for copy number predictions
+    // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
+    CNVCaller cnv_caller(this->input_data);
+    cnv_caller.loadChromosomeData(chr);
+
+    // Process each chunk one at a time
+    // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
+    int region_count = region_chunks.size();
+    int current_region = 0;
+    // std::set<SVCall> combined_sv_calls;
+    for (const auto& sub_region : region_chunks) {
+        std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region);
+        std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
+        PrimaryMap& primary_map = std::get<1>(region_data);
+        SuppMap& supp_map = std::get<2>(region_data);
+        // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
+        mergeSVs(subregion_sv_calls);
+        int region_sv_count = getSVCount(subregion_sv_calls);
+        // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+
+        // Run copy number variant predictions on the SVs detected from the
+        // CIGAR string, using a minimum CNV length threshold
+        if (region_sv_count > 0) {
+            // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
+            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm);
+        }
 
-            // Run split-read SV and copy number variant predictions
-            std::cout << "Detecting copy number variants from split reads..." << std::endl;
-            this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm);
+        // Run split-read SV and copy number variant predictions
+        // std::cout << "Detecting copy number variants from split reads..." << std::endl;
+        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm);
 
-            // Merge the SV calls from the current region
-            std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
-            mergeSVs(subregion_sv_calls);
+        // Merge the SV calls from the current region
+        // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
+        mergeSVs(subregion_sv_calls);
 
-            // Combine the SV calls from the current region
-            std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
-            concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
-            std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl;
-        }
-
-        std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." << std::endl;
-        int chr_sv_count = getSVCount(combined_sv_calls);
-        whole_genome_sv_calls[chr] = combined_sv_calls;
-        std::cout << "Total SVs detected for chromosome " << chr << ": " << chr_sv_count << std::endl;
-        total_sv_count += chr_sv_count;
-        std::cout << "Cumulative total SVs: " << total_sv_count << std::endl;
+        // Combine the SV calls from the current region
+        // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
+        concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
+        current_region++;
+        printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "...");
     }
 
     // Clean up the BAM file, header, and index
     hts_idx_destroy(idx);
     bam_hdr_destroy(bamHdr);
     sam_close(fp_in);
+}
+
+std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
+{
+    // Get the chromosomes to process
+    std::vector<std::string> chromosomes;
+    if (this->input_data.getChromosome() != "") {
+        chromosomes.push_back(this->input_data.getChromosome());
+    } else {
+        chromosomes = this->input_data.getRefGenomeChromosomes();
+    }
+
+    // Ignore all alternate contigs (contains 'alt', 'GL', 'NC', 'hs', etc.)
+    chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) {
+        return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos;
+    }), chromosomes.end());
+
+    // Read the HMM from the file
+    std::string hmm_filepath = this->input_data.getHMMFilepath();
+    std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
+    CHMM hmm = ReadCHMM(hmm_filepath.c_str());
+
+    // Set up threads for processing each chromosome
+    std::vector<std::future<void>> futures;
+    std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
+    std::mutex sv_mutex;
+
+    // Set a thread count for processing each chromosome. Keep it low to avoid
+    // memory issues.
+    int max_threads = 6;  // Number of chromosomes to process in parallel
+    int batch_count = 0;
+    int completed_threads = 0;
+    int chr_count = chromosomes.size();
+    for (const auto& chr : chromosomes) {
+        printMessage("Launching thread for chromosome " + chr + "...");
+        futures.push_back(std::async(std::launch::async, [&]() {
+            std::set<SVCall> sv_calls;
+            this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
+            {
+                std::lock_guard<std::mutex> lock(sv_mutex);
+                whole_genome_sv_calls[chr] = std::move(sv_calls);
+            }
+        }
+        ));
+        batch_count++;
+        if (batch_count >= max_threads || batch_count >= chr_count) {
+            // Wait for all threads to finish
+            // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)...");
+            for (auto& future : futures) {
+                future.get();
+                completed_threads++;
+                printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+            }
+            // completed_threads += batch_count;
+            // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+            batch_count = 0;
+            futures.clear();
+        }
+    }
+
+    // Wait for remaining threads to finish
+    if (futures.size() > 0) {
+        // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)...");
+        for (auto& future : futures) {
+            future.get();
+            completed_threads++;
+            printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+        }
+        // completed_threads += futures.size();
+        // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+    }
+
+    // // Loop through each region and detect SVs in chunks
+    // std::string bam_filepath = this->input_data.getLongReadBam();
+    // int chr_count = chromosomes.size();
+    // std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
+    // // int thread_count = this->input_data.getThreadCount();
+    // int thread_count = chr_count;
+    // int min_cnv_length = this->input_data.getMinCNVLength();
+    // for (const auto& chr : chromosomes) {
+    //     printMessage("Launching thread for chromosome " + chr + "...");
+    //     futures.push_back(std::async(std::launch::async, [&]() {
+    //         std::set<SVCall> sv_calls;
+    //         this->processChromosome(chr, bam_filepath, hmm, sv_calls, min_cnv_length);
+    //         {
+    //             std::lock_guard<std::mutex> lock(sv_mutex);
+    //             whole_genome_sv_calls[chr] = std::move(sv_calls);
+    //         }
+    //     }
+    //     ));
+    // }
+
+    // // Wait for all threads to finish
+    // printMessage("Waiting for all threads to finish for " + std::to_string(chr_count) + " chromosome(s)...");
+    // int threads_finished = 0;
+    // for (auto& future : futures) {
+    //     try{
+    //         // future.wait();
+    //         future.get();  // Wait and handle exceptions
+    //         threads_finished++;
+    //         printMessage("Completed " + std::to_string(threads_finished) + " of " + std::to_string(thread_count) + " threads...");
+    //     } catch (const std::exception& e) {
+    //         std::cerr << "Error in thread: " << e.what() << std::endl;
+    //     }
+    // }
+    printMessage("All threads have finished.");
+
+    // Print the total number of SVs detected for each chromosome
+    uint32_t total_sv_count = 0;
+    for (const auto& entry : whole_genome_sv_calls) {
+        std::string chr = entry.first;
+        int sv_count = getSVCount(entry.second);
+        total_sv_count += sv_count;
+        printMessage("Total SVs detected for chromosome " + chr + ": " + std::to_string(sv_count));
+    }
+    printMessage("Total SVs detected for all chromosomes: " + std::to_string(total_sv_count));
 
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
@@ -706,16 +732,10 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
     std::string sv_method = "CONTEXTSVv0.1";
     int skip_count = 0;
     int total_count = 0;
-    // std::set<std::string> chrs = this->getChromosomes();
-   //for (auto const& chr : chrs) {
-   for (const auto& pair : sv_calls) {
-        // if (this->sv_calls.find(chr) == this->sv_calls.end()) {
-        //     continue;
-        // }
+    for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
         const std::set<SVCall>& sv_calls = pair.second;
         std::cout << "Saving SV calls for " << chr << "..." << std::endl;
-        // for (auto const& sv_call : this->sv_calls[chr]) {
         for (const auto& sv_call : sv_calls) {
             // Get the SV candidate and SV info
             uint32_t start = sv_call.start;

From 128575ab4d5dbf1de07bdb907435953fffaddd4c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 26 Nov 2024 22:04:11 -0500
Subject: [PATCH 033/134] implement thread pooling

---
 src/cnv_caller.cpp |   4 +-
 src/sv_caller.cpp  | 129 ++++++++++++++++++++++++---------------------
 src/sv_object.cpp  |   6 +--
 3 files changed, 75 insertions(+), 64 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index fd7a6c7c..a38c37a4 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -223,9 +223,9 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &
 {
     int window_size = this->input_data.getWindowSize();
     double mean_chr_cov = this->mean_chr_cov;  
-    printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
+    // printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
     runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov);
-    printMessage("Finished predicting copy number states for chromosome " + chr + "...");
+    // printMessage("Finished predicting copy number states for chromosome " + chr + "...");
 }
 
 void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov)
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index aa32dda2..bdcd081c 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -18,6 +18,7 @@
 #include <cmath>
 #include <algorithm>
 #include <fstream>
+#include <condition_variable>
 
 #include "utils.h"
 #include "sv_types.h"
@@ -428,89 +429,99 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     CHMM hmm = ReadCHMM(hmm_filepath.c_str());
 
     // Set up threads for processing each chromosome
+    // const int max_threads = 6;
+    const int max_threads = 10;
     std::vector<std::future<void>> futures;
     std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;
+    std::condition_variable cv;
+    int active_threads = 0;
 
-    // Set a thread count for processing each chromosome. Keep it low to avoid
-    // memory issues.
-    int max_threads = 6;  // Number of chromosomes to process in parallel
-    int batch_count = 0;
-    int completed_threads = 0;
-    int chr_count = chromosomes.size();
-    for (const auto& chr : chromosomes) {
+    // Lambda to process a chromosome
+    auto process_chr = [&](const std::string& chr) {
         printMessage("Launching thread for chromosome " + chr + "...");
-        futures.push_back(std::async(std::launch::async, [&]() {
-            std::set<SVCall> sv_calls;
-            this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
-            {
-                std::lock_guard<std::mutex> lock(sv_mutex);
-                whole_genome_sv_calls[chr] = std::move(sv_calls);
-            }
+        std::set<SVCall> sv_calls;
+        this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
+        {
+            std::lock_guard<std::mutex> lock(sv_mutex);
+            whole_genome_sv_calls[chr] = std::move(sv_calls);
         }
-        ));
-        batch_count++;
-        if (batch_count >= max_threads || batch_count >= chr_count) {
-            // Wait for all threads to finish
-            // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)...");
-            for (auto& future : futures) {
-                future.get();
-                completed_threads++;
-                printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
-            }
-            // completed_threads += batch_count;
-            // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
-            batch_count = 0;
-            futures.clear();
+        printMessage("Completed chromosome " + chr);
+
+        // Notify thread completion
+        {
+            std::lock_guard<std::mutex> lock(sv_mutex);
+            active_threads--;
+        }
+        cv.notify_one();
+    };
+
+    // Thread management
+    std::vector<std::thread> threads;
+    for (const auto& chr : chromosomes) {
+        {
+            std::unique_lock<std::mutex> lock(sv_mutex);
+            cv.wait(lock, [&] { return active_threads < max_threads; });
+            active_threads++;
         }
+
+        // Launch a new thread
+        threads.emplace_back(process_chr, chr);
     }
 
-    // Wait for remaining threads to finish
-    if (futures.size() > 0) {
-        // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)...");
-        for (auto& future : futures) {
-            future.get();
-            completed_threads++;
-            printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+    // Wait for all threads to complete
+    for (auto& thread : threads) {
+        if (thread.joinable()) {
+            thread.join();
         }
-        // completed_threads += futures.size();
-        // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
     }
 
-    // // Loop through each region and detect SVs in chunks
-    // std::string bam_filepath = this->input_data.getLongReadBam();
+    printMessage("All threads have finished.");
+
+    /////////////////////////////////////////////////
+
+    // // Set a thread count for processing each chromosome. Keep it low to avoid
+    // // memory issues.
+    // int batch_count = 0;
+    // int completed_threads = 0;
     // int chr_count = chromosomes.size();
-    // std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl;
-    // // int thread_count = this->input_data.getThreadCount();
-    // int thread_count = chr_count;
-    // int min_cnv_length = this->input_data.getMinCNVLength();
     // for (const auto& chr : chromosomes) {
     //     printMessage("Launching thread for chromosome " + chr + "...");
     //     futures.push_back(std::async(std::launch::async, [&]() {
     //         std::set<SVCall> sv_calls;
-    //         this->processChromosome(chr, bam_filepath, hmm, sv_calls, min_cnv_length);
+    //         this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
     //         {
     //             std::lock_guard<std::mutex> lock(sv_mutex);
     //             whole_genome_sv_calls[chr] = std::move(sv_calls);
     //         }
     //     }
     //     ));
+    //     batch_count++;
+    //     if (batch_count >= max_threads || batch_count >= chr_count) {
+    //         // Wait for all threads to finish
+    //         // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)...");
+    //         for (auto& future : futures) {
+    //             future.get();
+    //             completed_threads++;
+    //             printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+    //         }
+    //         // completed_threads += batch_count;
+    //         // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
+    //         batch_count = 0;
+    //         futures.clear();
+    //     }
     // }
 
-    // // Wait for all threads to finish
-    // printMessage("Waiting for all threads to finish for " + std::to_string(chr_count) + " chromosome(s)...");
-    // int threads_finished = 0;
-    // for (auto& future : futures) {
-    //     try{
-    //         // future.wait();
-    //         future.get();  // Wait and handle exceptions
-    //         threads_finished++;
-    //         printMessage("Completed " + std::to_string(threads_finished) + " of " + std::to_string(thread_count) + " threads...");
-    //     } catch (const std::exception& e) {
-    //         std::cerr << "Error in thread: " << e.what() << std::endl;
+    // // Wait for remaining threads to finish
+    // if (futures.size() > 0) {
+    //     // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)...");
+    //     for (auto& future : futures) {
+    //         future.get();
+    //         completed_threads++;
+    //         printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
     //     }
     // }
-    printMessage("All threads have finished.");
+    // printMessage("All threads have finished.");
 
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
@@ -650,9 +661,9 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
     }
 
     // Print the number of SVs detected from split-read alignments
-    if (sv_count > 0) {
-        std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl;
-    }
+    // if (sv_count > 0) {
+    //     std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl;
+    // }
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall> >& sv_calls)
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index c28d6b21..09203b59 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -158,7 +158,7 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
     }
 
     // Merge SV calls if they overlap by at least 50%
-    int initial_size = sv_calls.size();
+    // int initial_size = sv_calls.size();
     std::vector<SVCall> merged_sv_calls;
     auto it = sv_calls.begin();
     SVCall current_merge = *it++;
@@ -214,6 +214,6 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
     for (const auto& sv_call : merged_sv_calls) {
         sv_calls.insert(sv_call);
     }
-    int updated_size = sv_calls.size();
-    std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
+    // int updated_size = sv_calls.size();
+    // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }

From d62fe120fd2dd919eb417511cd6f021732e0ecba Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 27 Nov 2024 14:46:55 -0500
Subject: [PATCH 034/134] fix duplication error

---
Notes:
    Summary of the hunks in this patch:

    * Makefile-cpp: adds -g to CXXFLAGS.
    * src/cnv_caller.cpp (readSNPPopulationFrequencies): comments out the
      printError() call made when the allele-frequency file path for the
      chromosome is empty; the early return is kept.
    * src/sv_caller.cpp, insertion/duplication check: the window loop now
      starts at std::max(0, pos - op_len) instead of pos - op_len, so the
      loop variable can no longer start below zero. Presumably this is
      the "duplication error" named in the subject -- TODO confirm.
    * src/sv_caller.cpp (processChromosome): adds printMessage() progress
      lines before each stage and moves current_region++ from the end of
      the loop body to its start; the increment still happens before the
      "Completed N of M region(s)" message.
    * src/sv_caller.cpp (run): max_threads changes from 10 to 8, and two
      commented-out chromosome test subsets are added.

    NOTE(review): after this patch a missing allele-frequency file makes
    readSNPPopulationFrequencies return without any diagnostic -- confirm
    that the silence is intended.
    NOTE(review): -g is added unconditionally to CXXFLAGS -- confirm it is
    wanted for every build and not only for debugging.

 Makefile-cpp       |  2 +-
 src/cnv_caller.cpp |  2 +-
 src/sv_caller.cpp  | 33 ++++++++++++++++++++++++++-------
 3 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/Makefile-cpp b/Makefile-cpp
index e6ba7d30..2b117f0e 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -11,7 +11,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 
 # Compiler and Flags
 CXX := g++
-CXXFLAGS := -std=c++14 -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
+CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 
 # Link htslib
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index a38c37a4..feace2e8 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -769,7 +769,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
     if (pfb_filepath == "")
     {
-        printError("No population frequency file provided for chromosome " + chr);
+        // printError("No population frequency file provided for chromosome " + chr);
         return;
     }
     
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index bdcd081c..b38e6fe8 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -173,14 +173,15 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
                 // To determine whether the insertion is a duplication, check
                 // for sequence identity between the insertion and the
-                // reference genome (duplications are typically >= 90%)
-
+                // reference genome (duplications are typically >= 90%):
                 // Loop through the reference sequence and calculate the
                 // sequence identity +/- insertion length from the insertion
                 // position.
                 bool is_duplication = false;
                 int ins_ref_pos;
-                for (int j = pos - op_len; j <= pos; j++) {
+                int dup_start = std::max(0, pos - op_len);
+                // for (int j = pos - op_len; j <= pos; j++) {
+                for (int j = dup_start; j <= pos; j++) {
 
                     // Get the string for the window (1-based coordinates)
                     ins_ref_pos = j + 1;
@@ -267,6 +268,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
             // Get the corresponding reference sequence
             int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
+            // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
             std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
 
             // Check that the two sequence lengths are equal
@@ -362,6 +364,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
 
     // Load chromosome data for copy number predictions
     // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
+    printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
     cnv_caller.loadChromosomeData(chr);
 
@@ -371,11 +374,14 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     int current_region = 0;
     // std::set<SVCall> combined_sv_calls;
     for (const auto& sub_region : region_chunks) {
+        current_region++;
+        printMessage(chr + ": CIGAR SVs...");
         std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region);
         std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
         PrimaryMap& primary_map = std::get<1>(region_data);
         SuppMap& supp_map = std::get<2>(region_data);
         // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
+        printMessage(chr + ": Merging CIGAR...");
         mergeSVs(subregion_sv_calls);
         int region_sv_count = getSVCount(subregion_sv_calls);
         // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
@@ -384,21 +390,25 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // CIGAR string, using a minimum CNV length threshold
         if (region_sv_count > 0) {
             // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
+            printMessage(chr + ": CIGAR predictions...");
             cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm);
         }
 
         // Run split-read SV and copy number variant predictions
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
+        printMessage(chr + ": Split read SVs...");
         this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
+        printMessage(chr + ": Merging split reads...");
         mergeSVs(subregion_sv_calls);
 
         // Combine the SV calls from the current region
         // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
+        printMessage(chr + ": Concatenating calls...");
         concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
-        current_region++;
+
         printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "...");
     }
 
@@ -422,15 +432,24 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) {
         return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos;
     }), chromosomes.end());
-
+    
+    /*
+    // Test only on a subset 241125_ALL/output.merged.vcf
+    chromosomes = {"chr2", "chr3", "chr5", "chr6", "chr7", "chr4"};
+    */
+    
+    
+    // Test only on a subset 241125_ALL/output.merged.vcf
+    // chromosomes = {"chrM", "chr8", "chr9", "chr10", "chr11", "chr1"};
+        
     // Read the HMM from the file
     std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     CHMM hmm = ReadCHMM(hmm_filepath.c_str());
 
     // Set up threads for processing each chromosome
-    // const int max_threads = 6;
-    const int max_threads = 10;
+    const int max_threads = 8;
+    // const int max_threads = 10;
     std::vector<std::future<void>> futures;
     std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;

From da4d72f142ee7727ab8f6c277075202344297cd9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 30 Nov 2024 12:27:08 -0500
Subject: [PATCH 035/134] add arguments and version and reduce copies

---
Notes:
    Summary of the hunks in this patch:

    * Makefile-cpp: defines VERSION from `git describe --tags --always`
      and VERSION_HEADER as $(INCL_DIR)/version.h, with two echo recipe
      lines that write the header; removes two comment lines near LDLIBS.
    * include/version.h: new file defining VERSION as
      "v0,1,0-41-gd62fe12".
    * include/cnv_caller.h, include/sv_caller.h, src/cnv_caller.cpp,
      src/sv_caller.cpp: CHMM parameters change from by-value to
      const CHMM&.
    * include/khmm.h: the scalar members of CHMM (N, M, B1_uf, B2_uf,
      NP_flag, B3_uf, dist) get default member initializers.
    * src/cnv_caller.cpp (runCopyNumberPrediction): adds a printMessage()
      call before every runViterbi() call.
    * src/khmm.cpp, src/input_data.cpp: remove commented-out code only.
    * src/main.cpp: replaces the positional command line with option
      parsing (parseArguments, printUsage); runContextSV now takes the
      parsed string map. -b, -r, -s and -o are required, otherwise the
      program exits with status 1.
    * src/sv_caller.cpp (run): max_threads is read from
      input_data.getThreadCount(), the HMM is bound as const CHMM& to the
      value returned by ReadCHMM(), and active-thread messages are added.

    NOTE(review): in Makefile-cpp the recipe lines directly follow
    ".PHONY: $(VERSION_HEADER)" and no "$(VERSION_HEADER):" rule is
    visible in this hunk, so the recipe looks attached to .PHONY itself
    -- confirm the header is really regenerated.
    NOTE(review): version.h is generated by the Makefile and also
    committed; the committed string contains commas ("v0,1,0") --
    confirm the tag name.
    NOTE(review): parseArguments tests "-r" for --ref before --region, so
    "-r" can never select the region; printUsage lists "-r" for both.
    NOTE(review): "-h" followed by another argument is consumed as
    --hmm; the help branch is reached for "-h" only when it is the last
    argument. printUsage lists "-h" for both options.
    NOTE(review): "-v" is accepted for the version but is not listed in
    printUsage.
    NOTE(review): an unknown option, or a value option given last
    without its value, only prints "Unknown option" and parsing
    continues.
    NOTE(review): the std::stoi() calls on thread-count, window-size and
    min-cnv are not guarded; non-numeric input would throw -- confirm
    whether a usage error is preferred.
    NOTE(review): the printMessage() before every runViterbi() call logs
    once per SV candidate -- confirm this is not leftover debug output.

 Makefile-cpp         |  10 ++-
 include/cnv_caller.h |   8 +-
 include/khmm.h       |  14 ++--
 include/sv_caller.h  |   4 +-
 include/version.h    |   2 +
 src/cnv_caller.cpp   |   9 ++-
 src/input_data.cpp   |   2 -
 src/khmm.cpp         |  38 ---------
 src/main.cpp         | 189 +++++++++++++++++++++++++++++++------------
 src/sv_caller.cpp    |  69 +++-------------
 10 files changed, 173 insertions(+), 172 deletions(-)
 create mode 100644 include/version.h

diff --git a/Makefile-cpp b/Makefile-cpp
index 2b117f0e..58139e9c 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -4,6 +4,13 @@ SRC_DIR := $(CURDIR)/src
 BUILD_DIR := $(CURDIR)/build
 LIB_DIR := $(CURDIR)/lib
 
+# Version header
+VERSION := $(shell git describe --tags --always)
+VERSION_HEADER := $(INCL_DIR)/version.h
+.PHONY: $(VERSION_HEADER)
+	@echo "#pragma once" > $@
+	@echo "#define VERSION \"$(VERSION)\"" >> $@
+
 # Conda environment directories
 CONDA_PREFIX := $(shell echo $$CONDA_PREFIX)
 CONDA_INCL_DIR := $(CONDA_PREFIX)/include
@@ -13,10 +20,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 CXX := g++
 CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
-
-# Link htslib
 LDLIBS := -lhts  # Link with libhts.a or libhts.so
-# LDLIBS := -lmylib  # Link with libraries in LIB_DIR, e.g., libmylib.a or libmylib.so
 
 # Sources and Output
 # SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 9f16c5f6..be3c1479 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -85,7 +85,7 @@ class CNVCaller {
 
         void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
-        std::pair<std::vector<int>, double> runViterbi(CHMM hmm, SNPData &snp_data);
+        std::pair<std::vector<int>, double> runViterbi(const CHMM& hmm, SNPData &snp_data);
 
         // Query a region for SNPs and return the SNP data
         std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
@@ -93,7 +93,7 @@ class CNVCaller {
         void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb);
 
         // Run copy number prediction for a chunk of SV candidates from CIGAR strings
-        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov);
+        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov);
 
         void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
 
@@ -116,11 +116,11 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, CHMM hmm);
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, const CHMM& hmm);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
-        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length, CHMM hmm);
+        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length, const CHMM& hmm);
 
         // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr);
diff --git a/include/khmm.h b/include/khmm.h
index 8f86e7a4..9585635f 100644
--- a/include/khmm.h
+++ b/include/khmm.h
@@ -13,22 +13,22 @@
 // Struct for HMM (C++ RAII style)
 struct CHMM
 {
-	int N;	// Number of states
-	int M; 	// Number of observation symbols
+	int N = 0;	// Number of states
+	int M = 0; 	// Number of observation symbols
 	std::vector<std::vector<double>> A;  // Transition probability matrix
 	std::vector<std::vector<double>> B;  // Emission probability matrix
 	std::vector<double> pi;  // Initial state distribution
 	std::vector<double> B1_mean;  // Mean of a continuous Gaussian distribution for state 1 through N
 	std::vector<double> B1_sd;  // Standard deviation of B1 values, which is the same for all states
-	double B1_uf;  // B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model
+	double B1_uf = 0.0;  // B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model
 	std::vector<double> B2_mean;  // B2_mean[1..4] is the average of B_allele_freq
 	std::vector<double> B2_sd;  // B2_sd[1..4] is the standard deviation of four B_allele_freq, B2_sd[5] is specially for state1, where B is modelled as a wide normal distribution
-	double B2_uf;  // B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model
-	int NP_flag;
+	double B2_uf = 0.0;  // B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model
+	int NP_flag = 0;
 	std::vector<double> B3_mean;
 	std::vector<double> B3_sd;
-	double B3_uf;
-	int dist;
+	double B3_uf = 0.0;
+	int dist = 0;
 };
 
 
diff --git a/include/sv_caller.h b/include/sv_caller.h
index ec89aa8a..cdafab0b 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -38,7 +38,7 @@ class SVCaller {
         // mismatch rate, and the start and end positions of the query sequence
         std::tuple<std::unordered_map<int, int>, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, bool is_primary);
 
-        void processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length);
+        void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
@@ -49,7 +49,7 @@ class SVCaller {
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm);
+        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
diff --git a/include/version.h b/include/version.h
new file mode 100644
index 00000000..d38178a8
--- /dev/null
+++ b/include/version.h
@@ -0,0 +1,2 @@
+#pragma once
+#define VERSION "v0,1,0-41-gd62fe12"
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index feace2e8..dac33183 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -41,7 +41,7 @@ CNVCaller::CNVCaller(InputData &input_data)
 }
 
 // Function to call the Viterbi algorithm for the CHMM
-std::pair<std::vector<int>, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp_data)
+std::pair<std::vector<int>, double> CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data)
 {
     int data_count = (int) snp_data.pos.size();
     if (data_count == 0)
@@ -144,7 +144,7 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     return std::make_pair(snp_data, snps_found);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, CHMM hmm)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, const CHMM& hmm)
 {
      // Get the start and end positions of the SV call
     uint32_t start_pos = std::get<0>(candidate);
@@ -162,6 +162,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     bool sv_snps_found = snp_call.second;
 
     // Run the Viterbi algorithm
+    printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")...");
     std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
@@ -219,7 +220,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length, CHMM hmm)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length, const CHMM& hmm)
 {
     int window_size = this->input_data.getWindowSize();
     double mean_chr_cov = this->mean_chr_cov;  
@@ -228,7 +229,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &
     // printMessage("Finished predicting copy number states for chromosome " + chr + "...");
 }
 
-void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov)
+void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov)
 {
     // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "...");
     // Map with counts for each CNV type
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 572ed92a..e152a6cf 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -217,8 +217,6 @@ bool InputData::isRegionSet()
 
 void InputData::setAlleleFreqFilepaths(std::string filepath)
 {
-    // this->pfb_filepath = filepath;
-
     // Check if empty string
     if (filepath == "")
     {
diff --git a/src/khmm.cpp b/src/khmm.cpp
index bdb6eb8b..d375a1a0 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -459,17 +459,6 @@ CHMM ReadCHMM(const std::string filename)
 		throw std::runtime_error("Error reading A");
 	}
 
-	// Print A
-	// std::cout << "A: " << std::endl;
-	// for (int i = 0; i < hmm.N; i++)
-	// {
-	// 	for (int j = 0; j < hmm.N; j++)
-	// 	{
-	// 		std::cout << std::setprecision(10) << hmm.A[i][j] << " ";
-	// 	}
-	// 	std::cout << std::endl;
-	// }
-
 	// Read B
 	std::getline(file, line);
 	if (line != "B:")
@@ -494,13 +483,6 @@ CHMM ReadCHMM(const std::string filename)
 		throw std::runtime_error("Error reading pi");
 	}
 
-	// Print pi
-	// std::cout << "pi: ";
-	// for (int i = 0; i < hmm.N; i++)
-	// {
-	// 	std::cout << std::setprecision(10) << hmm.pi[i] << " ";
-	// }
-
 	// Read B1_mean
 	std::getline(file, line);
 	if (line != "B1_mean:")
@@ -513,13 +495,6 @@ CHMM ReadCHMM(const std::string filename)
 		throw std::runtime_error("Error reading B1_mean");
 	}
 
-	// Print B1_mean
-	// std::cout << "B1_mean: ";
-	// for (int i = 0; i < hmm.N; i++)
-	// {
-	// 	std::cout << std::setprecision(10) << hmm.B1_mean[i] << " ";
-	// }
-
 	// Read B1_sd
 	std::getline(file, line);
 	if (line != "B1_sd:")
@@ -532,13 +507,6 @@ CHMM ReadCHMM(const std::string filename)
 		throw std::runtime_error("Error reading B1_sd");
 	}
 
-	// Print B1_sd
-	// std::cout << "B1_sd: ";
-	// for (int i = 0; i < hmm.N; i++)
-	// {
-	// 	std::cout << std::setprecision(10) << hmm.B1_sd[i] << " ";
-	// }
-
 	// Read B1_uf
 	std::getline(file, line);
 	if (line != "B1_uf:")
@@ -552,9 +520,6 @@ CHMM ReadCHMM(const std::string filename)
 		throw std::runtime_error("Error reading B1_uf");
 	}
 
-	// Print B1_uf
-	// std::cout << "B1_uf: " << std::setprecision(10) << hmm.B1_uf << std::endl;
-
 	// Read B2_mean
 	std::getline(file, line);
 	if (line != "B2_mean:")
@@ -592,9 +557,6 @@ CHMM ReadCHMM(const std::string filename)
 		throw std::runtime_error("Error reading B2_uf");
 	}
 
-	// Print B2_uf
-	// std::cout << "B2_uf: " << std::setprecision(10) << hmm.B2_uf << std::endl;
-
 	return hmm;
 }
 
diff --git a/src/main.cpp b/src/main.cpp
index 558e2493..1d78ad5f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,80 +1,165 @@
 
 #include "swig_interface.h"
 #include "input_data.h"
+#include "version.h"
 
 /// @cond DOXYGEN_IGNORE
 #include <iostream>
 #include <string>
+// #include <optional>
 /// @endcond
 
 // Placeholder for ContextSV library includes
 // #include "ContextSV.h"
 
-void runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1, const std::string& hmmFile = "", int windowSize = 2500, int minCNV = 2500, const std::string& eth = "", const std::string& pfbFile = "")
+void runContextSV(const std::unordered_map<std::string, std::string>& args)
 {
     // Placeholder for setting up input data and running ContextSV
-    std::cout << "Running ContextSV with the following files:" << std::endl;
-    std::cout << "BAM file: " << bamFile << std::endl;
-    std::cout << "Reference file: " << refFile << std::endl;
-    std::cout << "VCF file: " << vcfFile << std::endl;
-    std::cout << "Thread count: " << threadCount << std::endl;
-    std::cout << "Output directory: " << outputDir << std::endl;
+    std::cout << "ContextSV version " << VERSION << std::endl;
+    std::cout << "Input parameters:" << std::endl;
+    for (const auto& arg : args) {
+        std::cout << arg.first << ": " << arg.second << std::endl;
+    }
 
     // Set up input data
     InputData input_data;
-    input_data.setShortReadBam(bamFile);
-    input_data.setLongReadBam(bamFile);
-    input_data.setRefGenome(refFile);
-    input_data.setSNPFilepath(vcfFile);
-    //input_data.setChromosome("21");
-    //input_data.setRegion("14486099-14515105");
-    input_data.setThreadCount(threadCount);
-    input_data.setAlleleFreqFilepaths(pfbFile);
-    input_data.setHMMFilepath(hmmFile);
-    input_data.setOutputDir(outputDir);
-    input_data.saveCNVData(false);
-    input_data.setThreadCount(threadCount);
-    input_data.setWindowSize(windowSize);
-    input_data.setMinCNVLength(minCNV);
+    input_data.setLongReadBam(args.at("bam-file"));
+    input_data.setShortReadBam(args.at("bam-file"));
+    input_data.setRefGenome(args.at("ref-file"));
+    input_data.setSNPFilepath(args.at("snps-file"));
+    input_data.setOutputDir(args.at("output-dir"));
+    if (args.find("chr") != args.end()) {
+        input_data.setChromosome(args.at("chr"));
+    }
+    if (args.find("region") != args.end()) {
+        input_data.setRegion(args.at("region"));
+    }
+    if (args.find("thread-count") != args.end()) {
+        input_data.setThreadCount(std::stoi(args.at("thread-count")));
+    }
+    if (args.find("hmm-file") != args.end()) {
+        input_data.setHMMFilepath(args.at("hmm-file"));
+    }
+    if (args.find("window-size") != args.end()) {
+        input_data.setWindowSize(std::stoi(args.at("window-size")));
+    }
+    if (args.find("min-cnv") != args.end()) {
+        input_data.setMinCNVLength(std::stoi(args.at("min-cnv")));
+    }
+    if (args.find("eth") != args.end()) {
+        input_data.setEthnicity(args.at("eth"));
+    }
+    if (args.find("pfb-file") != args.end()) {
+        input_data.setAlleleFreqFilepaths(args.at("pfb-file"));
+    }
+    if (args.find("save-cnv") != args.end()) {
+        input_data.saveCNVData(true);
+    }
+    if (args.find("debug") != args.end()) {
+        input_data.setVerbose(true);
+    }
+    // input_data.setShortReadBam(bamFile);
+    // input_data.setLongReadBam(bamFile);
+    // input_data.setRefGenome(refFile);
+    // input_data.setSNPFilepath(vcfFile);
+    // //input_data.setChromosome("21");
+    // //input_data.setRegion("14486099-14515105");
+    // input_data.setThreadCount(threadCount);
+    // input_data.setAlleleFreqFilepaths(pfbFile);
+    // input_data.setHMMFilepath(hmmFile);
+    // input_data.setOutputDir(outputDir);
+    // input_data.saveCNVData(false);
+    // input_data.setThreadCount(threadCount);
+    // input_data.setWindowSize(windowSize);
+    // input_data.setMinCNVLength(minCNV);
 
     // Run ContextSV
     run(input_data);
 }
 
-int main(int argc, char* argv[]) {
-    if (argc < 6) {
-        std::cerr << "Usage: " << argv[0] << " <bam_file> <ref_file> <vcf_file> <output_dir> <thread_count>" << std::endl;
-        return 1;
-    }
+void printUsage(const std::string& programName) {
+    std::cerr << "Usage: " << programName << " [options]\n"
+                << "Options:\n"
+                << "  -b, --bam <bam_file>          Long-read BAM file (required)\n"
+                << "  -r, --ref <ref_file>          Reference genome FASTA file (required)\n"
+                << "  -s, --snp <vcf_file>          SNPs VCF file (required)\n"
+                << "  -o, --outdir <output_dir>     Output directory (required)\n"
+                << "  -c, --chr <chromosome>        Chromosome\n"
+                << "  -r, --region <region>         Region (e.g., 14486099-14515105)\n"
+                << "  -t, --threads <thread_count>  Number of threads\n"
+                << "  -h, --hmm <hmm_file>          HMM file\n"
+                << "  -w, --window <window_size>    Window size\n"
+                << "     --min-cnv <min_length>     Minimum CNV length\n"
+                << "  -e, --eth <eth_file>          ETH file\n"
+                << "  -p, --pfb <pfb_file>          PFB file\n"
+                << "     --save-cnv                 Save CNV data\n"
+                << "     --debug                    Debug mode\n"
+                << "     --version                  Print version and exit\n"
+                << "  -h, --help                    Print usage and exit\n";
+}
+
+std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv[]) {
+    std::unordered_map<std::string, std::string> args;
+    for (int i = 1; i < argc; i++) {
+        std::string arg = argv[i];
 
-    std::string bamFile = argv[1];
-    std::string refFile = argv[2];
-    std::string vcfFile = argv[3];
-    std::string outputDir = argv[4];
-    int threadCount = std::stoi(argv[5]);
+        // Handle short and long options
+        if ((arg == "-b" || arg == "--bam") && i + 1 < argc) {
+            args["bam-file"] = argv[++i];
+        } else if ((arg == "-r" || arg == "--ref") && i + 1 < argc) {
+            args["ref-file"] = argv[++i];
+        } else if ((arg == "-s" || arg == "--snp") && i + 1 < argc) {
+            args["snps-file"] = argv[++i];
+        } else if ((arg == "-o" || arg == "--outdir") && i + 1 < argc) {
+            args["output-dir"] = argv[++i];
+        } else if ((arg == "-c" || arg == "--chr") && i + 1 < argc) {
+            args["chr"] = argv[++i];
+        } else if ((arg == "-r" || arg == "--region") && i + 1 < argc) {
+            args["region"] = argv[++i];
+        } else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) {
+            args["thread-count"] = argv[++i];
+        } else if ((arg == "-h" || arg == "--hmm") && i + 1 < argc) {
+            args["hmm-file"] = argv[++i];
+        } else if ((arg == "-w" || arg == "--window") && i + 1 < argc) {
+            args["window-size"] = argv[++i];
+        } else if (arg == "--min-cnv" && i + 1 < argc) {
+            args["min-cnv"] = argv[++i];
+        } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) {
+            args["eth"] = argv[++i];
+        } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) {
+            args["pfb-file"] = argv[++i];
+        } else if (arg == "--save-cnv") {
+            args["save-cnv"] = "true";
+        } else if (arg == "--debug") {
+            args["debug"] = "true";
+        } else if ((arg == "-v" || arg == "--version")) {
+            std::cout << "ContextSV version " << VERSION << std::endl;
+            exit(0);
+        } else if (arg == "-h" || arg == "--help") {
+            printUsage(argv[0]);
+            exit(0);
+        } else {
+            std::cerr << "Unknown option: " << arg << std::endl;
+        }
+    }
 
-    std::string hmmFile = "";
-    int windowSize = 2500;
-    int minCNV = 2500;
-    std::string eth = "";
-    std::string pfbFile = "";
-    if (argc == 11) {
-        hmmFile = argv[6];
-        windowSize = std::stoi(argv[7]);
-        minCNV = std::stoi(argv[8]);
-        eth = argv[9];
-        pfbFile = argv[10];
+    // Check for required arguments
+    bool hasLR = args.find("bam-file") != args.end();
+    bool hasOutput = args.find("output-dir") != args.end();
+    bool hasRef = args.find("ref-file") != args.end();
+    bool hasSNPs = args.find("snps-file") != args.end();
+    bool requiredArgs = hasLR && hasOutput && hasRef && hasSNPs;
+    if (!requiredArgs) {
+        std::cerr << "Missing required argument(s): -b/--bam, -r/--ref, -s/--snp, -o/--outdir" << std::endl;
+        exit(1);
     }
-    
-    runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, hmmFile, windowSize, minCNV, eth, pfbFile);
-    
-    //std::string hmmFile = argv[6];
-    //int windowSize = std::stoi(argv[7]);
-    //int minCNV = std::stoi(argv[8]);
-    //std::string eth = argv[9];
-    //std::string pfbFile = argv[10];
-    
-    //runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, "", 2500, 2500, "", "");
+
+    return args;
+}
+
+int main(int argc, char* argv[]) {
+    auto args = parseArguments(argc, argv);
+    runContextSV(args);
 
     return 0;
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b38e6fe8..b9038e5c 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -311,7 +311,7 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
     return std::tuple<std::unordered_map<int, int>, int32_t, int32_t>(query_match_map, query_start, query_end);
 }
 
-void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length)
+void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length)
 {
     // Open the BAM file
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
@@ -432,24 +432,15 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) {
         return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos;
     }), chromosomes.end());
-    
-    /*
-    // Test only on a subset 241125_ALL/output.merged.vcf
-    chromosomes = {"chr2", "chr3", "chr5", "chr6", "chr7", "chr4"};
-    */
-    
-    
-    // Test only on a subset 241125_ALL/output.merged.vcf
-    // chromosomes = {"chrM", "chr8", "chr9", "chr10", "chr11", "chr1"};
         
     // Read the HMM from the file
     std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
-    CHMM hmm = ReadCHMM(hmm_filepath.c_str());
+    const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
 
     // Set up threads for processing each chromosome
-    const int max_threads = 8;
-    // const int max_threads = 10;
+    const int max_threads = this->input_data.getThreadCount();
+    std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
     std::vector<std::future<void>> futures;
     std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;
@@ -458,7 +449,7 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
 
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
-        printMessage("Launching thread for chromosome " + chr + "...");
+        // printMessage("Launching thread for chromosome " + chr + "...");
         std::set<SVCall> sv_calls;
         this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
         {
@@ -471,6 +462,7 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
         {
             std::lock_guard<std::mutex> lock(sv_mutex);
             active_threads--;
+            printMessage("Active threads: " + std::to_string(active_threads));
         }
         cv.notify_one();
     };
@@ -480,8 +472,10 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     for (const auto& chr : chromosomes) {
         {
             std::unique_lock<std::mutex> lock(sv_mutex);
+            printMessage("Waiting for thread slot. Active threads: " + std::to_string(active_threads));
             cv.wait(lock, [&] { return active_threads < max_threads; });
             active_threads++;
+            printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads));
         }
 
         // Launch a new thread
@@ -497,51 +491,6 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
 
     printMessage("All threads have finished.");
 
-    /////////////////////////////////////////////////
-
-    // // Set a thread count for processing each chromosome. Keep it low to avoid
-    // // memory issues.
-    // int batch_count = 0;
-    // int completed_threads = 0;
-    // int chr_count = chromosomes.size();
-    // for (const auto& chr : chromosomes) {
-    //     printMessage("Launching thread for chromosome " + chr + "...");
-    //     futures.push_back(std::async(std::launch::async, [&]() {
-    //         std::set<SVCall> sv_calls;
-    //         this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
-    //         {
-    //             std::lock_guard<std::mutex> lock(sv_mutex);
-    //             whole_genome_sv_calls[chr] = std::move(sv_calls);
-    //         }
-    //     }
-    //     ));
-    //     batch_count++;
-    //     if (batch_count >= max_threads || batch_count >= chr_count) {
-    //         // Wait for all threads to finish
-    //         // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)...");
-    //         for (auto& future : futures) {
-    //             future.get();
-    //             completed_threads++;
-    //             printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
-    //         }
-    //         // completed_threads += batch_count;
-    //         // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
-    //         batch_count = 0;
-    //         futures.clear();
-    //     }
-    // }
-
-    // // Wait for remaining threads to finish
-    // if (futures.size() > 0) {
-    //     // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)...");
-    //     for (auto& future : futures) {
-    //         future.get();
-    //         completed_threads++;
-    //         printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)");
-    //     }
-    // }
-    // printMessage("All threads have finished.");
-
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
     for (const auto& entry : whole_genome_sv_calls) {
@@ -561,7 +510,7 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm)
+void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm)
 {
     // Find split-read SV evidence
     int sv_count = 0;

From ea06c1cd08c5a0d8e0a1b7d1cab8006efde24b8f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 30 Nov 2024 19:31:58 -0500
Subject: [PATCH 036/134] Fix errors and reduce copying

---
 include/cnv_caller.h |  54 ++-----
 include/cnv_data.h   |  32 ----
 include/contextsv.h  |   2 -
 include/input_data.h |   3 +-
 include/snp_info.h   |  51 -------
 include/sv_caller.h  |  15 +-
 include/sv_data.h    |  52 -------
 include/sv_types.h   |  19 ---
 src/cnv_caller.cpp   | 338 +++++++++++++++++-----------------------
 src/cnv_data.cpp     |  73 ---------
 src/contextsv.cpp    |   4 +-
 src/input_data.cpp   |   2 +-
 src/snp_info.cpp     | 108 -------------
 src/sv_caller.cpp    | 151 +++++++++++-------
 src/sv_data.cpp      | 355 -------------------------------------------
 src/sv_object.cpp    | 139 +++--------------
 16 files changed, 283 insertions(+), 1115 deletions(-)
 delete mode 100644 include/cnv_data.h
 delete mode 100644 include/snp_info.h
 delete mode 100644 include/sv_data.h
 delete mode 100644 src/cnv_data.cpp
 delete mode 100644 src/snp_info.cpp
 delete mode 100644 src/sv_data.cpp

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index be3c1479..ad22b449 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -6,8 +6,6 @@
 
 #include "khmm.h"
 #include "input_data.h"
-#include "cnv_data.h"
-#include "sv_data.h"
 #include "sv_types.h"
 #include "sv_object.h"
 
@@ -19,7 +17,6 @@
 #include <mutex>
 #include <future>
 
-#include "snp_info.h"
 /// @endcond
 
 using namespace sv_types;
@@ -49,18 +46,14 @@ struct SNPData {
 class CNVCaller {
     private:
         InputData& input_data;
-        mutable std::mutex sv_candidates_mtx; // SV candidate map mutex
-        mutable std::mutex snp_data_mtx;  // SNP data mutex
-        mutable std::mutex hmm_mtx;  // HMM mutex
+        mutable std::mutex snp_file_mtx;  // SNP file mutex
+        mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
+        mutable std::mutex bam_file_mtx;  // BAM file mutex
+        
         // CHMM hmm;
         SNPData snp_data;
-        SNPInfo snp_info;
-        double mean_chr_cov = 0.0;
-        std::unordered_map<uint32_t, int> pos_depth_map;  // Read depth map
-        // std::unordered_map<uint32_t, double> snp_baf_map;  // SNP B-allele frequency map
-        // std::set<uint32_t> snp_alt_map;  // SNP B-allele map
-        // std::set<uint32_t> snp_baf_keys;  // SNP positions for BAF values
-        // std::unordered_map<uint32_t, double> snp_pfb_map;  // SNP population frequency map
+        // double mean_chr_cov = 0.0;
+        // std::unordered_map<uint32_t, int> pos_depth_map;  // Read depth map
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -85,56 +78,35 @@ class CNVCaller {
 
         void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
-        std::pair<std::vector<int>, double> runViterbi(const CHMM& hmm, SNPData &snp_data);
+        std::pair<std::vector<int>, double> runViterbi(const CHMM& hmm, SNPData& snp_data);
 
         // Query a region for SNPs and return the SNP data
-        std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov);
+        std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
 
         void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb);
 
         // Run copy number prediction for a chunk of SV candidates from CIGAR strings
-        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov);
-
-        void updateSVCopyNumber(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood);
-
-        void updateDPValue(std::map<SVCandidate, SVInfo>& sv_candidates, SVCandidate key, int dp_value);
+        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
 
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count);
 
-        // Split SV candidates into chunks for parallel processing
-        std::vector<std::vector<SVCandidate>> splitSVCandidatesIntoChunks(std::map<SVCandidate, SVInfo>& sv_candidates, int chunk_count);
-
-        // Merge the read depths from a chunk into the main read depth map
-        void mergePosDepthMaps(std::unordered_map<uint32_t, int>& main_map, std::unordered_map<uint32_t, int>& map_update);
-
     public:
         explicit CNVCaller(InputData& input_data);
 
-        // Load file data for a chromosome (SNP positions, BAF values, and PFB values)
-        void loadChromosomeData(std::string chr);
-
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, const CHMM& hmm);
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map<SVCandidate, SVInfo>& sv_candidates, int min_length);
-        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length, const CHMM& hmm);
+        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mean chromosome coverage
-        double calculateMeanChromosomeCoverage(std::string chr);
+        std::pair<double, std::vector<uint32_t>> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len);
 
         // Calculate the log2 ratio for a region given the read depths and mean
         // chromosome coverage
-        double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov);
-
-        // Read SNP positions and BAF values from the VCF file of SNP calls
-        // void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info);
-
-        // Read SNP population frequencies from the PFB file and return a vector
-        // of population frequencies for each SNP location
-        // void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info);
+        double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
 
         void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf);
         void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map);
diff --git a/include/cnv_data.h b/include/cnv_data.h
deleted file mode 100644
index a2ebd403..00000000
--- a/include/cnv_data.h
+++ /dev/null
@@ -1,32 +0,0 @@
-#ifndef CNV_DATA_H
-#define CNV_DATA_H
-
-/// @cond
-#include <string>
-#include <set>
-#include <map>
-/// @endcond
-
-// CNV candidate location map
-// (chr, snp_pos) : cnv_type
-
-using SNPLocation = std::pair<std::string, int64_t>;
-using SNPToCNVMap = std::map<SNPLocation,  int>;
-
-
-class CNVData {
-    private:
-        SNPToCNVMap cnv_calls;  // Map of SNP positions to CNV types
-
-    public:
-        // Add a CNV call to the map
-        void addCNVCall(std::string chr, int snp_pos, int cnv_type);
-
-        // Get the most common CNV type within the SV region start and end positions
-        std::tuple<int, std::string> getMostCommonCNV(std::string chr, int start, int end);
-
-        // Load CNV calls from file
-        void loadFromFile(std::string filepath);
-};
-
-#endif // CNV_DATA_H
diff --git a/include/contextsv.h b/include/contextsv.h
index 56a82a54..97d7bce9 100644
--- a/include/contextsv.h
+++ b/include/contextsv.h
@@ -7,8 +7,6 @@
 #define CONTEXTSV_H
 
 #include "input_data.h"
-#include "cnv_data.h"
-#include "sv_data.h"
 
 
 class ContextSV {
diff --git a/include/input_data.h b/include/input_data.h
index 718b5264..7d577784 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -40,7 +40,6 @@ class InputData {
 
         // Return a reference to the ReferenceGenome object.
         const ReferenceGenome& getRefGenome() const;
-        // FASTAQuery getRefGenome();
 
         // Query the reference genome for a sequence.
         std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
@@ -49,7 +48,7 @@ class InputData {
         std::vector<std::string> getRefGenomeChromosomes();
 
         // Get a chromosome's length in the reference genome.
-        int64_t getRefGenomeChromosomeLength(std::string chr);
+        uint32_t getRefGenomeChromosomeLength(std::string chr);
 
         // Set the filepath to the text file containing the locations of the
         // VCF files with population frequencies for each chromosome.
diff --git a/include/snp_info.h b/include/snp_info.h
deleted file mode 100644
index 51278951..00000000
--- a/include/snp_info.h
+++ /dev/null
@@ -1,51 +0,0 @@
-#ifndef SNP_INFO_H
-#define SNP_INFO_H
-
-#include <unordered_map>
-#include <map>
-#include <string>
-#include <vector>
-#include <set>
-#include <mutex>
-
-// Define the comparator for the binary search tree by SNP position (first
-// element of tuple)
-struct SNPCompare {
-    bool operator()(const std::tuple<uint32_t, double>& a, const std::tuple<uint32_t, double>& b) const {
-        return std::get<0>(a) < std::get<0>(b);
-    }
-};
-
-// Define the data structure for SNP frequencies sorted by position
-using BST = std::set<std::tuple<uint32_t, double>, SNPCompare>;
-
-class SNPInfo {
-public:
-    SNPInfo() {}
-
-    // Insert a SNP into the map with its position and B-allele frequency
-    void insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf);
-
-    // Insert a SNP into the map with its position and population frequency of
-    // the B allele
-    void insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb);
-    
-    // Query SNPs within a range (start, end) and return their BAF and PFB values
-    std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> querySNPs(std::string chr, uint32_t start, uint32_t end);
-
-    // Get the range of SNP positions for a given chromosome
-    std::pair<uint32_t, uint32_t> getSNPRange(std::string chr);
-
-
-private:
-    // Mutex for reading SNP information
-    std::mutex snp_info_mtx;
-
-    // Define the map of chromosome to SNP B-allele frequency
-    std::unordered_map<std::string, BST> snp_baf_map;
-
-    // Define the map of chromosome to SNP population frequency
-    std::unordered_map<std::string, std::unordered_map<uint32_t, double>> snp_pfb_map;
-};
-
-#endif // SNP_INFO_H
diff --git a/include/sv_caller.h b/include/sv_caller.h
index cdafab0b..f3f78af9 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -5,8 +5,6 @@
 
 #include "cnv_caller.h"
 #include "input_data.h"
-#include "cnv_data.h"
-#include "sv_data.h"
 #include "sv_object.h"
 #include "fasta_query.h"
 
@@ -20,7 +18,7 @@
 
 // SV candidate alignment data (chr, start, end, sequence, query start, query
 // end, mismatch map, strand)
-using AlignmentData   = std::tuple<std::string, int64_t, int64_t, std::string, int32_t, int32_t, std::unordered_map<int, int>, bool>;
+using AlignmentData   = std::tuple<std::string, int32_t, int32_t, int32_t, int32_t, std::vector<int>, bool>;
 using AlignmentVector = std::vector<AlignmentData>;
 
 // Query map (query name, alignment vector)
@@ -36,25 +34,26 @@ class SVCaller {
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        std::tuple<std::unordered_map<int, int>, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, bool is_primary);
+        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, std::tuple<std::vector<int>, int32_t, int32_t>& query_info, bool is_primary);
 
         void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region);
+        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm);
+        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
-        double calculateMismatchRate(std::unordered_map<int, int>& mismatch_map, int32_t start, int32_t end);
+        // double calculateMismatchRate(std::unordered_map<int, int>& mismatch_map, int32_t start, int32_t end);
+        double calculateMismatchRate(const std::vector<int>& mismatch_map, int32_t start, int32_t end);
 
         void saveToVCF(const std::unordered_map<std::string, std::set<SVCall>>& sv_calls);
 
@@ -64,7 +63,7 @@ class SVCaller {
         explicit SVCaller(InputData& input_data);
 
         // Detect SVs and predict SV type from long read alignments and CNV calls
-        std::unordered_map<std::string, std::set<SVCall>> run();
+        void run();
 };
 
 #endif // SV_CALLER_H
diff --git a/include/sv_data.h b/include/sv_data.h
deleted file mode 100644
index fef815ed..00000000
--- a/include/sv_data.h
+++ /dev/null
@@ -1,52 +0,0 @@
-#ifndef SV_DATA_H
-#define SV_DATA_H
-
-
-/// @cond
-#include <string>
-#include <map>
-#include <set>
-#include <mutex>
-
-#include "sv_types.h"
-#include "fasta_query.h"
-/// @endcond
-
-// Include the SV types namespace
-using namespace sv_types;
-
-// SV data class
-class SVData {
-    private:
-        SVDepthMap sv_calls;
-
-        // Map of clipped base support by position (chr, pos) : depth
-        std::map<std::pair<std::string, int64_t>, int> clipped_base_support;
-        
-    public:
-        SVData() {};
-
-        int add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
-
-        void concatenate(const SVData& sv_data);
-
-        // Update clipped base support for a given breakpoint location
-        void updateClippedBaseSupport(std::string chr, int64_t pos);
-
-        int getClippedBaseSupport(std::string chr, int64_t pos, int64_t end);
-        
-        void saveToVCF(ReferenceGenome& ref_genome, std::string output_dir);
-
-        std::map<SVCandidate, SVInfo>& getChromosomeSVs(std::string chr);
-
-        std::set<std::string> getChromosomes();
-
-        // Begin and end iterators for the SV candidate map
-        SVDepthMap::iterator begin() { return this->sv_calls.begin(); }
-        SVDepthMap::iterator end() { return this->sv_calls.end(); }
-
-        // Get the total number of calls (For summary purposes)
-        int totalCalls();
-};
-
-#endif // SV_DATA_H
diff --git a/include/sv_types.h b/include/sv_types.h
index f58e6f7b..60471a01 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -57,25 +57,6 @@ namespace sv_types {
     inline SVType getSVTypeFromCNState(int cn_state) {
         return CNVTypeMap.at(cn_state);
     }
-
-    // Create a struct for storing SV information
-    struct SVInfo {
-        SVType sv_type;
-        int read_support;  // Number of reads supporting the SV breakpoints
-        int read_depth;  // Read depth at the SV start position
-        std::set<std::string> data_type;  // Alignment type used to call the SV
-        int sv_length;
-        std::string genotype = "./.";  // Default genotype (no call)
-        double hmm_likelihood = 0.0;  // HMM likelihood score for the state sequence
-
-        SVInfo() = default;
-        SVInfo(SVType sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) :
-            sv_type(sv_type), read_support(read_support), read_depth(read_depth), data_type({data_type}), sv_length(sv_length), genotype(genotype), hmm_likelihood(hmm_likelihood) {}
-    };
-
-    // Type definition for SV-related structures
-    using SVCandidate = std::tuple<int32_t, int32_t, std::string>;  // SV (start, end, alt_allele)
-    using SVDepthMap = std::unordered_map<std::string, std::map<SVCandidate, SVInfo>>;  // Chromosome -> SV candidate -> SV info
 }
 
 #endif // SV_TYPES_H
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index dac33183..5f1c2e1d 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -24,9 +24,9 @@
 #include <future>
 #include <string>
 #include <algorithm>  // std::max
+#include <utility>    // std::pair
 
 #include "utils.h"
-#include "sv_data.h"
 #include "sv_types.h"
 
 #define MIN_PFB 0.01
@@ -48,13 +48,12 @@ std::pair<std::vector<int>, double> CNVCaller::runViterbi(const CHMM& hmm, SNPDa
     {
         throw std::runtime_error("Error: No SNP data found for Viterbi algorithm.");
     }
-    // std::lock_guard<std::mutex> lock(this->hmm_mtx);  // Lock the mutex for the HMM
     std::pair<std::vector<int>, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb);
     return state_sequence;
 }
 
 // Function to obtain SNP information for a region
-std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo& snp_info, std::unordered_map<uint32_t, int>& pos_depth_map, double mean_chr_cov)
+std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
 {
     SNPData snp_data;
     bool snps_found = false;
@@ -65,10 +64,6 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     std::unordered_map<uint32_t, double> snp_baf;
     std::unordered_map<uint32_t, double> snp_pfb;
     this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb);
-    // std::pair<std::vector<uint32_t>, std::vector<double>, std::vector<double>> snp_query = this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb);
-    // std::vector<uint32_t>& snp_pos = std::get<0>(snp_query);
-    // std::vector<double>& snp_pfb = std::get<1>(snp_query);
-    // std::vector<double>& snp_baf = std::get<2>(snp_query);
 
     // Loop through the range of the SV region and query the SNPs in a sliding
     // window, then calculate the log2 ratio for each window
@@ -98,7 +93,6 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
         // after the SNP, and continue until the end of the window
         // (If there are no SNPs in the window, then use the default BAF and
         // PFB values, and the coverage log2 ratio)
-
         // If no SNPs, then calculate the log2 ratio for the window
         if (snp_window_pos.size() == 0)
         {
@@ -111,8 +105,6 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
             snps_found = true;
 
             // Loop through the SNPs and calculate the log2 ratios
-            // uint32_t bin_start = window_start;
-            // uint32_t bin_end = 0;
             for (int j = 0; j < (int) snp_window_pos.size(); j++)
             {
                 // Just use a window centered at the SNP position
@@ -144,12 +136,8 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     return std::make_pair(snp_data, snps_found);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, const CHMM& hmm)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
 {
-     // Get the start and end positions of the SV call
-    uint32_t start_pos = std::get<0>(candidate);
-    uint32_t end_pos = std::get<1>(candidate);
-
     // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
     // the SV length
     uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
@@ -157,12 +145,12 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     uint32_t snp_end_pos = end_pos + sv_half_length;
 
     // Query the SNP region for the SV candidate
-    std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov);
+    std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
     SNPData& sv_snps = snp_call.first;
     bool sv_snps_found = snp_call.second;
 
     // Run the Viterbi algorithm
-    printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")...");
+    // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")...");
     std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
@@ -220,16 +208,16 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length, const CHMM& hmm)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
 {
     int window_size = this->input_data.getWindowSize();
-    double mean_chr_cov = this->mean_chr_cov;  
+    // double mean_chr_cov = this->mean_chr_cov;  
     // printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
-    runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov);
+    runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov, pos_depth_map);
     // printMessage("Finished predicting copy number states for chromosome " + chr + "...");
 }
 
-void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov)
+void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
 {
     // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "...");
     // Map with counts for each CNV type
@@ -270,7 +258,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
         uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
         uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
         uint32_t snp_end_pos = end_pos + sv_half_length;
-        std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, snp_info, this->pos_depth_map, mean_chr_cov);
+        std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
         SNPData& sv_snps = snp_call.first;
         bool snps_found = snp_call.second;
 
@@ -357,39 +345,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
     }
 }
 
-void CNVCaller::updateSVCopyNumber(std::map<SVCandidate, SVInfo> &sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood)
-{
-    // Update SV data from the HMM copy number prediction
-    // Lock the SV candidate map
-    std::lock_guard<std::mutex> lock(this->sv_candidates_mtx);
-
-    // Update the SV type if the update is not unknown, and if the types don't
-    // conflict (To avoid overwriting previous calls)
-    SVType current_sv_type = sv_candidates[key].sv_type;
-    if ((sv_type_update != SVType::UNKNOWN) && ((current_sv_type == sv_type_update) || (current_sv_type == SVType::UNKNOWN)))
-    {
-        sv_candidates[key].sv_type = sv_type_update;  // Update the SV type
-        sv_candidates[key].data_type.insert(data_type);  // Update the data type
-
-        // Update the likelihood if it is greater than the existing likelihood,
-        // or if it is currently unknown (0.0)
-        double previous_likelihood = sv_candidates[key].hmm_likelihood;
-        if (previous_likelihood == 0.0 || hmm_likelihood > previous_likelihood)
-        {
-            sv_candidates[key].hmm_likelihood = hmm_likelihood;
-        }
-
-        // Update the genotype
-        sv_candidates[key].genotype = genotype;
-    }
-}
-
-void CNVCaller::updateDPValue(std::map<SVCandidate,SVInfo>& sv_candidates, SVCandidate key, int dp_value)
-{
-    std::lock_guard<std::mutex> lock(this->sv_candidates_mtx);
-    sv_candidates[key].read_depth = dp_value;
-}
-
 std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count)
 {
     // Split the region into chunks
@@ -415,187 +370,168 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
     return region_chunks;
 }
 
-std::vector<std::vector<SVCandidate>> CNVCaller::splitSVCandidatesIntoChunks(std::map<SVCandidate,SVInfo>& sv_candidates, int chunk_count)
-{
-    // Split the SV candidates into chunks
-    std::vector<std::vector<SVCandidate>> sv_chunks;
-    int sv_count = (int) sv_candidates.size();
-    int chunk_size = std::ceil((double) sv_count / (double) chunk_count);
-    int current_chunk = 0;
-    std::vector<SVCandidate> current_sv_chunk;
-    for (auto const& sv_call : sv_candidates)
-    {
-        current_sv_chunk.push_back(sv_call.first);
-
-        // If the current chunk size is reached, then add the chunk to the
-        // vector and reset the current chunk
-        if ((int) current_sv_chunk.size() == chunk_size)
-        {
-            sv_chunks.push_back(current_sv_chunk);
-            current_sv_chunk.clear();
-            current_chunk++;
-        }
-    }
-
-    // Add the remaining SV candidates to the last chunk
-    if (current_sv_chunk.size() > 0)
-    {
-        sv_chunks.push_back(current_sv_chunk);
-    }
-
-    return sv_chunks;
-}
-
-void CNVCaller::loadChromosomeData(std::string chr)
-{
-    printMessage("Calculating mean chromosome coverage for " + chr + "...");
-    this->mean_chr_cov = calculateMeanChromosomeCoverage(chr);
-    printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
-}
+// std::pair<double, std::unordered_map<uint32_t, int>> CNVCaller::loadChromosomeData(std::string chr)
+// {
+//     printMessage("Calculating mean chromosome coverage for " + chr + "...");
+//     // this->mean_chr_cov = calculateMeanChromosomeCoverage(chr);
+//     std::pair<double, std::unordered_map<uint32_t, int>> depth_data = calculateMeanChromosomeCoverage(chr);
+//     printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
+// }
 
 // Calculate the mean chromosome coverage
-double CNVCaller::calculateMeanChromosomeCoverage(std::string chr)
+std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len)
 {
-    // Open the BAM file
-    std::string bam_filepath = this->input_data.getShortReadBam();
-    samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
-    if (!bam_file)
+    // std::unordered_map<uint32_t, int> chr_pos_depth_map;
+    std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0); // 1-based index
     {
-        throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath);
-    }
+        // Lock the bam file
+        std::lock_guard<std::mutex> lock(this->bam_file_mtx);
 
-    // Enable multi-threading
-    // hts_set_threads(bam_file, this->input_data.getThreadCount());
+        // Open the BAM file
+        std::string bam_filepath = this->input_data.getShortReadBam();
+        samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
+        if (!bam_file)
+        {
+            throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath);
+        }
 
-    // Read the header
-    bam_hdr_t *bam_header = sam_hdr_read(bam_file);
-    if (!bam_header)
-    {
-        sam_close(bam_file);
-        throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath);
-    }
+        // Enable multi-threading
+        // hts_set_threads(bam_file, this->input_data.getThreadCount());
 
-    // Load the index
-    hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str());
-    if (!bam_index)
-    {
-        bam_hdr_destroy(bam_header);
-        sam_close(bam_file);
-        throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath);
-    }
+        // Read the header
+        bam_hdr_t *bam_header = sam_hdr_read(bam_file);
+        if (!bam_header)
+        {
+            sam_close(bam_file);
+            throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath);
+        }
 
-    // Create an iterator for the chromosome
-    hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str());
-    if (!bam_iter)
-    {
-        hts_idx_destroy(bam_index);
-        bam_hdr_destroy(bam_header);
-        sam_close(bam_file);
-        throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr);
-    }
+        // Load the index
+        hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str());
+        if (!bam_index)
+        {
+            bam_hdr_destroy(bam_header);
+            sam_close(bam_file);
+            throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath);
+        }
 
-    // Initialize the record
-    bam1_t *bam_record = bam_init1();
-    if (!bam_record)
-    {
-        hts_itr_destroy(bam_iter);
-        hts_idx_destroy(bam_index);
-        bam_hdr_destroy(bam_header);
-        sam_close(bam_file);
-        throw std::runtime_error("ERROR: Could not initialize BAM record.");
-    }
+        // Create an iterator for the chromosome
+        hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str());
+        if (!bam_iter)
+        {
+            hts_idx_destroy(bam_index);
+            bam_hdr_destroy(bam_header);
+            sam_close(bam_file);
+            throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file.");
+        }
 
-    // Iterate through the chromosome and update the depth map
-    std::unordered_map<uint32_t, int> chr_pos_depth_map;
-    while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
-    {
-        // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
-        if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP)
+        // Initialize the record
+        bam1_t *bam_record = bam_init1();
+        if (!bam_record)
         {
-            continue;
+            hts_itr_destroy(bam_iter);
+            hts_idx_destroy(bam_index);
+            bam_hdr_destroy(bam_header);
+            sam_close(bam_file);
+            throw std::runtime_error("ERROR: Could not initialize BAM record.");
         }
-        
-        // Parse the CIGAR string to get the depth (match, sequence match, and
-        // mismatch)
-        // uint32_t depth = 0;
-        uint32_t pos = bam_record->core.pos + 1;  // 0-based to 1-based
-        uint32_t ref_pos = pos;
-        uint32_t cigar_len = bam_record->core.n_cigar;
-        uint32_t *cigar = bam_get_cigar(bam_record);
-        for (uint32_t i = 0; i < cigar_len; i++)
+
+        // Iterate through the chromosome and update the depth map
+        // std::unordered_map<uint32_t, int> chr_pos_depth_map;
+        while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
         {
-            uint32_t op = bam_cigar_op(cigar[i]);
-            uint32_t op_len = bam_cigar_oplen(cigar[i]);
-            if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF)
+            // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
+            if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP)
             {
-                // Update the depth for each position in the alignment
-                for (uint32_t j = 0; j < op_len; j++)
-                {
-                    chr_pos_depth_map[ref_pos + j]++;
-                }
+                continue;
             }
             
-            // Update the reference coordinate based on the CIGAR operation
-            // https://samtools.github.io/hts-specs/SAMv1.pdf
-            if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
-                ref_pos += op_len;
-            } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
-                // Do nothing
-            } else {
-                throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
+            // Parse the CIGAR string to get the depth (match, sequence match, and
+            // mismatch)
+            uint32_t pos = bam_record->core.pos + 1;  // 0-based to 1-based
+            uint32_t ref_pos = pos;
+            uint32_t cigar_len = bam_record->core.n_cigar;
+            uint32_t *cigar = bam_get_cigar(bam_record);
+            for (uint32_t i = 0; i < cigar_len; i++)
+            {
+                uint32_t op = bam_cigar_op(cigar[i]);
+                uint32_t op_len = bam_cigar_oplen(cigar[i]);
+                if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF)
+                {
+                    // Update the depth for each position in the alignment
+                    for (uint32_t j = 0; j < op_len; j++)
+                    {
+                        try {
+                            chr_pos_depth_map[ref_pos + j]++;
+                        } catch (const std::out_of_range& oor) {
+                            std::cerr << "Out of range error for " << chr << ":" << ref_pos+j << std::endl;
+                        }
+                        // chr_pos_depth_map[ref_pos + j]++;
+                    }
+                }
+                
+                // Update the reference coordinate based on the CIGAR operation
+                // https://samtools.github.io/hts-specs/SAMv1.pdf
+                if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+                    ref_pos += op_len;
+                } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
+                    // Do nothing
+                } else {
+                    throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
+                }
             }
         }
-    }
 
-    // Clean up
-    bam_destroy1(bam_record);
-    hts_itr_destroy(bam_iter);
-    hts_idx_destroy(bam_index);
-    bam_hdr_destroy(bam_header);
-    sam_close(bam_file);
+        // Clean up
+        bam_destroy1(bam_record);
+        hts_itr_destroy(bam_iter);
+        hts_idx_destroy(bam_index);
+        bam_hdr_destroy(bam_header);
+        sam_close(bam_file);
+    }
 
     // Calculate the mean chromosome coverage for positions with non-zero depth
     uint64_t cum_depth = 0;
     uint32_t pos_count = 0;
-    for (auto& pos_depth : chr_pos_depth_map)
+    for (const auto& pos_depth : chr_pos_depth_map)
     {
-        cum_depth += pos_depth.second;
-        pos_count++;
+        if (pos_depth > 0)
+        {
+            cum_depth += pos_depth;
+            pos_count++;
+        }
     }
 
-    double mean_chr_cov = (double) cum_depth / (double) pos_count;
-
-    // Update the position depth map
-    this->pos_depth_map = std::move(chr_pos_depth_map);
-
-    return mean_chr_cov;
-}
-
-void CNVCaller::mergePosDepthMaps(std::unordered_map<uint32_t, int>& main_map, std::unordered_map<uint32_t, int>& map_update)
-{
-    // Merge the second depth map into the first
-    for (auto& pos_depth : map_update)
+    double mean_chr_cov = 0.0;
+    if (pos_count > 0)
     {
-        main_map[pos_depth.first] = pos_depth.second;
+        mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
     }
+
+    return std::make_pair(mean_chr_cov, chr_pos_depth_map);
 }
 
-double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, int> &pos_depth_map, double mean_chr_cov)
+double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
 {
     // Use the position and depth map to calculate the log2 ratio
     double cum_depth = 0;
     int pos_count = 0;
     for (uint32_t i = start_pos; i <= end_pos; i++)
     {
-        // Check if the position is in the map
-        auto it = pos_depth_map.find(i);
-        if (it == pos_depth_map.end())
+        if (i < pos_depth_map.size() && pos_depth_map[i] > 0)
         {
-            continue;
+            cum_depth += pos_depth_map[i];
+            pos_count++;
         }
-        int depth = pos_depth_map[i];
-        pos_count++;
-        cum_depth += depth;
+        // // Check if the position is in the map
+        // auto it = pos_depth_map.find(i);
+        // if (it == pos_depth_map.end())
+        // {
+        //     continue;
+        // }
+        // int depth = pos_depth_map[i];
+        // pos_count++;
+        // cum_depth += depth;
     }
 
     // Calculate the window coverage log2 ratio (0 if no positions)
@@ -632,6 +568,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         throw std::runtime_error("ERROR: Could not initialize SNP reader.");
     }
 
+    // Lock during reading
+    std::lock_guard<std::mutex> lock(this->snp_file_mtx);
+
     // Set the region
     std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
     if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
@@ -812,6 +751,9 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath);
     }
 
+    // Lock during reading
+    std::lock_guard<std::mutex> lock(this->pfb_file_mtx);
+
     // Set the region for the synced reader
     std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
     if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0)
diff --git a/src/cnv_data.cpp b/src/cnv_data.cpp
deleted file mode 100644
index 0c4593c0..00000000
--- a/src/cnv_data.cpp
+++ /dev/null
@@ -1,73 +0,0 @@
-#include "cnv_data.h"
-
-/// @cond
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <fstream>
-#include <vector>
-#include <algorithm>
-
-#include "sv_types.h"
-/// @endcond
-
-// Include the SV types namespace
-using namespace sv_types;
-
-void CNVData::addCNVCall(std::string chr, int snp_pos, int cnv_type)
-{
-    // Add the CNV call to the map
-    SNPLocation key(chr, snp_pos);
-    this->cnv_calls[key] = cnv_type;
-}
-
-void CNVData::loadFromFile(std::string filepath)
-{
-    // Load CNV calls from file
-    std::ifstream cnv_file(filepath);
-    std::string line;
-    std::string chr;
-    int snp_pos;
-    int cnv_type;
-
-    // Check if the file was opened successfully
-    if (!cnv_file.is_open()) {
-        std::cerr << "Error: Could not open CNV file " << filepath << std::endl;
-        exit(1);
-    }
-
-    // Skip the first line (header)
-    std::getline(cnv_file, line);
-
-    // Read the file line by line
-    int line_num = 1;
-    while (std::getline(cnv_file, line)) {
-
-        // Parse the line
-        std::istringstream iss(line);
-
-        // Get columns 1, 2, and 5 (chr, pos, cnv_type)
-        std::string chr;
-        std::getline(iss, chr, '\t');
-
-        std::string pos_str;
-        std::getline(iss, pos_str, '\t');
-        snp_pos = std::stoi(pos_str);
-
-        std::string skip_str;
-        std::getline(iss, skip_str, '\t');
-        std::getline(iss, skip_str, '\t');
-
-        std::string cnv_type_str;
-        std::getline(iss, cnv_type_str, '\t');
-        cnv_type = std::stoi(cnv_type_str);
-
-        // Add the CNV call to the map
-        this->addCNVCall(chr, snp_pos, cnv_type);
-
-        line_num++;
-    }
-    cnv_file.close();
-
-    std::cout << "Loaded " << line_num << " CNV calls" << std::endl;
-}
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 1e22b650..5a9e7ffd 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -23,7 +23,9 @@ int ContextSV::run()
     SVCaller sv_caller(this->input_data);  // Create an SV caller object
     // SVCaller sv_caller(*this->input_data);  // Create an SV caller object
     // SVData sv_calls = sv_caller.run();  // Run the SV caller
-    std::unordered_map<std::string, std::set<SVCall>> sv_calls = sv_caller.run();  // Run the SV caller
+    // std::unordered_map<std::string, std::set<SVCall>> sv_calls =
+    // sv_caller.run();  // Run the SV caller
+    sv_caller.run();  // Run the SV caller
     // std::string output_dir = this->input_data->getOutputDir();  // Get the output directory
     
     // std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl;
diff --git a/src/input_data.cpp b/src/input_data.cpp
index e152a6cf..74dd9788 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -105,7 +105,7 @@ std::vector<std::string> InputData::getRefGenomeChromosomes()
     return this->fasta_query.getChromosomes();
 }
 
-int64_t InputData::getRefGenomeChromosomeLength(std::string chr)
+uint32_t InputData::getRefGenomeChromosomeLength(std::string chr)
 {
     return this->fasta_query.getChromosomeLength(chr);
 }
diff --git a/src/snp_info.cpp b/src/snp_info.cpp
deleted file mode 100644
index 1dc7b4a7..00000000
--- a/src/snp_info.cpp
+++ /dev/null
@@ -1,108 +0,0 @@
-#include "snp_info.h"
-#include "utils.h"
-
-/// @cond
-#include <string>
-#include <tuple>
-#include <mutex>
-#include <iostream>
-#include <utility>
-/// @endcond
-
-#define MIN_PFB 0.01
-
-
-void SNPInfo::insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf)
-{
-    // chr = removeChrPrefix(chr);
-
-    // Add the chromosome to the SNP B-allele frequency map if it does not exist
-    // if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) {
-    //     this->snp_baf_map[chr] = BST();
-    // }
-
-    // Insert the SNP into the map with its position and B-allele frequency
-    // using a binary search tree to keep the SNP positions sorted
-    this->snp_baf_map[chr].insert({pos, baf});
-}
-
-void SNPInfo::insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb)
-{
-    // chr = removeChrPrefix(chr);
-
-    // Add the chromosome to the SNP population frequency map if it does not
-    // exist
-    // if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) {
-    //     this->snp_pfb_map[chr] = std::unordered_map<uint32_t, double>();
-    // }
-
-    // Insert the SNP into the map with its position and population frequency of
-    // the B allele
-    this->snp_pfb_map[chr][pos] = pfb;
-}
-
-std::tuple<std::vector<uint32_t>, std::vector<double>, std::vector<double>> SNPInfo::querySNPs(std::string chr, uint32_t start, uint32_t end)
-{
-    // Lock the mutex for reading SNP information
-    // std::lock_guard<std::mutex> lock(this->snp_info_mtx);
-
-    chr = removeChrPrefix(chr);
-
-    // Create an ordered map of SNP positions to BAF and PFB values
-    std::map<uint32_t, std::tuple<double, double>> snp_map;
-
-    // Query SNPs within a range (start, end) and return their BAF and PFB
-    // values as separate vectors
-    std::vector<double> bafs;
-    std::vector<double> pfbs;
-    std::vector<uint32_t> pos;
-    
-    // Check if the chromosome exists in the B-allele frequency map
-    if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) {
-        return std::make_tuple(pos, bafs, pfbs);
-    }
-
-    // Query the SNPs within the range and return their BAFs and corresponding
-    // positions
-    auto& baf_bst = this->snp_baf_map[chr];
-    auto baf_start = baf_bst.lower_bound({start, 0.0});
-    auto baf_end = baf_bst.upper_bound({end, 0.0});
-    for (auto it = baf_start; it != baf_end; it++) {
-        bafs.push_back(std::get<1>(*it));
-        pos.push_back(std::get<0>(*it));
-    }
-
-    // Define a default PFB value (0.5) for SNPs with no population frequency data
-    pfbs = std::vector<double>(bafs.size(), 0.5);
-
-    // Check if the chromosome exists in the population frequency map
-    if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) {
-        return std::make_tuple(pos, bafs, pfbs);
-    }
-
-    // Query the PFBs for all SNP positions with PFB data
-    auto& pfb_map = this->snp_pfb_map[chr];
-    for (size_t i = 0; i < pos.size(); i++) {
-        uint32_t snp_pos = pos[i];
-        if (pfb_map.find(snp_pos) != pfb_map.end()) {
-            pfbs[i] = pfb_map[snp_pos];
-        }
-    }
-    
-    return std::make_tuple(pos, bafs, pfbs);
-}
-
-std::pair<uint32_t, uint32_t> SNPInfo::getSNPRange(std::string chr)
-{
-    chr = removeChrPrefix(chr);
-
-    // Get the range of SNP positions for a given chromosome
-    uint32_t start = 0;
-    uint32_t end = 0;
-    if (this->snp_baf_map.find(chr) != this->snp_baf_map.end()) {
-        auto& baf_bst = this->snp_baf_map[chr];
-        start = std::get<0>(*baf_bst.begin());
-        end = std::get<0>(*baf_bst.rbegin());
-    }
-    return std::make_pair(start, end);
-}
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b9038e5c..8918a131 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -38,7 +38,10 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 }
 
 // RegionData SVCaller::detectSVsFromRegion(std::string region)
-std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region)
+// std::tuple<std::set<SVCall>, PrimaryMap, SuppMap>
+// SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
+// const std::string& region)
+void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -58,10 +61,10 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi
     }
 
     // Main loop to process the alignments
-    std::set<SVCall> sv_calls;
+    // std::set<SVCall> sv_calls;
     int num_alignments = 0;
-    PrimaryMap primary_alignments;
-    SuppMap supplementary_alignments;
+    // PrimaryMap primary_alignments;
+    // SuppMap supplementary_alignments;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
@@ -80,13 +83,15 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
             // Call SVs directly from the CIGAR string
-            std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
-            std::unordered_map<int, int> match_map = std::get<0>(query_info);
+            std::tuple<std::vector<int>, int32_t, int32_t> query_info;
+            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true);
+            // std::tuple<std::vector<int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
+            const std::vector<int>& match_map = std::get<0>(query_info);
             int32_t query_start = std::get<1>(query_info);
             int32_t query_end = std::get<2>(query_info);
 
             // Add the primary alignment to the map
-            AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand);
+            AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
             primary_alignments[qname] = alignment;
 
         // Process supplementary alignments
@@ -99,13 +104,16 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
             // Get CIGAR string information, but don't call SVs
-            std::tuple<std::unordered_map<int, int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
-            const std::unordered_map<int, int>& match_map = std::get<0>(query_info);
+            // std::tuple<std::vector<int>, int32_t, int32_t> query_info =
+            // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
+            std::tuple<std::vector<int>, int32_t, int32_t> query_info;
+            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false);
+            const std::vector<int>& match_map = std::get<0>(query_info);
             int32_t query_start = std::get<1>(query_info);
             int32_t query_end = std::get<2>(query_info);
 
             // Add the supplementary alignment to the map
-            AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand);
+            AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
             supplementary_alignments[qname].emplace_back(alignment);
         }
 
@@ -116,35 +124,57 @@ std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
 
-    return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
+    // return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
 }
 
-double SVCaller::calculateMismatchRate(std::unordered_map<int, int> &match_map, int32_t start, int32_t end)
+double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map, int32_t start, int32_t end)
 {
+    start = std::max(start, 0);
+    end = std::min(end, (int32_t)mismatch_map.size() - 1);
     int match_count = 0;
     int mismatch_count = 0;
+    int MATCH = 1;
+    int MISMATCH = -1;
     for (int i = start; i <= end; i++) {
-        if (match_map.find(i) != match_map.end()) {
-            if (match_map[i] == 1) {
-                match_count++;
-            } else {
-                mismatch_count++;
-            }
+        if (mismatch_map[i] == MATCH) {
+            match_count++;
+        } else if (mismatch_map[i] == MISMATCH) {
+            mismatch_count++;
         }
     }
-    double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count);
+
+    // Avoid division by zero
+    if (match_count + mismatch_count == 0) {
+        return 0.0;
+    }
+
+    double mismatch_rate = static_cast<double>(mismatch_count) / static_cast<double>(match_count + mismatch_count);
+    // int match_count = 0;
+    // int mismatch_count = 0;
+    // for (int i = start; i <= end; i++) {
+    //     if (match_map.find(i) != match_map.end()) {
+    //         if (match_map[i] == 1) {
+    //             match_count++;
+    //         } else {
+    //             mismatch_count++;
+    //         }
+    //     }
+    // }
+    // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count);
 
     return mismatch_rate;
 }
 
-std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, bool is_primary)
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, std::tuple<std::vector<int>, int32_t, int32_t>& query_info, bool is_primary)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     int32_t pos = alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
     int query_pos = 0;
-    std::unordered_map<int, int> query_match_map;  // Query position to match/mismatch (1/0) map
+    // std::unordered_map<int, int> query_match_map;  // Query position to
+    // match/mismatch (1/0) map
+    std::vector<int> query_match_map(alignment->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
 
     // Loop through the CIGAR string, process operations, detect SVs (primary
     // only), update clipped base support, calculate sequence identity for
@@ -250,13 +280,15 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
         }
 
         // Update match/mismatch query map
+        int MATCH = 1;
+        int MISMATCH = -1;
         if (op == BAM_CEQUAL) {
             for (int j = 0; j < op_len; j++) {
-                query_match_map[query_pos + j] = 1;
+                query_match_map[query_pos + j] = MATCH;
             }
         } else if (op == BAM_CDIFF) {
             for (int j = 0; j < op_len; j++) {
-                query_match_map[query_pos + j] = 0;
+                query_match_map[query_pos + j] = MISMATCH;
             }
         } else if (op == BAM_CMATCH) {
             // Get the read sequence
@@ -279,9 +311,9 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
             // Compare the two sequences and update the mismatch map
             for (int j = 0; j < op_len; j++) {
                 if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
-                    query_match_map[query_pos + j] = 0;
+                    query_match_map[query_pos + j] = MISMATCH;
                 } else {
-                    query_match_map[query_pos + j] = 1;
+                    query_match_map[query_pos + j] = MATCH;
                 }
             }
         }
@@ -308,7 +340,9 @@ std::tuple<std::unordered_map<int, int>, int32_t, int32_t> SVCaller::detectSVsFr
 
     query_end = query_pos;  // Last alignment position in the query
 
-    return std::tuple<std::unordered_map<int, int>, int32_t, int32_t>(query_match_map, query_start, query_end);
+    query_info = std::tuple<std::vector<int>, int32_t, int32_t>(std::move(query_match_map), query_start, query_end);
+
+    // return std::tuple<std::vector<int>, int32_t, int32_t>(query_match_map, query_start, query_end);
 }
 
 void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length)
@@ -337,6 +371,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     // Split the chromosome into chunks for memory efficiency
     std::vector<std::string> region_chunks;
     int chunk_count = 100;
+    uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
     if (this->input_data.isRegionSet()) {
 
         // Use one chunk for the specified region
@@ -348,7 +383,6 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // std::cout << "Using specified region " << chunk << "..." << std::endl;
         
     } else {
-        int chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
         int chunk_size = std::ceil((double)chr_len / chunk_count);
         for (int i = 0; i < chunk_count; i++) {
             int start = i * chunk_size + 1;  // 1-based
@@ -366,7 +400,8 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
     printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
-    cnv_caller.loadChromosomeData(chr);
+    // cnv_caller.loadChromosomeData(chr);
+    std::pair<double, std::vector<uint32_t>> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len);
 
     // Process each chunk one at a time
     // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
@@ -376,10 +411,13 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     for (const auto& sub_region : region_chunks) {
         current_region++;
         printMessage(chr + ": CIGAR SVs...");
-        std::tuple<std::set<SVCall>, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region);
-        std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
-        PrimaryMap& primary_map = std::get<1>(region_data);
-        SuppMap& supp_map = std::get<2>(region_data);
+        PrimaryMap primary_map;
+        SuppMap supp_map;
+        std::set<SVCall> subregion_sv_calls;
+        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map);
+        // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
+        // PrimaryMap& primary_map = std::get<1>(region_data);
+        // SuppMap& supp_map = std::get<2>(region_data);
         // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging CIGAR...");
         mergeSVs(subregion_sv_calls);
@@ -391,13 +429,13 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         if (region_sv_count > 0) {
             // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
             printMessage(chr + ": CIGAR predictions...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm);
+            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm, chr_data.first, chr_data.second);
         }
 
         // Run split-read SV and copy number variant predictions
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
         printMessage(chr + ": Split read SVs...");
-        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm);
+        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
@@ -418,7 +456,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     sam_close(fp_in);
 }
 
-std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
+void SVCaller::run()
 {
     // Get the chromosomes to process
     std::vector<std::string> chromosomes;
@@ -505,12 +543,12 @@ std::unordered_map<std::string, std::set<SVCall>> SVCaller::run()
     std::cout << "Saving SVs to VCF..." << std::endl;
     this->saveToVCF(whole_genome_sv_calls);
 
-    return whole_genome_sv_calls;
+    // return whole_genome_sv_calls;
 }
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm)
+void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
 {
     // Find split-read SV evidence
     int sv_count = 0;
@@ -521,7 +559,6 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         std::string primary_chr = std::get<0>(primary_alignment);
         int32_t primary_start = std::get<1>(primary_alignment);
         int32_t primary_end = std::get<2>(primary_alignment);
-        std::unordered_map<int, int> primary_match_map = std::get<6>(primary_alignment);
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
@@ -545,11 +582,11 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
             }
 
             // Inversion detection
-            bool is_opposite_strand = std::get<7>(primary_alignment) != std::get<7>(*it);
+            bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it);
             if (is_opposite_strand) {
                 if (supp_length >= min_cnv_length) {
-                    SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate, hmm);
+                    // SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
                     if (supp_type == SVType::NEUTRAL) {
@@ -579,8 +616,6 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         supp_end = std::get<2>(largest_supp_alignment);
         primary_start = std::get<1>(primary_alignment);
         primary_end = std::get<2>(primary_alignment);
-        SVCandidate split_boundary;
-        SVCandidate split_gap;
         bool gap_exists = false;
         int32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_before_supp) {
@@ -599,15 +634,15 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         
         // Run copy number variant predictions on the boundary if large enough
         if (boundary_right - boundary_left >= min_cnv_length) {
-            split_boundary = SVCandidate(boundary_left, boundary_right, ".");
-            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary, hmm);
+            // split_boundary = SVCandidate(boundary_left, boundary_right, ".");
+            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             double bd_lh = std::get<0>(bd_result);
             SVType bd_type = std::get<1>(bd_result);
 
             // Run copy number variant predictions on the gap if it exists
             if (gap_exists && gap_right - gap_left >= min_cnv_length) {
-                split_gap = SVCandidate(gap_left, gap_right, ".");
-                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap, hmm);
+                // split_gap = SVCandidate(gap_left, gap_right, ".");
+                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
                 double gap_lh = std::get<0>(gap_result);
                 SVType gap_type = std::get<1>(gap_result);
 
@@ -840,16 +875,26 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
 {
     // Get the start and end read positions for the primary and supplementary
     // alignments
-    int32_t primary_query_start = std::get<4>(primary_alignment);
-    int32_t primary_query_end = std::get<5>(primary_alignment);
-    int32_t supp_query_start = std::get<4>(supp_alignment);
-    int32_t supp_query_end = std::get<5>(supp_alignment);
-    std::unordered_map<int, int>& primary_match_map = std::get<6>(primary_alignment);
-    std::unordered_map<int, int>& supp_match_map = std::get<6>(supp_alignment);
     int32_t primary_alignment_start = std::get<1>(primary_alignment);
     int32_t primary_alignment_end = std::get<2>(primary_alignment);
     int32_t supp_alignment_start = std::get<1>(supp_alignment);
     int32_t supp_alignment_end = std::get<2>(supp_alignment);
+    int32_t primary_query_start = std::get<3>(primary_alignment);
+    int32_t primary_query_end = std::get<4>(primary_alignment);
+    int32_t supp_query_start = std::get<3>(supp_alignment);
+    int32_t supp_query_end = std::get<4>(supp_alignment);
+    const std::vector<int>& primary_match_map = std::get<5>(primary_alignment);
+    const std::vector<int>& supp_match_map = std::get<5>(supp_alignment);
+    // int32_t primary_query_start = std::get<4>(primary_alignment);
+    // int32_t primary_query_end = std::get<5>(primary_alignment);
+    // int32_t supp_query_start = std::get<4>(supp_alignment);
+    // int32_t supp_query_end = std::get<5>(supp_alignment);
+    // const std::vector<int>& primary_match_map = std::get<6>(primary_alignment);
+    // const std::vector<int>& supp_match_map = std::get<6>(supp_alignment);
+    // int32_t primary_alignment_start = std::get<1>(primary_alignment);
+    // int32_t primary_alignment_end = std::get<2>(primary_alignment);
+    // int32_t supp_alignment_start = std::get<1>(supp_alignment);
+    // int32_t supp_alignment_end = std::get<2>(supp_alignment);
 
     // Check if the alignments overlap
     bool primary_before_supp = primary_query_start < supp_query_start;
diff --git a/src/sv_data.cpp b/src/sv_data.cpp
deleted file mode 100644
index d2dfd605..00000000
--- a/src/sv_data.cpp
+++ /dev/null
@@ -1,355 +0,0 @@
-#include "sv_data.h"
-
-/// @cond
-#include <unordered_set>
-#include <iostream>
-#include <fstream>
-/// @endcond
-
-int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
-{
-    // Throw an error if the genotype is not valid
-    if (genotype != "./." && genotype != "0/0" && genotype != "0/1" && genotype != "1/1") {
-        std::cerr << "Error: Invalid genotype " << genotype << std::endl;
-        return -1;
-    }
-
-    // Trim the alternate allele if it is too long
-    if (alt_allele.length() > 100) {
-        alt_allele = alt_allele.substr(0, 100);
-    }
-
-    // Check if the alternate allele contains ambiguous bases
-    const std::unordered_set<char> ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'};
-    for (char &c : alt_allele) {
-        if (ambiguous_bases.count(c) > 0) {
-            c = 'N';
-        }
-    }
-
-    // Check if the SV candidate already exists in the map
-    SVCandidate candidate(start, end, alt_allele);
-    if (this->sv_calls[chr].find(candidate) != this->sv_calls[chr].end()) {
-        
-        // Update the alignment-based support count
-        SVInfo& sv_info = this->sv_calls[chr][candidate];
-        sv_info.read_support += 1;
-
-        // Update the SV type if it is unknown
-        if (sv_info.sv_type == SVType::UNKNOWN) {
-            sv_info.sv_type = sv_type;
-        }
-
-        // Update the genotype if it is unknown
-        if (sv_info.genotype == "./.") {
-            sv_info.genotype = genotype;
-        }
-
-        // Update the HMM likelihood
-        if ((sv_info.hmm_likelihood == 0.0) || (hmm_likelihood > sv_info.hmm_likelihood)) {
-            sv_info.hmm_likelihood = hmm_likelihood;
-        }
-        sv_info.data_type.insert(data_type);  // Add the alignment type to the set
-
-        return 0;  // SV call already exists
-
-    // Otherwise, add the SV candidate to the map
-    } else {
-        int sv_length = end - start;
-
-        // For deletions, the SV length is the length of the deletion, including the start position
-        if (sv_type == SVType::DEL) {
-            sv_length++;
-        }
-
-        SVInfo sv_info(sv_type, 1, 0, data_type, sv_length, genotype, hmm_likelihood);
-        this->sv_calls[chr][candidate] = sv_info;  // Add the SV candidate to the map
-
-        return 1;  // SV call added
-    }
-}
-
-void SVData::concatenate(const SVData &sv_data)
-{
-    if (sv_data.sv_calls.empty()) {
-        std::cerr << "Error: SVData object is empty." << std::endl;
-        return;
-    }
-
-    // Iterate over the chromosomes in the other SVData object
-    for (auto const& chr_sv_calls : sv_data.sv_calls) {
-        const auto &chr = chr_sv_calls.first;
-        // std::string chr = chr_sv_calls.first;
-        auto &current_chr_calls = this->sv_calls[chr];
-
-        // Iterate over the SV calls in the other SVData object
-        for (auto const& sv_call : chr_sv_calls.second) {
-
-            // Add the SV call to the map of candidate locations
-            std::pair<std::map<SVCandidate, SVInfo>::iterator, bool> result = current_chr_calls.emplace(sv_call);
-            bool inserted = result.second;
-
-            // Throw a warning if the SV candidate already exists
-            if (!inserted) {
-                std::cerr << "Warning: SV candidate already exists in the map." << std::endl;
-            }
-        }
-    }
-}
-
-void SVData::updateClippedBaseSupport(std::string chr, int64_t pos)
-{
-    // Update clipped base support
-    std::pair<std::string, int64_t> key(chr, pos);
-    if (this->clipped_base_support.find(key) != this->clipped_base_support.end()) {
-        this->clipped_base_support[key] += 1;
-    } else {
-        this->clipped_base_support[key] = 1;
-    }
-}
-
-int SVData::getClippedBaseSupport(std::string chr, int64_t pos, int64_t end)
-{
-    // Clipped base support is the maximum clipped base support at the start
-    // and end positions
-    int clipped_base_support = 0;
-    std::pair<std::string, int64_t> pos_key(chr, pos);
-
-    if (pos == end) {
-        // If the start and end positions are the same, then the clipped base
-        // support is the same at both positions
-        clipped_base_support = this->clipped_base_support[pos_key];
-
-    } else{
-
-        // Otherwise, get the clipped base support at the start and end
-        // positions
-        int pos_support = 0;
-        int end_support = 0;
-        std::pair<std::string, int64_t> end_key(chr, end);
-        if (this->clipped_base_support.find(pos_key) != this->clipped_base_support.end()) {
-            pos_support = this->clipped_base_support[pos_key];
-        }
-        if (this->clipped_base_support.find(end_key) != this->clipped_base_support.end()) {
-            end_support = this->clipped_base_support[end_key];
-        }
-        clipped_base_support = std::max(pos_support, end_support);
-    }
-    
-    return clipped_base_support;
-}
-
-void SVData::saveToVCF(ReferenceGenome& ref_genome, std::string output_dir)
-{
-    // Create a VCF writer
-    std::cout << "Creating VCF writer..." << std::endl;
-    std::string output_vcf = output_dir + "/output.vcf";
-    std::cout << "Writing VCF file to " << output_vcf << std::endl;
-	std::ofstream vcf_stream(output_vcf);
-    if (!vcf_stream.is_open()) {
-        throw std::runtime_error("Failed to open VCF file for writing.");
-    }
-    std::string sample_name = "SAMPLE";
-
-    std::cout << "Getting reference genome filepath..." << std::endl;
-    try {
-        std::string ref_fp = ref_genome.getFilepath();
-        std::cout << "Reference genome filepath: " << ref_fp << std::endl;
-    } catch (const std::exception& e) {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return;
-    }
-
-    std::cout << "Getting reference genome header..." << std::endl;
-    try {
-        ref_genome.getContigHeader();
-    } catch (const std::exception& e) {
-        std::cerr << "Error: " << e.what() << std::endl;
-        return;
-    }
-
-    // Set the header lines
-    std::vector<std::string> header_lines = {
-        std::string("##reference=") + ref_genome.getFilepath(),
-        ref_genome.getContigHeader(),
-        "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
-        "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
-        "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
-        "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
-        "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Alignment type used to call the structural variant\">",
-        "##INFO=<ID=CLIPSUP,Number=1,Type=Integer,Description=\"Clipped base support at the start and end positions\">",
-        "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
-        "##INFO=<ID=REPTYPE,Number=1,Type=String,Description=\"Repeat type\">",
-        "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
-        "##FILTER=<ID=PASS,Description=\"All filters passed\">",
-        "##FILTER=<ID=LowQual,Description=\"Low quality\">",
-        "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
-        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">"
-    };
-
-    std::cout << "Writing VCF header..." << std::endl;
-
-    // Add the file format
-    std::string file_format = "##fileformat=VCFv4.2";
-    vcf_stream << file_format << std::endl;
-
-    // Add date and time
-    time_t rawtime;
-    struct tm * timeinfo;
-    char buffer[80];
-    time (&rawtime);
-    timeinfo = localtime(&rawtime);
-    strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo);
-    vcf_stream << "##fileDate=" << buffer << std::endl;
-
-    // Add source
-    std::string source = "##source=ContexSV";
-    vcf_stream << source << std::endl;
-
-    // Loop over the header metadata lines
-    for (const auto &line : header_lines) {
-        vcf_stream << line << std::endl;
-    }
-
-    // Add the header line
-    std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE";
-    vcf_stream << header_line << std::endl;
-
-    // Flush the stream to ensure that the header is written
-    //this->file_stream.flush();
-
-    std::cout << "Saving SV calls to " << output_vcf << std::endl;
-    std::string sv_method = "CONTEXTSVv0.1";
-    int skip_count = 0;
-    int total_count = 0;
-    std::set<std::string> chrs = this->getChromosomes();
-    for (auto const& chr : chrs) {
-        if (this->sv_calls.find(chr) == this->sv_calls.end()) {
-            continue;
-        }
-        std::cout << "Saving SV calls for " << chr << " (" << this->sv_calls[chr].size() << " SV calls)..." << std::endl;
-        for (auto const& sv_call : this->sv_calls[chr]) {
-
-            // Get the SV candidate and SV info
-            SVCandidate candidate = sv_call.first;
-            SVInfo info = sv_call.second;
-            SVType sv_type = info.sv_type;
-            int read_support = info.read_support;
-            int read_depth = info.read_depth;
-            int sv_length = info.sv_length;
-            std::set<std::string> data_type = info.data_type;
-            std::string genotype = info.genotype;
-            double hmm_likelihood = info.hmm_likelihood;
-
-            // Convert the data type set to a string
-            std::string data_type_str = "";
-            for (auto const& type : data_type) {
-                data_type_str += type + ",";
-            }
-
-            // Get the CHROM, POS, END, and ALT
-            int64_t pos = std::get<0>(candidate);
-            int64_t end = std::get<1>(candidate);
-
-            // If the SV type is unknown, skip it
-            if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
-                skip_count += 1;
-                continue;
-            } else {
-                total_count += 1;
-            }
-
-            // Process by SV type
-            std::string ref_allele = ".";
-            std::string alt_allele = ".";
-            std::string repeat_type = "NA";
-
-            // Deletion
-            if (sv_type == SVType::DEL) {
-                // Get the deleted sequence from the reference genome, also including the preceding base
-                int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1);  // Make sure the position is not negative
-                ref_allele = ref_genome.query(chr, preceding_pos, end);
-
-                // Use the preceding base as the alternate allele 
-                if (ref_allele != "") {
-                    alt_allele = ref_allele.at(0);
-                } else {
-                    alt_allele = "<DEL>";  // Symbolic allele
-                    std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << pos << "-" << end << std::endl;
-                }
-
-                sv_length = -1 * sv_length;  // Negative length for deletions
-
-                pos = preceding_pos;  // Update the position to the preceding base
-
-            // Other types (duplications, insertions, inversions)
-            } else {
-                // Use the preceding base as the reference allele
-                int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1);  // Make sure the position is not negative
-                ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
-
-                // Format novel insertions
-                if (sv_type == SVType::INS) {
-                    // Use the insertion sequence as the alternate allele
-                    alt_allele = std::get<2>(candidate);
-                    alt_allele.insert(0, ref_allele);
-
-                    pos = preceding_pos;  // Update the position to the preceding base
-
-                    // Update the end position to the start position to change from
-                    // query to reference coordinates for insertions
-                    end = pos;
-                } else if (sv_type == SVType::DUP) {                    
-                    alt_allele = "<DUP>";  // Symbolic allele
-                    repeat_type = "TANDEM";
-                }
-            }
-
-            // Create the VCF parameter strings
-            int clipped_base_support = this->getClippedBaseSupport(chr, pos, end);
-            std::string sv_type_str = getSVTypeString(sv_type);
-            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-                ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
-                ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
-                ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood);
-                
-            std::string format_str = "GT:DP";
-            std::string sample_str = genotype + ":" + std::to_string(read_depth);
-            std::vector<std::string> samples = {sample_str};
-
-            // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES)
-            vcf_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
-            if (total_count % 1000 == 0)
-            {
-            	std::cout << "Wrote SV at " << chr << ": " << pos << ", total=" << total_count << std::endl;
-        	}
-        }
-    }
-
-    // Print the number of SV calls skipped
-    std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
-}
-
-std::map<SVCandidate, SVInfo>& SVData::getChromosomeSVs(std::string chr)
-{
-    return this->sv_calls[chr];
-}
-
-std::set<std::string> SVData::getChromosomes()
-{
-    std::set<std::string> chromosomes;
-    for (auto const& sv_call : this->sv_calls) {
-        chromosomes.insert(sv_call.first);
-    }
-    return chromosomes;
-}
-
-int SVData::totalCalls()
-{
-    int sv_calls = 0;
-    for (auto const& sv_call : this->sv_calls) {
-        sv_calls += sv_call.second.size();
-    }
-
-    return sv_calls;
-}
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 09203b59..f24143fd 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -1,5 +1,5 @@
 #include "sv_object.h"
-#include "sv_object.h"
+
 #include <algorithm>
 #include <tuple>
 #include <memory>
@@ -7,10 +7,11 @@
 #include <stdexcept>
 #include <iostream>
 
+#include "utils.h"
+
 bool SVCall::operator<(const SVCall & other) const
 {
 	return start < other.start || (start == other.start && end < other.end);
-    //return std::tie(start, end) < std::tie(other.start, other.end);
 }
 
 void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
@@ -24,88 +25,8 @@ void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::st
         throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
     }
 
-    // If the SV call already exists (start and end position), then update all information if the
-    // likelihood is higher
-    // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " <<
-    // sv_type << " " << alt_allele << " " << data_type << " " << genotype << "
-    // " << hmm_likelihood << std::endl;
+    // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type);
     sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
-    // SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1};
-    
-    // sv_calls.insert(new_sv_call);
-    
-    /*
-    bool exists = false;
-    bool print_out = false;
-    for (auto it = sv_calls.begin(); it != sv_calls.end();)
-    {
-        if (it->start == start && it->end == end)
-        {
-            exists = true;
-            if (hmm_likelihood > it->hmm_likelihood)
-            {
-                //std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
-                print_out = true;
-                // Update the data type and support
-                // std::string new_data_type = it->data_type + "," + data_type;
-                // int new_support = it->support + 1;
-                new_sv_call.data_type = it->data_type + "," + data_type;
-                new_sv_call.support = it->support + 1;
-                //higher_lh = true;
-
-                // updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support});
-
-                // Erase and re-insert the SV call
-                // Erase the current iterator and safely insert the new SV calls
-                std::cout << "Erasing iterator." << std::endl;
-                sv_calls.erase(it);
-                std::cout << "Iterator erased." << std::endl;
-                break;
-                //it = sv_calls.erase(it);  // Erase and get the next iterator
-                // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support});
-            } else {
-                // End if the SV exists but is lower lh
-                return;
-            }
-        } else {
-            // Increment the iterator if the SV call does not match
-            ++it;
-        }
-    }
-
-    if (print_out)
-    {
-        std::cout << "[DEBUG] Adding updates" << std::endl;
-    }
-
-    // Update the SV call if it does not exist, or if the likelihood is higher
-    // than the existing call
-    if (print_out)
-    {
-        std::cout << "[DEBUG] Inserting call" << std::endl;
-    }
-    sv_calls.insert(new_sv_call);
-    if (print_out)
-    {
-        std::cout << "[DEBUG] Call inserted" << std::endl;
-    }
-    // Insert the updates
-    // for (const auto& update : updates)
-    // {
-    //     sv_calls.insert(update);
-    // }
-
-    // if (print_out)
-    // {
-    //     std::cout << "[DEBUG] Added updates" << std::endl;
-    // }
-
-
-    // Add the SV call if it does not exist
-    // std::cout << "[TEST2] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
-    // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
-    // std::cout << "[TEST3] Added SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl;
-    */
 }
 
 std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count)
@@ -158,47 +79,26 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
     }
 
     // Merge SV calls if they overlap by at least 50%
-    // int initial_size = sv_calls.size();
+    int initial_size = sv_calls.size();
     std::vector<SVCall> merged_sv_calls;
     auto it = sv_calls.begin();
     SVCall current_merge = *it++;
-
     for (; it != sv_calls.end(); ++it) {
         const SVCall& next = *it;
 
-        // Check if the SV calls overlap by at least 50%
-        uint32_t overlap_start = std::max(current_merge.start, next.start);
-        uint32_t overlap_end = std::min(current_merge.end, next.end);
-        uint32_t overlap_length = (overlap_start < overlap_end) ? overlap_end - overlap_start : 0;
-
-        uint32_t current_length = current_merge.end - current_merge.start;
-        uint32_t next_length = next.end - next.start;
-
-        // Merge the SV calls if the overlap is > 0
-        //double overlap_pct_current = static_cast<double>(overlap_length) / current_length;
-        //double overlap_pct_next = static_cast<double>(overlap_length) / next_length;
-
-        //if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) {
-        if (overlap_length > 0) {
-            // Merge the SV calls based on the likelihood
-            if (next.hmm_likelihood != 0.0) {
-                // Update the likelihood if the next SV call has a likelihood
-                // and it is higher than the current merged SV call
-                if (next.hmm_likelihood > current_merge.hmm_likelihood) {
-                    current_merge = next;
-                }
-            } else {
-                // If both have no likelihood (CIGAR only), then merge the SV calls
-                // based on largest SV length
-                if (next.hmm_likelihood == current_merge.hmm_likelihood) {
-                    if (next_length > current_length) {
-                        current_merge = next;
-                    }
-                }
-                // if (next_length > current_length) {
-                //     current_merge = next;
-                // }
+        // Find overlap
+        if (next.start <= current_merge.end) {
+            // Merge the SV calls if it is a subset
+            if (next.end <= current_merge.end) {
+                continue;
             }
+
+            // Merge the SV calls based on HMM log likelihood (keep the higher
+            // likelihood), 0.0 indicates no likelihood
+            if (next.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) {
+                current_merge = next;  // Continue with the next call
+            }
+
         } else {
             // No overlap: Save the previous SV and continue
             merged_sv_calls.push_back(current_merge);
@@ -207,6 +107,7 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
     }
 
     // Add the last merged SV call
+    printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood));
     merged_sv_calls.push_back(current_merge);
 
     // Update the SV calls
@@ -214,6 +115,6 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
     for (const auto& sv_call : merged_sv_calls) {
         sv_calls.insert(sv_call);
     }
-    // int updated_size = sv_calls.size();
-    // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
+    int updated_size = sv_calls.size();
+    std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }

From 50cc5058146cd1bf7aee3e288af4fa27cbc64eb8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 30 Nov 2024 19:45:13 -0500
Subject: [PATCH 037/134] reduce output

---
 src/sv_object.cpp | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index f24143fd..9af58235 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -107,14 +107,17 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
     }
 
     // Add the last merged SV call
-    printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood));
+    // printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood));
     merged_sv_calls.push_back(current_merge);
 
-    // Update the SV calls
-    sv_calls.clear();
-    for (const auto& sv_call : merged_sv_calls) {
-        sv_calls.insert(sv_call);
-    }
+    // Replace contents of the SV calls
+    sv_calls = std::set<SVCall>(merged_sv_calls.begin(), merged_sv_calls.end());
+
+    // // Update the SV calls
+    // sv_calls.clear();
+    // for (const auto& sv_call : merged_sv_calls) {
+    //     sv_calls.insert(sv_call);
+    // }
     int updated_size = sv_calls.size();
     std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }

From 18bd4a99ee026b5bceb54857ee7e4c5b325861ed Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 1 Dec 2024 17:00:54 -0500
Subject: [PATCH 038/134] Improve merging

---
 include/cnv_caller.h |  10 +-
 include/sv_caller.h  |  14 +--
 include/sv_object.h  |  18 ++--
 python/sv_merger.py  | 106 +++++++------------
 src/cnv_caller.cpp   |  24 ++---
 src/khmm.cpp         |   4 -
 src/sv_caller.cpp    | 246 ++++++++++++++++++++++---------------------
 src/sv_object.cpp    | 138 ++++++++++++++++--------
 8 files changed, 283 insertions(+), 277 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index ad22b449..3663d184 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -49,11 +49,6 @@ class CNVCaller {
         mutable std::mutex snp_file_mtx;  // SNP file mutex
         mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
         mutable std::mutex bam_file_mtx;  // BAM file mutex
-        
-        // CHMM hmm;
-        SNPData snp_data;
-        // double mean_chr_cov = 0.0;
-        // std::unordered_map<uint32_t, int> pos_depth_map;  // Read depth map
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -85,9 +80,6 @@ class CNVCaller {
 
         void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb);
 
-        // Run copy number prediction for a chunk of SV candidates from CIGAR strings
-        void runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
-
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count);
 
@@ -99,7 +91,7 @@ class CNVCaller {
         std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        void runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall>& sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
+        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mean chromosome coverage
         std::pair<double, std::vector<uint32_t>> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len);
diff --git a/include/sv_caller.h b/include/sv_caller.h
index f3f78af9..b4f6eaac 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -18,7 +18,7 @@
 
 // SV candidate alignment data (chr, start, end, sequence, query start, query
 // end, mismatch map, strand)
-using AlignmentData   = std::tuple<std::string, int32_t, int32_t, int32_t, int32_t, std::vector<int>, bool>;
+using AlignmentData   = std::tuple<std::string, uint32_t, uint32_t, uint32_t, uint32_t, std::vector<int>, bool>;
 using AlignmentVector = std::vector<AlignmentData>;
 
 // Query map (query name, alignment vector)
@@ -34,20 +34,20 @@ class SVCaller {
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, std::tuple<std::vector<int>, int32_t, int32_t>& query_info, bool is_primary);
+        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth);
 
-        void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length);
+        void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, int min_cnv_length);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments);
+        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
+        void detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
@@ -55,10 +55,12 @@ class SVCaller {
         // double calculateMismatchRate(std::unordered_map<int, int>& mismatch_map, int32_t start, int32_t end);
         double calculateMismatchRate(const std::vector<int>& mismatch_map, int32_t start, int32_t end);
 
-        void saveToVCF(const std::unordered_map<std::string, std::set<SVCall>>& sv_calls);
+        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls);
 
         void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment);
 
+        void updateBreakpointDepth(std::unordered_map<uint32_t, uint32_t>& breakpoint_depth, uint32_t start, uint32_t end);
+
     public:
         explicit SVCaller(InputData& input_data);
 
diff --git a/include/sv_object.h b/include/sv_object.h
index fb52691e..7f8b9d96 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -6,6 +6,7 @@
 #include <string>
 #include <set>
 #include <stdexcept>
+#include <unordered_map>
 
 // Struct to represent a structural variant call
 struct SVCall {
@@ -16,24 +17,25 @@ struct SVCall {
     std::string data_type = "NA";
     std::string genotype = "./.";
     double hmm_likelihood = 0.0;
-    int support = 0;
+    int support = 0;  // Exact breakpoint support
+    int total_support = 0;  // Support at either breakpoint
 
     // Comparison operator for std::set
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support) {}
+    SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support, int total_support) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support), total_support(total_support) {}
 };
 
-void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
 
-void mergeSVs(std::set<SVCall>& sv_calls);
+void mergeSVs(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32_t>& breakpoint_support);
 
-std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count);
+void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, std::unordered_map<uint32_t, uint32_t> &breakpoint_support, int min_support);
 
-uint32_t getSVCount(const std::set<SVCall>& sv_calls);
+uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 
-void concatenateSVCalls(std::set<SVCall>& sv_calls, const std::set<SVCall>& sv_calls_update);
+void concatenateSVCalls(std::vector<SVCall>& sv_calls, const std::vector<SVCall>& sv_calls_update);
 
 #endif // SV_OBJECT_H
diff --git a/python/sv_merger.py b/python/sv_merger.py
index b2d1491a..2f5cb94f 100644
--- a/python/sv_merger.py
+++ b/python/sv_merger.py
@@ -89,11 +89,11 @@ def update_support(record, cluster_size):
 
     return record
 
-def weighted_score(read_support, hmm_score, weight_hmm):
+def weighted_score(sv_len, hmm_score, weight_hmm):
     """
     Calculate a weighted score based on read support and HMM score.
     """
-    return (1 - weight_hmm) * read_support + weight_hmm * hmm_score
+    return (1 - weight_hmm) * sv_len + weight_hmm * hmm_score
 
 def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
     """
@@ -157,28 +157,8 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
 
     for label in unique_labels:
 
-        # Skip label -1 (outliers)
-        # if label == -1:
         # Skip label -1 (outliers) only if there are no other clusters
         if label == -1 and len(unique_labels) > 1:
-            # # Print the positions if any are within a certain range
-            # pos_min = 180915940
-            # pos_max = 180950356
-
-            # Debug if position is found
-            target_pos = 180949217
-
-            idx = cluster_labels == label
-            pos_values = breakpoints[idx][:, 0]
-            if target_pos in pos_values:
-                logging.info(f"Outlier deletion positions: {pos_values}")
-
-            # if (np.any(pos_values >= pos_min) and np.any(pos_values <= pos_max)):
-                # Print all within range
-                # pos_within_range = pos_values[(pos_values >= pos_min) & (pos_values <= pos_max)]
-                # logging.info(f"Outlier deletion positions: {pos_within_range}")
-                # logging.info(f"Outlier deletion positions: {pos_values}")
-
             continue
 
         # Get the indices of SVs with the same label
@@ -187,39 +167,47 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
         # Get HMM and read support values for the cluster
         # max_score_idx = 0  # Default to the first SV in the cluster
         cluster_hmm_scores = np.array(hmm_scores[idx])
-        cluster_depth_scores = np.array(sv_support[idx])
+        # cluster_depth_scores = np.array(sv_support[idx])
         cluster_sv_lengths = np.array(breakpoints[idx][:, 1] - breakpoints[idx][:, 0] + 1)
-        max_hmm = None
-        max_support = None
-        max_hmm_idx = None
-        max_support_idx = None
+        # max_hmm = None
+        # max_support = None
+        # max_hmm_idx = None
+        # max_support_idx = None
 
         # Find the maximum HMM score
-        if len(np.unique(cluster_hmm_scores)) > 1:
-            max_hmm_idx = np.nanargmax(cluster_hmm_scores)
-            max_hmm = cluster_hmm_scores[max_hmm_idx]
+        # if len(np.unique(cluster_hmm_scores)) > 1:
+        #     max_hmm_idx = np.nanargmax(cluster_hmm_scores)
+        #     max_hmm = cluster_hmm_scores[max_hmm_idx]
 
         # Find the maximum read alignment and clipped base support
-        if len(np.unique(cluster_depth_scores)) > 1:
-            max_support_idx = np.argmax(cluster_depth_scores)
-            max_support = cluster_depth_scores[max_support_idx]
+        # if len(np.unique(cluster_depth_scores)) > 1:
+        #     max_support_idx = np.argmax(cluster_depth_scores)
+        #     max_support = cluster_depth_scores[max_support_idx]
+
+        # Normalize the HMM scores. Since the HMM scores are negative (log lh), we
+        # normalize them to the range [0, 1] by subtracting the minimum value
+        cluster_hmm_norm = (cluster_hmm_scores - np.min(cluster_hmm_scores)) / (np.max(cluster_hmm_scores) - np.min(cluster_hmm_scores))
+
+        # Normalize the SV lengths to the range [0, 1]
+        cluster_sv_lengths_norm = (cluster_sv_lengths - np.min(cluster_sv_lengths)) / (np.max(cluster_sv_lengths) - np.min(cluster_sv_lengths))
 
         # Use a weighted approach to choose the best SV based on HMM and
         # support. Deletions have higher priority for HMM scores, while
         # insertions and duplications have higher priority for read alignment
         # support.
         # hmm_weight = 0.7 if sv_type == 'DEL' else 0.3
-        hmm_weight = 0.4
+        hmm_weight = 0.5
         max_score_idx = 0  # Default to the first SV in the cluster
-        max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], hmm_weight)
-        for k, hmm_loglh in enumerate(cluster_hmm_scores):
-            read_support = cluster_depth_scores[k]
-            score = weighted_score(read_support, hmm_loglh, hmm_weight)
+        max_score = weighted_score(cluster_sv_lengths_norm[max_score_idx], cluster_hmm_norm[max_score_idx], hmm_weight)
+        # max_score = weighted_score(cluster_sv_lengths[max_score_idx], cluster_hmm_scores[max_score_idx], hmm_weight)
+        for k, hmm_norm in enumerate(cluster_hmm_norm):
+            svlen_norm = cluster_sv_lengths_norm[k]
+            score = weighted_score(svlen_norm, hmm_norm, hmm_weight)
             if score > max_score:
                 max_score = score
                 max_score_idx = k
 
-        # Get the VCF record with the highest depth score
+        # Get the VCF record with the highest score
         max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :]
 
         # # For deletions, choose the SV with the highest HMM score if available
@@ -238,7 +226,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
         #         max_score_idx = max_hmm_idx
 
         # Get the VCF record with the highest depth score
-        max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :]
+        # max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :]
 
         # Get the number of SVs in this cluster
         cluster_size = np.sum(idx)
@@ -246,30 +234,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min):
 
         # Update the SUPPORT field in the INFO column
         max_record = update_support(max_record, cluster_size)
-
-        # Get all position values in the cluster
-        pos_values = breakpoints[idx][:, 0]
-
-        # Debug if position is found
-        target_pos = 180949217
-        if target_pos in pos_values:
-            logging.info(f"Cluster size: {cluster_size}")
-            logging.info(f"Pos values:")
-            for k, pos in enumerate(pos_values):
-                logging.info(f"Row {k+1} - Pos: {pos}, HMM: {cluster_hmm_scores[k]}, support: {cluster_depth_scores[k]}")
-
-            logging.info(f"Chosen position: {max_record['POS']} - HMM: {max_hmm}, support: {max_support}")
-
-        # # If the POS value is a certain value, plot the support
-        # pos_min = 180915940
-        # pos_max = 180950356
-        # # if (np.any(pos_values >= pos_min) and np.any(pos_values <= pos_max)) or cluster_size > 1000:
-        # if (np.any(pos_values >= pos_min) and np.any(pos_values <= pos_max)):
-        #     logging.info(f"Cluster size: {cluster_size}")
-        #     logging.info(f"Pos values:")
-        #     for k, pos in enumerate(pos_values):
-        #         logging.info(f"Row {k+1} - Pos: {pos}, HMM: {cluster_hmm_scores[k]}, support: {cluster_depth_scores[k]}")
-
+        # pos_values = breakpoints[idx][:, 0]
 
         # Append the chosen record to the dataframe of records that will
         # form the merged VCF file
@@ -322,16 +287,19 @@ def sv_merger(vcf_file_path, cluster_size_min=3, suffix='.merged'):
         del chr_del_df
 
         # Cluster insertions and duplications
-        logging.info("Clustering insertions and duplications on chromosome %s...", chromosome)
-        chr_ins_dup_df = vcf_df[(vcf_df['CHROM'] == chromosome) & ((vcf_df['INFO'].str.contains('SVTYPE=INS')) | (vcf_df['INFO'].str.contains('SVTYPE=DUP')))]
-        ins_dup_records = cluster_breakpoints(chr_ins_dup_df, 'INS/DUP', cluster_size_min)
-        del chr_ins_dup_df
+        logging.info("Clustering all other SVs on chromosome %s...", chromosome)
+        # chr_ins_dup_df = vcf_df[(vcf_df['CHROM'] == chromosome) &
+        # ((vcf_df['INFO'].str.contains('SVTYPE=INS')) |
+        # (vcf_df['INFO'].str.contains('SVTYPE=DUP')))]
+        chr_non_del_df = vcf_df[(vcf_df['CHROM'] == chromosome) & (~vcf_df['INFO'].str.contains('SVTYPE=DEL'))]
+        ins_dup_records = cluster_breakpoints(chr_non_del_df, 'INS/DUP', cluster_size_min)
+        del chr_non_del_df
 
         # Summarize the number of deletions and insertions/duplications
         del_count = del_records.shape[0]
         ins_dup_count = ins_dup_records.shape[0]
         records_processed += del_count + ins_dup_count
-        logging.info("Chromosome %s - %d deletions, %d insertions, and duplications merged.", chromosome, del_count, ins_dup_count)
+        logging.info("Chromosome %s - %d deletions, %d other types merged.", chromosome, del_count, ins_dup_count)
 
         # Append the deletion and insertion/duplication records to the merged
         # records DataFrame
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 5f1c2e1d..35e36e3b 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -208,18 +208,11 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set<SVCall> &sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall> &sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
 {
+    int min_length = this->input_data.getMinCNVLength();
     int window_size = this->input_data.getWindowSize();
-    // double mean_chr_cov = this->mean_chr_cov;  
-    // printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "...");
-    runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov, pos_depth_map);
-    // printMessage("Finished predicting copy number states for chromosome " + chr + "...");
-}
 
-void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCall>& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
-{
-    // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "...");
     // Map with counts for each CNV type
     std::map<int, int> cnv_type_counts;
     for (int i = 0; i < 6; i++)
@@ -228,7 +221,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
     }
     
     // Loop through each SV candidate and predict the copy number state
-    for (auto& sv_call : sv_chunk)
+    for (auto& sv_call : sv_candidates)
     {
 
         // Get the SV candidate
@@ -243,7 +236,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
         }
 
         // Skip if not the minimum length for CNV predictions
-        if ((end_pos - start_pos) < (uint32_t)this->input_data.getMinCNVLength())
+        if ((end_pos - start_pos) < (uint32_t) min_length)
         {
             continue;
         }
@@ -322,10 +315,15 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set<SVCa
 
         // Update the SV copy number data if not unknown
         // printMessage("Updating SV copy number data for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
-        if (updated_sv_type != SVType::UNKNOWN)
+        if (updated_sv_type != SVType::UNKNOWN && updated_sv_type != SVType::NEUTRAL)
         {
             std::string sv_type_str = getSVTypeString(updated_sv_type);
-            addSVCall(sv_chunk, sv_call.start, sv_call.end, sv_type_str, ".", data_type, genotype, likelihood);
+            sv_call.sv_type = sv_type_str;
+            sv_call.data_type = data_type;
+            sv_call.genotype = genotype;
+            sv_call.hmm_likelihood = likelihood;
+            sv_call.support = 1;
+            // addSVCall(sv_chunk, sv_call.start, sv_call.end, sv_type_str, ".", data_type, genotype, likelihood);
         }
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
diff --git a/src/khmm.cpp b/src/khmm.cpp
index d375a1a0..22d4a269 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -285,10 +285,6 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	// Threshold any zero values to avoid calculation issues.
 	for (i = 1; i <= hmm.N; i++)
 	{
-		// if (hmm.pi[i] == 0)
-		// 	hmm.pi[i] = 1e-9; /*eliminate problems with zero probability*/
-		// hmm.pi[i] = log(hmm.pi[i]);  // Convert to log probability due to underflow
-
 		// Update to 0-based indexing
 		if (hmm.pi[i-1] == 0) {
 			hmm.pi[i-1] = 1e-9; /*eliminate problems with zero probability*/
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 8918a131..51a2a332 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -37,11 +37,7 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
     return ret;
 }
 
-// RegionData SVCaller::detectSVsFromRegion(std::string region)
-// std::tuple<std::set<SVCall>, PrimaryMap, SuppMap>
-// SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
-// const std::string& region)
-void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments)
+void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -61,10 +57,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
     }
 
     // Main loop to process the alignments
-    // std::set<SVCall> sv_calls;
     int num_alignments = 0;
-    // PrimaryMap primary_alignments;
-    // SuppMap supplementary_alignments;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
@@ -78,17 +71,27 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
 
             // Get the primary alignment information
             std::string chr = bamHdr->target_name[bam1->core.tid];
-            int64_t start = bam1->core.pos;
-            int64_t end = bam_endpos(bam1);  // This is the first position after the alignment
+            uint32_t start = (uint32_t)bam1->core.pos;
+            uint32_t end = (uint32_t)bam_endpos(bam1);  // This is the first position after the alignment
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
+            // Check for underflow
+            if (start > 4000000000 || end > 4000000000) {
+                throw std::runtime_error("ERROR: Integer underflow for alignment at position " + std::to_string(start) + "-" + std::to_string(end));
+            }
+
             // Call SVs directly from the CIGAR string
-            std::tuple<std::vector<int>, int32_t, int32_t> query_info;
-            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true);
+            std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
+            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, breakpoint_depth);
             // std::tuple<std::vector<int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
             const std::vector<int>& match_map = std::get<0>(query_info);
-            int32_t query_start = std::get<1>(query_info);
-            int32_t query_end = std::get<2>(query_info);
+            uint32_t query_start = std::get<1>(query_info);
+            uint32_t query_end = std::get<2>(query_info);
+
+            // Check for underflow
+            if (query_start > 4000000000 || query_end > 4000000000) {
+                throw std::runtime_error("ERROR: Integer underflow for query at position " + std::to_string(query_start) + "-" + std::to_string(query_end));
+            }
 
             // Add the primary alignment to the map
             AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
@@ -99,18 +102,18 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
 
             // Get the supplementary alignment information
             std::string chr = bamHdr->target_name[bam1->core.tid];
-            int32_t start = bam1->core.pos;
-            int32_t end = bam_endpos(bam1);
+            uint32_t start = bam1->core.pos;
+            uint32_t end = bam_endpos(bam1);
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
             // Get CIGAR string information, but don't call SVs
             // std::tuple<std::vector<int>, int32_t, int32_t> query_info =
             // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
-            std::tuple<std::vector<int>, int32_t, int32_t> query_info;
-            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false);
+            std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
+            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, breakpoint_depth);
             const std::vector<int>& match_map = std::get<0>(query_info);
-            int32_t query_start = std::get<1>(query_info);
-            int32_t query_end = std::get<2>(query_info);
+            uint32_t query_start = std::get<1>(query_info);
+            uint32_t query_end = std::get<2>(query_info);
 
             // Add the supplementary alignment to the map
             AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
@@ -149,29 +152,17 @@ double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map, int
     }
 
     double mismatch_rate = static_cast<double>(mismatch_count) / static_cast<double>(match_count + mismatch_count);
-    // int match_count = 0;
-    // int mismatch_count = 0;
-    // for (int i = start; i <= end; i++) {
-    //     if (match_map.find(i) != match_map.end()) {
-    //         if (match_map[i] == 1) {
-    //             match_count++;
-    //         } else {
-    //             mismatch_count++;
-    //         }
-    //     }
-    // }
-    // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count);
 
     return mismatch_rate;
 }
 
-void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set<SVCall>& sv_calls, std::tuple<std::vector<int>, int32_t, int32_t>& query_info, bool is_primary)
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
-    int32_t pos = alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
+    uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
-    int query_pos = 0;
+    uint32_t query_pos = 0;
     // std::unordered_map<int, int> query_match_map;  // Query position to
     // match/mismatch (1/0) map
     std::vector<int> query_match_map(alignment->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
@@ -180,10 +171,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set
     // only), update clipped base support, calculate sequence identity for
     // potential duplications (primary only), and calculate
     // the clipped base support and mismatch rate
-    int32_t ref_pos;
-    int32_t ref_end;
-    int32_t query_start = 0;  // First alignment position in the query
-    int32_t query_end = 0;    // Last alignment position in the query
+    uint32_t ref_pos;
+    uint32_t ref_end;
+    uint32_t query_start = 0;  // First alignment position in the query
+    uint32_t query_end = 0;    // Last alignment position in the query
     bool first_op = false;  // First alignment operation for the query
     double default_lh = 0.0;
     for (int i = 0; i < cigar_len; i++) {
@@ -209,9 +200,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set
                 // position.
                 bool is_duplication = false;
                 int ins_ref_pos;
-                int dup_start = std::max(0, pos - op_len);
+                uint32_t dup_start = std::max(0, (int)pos - op_len);
+                // int dup_start = std::max(0, pos - op_len);
                 // for (int j = pos - op_len; j <= pos; j++) {
-                for (int j = dup_start; j <= pos; j++) {
+                for (uint32_t j = dup_start; j <= pos; j++) {
 
                     // Get the string for the window (1-based coordinates)
                     ins_ref_pos = j + 1;
@@ -250,10 +242,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 if (is_duplication) {
-                    addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh);
+                    addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh);
                 } else {
-                    addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh);
+                    addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh);
                 }
+                this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end);
             }
 
         // Check if the CIGAR operation is a deletion
@@ -264,7 +257,8 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set
             {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh);
+                addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh);
+                this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end);
             }
 
         // Check if the CIGAR operation is a clipped base
@@ -340,12 +334,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set
 
     query_end = query_pos;  // Last alignment position in the query
 
-    query_info = std::tuple<std::vector<int>, int32_t, int32_t>(std::move(query_match_map), query_start, query_end);
-
-    // return std::tuple<std::vector<int>, int32_t, int32_t>(query_match_map, query_start, query_end);
+    query_info = std::tuple<std::vector<int>, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end);
 }
 
-void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set<SVCall>& combined_sv_calls, int min_cnv_length)
+void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, int min_cnv_length)
 {
     // Open the BAM file
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
@@ -408,19 +400,20 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     int region_count = region_chunks.size();
     int current_region = 0;
     // std::set<SVCall> combined_sv_calls;
+    std::unordered_map<uint32_t, uint32_t> breakpoint_depth;
     for (const auto& sub_region : region_chunks) {
         current_region++;
         printMessage(chr + ": CIGAR SVs...");
         PrimaryMap primary_map;
         SuppMap supp_map;
-        std::set<SVCall> subregion_sv_calls;
-        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map);
+        std::vector<SVCall> subregion_sv_calls;
+        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, breakpoint_depth);
         // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
         // PrimaryMap& primary_map = std::get<1>(region_data);
         // SuppMap& supp_map = std::get<2>(region_data);
         // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging CIGAR...");
-        mergeSVs(subregion_sv_calls);
+        mergeSVs(subregion_sv_calls, breakpoint_depth);
         int region_sv_count = getSVCount(subregion_sv_calls);
         // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
@@ -429,18 +422,18 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         if (region_sv_count > 0) {
             // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
             printMessage(chr + ": CIGAR predictions...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm, chr_data.first, chr_data.second);
+            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, chr_data.first, chr_data.second);
         }
 
         // Run split-read SV and copy number variant predictions
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
         printMessage(chr + ": Split read SVs...");
-        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second);
+        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second, breakpoint_depth);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging split reads...");
-        mergeSVs(subregion_sv_calls);
+        mergeSVs(subregion_sv_calls, breakpoint_depth);
 
         // Combine the SV calls from the current region
         // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
@@ -450,6 +443,13 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "...");
     }
 
+    // Run a final merge on the combined SV calls
+    printMessage(chr + ": Merging final calls...");
+    mergeSVs(combined_sv_calls, breakpoint_depth);
+
+    // Insert breakpoint support and filter SVs with low support
+    filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5);
+
     // Clean up the BAM file, header, and index
     hts_idx_destroy(idx);
     bam_hdr_destroy(bamHdr);
@@ -480,7 +480,7 @@ void SVCaller::run()
     const int max_threads = this->input_data.getThreadCount();
     std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
     std::vector<std::future<void>> futures;
-    std::unordered_map<std::string, std::set<SVCall>> whole_genome_sv_calls;
+    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;
     std::condition_variable cv;
     int active_threads = 0;
@@ -488,7 +488,7 @@ void SVCaller::run()
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
         // printMessage("Launching thread for chromosome " + chr + "...");
-        std::set<SVCall> sv_calls;
+        std::vector<SVCall> sv_calls;
         this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
         {
             std::lock_guard<std::mutex> lock(sv_mutex);
@@ -548,7 +548,7 @@ void SVCaller::run()
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
+void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth)
 {
     // Find split-read SV evidence
     int sv_count = 0;
@@ -557,8 +557,8 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         std::string qname = entry.first;
         AlignmentData primary_alignment = entry.second;
         std::string primary_chr = std::get<0>(primary_alignment);
-        int32_t primary_start = std::get<1>(primary_alignment);
-        int32_t primary_end = std::get<2>(primary_alignment);
+        uint32_t primary_start = std::get<1>(primary_alignment);
+        uint32_t primary_end = std::get<2>(primary_alignment);
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
@@ -567,15 +567,15 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
 
         // Find the largest supplementary alignment, and also identify inversions
         AlignmentData largest_supp_alignment = supp_map[qname][0];
-        int32_t largest_supp_length = 0;
+        uint32_t largest_supp_length = 0;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
             const auto& supp_chr = std::get<0>(*it);
             if (primary_chr != supp_chr) {
                 continue;  // Skip supplementary alignments on different chromosomes
             }
-            int32_t supp_start = std::get<1>(*it);
-            int32_t supp_end = std::get<2>(*it);
-            int32_t supp_length = supp_end - supp_start + 1;
+            uint32_t supp_start = std::get<1>(*it);
+            uint32_t supp_end = std::get<2>(*it);
+            uint32_t supp_length = supp_end - supp_start + 1;
             if (supp_length > largest_supp_length) {
                 largest_supp_length = supp_length;
                 largest_supp_alignment = *it;
@@ -585,29 +585,31 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
             bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it);
             if (is_opposite_strand) {
                 if (supp_length >= min_cnv_length) {
-                    // SVCandidate sv_candidate(supp_start+1, supp_end+1, ".");
+                    // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1));
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
                     if (supp_type == SVType::NEUTRAL) {
-                        addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "HMM", "./.", supp_lh);
+                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh);
+                        this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1);
+                        
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
-                        addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INVDUP", ".", "HMM", "./.", supp_lh);
-                        sv_count++;
+                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh);
+                        this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1);
                     }
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
-                    addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "REV", "./.", 0.0);
-                    sv_count++;
+                    addSVCall(sv_calls, supp_start+1, (supp_end+1), "INV", ".", "REV", "./.", 0.0);
+                    this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1);
                 }
             }
         }
 
         // Trim overlapping alignments
-        int32_t supp_start = std::get<1>(largest_supp_alignment);
-        int32_t supp_end = std::get<2>(largest_supp_alignment);
+        uint32_t supp_start = std::get<1>(largest_supp_alignment);
+        uint32_t supp_end = std::get<2>(largest_supp_alignment);
         bool primary_before_supp = primary_start < supp_start;
         trimOverlappingAlignments(primary_alignment, largest_supp_alignment);
 
@@ -617,7 +619,7 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         primary_start = std::get<1>(primary_alignment);
         primary_end = std::get<2>(primary_alignment);
         bool gap_exists = false;
-        int32_t boundary_left, boundary_right, gap_left, gap_right;
+        uint32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_before_supp) {
             boundary_left = primary_start+1;
             boundary_right = supp_end+1;
@@ -634,42 +636,37 @@ void SVCaller::detectSVsFromSplitReads(std::set<SVCall>& sv_calls, PrimaryMap& p
         
         // Run copy number variant predictions on the boundary if large enough
         if (boundary_right - boundary_left >= min_cnv_length) {
-            // split_boundary = SVCandidate(boundary_left, boundary_right, ".");
+            // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             double bd_lh = std::get<0>(bd_result);
             SVType bd_type = std::get<1>(bd_result);
 
             // Run copy number variant predictions on the gap if it exists
             if (gap_exists && gap_right - gap_left >= min_cnv_length) {
-                // split_gap = SVCandidate(gap_left, gap_right, ".");
+                // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
                 double gap_lh = std::get<0>(gap_result);
                 SVType gap_type = std::get<1>(gap_result);
 
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
-                    addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh);
-                    sv_count++;
+                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh);
+                    this->updateBreakpointDepth(breakpoint_depth, gap_left, gap_right);
                 } else {
                     // Add the boundary as the SV call
-                    addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
-                    sv_count++;
+                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
+                    this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right);
                 }
             } else {
                 // Add the boundary as the SV call
-                addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
-                sv_count++;
+                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
+                this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right);
             }
         }
     }
-
-    // Print the number of SVs detected from split-read alignments
-    // if (sv_count > 0) {
-    //     std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl;
-    // }
 }
 
-void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall> >& sv_calls)
+void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls)
 {
     std::cout << "Creating VCF writer..." << std::endl;
     // std::string output_vcf = output_dir + "/output.vcf";
@@ -701,8 +698,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
         "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
         "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
         "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Alignment type used to call the structural variant\">",
-        "##INFO=<ID=CLIPSUP,Number=1,Type=Integer,Description=\"Clipped base support at the start and end positions\">",
-        "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
+        "##INFO=<ID=BPSUP1,Number=1,Type=Integer,Description=\"Number of reads supporting either breakpoint\">",
+        "##INFO=<ID=BPSUP2,Number=1,Type=Integer,Description=\"Number of reads supporting the exact breakpoints\">",
         "##INFO=<ID=REPTYPE,Number=1,Type=String,Description=\"Repeat type\">",
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
@@ -748,7 +745,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
     int total_count = 0;
     for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
-        const std::set<SVCall>& sv_calls = pair.second;
+        const std::vector<SVCall>& sv_calls = pair.second;
         std::cout << "Saving SV calls for " << chr << "..." << std::endl;
         for (const auto& sv_call : sv_calls) {
             // Get the SV candidate and SV info
@@ -763,7 +760,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
             if (sv_type_str == "DEL") {
             	sv_length++;
         	}
-            int read_support = sv_call.support;
+            int bp_support = sv_call.support;
+            int total_bp_support = sv_call.total_support;
             int read_depth = 0;
             // SVType sv_type = sv_call.sv_type;
             // SVCandidate candidate = sv_call.first;
@@ -845,13 +843,14 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::set<SVCall>
             }
 
             // Create the VCF parameter strings
-            // int clipped_base_support = this->getClippedBaseSupport(chr, pos,
-            // end);
-            int clipped_base_support = 0;
-            // std::string sv_type_str = getSVTypeString(sv_type);
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
+            //     ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
+            //     ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
+            //     ";REPTYPE=" + repeat_type + ";HMM=" +
+            //     std::to_string(hmm_likelihood);
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-                ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
-                ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
+                ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
+                ";BPSUP1=" + std::to_string(total_bp_support) + ";BPSUP2=" + std::to_string(bp_support) + \
                 ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood);
                 
             std::string format_str = "GT:DP";
@@ -875,28 +874,18 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
 {
     // Get the start and end read positions for the primary and supplementary
     // alignments
-    int32_t primary_alignment_start = std::get<1>(primary_alignment);
-    int32_t primary_alignment_end = std::get<2>(primary_alignment);
-    int32_t supp_alignment_start = std::get<1>(supp_alignment);
-    int32_t supp_alignment_end = std::get<2>(supp_alignment);
-    int32_t primary_query_start = std::get<3>(primary_alignment);
-    int32_t primary_query_end = std::get<4>(primary_alignment);
-    int32_t supp_query_start = std::get<3>(supp_alignment);
-    int32_t supp_query_end = std::get<4>(supp_alignment);
+    uint32_t primary_alignment_start = std::get<1>(primary_alignment);
+    uint32_t primary_alignment_end = std::get<2>(primary_alignment);
+    uint32_t supp_alignment_start = std::get<1>(supp_alignment);
+    uint32_t supp_alignment_end = std::get<2>(supp_alignment);
+    uint32_t primary_query_start = std::get<3>(primary_alignment);
+    uint32_t primary_query_end = std::get<4>(primary_alignment);
+    uint32_t supp_query_start = std::get<3>(supp_alignment);
+    uint32_t supp_query_end = std::get<4>(supp_alignment);
     const std::vector<int>& primary_match_map = std::get<5>(primary_alignment);
     const std::vector<int>& supp_match_map = std::get<5>(supp_alignment);
-    // int32_t primary_query_start = std::get<4>(primary_alignment);
-    // int32_t primary_query_end = std::get<5>(primary_alignment);
-    // int32_t supp_query_start = std::get<4>(supp_alignment);
-    // int32_t supp_query_end = std::get<5>(supp_alignment);
-    // const std::vector<int>& primary_match_map = std::get<6>(primary_alignment);
-    // const std::vector<int>& supp_match_map = std::get<6>(supp_alignment);
-    // int32_t primary_alignment_start = std::get<1>(primary_alignment);
-    // int32_t primary_alignment_end = std::get<2>(primary_alignment);
-    // int32_t supp_alignment_start = std::get<1>(supp_alignment);
-    // int32_t supp_alignment_end = std::get<2>(supp_alignment);
-
-    // Check if the alignments overlap
+
+    // Check for overlapping read alignments
     bool primary_before_supp = primary_query_start < supp_query_start;
     if (primary_before_supp) {
         // Primary before supplementary in the query
@@ -904,15 +893,19 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
             // Calculate the mismatch rates at the overlapping region
             double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, supp_query_start, primary_query_end);
             double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, supp_query_start, primary_query_end);
-            int32_t overlap_length = primary_query_end - supp_query_start + 1;
+            uint32_t overlap_length = primary_query_end - supp_query_start + 1;
 
             // Trim the ailgnment with the higher mismatch rate
             if (primary_mismatch_rate > supp_mismatch_rate) {
                 // Trim the end of the primary alignment
-                std::get<2>(primary_alignment) = primary_alignment_end - overlap_length;
+                uint32_t new_end = primary_alignment_end > overlap_length ? primary_alignment_end - overlap_length : 0;
+                std::get<2>(primary_alignment) = new_end;
+                // std::get<2>(primary_alignment) = primary_alignment_end - overlap_length;
             } else {
                 // Trim the beginning of the supplementary alignment
-                std::get<1>(supp_alignment) = supp_alignment_start + overlap_length;
+                uint32_t new_start = supp_alignment_start + overlap_length;
+                std::get<1>(supp_alignment) = new_start;
+                // std::get<1>(supp_alignment) = supp_alignment_start + overlap_length;
             }
         }
     } else {
@@ -921,16 +914,25 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
             // Calculate the mismatch rates at the overlapping region
             double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end);
             double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, primary_query_start, supp_query_end);
-            int32_t overlap_length = supp_query_end - primary_query_start + 1;
+            uint32_t overlap_length = supp_query_end - primary_query_start + 1;
 
             // Trim the ailgnment with the higher mismatch rate
             if (supp_mismatch_rate > primary_mismatch_rate) {
                 // Trim the end of the supplementary alignment
-                std::get<2>(supp_alignment) = supp_alignment_end - overlap_length;
+                uint32_t new_end = supp_alignment_end > overlap_length ? supp_alignment_end - overlap_length : 0;  // clamp at 0 so the unsigned subtraction cannot wrap
+                std::get<2>(supp_alignment) = new_end;  // write the trimmed end back; it was computed but never stored, leaving the supplementary alignment untrimmed
             } else {
                 // Trim the beginning of the primary alignment
-                std::get<1>(primary_alignment) = primary_alignment_start + overlap_length;
+                uint32_t new_start = primary_alignment_start + overlap_length;
+                std::get<1>(primary_alignment) = new_start;
+                // std::get<1>(primary_alignment) = primary_alignment_start + overlap_length;
             }
         }
     }
 }
+
+void SVCaller::updateBreakpointDepth(std::unordered_map<uint32_t, uint32_t> &breakpoint_depth, uint32_t start, uint32_t end)
+{
+    breakpoint_depth[start] += 1;  // operator[] value-initializes a missing position to 0 before the increment
+    breakpoint_depth[end] += 1;    // counted independently of start, so start == end adds 2 at that position
+}
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 9af58235..9efa9ca2 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -14,10 +14,15 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
 {
+    // Catch underflow errors
+    if (start > 4000000000 || end > 4000000000) {
+        throw std::runtime_error("ERROR: Integer underflow for SV call at position " + std::to_string(start) + "-" + std::to_string(end));
+    }
+
     // Ignore unknown SV types
-    if (sv_type == "UNKNOWN") {
+    if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
         return;
     }
     
@@ -25,66 +30,74 @@ void addSVCall(std::set<SVCall>& sv_calls, uint32_t start, uint32_t end, std::st
         throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
     }
 
-    // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type);
-    sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
-}
+    // Insert the SV call in sorted order
+    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1, 0};
+    auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
+    // sv_calls.insert(it, sv_call);
 
-std::vector<std::set<SVCall>> splitSVsIntoChunks(std::set<SVCall>& sv_calls, int chunk_count)
-{
-    // Split the SV calls into chunks
-    std::vector<std::set<SVCall>> sv_chunks;
-    int sv_count = (int) sv_calls.size();
-    int chunk_size = std::ceil((double) sv_count / (double) chunk_count);
-    int current_chunk = 0;
-    std::set<SVCall> current_sv_chunk;
-    for (const auto& sv_call : sv_calls)
+    // Update the SV type if the SV call already exists (if likelihood is
+    // higher)
+    if (it != sv_calls.end() && it->start == start && it->end == end)
     {
-        current_sv_chunk.insert(sv_call);
-
-        // If the current chunk size is reached, then add the chunk to the
-        // vector and reset the current chunk
-        if ((int) current_sv_chunk.size() == chunk_size)
+        if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
         {
-            // sv_chunks.insert(current_sv_chunk);
-            sv_chunks.push_back(current_sv_chunk);
-            current_sv_chunk.clear();
-            current_chunk++;
+            // Update the SV call
+            it->sv_type = sv_type;
+            it->data_type = data_type;
+            it->genotype = genotype;
+            it->hmm_likelihood = hmm_likelihood;
+            it->support++;  // Update support
+        } else {
+            it->support++;  // Update support
         }
+    } else {
+        sv_calls.insert(it, sv_call);  // Insert the new SV call
     }
 
-    // Add the last chunk if it is not empty
-    if (!current_sv_chunk.empty())
+    // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type);
+    // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
+}
+
+void updateSVType(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood)
+{
+    // Overwrite sv_type, data_type, genotype and hmm_likelihood of the call located at exactly (start, end); other fields are left untouched
+    auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall{start, end, "", "", "", "", 0.0, 0, 0});  // binary search with SVCall::operator< (start, then end); sv_calls must already be in that order
+    if (it != sv_calls.end() && it->start == start && it->end == end)
     {
-        sv_chunks.push_back(current_sv_chunk);
-        // sv_chunks.insert(current_sv_chunk);
+        it->sv_type = sv_type;
+        it->data_type = data_type;
+        it->genotype = genotype;
+        it->hmm_likelihood = hmm_likelihood;  // replaced unconditionally, with no comparison against the stored likelihood
+    } else {
+        throw std::runtime_error("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end));
     }
-
-    return sv_chunks;
 }
 
-uint32_t getSVCount(const std::set<SVCall>& sv_calls)
+uint32_t getSVCount(const std::vector<SVCall>& sv_calls)  // returns sv_calls.size() cast to uint32_t
 {
     return (uint32_t) sv_calls.size();
 }
 
-void concatenateSVCalls(std::set<SVCall> &target, const std::set<SVCall> &source)
+void concatenateSVCalls(std::vector<SVCall> &target, const std::vector<SVCall>& source)
 {
     // Efficiently concatenate two sets of SV calls
-    target.insert(source.begin(), source.end());
+    target.insert(target.end(), source.begin(), source.end());  // append a copy of every call in source; source itself is unchanged
+    std::stable_sort(target.begin(), target.end());  // restore SVCall::operator< (start, end) order, which addSVCall's lower_bound and mergeSVs's linear sweep rely on; the old std::set kept it implicitly
 }
 
-void mergeSVs(std::set<SVCall>& sv_calls) {
+void mergeSVs(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32_t>& breakpoint_support)
+{
     if (sv_calls.size() < 2) {
         return;
     }
 
-    // Merge SV calls if they overlap by at least 50%
+    // Merge SV calls if they overlap
     int initial_size = sv_calls.size();
     std::vector<SVCall> merged_sv_calls;
     auto it = sv_calls.begin();
     SVCall current_merge = *it++;
     for (; it != sv_calls.end(); ++it) {
-        const SVCall& next = *it;
+        SVCall& next = *it;
 
         // Find overlap
         if (next.start <= current_merge.end) {
@@ -94,30 +107,63 @@ void mergeSVs(std::set<SVCall>& sv_calls) {
             }
 
             // Merge the SV calls based on HMM log likelihood (keep the higher
-            // likelihood), 0.0 indicates no likelihood
-            if (next.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) {
+            // likelihood), 0.0 indicates no likelihood (Also update support)
+            if (next.hmm_likelihood != 0.0) {
+                if (next.hmm_likelihood > current_merge.hmm_likelihood) {
+                    current_merge = next;  // Continue with the next call
+                }
+
+            // Merge based on support
+            } else if (next.support > current_merge.support) {
                 current_merge = next;  // Continue with the next call
+
+            } else {
+                // Merge based on breakpoint depth
+                uint32_t next_depth = breakpoint_support[next.start] + breakpoint_support[next.end];
+                uint32_t current_depth = breakpoint_support[current_merge.start] + breakpoint_support[current_merge.end];
+                if (next_depth > current_depth) {
+                    current_merge = next;  // Continue with the next call
+                
+                // Merge based on SV length
+                } else if (next.end - next.start > current_merge.end - current_merge.start) {
+                    current_merge = next;  // Continue with the next call
+                }
             }
 
         } else {
             // No overlap: Save the previous SV and continue
-            merged_sv_calls.push_back(current_merge);
+            merged_sv_calls.emplace_back(current_merge);
             current_merge = next;
         }
     }
 
     // Add the last merged SV call
     // printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood));
-    merged_sv_calls.push_back(current_merge);
+    merged_sv_calls.emplace_back(current_merge);
 
     // Replace contents of the SV calls
-    sv_calls = std::set<SVCall>(merged_sv_calls.begin(), merged_sv_calls.end());
-
-    // // Update the SV calls
-    // sv_calls.clear();
-    // for (const auto& sv_call : merged_sv_calls) {
-    //     sv_calls.insert(sv_call);
-    // }
+    sv_calls = merged_sv_calls;
+    
     int updated_size = sv_calls.size();
     std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }
+
+void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32_t>& breakpoint_support, int min_support)
+{
+    // Insert breakpoint support for each SV call, and remove SV calls with low
+    // support
+    int prev_size = sv_calls.size();
+    for (auto& sv_call : sv_calls)
+    {
+        sv_call.total_support = breakpoint_support[sv_call.start] + breakpoint_support[sv_call.end];
+        printMessage("SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with support " + std::to_string(sv_call.total_support) + " and likelihood " + std::to_string(sv_call.hmm_likelihood) + " and length " + std::to_string(sv_call.end - sv_call.start));
+    }
+
+    // Remove SV calls with low support, unless they are large (> 20 kb)
+    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {
+        return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000);
+    }), sv_calls.end());
+
+    int updated_size = sv_calls.size();
+    printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with support >= " + std::to_string(min_support));
+}

From 3458bc8e3a3a773885da0d97f0719d27e6a0e712 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 2 Dec 2024 01:49:03 -0500
Subject: [PATCH 039/134] add support back and improve merging

---
 include/cnv_caller.h |  10 +--
 include/sv_caller.h  |  10 +--
 include/sv_object.h  |  14 ++--
 src/cnv_caller.cpp   |  78 ++++++++++---------
 src/sv_caller.cpp    | 176 +++++++++++++++++++++----------------------
 src/sv_object.cpp    | 123 +++++++++++++++++-------------
 6 files changed, 212 insertions(+), 199 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 3663d184..b36d414f 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -73,10 +73,10 @@ class CNVCaller {
 
         void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
-        std::pair<std::vector<int>, double> runViterbi(const CHMM& hmm, SNPData& snp_data);
+        void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction);
 
         // Query a region for SNPs and return the SNP data
-        std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
+        std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
 
         void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb);
 
@@ -88,17 +88,17 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map);
+        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mean chromosome coverage
         std::pair<double, std::vector<uint32_t>> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len);
 
         // Calculate the log2 ratio for a region given the read depths and mean
         // chromosome coverage
-        double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
+        double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
 
         void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf);
         void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map);
diff --git a/include/sv_caller.h b/include/sv_caller.h
index b4f6eaac..82a833f3 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -34,32 +34,32 @@ class SVCaller {
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth);
+        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector<uint32_t>& pos_depth_map);
 
         void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, int min_cnv_length);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth);
+        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector<uint32_t>& pos_depth_map);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth);
+        void detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
-        // double calculateMismatchRate(std::unordered_map<int, int>& mismatch_map, int32_t start, int32_t end);
         double calculateMismatchRate(const std::vector<int>& mismatch_map, int32_t start, int32_t end);
 
         void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls);
 
         void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment);
 
-        void updateBreakpointDepth(std::unordered_map<uint32_t, uint32_t>& breakpoint_depth, uint32_t start, uint32_t end);
+        // Calculate the read depth (INFO/DP) for a region
+        int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
 
     public:
         explicit SVCaller(InputData& input_data);
diff --git a/include/sv_object.h b/include/sv_object.h
index 7f8b9d96..e36e8624 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -17,22 +17,22 @@ struct SVCall {
     std::string data_type = "NA";
     std::string genotype = "./.";
     double hmm_likelihood = 0.0;
-    int support = 0;  // Exact breakpoint support
-    int total_support = 0;  // Support at either breakpoint
+    int read_depth = 0;  // Breakpoint depth
+    int support = 0;  // Number of supporting reads
 
     // Comparison operator for std::set
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support, int total_support) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support), total_support(support) {}
+    SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
 };
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood);
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
 
-void mergeSVs(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32_t>& breakpoint_support);
+void mergeSVs(std::vector<SVCall>& sv_calls);
 
-void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, std::unordered_map<uint32_t, uint32_t> &breakpoint_support, int min_support);
+void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth);
 
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 35e36e3b..9002e2b6 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -41,19 +41,18 @@ CNVCaller::CNVCaller(InputData &input_data)
 }
 
 // Function to call the Viterbi algorithm for the CHMM
-std::pair<std::vector<int>, double> CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data)
+void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction)
 {
     int data_count = (int) snp_data.pos.size();
     if (data_count == 0)
     {
         throw std::runtime_error("Error: No SNP data found for Viterbi algorithm.");
     }
-    std::pair<std::vector<int>, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb);
-    return state_sequence;
+    prediction = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb);
 }
 
 // Function to obtain SNP information for a region
-std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
+std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
 {
     SNPData snp_data;
     bool snps_found = false;
@@ -136,13 +135,28 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
     return std::make_pair(snp_data, snps_found);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
+    // Check that the start position is less than the end position
+    if (start_pos >= end_pos)
+    {
+        throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+    }
+
     // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
     // the SV length
-    uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-    uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
-    uint32_t snp_end_pos = end_pos + sv_half_length;
+    // Only extend the region if "save CNV data" is enabled
+    uint32_t snp_start_pos = start_pos;
+    uint32_t snp_end_pos = end_pos;
+    if (this->input_data.getSaveCNVData())
+    {
+        uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
+        snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
+        snp_end_pos = end_pos + sv_half_length;
+    }
+    // uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
+    // uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
+    // uint32_t snp_end_pos = end_pos + sv_half_length;
 
     // Query the SNP region for the SV candidate
     std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
@@ -151,7 +165,8 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Run the Viterbi algorithm
     // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")...");
-    std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
+    std::pair<std::vector<int>, double> prediction;
+    runViterbi(hmm, sv_snps, prediction);
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
 
@@ -208,7 +223,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall> &sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall> &sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
     int min_length = this->input_data.getMinCNVLength();
     int window_size = this->input_data.getWindowSize();
@@ -241,27 +256,29 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             continue;
         }
 
-        // Get the depth at the start position. This is used as the FORMAT/DP
-        // value in the VCF file
-        // int dp_value = pos_depth_map[start_pos];
-        // this->updateDPValue(sv_candidates, sv_call, dp_value);
-
         // Loop through the SV region +/- 1/2 SV length and run copy number
         // predictions
-        uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-        uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
-        uint32_t snp_end_pos = end_pos + sv_half_length;
+        // Only extend the region if "save CNV data" is enabled
+        uint32_t snp_start_pos = start_pos;
+        uint32_t snp_end_pos = end_pos;
+        if (this->input_data.getSaveCNVData())
+        {
+            uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
+            snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
+            snp_end_pos = end_pos + sv_half_length;
+        }
         std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
         SNPData& sv_snps = snp_call.first;
         bool snps_found = snp_call.second;
 
-        // Run the Viterbi algorithm        
+        // Run the Viterbi algorithm
         if (sv_snps.pos.size() == 0) {
         	std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
         	continue;
         }
         
-        std::pair<std::vector<int>, double> prediction = runViterbi(hmm, sv_snps);
+        std::pair<std::vector<int>, double> prediction;
+        runViterbi(hmm, sv_snps, prediction);
         std::vector<int>& state_sequence = prediction.first;
         double likelihood = prediction.second;
         // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
@@ -322,8 +339,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             sv_call.data_type = data_type;
             sv_call.genotype = genotype;
             sv_call.hmm_likelihood = likelihood;
-            sv_call.support = 1;
-            // addSVCall(sv_chunk, sv_call.start, sv_call.end, sv_type_str, ".", data_type, genotype, likelihood);
         }
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
@@ -368,14 +383,6 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
     return region_chunks;
 }
 
-// std::pair<double, std::unordered_map<uint32_t, int>> CNVCaller::loadChromosomeData(std::string chr)
-// {
-//     printMessage("Calculating mean chromosome coverage for " + chr + "...");
-//     // this->mean_chr_cov = calculateMeanChromosomeCoverage(chr);
-//     std::pair<double, std::unordered_map<uint32_t, int>> depth_data = calculateMeanChromosomeCoverage(chr);
-//     printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
-// }
-
 // Calculate the mean chromosome coverage
 std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len)
 {
@@ -509,7 +516,7 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
     return std::make_pair(mean_chr_cov, chr_pos_depth_map);
 }
 
-double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
+double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
 {
     // Use the position and depth map to calculate the log2 ratio
     double cum_depth = 0;
@@ -521,15 +528,6 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::
             cum_depth += pos_depth_map[i];
             pos_count++;
         }
-        // // Check if the position is in the map
-        // auto it = pos_depth_map.find(i);
-        // if (it == pos_depth_map.end())
-        // {
-        //     continue;
-        // }
-        // int depth = pos_depth_map[i];
-        // pos_count++;
-        // cum_depth += depth;
     }
 
     // Calculate the window coverage log2 ratio (0 if no positions)
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 51a2a332..3635fe15 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -22,6 +22,7 @@
 
 #include "utils.h"
 #include "sv_types.h"
+#include "version.h"
 /// @endcond
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
@@ -37,7 +38,7 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
     return ret;
 }
 
-void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth)
+void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector<uint32_t>& pos_depth_map)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -82,7 +83,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
 
             // Call SVs directly from the CIGAR string
             std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
-            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, breakpoint_depth);
+            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, pos_depth_map);
             // std::tuple<std::vector<int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
             const std::vector<int>& match_map = std::get<0>(query_info);
             uint32_t query_start = std::get<1>(query_info);
@@ -110,7 +111,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
             // std::tuple<std::vector<int>, int32_t, int32_t> query_info =
             // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
             std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
-            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, breakpoint_depth);
+            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, pos_depth_map);
             const std::vector<int>& match_map = std::get<0>(query_info);
             uint32_t query_start = std::get<1>(query_info);
             uint32_t query_end = std::get<2>(query_info);
@@ -156,7 +157,7 @@ double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map, int
     return mismatch_rate;
 }
 
-void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth)
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector<uint32_t>& pos_depth_map)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
@@ -241,12 +242,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 // Add to SV calls (1-based) with the appropriate SV type
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
+                int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
                 if (is_duplication) {
-                    addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh);
+                    addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
                 } else {
-                    addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh);
+                    addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
                 }
-                this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end);
             }
 
         // Check if the CIGAR operation is a deletion
@@ -257,8 +258,8 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
             {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh);
-                this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end);
+                int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
+                addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh, read_depth);
             }
 
         // Check if the CIGAR operation is a clipped base
@@ -399,21 +400,19 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
     int region_count = region_chunks.size();
     int current_region = 0;
-    // std::set<SVCall> combined_sv_calls;
-    std::unordered_map<uint32_t, uint32_t> breakpoint_depth;
     for (const auto& sub_region : region_chunks) {
         current_region++;
         printMessage(chr + ": CIGAR SVs...");
         PrimaryMap primary_map;
         SuppMap supp_map;
         std::vector<SVCall> subregion_sv_calls;
-        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, breakpoint_depth);
+        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_data.second);
         // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
         // PrimaryMap& primary_map = std::get<1>(region_data);
         // SuppMap& supp_map = std::get<2>(region_data);
         // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging CIGAR...");
-        mergeSVs(subregion_sv_calls, breakpoint_depth);
+        mergeSVs(subregion_sv_calls);
         int region_sv_count = getSVCount(subregion_sv_calls);
         // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
@@ -428,12 +427,12 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // Run split-read SV and copy number variant predictions
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
         printMessage(chr + ": Split read SVs...");
-        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second, breakpoint_depth);
+        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging split reads...");
-        mergeSVs(subregion_sv_calls, breakpoint_depth);
+        mergeSVs(subregion_sv_calls);
 
         // Combine the SV calls from the current region
         // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
@@ -445,10 +444,11 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
 
     // Run a final merge on the combined SV calls
     printMessage(chr + ": Merging final calls...");
-    mergeSVs(combined_sv_calls, breakpoint_depth);
+    mergeSVs(combined_sv_calls);
 
     // Insert breakpoint support and filter SVs with low support
-    filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5);
+    // filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5);
+    filterSVsWithLowSupport(combined_sv_calls, 10);
 
     // Clean up the BAM file, header, and index
     hts_idx_destroy(idx);
@@ -548,7 +548,7 @@ void SVCaller::run()
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector<uint32_t>& pos_depth_map, std::unordered_map<uint32_t, uint32_t>& breakpoint_depth)
+void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
     // Find split-read SV evidence
     int sv_count = 0;
@@ -589,20 +589,20 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                     if (supp_type == SVType::NEUTRAL) {
-                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh);
-                        this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1);
+                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh, read_depth);
                         
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
-                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh);
-                        this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1);
+                        int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
+                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh, read_depth);
                     }
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
-                    addSVCall(sv_calls, supp_start+1, (supp_end+1), "INV", ".", "REV", "./.", 0.0);
-                    this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
+                    addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "REV", "./.", 0.0, read_depth);
                 }
             }
         }
@@ -650,17 +650,17 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
-                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh);
-                    this->updateBreakpointDepth(breakpoint_depth, gap_left, gap_right);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
+                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
-                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
-                    this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
+                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
-                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh);
-                this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right);
+                int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
+                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth);
             }
         }
     }
@@ -697,15 +697,12 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
         "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
         "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
-        "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Alignment type used to call the structural variant\">",
-        "##INFO=<ID=BPSUP1,Number=1,Type=Integer,Description=\"Number of reads supporting either breakpoint\">",
-        "##INFO=<ID=BPSUP2,Number=1,Type=Integer,Description=\"Number of reads supporting the exact breakpoints\">",
-        "##INFO=<ID=REPTYPE,Number=1,Type=String,Description=\"Repeat type\">",
+        "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Feature used to identify the structural variant\">",
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
         "##FILTER=<ID=LowQual,Description=\"Low quality\">",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
-        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth\">"
+        "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at the variant site (sum of start and end positions)\">",
     };
 
     std::cout << "Writing VCF header..." << std::endl;
@@ -740,7 +737,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     //this->file_stream.flush();
 
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
-    std::string sv_method = "CONTEXTSVv0.1";
+    std::string sv_method = "CONTEXTSV" + std::string(VERSION);
     int skip_count = 0;
     int total_count = 0;
     for (const auto& pair : sv_calls) {
@@ -760,31 +757,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             if (sv_type_str == "DEL") {
             	sv_length++;
         	}
-            int bp_support = sv_call.support;
-            int total_bp_support = sv_call.total_support;
-            int read_depth = 0;
-            // SVType sv_type = sv_call.sv_type;
-            // SVCandidate candidate = sv_call.first;
-            // SVInfo info = sv_call.second;
-            // SVType sv_type = info.sv_type;
-            // int read_support = info.read_support;
-            // int read_depth = info.read_depth;
-            // int read_depth = 0;
-            // int read_support = 0;
-            // int sv_length = info.sv_length;
-            // std::set<std::string> data_type = info.data_type;
-            // std::string genotype = info.genotype;
-            // double hmm_likelihood = info.hmm_likelihood;
-
-            // Convert the data type set to a string
-            // std::string data_type_str = "";
-            // for (auto const& type : data_type) {
-            //     data_type_str += type + ",";
-            // }
-
-            // Get the CHROM, POS, END, and ALT
-            // uint32_t pos = std::get<0>(candidate);
-            // uint32_t end = std::get<1>(candidate);
+            int read_depth = sv_call.read_depth;
+            std::string ref_allele = ".";
 
             // If the SV type is unknown, skip it
             if (sv_type_str == "UNKNOWN" || sv_type_str == "NEUTRAL") {
@@ -794,11 +768,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 total_count += 1;
             }
 
-            // Process by SV type
-            std::string ref_allele = ".";
-            // std::string alt_allele = ".";
-            std::string repeat_type = "NA";
-
             // Deletion
             if (sv_type_str == "DEL") {
                 // Get the deleted sequence from the reference genome, also including the preceding base
@@ -822,8 +791,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             } else {
                 // Use the preceding base as the reference allele
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                // ref_allele = ref_genome.query(chr, preceding_pos,
-                // preceding_pos);
                 ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, preceding_pos);
 
                 // Format novel insertions
@@ -831,7 +798,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                     // Check if in symbolic form
                     if (alt_allele != "<INS>") {
                         // Use the insertion sequence as the alternate allele
-                        // alt_allele = std::get<2>(candidate);
                         alt_allele.insert(0, ref_allele);
                     }
                     start = preceding_pos;  // Update the position to the preceding base
@@ -843,15 +809,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             }
 
             // Create the VCF parameter strings
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-            //     ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \
-            //     ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \
-            //     ";REPTYPE=" + repeat_type + ";HMM=" +
-            //     std::to_string(hmm_likelihood);
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
-                ";BPSUP1=" + std::to_string(total_bp_support) + ";BPSUP2=" + std::to_string(bp_support) + \
-                ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood);
+                ";HMM=" + std::to_string(hmm_likelihood);
                 
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
@@ -897,14 +857,24 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
 
             // Trim the ailgnment with the higher mismatch rate
             if (primary_mismatch_rate > supp_mismatch_rate) {
-                // Trim the end of the primary alignment
-                uint32_t new_end = primary_alignment_end > overlap_length ? primary_alignment_end - overlap_length : 0;
-                std::get<2>(primary_alignment) = new_end;
+                // Trim the end of the primary alignment, ensuring that the new
+                // end is not less than the start
+                if (primary_alignment_end > overlap_length && (primary_alignment_end - overlap_length) > primary_alignment_start) {
+                    // Trim the end of the primary alignment
+                    uint32_t new_end = primary_alignment_end - overlap_length;
+                    std::get<2>(primary_alignment) = new_end;
+                }
                 // std::get<2>(primary_alignment) = primary_alignment_end - overlap_length;
             } else {
-                // Trim the beginning of the supplementary alignment
-                uint32_t new_start = supp_alignment_start + overlap_length;
-                std::get<1>(supp_alignment) = new_start;
+                // Trim the beginning of the supplementary alignment, ensuring
+                // that the new start is not greater than the end
+                if (supp_alignment_start + overlap_length < supp_alignment_end) {
+                    // Trim the beginning of the supplementary alignment
+                    uint32_t new_start = supp_alignment_start + overlap_length;
+                    std::get<1>(supp_alignment) = new_start;
+                }
+                // uint32_t new_start = supp_alignment_start + overlap_length;
+                // std::get<1>(supp_alignment) = new_start;
                 // std::get<1>(supp_alignment) = supp_alignment_start + overlap_length;
             }
         }
@@ -918,21 +888,47 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align
 
             // Trim the ailgnment with the higher mismatch rate
             if (supp_mismatch_rate > primary_mismatch_rate) {
-                // Trim the end of the supplementary alignment
-                uint32_t new_end = supp_alignment_end > overlap_length ? supp_alignment_end - overlap_length : 0;
+                // Trim the end of the supplementary alignment, ensuring that
+                // the new end is not less than the start
+                if (supp_alignment_end > overlap_length && (supp_alignment_end - overlap_length) > supp_alignment_start) {
+                    // Trim the end of the supplementary alignment
+                    uint32_t new_end = supp_alignment_end - overlap_length;
+                    std::get<2>(supp_alignment) = new_end;
+                }
+                // uint32_t new_end = supp_alignment_end > overlap_length ? supp_alignment_end - overlap_length : 0;
+                // std::get<2>(supp_alignment) = new_end;
                 // std::get<2>(supp_alignment) = supp_alignment_end - overlap_length;
             } else {
-                // Trim the beginning of the primary alignment
-                uint32_t new_start = primary_alignment_start + overlap_length;
-                std::get<1>(primary_alignment) = new_start;
+                // Trim the beginning of the primary alignment, ensuring that
+                // the new start is not greater than the end
+                if (primary_alignment_start + overlap_length < primary_alignment_end) {
+                    // Trim the beginning of the primary alignment
+                    uint32_t new_start = primary_alignment_start + overlap_length;
+                    std::get<1>(primary_alignment) = new_start;
+                }
+                // uint32_t new_start = primary_alignment_start + overlap_length;
+                // std::get<1>(primary_alignment) = new_start;
                 // std::get<1>(primary_alignment) = primary_alignment_start + overlap_length;
             }
         }
     }
 }
 
-void SVCaller::updateBreakpointDepth(std::unordered_map<uint32_t, uint32_t> &breakpoint_depth, uint32_t start, uint32_t end)
+int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
 {
-    breakpoint_depth[start] += 1;
-    breakpoint_depth[end] += 1;
+    int read_depth = 0;
+    try {
+        // printMessage("Read depth at start: " + std::to_string(pos_depth_map.at(start)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
+        read_depth += pos_depth_map.at(start);
+    } catch (const std::out_of_range& e) {
+        std::cerr << "Warning: Start position " << start << " not found in depth map." << std::endl;
+    }
+    try {
+        // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
+        read_depth += pos_depth_map.at(end);
+    } catch (const std::out_of_range& e) {
+        std::cerr << "Warning: End position " << end << " not found in depth map." << std::endl;
+    }
+    // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
+    return read_depth;
 }
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 9efa9ca2..479357a2 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -14,7 +14,7 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood)
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
 {
     // Catch underflow errors
     if (start > 4000000000 || end > 4000000000) {
@@ -31,14 +31,15 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     }
 
     // Insert the SV call in sorted order
-    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1, 0};
+    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
-    // sv_calls.insert(it, sv_call);
 
     // Update the SV type if the SV call already exists (if likelihood is
     // higher)
     if (it != sv_calls.end() && it->start == start && it->end == end)
     {
+        it->support += 1;  // Update the read support
+        // printMessage("Updating SV call with length " + std::to_string(end - start) + " and type " + sv_type + " and support " + std::to_string(it->support));
         if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
         {
             // Update the SV call
@@ -46,16 +47,10 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
             it->data_type = data_type;
             it->genotype = genotype;
             it->hmm_likelihood = hmm_likelihood;
-            it->support++;  // Update support
-        } else {
-            it->support++;  // Update support
         }
     } else {
         sv_calls.insert(it, sv_call);  // Insert the new SV call
     }
-
-    // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type);
-    // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1});
 }
 
 void updateSVType(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood)
@@ -80,12 +75,10 @@ uint32_t getSVCount(const std::vector<SVCall>& sv_calls)
 
 void concatenateSVCalls(std::vector<SVCall> &target, const std::vector<SVCall>& source)
 {
-    // Efficiently concatenate two sets of SV calls
-    // target.insert(source.begin(), source.end());
     target.insert(target.end(), source.begin(), source.end());
 }
 
-void mergeSVs(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32_t>& breakpoint_support)
+void mergeSVs(std::vector<SVCall>& sv_calls)
 {
     if (sv_calls.size() < 2) {
         return;
@@ -96,74 +89,100 @@ void mergeSVs(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32
     std::vector<SVCall> merged_sv_calls;
     auto it = sv_calls.begin();
     SVCall current_merge = *it++;
+    double log_lh_eps = 1.0;  // Log likelihood epsilon
     for (; it != sv_calls.end(); ++it) {
         SVCall& next = *it;
 
         // Find overlap
+        // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
+        // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
         if (next.start <= current_merge.end) {
-            // Merge the SV calls if it is a subset
-            if (next.end <= current_merge.end) {
-                continue;
-            }
 
-            // Merge the SV calls based on HMM log likelihood (keep the higher
-            // likelihood), 0.0 indicates no likelihood (Also update support)
-            if (next.hmm_likelihood != 0.0) {
-                if (next.hmm_likelihood > current_merge.hmm_likelihood) {
+            // Merge based on read support
+            if (next.support > current_merge.support) {
+                // Compare only if lengths are within 20% of each other
+                uint32_t current_length = current_merge.end - current_merge.start;
+                uint32_t next_length = next.end - next.start;
+                double length_diff = std::abs((int)current_length - (int)next_length);
+                double length_threshold = 0.2 * (int)current_length;
+                if (length_diff <= length_threshold) {
                     current_merge = next;  // Continue with the next call
+                    // printMessage("Keeping next SV call with support " + std::to_string(next.support));
+                } else {
+                    // Keep the larger SV
+                    if (next_length > current_length) {
+                        current_merge = next;
+                        // printMessage("Keeping next SV call with length " + std::to_string(next_length));
+                    }
                 }
+                // printMessage("Keeping next SV call with support " + std::to_string(next.support));
 
-            // Merge based on support
-            } else if (next.support > current_merge.support) {
-                current_merge = next;  // Continue with the next call
-
-            } else {
-                // Merge based on breakpoint depth
-                uint32_t next_depth = breakpoint_support[next.start] + breakpoint_support[next.end];
-                uint32_t current_depth = breakpoint_support[current_merge.start] + breakpoint_support[current_merge.end];
-                if (next_depth > current_depth) {
-                    current_merge = next;  // Continue with the next call
-                
-                // Merge based on SV length
-                } else if (next.end - next.start > current_merge.end - current_merge.start) {
+            } else if (next.support == current_merge.support) {
+                // Merge based on existence of predictions
+                if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) {
                     current_merge = next;  // Continue with the next call
+                    // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
+
+                // Merge based on prediction log likelihood
+                } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) {
+                    
+                    // Print all SV information
+                    // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
+                    // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
+                    // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood));
+
+                    // Keep the SV call with the higher likelihood. Compare only if
+                    // lengths are within 20% of each other
+                    uint32_t current_length = current_merge.end - current_merge.start;
+                    uint32_t next_length = next.end - next.start;
+                    double length_diff = std::abs((int)current_length - (int)next_length);
+                    double length_threshold = 0.2 * (int)current_length;
+                    if (length_diff <= length_threshold) {
+                        // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold));
+
+                        if (next.hmm_likelihood > current_merge.hmm_likelihood) {
+                            current_merge = next;  // Continue with the next call
+                            // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
+                        }
+                    
+                    } else {
+                        // Keep the larger SV
+                        if (next_length > current_length) {
+                            current_merge = next;
+                            // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length));
+                        }
+                    }
                 }
             }
 
         } else {
-            // No overlap: Save the previous SV and continue
+            // No overlap: Save the call and continue
             merged_sv_calls.emplace_back(current_merge);
             current_merge = next;
         }
     }
+    merged_sv_calls.emplace_back(current_merge);  // Save the last call
+    sv_calls = merged_sv_calls;  // Update the SV calls
 
-    // Add the last merged SV call
-    // printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood));
-    merged_sv_calls.emplace_back(current_merge);
-
-    // Replace contents of the SV calls
-    sv_calls = merged_sv_calls;
-    
     int updated_size = sv_calls.size();
     std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }
 
-void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, std::unordered_map<uint32_t, uint32_t>& breakpoint_support, int min_support)
+void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_depth)
 {
-    // Insert breakpoint support for each SV call, and remove SV calls with low
-    // support
     int prev_size = sv_calls.size();
-    for (auto& sv_call : sv_calls)
-    {
-        sv_call.total_support = breakpoint_support[sv_call.start] + breakpoint_support[sv_call.end];
-        printMessage("SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with support " + std::to_string(sv_call.total_support) + " and likelihood " + std::to_string(sv_call.hmm_likelihood) + " and length " + std::to_string(sv_call.end - sv_call.start));
+
+    // Print read depth for each SV call
+    for (const auto& sv_call : sv_calls) {
+        std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl;
     }
 
-    // Remove SV calls with low support, unless they are large (> 20 kb)
-    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {
-        return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000);
+    // Remove SV calls with low read depth
+    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) {
+        return sv_call.read_depth < min_depth;
+        // return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000);
     }), sv_calls.end());
 
     int updated_size = sv_calls.size();
-    printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with support >= " + std::to_string(min_support));
+    printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth));
 }

From ef4df0b1a9b4e6b73d9e0ada1fc8f79464efdcd0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 2 Dec 2024 02:09:11 -0500
Subject: [PATCH 040/134] remove filter

---
 src/sv_caller.cpp | 3 +--
 src/sv_object.cpp | 1 -
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3635fe15..b55e3834 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -447,8 +447,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     mergeSVs(combined_sv_calls);
 
     // Insert breakpoint support and filter SVs with low support
-    // filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5);
-    filterSVsWithLowSupport(combined_sv_calls, 10);
+    // filterSVsWithLowSupport(combined_sv_calls, 10);
 
     // Clean up the BAM file, header, and index
     hts_idx_destroy(idx);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 479357a2..8529b181 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -180,7 +180,6 @@ void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_depth)
     // Remove SV calls with low read depth
     sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) {
         return sv_call.read_depth < min_depth;
-        // return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000);
     }), sv_calls.end());
 
     int updated_size = sv_calls.size();

From c79c1da4ee2822284a788e63eca4518ddf2fe976 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 2 Dec 2024 21:21:27 -0500
Subject: [PATCH 041/134] Fix breakpoint error and improve filtering

---
 src/sv_caller.cpp | 123 ++++++++++++++++-------
 src/sv_object.cpp | 244 ++++++++++++++++++++++++++++++++--------------
 2 files changed, 255 insertions(+), 112 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b55e3834..b8f934f7 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -74,6 +74,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
             std::string chr = bamHdr->target_name[bam1->core.tid];
             uint32_t start = (uint32_t)bam1->core.pos;
             uint32_t end = (uint32_t)bam_endpos(bam1);  // This is the first position after the alignment
+            end--;  // Adjust to the last position of the alignment
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
             // Check for underflow
@@ -104,7 +105,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
             // Get the supplementary alignment information
             std::string chr = bamHdr->target_name[bam1->core.tid];
             uint32_t start = bam1->core.pos;
-            uint32_t end = bam_endpos(bam1);
+            uint32_t end = bam_endpos(bam1);  // This is the first position after the alignment
+            end--;  // Adjust to the last position of the alignment
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
             // Get CIGAR string information, but don't call SVs
@@ -400,6 +402,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
     int region_count = region_chunks.size();
     int current_region = 0;
+    int filter_threshold = 4;
     for (const auto& sub_region : region_chunks) {
         current_region++;
         printMessage(chr + ": CIGAR SVs...");
@@ -413,6 +416,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging CIGAR...");
         mergeSVs(subregion_sv_calls);
+        filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
         int region_sv_count = getSVCount(subregion_sv_calls);
         // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
@@ -433,6 +437,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging split reads...");
         mergeSVs(subregion_sv_calls);
+        filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
 
         // Combine the SV calls from the current region
         // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
@@ -445,6 +450,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     // Run a final merge on the combined SV calls
     printMessage(chr + ": Merging final calls...");
     mergeSVs(combined_sv_calls);
+    filterSVsWithLowSupport(combined_sv_calls, filter_threshold);
 
     // Insert breakpoint support and filter SVs with low support
     // filterSVsWithLowSupport(combined_sv_calls, 10);
@@ -486,37 +492,53 @@ void SVCaller::run()
 
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
-        // printMessage("Launching thread for chromosome " + chr + "...");
-        std::vector<SVCall> sv_calls;
-        this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
-        {
-            std::lock_guard<std::mutex> lock(sv_mutex);
-            whole_genome_sv_calls[chr] = std::move(sv_calls);
-        }
-        printMessage("Completed chromosome " + chr);
-
-        // Notify thread completion
-        {
-            std::lock_guard<std::mutex> lock(sv_mutex);
-            active_threads--;
-            printMessage("Active threads: " + std::to_string(active_threads));
+        try {
+            // printMessage("Launching thread for chromosome " + chr + "...");
+            std::vector<SVCall> sv_calls;
+            this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
+            {
+                std::lock_guard<std::mutex> lock(sv_mutex);
+                whole_genome_sv_calls[chr] = std::move(sv_calls);
+            }
+            printMessage("Completed chromosome " + chr);
+
+            // // Notify thread completion
+            // {
+            //     std::lock_guard<std::mutex> lock(sv_mutex);
+            //     active_threads--;
+            //     printMessage("Active threads: " + std::to_string(active_threads));
+            // }
+            // cv.notify_one();
+        } catch (const std::exception& e) {
+            printError("Error processing chromosome " + chr + ": " + e.what());
         }
-        cv.notify_one();
     };
 
     // Thread management
     std::vector<std::thread> threads;
     for (const auto& chr : chromosomes) {
-        {
-            std::unique_lock<std::mutex> lock(sv_mutex);
-            printMessage("Waiting for thread slot. Active threads: " + std::to_string(active_threads));
-            cv.wait(lock, [&] { return active_threads < max_threads; });
-            active_threads++;
-            printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads));
-        }
+        // Wait for a thread slot
+        std::unique_lock<std::mutex> lock(sv_mutex);
+        cv.wait(lock, [&] { return threads.size() < max_threads; });
 
         // Launch a new thread
-        threads.emplace_back(process_chr, chr);
+        threads.emplace_back([&, chr] {
+            process_chr(chr);
+
+            // Notify thread completion
+            std::lock_guard<std::mutex> lock(sv_mutex);
+            cv.notify_one();
+        });
+        // {
+        //     std::unique_lock<std::mutex> lock(sv_mutex);
+        //     printMessage("Waiting for thread slot. Active threads: " + std::to_string(active_threads));
+        //     cv.wait(lock, [&] { return active_threads < max_threads; });
+        //     active_threads++;
+        //     printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads));
+        // }
+
+        // // Launch a new thread
+        // threads.emplace_back(process_chr, chr);
     }
 
     // Wait for all threads to complete
@@ -551,7 +573,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 {
     // Find split-read SV evidence
     int sv_count = 0;
-    int min_cnv_length = this->input_data.getMinCNVLength();
+    uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
     for (const auto& entry : primary_map) {
         std::string qname = entry.first;
         AlignmentData primary_alignment = entry.second;
@@ -584,6 +606,14 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
             bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it);
             if (is_opposite_strand) {
                 if (supp_length >= min_cnv_length) {
+
+                    // Print error if the start position is greater than the end
+                    // position
+                    if (supp_start+1 > supp_end+1) {
+                        printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1));
+                        continue;
+                    }
+
                     // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1));
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
                     double supp_lh = std::get<0>(result);
@@ -621,20 +651,30 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_before_supp) {
             boundary_left = primary_start+1;
-            boundary_right = supp_end+1;
+            // boundary_right = supp_end+1;
+            boundary_right = std::max(primary_end, supp_end)+1;
             gap_left = primary_end+1;
             gap_right = supp_start+1;
-            gap_exists = primary_end < supp_start;
+            gap_exists = gap_left < gap_right;
         } else {
             boundary_left = supp_start+1;
-            boundary_right = primary_end+1;
+            // boundary_right = primary_end+1;
+            boundary_right = std::max(primary_end, supp_end)+1;
             gap_left = supp_end+1;
             gap_right = primary_start+1;
-            gap_exists = supp_end < primary_start;
+            gap_exists = gap_left < gap_right;
         }
         
         // Run copy number variant predictions on the boundary if large enough
         if (boundary_right - boundary_left >= min_cnv_length) {
+
+            // Print error if the start position is greater than the end
+            // position
+            if (boundary_left > boundary_right) {
+                printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
+                continue;
+            }
+
             // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             double bd_lh = std::get<0>(bd_result);
@@ -642,6 +682,14 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
             // Run copy number variant predictions on the gap if it exists
             if (gap_exists && gap_right - gap_left >= min_cnv_length) {
+
+                // Print error if the start position is greater than the end
+                // position
+                if (gap_left > gap_right) {
+                    printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
+                    continue;
+                }
+
                 // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
                 double gap_lh = std::get<0>(gap_result);
@@ -690,7 +738,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::cout << "Getting reference genome header..." << std::endl;
     const std::string contig_header = this->input_data.getRefGenome().getContigHeader();
     std::vector<std::string> header_lines = {
-        std::string("##reference=") + 
+        std::string("##reference=") + this->input_data.getRefGenome().getFilepath(),
         contig_header,
         "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
@@ -698,6 +746,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
         "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Feature used to identify the structural variant\">",
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
+        "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
         "##FILTER=<ID=LowQual,Description=\"Low quality\">",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
@@ -720,7 +769,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     vcf_stream << "##fileDate=" << buffer << std::endl;
 
     // Add source
-    std::string source = "##source=ContexSV";
+    std::string sv_method = "ContextSV" + std::string(VERSION);
+    std::string source = "##source=" + sv_method;
     vcf_stream << source << std::endl;
 
     // Loop over the header metadata lines
@@ -731,12 +781,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     // Add the header line
     std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE";
     vcf_stream << header_line << std::endl;
-
-    // Flush the stream to ensure that the header is written
-    //this->file_stream.flush();
-
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
-    std::string sv_method = "CONTEXTSV" + std::string(VERSION);
     int skip_count = 0;
     int total_count = 0;
     for (const auto& pair : sv_calls) {
@@ -758,6 +803,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         	}
             int read_depth = sv_call.read_depth;
             std::string ref_allele = ".";
+            int support = sv_call.support;
 
             // If the SV type is unknown, skip it
             if (sv_type_str == "UNKNOWN" || sv_type_str == "NEUTRAL") {
@@ -808,9 +854,12 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             }
 
             // Create the VCF parameter strings
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
+            //     ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
+            //     ";HMM=" + std::to_string(hmm_likelihood);
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
-                ";HMM=" + std::to_string(hmm_likelihood);
+                ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support);
                 
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 8529b181..d9b8e457 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -86,102 +86,196 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
 
     // Merge SV calls if they overlap
     int initial_size = sv_calls.size();
+    
+    // Merge any SV calls that have >90% reciprocal overlap
     std::vector<SVCall> merged_sv_calls;
-    auto it = sv_calls.begin();
-    SVCall current_merge = *it++;
-    double log_lh_eps = 1.0;  // Log likelihood epsilon
-    for (; it != sv_calls.end(); ++it) {
-        SVCall& next = *it;
-
-        // Find overlap
-        // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
-        // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
+    SVCall current_merge = sv_calls[0];
+    for (size_t i = 1; i < sv_calls.size(); i++) {
+        SVCall& next = sv_calls[i];
+        // Check for overlap
         if (next.start <= current_merge.end) {
+            // printMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")");
+            // if (current_merge.start <= next.end && next.start <= current_merge.end) {
+            // Calculate reciprocal overlap
+            uint32_t overlap = std::max(0, (int)std::min(current_merge.end, next.end) - (int)std::max(current_merge.start, next.start));
+            uint32_t union_length = std::max(current_merge.end, next.end) - std::min(current_merge.start, next.start);
+            double overlap_fraction = static_cast<double>(overlap) / union_length;
+            // printMessage("Overlap fraction: " + std::to_string(overlap_fraction));
 
-            // Merge based on read support
-            if (next.support > current_merge.support) {
-                // Compare only if lengths are within 20% of each other
-                uint32_t current_length = current_merge.end - current_merge.start;
-                uint32_t next_length = next.end - next.start;
-                double length_diff = std::abs((int)current_length - (int)next_length);
-                double length_threshold = 0.2 * (int)current_length;
-                if (length_diff <= length_threshold) {
-                    current_merge = next;  // Continue with the next call
-                    // printMessage("Keeping next SV call with support " + std::to_string(next.support));
-                } else {
-                    // Keep the larger SV
-                    if (next_length > current_length) {
+            // Merge if reciprocal overlap is >90%
+            if (overlap_fraction > 0.90) {
+                // printMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction));
+                // Keep the SV call with the higher read support
+                if (next.support > current_merge.support) {
+                    current_merge = next;
+                } else if (next.support == current_merge.support) {
+                    // Keep the SV call with the higher likelihood
+                    if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) {
                         current_merge = next;
-                        // printMessage("Keeping next SV call with length " + std::to_string(next_length));
-                    }
-                }
-                // printMessage("Keeping next SV call with support " + std::to_string(next.support));
-
-            } else if (next.support == current_merge.support) {
-                // Merge based on existence of predictions
-                if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) {
-                    current_merge = next;  // Continue with the next call
-                    // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
-
-                // Merge based on prediction log likelihood
-                } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) {
-                    
-                    // Print all SV information
-                    // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
-                    // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
-                    // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood));
-
-                    // Keep the SV call with the higher likelihood. Compare only if
-                    // lengths are within 20% of each other
-                    uint32_t current_length = current_merge.end - current_merge.start;
-                    uint32_t next_length = next.end - next.start;
-                    double length_diff = std::abs((int)current_length - (int)next_length);
-                    double length_threshold = 0.2 * (int)current_length;
-                    if (length_diff <= length_threshold) {
-                        // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold));
-
-                        if (next.hmm_likelihood > current_merge.hmm_likelihood) {
-                            current_merge = next;  // Continue with the next call
-                            // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
-                        }
-                    
-                    } else {
-                        // Keep the larger SV
-                        if (next_length > current_length) {
+                    } else if (next.hmm_likelihood == current_merge.hmm_likelihood) {
+                        // Keep the SV call with the higher read depth
+                        if (next.read_depth > current_merge.read_depth) {
                             current_merge = next;
-                            // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length));
                         }
                     }
+                    // // Keep the SV call with the higher read depth
+                    // if (next.read_depth > current_merge.read_depth) {
+                    //     current_merge = next;
+                    // } else if (next.read_depth == current_merge.read_depth) {
+                    //     // Keep the SV call with the higher likelihood
+                    //     if (next.hmm_likelihood > current_merge.hmm_likelihood) {
+                    //         current_merge = next;
+                    //     }
+                    // }
                 }
+            } else {
+                merged_sv_calls.push_back(current_merge);
+                current_merge = next;
             }
-
         } else {
-            // No overlap: Save the call and continue
-            merged_sv_calls.emplace_back(current_merge);
+            merged_sv_calls.push_back(current_merge);
             current_merge = next;
         }
     }
-    merged_sv_calls.emplace_back(current_merge);  // Save the last call
-    sv_calls = merged_sv_calls;  // Update the SV calls
+
+    // Add the last SV call
+    merged_sv_calls.push_back(current_merge);
+
+    // Update the SV calls
+    sv_calls = merged_sv_calls;
+    // for (size_t i = 0; i < sv_calls.size(); i++) {
+    //     SVCall& current = sv_calls[i];
+    //     bool merged = false;
+    //     for (size_t j = i + 1; j < sv_calls.size(); j++) {
+    //         SVCall& next = sv_calls[j];
+    //         if (current.start <= next.end && next.start <= current.end) {
+    //             // Calculate reciprocal overlap
+    //             uint32_t overlap = std::max(0, (int)std::min(current.end, next.end) - (int)std::max(current.start, next.start));
+    //             uint32_t union_length = std::max(current.end, next.end) - std::min(current.start, next.start);
+    //             double overlap_fraction = static_cast<double>(overlap) / union_length;
+
+    //             // Merge if reciprocal overlap is >90%
+    //             if (overlap_fraction > 0.9) {
+    //                 // Keep the SV call with the higher likelihood
+    //                 if (next.hmm_likelihood > current.hmm_likelihood) {
+    //                     current = next;
+    //                 }
+    //                 merged = true;
+    //             }
+
+    //             // Remove the merged SV call
+    //             sv_calls.erase(sv_calls.begin() + j);
+    //             j--;
+
+    //     }
+    //     if (!merged) {
+    //         merged_sv_calls.push_back(current);
+    //     }
+    // }
+    
+
+
+    // std::vector<SVCall> merged_sv_calls;
+    // auto it = sv_calls.begin();
+    // SVCall current_merge = *it++;
+    // double log_lh_eps = 1.0;  // Log likelihood epsilon
+    // for (; it != sv_calls.end(); ++it) {
+    //     SVCall& next = *it;
+
+    //     // Find overlap
+    //     // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
+    //     // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
+    //     if (next.start <= current_merge.end) {
+
+    //         // Merge based on read support
+    //         if (next.support > current_merge.support) {
+    //             // Compare only if lengths are within 20% of each other
+    //             uint32_t current_length = current_merge.end - current_merge.start;
+    //             uint32_t next_length = next.end - next.start;
+    //             double length_diff = std::abs((int)current_length - (int)next_length);
+    //             double length_threshold = 0.2 * (int)current_length;
+    //             if (length_diff <= length_threshold) {
+    //                 current_merge = next;  // Continue with the next call
+    //                 // printMessage("Keeping next SV call with support " + std::to_string(next.support));
+    //             } else {
+    //                 // Keep the larger SV
+    //                 if (next_length > current_length) {
+    //                     current_merge = next;
+    //                     // printMessage("Keeping next SV call with length " + std::to_string(next_length));
+    //                 }
+    //             }
+    //             // printMessage("Keeping next SV call with support " + std::to_string(next.support));
+
+    //         } else if (next.support == current_merge.support) {
+    //             // Merge based on existence of predictions
+    //             if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) {
+    //                 current_merge = next;  // Continue with the next call
+    //                 // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
+
+    //             // Merge based on prediction log likelihood
+    //             } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) {
+                    
+    //                 // Print all SV information
+    //                 // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
+    //                 // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
+    //                 // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood));
+
+    //                 // Keep the SV call with the higher likelihood. Compare only if
+    //                 // lengths are within 20% of each other
+    //                 uint32_t current_length = current_merge.end - current_merge.start;
+    //                 uint32_t next_length = next.end - next.start;
+    //                 double length_diff = std::abs((int)current_length - (int)next_length);
+    //                 double length_threshold = 0.2 * (int)current_length;
+    //                 if (length_diff <= length_threshold) {
+    //                     // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold));
+
+    //                     if (next.hmm_likelihood > current_merge.hmm_likelihood) {
+    //                         current_merge = next;  // Continue with the next call
+    //                         // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
+    //                     }
+                    
+    //                 } else {
+    //                     // Keep the larger SV
+    //                     if (next_length > current_length) {
+    //                         current_merge = next;
+    //                         // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length));
+    //                     }
+    //                 }
+    //             }
+    //         }
+
+    //     } else {
+    //         // No overlap: Save the call and continue
+    //         merged_sv_calls.emplace_back(current_merge);
+    //         current_merge = next;
+    //     }
+    // }
+    // merged_sv_calls.emplace_back(current_merge);  // Save the last call
+    // sv_calls = merged_sv_calls;  // Update the SV calls
 
     int updated_size = sv_calls.size();
     std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }
 
-void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_depth)
+void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
 {
     int prev_size = sv_calls.size();
 
-    // Print read depth for each SV call
-    for (const auto& sv_call : sv_calls) {
-        std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl;
-    }
-
-    // Remove SV calls with low read depth
-    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) {
-        return sv_call.read_depth < min_depth;
+    // Filter SV calls with low read support
+    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {
+        return sv_call.support < min_support;
     }), sv_calls.end());
 
-    int updated_size = sv_calls.size();
-    printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth));
+    // // Print read depth for each SV call
+    // for (const auto& sv_call : sv_calls) {
+    //     std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl;
+    // }
+
+    // // Remove SV calls with low read depth
+    // sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) {
+    //     return sv_call.read_depth < min_depth;
+    // }), sv_calls.end());
+
+    // int updated_size = sv_calls.size();
+    // printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth));
 }

From 3afeaac11841b31579d587992e0fa37c520c8e7a Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 2 Dec 2024 21:59:23 -0500
Subject: [PATCH 042/134] Update filtering

---
 src/sv_caller.cpp | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b8f934f7..d4182cbb 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -415,8 +415,8 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // SuppMap& supp_map = std::get<2>(region_data);
         // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging CIGAR...");
-        mergeSVs(subregion_sv_calls);
         filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
+        mergeSVs(subregion_sv_calls);
         int region_sv_count = getSVCount(subregion_sv_calls);
         // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
@@ -436,8 +436,8 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging split reads...");
-        mergeSVs(subregion_sv_calls);
         filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
+        mergeSVs(subregion_sv_calls);
 
         // Combine the SV calls from the current region
         // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
@@ -450,10 +450,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_
     // Run a final merge on the combined SV calls
     printMessage(chr + ": Merging final calls...");
     mergeSVs(combined_sv_calls);
-    filterSVsWithLowSupport(combined_sv_calls, filter_threshold);
-
-    // Insert breakpoint support and filter SVs with low support
-    // filterSVsWithLowSupport(combined_sv_calls, 10);
+    // filterSVsWithLowSupport(combined_sv_calls, filter_threshold);
 
     // Clean up the BAM file, header, and index
     hts_idx_destroy(idx);

From 6f6ecc58aee989fe0abbc05240af09fd5c61c9c0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 4 Dec 2024 14:07:30 -0500
Subject: [PATCH 043/134] Improve multithreading

---
 .gitignore        |  5 ++++
 src/sv_caller.cpp | 68 +++++++++++++++++++++++------------------------
 2 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/.gitignore b/.gitignore
index f2253893..343adf0f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -91,3 +91,8 @@ python/dist_plots
 # Temporary files
 lib/.nfs*
 valgrind.log
+
+# Log files
+*.log
+*.err
+*.out
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index d4182cbb..f23fe002 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -481,16 +481,12 @@ void SVCaller::run()
     // Set up threads for processing each chromosome
     const int max_threads = this->input_data.getThreadCount();
     std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
-    std::vector<std::future<void>> futures;
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;
-    std::condition_variable cv;
-    int active_threads = 0;
 
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
         try {
-            // printMessage("Launching thread for chromosome " + chr + "...");
             std::vector<SVCall> sv_calls;
             this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
             {
@@ -498,52 +494,58 @@ void SVCaller::run()
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
             }
             printMessage("Completed chromosome " + chr);
-
-            // // Notify thread completion
-            // {
-            //     std::lock_guard<std::mutex> lock(sv_mutex);
-            //     active_threads--;
-            //     printMessage("Active threads: " + std::to_string(active_threads));
-            // }
-            // cv.notify_one();
         } catch (const std::exception& e) {
             printError("Error processing chromosome " + chr + ": " + e.what());
         }
     };
 
     // Thread management
-    std::vector<std::thread> threads;
+    // std::vector<std::thread> threads;
+    std::vector<std::future<void>> futures;
+    std::atomic<int> active_threads(0);
+    std::mutex cv_mutex;
+    std::condition_variable cv;
     for (const auto& chr : chromosomes) {
         // Wait for a thread slot
-        std::unique_lock<std::mutex> lock(sv_mutex);
-        cv.wait(lock, [&] { return threads.size() < max_threads; });
+        {
+            std::unique_lock<std::mutex> lock(cv_mutex);
+            cv.wait(lock, [&] { return active_threads.load() < max_threads; });
+            active_threads.fetch_add(1);
+        }
 
-        // Launch a new thread
-        threads.emplace_back([&, chr] {
+        // Launch a task
+        futures.push_back(std::async(std::launch::async, [&, chr] {
             process_chr(chr);
+            {
+                std::lock_guard<std::mutex> lock(cv_mutex);
+                active_threads.fetch_sub(1);
 
-            // Notify thread completion
-            std::lock_guard<std::mutex> lock(sv_mutex);
-            cv.notify_one();
-        });
-        // {
-        //     std::unique_lock<std::mutex> lock(sv_mutex);
-        //     printMessage("Waiting for thread slot. Active threads: " + std::to_string(active_threads));
-        //     cv.wait(lock, [&] { return active_threads < max_threads; });
-        //     active_threads++;
-        //     printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads));
+                // Notify threads waiting for a slot
+                cv.notify_all();
+            }
+        }));
+        // while (active_threads.load() >= max_threads) {
+        //     std::this_thread::yield();
         // }
 
         // // Launch a new thread
-        // threads.emplace_back(process_chr, chr);
+        // threads.emplace_back([&, chr] {
+        //     active_threads.fetch_add(1);
+        //     process_chr(chr);
+        //     active_threads.fetch_sub(1);
+        // });
     }
 
     // Wait for all threads to complete
-    for (auto& thread : threads) {
-        if (thread.joinable()) {
-            thread.join();
-        }
+    printMessage("Waiting for all threads to finish...");
+    for (auto& future : futures) {
+        future.get();
     }
+    // for (auto& thread : threads) {
+    //     if (thread.joinable()) {
+    //         thread.join();
+    //     }
+    // }
 
     printMessage("All threads have finished.");
 
@@ -560,8 +562,6 @@ void SVCaller::run()
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
     this->saveToVCF(whole_genome_sv_calls);
-
-    // return whole_genome_sv_calls;
 }
 
 

From 1486176b14738e1d8f78727b89fb0f9957be3ee5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 4 Dec 2024 19:45:59 -0500
Subject: [PATCH 044/134] Fix warnings

---
 Makefile-cpp        |  2 +-
 include/sv_caller.h |  2 +-
 src/sv_caller.cpp   | 31 ++++++++++++++++---------------
 3 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/Makefile-cpp b/Makefile-cpp
index 58139e9c..55630e9b 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -18,7 +18,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 
 # Compiler and Flags
 CXX := g++
-CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR)
+CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 LDLIBS := -lhts  # Link with libhts.a or libhts.so
 
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 82a833f3..c0f9ce23 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -36,7 +36,7 @@ class SVCaller {
         // mismatch rate, and the start and end positions of the query sequence
         void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector<uint32_t>& pos_depth_map);
 
-        void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, int min_cnv_length);
+        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index f23fe002..4d1cf0ef 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -261,7 +261,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh, read_depth);
+                addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
             }
 
         // Check if the CIGAR operation is a clipped base
@@ -340,9 +340,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     query_info = std::tuple<std::vector<int>, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end);
 }
 
-void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, int min_cnv_length)
+void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls)
 {
     // Open the BAM file
+    std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (!fp_in) {
         throw std::runtime_error("ERROR: failed to open " + bam_filepath);
@@ -469,9 +470,9 @@ void SVCaller::run()
     }
 
     // Ignore all alternate contigs (contains 'alt', 'GL', 'NC', 'hs', etc.)
-    chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) {
-        return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos;
-    }), chromosomes.end());
+    // chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) {
+    //     return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos;
+    // }), chromosomes.end());
         
     // Read the HMM from the file
     std::string hmm_filepath = this->input_data.getHMMFilepath();
@@ -488,7 +489,7 @@ void SVCaller::run()
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
-            this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength());
+            this->processChromosome(chr, hmm, sv_calls);
             {
                 std::lock_guard<std::mutex> lock(sv_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
@@ -617,18 +618,18 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                     SVType supp_type = std::get<1>(result);
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                     if (supp_type == SVType::NEUTRAL) {
-                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh, read_depth);
+                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
                         
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
                         int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
-                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh, read_depth);
+                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
                     }
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
-                    addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "REV", "./.", 0.0, read_depth);
+                    addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
                 }
             }
         }
@@ -695,16 +696,19 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
-                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh, read_depth);
+                    std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
+                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
-                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth);
+                    std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
+                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
-                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth);
+                std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
+                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
             }
         }
     }
@@ -851,9 +855,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             }
 
             // Create the VCF parameter strings
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-            //     ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
-            //     ";HMM=" + std::to_string(hmm_likelihood);
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
                 ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support);

From 21154921b9115f766fa88f93b4978bab5aa3433e Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 5 Dec 2024 11:10:31 -0500
Subject: [PATCH 045/134] add threadpool

---
 include/ThreadPool.h |  98 ++++++++++++++++++++++++++++++++++++++++
 src/cnv_caller.cpp   | 104 +++++++++++++++++++++++++++++++++----------
 src/khmm.cpp         |  62 +++++++++++++++++++-------
 src/sv_caller.cpp    |  85 ++++++++++++++++-------------------
 4 files changed, 263 insertions(+), 86 deletions(-)
 create mode 100644 include/ThreadPool.h

diff --git a/include/ThreadPool.h b/include/ThreadPool.h
new file mode 100644
index 00000000..41832030
--- /dev/null
+++ b/include/ThreadPool.h
@@ -0,0 +1,98 @@
+#ifndef THREAD_POOL_H
+#define THREAD_POOL_H
+
+#include <vector>
+#include <queue>
+#include <memory>
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <future>
+#include <functional>
+#include <stdexcept>
+
+class ThreadPool {  // fixed set of worker threads that run queued tasks
+public:
+    ThreadPool(size_t);  // argument is the number of worker threads to start
+    template<class F, class... Args>
+    auto enqueue(F&& f, Args&&... args) 
+        -> std::future<typename std::result_of<F(Args...)>::type>;  // future yields the result of f(args...)
+    ~ThreadPool();  // signals stop and joins every worker
+private:
+    // worker threads; kept so the destructor can join them
+    std::vector< std::thread > workers;
+    // pending tasks in FIFO order, each wrapped as a void() callable
+    std::queue< std::function<void()> > tasks;
+    
+    // synchronization: queue_mutex guards tasks and stop; condition wakes waiting workers
+    std::mutex queue_mutex;
+    std::condition_variable condition;
+    bool stop;  // once true, enqueue() throws and workers exit when the queue is empty
+};
+ 
+// Constructor: starts `threads` workers, each looping "wait for a task, pop it, run it" until the pool is stopped
+inline ThreadPool::ThreadPool(size_t threads)
+    :   stop(false)
+{
+    for(size_t i = 0;i<threads;++i)
+        workers.emplace_back(
+            [this]
+            {
+                for(;;)
+                {
+                    std::function<void()> task;
+
+                    {  // scope of the queue lock: only the wait and the dequeue happen under it
+                        std::unique_lock<std::mutex> lock(this->queue_mutex);
+                        this->condition.wait(lock,  // sleep until stop is requested or a task is available
+                            [this]{ return this->stop || !this->tasks.empty(); });
+                        if(this->stop && this->tasks.empty())  // exit only once the queue has been drained
+                            return;
+                        task = std::move(this->tasks.front());
+                        this->tasks.pop();
+                    }
+
+                    task();  // runs after the lock is released, so other workers can dequeue meanwhile
+                }
+            }
+        );
+}
+
+// Add a work item: binds f to args, queues it, and returns a future for its result; throws std::runtime_error if the pool has been stopped
+template<class F, class... Args>
+auto ThreadPool::enqueue(F&& f, Args&&... args) 
+    -> std::future<typename std::result_of<F(Args...)>::type>
+{
+    using return_type = typename std::result_of<F(Args...)>::type;
+
+    auto task = std::make_shared< std::packaged_task<return_type()> >(  // held by shared_ptr; captured by value in the queued lambda below
+            std::bind(std::forward<F>(f), std::forward<Args>(args)...)
+        );
+        
+    std::future<return_type> res = task->get_future();  // obtained before the task is queued
+    {
+        std::unique_lock<std::mutex> lock(queue_mutex);
+
+        // reject new work once the pool has been stopped (stop is set by the destructor)
+        if(stop)
+            throw std::runtime_error("enqueue on stopped ThreadPool");
+
+        tasks.emplace([task](){ (*task)(); });
+    }
+    condition.notify_one();  // wake one waiting worker; called after the lock is released
+    return res;
+}
+
+// Destructor: sets stop under the lock, wakes every worker, then joins them; workers finish queued tasks before exiting
+inline ThreadPool::~ThreadPool()
+{
+    {
+        std::unique_lock<std::mutex> lock(queue_mutex);
+        stop = true;
+    }
+    condition.notify_all();  // every worker must re-check its wait predicate and observe stop
+    for(std::thread &worker: workers)
+        worker.join();  // blocks until that worker's loop returns
+}
+
+#endif
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 9002e2b6..2b3a48b4 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -46,7 +46,10 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
     int data_count = (int) snp_data.pos.size();
     if (data_count == 0)
     {
-        throw std::runtime_error("Error: No SNP data found for Viterbi algorithm.");
+        // throw std::runtime_error("Error: No SNP data found for Viterbi
+        // algorithm.");
+        printError("ERROR: No SNP data found for Viterbi algorithm.");
+        prediction = std::make_pair(std::vector<int>(), 0.0);
     }
     prediction = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb);
 }
@@ -140,7 +143,9 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // Check that the start position is less than the end position
     if (start_pos >= end_pos)
     {
-        throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        // throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
     }
 
     // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
@@ -167,6 +172,11 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")...");
     std::pair<std::vector<int>, double> prediction;
     runViterbi(hmm, sv_snps, prediction);
+    if (prediction.first.size() == 0)
+    {
+        return std::make_tuple(0.0, SVType::UNKNOWN, "./.", sv_snps_found);
+    }
+
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
 
@@ -225,9 +235,6 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
 void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall> &sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
-    int min_length = this->input_data.getMinCNVLength();
-    int window_size = this->input_data.getWindowSize();
-
     // Map with counts for each CNV type
     std::map<int, int> cnv_type_counts;
     for (int i = 0; i < 6; i++)
@@ -236,6 +243,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
     }
     
     // Loop through each SV candidate and predict the copy number state
+    int min_length = this->input_data.getMinCNVLength();
     for (auto& sv_call : sv_candidates)
     {
 
@@ -386,7 +394,6 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 // Calculate the mean chromosome coverage
 std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len)
 {
-    // std::unordered_map<uint32_t, int> chr_pos_depth_map;
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0); // 1-based index
     {
         // Lock the bam file
@@ -397,7 +404,10 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
         if (!bam_file)
         {
-            throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath);
+            // throw std::runtime_error("ERROR: Could not open BAM file: " +
+            // bam_filepath);
+            printError("ERROR: Could not open BAM file: " + bam_filepath);
+            return std::make_pair(0.0, chr_pos_depth_map);
         }
 
         // Enable multi-threading
@@ -408,7 +418,9 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         if (!bam_header)
         {
             sam_close(bam_file);
-            throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath);
+            printError("ERROR: Could not read header from BAM file: " + bam_filepath);
+            return std::make_pair(0.0, chr_pos_depth_map);
+            // throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath);
         }
 
         // Load the index
@@ -417,7 +429,10 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         {
             bam_hdr_destroy(bam_header);
             sam_close(bam_file);
-            throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath);
+            // throw std::runtime_error("ERROR: Could not load index for BAM
+            // file: " + bam_filepath);
+            printError("ERROR: Could not load index for BAM file: " + bam_filepath);
+            return std::make_pair(0.0, chr_pos_depth_map);  
         }
 
         // Create an iterator for the chromosome
@@ -427,7 +442,11 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
             hts_idx_destroy(bam_index);
             bam_hdr_destroy(bam_header);
             sam_close(bam_file);
-            throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file.");
+            // throw std::runtime_error("ERROR: Could not create iterator for
+            // chromosome: " + chr + ", check if the chromosome exists in the
+            // BAM file.");
+            printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file.");
+            return std::make_pair(0.0, chr_pos_depth_map);
         }
 
         // Initialize the record
@@ -438,7 +457,10 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
             hts_idx_destroy(bam_index);
             bam_hdr_destroy(bam_header);
             sam_close(bam_file);
-            throw std::runtime_error("ERROR: Could not initialize BAM record.");
+            // throw std::runtime_error("ERROR: Could not initialize BAM
+            // record.");
+            printError("ERROR: Could not initialize BAM record.");
+            return std::make_pair(0.0, chr_pos_depth_map);
         }
 
         // Iterate through the chromosome and update the depth map
@@ -469,7 +491,9 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
                         try {
                             chr_pos_depth_map[ref_pos + j]++;
                         } catch (const std::out_of_range& oor) {
-                            std::cerr << "Out of range error for " << chr << ":" << ref_pos+j << std::endl;
+                            // std::cerr << "Out of range error for " << chr <<
+                            // ":" << ref_pos+j << std::endl;
+                            printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j));
                         }
                         // chr_pos_depth_map[ref_pos + j]++;
                     }
@@ -482,7 +506,9 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
                 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
                     // Do nothing
                 } else {
-                    throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
+                    // throw std::runtime_error("ERROR: Unknown CIGAR operation:
+                    // " + std::to_string(op));
+                    printError("ERROR: Unknown CIGAR operation: " + std::to_string(op));
                 }
             }
         }
@@ -554,14 +580,18 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     std::string snp_filepath = this->input_data.getSNPFilepath();
     if (snp_filepath.empty())
     {
-        throw std::runtime_error("ERROR: SNP file path is empty.");
+        // throw std::runtime_error("ERROR: SNP file path is empty.");
+        printError("ERROR: SNP file path is empty.");
+        return;
     }
 
     // Initialize the synced reader
     bcf_srs_t *snp_reader = bcf_sr_init();
     if (!snp_reader)
     {
-        throw std::runtime_error("ERROR: Could not initialize SNP reader.");
+        // throw std::runtime_error("ERROR: Could not initialize SNP reader.");
+        printError("ERROR: Could not initialize SNP reader.");
+        return;
     }
 
     // Lock during reading
@@ -572,7 +602,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
     {
         bcf_sr_destroy(snp_reader);
-        throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str);
+        // throw std::runtime_error("ERROR: Could not set region for SNP reader:
+        // " + region_str);
+        printError("ERROR: Could not set region for SNP reader: " + region_str);
+        return;
     }
 
     // Set multi-threading
@@ -586,7 +619,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
     {
         bcf_sr_destroy(snp_reader);
-        throw std::runtime_error("ERROR: Could not add SNP file to reader: " + snp_filepath);
+        // throw std::runtime_error("ERROR: Could not add SNP file to reader: "
+        // + snp_filepath);
+        printError("ERROR: Could not add SNP file to reader: " + snp_filepath);
+        return;
     }
 
     // Get the header
@@ -594,7 +630,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (!snp_header)
     {
         bcf_sr_destroy(snp_reader);
-        throw std::runtime_error("ERROR: Could not get header for SNP reader.");
+        // throw std::runtime_error("ERROR: Could not get header for SNP
+        // reader.");
+        printError("ERROR: Could not get header for SNP reader.");
+        return;
     }
 
     // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
@@ -670,7 +709,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             {
                 // std::cerr << "ERROR: AD value is missing for SNP at " << chr
                 // << ":" << pos << std::endl;
-                throw std::runtime_error("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos));
+                // throw std::runtime_error("ERROR: AD value is missing for SNP
+                // at " + chr + ":" + std::to_string(pos));
+                printError("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos));
+                continue;
             }
 
             // Calculate the B-allele frequency (BAF)
@@ -744,7 +786,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     bcf_srs_t *pfb_reader = bcf_sr_init();
     if (!pfb_reader)
     {
-        throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath);
+        // throw std::runtime_error("ERROR: Could not initialize synced reader
+        // for population frequency file: " + pfb_filepath);
+        printError("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath);
+        return;
     }
 
     // Lock during reading
@@ -755,7 +800,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0)
     {
         bcf_sr_destroy(pfb_reader);
-        throw std::runtime_error("ERROR: Could not set region for synced reader: " + region_str);
+        // throw std::runtime_error("ERROR: Could not set region for synced
+        // reader: " + region_str);
+        printError("ERROR: Could not set region for synced reader: " + region_str);
+        return;
     }
 
     // Set multi-threading
@@ -768,7 +816,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
     {
         bcf_sr_destroy(pfb_reader);
-        throw std::runtime_error("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath);
+        // throw std::runtime_error("ERROR: Could not add population frequency
+        // file to synced reader: " + pfb_filepath);
+        printError("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath);
+        return;
     }
 
     // Get the header
@@ -776,7 +827,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     if (!pfb_header)
     {
         bcf_sr_destroy(pfb_reader);
-        throw std::runtime_error("ERROR: Could not get header for population frequency file: " + pfb_filepath);
+        // throw std::runtime_error("ERROR: Could not get header for population
+        // frequency file: " + pfb_filepath);
+        printError("ERROR: Could not get header for population frequency file: " + pfb_filepath);
+        return;
     }
 
     int record_count = 0;
@@ -832,7 +886,9 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     }
     if (pfb_reader->errnum)
     {
-        std::cerr << "ERROR: " <<bcf_sr_strerror(pfb_reader->errnum) << std::endl;
+        // std::cerr << "ERROR: " <<bcf_sr_strerror(pfb_reader->errnum) <<
+        // std::endl;
+        printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
     }
 
     // Clean up
diff --git a/src/khmm.cpp b/src/khmm.cpp
index 22d4a269..b5b5bf02 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -12,6 +12,8 @@
 #include <limits>
 /// @endcond
 
+#include "utils.h"
+
 #define STATE_CHANGE 100000.0 /*this is the expected changes (D value) in the transition matrix*/
 #define VITHUGE 100000000000.0
 #define FLOAT_MINIMUM 1.175494351e-38 /*this is indeed machine dependent*/
@@ -423,7 +425,9 @@ CHMM ReadCHMM(const std::string filename)
 	std::ifstream file(filename);
 	if (!file.is_open())
 	{
-		throw std::runtime_error("Error opening file");
+		// throw std::runtime_error("Error opening file");
+		printError("Error opening file");
+		return CHMM();
 	}
 	CHMM hmm;
 
@@ -433,87 +437,115 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (sscanf(line.c_str(), "M=%d", &hmm.M) != 1)
 	{
-		throw std::runtime_error("Error reading M");
+		// throw std::runtime_error("Error reading M");
+		printError("Error reading M");
+		return CHMM();
 	}
 
 	// Read N
 	std::getline(file, line);
 	if (sscanf(line.c_str(), "N=%d", &hmm.N) != 1)
 	{
-		throw std::runtime_error("Error reading N");
+		// throw std::runtime_error("Error reading N");
+		printError("Error reading N");
+		return CHMM();
 	}
 
 	// Read A
 	std::getline(file, line);
 	if (line != "A:")
 	{
-		throw std::runtime_error("Error reading A");
+		// throw std::runtime_error("Error reading A");
+		printError("Error reading A");
+		return CHMM();
 	}
 	hmm.A = readMatrix(file, hmm.N, hmm.N);
 	if (hmm.A.size() != (size_t)hmm.N || hmm.A[0].size() != (size_t)hmm.N)
 	{
-		throw std::runtime_error("Error reading A");
+		// throw std::runtime_error("Error reading A");
+		printError("Error reading A");
+		return CHMM();
 	}
 
 	// Read B
 	std::getline(file, line);
 	if (line != "B:")
 	{
-		throw std::runtime_error("Error reading B");
+		// throw std::runtime_error("Error reading B");
+		printError("Error reading B");
+		return CHMM();
 	}
 	hmm.B = readMatrix(file, hmm.N, hmm.M);
 	if (hmm.B.size() != (size_t)hmm.N || hmm.B[0].size() != (size_t)hmm.M)
 	{
-		throw std::runtime_error("Error reading B");
+		// throw std::runtime_error("Error reading B");
+		printError("Error reading B");
+		return CHMM();
 	}
 
 	// Read pi
 	std::getline(file, line);
 	if (line != "pi:")
 	{
-		throw std::runtime_error("Error reading pi");
+		// throw std::runtime_error("Error reading pi");
+		printError("Error reading pi");
+		return CHMM();
 	}
 	hmm.pi = readVector(file, hmm.N);
 	if (hmm.pi.size() != (size_t)hmm.N)
 	{
-		throw std::runtime_error("Error reading pi");
+		// throw std::runtime_error("Error reading pi");
+		printError("Error reading pi");
+		return CHMM();
 	}
 
 	// Read B1_mean
 	std::getline(file, line);
 	if (line != "B1_mean:")
 	{
-		throw std::runtime_error("Error reading B1_mean");
+		// throw std::runtime_error("Error reading B1_mean");
+		printError("Error reading B1_mean");
+		return CHMM();
 	}
 	hmm.B1_mean = readVector(file, hmm.N);
 	if (hmm.B1_mean.size() != (size_t)hmm.N)
 	{
-		throw std::runtime_error("Error reading B1_mean");
+		// throw std::runtime_error("Error reading B1_mean");
+		printError("Error reading B1_mean");
+		return CHMM();
 	}
 
 	// Read B1_sd
 	std::getline(file, line);
 	if (line != "B1_sd:")
 	{
-		throw std::runtime_error("Error reading B1_sd");
+		// throw std::runtime_error("Error reading B1_sd");
+		printError("Error reading B1_sd");
+		return CHMM();
 	}
 	hmm.B1_sd = readVector(file, hmm.N);
 	if (hmm.B1_sd.size() != (size_t)hmm.N)
 	{
-		throw std::runtime_error("Error reading B1_sd");
+		// throw std::runtime_error("Error reading B1_sd");
+		printError("Error reading B1_sd");
+		return CHMM();
 	}
 
 	// Read B1_uf
 	std::getline(file, line);
 	if (line != "B1_uf:")
 	{
-		throw std::runtime_error("Error reading B1_uf");
+		// throw std::runtime_error("Error reading B1_uf");
+		printError("Error reading B1_uf");
+		return CHMM();
 	}
 	std::getline(file, line);
 	try {
 		hmm.B1_uf = std::stod(line);
 	} catch (const std::invalid_argument& e) {
-		throw std::runtime_error("Error reading B1_uf");
+		// throw std::runtime_error("Error reading B1_uf");
+		printError("Error reading B1_uf");
+		return CHMM();
 	}
 
 	// Read B2_mean
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 4d1cf0ef..c79d3149 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -20,6 +20,7 @@
 #include <fstream>
 #include <condition_variable>
 
+#include "ThreadPool.h"
 #include "utils.h"
 #include "sv_types.h"
 #include "version.h"
@@ -398,6 +399,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     CNVCaller cnv_caller(this->input_data);
     // cnv_caller.loadChromosomeData(chr);
     std::pair<double, std::vector<uint32_t>> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len);
+    if (chr_data.first == 0.0 || chr_data.second.size() == 0) {
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        return;
+    }
 
     // Process each chunk one at a time
     // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
@@ -468,20 +475,18 @@ void SVCaller::run()
     } else {
         chromosomes = this->input_data.getRefGenomeChromosomes();
     }
-
-    // Ignore all alternate contigs (contains 'alt', 'GL', 'NC', 'hs', etc.)
-    // chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) {
-    //     return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos;
-    // }), chromosomes.end());
         
     // Read the HMM from the file
     std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
 
-    // Set up threads for processing each chromosome
+    // Set up thread pool
     const int max_threads = this->input_data.getThreadCount();
     std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
+    ThreadPool pool(max_threads);
+
+    // Shared resources
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;
 
@@ -497,58 +502,34 @@ void SVCaller::run()
             printMessage("Completed chromosome " + chr);
         } catch (const std::exception& e) {
             printError("Error processing chromosome " + chr + ": " + e.what());
+        } catch (...) {
+            printError("Unknown error processing chromosome " + chr);
         }
     };
 
-    // Thread management
-    // std::vector<std::thread> threads;
+    // Futures vector
     std::vector<std::future<void>> futures;
-    std::atomic<int> active_threads(0);
-    std::mutex cv_mutex;
-    std::condition_variable cv;
-    for (const auto& chr : chromosomes) {
-        // Wait for a thread slot
-        {
-            std::unique_lock<std::mutex> lock(cv_mutex);
-            cv.wait(lock, [&] { return active_threads.load() < max_threads; });
-            active_threads.fetch_add(1);
-        }
 
-        // Launch a task
-        futures.push_back(std::async(std::launch::async, [&, chr] {
+    // Submit tasks to the thread pool and track futures
+    for (const auto& chr : chromosomes) {
+        futures.emplace_back(pool.enqueue([&, chr] {
+            printMessage("Processing chromosome " + chr);
             process_chr(chr);
-            {
-                std::lock_guard<std::mutex> lock(cv_mutex);
-                active_threads.fetch_sub(1);
-
-                // Notify threads waiting for a slot
-                cv.notify_all();
-            }
         }));
-        // while (active_threads.load() >= max_threads) {
-        //     std::this_thread::yield();
-        // }
-
-        // // Launch a new thread
-        // threads.emplace_back([&, chr] {
-        //     active_threads.fetch_add(1);
-        //     process_chr(chr);
-        //     active_threads.fetch_sub(1);
-        // });
     }
 
-    // Wait for all threads to complete
-    printMessage("Waiting for all threads to finish...");
+    // Wait for all tasks to complete
     for (auto& future : futures) {
-        future.get();
+        try {
+            future.get();
+            printMessage("Chromosome task completed.");
+        } catch (const std::exception& e) {
+            printError("Error processing chromosome task: " + std::string(e.what()));
+        } catch (...) {
+            printError("Unknown error processing chromosome task.");
+        }
     }
-    // for (auto& thread : threads) {
-    //     if (thread.joinable()) {
-    //         thread.join();
-    //     }
-    // }
-
-    printMessage("All threads have finished.");
+    printMessage("All tasks have finished.");
 
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
@@ -614,6 +595,10 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
                     // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1));
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
+                    if (std::get<1>(result) == SVType::UNKNOWN) {
+                        continue;
+                    }
+
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
@@ -675,6 +660,9 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
             // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
+            if (std::get<1>(bd_result) == SVType::UNKNOWN) {
+                continue;
+            }
             double bd_lh = std::get<0>(bd_result);
             SVType bd_type = std::get<1>(bd_result);
 
@@ -690,6 +678,9 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
                 // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
+                if (std::get<1>(gap_result) == SVType::UNKNOWN) {
+                    continue;
+                }
                 double gap_lh = std::get<0>(gap_result);
                 SVType gap_type = std::get<1>(gap_result);
 

From 934350f34026db79b23140bb6806e517c25f87cb Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 5 Dec 2024 16:53:05 -0500
Subject: [PATCH 046/134] Calculate ME rates

---
 python/mendelian_inheritance.py | 51 +++++++++++++++++++++++++++++++++
 python/plot_venn.py             |  5 ++--
 src/sv_caller.cpp               |  1 -
 3 files changed, 54 insertions(+), 3 deletions(-)
 create mode 100644 python/mendelian_inheritance.py

diff --git a/python/mendelian_inheritance.py b/python/mendelian_inheritance.py
new file mode 100644
index 00000000..911b994b
--- /dev/null
+++ b/python/mendelian_inheritance.py
@@ -0,0 +1,51 @@
+import csv
+import sys
+
+
+def read_tsv(file_path):
+    with open(file_path, 'r') as file:
+        reader = csv.reader(file, delimiter='\t')
+        return [row for row in reader]
+
+def calculate_mendelian_error(father_genotype, mother_genotype, child_genotype):
+    # Generate all possible child genotypes
+    child_genotypes = set()
+    for allele1 in father_genotype.split('/'):
+        for allele2 in mother_genotype.split('/'):
+            child_genotypes.add('/'.join(sorted([allele1, allele2])))
+    
+    # Check if the child genotype is valid
+    return 0 if child_genotype in child_genotypes else 1
+
+
+def main(father_file, mother_file, child_file):
+    father_records = read_tsv(father_file)
+    mother_records = read_tsv(mother_file)
+    child_records = read_tsv(child_file)
+
+    if len(father_records) != len(mother_records) or len(father_records) != len(child_records):
+        raise ValueError("All files must have the same number of records")
+
+    total_records = len(father_records)
+    error_count = 0
+
+    for i in range(total_records):
+        father_genotype = father_records[i][5]
+        mother_genotype = mother_records[i][5]
+        child_genotype = child_records[i][5]
+
+        error_count += calculate_mendelian_error(father_genotype, mother_genotype, child_genotype)
+
+    error_rate = error_count / total_records
+    print(f"Mendelian Inheritance Error Rate: {error_rate:.2%} for {total_records} SVs")
+
+if __name__ == "__main__":
+    if len(sys.argv) != 4:
+        print("Usage: python mendelian_inheritance.py <father_tsv> <mother_tsv> <child_tsv>")
+        sys.exit(1)
+
+    father_file = sys.argv[1]
+    mother_file = sys.argv[2]
+    child_file = sys.argv[3]
+
+    main(father_file, mother_file, child_file)
diff --git a/python/plot_venn.py b/python/plot_venn.py
index eb7e8e78..757f4408 100644
--- a/python/plot_venn.py
+++ b/python/plot_venn.py
@@ -4,7 +4,7 @@
 
 import matplotlib.pyplot as plt
 
-def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB):
+def plot_venn(AB, Ab, aB, output, plot_title, title_Ab, title_aB):
     plt.figure(figsize=(8, 8))
 
     print('AB:', AB)
@@ -27,7 +27,8 @@ def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB):
     venn.get_label_by_id('11').set_text(str(AB))
 
     # Update the title
-    plt.title("contextsv and " + title_aB + " venn diagram (all SV types)")
+    # plt.title("contextsv and " + title_aB + " venn diagram (all SV types)")
+    plt.title(plot_title)
     plt.savefig(output)
     plt.close()
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index c79d3149..95bf1441 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -397,7 +397,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
     printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
-    // cnv_caller.loadChromosomeData(chr);
     std::pair<double, std::vector<uint32_t>> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len);
     if (chr_data.first == 0.0 || chr_data.second.size() == 0) {
         hts_idx_destroy(idx);

From d5923fecd65ac2a0004fe66cc86132b2dcf9b1e6 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 6 Dec 2024 15:05:02 -0500
Subject: [PATCH 047/134] Add ME debug output

---
 python/mendelian_inheritance.py | 33 ++++++++++++++++++++++++++++++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/python/mendelian_inheritance.py b/python/mendelian_inheritance.py
index 911b994b..a8050e53 100644
--- a/python/mendelian_inheritance.py
+++ b/python/mendelian_inheritance.py
@@ -13,6 +13,10 @@ def calculate_mendelian_error(father_genotype, mother_genotype, child_genotype):
     for allele1 in father_genotype.split('/'):
         for allele2 in mother_genotype.split('/'):
             child_genotypes.add('/'.join(sorted([allele1, allele2])))
+
+    # Print the parent and child genotypes if invalid
+    # if child_genotype not in child_genotypes:
+    #     print(f"ME: Father: {father_genotype}, Mother: {mother_genotype}, Child: {child_genotype}")
     
     # Check if the child genotype is valid
     return 0 if child_genotype in child_genotypes else 1
@@ -29,15 +33,38 @@ def main(father_file, mother_file, child_file):
     total_records = len(father_records)
     error_count = 0
 
+    sv_type_dict = {}
+    sv_type_error_dict = {}
+
     for i in range(total_records):
         father_genotype = father_records[i][5]
         mother_genotype = mother_records[i][5]
         child_genotype = child_records[i][5]
+        child_sv_type = child_records[i][2]
+        sv_type_dict[child_sv_type] = sv_type_dict.get(child_sv_type, 0) + 1
+
+        # Check for a Mendelian error and tally it per SV type
+        error_value = calculate_mendelian_error(father_genotype, mother_genotype, child_genotype)
+        if error_value == 1:
+            # print(f"SV size: {father_records[i][2]}")
+            sv_type_error_dict[child_sv_type] = sv_type_error_dict.get(child_sv_type, 0) + 1
+
+        error_count += error_value
+        # error_count += calculate_mendelian_error(father_genotype, mother_genotype, child_genotype)
+
+    if total_records == 0:
+        error_rate = 0
+        print("No records found")
+    else:
+        error_rate = error_count / total_records
 
-        error_count += calculate_mendelian_error(father_genotype, mother_genotype, child_genotype)
+    print(f"Mendelian Inheritance Error Rate: {error_rate:.2%} for {total_records} shared trio SVs")
 
-    error_rate = error_count / total_records
-    print(f"Mendelian Inheritance Error Rate: {error_rate:.2%} for {total_records} SVs")
+    print("SV Type Distribution:")
+    for sv_type, count in sv_type_dict.items():
+        error_count = sv_type_error_dict.get(sv_type, 0)
+        error_rate = error_count / count
+        print(f"{sv_type}: {error_rate:.2%} ({error_count}/{count})")
 
 if __name__ == "__main__":
     if len(sys.argv) != 4:

From 7e3bac451a6204c33f49e44b38d0e793235afeed Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 11 Dec 2024 12:19:28 -0500
Subject: [PATCH 048/134] Fix alt allele error

---
 python/mendelian_inheritance.py |   4 +-
 src/cnv_caller.cpp              |  16 +++-
 src/sv_caller.cpp               |  49 +++++++---
 src/sv_object.cpp               | 152 +++++---------------------------
 4 files changed, 74 insertions(+), 147 deletions(-)

diff --git a/python/mendelian_inheritance.py b/python/mendelian_inheritance.py
index a8050e53..128b1d1a 100644
--- a/python/mendelian_inheritance.py
+++ b/python/mendelian_inheritance.py
@@ -15,8 +15,8 @@ def calculate_mendelian_error(father_genotype, mother_genotype, child_genotype):
             child_genotypes.add('/'.join(sorted([allele1, allele2])))
 
     # Print the parent and child genotypes if invalid
-    # if child_genotype not in child_genotypes:
-    #     print(f"ME: Father: {father_genotype}, Mother: {mother_genotype}, Child: {child_genotype}")
+    if child_genotype not in child_genotypes:
+        print(f"ME: Father: {father_genotype}, Mother: {mother_genotype}, Child: {child_genotype}")
     
     # Check if the child genotype is valid
     return 0 if child_genotype in child_genotypes else 1
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 2b3a48b4..4eb0bc26 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -338,15 +338,23 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             data_type = "Log2CNV";
         }
 
-        // Update the SV copy number data if not unknown
+        // Update the SV genotype if known
+        if (updated_sv_type != SVType::UNKNOWN)
+        {
+            sv_call.genotype = genotype;
+            sv_call.data_type = data_type;
+            sv_call.hmm_likelihood = likelihood;
+        }
+
+        // Update the SV type if known
         // printMessage("Updating SV copy number data for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
         if (updated_sv_type != SVType::UNKNOWN && updated_sv_type != SVType::NEUTRAL)
         {
             std::string sv_type_str = getSVTypeString(updated_sv_type);
             sv_call.sv_type = sv_type_str;
-            sv_call.data_type = data_type;
-            sv_call.genotype = genotype;
-            sv_call.hmm_likelihood = likelihood;
+            // sv_call.data_type = data_type;
+            // sv_call.genotype = genotype;
+            // sv_call.hmm_likelihood = likelihood;
         }
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 95bf1441..97672a91 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -367,7 +367,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Split the chromosome into chunks for memory efficiency
     std::vector<std::string> region_chunks;
-    int chunk_count = 100;
+    //int chunk_count = 100;
+    int chunk_count = 1;
     uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
     if (this->input_data.isRegionSet()) {
 
@@ -420,7 +421,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
         // PrimaryMap& primary_map = std::get<1>(region_data);
         // SuppMap& supp_map = std::get<2>(region_data);
-        // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl;
+        // std::cout << " CIGAR SV calls from " << sub_region << "..." << std::endl;
         printMessage(chr + ": Merging CIGAR...");
         filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
         mergeSVs(subregion_sv_calls);
@@ -552,19 +553,25 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
     // Find split-read SV evidence
     int sv_count = 0;
     uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
+    int primary_count = primary_map.size();
+    int current_primary = 0;
     for (const auto& entry : primary_map) {
+        current_primary++;
         std::string qname = entry.first;
         AlignmentData primary_alignment = entry.second;
         std::string primary_chr = std::get<0>(primary_alignment);
         uint32_t primary_start = std::get<1>(primary_alignment);
         uint32_t primary_end = std::get<2>(primary_alignment);
+        printMessage("Processing primary alignment " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " (Location: " + primary_chr + ":" + std::to_string(primary_start+1) + "-" + std::to_string(primary_end+1) + ")...");
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
             continue;
         }
 
-        // Find the largest supplementary alignment, and also identify inversions
+        // Find the largest supplementary alignment, and also identify
+        // inversions
+        printMessage("Finding largest supplementary alignment...");
         AlignmentData largest_supp_alignment = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
@@ -592,7 +599,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                         continue;
                     }
 
-                    // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1));
+                    printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1) + " of length " + std::to_string(supp_length) + " bp...");
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
@@ -600,18 +607,20 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
+                    printMessage("Calculating read depth for inversion (length: " + std::to_string(supp_length) + " bp)...");
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                     if (supp_type == SVType::NEUTRAL) {
                         addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
                         
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
-                        int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
+                        // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                         addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
                     }
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
+                    printMessage("Calculating read depth for small inversion (length: " + std::to_string(supp_length) + " bp)...");
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                     addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
                 }
@@ -619,12 +628,14 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
         }
 
         // Trim overlapping alignments
+        printMessage("Trimming overlapping alignments...");
         uint32_t supp_start = std::get<1>(largest_supp_alignment);
         uint32_t supp_end = std::get<2>(largest_supp_alignment);
         bool primary_before_supp = primary_start < supp_start;
         trimOverlappingAlignments(primary_alignment, largest_supp_alignment);
 
         // Create the SV candidate using both alignments
+        printMessage("Creating SV candidates...");
         supp_start = std::get<1>(largest_supp_alignment);
         supp_end = std::get<2>(largest_supp_alignment);
         primary_start = std::get<1>(primary_alignment);
@@ -658,6 +669,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
             }
 
             // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
+            printMessage("Running copy number prediction on boundary (Length: " + std::to_string(boundary_right - boundary_left) + " bp)...");
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -675,7 +687,10 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                     continue;
                 }
 
-                // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
+                // printMessage("Running copy number prediction on gap: " +
+                // primary_chr + ":" + std::to_string(gap_left) + "-" +
+                // std::to_string(gap_right));
+                printMessage("Running copy number prediction on gap (Length: " + std::to_string(gap_right - gap_left) + " bp)...");
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
@@ -686,16 +701,19 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
-                    std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
+                    printMessage("Calculating read depth for gap (length: " + std::to_string(gap_right - gap_left) + " bp)...");
+                    std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
                     addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
+                    printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)...");
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
                     addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
+                printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)...");
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
                 addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
@@ -829,18 +847,25 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
                 ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, preceding_pos);
 
-                // Format novel insertions
+                // Update the start position to the preceding base
+                start = preceding_pos;
+
+                // Update the end position to the same base for duplications and insertions
+                if (sv_type_str == "DUP" || sv_type_str == "INS") {
+                    end = start;
+                }
+
                 if (sv_type_str == "INS") {
                     // Check if in symbolic form
                     if (alt_allele != "<INS>") {
                         // Use the insertion sequence as the alternate allele
                         alt_allele.insert(0, ref_allele);
                     }
-                    start = preceding_pos;  // Update the position to the preceding base
+                    // start = preceding_pos;  // Update the position to the preceding base
 
-                    // Update the end position to the start position to change from
-                    // query to reference coordinates for insertions
-                    end = start;
+                    // // Update the end position to the start position to change from
+                    // // query to reference coordinates for insertions
+                    // end = start;
                 }
             }
 
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index d9b8e457..04201066 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -25,6 +25,16 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
         return;
     }
+
+    // If a DUP or DEL call arrives without an alt allele ("."), report the error
+    // and fall back to the matching symbolic allele (<DUP> or <DEL>)
+    if (sv_type == "DUP" && alt_allele == ".") {
+        printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
+        alt_allele = "<DUP>";
+    } else if (sv_type == "DEL" && alt_allele == ".") {
+        printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
+        alt_allele = "<DEL>";
+    }
     
     if (start >= end) {
         throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
@@ -94,17 +104,18 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
         SVCall& next = sv_calls[i];
         // Check for overlap
         if (next.start <= current_merge.end) {
-            // printMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")");
+            //XprintMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")");
+            
             // if (current_merge.start <= next.end && next.start <= current_merge.end) {
             // Calculate reciprocal overlap
             uint32_t overlap = std::max(0, (int)std::min(current_merge.end, next.end) - (int)std::max(current_merge.start, next.start));
             uint32_t union_length = std::max(current_merge.end, next.end) - std::min(current_merge.start, next.start);
             double overlap_fraction = static_cast<double>(overlap) / union_length;
-            // printMessage("Overlap fraction: " + std::to_string(overlap_fraction));
+            //XprintMessage("Overlap fraction: " + std::to_string(overlap_fraction));
 
             // Merge if reciprocal overlap is >90%
             if (overlap_fraction > 0.90) {
-                // printMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction));
+                //XprintMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction));
                 // Keep the SV call with the higher read support
                 if (next.support > current_merge.support) {
                     current_merge = next;
@@ -118,19 +129,14 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
                             current_merge = next;
                         }
                     }
-                    // // Keep the SV call with the higher read depth
-                    // if (next.read_depth > current_merge.read_depth) {
-                    //     current_merge = next;
-                    // } else if (next.read_depth == current_merge.read_depth) {
-                    //     // Keep the SV call with the higher likelihood
-                    //     if (next.hmm_likelihood > current_merge.hmm_likelihood) {
-                    //         current_merge = next;
-                    //     }
-                    // }
                 }
             } else {
-                merged_sv_calls.push_back(current_merge);
-                current_merge = next;
+            	// Continue with the larger length
+				uint32_t current_length = current_merge.end - current_merge.start;
+				uint32_t next_length = next.end - next.start;
+				if (next_length > current_length) {  // And support meets threshold
+					current_merge = next;
+				}
             }
         } else {
             merged_sv_calls.push_back(current_merge);
@@ -138,120 +144,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
         }
     }
 
-    // Add the last SV call
-    merged_sv_calls.push_back(current_merge);
-
-    // Update the SV calls
-    sv_calls = merged_sv_calls;
-    // for (size_t i = 0; i < sv_calls.size(); i++) {
-    //     SVCall& current = sv_calls[i];
-    //     bool merged = false;
-    //     for (size_t j = i + 1; j < sv_calls.size(); j++) {
-    //         SVCall& next = sv_calls[j];
-    //         if (current.start <= next.end && next.start <= current.end) {
-    //             // Calculate reciprocal overlap
-    //             uint32_t overlap = std::max(0, (int)std::min(current.end, next.end) - (int)std::max(current.start, next.start));
-    //             uint32_t union_length = std::max(current.end, next.end) - std::min(current.start, next.start);
-    //             double overlap_fraction = static_cast<double>(overlap) / union_length;
-
-    //             // Merge if reciprocal overlap is >90%
-    //             if (overlap_fraction > 0.9) {
-    //                 // Keep the SV call with the higher likelihood
-    //                 if (next.hmm_likelihood > current.hmm_likelihood) {
-    //                     current = next;
-    //                 }
-    //                 merged = true;
-    //             }
-
-    //             // Remove the merged SV call
-    //             sv_calls.erase(sv_calls.begin() + j);
-    //             j--;
-
-    //     }
-    //     if (!merged) {
-    //         merged_sv_calls.push_back(current);
-    //     }
-    // }
-    
-
-
-    // std::vector<SVCall> merged_sv_calls;
-    // auto it = sv_calls.begin();
-    // SVCall current_merge = *it++;
-    // double log_lh_eps = 1.0;  // Log likelihood epsilon
-    // for (; it != sv_calls.end(); ++it) {
-    //     SVCall& next = *it;
-
-    //     // Find overlap
-    //     // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
-    //     // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
-    //     if (next.start <= current_merge.end) {
-
-    //         // Merge based on read support
-    //         if (next.support > current_merge.support) {
-    //             // Compare only if lengths are within 20% of each other
-    //             uint32_t current_length = current_merge.end - current_merge.start;
-    //             uint32_t next_length = next.end - next.start;
-    //             double length_diff = std::abs((int)current_length - (int)next_length);
-    //             double length_threshold = 0.2 * (int)current_length;
-    //             if (length_diff <= length_threshold) {
-    //                 current_merge = next;  // Continue with the next call
-    //                 // printMessage("Keeping next SV call with support " + std::to_string(next.support));
-    //             } else {
-    //                 // Keep the larger SV
-    //                 if (next_length > current_length) {
-    //                     current_merge = next;
-    //                     // printMessage("Keeping next SV call with length " + std::to_string(next_length));
-    //                 }
-    //             }
-    //             // printMessage("Keeping next SV call with support " + std::to_string(next.support));
-
-    //         } else if (next.support == current_merge.support) {
-    //             // Merge based on existence of predictions
-    //             if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) {
-    //                 current_merge = next;  // Continue with the next call
-    //                 // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
-
-    //             // Merge based on prediction log likelihood
-    //             } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) {
-                    
-    //                 // Print all SV information
-    //                 // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support));
-    //                 // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support));
-    //                 // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood));
-
-    //                 // Keep the SV call with the higher likelihood. Compare only if
-    //                 // lengths are within 20% of each other
-    //                 uint32_t current_length = current_merge.end - current_merge.start;
-    //                 uint32_t next_length = next.end - next.start;
-    //                 double length_diff = std::abs((int)current_length - (int)next_length);
-    //                 double length_threshold = 0.2 * (int)current_length;
-    //                 if (length_diff <= length_threshold) {
-    //                     // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold));
-
-    //                     if (next.hmm_likelihood > current_merge.hmm_likelihood) {
-    //                         current_merge = next;  // Continue with the next call
-    //                         // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood));
-    //                     }
-                    
-    //                 } else {
-    //                     // Keep the larger SV
-    //                     if (next_length > current_length) {
-    //                         current_merge = next;
-    //                         // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length));
-    //                     }
-    //                 }
-    //             }
-    //         }
-
-    //     } else {
-    //         // No overlap: Save the call and continue
-    //         merged_sv_calls.emplace_back(current_merge);
-    //         current_merge = next;
-    //     }
-    // }
-    // merged_sv_calls.emplace_back(current_merge);  // Save the last call
-    // sv_calls = merged_sv_calls;  // Update the SV calls
+    merged_sv_calls.push_back(current_merge);  // Add the last SV call
+    sv_calls = merged_sv_calls;  // Update the SV calls
 
     int updated_size = sv_calls.size();
     std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
@@ -259,7 +153,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
 
 void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
 {
-    int prev_size = sv_calls.size();
+    // int prev_size = sv_calls.size();
 
     // Filter SV calls with low read support
     sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {

From a169b731bbc1d68a8e6d475d530c2ba01bb4563c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 11 Dec 2024 18:07:36 -0500
Subject: [PATCH 049/134] Replace fixed window with sample size

---
 include/cnv_caller.h |  10 +-
 include/input_data.h |   8 +-
 src/cnv_caller.cpp   | 538 ++++++++++++++++++++++---------------------
 src/input_data.cpp   |  10 +-
 src/main.cpp         |  28 +--
 src/sv_caller.cpp    |  41 +---
 src/sv_object.cpp    |   5 -
 7 files changed, 308 insertions(+), 332 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index b36d414f..457336e1 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -76,9 +76,9 @@ class CNVCaller {
         void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction);
 
         // Query a region for SNPs and return the SNP data
-        std::pair<SNPData, bool> querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
+        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data);
 
-        void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb);
+        void querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp);
 
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count);
@@ -100,8 +100,10 @@ class CNVCaller {
         // chromosome coverage
         double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
 
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf);
-        void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map);
+        void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2);
+
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp);
+        // void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<double>& snp_pfb);
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood);
diff --git a/include/input_data.h b/include/input_data.h
index 7d577784..65051e77 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -69,9 +69,9 @@ class InputData {
         void setEthnicity(std::string ethnicity);
         std::string getEthnicity();
 
-        // Set the window size for the log2 ratio calculation.
-        void setWindowSize(int window_size);
-        int getWindowSize();
+        // Set the sample size for HMM predictions.
+        void setSampleSize(int sample_size);
+        int getSampleSize();
 
         // Set the minimum CNV length to use for copy number predictions.
         void setMinCNVLength(int min_cnv_length);
@@ -112,7 +112,7 @@ class InputData {
         std::unordered_map<std::string, std::string> pfb_filepaths;  // Map of population frequency VCF filepaths by chromosome
         ReferenceGenome fasta_query;
         std::string output_dir;
-        int window_size;
+        int sample_size;
         int min_cnv_length;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 4eb0bc26..eb217728 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -46,8 +46,6 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
     int data_count = (int) snp_data.pos.size();
     if (data_count == 0)
     {
-        // throw std::runtime_error("Error: No SNP data found for Viterbi
-        // algorithm.");
         printError("ERROR: No SNP data found for Viterbi algorithm.");
         prediction = std::make_pair(std::vector<int>(), 0.0);
     }
@@ -55,18 +53,44 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
 }
 
 // Function to obtain SNP information for a region
-std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
+void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data)
 {
-    SNPData snp_data;
-    bool snps_found = false;
-    uint32_t window_size = (uint32_t)this->input_data.getWindowSize();
+    // uint32_t window_size = (uint32_t)this->input_data.getWindowSize();
+
+    // Initialize the SNP data with default values and sample size length
+    int sample_size = this->input_data.getSampleSize();
+    int region_length = (int) (end_pos - start_pos + 1);
+    if (region_length < sample_size)
+    {
+        sample_size = region_length;
+    }
+
+    printMessage("Querying SNPs for region length " + std::to_string(region_length) + " bp with sample size " + std::to_string(sample_size) + "...");
+
+    // std::set<uint32_t> snp_pos(sample_size);
+    std::vector<uint32_t> snp_pos(sample_size, 0);
+    std::vector<double> snp_baf(sample_size, -1.0);
+    std::vector<double> snp_pfb(sample_size, 0.5);
+    std::vector<double> snp_log2_cov(sample_size, 0.0);
+    std::vector<bool> is_snp(sample_size, false);
+    // std::unordered_map<uint32_t, double> snp_baf(sample_size, -1.0);
+    // std::unordered_map<uint32_t, double> snp_pfb(sample_size, 0.5);
 
     // Query the SNPs for the entire region
-    std::set<uint32_t> snp_pos;
-    std::unordered_map<uint32_t, double> snp_baf;
-    std::unordered_map<uint32_t, double> snp_pfb;
-    this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb);
+    this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp);
+
+    // Get the log2 ratio for <sample_size> evenly spaced positions in the
+    // region
+    this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov);
+
+    // Update the SNP data with all information
+    snp_data.pos = std::move(snp_pos);
+    snp_data.baf = std::move(snp_baf);
+    snp_data.pfb = std::move(snp_pfb);
+    snp_data.log2_cov = std::move(snp_log2_cov);
+    snp_data.is_snp = std::move(is_snp);
 
+    /*
     // Loop through the range of the SV region and query the SNPs in a sliding
     // window, then calculate the log2 ratio for each window
     for (uint32_t i = start_pos; i <= end_pos; i += window_size)
@@ -134,8 +158,8 @@ std::pair<SNPData, bool> CNVCaller::querySNPRegion(std::string chr, uint32_t sta
             }
         }
     }
-
-    return std::make_pair(snp_data, snps_found);
+    */
+    // return std::make_pair(snp_data, snps_found);
 }
 
 std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
@@ -164,17 +188,19 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // uint32_t snp_end_pos = end_pos + sv_half_length;
 
     // Query the SNP region for the SV candidate
-    std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
-    SNPData& sv_snps = snp_call.first;
-    bool sv_snps_found = snp_call.second;
+    SNPData snp_data;
+    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data);
+    // std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
+    // SNPData& sv_snps = snp_call.first;
+    // bool sv_snps_found = snp_call.second;
 
     // Run the Viterbi algorithm
-    // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")...");
+    printMessage("Running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp, snp data found: " + std::to_string(snp_data.pos.size()) + "...");
     std::pair<std::vector<int>, double> prediction;
-    runViterbi(hmm, sv_snps, prediction);
+    runViterbi(hmm, snp_data, prediction);
     if (prediction.first.size() == 0)
     {
-        return std::make_tuple(0.0, SVType::UNKNOWN, "./.", sv_snps_found);
+        return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
     }
 
     std::vector<int>& state_sequence = prediction.first;
@@ -184,7 +210,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     std::vector<int> sv_states;
     for (size_t i = 0; i < state_sequence.size(); i++)
     {
-        if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos)
+        if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos)
         {
             sv_states.push_back(state_sequence[i]);
         }
@@ -217,7 +243,8 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
         predicted_cnv_type = getSVTypeFromCNState(max_state);
         genotype = cnv_genotype_map[max_state];
     }
-    sv_snps.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
+    snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
+    printMessage("Finished running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", returning...");
 
     // Save the SV calls as a TSV file if enabled
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
@@ -226,10 +253,10 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
         std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
         printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "...");
-        this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
+        this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
     }
     
-    return std::make_tuple(likelihood, predicted_cnv_type, genotype, sv_snps_found);
+    return std::make_tuple(likelihood, predicted_cnv_type, genotype, true);
 }
 
 
@@ -275,18 +302,17 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
             snp_end_pos = end_pos + sv_half_length;
         }
-        std::pair<SNPData, bool> snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
-        SNPData& sv_snps = snp_call.first;
-        bool snps_found = snp_call.second;
+        SNPData snp_data;
+        this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data);
 
         // Run the Viterbi algorithm
-        if (sv_snps.pos.size() == 0) {
+        if (snp_data.pos.size() == 0) {
         	std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
         	continue;
         }
         
         std::pair<std::vector<int>, double> prediction;
-        runViterbi(hmm, sv_snps, prediction);
+        runViterbi(hmm, snp_data, prediction);
         std::vector<int>& state_sequence = prediction.first;
         double likelihood = prediction.second;
         // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
@@ -296,7 +322,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         std::vector<int> sv_states;
         for (size_t i = 0; i < state_sequence.size(); i++)
         {
-            if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos)
+            if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos)
             {
                 sv_states.push_back(state_sequence[i]);
             }
@@ -331,12 +357,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         // Determine the SV calling method used to call the SV
         // (SNPCNV=SNP-based, Log2CNV=coverage-based)
         std::string data_type;
-        if (snps_found)
-        {
-            data_type = "SNPCNV";
-        } else {
-            data_type = "Log2CNV";
-        }
+        data_type = "HMM";
 
         // Update the SV genotype if known
         if (updated_sv_type != SVType::UNKNOWN)
@@ -352,24 +373,20 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         {
             std::string sv_type_str = getSVTypeString(updated_sv_type);
             sv_call.sv_type = sv_type_str;
-            // sv_call.data_type = data_type;
-            // sv_call.genotype = genotype;
-            // sv_call.hmm_likelihood = likelihood;
         }
 
         // Save the SV calls as a TSV file if enabled, if the SV type is
         // known, and the length is greater than 10 kb
-        // SVType updated_sv_type = sv_candidates[sv_call].sv_type;
         if (this->input_data.getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000)
         {
             // Add the state sequence to the SNP data (avoid copying the data)
-            sv_snps.state_sequence = std::move(state_sequence);
+            snp_data.state_sequence = std::move(state_sequence);
 
             // Save the SV calls as a TSV file
             std::string cnv_type_str = getSVTypeString(updated_sv_type);
             std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv";
             printMessage("Saving SV CIGAR copy number predictions to " + sv_filename);
-            this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
+            this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
         }
     }
 }
@@ -582,18 +599,42 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const
     return window_log2_ratio;
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf)
+void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& log2_region)
+{
+    uint32_t region_length = end_pos - start_pos + 1;
+    for (int i = 0; i < sample_size; i++)
+    {
+        uint32_t pos = start_pos + ((double)region_length / sample_size) * i;
+        try {
+            uint32_t depth = pos_depth_map.at(pos);
+
+            // Calculate the log2 ratio for the position
+            if (depth == 0)
+            {
+                log2_region[i] = 0.0;
+            } else {
+                log2_region[i] = log2((double) depth / mean_chr_cov);
+            }
+
+        } catch (const std::out_of_range& e) {
+            log2_region[i] = 0.0;
+        }
+        // printMessage("Position: " + std::to_string((int)pos) + ", log2 ratio: " + std::to_string(log2_region[i]));
+    }
+}
+
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp)
 {
+    // --------- SNP file ---------
     // Get the SNP file path
     std::string snp_filepath = this->input_data.getSNPFilepath();
     if (snp_filepath.empty())
     {
-        // throw std::runtime_error("ERROR: SNP file path is empty.");
         printError("ERROR: SNP file path is empty.");
         return;
     }
 
-    // Initialize the synced reader
+    // Initialize the SNP file reader
     bcf_srs_t *snp_reader = bcf_sr_init();
     if (!snp_reader)
     {
@@ -601,34 +642,20 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         printError("ERROR: Could not initialize SNP reader.");
         return;
     }
+    snp_reader->require_index = 1;
 
-    // Lock during reading
-    std::lock_guard<std::mutex> lock(this->snp_file_mtx);
-
-    // Set the region
-    std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
+    // Set multi-threading if running on a single chromosome
+    if (this->input_data.getChromosome() != "")
     {
-        bcf_sr_destroy(snp_reader);
-        // throw std::runtime_error("ERROR: Could not set region for SNP reader:
-        // " + region_str);
-        printError("ERROR: Could not set region for SNP reader: " + region_str);
-        return;
+        int thread_count = this->input_data.getThreadCount() - 1;  // Leave one thread for the main thread
+        printMessage("Setting SNP reader threads to " + std::to_string(thread_count / 2));
+        bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2));
     }
 
-    // Set multi-threading
-    // int thread_count = this->input_data.getThreadCount();
-    // bcf_sr_set_threads(snp_reader, thread_count);
-
-    // Enable index usage
-    snp_reader->require_index = 1;
-
     // Add the SNP file to the reader
     if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
     {
         bcf_sr_destroy(snp_reader);
-        // throw std::runtime_error("ERROR: Could not add SNP file to reader: "
-        // + snp_filepath);
         printError("ERROR: Could not add SNP file to reader: " + snp_filepath);
         return;
     }
@@ -638,124 +665,17 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (!snp_header)
     {
         bcf_sr_destroy(snp_reader);
-        // throw std::runtime_error("ERROR: Could not get header for SNP
-        // reader.");
         printError("ERROR: Could not get header for SNP reader.");
         return;
     }
 
-    // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl;
-    int record_count = 0;
-    while (bcf_sr_next_line(snp_reader) > 0)
-    {
-        if (!bcf_sr_has_line(snp_reader, 0))
-        {
-            continue;
-        }
-        bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0);
-        if (snp_record)
-        {
-            record_count++;
-            uint32_t pos = (uint32_t)snp_record->pos + 1;
-
-            // Skip if not a SNP
-            if (!bcf_is_snp(snp_record))
-            {
-                continue;
-            }
-
-            // Get the QUAL, DP, and AD values
-            float qual = snp_record->qual;
-            if (bcf_float_is_missing(qual))
-            {
-                // std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl;
-            }
-            // Skip if quality is less than 30
-            if (qual <= 30)
-            {
-                continue;
-            }
-
-            // Extract DP from FORMAT field
-            int32_t *dp = 0;
-            int dp_count = 0;
-            int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count);
-            bool dp_skip = false;
-            if (dp_ret < 0)
-            {
-                // std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl;
-            } else {
-                // Skip if depth is not greater than 10
-                for (int i = 0; i < dp_count; i++)
-                {
-                    if (dp[i] <= 10)
-                    {
-                        dp_skip = true;
-                        break;
-                    }
-                }
-            }
-            free(dp);
-            if (dp_skip)
-            {
-                continue;
-            }
-
-            // Skip if the SNP does not pass the filter
-            if (bcf_has_filter(snp_header, snp_record, const_cast<char*>("PASS")) != 1)
-            {
-                continue;
-            }
-
-            // Extract AD from FORMAT field
-            int32_t *ad = 0;
-            int ad_count = 0;
-            int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
-
-            // Skip if AD value is missing
-            if (ad_ret < 0)
-            {
-                // std::cerr << "ERROR: AD value is missing for SNP at " << chr
-                // << ":" << pos << std::endl;
-                // throw std::runtime_error("ERROR: AD value is missing for SNP
-                // at " + chr + ":" + std::to_string(pos));
-                printError("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos));
-                continue;
-            }
-
-            // Calculate the B-allele frequency (BAF)
-            double baf = 0.0;
-            double ad0 = 0.0;
-            double ad1 = 0.0;
-            for (int i = 0; i < ad_count; i++)
-            {
-                if (i == 0)
-                {
-                    ad0 = (double) ad[i];
-                } else if (i == 1) {
-                    ad1 = (double) ad[i];
-                }
-            }
-            free(ad);
-            baf = ad1 / (ad0 + ad1);
-
-            // Insert the SNP position and BAF into the maps
-            snp_pos.insert(pos);
-            snp_baf[pos] = baf;
-        }
-    }
+    // --------- Population allele frequency file ---------
 
-    // Clean up
-    bcf_sr_destroy(snp_reader);
-}
-
-void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map<uint32_t, double>& snp_pfb_map)
-{
-    // Get the population frequency file for the chromosome
+    // Get the population allele frequency file path
     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
-    if (pfb_filepath == "")
+    if (pfb_filepath.empty())
     {
-        // printError("No population frequency file provided for chromosome " + chr);
+        printError("ERROR: Population allele frequency file path is empty.");
         return;
     }
     
@@ -785,48 +705,30 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
         }
     }
 
-    // Remove the 'chr' prefix from the chromosome name for SNP data. All
-    // SNP data in this program does not use the 'chr' prefix
-    std::string chr_no_prefix = removeChrPrefix(chr);
-    // int thread_count = this->input_data.getThreadCount();
-
-    // Initialize the synced reader
+    // Initialize the population allele frequency reader
     bcf_srs_t *pfb_reader = bcf_sr_init();
     if (!pfb_reader)
     {
-        // throw std::runtime_error("ERROR: Could not initialize synced reader
-        // for population frequency file: " + pfb_filepath);
-        printError("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath);
+        bcf_sr_destroy(snp_reader);
+        printError("ERROR: Could not initialize population allele frequency reader.");
         return;
     }
+    pfb_reader->require_index = 1;
 
-    // Lock during reading
-    std::lock_guard<std::mutex> lock(this->pfb_file_mtx);
-
-    // Set the region for the synced reader
-    std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
-    if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0)
+    // Set multi-threading if running on a single chromosome
+    if (this->input_data.getChromosome() != "")
     {
-        bcf_sr_destroy(pfb_reader);
-        // throw std::runtime_error("ERROR: Could not set region for synced
-        // reader: " + region_str);
-        printError("ERROR: Could not set region for synced reader: " + region_str);
-        return;
+        int thread_count = std::max(1, (this->input_data.getThreadCount() - 1) / 2);  // Leave one thread for the main thread, split the rest between the two readers; never fewer than one
+        printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count));  // Log the value that is actually applied
+        bcf_sr_set_threads(pfb_reader, thread_count);
     }
 
-    // Set multi-threading
-    // bcf_sr_set_threads(pfb_reader, thread_count);
-
-    // Enable index usage
-    pfb_reader->require_index = 1;
-
-    // Add the population frequency file to the synced reader
+    // Add the population allele frequency file to the reader
     if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
     {
+        bcf_sr_destroy(snp_reader);
         bcf_sr_destroy(pfb_reader);
-        // throw std::runtime_error("ERROR: Could not add population frequency
-        // file to synced reader: " + pfb_filepath);
-        printError("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath);
+        printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
         return;
     }
 
@@ -834,72 +736,186 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos
     bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
     if (!pfb_header)
     {
+        bcf_sr_destroy(snp_reader);
         bcf_sr_destroy(pfb_reader);
-        // throw std::runtime_error("ERROR: Could not get header for population
-        // frequency file: " + pfb_filepath);
-        printError("ERROR: Could not get header for population frequency file: " + pfb_filepath);
+        printError("ERROR: Could not get header for population allele frequency reader.");
         return;
     }
 
-    int record_count = 0;
-    while (bcf_sr_next_line(pfb_reader) > 0)
+    // Split the region into samples
+    int sample_size = snp_pos.size();
+    std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size);
+
+    // Loop through the samples and read the SNP data, storing the first
+    // SNP position and BAF value for each sample
+    int print_count = 0;
+    int current_region = 0;
+    for (size_t i = 0; i < region_chunks.size(); ++i)
     {
-        if (!bcf_sr_has_line(pfb_reader, 0))
+        current_region++;
+        // NOTE: the snp_file_mtx lock_guard that used to guard this read is disabled
+
+        // Read the SNP data ----------------------------------------------
+
+        // Set the region
+        std::string region_str = region_chunks[i];
+        if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
         {
-            continue;
+            bcf_sr_destroy(snp_reader);
+            bcf_sr_destroy(pfb_reader);  // Both readers are open inside this loop; release both on failure
+            printError("ERROR: Could not set region for SNP reader: " + region_str);
+            return;
         }
-        // pfb_record = bcf_sr_get_line(pfb_reader, 0);
-        bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
-        // Do something with the record
-        if (pfb_record)
+
+        // std::cout << "Iterating through SNPs in region " << region_str <<
+        // "..." << std::endl;
+        // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp...");
+        bool snp_found = false;
+        while (bcf_sr_next_line(snp_reader) > 0)
         {
-            record_count++;
-            // Skip if not a SNP
-            if (!bcf_is_snp(pfb_record))
+            if (!bcf_sr_has_line(snp_reader, 0))
             {
                 continue;
             }
+            bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0);
+            if (snp_record)
+            {
+                uint32_t pos = (uint32_t)snp_record->pos + 1;
 
-            uint32_t pos = (uint32_t) pfb_record->pos + 1;  // 0-based to 1-based
+                // Skip if not a SNP
+                if (!bcf_is_snp(snp_record))
+                {
+                    continue;
+                }
 
-            // Get the population frequency for the SNP
-            float *pfb_f = NULL;
-            int count = 0;
-            int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-            if (pfb_status < 0 || count == 0)
-            {
-                continue;
+                // Get the QUAL, DP, and AD values
+                if (bcf_float_is_missing(snp_record->qual) || snp_record->qual <= 30)
+                {
+                    continue;
+                }
+
+                // Extract DP from FORMAT field. htslib allocates the buffer,
+                // so it must be freed on every path, including rejection.
+                int32_t *dp = nullptr;
+                int dp_count = 0;
+                int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count);
+                bool low_depth = (dp_ret <= 0 || dp[0] <= 10);  // Short-circuit keeps dp[0] from being read when nothing was returned
+                free(dp);
+                if (low_depth)
+                {
+                    continue;
+                }
+
+                // Skip if the SNP does not pass the filter
+                if (bcf_has_filter(snp_header, snp_record, const_cast<char*>("PASS")) != 1)
+                {
+                    continue;
+                }
+
+                // Extract AD from FORMAT field. Two non-negative depths with a
+                // non-zero total are required (negative values mark missing data).
+                int32_t *ad = nullptr;
+                int ad_count = 0;
+                int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
+                bool ad_valid = (ad_ret >= 2 && ad[0] >= 0 && ad[1] >= 0 && (ad[0] > 0 || ad[1] > 0));
+                double ref_depth = ad_valid ? (double) ad[0] : 0.0;
+                double alt_depth = ad_valid ? (double) ad[1] : 0.0;
+                free(ad);  // Values are copied out above, so the buffer can go before the early exit
+                if (!ad_valid)
+                {
+                    continue;
+                }
+
+                // Calculate the B-allele frequency (BAF); the denominator is non-zero here
+                double baf = alt_depth / (ref_depth + alt_depth);
+
+                // Add the SNP position and BAF information
+                snp_pos[i] = pos;
+                snp_baf[i] = baf;
+                is_snp[i] = true;
+                snp_found = true;
+
+                break;  // Only one SNP per region
             }
-            double pfb = (double) pfb_f[0];
-            free(pfb_f);
+        }
+
+        if (snp_reader->errnum)
+        {
+            printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum)));
+        }
+
+        // Continue if no SNP was found in the region
+        if (!snp_found)
+        {
+            continue;
+        }
+
+        // Read the population allele frequency data ----------------------
 
-            // Continue if the population frequency is outside the threshold
-            if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+        // Restrict the population reader to a single-base region at the SNP position
+        uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
+        std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
+        if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
+        {
+            bcf_sr_destroy(snp_reader);
+            bcf_sr_destroy(pfb_reader);
+            printError("ERROR: Could not set region for population allele frequency reader: " + snp_region_str);  // Report the region that actually failed, not the SNP chunk
+            return;
+        }
+
+        // Find the SNP position in the population allele frequency file
+        while (bcf_sr_next_line(pfb_reader) > 0)
+        {
+            if (!bcf_sr_has_line(pfb_reader, 0))
             {
                 continue;
             }
-
-            // Add the population frequency to the SNP data
-            if (snp_pfb_map.find(pos) == snp_pfb_map.end())
+            // pfb_record = bcf_sr_get_line(pfb_reader, 0);
+            bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
+            // Do something with the record
+            if (pfb_record)
             {
-                snp_pfb_map[pos] = pfb;
-            } else {
-                // Keep the larger population frequency
-                if (pfb > snp_pfb_map[pos])
+                // Skip if not a SNP
+                if (!bcf_is_snp(pfb_record))
+                {
+                    continue;
+                }
+
+                float *pfb_f = NULL;  // Population frequency values for the SNP (buffer allocated by htslib)
+                int count = 0;
+                int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+                if (pfb_status <= 0 || count == 0)
                 {
-                    snp_pfb_map[pos] = pfb;
+                    free(pfb_f);  // free(NULL) is a no-op, so this is safe whether or not a buffer was allocated
+                    continue;
+                }
+                double pfb = (double) pfb_f[0];
+                free(pfb_f);
+
+                // Continue if the population frequency is outside the threshold
+                if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+                {
+                    continue;
+                }
+
+                // Add the population frequency to the SNP data
+                snp_pfb[i] = pfb;
+
+                // Break after finding the SNP position
+                break;
+
+                if (print_count < 20) {
+                    printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
+                    print_count++;
                 }
             }
         }
+        if (pfb_reader->errnum)
+        {
+            printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
+        }
     }
-    if (pfb_reader->errnum)
-    {
-        // std::cerr << "ERROR: " <<bcf_sr_strerror(pfb_reader->errnum) <<
-        // std::endl;
-        printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
-    }
-
-    // Clean up
+    bcf_sr_destroy(snp_reader);
     bcf_sr_destroy(pfb_reader);
 }
 
@@ -995,29 +1011,29 @@ void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, doubl
     snp_data.is_snp.emplace_back(is_snp);
 }
 
-void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::set<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb)
+void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp)
 {
     std::string snp_chr = chr;
     chr = removeChrPrefix(chr);
 
     // Query the SNP allele frequencies for the SNPs
-    std::map<uint32_t, std::tuple<double, double>> snp_map;
-    this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf);
+    // std::map<uint32_t, std::tuple<double, double>> snp_map;
+    this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf, snp_pfb, is_snp);
 
     // Query the population frequencies for the SNPs
-    std::unordered_map<uint32_t, double> pfb_map;
-    this->readSNPPopulationFrequencies(chr, start, end, pfb_map);
+    // std::unordered_map<uint32_t, double> pfb_map;
+    // this->readSNPPopulationFrequencies(chr, start, end, snp_pfb);
 
     // Filter out the SNP population frequencies that are not in the SNP
     // position set
-    double pfb_default = 0.5;
-    for (auto& pos : snp_pos)
-    {
-        if (pfb_map.find(pos) != pfb_map.end())
-        {
-            snp_pfb[pos] = pfb_map[pos];
-        } else {
-            snp_pfb[pos] = pfb_default;
-        }
-    }
+    // double pfb_default = 0.5;
+    // for (auto& pos : snp_pos)
+    // {
+    //     if (pfb_map.find(pos) != pfb_map.end())
+    //     {
+    //         snp_pfb[pos] = pfb_map[pos];
+    //     } else {
+    //         snp_pfb[pos] = pfb_default;
+    //     }
+    // }
 }
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 74dd9788..4f9ae124 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -24,7 +24,7 @@ InputData::InputData()
     this->start_end = std::make_pair(0, 0);
     this->region_set = false;
     this->output_dir = "";
-    this->window_size = 2500;
+    this->sample_size = 100;
     this->min_cnv_length = 1000;
     this->thread_count = 1;
     this->hmm_filepath = "data/wgs.hmm";
@@ -124,14 +124,14 @@ void InputData::setOutputDir(std::string dirpath)
     system(cmd.c_str());
 }
 
-int InputData::getWindowSize()
+int InputData::getSampleSize()
 {
-    return this->window_size;
+    return this->sample_size;
 }
 
-void InputData::setWindowSize(int window_size)
+void InputData::setSampleSize(int sample_size)
 {
-    this->window_size = window_size;
+    this->sample_size = sample_size;
 }
 
 std::string InputData::getSNPFilepath()
diff --git a/src/main.cpp b/src/main.cpp
index 1d78ad5f..da0d8d93 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -40,8 +40,8 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("hmm-file") != args.end()) {
         input_data.setHMMFilepath(args.at("hmm-file"));
     }
-    if (args.find("window-size") != args.end()) {
-        input_data.setWindowSize(std::stoi(args.at("window-size")));
+    if (args.find("sample-size") != args.end()) {
+        input_data.setSampleSize(std::stoi(args.at("sample-size")));
     }
     if (args.find("min-cnv") != args.end()) {
         input_data.setMinCNVLength(std::stoi(args.at("min-cnv")));
@@ -58,20 +58,6 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("debug") != args.end()) {
         input_data.setVerbose(true);
     }
-    // input_data.setShortReadBam(bamFile);
-    // input_data.setLongReadBam(bamFile);
-    // input_data.setRefGenome(refFile);
-    // input_data.setSNPFilepath(vcfFile);
-    // //input_data.setChromosome("21");
-    // //input_data.setRegion("14486099-14515105");
-    // input_data.setThreadCount(threadCount);
-    // input_data.setAlleleFreqFilepaths(pfbFile);
-    // input_data.setHMMFilepath(hmmFile);
-    // input_data.setOutputDir(outputDir);
-    // input_data.saveCNVData(false);
-    // input_data.setThreadCount(threadCount);
-    // input_data.setWindowSize(windowSize);
-    // input_data.setMinCNVLength(minCNV);
 
     // Run ContextSV
     run(input_data);
@@ -85,15 +71,15 @@ void printUsage(const std::string& programName) {
                 << "  -s, --snp <vcf_file>          SNPs VCF file (required)\n"
                 << "  -o, --outdir <output_dir>     Output directory (required)\n"
                 << "  -c, --chr <chromosome>        Chromosome\n"
-                << "  -r, --region <region>         Region (e.g., 14486099-14515105)\n"
+                << "  -r, --region <region>         Region (start-end)\n"
                 << "  -t, --threads <thread_count>  Number of threads\n"
                 << "  -h, --hmm <hmm_file>          HMM file\n"
-                << "  -w, --window <window_size>    Window size\n"
+                << "  -n, --sample-size <size>      Sample size for HMM predictions\n"
                 << "     --min-cnv <min_length>     Minimum CNV length\n"
                 << "  -e, --eth <eth_file>          ETH file\n"
                 << "  -p, --pfb <pfb_file>          PFB file\n"
                 << "     --save-cnv                 Save CNV data\n"
-                << "     --debug                    Debug mode\n"
+                << "     --debug                    Debug mode with verbose logging\n"
                 << "     --version                  Print version and exit\n"
                 << "  -h, --help                    Print usage and exit\n";
 }
@@ -120,8 +106,8 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
             args["thread-count"] = argv[++i];
         } else if ((arg == "-h" || arg == "--hmm") && i + 1 < argc) {
             args["hmm-file"] = argv[++i];
-        } else if ((arg == "-w" || arg == "--window") && i + 1 < argc) {
-            args["window-size"] = argv[++i];
+        } else if ((arg == "-n" || arg == "--sample-size") && i + 1 < argc) {
+            args["sample-size"] = argv[++i];
         } else if (arg == "--min-cnv" && i + 1 < argc) {
             args["min-cnv"] = argv[++i];
         } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 97672a91..2e2295da 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -78,11 +78,6 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
             end--;  // Adjust to the last position of the alignment
             bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
 
-            // Check for underflow
-            if (start > 4000000000 || end > 4000000000) {
-                throw std::runtime_error("ERROR: Integer underflow for alignment at position " + std::to_string(start) + "-" + std::to_string(end));
-            }
-
             // Call SVs directly from the CIGAR string
             std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
             this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, pos_depth_map);
@@ -91,11 +86,6 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
             uint32_t query_start = std::get<1>(query_info);
             uint32_t query_end = std::get<2>(query_info);
 
-            // Check for underflow
-            if (query_start > 4000000000 || query_end > 4000000000) {
-                throw std::runtime_error("ERROR: Integer underflow for query at position " + std::to_string(query_start) + "-" + std::to_string(query_end));
-            }
-
             // Add the primary alignment to the map
             AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
             primary_alignments[qname] = alignment;
@@ -481,9 +471,13 @@ void SVCaller::run()
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
 
-    // Set up thread pool
-    const int max_threads = this->input_data.getThreadCount();
-    std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
+    // Use multi-threading across chromosomes unless a single chromosome is
+    // specified
+    int max_threads = 1;
+    if (this->input_data.getChromosome() == "") {
+        max_threads = this->input_data.getThreadCount();
+        std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
+    }
     ThreadPool pool(max_threads);
 
     // Shared resources
@@ -507,10 +501,8 @@ void SVCaller::run()
         }
     };
 
-    // Futures vector
-    std::vector<std::future<void>> futures;
-
     // Submit tasks to the thread pool and track futures
+    std::vector<std::future<void>> futures;
     for (const auto& chr : chromosomes) {
         futures.emplace_back(pool.enqueue([&, chr] {
             printMessage("Processing chromosome " + chr);
@@ -553,16 +545,12 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
     // Find split-read SV evidence
     int sv_count = 0;
     uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
-    int primary_count = primary_map.size();
-    int current_primary = 0;
     for (const auto& entry : primary_map) {
-        current_primary++;
         std::string qname = entry.first;
         AlignmentData primary_alignment = entry.second;
         std::string primary_chr = std::get<0>(primary_alignment);
         uint32_t primary_start = std::get<1>(primary_alignment);
         uint32_t primary_end = std::get<2>(primary_alignment);
-        printMessage("Processing primary alignment " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " (Location: " + primary_chr + ":" + std::to_string(primary_start+1) + "-" + std::to_string(primary_end+1) + ")...");
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
@@ -571,7 +559,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
         // Find the largest supplementary alignment, and also identify
         // inversions
-        printMessage("Finding largest supplementary alignment...");
+        // printMessage("Finding largest supplementary alignment...");
         AlignmentData largest_supp_alignment = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
@@ -599,7 +587,6 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                         continue;
                     }
 
-                    printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1) + " of length " + std::to_string(supp_length) + " bp...");
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
@@ -607,20 +594,17 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
 
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
-                    printMessage("Calculating read depth for inversion (length: " + std::to_string(supp_length) + " bp)...");
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                     if (supp_type == SVType::NEUTRAL) {
                         addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
                         
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
-                        // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                         addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
                     }
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
-                    printMessage("Calculating read depth for small inversion (length: " + std::to_string(supp_length) + " bp)...");
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
                     addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
                 }
@@ -628,14 +612,12 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
         }
 
         // Trim overlapping alignments
-        printMessage("Trimming overlapping alignments...");
         uint32_t supp_start = std::get<1>(largest_supp_alignment);
         uint32_t supp_end = std::get<2>(largest_supp_alignment);
         bool primary_before_supp = primary_start < supp_start;
         trimOverlappingAlignments(primary_alignment, largest_supp_alignment);
 
         // Create the SV candidate using both alignments
-        printMessage("Creating SV candidates...");
         supp_start = std::get<1>(largest_supp_alignment);
         supp_end = std::get<2>(largest_supp_alignment);
         primary_start = std::get<1>(primary_alignment);
@@ -669,7 +651,6 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
             }
 
             // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
-            printMessage("Running copy number prediction on boundary (Length: " + std::to_string(boundary_right - boundary_left) + " bp)...");
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -690,7 +671,6 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                 // printMessage("Running copy number prediction on gap: " +
                 // primary_chr + ":" + std::to_string(gap_left) + "-" +
                 // std::to_string(gap_right));
-                printMessage("Running copy number prediction on gap (Length: " + std::to_string(gap_right - gap_left) + " bp)...");
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
@@ -701,19 +681,16 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
-                    printMessage("Calculating read depth for gap (length: " + std::to_string(gap_right - gap_left) + " bp)...");
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
                     addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
-                    printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)...");
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
                     addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
-                printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)...");
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
                 addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 04201066..5330fd9f 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -16,11 +16,6 @@ bool SVCall::operator<(const SVCall & other) const
 
 void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
 {
-    // Catch underflow errors
-    if (start > 4000000000 || end > 4000000000) {
-        throw std::runtime_error("ERROR: Integer underflow for SV call at position " + std::to_string(start) + "-" + std::to_string(end));
-    }
-
     // Ignore unknown SV types
     if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
         return;

From 9e4367834156e424511626042a04fde373fe5714 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 12 Dec 2024 14:15:01 -0500
Subject: [PATCH 050/134] log memory usage

---
 include/cnv_caller.h |   2 +-
 include/utils.h      |   2 +
 src/cnv_caller.cpp   | 337 +++++++++++++++++--------------------------
 src/contextsv.cpp    |  17 +--
 src/main.cpp         |  12 +-
 src/sv_caller.cpp    |  32 +++-
 src/sv_object.cpp    |   6 +-
 src/utils.cpp        |  12 ++
 8 files changed, 190 insertions(+), 230 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 457336e1..f17e807e 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -94,7 +94,7 @@ class CNVCaller {
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mean chromosome coverage
-        std::pair<double, std::vector<uint32_t>> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len);
+        double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map);
 
         // Calculate the log2 ratio for a region given the read depths and mean
         // chromosome coverage
diff --git a/include/utils.h b/include/utils.h
index 4ec19138..7311efbc 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -25,4 +25,6 @@ std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start,
 
 std::string removeChrPrefix(std::string chr);
 
+void printMemoryUsage(const std::string &functionName);
+
 #endif // UTILS_H
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index eb217728..2d2c5548 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -2,7 +2,6 @@
 #include "cnv_caller.h"
 
 #include <htslib/sam.h>
-
 #include <htslib/vcf.h>
 #include <htslib/hts.h>
 #include <htslib/synced_bcf_reader.h>
@@ -65,8 +64,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
         sample_size = region_length;
     }
 
-    printMessage("Querying SNPs for region length " + std::to_string(region_length) + " bp with sample size " + std::to_string(sample_size) + "...");
-
     // std::set<uint32_t> snp_pos(sample_size);
     std::vector<uint32_t> snp_pos(sample_size, 0);
     std::vector<double> snp_baf(sample_size, -1.0);
@@ -89,77 +86,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.pfb = std::move(snp_pfb);
     snp_data.log2_cov = std::move(snp_log2_cov);
     snp_data.is_snp = std::move(is_snp);
-
-    /*
-    // Loop through the range of the SV region and query the SNPs in a sliding
-    // window, then calculate the log2 ratio for each window
-    for (uint32_t i = start_pos; i <= end_pos; i += window_size)
-    {
-        // Run a sliding non-overlapping window of size window_size across
-        // the SV region and calculate the log2 ratio for each window
-        uint32_t window_start = i;
-        uint32_t window_end = std::min(i + window_size - 1, end_pos);
-
-        // Get the SNP info for the window
-        std::vector<uint32_t> snp_window_pos;
-        std::vector<double> snp_window_bafs;
-        std::vector<double> snp_window_pfbs;
-        auto it_start = snp_pos.lower_bound(window_start);
-        auto it_end = snp_pos.upper_bound(window_end);
-        for (auto it = it_start; it != it_end; it++)
-        {
-            snp_window_pos.push_back(*it);
-            snp_window_bafs.push_back(snp_baf[*it]);
-            snp_window_pfbs.push_back(snp_pfb[*it]);
-        }
-
-        // Loop though the SNP positions and calculate the log2 ratio for
-        // the window up to the SNP, then calculate the log2 ratio centered
-        // at the SNP, and finally calculate the log2 ratio for the window
-        // after the SNP, and continue until the end of the window
-        // (If there are no SNPs in the window, then use the default BAF and
-        // PFB values, and the coverage log2 ratio)
-        // If no SNPs, then calculate the log2 ratio for the window
-        if (snp_window_pos.size() == 0)
-        {
-            double window_log2_ratio = calculateLog2Ratio(window_start, window_end, pos_depth_map, mean_chr_cov);
-            double pfb_default = 0.5;
-            double baf_default = -1.0;  // Use -1.0 to indicate no BAF data
-            this->updateSNPData(snp_data, (window_start + window_end) / 2, pfb_default, baf_default, window_log2_ratio, false);
-
-        } else {
-            snps_found = true;
-
-            // Loop through the SNPs and calculate the log2 ratios
-            for (int j = 0; j < (int) snp_window_pos.size(); j++)
-            {
-                // Just use a window centered at the SNP position
-                uint32_t bin_start = snp_window_pos[j] - window_size / 2;
-                uint32_t bin_end = snp_window_pos[j] + window_size / 2;
-
-                // Trim the bin start and end to 1/2 the distance from the
-                // neighboring SNPs (or the start/end of the window)
-                if (j > 0)
-                {
-                    bin_start = std::max(bin_start, (snp_window_pos[j-1] + snp_window_pos[j]) / 2);
-                }
-
-                if (j < (int) snp_window_pos.size() - 1)
-                {
-                    bin_end = std::min(bin_end, (snp_window_pos[j] + snp_window_pos[j+1]) / 2);
-                }
-
-                // Calculate the log2 ratio for the SNP bin
-                double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov);
-                this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true);
-
-                // Update the previous bin start
-                bin_start = bin_end + 1;
-            }
-        }
-    }
-    */
-    // return std::make_pair(snp_data, snps_found);
 }
 
 std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
@@ -195,13 +121,14 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // bool sv_snps_found = snp_call.second;
 
     // Run the Viterbi algorithm
-    printMessage("Running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp, snp data found: " + std::to_string(snp_data.pos.size()) + "...");
+    printMemoryUsage("Before running Viterbi algorithm, ");
     std::pair<std::vector<int>, double> prediction;
     runViterbi(hmm, snp_data, prediction);
     if (prediction.first.size() == 0)
     {
         return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
     }
+    printMemoryUsage("After running Viterbi algorithm, ");
 
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
@@ -244,7 +171,6 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
         genotype = cnv_genotype_map[max_state];
     }
     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
-    printMessage("Finished running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", returning...");
 
     // Save the SV calls as a TSV file if enabled
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
@@ -315,7 +241,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         runViterbi(hmm, snp_data, prediction);
         std::vector<int>& state_sequence = prediction.first;
         double likelihood = prediction.second;
-        // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
 
         // Get all the states in the SV region
         // printMessage("Getting states for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
@@ -417,9 +342,9 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 }
 
 // Calculate the mean chromosome coverage
-std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len)
+double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map)
 {
-    std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0); // 1-based index
+    printMemoryUsage("Before calculating mean chromosome coverage, ");
     {
         // Lock the bam file
         std::lock_guard<std::mutex> lock(this->bam_file_mtx);
@@ -429,10 +354,8 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
         if (!bam_file)
         {
-            // throw std::runtime_error("ERROR: Could not open BAM file: " +
-            // bam_filepath);
             printError("ERROR: Could not open BAM file: " + bam_filepath);
-            return std::make_pair(0.0, chr_pos_depth_map);
+            return 0.0;
         }
 
         // Enable multi-threading
@@ -444,8 +367,7 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         {
             sam_close(bam_file);
             printError("ERROR: Could not read header from BAM file: " + bam_filepath);
-            return std::make_pair(0.0, chr_pos_depth_map);
-            // throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath);
+            return 0.0;
         }
 
         // Load the index
@@ -454,10 +376,8 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         {
             bam_hdr_destroy(bam_header);
             sam_close(bam_file);
-            // throw std::runtime_error("ERROR: Could not load index for BAM
-            // file: " + bam_filepath);
             printError("ERROR: Could not load index for BAM file: " + bam_filepath);
-            return std::make_pair(0.0, chr_pos_depth_map);  
+            return 0.0;
         }
 
         // Create an iterator for the chromosome
@@ -467,11 +387,8 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
             hts_idx_destroy(bam_index);
             bam_hdr_destroy(bam_header);
             sam_close(bam_file);
-            // throw std::runtime_error("ERROR: Could not create iterator for
-            // chromosome: " + chr + ", check if the chromosome exists in the
-            // BAM file.");
             printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file.");
-            return std::make_pair(0.0, chr_pos_depth_map);
+            return 0.0;
         }
 
         // Initialize the record
@@ -482,14 +399,11 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
             hts_idx_destroy(bam_index);
             bam_hdr_destroy(bam_header);
             sam_close(bam_file);
-            // throw std::runtime_error("ERROR: Could not initialize BAM
-            // record.");
             printError("ERROR: Could not initialize BAM record.");
-            return std::make_pair(0.0, chr_pos_depth_map);
+            return 0.0;
         }
 
         // Iterate through the chromosome and update the depth map
-        // std::unordered_map<uint32_t, int> chr_pos_depth_map;
         while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
         {
             // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
@@ -516,11 +430,8 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
                         try {
                             chr_pos_depth_map[ref_pos + j]++;
                         } catch (const std::out_of_range& oor) {
-                            // std::cerr << "Out of range error for " << chr <<
-                            // ":" << ref_pos+j << std::endl;
                             printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j));
                         }
-                        // chr_pos_depth_map[ref_pos + j]++;
                     }
                 }
                 
@@ -531,8 +442,6 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
                 } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
                     // Do nothing
                 } else {
-                    // throw std::runtime_error("ERROR: Unknown CIGAR operation:
-                    // " + std::to_string(op));
                     printError("ERROR: Unknown CIGAR operation: " + std::to_string(op));
                 }
             }
@@ -545,6 +454,7 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         bam_hdr_destroy(bam_header);
         sam_close(bam_file);
     }
+    printMemoryUsage("After calculating mean chromosome coverage, ");
 
     // Calculate the mean chromosome coverage for positions with non-zero depth
     uint64_t cum_depth = 0;
@@ -564,7 +474,8 @@ std::pair<double, std::vector<uint32_t>> CNVCaller::calculateMeanChromosomeCover
         mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
     }
 
-    return std::make_pair(mean_chr_cov, chr_pos_depth_map);
+    // return std::make_pair(mean_chr_cov, chr_pos_depth_map);
+    return mean_chr_cov;
 }
 
 double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
@@ -625,6 +536,8 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
 
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp)
 {
+    printMemoryUsage("Reading SNP allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
+    
     // --------- SNP file ---------
     // Get the SNP file path
     std::string snp_filepath = this->input_data.getSNPFilepath();
@@ -659,6 +572,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         printError("ERROR: Could not add SNP file to reader: " + snp_filepath);
         return;
     }
+    printMemoryUsage("After adding SNP file to reader, ");
 
     // Get the header
     bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0);
@@ -672,74 +586,82 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     // --------- Population allele frequency file ---------
 
     // Get the population allele frequency file path
+    bool use_pfb = true;
     std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
     if (pfb_filepath.empty())
     {
-        printError("ERROR: Population allele frequency file path is empty.");
-        return;
+        use_pfb = false;
+        // printError("ERROR: Population allele frequency file path is empty.");
+        // return;
     }
     
-    // Determine the ethnicity-specific allele frequency key
-    std::string AF_key = "AF";
-    if (this->input_data.getEthnicity() != "")
-    {
-        AF_key += "_" + this->input_data.getEthnicity();
-    }
-
-    // Check if the filepath uses the 'chr' prefix notations based on the
-    // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
-    std::string chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
-    std::string chr_prefix = "chr";
-    if (pfb_filepath.find(chr_prefix) == std::string::npos)
+    bcf_srs_t *pfb_reader = bcf_sr_init();
+    std::string chr_gnomad;
+    std::string AF_key;
+    if (use_pfb)
     {
-        // Remove the 'chr' prefix from the chromosome name
-        if (chr_gnomad.find(chr_prefix) != std::string::npos)
+        // Determine the ethnicity-specific allele frequency key
+        AF_key = "AF";
+        if (this->input_data.getEthnicity() != "")
         {
-            chr_gnomad = chr_gnomad.substr(chr_prefix.length());
+            AF_key += "_" + this->input_data.getEthnicity();
         }
-    } else {
-        // Add the 'chr' prefix to the chromosome name
-        if (chr_gnomad.find(chr_prefix) == std::string::npos)
+
+        // Check if the filepath uses the 'chr' prefix notations based on the
+        // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
+        chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
+        std::string chr_prefix = "chr";
+        if (pfb_filepath.find(chr_prefix) == std::string::npos)
         {
-            chr_gnomad = chr_prefix + chr;
+            // Remove the 'chr' prefix from the chromosome name
+            if (chr_gnomad.find(chr_prefix) != std::string::npos)
+            {
+                chr_gnomad = chr_gnomad.substr(chr_prefix.length());
+            }
+        } else {
+            // Add the 'chr' prefix to the chromosome name
+            if (chr_gnomad.find(chr_prefix) == std::string::npos)
+            {
+                chr_gnomad = chr_prefix + chr;
+            }
         }
-    }
 
-    // Initialize the population allele frequency reader
-    bcf_srs_t *pfb_reader = bcf_sr_init();
-    if (!pfb_reader)
-    {
-        bcf_sr_destroy(snp_reader);
-        printError("ERROR: Could not initialize population allele frequency reader.");
-        return;
-    }
-    pfb_reader->require_index = 1;
+        // Initialize the population allele frequency reader
+        if (!pfb_reader)
+        {
+            bcf_sr_destroy(snp_reader);
+            printError("ERROR: Could not initialize population allele frequency reader.");
+            return;
+        }
+        pfb_reader->require_index = 1;
 
-    // Set multi-threading if running on a single chromosome
-    if (this->input_data.getChromosome() != "")
-    {
-        int thread_count = this->input_data.getThreadCount() - 1;  // Leave one thread for the main thread
-        printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2));
-        bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2));
-    }
+        // Set multi-threading if running on a single chromosome
+        if (this->input_data.getChromosome() != "")
+        {
+            int thread_count = this->input_data.getThreadCount() - 1;  // Leave one thread for the main thread
+            printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2));
+            bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2));
+        }
 
-    // Add the population allele frequency file to the reader
-    if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
-    {
-        bcf_sr_destroy(snp_reader);
-        bcf_sr_destroy(pfb_reader);
-        printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
-        return;
-    }
+        // Add the population allele frequency file to the reader
+        if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
+        {
+            bcf_sr_destroy(snp_reader);
+            bcf_sr_destroy(pfb_reader);
+            printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
+            return;
+        }
+        printMemoryUsage("After adding population allele frequency file to reader, ");
 
-    // Get the header
-    bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
-    if (!pfb_header)
-    {
-        bcf_sr_destroy(snp_reader);
-        bcf_sr_destroy(pfb_reader);
-        printError("ERROR: Could not get header for population allele frequency reader.");
-        return;
+        // Get the header
+        bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
+        if (!pfb_header)
+        {
+            bcf_sr_destroy(snp_reader);
+            bcf_sr_destroy(pfb_reader);
+            printError("ERROR: Could not get header for population allele frequency reader.");
+            return;
+        }
     }
 
     // Split the region into samples
@@ -759,6 +681,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         // Read the SNP data ----------------------------------------------
 
         // Set the region
+        printMemoryUsage("Before setting region for SNP reader, ");
         std::string region_str = region_chunks[i];
         if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
         {
@@ -766,6 +689,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             printError("ERROR: Could not set region for SNP reader: " + region_str);
             return;
         }
+        printMemoryUsage("After setting region for SNP reader, and before reading SNPs, ");
 
         // std::cout << "Iterating through SNPs in region " << region_str <<
         // "..." << std::endl;
@@ -839,6 +763,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             }
         }
 
+        printMemoryUsage("After reading SNPs for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
+
         if (snp_reader->errnum)
         {
             printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum)));
@@ -850,70 +776,75 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             continue;
         }
 
-        // Read the population allele frequency data ----------------------
-
-        // Set the region as the SNP position
-        uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
-        std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
-        if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
-        {
-            bcf_sr_destroy(snp_reader);
-            bcf_sr_destroy(pfb_reader);
-            printError("ERROR: Could not set region for population allele frequency reader: " + region_str);
-            return;
-        }
+        printMemoryUsage("Before reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
 
-        // Find the SNP position in the population allele frequency file
-        while (bcf_sr_next_line(pfb_reader) > 0)
+        // Read the population allele frequency data ----------------------
+        if (use_pfb)
         {
-            if (!bcf_sr_has_line(pfb_reader, 0))
+            // Set the region as the SNP position
+            uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
+            std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
+            if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
             {
-                continue;
+                bcf_sr_destroy(snp_reader);
+                bcf_sr_destroy(pfb_reader);
+                printError("ERROR: Could not set region for population allele frequency reader: " + region_str);
+                return;
             }
-            // pfb_record = bcf_sr_get_line(pfb_reader, 0);
-            bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
-            // Do something with the record
-            if (pfb_record)
+
+            // Find the SNP position in the population allele frequency file
+            while (bcf_sr_next_line(pfb_reader) > 0)
             {
-                // Skip if not a SNP
-                if (!bcf_is_snp(pfb_record))
+                if (!bcf_sr_has_line(pfb_reader, 0))
                 {
                     continue;
                 }
-
-                // Get the population frequency for the SNP
-                float *pfb_f = NULL;
-                int count = 0;
-                int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-                if (pfb_status < 0 || count == 0)
+                // pfb_record = bcf_sr_get_line(pfb_reader, 0);
+                bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
+                // Do something with the record
+                if (pfb_record)
                 {
-                    continue;
-                }
-                double pfb = (double) pfb_f[0];
-                free(pfb_f);
+                    // Skip if not a SNP
+                    if (!bcf_is_snp(pfb_record))
+                    {
+                        continue;
+                    }
 
-                // Continue if the population frequency is outside the threshold
-                if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-                {
-                    continue;
-                }
+                    // Get the population frequency for the SNP
+                    float *pfb_f = NULL;
+                    int count = 0;
+                    int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+                    if (pfb_status < 0 || count == 0)
+                    {
+                        continue;
+                    }
+                    double pfb = (double) pfb_f[0];
+                    free(pfb_f);
+
+                    // Continue if the population frequency is outside the threshold
+                    if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+                    {
+                        continue;
+                    }
 
-                // Add the population frequency to the SNP data
-                snp_pfb[i] = pfb;
+                    // Add the population frequency to the SNP data
+                    snp_pfb[i] = pfb;
 
-                // Break after finding the SNP position
-                break;
+                    // Break after finding the SNP position
+                    break;
 
-                if (print_count < 20) {
-                    printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
-                    print_count++;
+                    if (print_count < 20) {
+                        printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
+                        print_count++;
+                    }
                 }
             }
+            if (pfb_reader->errnum)
+            {
+                printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
+            }
         }
-        if (pfb_reader->errnum)
-        {
-            printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
-        }
+        printMemoryUsage("After reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
     }
     bcf_sr_destroy(snp_reader);
     bcf_sr_destroy(pfb_reader);
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 5a9e7ffd..107a6dae 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -19,19 +19,10 @@ ContextSV::ContextSV(InputData& input_data)
 
 int ContextSV::run()
 {
-    ReferenceGenome ref_genome = this->input_data.getRefGenome();  // Load the reference genome
-    SVCaller sv_caller(this->input_data);  // Create an SV caller object
-    // SVCaller sv_caller(*this->input_data);  // Create an SV caller object
-    // SVData sv_calls = sv_caller.run();  // Run the SV caller
-    // std::unordered_map<std::string, std::set<SVCall>> sv_calls =
-    // sv_caller.run();  // Run the SV caller
-    sv_caller.run();  // Run the SV caller
-    // std::string output_dir = this->input_data->getOutputDir();  // Get the output directory
-    
-    // std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl;
-    // sv_caller.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
-    // sv_calls.saveToVCF(ref_genome, output_dir);  // Save the SV calls to a VCF file
-    std::cout << "SV calling complete." << std::endl;
+    printMemoryUsage("Before creating SV caller, ");
+    SVCaller sv_caller(this->input_data); 
+    printMemoryUsage("After creating SV caller, ");
+    sv_caller.run();
 
     return 0;
 }
diff --git a/src/main.cpp b/src/main.cpp
index da0d8d93..c622c34d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -1,7 +1,5 @@
 
 #include "swig_interface.h"
-#include "input_data.h"
-#include "version.h"
 
 /// @cond DOXYGEN_IGNORE
 #include <iostream>
@@ -9,8 +7,10 @@
 // #include <optional>
 /// @endcond
 
-// Placeholder for ContextSV library includes
-// #include "ContextSV.h"
+#include "input_data.h"
+#include "version.h"
+#include "utils.h"
+
 
 void runContextSV(const std::unordered_map<std::string, std::string>& args)
 {
@@ -22,10 +22,13 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     }
 
     // Set up input data
+    printMemoryUsage("Before setting up input data, ");
     InputData input_data;
     input_data.setLongReadBam(args.at("bam-file"));
     input_data.setShortReadBam(args.at("bam-file"));
+    printMemoryUsage("Before reading reference genome, ");
     input_data.setRefGenome(args.at("ref-file"));
+    printMemoryUsage("After reading reference genome, ");
     input_data.setSNPFilepath(args.at("snps-file"));
     input_data.setOutputDir(args.at("output-dir"));
     if (args.find("chr") != args.end()) {
@@ -58,6 +61,7 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("debug") != args.end()) {
         input_data.setVerbose(true);
     }
+    printMemoryUsage("After setting up input data, ");
 
     // Run ContextSV
     run(input_data);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 2e2295da..74a4f891 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -41,6 +41,8 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 
 void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector<uint32_t>& pos_depth_map)
 {
+    printMemoryUsage("Before detecting SVs from CIGAR strings, ");
+
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
@@ -121,6 +123,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
 
+    printMemoryUsage("After detecting SVs from CIGAR strings, ");
+
     // return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
 }
 
@@ -333,12 +337,14 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
 
 void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls)
 {
+    printMemoryUsage("Before opening BAM file, ");
     // Open the BAM file
     std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (!fp_in) {
         throw std::runtime_error("ERROR: failed to open " + bam_filepath);
     }
+    printMemoryUsage("After opening BAM file, ");
 
     // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
@@ -354,6 +360,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         sam_close(fp_in);
         throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
     }
+    printMemoryUsage("After loading index, ");
 
     // Split the chromosome into chunks for memory efficiency
     std::vector<std::string> region_chunks;
@@ -383,13 +390,17 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         }
         printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "...");
     }
+    printMemoryUsage("After splitting chromosome into chunks, ");
 
     // Load chromosome data for copy number predictions
     // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
     printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
-    std::pair<double, std::vector<uint32_t>> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len);
-    if (chr_data.first == 0.0 || chr_data.second.size() == 0) {
+    printMemoryUsage("Before calculating mean chromosome coverage (top), ");
+    std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
+    double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map);
+    printMemoryUsage("After calculating mean chromosome coverage (top), ");
+    if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
@@ -404,10 +415,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     for (const auto& sub_region : region_chunks) {
         current_region++;
         printMessage(chr + ": CIGAR SVs...");
+        printMemoryUsage("Before detecting CIGAR SVs, ");
         PrimaryMap primary_map;
         SuppMap supp_map;
         std::vector<SVCall> subregion_sv_calls;
-        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_data.second);
+        printMemoryUsage("After creating primary and supplementary maps, ");
+        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_pos_depth_map);
         // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
         // PrimaryMap& primary_map = std::get<1>(region_data);
         // SuppMap& supp_map = std::get<2>(region_data);
@@ -423,13 +436,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         if (region_sv_count > 0) {
             // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
             printMessage(chr + ": CIGAR predictions...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, chr_data.first, chr_data.second);
+            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map);
         }
 
         // Run split-read SV and copy number variant predictions
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
         printMessage(chr + ": Split read SVs...");
-        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second);
+        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
@@ -458,6 +471,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
 void SVCaller::run()
 {
+    printMemoryUsage("Before running SV caller, ");
     // Get the chromosomes to process
     std::vector<std::string> chromosomes;
     if (this->input_data.getChromosome() != "") {
@@ -465,11 +479,13 @@ void SVCaller::run()
     } else {
         chromosomes = this->input_data.getRefGenomeChromosomes();
     }
-        
+    
+    printMemoryUsage("After getting chromosomes, ");
     // Read the HMM from the file
     std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
+    printMemoryUsage("After reading HMM, ");
 
     // Use multi-threading across chromosomes unless a single chromosome is
     // specified
@@ -542,6 +558,8 @@ void SVCaller::run()
 // Detect SVs from split read alignments
 void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
+    printMemoryUsage("Before detecting SVs from split reads, ");
+
     // Find split-read SV evidence
     int sv_count = 0;
     uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
@@ -697,6 +715,8 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
             }
         }
     }
+
+    printMemoryUsage("After detecting SVs from split reads, ");
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls)
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 5330fd9f..aedb187c 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -90,7 +90,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
     }
 
     // Merge SV calls if they overlap
-    int initial_size = sv_calls.size();
+    // int initial_size = sv_calls.size();
     
     // Merge any SV calls that have >90% reciprocal overlap
     std::vector<SVCall> merged_sv_calls;
@@ -142,8 +142,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
     merged_sv_calls.push_back(current_merge);  // Add the last SV call
     sv_calls = merged_sv_calls;  // Update the SV calls
 
-    int updated_size = sv_calls.size();
-    std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
+    // int updated_size = sv_calls.size();
+    // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
 }
 
 void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
diff --git a/src/utils.cpp b/src/utils.cpp
index db083f97..bb82abbc 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -1,6 +1,8 @@
 #include "utils.h"
 
 /// @cond
+#include <sys/resource.h>  // getrusage
+#include <iomanip>
 #include <stdio.h>
 #include <string>
 #include <iostream>
@@ -109,3 +111,13 @@ std::string removeChrPrefix(std::string chr)
     }
     return chr;
 }
+
+void printMemoryUsage(const std::string& functionName) {
+    struct rusage usage;
+    getrusage(RUSAGE_SELF, &usage);
+
+    // Convert from KB to GB
+    double mem_usage_gb = (double)usage.ru_maxrss / 1024.0 / 1024.0;
+    std::cout << functionName << " memory usage: "
+              << std::fixed << std::setprecision(2) << mem_usage_gb << " GB" << std::endl;
+}

From 2d1337c3941472fd78069cd4dbcccd538f9b8545 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 14 Dec 2024 18:40:35 -0500
Subject: [PATCH 051/134] improve mem eff

---
 include/cnv_caller.h  |  26 +-
 include/fasta_query.h |   4 +-
 include/input_data.h  |  36 +--
 include/sv_caller.h   |  41 ++-
 src/cnv_caller.cpp    |  83 +----
 src/fasta_query.cpp   |   6 +-
 src/input_data.cpp    |  34 +-
 src/sv_caller.cpp     | 734 ++++++++++++++++++++++++++++++++----------
 8 files changed, 638 insertions(+), 326 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index f17e807e..12bbce80 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -45,7 +45,7 @@ struct SNPData {
 // CNVCaller: Detect CNVs and return the state sequence by SNP position
 class CNVCaller {
     private:
-        InputData& input_data;
+        const InputData& input_data;
         mutable std::mutex snp_file_mtx;  // SNP file mutex
         mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
         mutable std::mutex bam_file_mtx;  // BAM file mutex
@@ -73,40 +73,32 @@ class CNVCaller {
 
         void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
-        void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction);
+        void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
 
         // Query a region for SNPs and return the SNP data
-        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data);
-
-        void querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp);
+        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const;
 
         // Split a region into chunks for parallel processing
-        std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count);
+        std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
 
     public:
-        explicit CNVCaller(InputData& input_data);
+        explicit CNVCaller(const InputData& input_data);
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
 
-        // Calculate the mean chromosome coverage
         double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map);
 
-        // Calculate the log2 ratio for a region given the read depths and mean
-        // chromosome coverage
-        double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov);
-
-        void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2);
+        void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp);
-        // void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<double>& snp_pfb);
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp) const;
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
-        void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood);
+        void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
 };
 
 #endif // CNV_CALLER_H
diff --git a/include/fasta_query.h b/include/fasta_query.h
index ffa88d8a..b130117a 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -25,10 +25,10 @@ class ReferenceGenome {
         std::string getContigHeader() const;
 
         // Get the list of chromosomes, used for whole genome analysis
-        std::vector<std::string> getChromosomes();
+        std::vector<std::string> getChromosomes() const;
 
         // Get the length of a chromosome
-        uint32_t getChromosomeLength(std::string chr);
+        uint32_t getChromosomeLength(std::string chr) const;
 };
 
 #endif // FASTA_QUERY_H
diff --git a/include/input_data.h b/include/input_data.h
index 65051e77..43a9790b 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -23,39 +23,35 @@ class InputData {
     public:
         InputData();
 
-        std::string getShortReadBam();
+        std::string getShortReadBam() const;
 
         void setShortReadBam(std::string filepath);
 
-        std::string getLongReadBam();
+        std::string getLongReadBam() const;
 
         void setLongReadBam(std::string filepath);
 
         // Set the filepath to the HMM parameters.
         void setHMMFilepath(std::string filepath);
-        std::string getHMMFilepath();
+        std::string getHMMFilepath() const;
 
         // Set the filepath to the reference genome FASTA file.
 		void setRefGenome(std::string fasta_filepath);
 
         // Return a reference to the ReferenceGenome object.
         const ReferenceGenome& getRefGenome() const;
-
-        // Query the reference genome for a sequence.
         std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
 
         // Get the chromosomes in the reference genome.
-        std::vector<std::string> getRefGenomeChromosomes();
+        std::vector<std::string> getRefGenomeChromosomes() const;
 
         // Get a chromosome's length in the reference genome.
-        uint32_t getRefGenomeChromosomeLength(std::string chr);
+        uint32_t getRefGenomeChromosomeLength(std::string chr) const;
 
         // Set the filepath to the text file containing the locations of the
         // VCF files with population frequencies for each chromosome.
         void setAlleleFreqFilepaths(std::string filepath);
-
-        // Get the chromosome's VCF filepath with population frequencies.
-        std::string getAlleleFreqFilepath(std::string chr);
+        std::string getAlleleFreqFilepath(std::string chr) const;
 
         // Get the population frequency map.
         // PFBMap getPFBMap();
@@ -63,36 +59,36 @@ class InputData {
         // Set the filepath to the VCF file with SNP calls used for CNV
         // detection with the HMM.
         void setSNPFilepath(std::string filepath);
-        std::string getSNPFilepath();
+        std::string getSNPFilepath() const;
 
         // Set the ethnicity for SNP population frequencies.
         void setEthnicity(std::string ethnicity);
-        std::string getEthnicity();
+        std::string getEthnicity() const;
 
         // Set the sample size for HMM predictions.
         void setSampleSize(int sample_size);
-        int getSampleSize();
+        int getSampleSize() const;
 
         // Set the minimum CNV length to use for copy number predictions.
         void setMinCNVLength(int min_cnv_length);
-        int getMinCNVLength();
+        int getMinCNVLength() const;
 
         // Set the chromosome to analyze.
         void setChromosome(std::string chr);
-        std::string getChromosome();
+        std::string getChromosome() const;
 
         // Set the region to analyze.
         void setRegion(std::string region);
-        std::pair<int32_t, int32_t> getRegion();
-        bool isRegionSet();
+        std::pair<int32_t, int32_t> getRegion() const;
+        bool isRegionSet() const;
 
         // Set the output directory where the results will be written.
         void setOutputDir(std::string dirpath);
-        std::string getOutputDir();
+        std::string getOutputDir() const;
 
         // Set the number of threads to use when parallelization is possible.
         void setThreadCount(int thread_count);
-        int getThreadCount();
+        int getThreadCount() const;
 
         // Set the verbose flag to true if verbose output is desired.
         void setVerbose(bool verbose);
@@ -101,7 +97,7 @@ class InputData {
         // Set whether to extend the SNP CNV regions around the SV breakpoints
         // (+/- 1/2 SV length), save a TSV file, and generate HTML reports.
         void saveCNVData(bool save_cnv_data);
-        bool getSaveCNVData();
+        bool getSaveCNVData() const;
         
     private:
         std::string short_read_bam;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index c0f9ce23..abd397d1 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -16,47 +16,58 @@
 #include <future>
 /// @endcond
 
-// SV candidate alignment data (chr, start, end, sequence, query start, query
-// end, mismatch map, strand)
-using AlignmentData   = std::tuple<std::string, uint32_t, uint32_t, uint32_t, uint32_t, std::vector<int>, bool>;
-using AlignmentVector = std::vector<AlignmentData>;
+struct GenomicRegion {
+    int tid;
+    hts_pos_t start;
+    hts_pos_t end;
+    bool strand;
+};
 
-// Query map (query name, alignment vector)
-using PrimaryMap = std::unordered_map<std::string, AlignmentData>;
-using SuppMap = std::unordered_map<std::string, AlignmentVector>;
-// using RegionData = std::tuple<SVData, PrimaryMap, SuppMap>;
+struct MismatchData {
+    uint32_t query_start;
+    uint32_t query_end;
+    std::vector<int> match_map;
+};
 
 class SVCaller {
     private:
         int min_sv_size = 50;       // Minimum SV size to be considered
         int min_mapq = 20;          // Minimum mapping quality to be considered
-        InputData& input_data;
+        const InputData& input_data;
+
+        void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data) const;
+
+        void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const;
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector<uint32_t>& pos_depth_map);
+        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map);
 
         void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector<uint32_t>& pos_depth_map);
+        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map);
  
         // Read the next alignment from the BAM file in a thread-safe manner
-        int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
+        int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const;
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
+        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
-        double calculateMismatchRate(const std::vector<int>& mismatch_map, int32_t start, int32_t end);
+        double calculateMismatchRate(const MismatchData& mismatch_data) const;
+
+        std::pair<uint32_t, uint32_t> generateMatchMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, hts_itr_t *itr, std::vector<int>& match_map) const;
 
         void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls);
 
-        void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment);
+        void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const;
+
+        // void trimOverlappingAlignments(uint32_t& primary_start, uint32_t& primary_end, uint32_t& supp_start, uint32_t& supp_end, const std::vector<int>& primary_match_map, const std::vector<int>& supp_match_map);
 
         // Calculate the read depth (INFO/DP) for a region
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 2d2c5548..ac79d598 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -34,13 +34,13 @@
 
 using namespace sv_types;
 
-CNVCaller::CNVCaller(InputData &input_data)
+CNVCaller::CNVCaller(const InputData& input_data)
     : input_data(input_data)  // Initialize the input data
 {
 }
 
 // Function to call the Viterbi algorithm for the CHMM
-void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction)
+void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const
 {
     int data_count = (int) snp_data.pos.size();
     if (data_count == 0)
@@ -52,7 +52,7 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
 }
 
 // Function to obtain SNP information for a region
-void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data)
+void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const
 {
     // uint32_t window_size = (uint32_t)this->input_data.getWindowSize();
 
@@ -70,11 +70,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     std::vector<double> snp_pfb(sample_size, 0.5);
     std::vector<double> snp_log2_cov(sample_size, 0.0);
     std::vector<bool> is_snp(sample_size, false);
-    // std::unordered_map<uint32_t, double> snp_baf(sample_size, -1.0);
-    // std::unordered_map<uint32_t, double> snp_pfb(sample_size, 0.5);
-
-    // Query the SNPs for the entire region
-    this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp);
+    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
@@ -88,7 +84,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map) const
 {
     // Check that the start position is less than the end position
     if (start_pos >= end_pos)
@@ -168,7 +164,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     if ((double) max_count / (double) state_count > pct_threshold)
     {
         predicted_cnv_type = getSVTypeFromCNState(max_state);
-        genotype = cnv_genotype_map[max_state];
+        genotype = cnv_genotype_map.at(max_state);
     }
     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
 
@@ -316,7 +312,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
     }
 }
 
-std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count)
+std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const
 {
     // Split the region into chunks
     std::vector<std::string> region_chunks;
@@ -478,39 +474,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
     return mean_chr_cov;
 }
 
-double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov)
-{
-    // Use the position and depth map to calculate the log2 ratio
-    double cum_depth = 0;
-    int pos_count = 0;
-    for (uint32_t i = start_pos; i <= end_pos; i++)
-    {
-        if (i < pos_depth_map.size() && pos_depth_map[i] > 0)
-        {
-            cum_depth += pos_depth_map[i];
-            pos_count++;
-        }
-    }
-
-    // Calculate the window coverage log2 ratio (0 if no positions)
-    double window_mean_cov = 0;
-    if (pos_count > 0)
-    {
-        window_mean_cov = (double) cum_depth / (double) pos_count;
-    }
-
-    // Calculate the log2 ratio for the window
-    // Avoid log2(0) by using a small value
-    if (window_mean_cov == 0)
-    {
-        window_mean_cov = 0.0001;
-    }
-    double window_log2_ratio = log2(window_mean_cov / mean_chr_cov);
-
-    return window_log2_ratio;
-}
-
-void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& log2_region)
+void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& log2_region) const
 {
     uint32_t region_length = end_pos - start_pos + 1;
     for (int i = 0; i < sample_size; i++)
@@ -534,7 +498,7 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
     }
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp)
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp) const
 {
     printMemoryUsage("Reading SNP allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
     
@@ -850,7 +814,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     bcf_sr_destroy(pfb_reader);
 }
 
-void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood)
+void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const
 {
     // Open the TSV file for writing
     std::ofstream tsv_file(filepath);
@@ -941,30 +905,3 @@ void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, doubl
     snp_data.log2_cov.emplace_back(log2_cov);
     snp_data.is_snp.emplace_back(is_snp);
 }
-
-void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp)
-{
-    std::string snp_chr = chr;
-    chr = removeChrPrefix(chr);
-
-    // Query the SNP allele frequencies for the SNPs
-    // std::map<uint32_t, std::tuple<double, double>> snp_map;
-    this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf, snp_pfb, is_snp);
-
-    // Query the population frequencies for the SNPs
-    // std::unordered_map<uint32_t, double> pfb_map;
-    // this->readSNPPopulationFrequencies(chr, start, end, snp_pfb);
-
-    // Filter out the SNP population frequencies that are not in the SNP
-    // position set
-    // double pfb_default = 0.5;
-    // for (auto& pos : snp_pos)
-    // {
-    //     if (pfb_map.find(pos) != pfb_map.end())
-    //     {
-    //         snp_pfb[pos] = pfb_map[pos];
-    //     } else {
-    //         snp_pfb[pos] = pfb_default;
-    //     }
-    // }
-}
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index ee220d1c..212343f0 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -161,12 +161,12 @@ std::string ReferenceGenome::getContigHeader() const
     return contig_header;
 }
 
-std::vector<std::string> ReferenceGenome::getChromosomes()
+std::vector<std::string> ReferenceGenome::getChromosomes() const
 {
     return this->chromosomes;
 }
 
-uint32_t ReferenceGenome::getChromosomeLength(std::string chr)
+uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
 {
-    return this->chr_to_seq[chr].length();
+    return this->chr_to_seq.at(chr).length();
 }
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 4f9ae124..381d5ac5 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -32,7 +32,7 @@ InputData::InputData()
     this->save_cnv_data = false;
 }
 
-std::string InputData::getShortReadBam()
+std::string InputData::getShortReadBam() const
 {
     return this->short_read_bam;
 }
@@ -58,7 +58,7 @@ void InputData::setShortReadBam(std::string filepath)
     }
 }
 
-std::string InputData::getLongReadBam()
+std::string InputData::getLongReadBam() const
 {
     return this->long_read_bam;
 }
@@ -100,17 +100,17 @@ std::string InputData::queryRefGenome(const std::string& chr, uint32_t pos_start
     return this->fasta_query.query(chr, pos_start, pos_end);
 }
 
-std::vector<std::string> InputData::getRefGenomeChromosomes()
+std::vector<std::string> InputData::getRefGenomeChromosomes() const
 {
     return this->fasta_query.getChromosomes();
 }
 
-uint32_t InputData::getRefGenomeChromosomeLength(std::string chr)
+uint32_t InputData::getRefGenomeChromosomeLength(std::string chr) const
 {
     return this->fasta_query.getChromosomeLength(chr);
 }
 
-std::string InputData::getOutputDir()
+std::string InputData::getOutputDir() const
 {
     return this->output_dir;
 }
@@ -124,7 +124,7 @@ void InputData::setOutputDir(std::string dirpath)
     system(cmd.c_str());
 }
 
-int InputData::getSampleSize()
+int InputData::getSampleSize() const
 {
     return this->sample_size;
 }
@@ -134,7 +134,7 @@ void InputData::setSampleSize(int sample_size)
     this->sample_size = sample_size;
 }
 
-std::string InputData::getSNPFilepath()
+std::string InputData::getSNPFilepath() const
 {
     return this->snp_vcf_filepath;
 }
@@ -144,7 +144,7 @@ void InputData::setSNPFilepath(std::string filepath)
     this->snp_vcf_filepath = filepath;
 }
 
-std::string InputData::getEthnicity()
+std::string InputData::getEthnicity() const
 {
     return this->ethnicity;
 }
@@ -154,7 +154,7 @@ void InputData::setEthnicity(std::string ethnicity)
     this->ethnicity = ethnicity;
 }
 
-int InputData::getMinCNVLength()
+int InputData::getMinCNVLength() const
 {
     return this->min_cnv_length;
 }
@@ -169,7 +169,7 @@ void InputData::setChromosome(std::string chr)
     this->chr = chr;
 }
 
-std::string InputData::getChromosome()
+std::string InputData::getChromosome() const
 {
     return this->chr;
 }
@@ -205,12 +205,12 @@ void InputData::setRegion(std::string region)
     }
 }
 
-std::pair<int32_t, int32_t> InputData::getRegion()
+std::pair<int32_t, int32_t> InputData::getRegion() const
 {
     return this->start_end;
 }
 
-bool InputData::isRegionSet()
+bool InputData::isRegionSet() const
 {
     return this->region_set;
 }
@@ -299,14 +299,14 @@ void InputData::setAlleleFreqFilepaths(std::string filepath)
     }
 }
 
-std::string InputData::getAlleleFreqFilepath(std::string chr)
+std::string InputData::getAlleleFreqFilepath(std::string chr) const
 {
     // Remove the chr notation
     if (chr.find("chr") != std::string::npos)
     {
         chr = chr.substr(3, chr.size() - 3);
     }
-    return this->pfb_filepaths[chr];
+    return this->pfb_filepaths.at(chr);
 }
 
 void InputData::setThreadCount(int thread_count)
@@ -314,12 +314,12 @@ void InputData::setThreadCount(int thread_count)
     this->thread_count = thread_count;
 }
 
-int InputData::getThreadCount()
+int InputData::getThreadCount() const
 {
     return this->thread_count;
 }
 
-std::string InputData::getHMMFilepath()
+std::string InputData::getHMMFilepath() const
 {
     return this->hmm_filepath;
 }
@@ -361,7 +361,7 @@ void InputData::saveCNVData(bool save_cnv_data)
     this->save_cnv_data = save_cnv_data;
 }
 
-bool InputData::getSaveCNVData()
+bool InputData::getSaveCNVData() const
 {
     return this->save_cnv_data;
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 74a4f891..db4da271 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -33,14 +33,16 @@ SVCaller::SVCaller(InputData &input_data)
 {
 }
 
-int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
+int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const
 {
     int ret = sam_itr_next(fp_in, itr, bam1);
     return ret;
 }
 
-void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector<uint32_t>& pos_depth_map)
+void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const
 {
+    // std::map<std::string, hts_itr_t*> primary_map;
+    // std::map<std::string, std::vector<hts_itr_t*>> supplementary_map;
     printMemoryUsage("Before detecting SVs from CIGAR strings, ");
 
     // Create a read and iterator for the region
@@ -60,8 +62,11 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
         throw std::runtime_error("ERROR: failed to query region " + region);
     }
 
+    uint32_t primary_count = 0;
+    uint32_t supplementary_count = 0;
+
     // Main loop to process the alignments
-    int num_alignments = 0;
+    uint32_t num_alignments = 0;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
@@ -72,50 +77,20 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
 
         // Process primary alignments
         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-
-            // Get the primary alignment information
-            std::string chr = bamHdr->target_name[bam1->core.tid];
-            uint32_t start = (uint32_t)bam1->core.pos;
-            uint32_t end = (uint32_t)bam_endpos(bam1);  // This is the first position after the alignment
-            end--;  // Adjust to the last position of the alignment
-            bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
-
-            // Call SVs directly from the CIGAR string
-            std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
-            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, pos_depth_map);
-            // std::tuple<std::vector<int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
-            const std::vector<int>& match_map = std::get<0>(query_info);
-            uint32_t query_start = std::get<1>(query_info);
-            uint32_t query_end = std::get<2>(query_info);
-
-            // Add the primary alignment to the map
-            AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
-            primary_alignments[qname] = alignment;
+            // primary_map[qname] = itr;
+            // Store chromosome (TID), start, and end positions (1-based) of the
+            // primary alignment, and the strand
+            primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)};
+            primary_count++;
 
         // Process supplementary alignments
         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-
-            // Get the supplementary alignment information
-            std::string chr = bamHdr->target_name[bam1->core.tid];
-            uint32_t start = bam1->core.pos;
-            uint32_t end = bam_endpos(bam1);  // This is the first position after the alignment
-            end--;  // Adjust to the last position of the alignment
-            bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
-
-            // Get CIGAR string information, but don't call SVs
-            // std::tuple<std::vector<int>, int32_t, int32_t> query_info =
-            // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
-            std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
-            this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, pos_depth_map);
-            const std::vector<int>& match_map = std::get<0>(query_info);
-            uint32_t query_start = std::get<1>(query_info);
-            uint32_t query_end = std::get<2>(query_info);
-
-            // Add the supplementary alignment to the map
-            AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
-            supplementary_alignments[qname].emplace_back(alignment);
+            // supp_map[qname].push_back(itr);
+            // Store chromosome (TID), start, and end positions (1-based) of the
+            // supplementary alignment, and the strand
+            supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)});
+            supplementary_count++;
         }
-
         num_alignments++;
     }
 
@@ -124,12 +99,215 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
     bam_destroy1(bam1);
 
     printMemoryUsage("After detecting SVs from CIGAR strings, ");
+    printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
+}
+
+void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data) const
+{
+    // Create a read and iterator for the region
+    bam1_t *bam1 = bam_init1();
+    if (!bam1) {
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to initialize BAM record");
+    }
+    hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end);
+    if (!itr) {
+        bam_destroy1(bam1);
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end));
+    }
+
+    // Read the alignment
+    if (readNextAlignment(fp_in, itr, bam1) < 0) {
+        bam_destroy1(bam1);
+        hts_itr_destroy(itr);
+        printError("ERROR: failed to read alignment");
+        return;
+    }
+
+    // Main loop to process the alignments
+    std::vector<int> match_map;
+    uint32_t query_start = 0;
+    uint32_t query_end = 0;
+    uint32_t query_pos = 0;
+    bool first_op = true;
+
+    // Process mismatches in the CIGAR string
+    const std::string chr = bamHdr->target_name[bam1->core.tid];
+    hts_pos_t pos = bam1->core.pos;  // 0-based position
+    uint32_t* cigar = bam_get_cigar(bam1);  // CIGAR array
+    int cigar_len = bam1->core.n_cigar;
+    for (int i = 0; i < cigar_len; i++) {
+        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
+        int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
+        
+        // Update match/mismatch query map
+        int MATCH = 1;
+        int MISMATCH = -1;
+        if (op == BAM_CEQUAL) {
+            for (int j = 0; j < op_len; j++) {
+                match_map[query_pos + j] = MATCH;
+            }
+        } else if (op == BAM_CDIFF) {
+            for (int j = 0; j < op_len; j++) {
+                match_map[query_pos + j] = MISMATCH;
+            }
+        } else if (op == BAM_CMATCH) {
+            // Get the read sequence
+            uint8_t* seq_ptr = bam_get_seq(bam1);
+            std::string cmatch_seq_str = "";
+            for (int j = 0; j < op_len; j++) {
+                cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
+            }
 
-    // return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments);
+            // Get the corresponding reference sequence
+            int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
+            // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
+            std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
+
+            // Check that the two sequence lengths are equal
+            if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
+                throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
+            }
+
+            // Compare the two sequences and update the mismatch map
+            for (int j = 0; j < op_len; j++) {
+                if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
+                    match_map[query_pos + j] = MISMATCH;
+                } else {
+                    match_map[query_pos + j] = MATCH;
+                }
+            }
+        } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) {
+            query_start = query_pos + op_len;
+            first_op = false;
+        }
+        
+        // Update the reference position
+        // https://samtools.github.io/hts-specs/SAMv1.pdf
+        if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            pos += op_len;
+
+        // Update the query position
+        } else if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            query_pos += op_len;
+        }
+    }
+    query_end = query_pos;
+    
+    // Clean up the iterator and alignment
+    hts_itr_destroy(itr);
+    bam_destroy1(bam1);
+
+    // Update the mismatch data
+    mismatch_data.query_start = query_start;
+    mismatch_data.query_end = query_end;
+    mismatch_data.match_map = std::move(match_map);
 }
 
-double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map, int32_t start, int32_t end)
+void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map)
 {
+    printMemoryUsage("Before detecting SVs from CIGAR strings, ");
+
+    // Create a read and iterator for the region
+    bam1_t *bam1 = bam_init1();
+    if (!bam1) {
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to initialize BAM record");
+    }
+    hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
+    if (!itr) {
+        bam_destroy1(bam1);
+        hts_idx_destroy(idx);
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        throw std::runtime_error("ERROR: failed to query region " + region);
+    }
+
+    // Main loop to process the alignments
+    while (readNextAlignment(fp_in, itr, bam1) >= 0) {
+
+        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
+        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
+            continue;
+        }
+        // const std::string qname = bam_get_qname(bam1);  // Query template name
+
+        // Process the alignment
+        bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
+        this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map);
+        // if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+
+        //     // Get the primary alignment information
+        //     // std::string chr = bamHdr->target_name[bam1->core.tid];
+        //     // uint32_t start = (uint32_t)bam1->core.pos;
+        //     // uint32_t end = (uint32_t)bam_endpos(bam1);  // This is the first position after the alignment
+        //     // end--;  // Adjust to the last position of the alignment
+        //     // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
+
+        //     // Call SVs directly from the CIGAR string
+        //     // std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
+        //     this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true, pos_depth_map);
+        //     // std::tuple<std::vector<int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
+        //     // const std::vector<int>& match_map = std::get<0>(query_info);
+        //     // uint32_t query_start = std::get<1>(query_info);
+        //     // uint32_t query_end = std::get<2>(query_info);
+
+        //     // Add the primary alignment to the map
+        //     // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
+        //     // primary_alignments[qname] = alignment;
+
+        //     // Add the iterator to the primary map
+        //     // primary_map[qname] = itr;
+
+        // // Process supplementary alignments
+        // } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
+
+        //     // Get the supplementary alignment information
+        //     // std::string chr = bamHdr->target_name[bam1->core.tid];
+        //     // uint32_t start = bam1->core.pos;
+        //     // uint32_t end = bam_endpos(bam1);  // This is the first position after the alignment
+        //     // end--;  // Adjust to the last position of the alignment
+        //     // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
+
+        //     // Get CIGAR string information, but don't call SVs
+        //     // std::tuple<std::vector<int>, int32_t, int32_t> query_info =
+        //     // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
+        //     // std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
+        //     this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false, pos_depth_map);
+        //     // const std::vector<int>& match_map = std::get<0>(query_info);
+        //     // uint32_t query_start = std::get<1>(query_info);
+        //     // uint32_t query_end = std::get<2>(query_info);
+
+        //     // Add the supplementary alignment to the map
+        //     // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
+        //     // supplementary_alignments[qname].emplace_back(alignment);
+
+        //     // Add the iterator to the supplementary map
+        //     // supplementary_map[qname].push_back(itr);
+        // }
+    }
+
+    // Clean up the iterator and alignment
+    hts_itr_destroy(itr);
+    bam_destroy1(bam1);
+
+    printMemoryUsage("After detecting SVs from CIGAR strings, ");
+}
+
+// double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map,
+// int32_t start, int32_t end)
+double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const
+{
+    int start = mismatch_data.query_start;
+    int end = mismatch_data.query_end;
+    const std::vector<int>& mismatch_map = mismatch_data.match_map;
     start = std::max(start, 0);
     end = std::min(end, (int32_t)mismatch_map.size() - 1);
     int match_count = 0;
@@ -153,8 +331,117 @@ double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map, int
 
     return mismatch_rate;
 }
+// {
+//     start = std::max(start, 0);
+//     end = std::min(end, (int32_t)mismatch_map.size() - 1);
+//     int match_count = 0;
+//     int mismatch_count = 0;
+//     int MATCH = 1;
+//     int MISMATCH = -1;
+//     for (int i = start; i <= end; i++) {
+//         if (mismatch_map[i] == MATCH) {
+//             match_count++;
+//         } else if (mismatch_map[i] == MISMATCH) {
+//             mismatch_count++;
+//         }
+//     }
+
+//     // Avoid division by zero
+//     if (match_count + mismatch_count == 0) {
+//         return 0.0;
+//     }
+
+//     double mismatch_rate = static_cast<double>(mismatch_count) / static_cast<double>(match_count + mismatch_count);
+
+//     return mismatch_rate;
+// }
+
+std::pair<uint32_t, uint32_t> SVCaller::generateMatchMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_itr_t* itr, std::vector<int>& match_map) const
+{
+    // Create a read and iterator for the region
+    bam1_t *bam1 = bam_init1();
+    if (!bam1) {
+        printError("ERROR: failed to initialize BAM record");
+        return std::make_pair(0, 0);
+    }
+
+    // Read the alignment
+    if (readNextAlignment(fp_in, itr, bam1) < 0) {
+        bam_destroy1(bam1);
+        printError("ERROR: failed to read alignment");
+        return std::make_pair(0, 0);
+    }
+
+    // Main loop to process the alignments
+    std::string chr = bamHdr->target_name[bam1->core.tid];  // Chromosome name
+    uint32_t pos = (uint32_t)bam1->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
+    uint32_t query_pos = 0;
+    uint32_t query_start = 0;
+    uint32_t query_end = 0;
+    bool first_op = true;
+
+    // Get the CIGAR string
+    uint32_t* cigar = bam_get_cigar(bam1);  // CIGAR array
+    int cigar_len = bam1->core.n_cigar;
+    for (int i = 0; i < cigar_len; i++) {
+        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
+        int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
+        
+        // Update match/mismatch query map
+        int MATCH = 1;
+        int MISMATCH = -1;
+        if (op == BAM_CEQUAL) {
+            for (int j = 0; j < op_len; j++) {
+                match_map[query_pos + j] = MATCH;
+            }
+        } else if (op == BAM_CDIFF) {
+            for (int j = 0; j < op_len; j++) {
+                match_map[query_pos + j] = MISMATCH;
+            }
+        } else if (op == BAM_CMATCH) {
+            // Get the read sequence
+            uint8_t* seq_ptr = bam_get_seq(bam1);
+            std::string cmatch_seq_str = "";
+            for (int j = 0; j < op_len; j++) {
+                cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
+            }
+
+            // Get the corresponding reference sequence
+            int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
+            // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
+            std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
+
+            // Check that the two sequence lengths are equal
+            if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
+                throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
+            }
+
+            // Compare the two sequences and update the mismatch map
+            for (int j = 0; j < op_len; j++) {
+                if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
+                    match_map[query_pos + j] = MISMATCH;
+                } else {
+                    match_map[query_pos + j] = MATCH;
+                }
+            }
+        } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) {
+            query_start = query_pos + op_len;
+            first_op = false;
+        }
+        
+        // Update the query position
+        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            query_pos += op_len;
+        }
+    }
+    query_end = query_pos;
+    
+    bam_destroy1(bam1);  // Clean up the alignment
+
+    return std::make_pair(query_start, query_end);
+}
 
-void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, std::tuple<std::vector<int>, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector<uint32_t>& pos_depth_map)
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
@@ -163,7 +450,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     uint32_t query_pos = 0;
     // std::unordered_map<int, int> query_match_map;  // Query position to
     // match/mismatch (1/0) map
-    std::vector<int> query_match_map(alignment->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
+    // std::vector<int> query_match_map(alignment->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
 
     // Loop through the CIGAR string, process operations, detect SVs (primary
     // only), update clipped base support, calculate sequence identity for
@@ -171,9 +458,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     // the clipped base support and mismatch rate
     uint32_t ref_pos;
     uint32_t ref_end;
-    uint32_t query_start = 0;  // First alignment position in the query
-    uint32_t query_end = 0;    // Last alignment position in the query
-    bool first_op = false;  // First alignment operation for the query
+    // uint32_t query_start = 0;  // First alignment position in the query
+    // uint32_t query_end = 0;    // Last alignment position in the query
+    // bool first_op = false;  // First alignment operation for the query
     double default_lh = 0.0;
     for (int i = 0; i < cigar_len; i++) {
 
@@ -265,50 +552,50 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
             // sv_calls.updateClippedBaseSupport(chr, pos);  // Update clipped base support
 
             // Update the query alignment start position
-            if (!first_op) {
-                query_start = query_pos + op_len;
-                first_op = true;
-            }
+            // if (!first_op) {
+            //     query_start = query_pos + op_len;
+            //     first_op = true;
+            // }
         }
 
-        // Update match/mismatch query map
-        int MATCH = 1;
-        int MISMATCH = -1;
-        if (op == BAM_CEQUAL) {
-            for (int j = 0; j < op_len; j++) {
-                query_match_map[query_pos + j] = MATCH;
-            }
-        } else if (op == BAM_CDIFF) {
-            for (int j = 0; j < op_len; j++) {
-                query_match_map[query_pos + j] = MISMATCH;
-            }
-        } else if (op == BAM_CMATCH) {
-            // Get the read sequence
-            uint8_t* seq_ptr = bam_get_seq(alignment);
-            std::string cmatch_seq_str = "";
-            for (int j = 0; j < op_len; j++) {
-                cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
-            }
-
-            // Get the corresponding reference sequence
-            int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-            // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
-            std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
-
-            // Check that the two sequence lengths are equal
-            if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
-                throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
-            }
-
-            // Compare the two sequences and update the mismatch map
-            for (int j = 0; j < op_len; j++) {
-                if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
-                    query_match_map[query_pos + j] = MISMATCH;
-                } else {
-                    query_match_map[query_pos + j] = MATCH;
-                }
-            }
-        }
+        // // Update match/mismatch query map
+        // int MATCH = 1;
+        // int MISMATCH = -1;
+        // if (op == BAM_CEQUAL) {
+        //     for (int j = 0; j < op_len; j++) {
+        //         query_match_map[query_pos + j] = MATCH;
+        //     }
+        // } else if (op == BAM_CDIFF) {
+        //     for (int j = 0; j < op_len; j++) {
+        //         query_match_map[query_pos + j] = MISMATCH;
+        //     }
+        // } else if (op == BAM_CMATCH) {
+        //     // Get the read sequence
+        //     uint8_t* seq_ptr = bam_get_seq(alignment);
+        //     std::string cmatch_seq_str = "";
+        //     for (int j = 0; j < op_len; j++) {
+        //         cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
+        //     }
+
+        //     // Get the corresponding reference sequence
+        //     int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
+        //     // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
+        //     std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
+
+        //     // Check that the two sequence lengths are equal
+        //     if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
+        //         throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
+        //     }
+
+        //     // Compare the two sequences and update the mismatch map
+        //     for (int j = 0; j < op_len; j++) {
+        //         if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
+        //             query_match_map[query_pos + j] = MISMATCH;
+        //         } else {
+        //             query_match_map[query_pos + j] = MATCH;
+        //         }
+        //     }
+        // }
 
         // Update the reference coordinate based on the CIGAR operation
         // https://samtools.github.io/hts-specs/SAMv1.pdf
@@ -330,9 +617,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
         }
     }
 
-    query_end = query_pos;  // Last alignment position in the query
+    // query_end = query_pos;  // Last alignment position in the query
 
-    query_info = std::tuple<std::vector<int>, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end);
+    // query_info = std::tuple<std::vector<int>, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end);
 }
 
 void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls)
@@ -414,13 +701,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     int filter_threshold = 4;
     for (const auto& sub_region : region_chunks) {
         current_region++;
+
+        // Detect SVs from the CIGAR strings
         printMessage(chr + ": CIGAR SVs...");
-        printMemoryUsage("Before detecting CIGAR SVs, ");
-        PrimaryMap primary_map;
-        SuppMap supp_map;
-        std::vector<SVCall> subregion_sv_calls;
-        printMemoryUsage("After creating primary and supplementary maps, ");
-        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_pos_depth_map);
+        std::vector<SVCall> subregion_sv_calls;        
+        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, chr_pos_depth_map);
+
         // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
         // PrimaryMap& primary_map = std::get<1>(region_data);
         // SuppMap& supp_map = std::get<2>(region_data);
@@ -442,7 +728,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         // Run split-read SV and copy number variant predictions
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
         printMessage(chr + ": Split read SVs...");
-        this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
+        this->detectSVsFromSplitReads(sub_region, fp_in, idx, bamHdr, subregion_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
+        // this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
@@ -556,45 +843,105 @@ void SVCaller::run()
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
+void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
     printMemoryUsage("Before detecting SVs from split reads, ");
 
+
+    printMessage("Getting split alignments...");
+    // std::map<std::string, hts_itr_t*> primary_map;
+    // std::map<std::string, std::vector<hts_itr_t*>> supp_map;
+    std::unordered_map<std::string, GenomicRegion> primary_map;
+    std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+    this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
+
+    printMessage("[TEST] Primary map size: " + std::to_string(primary_map.size()));
+    printMessage("[TEST] Supplementary map size: " + std::to_string(supp_map.size()));
+
     // Find split-read SV evidence
     int sv_count = 0;
     uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
-    for (const auto& entry : primary_map) {
-        std::string qname = entry.first;
-        AlignmentData primary_alignment = entry.second;
-        std::string primary_chr = std::get<0>(primary_alignment);
-        uint32_t primary_start = std::get<1>(primary_alignment);
-        uint32_t primary_end = std::get<2>(primary_alignment);
+    for (auto& entry : primary_map) {
+        // std::string qname = entry.first;
+        const std::string& qname = entry.first;
+        GenomicRegion& primary_region = entry.second;
+        // AlignmentData primary_alignment = entry.second;
+        // std::string primary_chr = std::get<0>(primary_alignment);
+        // uint32_t primary_start = std::get<1>(primary_alignment);
+        // uint32_t primary_end = std::get<2>(primary_alignment);
+
+        // Get the primary alignment information
+        // std::string primary_chr = bamHdr->target_name[primary_bam1->core.tid];
+        // uint32_t primary_start = (uint32_t) primary_bam1->core.pos;
+        // uint32_t primary_end = (uint32_t) bam_endpos(primary_bam1) - 1;  // Last alignment position
+        // bool primary_fwd_strand = !(primary_bam1->core.flag & BAM_FREVERSE);
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
             continue;
         }
 
+        // Get the read match/mismatch map
+        MismatchData primary_mismatches;
+        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches);
+        // std::vector<int> match_map(primary_region.end - primary_region.start
+        // + 1, 0);
+        // this->getMatchMismatchMap(fp_in, idx, bamHdr, primary_region, mismatch_data);
+
+        // std::pair<uint32_t, uint32_t> query_info = generateMatchMismatchMap(fp_in, idx, bamHdr, primary_itr, match_map);
+
         // Find the largest supplementary alignment, and also identify
         // inversions
         // printMessage("Finding largest supplementary alignment...");
-        AlignmentData largest_supp_alignment = supp_map[qname][0];
+        // AlignmentData largest_supp_region = supp_map[qname][0];
+        // hts_itr_t* largest_supp_itr = supp_map[qname][0];
+        GenomicRegion largest_supp_region = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
+        const std::string& primary_chr = bamHdr->target_name[primary_region.tid];
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            const auto& supp_chr = std::get<0>(*it);
-            if (primary_chr != supp_chr) {
-                continue;  // Skip supplementary alignments on different chromosomes
+            GenomicRegion& supp_region = *it;
+            // Get the supplementary alignment information
+            // bam1_t* supp_bam1 = bam_init1();
+            // if (!supp_bam1) {
+            //     throw std::runtime_error("ERROR: failed to initialize BAM record");
+            // }
+            // if (sam_itr_next(fp_in, *it, supp_bam1) < 0) {
+            //     bam_destroy1(supp_bam1);
+            //     throw std::runtime_error("ERROR: failed to read alignment");
+            // }
+
+            // Skip if not on the primary chromosome
+            if (primary_region.tid != supp_region.tid) {
+                continue;
             }
-            uint32_t supp_start = std::get<1>(*it);
-            uint32_t supp_end = std::get<2>(*it);
+
+            // std::string supp_chr = bamHdr->target_name[supp_bam1->core.tid];
+            // uint32_t supp_start = (uint32_t) supp_bam1->core.pos;
+            // uint32_t supp_end = (uint32_t) bam_endpos(supp_bam1) - 1;  // Last alignment position
+            // uint32_t supp_length = supp_end - supp_start + 1;
+
+            // const auto& supp_chr = std::get<0>(*it);
+            // if (primary_chr != supp_chr) {
+            //     continue;  // Skip supplementary alignments on different chromosomes
+            // }
+            // uint32_t supp_start = std::get<1>(*it);
+            // uint32_t supp_end = std::get<2>(*it);
+            // uint32_t supp_length = supp_end - supp_start + 1;
+
+            // Get the supplementary alignment information
+            uint32_t supp_start = (uint32_t) supp_region.start;
+            uint32_t supp_end = (uint32_t) supp_region.end;
             uint32_t supp_length = supp_end - supp_start + 1;
             if (supp_length > largest_supp_length) {
                 largest_supp_length = supp_length;
-                largest_supp_alignment = *it;
+                largest_supp_region = *it;
             }
 
             // Inversion detection
-            bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it);
+            // bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it);
+            // bool supp_fwd_strand = !(supp_bam1->core.flag & BAM_FREVERSE);
+            // bool is_opposite_strand = primary_fwd_strand != supp_fwd_strand;
+            bool is_opposite_strand = primary_region.strand != supp_region.strand;
             if (is_opposite_strand) {
                 if (supp_length >= min_cnv_length) {
 
@@ -630,31 +977,42 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
         }
 
         // Trim overlapping alignments
-        uint32_t supp_start = std::get<1>(largest_supp_alignment);
-        uint32_t supp_end = std::get<2>(largest_supp_alignment);
-        bool primary_before_supp = primary_start < supp_start;
-        trimOverlappingAlignments(primary_alignment, largest_supp_alignment);
+        MismatchData supp_mismatches;
+        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches);
+        // uint32_t supp_start = std::get<1>(largest_supp_region);
+        // uint32_t supp_end = std::get<2>(largest_supp_region);
+        // bool primary_before_supp = primary_start < supp_start;
+        // trimOverlappingAlignments(primary_alignment, largest_supp_region);
+        trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
 
         // Create the SV candidate using both alignments
-        supp_start = std::get<1>(largest_supp_alignment);
-        supp_end = std::get<2>(largest_supp_alignment);
-        primary_start = std::get<1>(primary_alignment);
-        primary_end = std::get<2>(primary_alignment);
+        // supp_start = std::get<1>(largest_supp_region);
+        // supp_end = std::get<2>(largest_supp_region);
+        // primary_start = std::get<1>(primary_alignment);
+        // primary_end = std::get<2>(primary_alignment);
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
-        if (primary_before_supp) {
-            boundary_left = primary_start+1;
-            // boundary_right = supp_end+1;
-            boundary_right = std::max(primary_end, supp_end)+1;
-            gap_left = primary_end+1;
-            gap_right = supp_start+1;
+        if (primary_region.start < largest_supp_region.start) {  // Primary before supp
+            // boundary_left = primary_start+1;
+            // boundary_right = std::max(primary_end, supp_end)+1;
+            // gap_left = primary_end+1;
+            // gap_right = supp_start+1;
+            // gap_exists = gap_left < gap_right;
+            boundary_left = primary_region.start + 1;
+            boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1;
+            gap_left = primary_region.end + 1;
+            gap_right = largest_supp_region.start + 1;
             gap_exists = gap_left < gap_right;
         } else {
-            boundary_left = supp_start+1;
-            // boundary_right = primary_end+1;
-            boundary_right = std::max(primary_end, supp_end)+1;
-            gap_left = supp_end+1;
-            gap_right = primary_start+1;
+            // boundary_left = supp_start+1;
+            // boundary_right = std::max(primary_end, supp_end)+1;
+            // gap_left = supp_end+1;
+            // gap_right = primary_start+1;
+            // gap_exists = gap_left < gap_right;
+            boundary_left = largest_supp_region.start + 1;
+            boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1;
+            gap_left = largest_supp_region.end + 1;
+            gap_right = primary_region.start + 1;
             gap_exists = gap_left < gap_right;
         }
         
@@ -668,7 +1026,9 @@ void SVCaller::detectSVsFromSplitReads(std::vector<SVCall>& sv_calls, PrimaryMap
                 continue;
             }
 
-            // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
+            // printMessage("Running copy number prediction on boundary: " +
+            // primary_chr + ":" + std::to_string(boundary_left) + "-" +
+            // std::to_string(boundary_right));
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -888,85 +1248,101 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
 }
 
-void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment)
+void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const
 {
     // Get the start and end read positions for the primary and supplementary
     // alignments
-    uint32_t primary_alignment_start = std::get<1>(primary_alignment);
-    uint32_t primary_alignment_end = std::get<2>(primary_alignment);
-    uint32_t supp_alignment_start = std::get<1>(supp_alignment);
-    uint32_t supp_alignment_end = std::get<2>(supp_alignment);
-    uint32_t primary_query_start = std::get<3>(primary_alignment);
-    uint32_t primary_query_end = std::get<4>(primary_alignment);
-    uint32_t supp_query_start = std::get<3>(supp_alignment);
-    uint32_t supp_query_end = std::get<4>(supp_alignment);
-    const std::vector<int>& primary_match_map = std::get<5>(primary_alignment);
-    const std::vector<int>& supp_match_map = std::get<5>(supp_alignment);
+    // uint32_t primary_alignment_start = std::get<1>(primary_alignment);
+    // uint32_t primary_alignment_end = std::get<2>(primary_alignment);
+    // uint32_t supp_alignment_start = std::get<1>(supp_alignment);
+    // uint32_t supp_alignment_end = std::get<2>(supp_alignment);
+    // uint32_t primary_query_start = std::get<3>(primary_alignment);
+    // uint32_t primary_query_end = std::get<4>(primary_alignment);
+    // uint32_t supp_query_start = std::get<3>(supp_alignment);
+    // uint32_t supp_query_end = std::get<4>(supp_alignment);
+    // const std::vector<int>& primary_match_map = std::get<5>(primary_alignment);
+    // const std::vector<int>& supp_match_map = std::get<5>(supp_alignment);
 
     // Check for overlapping read alignments
-    bool primary_before_supp = primary_query_start < supp_query_start;
-    if (primary_before_supp) {
+    // bool primary_before_supp = primary_query_start < supp_query_start;
+    if (primary_mismatches.query_start < supp_mismatches.query_start) {
         // Primary before supplementary in the query
-        if (primary_query_end >= supp_query_start) {
+
+        // if (primary_query_end >= supp_query_start) {
+        if (primary_mismatches.query_end >= supp_mismatches.query_start) {
             // Calculate the mismatch rates at the overlapping region
-            double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, supp_query_start, primary_query_end);
-            double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, supp_query_start, primary_query_end);
-            uint32_t overlap_length = primary_query_end - supp_query_start + 1;
+            double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches);
+            double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches);
+            // uint32_t overlap_length = primary_query_end - supp_query_start +
+            // 1;
+            hts_pos_t overlap_length = primary_mismatches.query_end - supp_mismatches.query_start + 1;
 
             // Trim the ailgnment with the higher mismatch rate
             if (primary_mismatch_rate > supp_mismatch_rate) {
                 // Trim the end of the primary alignment, ensuring that the new
                 // end is not less than the start
-                if (primary_alignment_end > overlap_length && (primary_alignment_end - overlap_length) > primary_alignment_start) {
+                // if (primary_alignment_end > overlap_length &&
+                // (primary_alignment_end - overlap_length) >
+                // primary_alignment_start) {
+                if (primary_alignment.end > overlap_length && (primary_alignment.end - overlap_length) > primary_alignment.start) {
                     // Trim the end of the primary alignment
-                    uint32_t new_end = primary_alignment_end - overlap_length;
-                    std::get<2>(primary_alignment) = new_end;
+                    // uint32_t new_end = primary_alignment_end - overlap_length;
+                    // std::get<2>(primary_alignment) = new_end;
+                    primary_alignment.end = primary_alignment.end - overlap_length;
                 }
-                // std::get<2>(primary_alignment) = primary_alignment_end - overlap_length;
             } else {
                 // Trim the beginning of the supplementary alignment, ensuring
                 // that the new start is not greater than the end
-                if (supp_alignment_start + overlap_length < supp_alignment_end) {
+                // if (supp_alignment_start + overlap_length <
+                // supp_alignment_end) {
+                if (supp_alignment.start + overlap_length < supp_alignment.end) {
                     // Trim the beginning of the supplementary alignment
-                    uint32_t new_start = supp_alignment_start + overlap_length;
-                    std::get<1>(supp_alignment) = new_start;
+                    // uint32_t new_start = supp_alignment_start + overlap_length;
+                    // std::get<1>(supp_alignment) = new_start;
+                    supp_alignment.start = supp_alignment.start + overlap_length;
                 }
-                // uint32_t new_start = supp_alignment_start + overlap_length;
-                // std::get<1>(supp_alignment) = new_start;
-                // std::get<1>(supp_alignment) = supp_alignment_start + overlap_length;
             }
         }
+
+    // } else if (supp_mismatches.query_end >= primary_mismatches.query_start) {
     } else {
         // Supplementary before primary in the query
-        if (supp_query_end >= primary_query_start) {
+        if (primary_mismatches.query_start <= supp_mismatches.query_end) {
             // Calculate the mismatch rates at the overlapping region
-            double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end);
-            double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, primary_query_start, supp_query_end);
-            uint32_t overlap_length = supp_query_end - primary_query_start + 1;
+            // double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end);
+            // double supp_mismatch_rate =
+            // this->calculateMismatchRate(supp_match_map, primary_query_start,
+            // supp_query_end);
+            double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches);
+            double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches);
+            // hts_pos_t overlap_length = supp_query_end - primary_query_start +
+            // 1;
+            hts_pos_t overlap_length = supp_mismatches.query_end - primary_mismatches.query_start + 1;
 
             // Trim the ailgnment with the higher mismatch rate
             if (supp_mismatch_rate > primary_mismatch_rate) {
                 // Trim the end of the supplementary alignment, ensuring that
                 // the new end is not less than the start
-                if (supp_alignment_end > overlap_length && (supp_alignment_end - overlap_length) > supp_alignment_start) {
+                // if (supp_alignment_end > overlap_length &&
+                // (supp_alignment_end - overlap_length) > supp_alignment_start)
+                // {
+                if (supp_alignment.end > overlap_length && (supp_alignment.end - overlap_length) > supp_alignment.start) {
                     // Trim the end of the supplementary alignment
-                    uint32_t new_end = supp_alignment_end - overlap_length;
-                    std::get<2>(supp_alignment) = new_end;
+                    // uint32_t new_end = supp_alignment_end - overlap_length;
+                    // std::get<2>(supp_alignment) = new_end;
+                    supp_alignment.end = supp_alignment.end - overlap_length;
                 }
-                // uint32_t new_end = supp_alignment_end > overlap_length ? supp_alignment_end - overlap_length : 0;
-                // std::get<2>(supp_alignment) = new_end;
-                // std::get<2>(supp_alignment) = supp_alignment_end - overlap_length;
             } else {
                 // Trim the beginning of the primary alignment, ensuring that
                 // the new start is not greater than the end
-                if (primary_alignment_start + overlap_length < primary_alignment_end) {
+                // if (primary_alignment_start + overlap_length <
+                // primary_alignment_end) {
+                if (primary_alignment.start + overlap_length < primary_alignment.end) {
                     // Trim the beginning of the primary alignment
-                    uint32_t new_start = primary_alignment_start + overlap_length;
-                    std::get<1>(primary_alignment) = new_start;
+                    // uint32_t new_start = primary_alignment_start + overlap_length;
+                    // std::get<1>(primary_alignment) = new_start;
+                    primary_alignment.start = primary_alignment.start + overlap_length;
                 }
-                // uint32_t new_start = primary_alignment_start + overlap_length;
-                // std::get<1>(primary_alignment) = new_start;
-                // std::get<1>(primary_alignment) = primary_alignment_start + overlap_length;
             }
         }
     }

From 56507d49d44b032ac026e5ae8e419d4e0f91ea4a Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 15 Dec 2024 14:21:23 -0500
Subject: [PATCH 052/134] Fix vector error

---
 src/sv_caller.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index db4da271..1d966a19 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -130,7 +130,7 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
     }
 
     // Main loop to process the alignments
-    std::vector<int> match_map;
+    std::vector<int> match_map(bam1->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
     uint32_t query_start = 0;
     uint32_t query_end = 0;
     uint32_t query_pos = 0;

From a9b6fcce253ffca243fd1e9fc18ec1787ac12135 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 15 Dec 2024 16:40:24 -0500
Subject: [PATCH 053/134] Fix alignments error

---
 include/sv_caller.h |   2 +-
 src/cnv_caller.cpp  |  46 ++-------
 src/contextsv.cpp   |   2 -
 src/main.cpp        |   4 -
 src/sv_caller.cpp   | 226 ++++++++++++--------------------------------
 5 files changed, 73 insertions(+), 207 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index abd397d1..f1028b70 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -35,7 +35,7 @@ class SVCaller {
         int min_mapq = 20;          // Minimum mapping quality to be considered
         const InputData& input_data;
 
-        void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data) const;
+        void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const;
 
         void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const;
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index ac79d598..b380c150 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -94,9 +94,8 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
         return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
     }
 
-    // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2
-    // the SV length
-    // Only extened the region if "save CNV data" is enabled
+    // Run the Viterbi algorithm on SNPs in the SV region
+    // Only extend the region if "save CNV data" is enabled
     uint32_t snp_start_pos = start_pos;
     uint32_t snp_end_pos = end_pos;
     if (this->input_data.getSaveCNVData())
@@ -105,26 +104,18 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
         snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
         snp_end_pos = end_pos + sv_half_length;
     }
-    // uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-    // uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
-    // uint32_t snp_end_pos = end_pos + sv_half_length;
 
     // Query the SNP region for the SV candidate
     SNPData snp_data;
     querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data);
-    // std::pair<SNPData, bool> snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov);
-    // SNPData& sv_snps = snp_call.first;
-    // bool sv_snps_found = snp_call.second;
 
     // Run the Viterbi algorithm
-    printMemoryUsage("Before running Viterbi algorithm, ");
     std::pair<std::vector<int>, double> prediction;
     runViterbi(hmm, snp_data, prediction);
     if (prediction.first.size() == 0)
     {
         return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
     }
-    printMemoryUsage("After running Viterbi algorithm, ");
 
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
@@ -213,8 +204,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             continue;
         }
 
-        // Loop through the SV region +/- 1/2 SV length and run copy number
-        // predictions
         // Only extend the region if "save CNV data" is enabled
         uint32_t snp_start_pos = start_pos;
         uint32_t snp_end_pos = end_pos;
@@ -239,7 +228,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         double likelihood = prediction.second;
 
         // Get all the states in the SV region
-        // printMessage("Getting states for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
         std::vector<int> sv_states;
         for (size_t i = 0; i < state_sequence.size(); i++)
         {
@@ -276,7 +264,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         std::string genotype = cnv_genotype_map[max_state];
 
         // Determine the SV calling method used to call the SV
-        // (SNPCNV=SNP-based, Log2CNV=coverage-based)
         std::string data_type;
         data_type = "HMM";
 
@@ -289,7 +276,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         }
 
         // Update the SV type if known
-        // printMessage("Updating SV copy number data for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
         if (updated_sv_type != SVType::UNKNOWN && updated_sv_type != SVType::NEUTRAL)
         {
             std::string sv_type_str = getSVTypeString(updated_sv_type);
@@ -340,7 +326,6 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 // Calculate the mean chromosome coverage
 double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map)
 {
-    printMemoryUsage("Before calculating mean chromosome coverage, ");
     {
         // Lock the bam file
         std::lock_guard<std::mutex> lock(this->bam_file_mtx);
@@ -354,7 +339,11 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             return 0.0;
         }
 
-        // Enable multi-threading
+        // Enable multi-threading if running on a single chromosome
+        if (this->input_data.getChromosome() != "")
+        {
+            hts_set_threads(bam_file, this->input_data.getThreadCount());
+        }
         // hts_set_threads(bam_file, this->input_data.getThreadCount());
 
         // Read the header
@@ -450,7 +439,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
         bam_hdr_destroy(bam_header);
         sam_close(bam_file);
     }
-    printMemoryUsage("After calculating mean chromosome coverage, ");
 
     // Calculate the mean chromosome coverage for positions with non-zero depth
     uint64_t cum_depth = 0;
@@ -494,14 +482,11 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
         } catch (const std::out_of_range& e) {
             log2_region[i] = 0.0;
         }
-        // printMessage("Position: " + std::to_string((int)pos) + ", log2 ratio: " + std::to_string(log2_region[i]));
     }
 }
 
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp) const
-{
-    printMemoryUsage("Reading SNP allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
-    
+{    
     // --------- SNP file ---------
     // Get the SNP file path
     std::string snp_filepath = this->input_data.getSNPFilepath();
@@ -515,7 +500,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     bcf_srs_t *snp_reader = bcf_sr_init();
     if (!snp_reader)
     {
-        // throw std::runtime_error("ERROR: Could not initialize SNP reader.");
         printError("ERROR: Could not initialize SNP reader.");
         return;
     }
@@ -536,7 +520,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         printError("ERROR: Could not add SNP file to reader: " + snp_filepath);
         return;
     }
-    printMemoryUsage("After adding SNP file to reader, ");
 
     // Get the header
     bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0);
@@ -555,8 +538,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (pfb_filepath.empty())
     {
         use_pfb = false;
-        // printError("ERROR: Population allele frequency file path is empty.");
-        // return;
+        printMessage("WARNING: No population allele frequency file provided for chromosome " + chr);
     }
     
     bcf_srs_t *pfb_reader = bcf_sr_init();
@@ -615,7 +597,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
             return;
         }
-        printMemoryUsage("After adding population allele frequency file to reader, ");
 
         // Get the header
         bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
@@ -645,7 +626,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         // Read the SNP data ----------------------------------------------
 
         // Set the region
-        printMemoryUsage("Before setting region for SNP reader, ");
         std::string region_str = region_chunks[i];
         if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
         {
@@ -653,10 +633,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             printError("ERROR: Could not set region for SNP reader: " + region_str);
             return;
         }
-        printMemoryUsage("After setting region for SNP reader, and before reading SNPs, ");
 
-        // std::cout << "Iterating through SNPs in region " << region_str <<
-        // "..." << std::endl;
         // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp...");
         bool snp_found = false;
         while (bcf_sr_next_line(snp_reader) > 0)
@@ -727,8 +704,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             }
         }
 
-        printMemoryUsage("After reading SNPs for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
-
         if (snp_reader->errnum)
         {
             printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum)));
@@ -740,8 +715,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             continue;
         }
 
-        printMemoryUsage("Before reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
-
         // Read the population allele frequency data ----------------------
         if (use_pfb)
         {
@@ -808,7 +781,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
             }
         }
-        printMemoryUsage("After reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", ");
     }
     bcf_sr_destroy(snp_reader);
     bcf_sr_destroy(pfb_reader);
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 107a6dae..01329c2c 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -19,9 +19,7 @@ ContextSV::ContextSV(InputData& input_data)
 
 int ContextSV::run()
 {
-    printMemoryUsage("Before creating SV caller, ");
     SVCaller sv_caller(this->input_data); 
-    printMemoryUsage("After creating SV caller, ");
     sv_caller.run();
 
     return 0;
diff --git a/src/main.cpp b/src/main.cpp
index c622c34d..bbdb8366 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -22,13 +22,10 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     }
 
     // Set up input data
-    printMemoryUsage("Before setting up input data, ");
     InputData input_data;
     input_data.setLongReadBam(args.at("bam-file"));
     input_data.setShortReadBam(args.at("bam-file"));
-    printMemoryUsage("Before reading reference genome, ");
     input_data.setRefGenome(args.at("ref-file"));
-    printMemoryUsage("After reading reference genome, ");
     input_data.setSNPFilepath(args.at("snps-file"));
     input_data.setOutputDir(args.at("output-dir"));
     if (args.find("chr") != args.end()) {
@@ -61,7 +58,6 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("debug") != args.end()) {
         input_data.setVerbose(true);
     }
-    printMemoryUsage("After setting up input data, ");
 
     // Run ContextSV
     run(input_data);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 1d966a19..928e11b3 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -41,10 +41,6 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) co
 
 void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const
 {
-    // std::map<std::string, hts_itr_t*> primary_map;
-    // std::map<std::string, std::vector<hts_itr_t*>> supplementary_map;
-    printMemoryUsage("Before detecting SVs from CIGAR strings, ");
-
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
@@ -97,12 +93,10 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-
-    printMemoryUsage("After detecting SVs from CIGAR strings, ");
     printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
 }
 
-void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data) const
+void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary) const
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -112,7 +106,8 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
         sam_close(fp_in);
         throw std::runtime_error("ERROR: failed to initialize BAM record");
     }
-    hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end);
+    // hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end);
+    hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end);
     if (!itr) {
         bam_destroy1(bam1);
         hts_idx_destroy(idx);
@@ -121,11 +116,52 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
         throw std::runtime_error("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end));
     }
 
-    // Read the alignment
-    if (readNextAlignment(fp_in, itr, bam1) < 0) {
-        bam_destroy1(bam1);
+    // // Read the alignment
+    // if (readNextAlignment(fp_in, itr, bam1) < 0) {
+    //     bam_destroy1(bam1);
+    //     hts_itr_destroy(itr);
+    //     printError("ERROR: failed to read alignment");
+    //     return;
+    // }
+
+    // Find the correct alignment
+    bool success = false;
+    std::string fail_str = "";
+    // printMessage("Looking for alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? "forward" : "reverse"));
+    while (readNextAlignment(fp_in, itr, bam1) >= 0) {
+        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
+        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
+            continue;
+        }
+
+        // Skip if not the correct type of alignment
+        if (is_primary && (bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+            continue;
+        } else if (!is_primary && !(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+            continue;
+        }
+
+        // Check the alignment start and end positions, and strand
+        if (bam1->core.pos+1 == region.start && bam_endpos(bam1) == region.end && !(bam1->core.flag & BAM_FREVERSE) == region.strand) {
+            // printMessage("SUCCESS: Found alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " at position: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1)));
+            success = true;
+            break;
+        } else {
+            // std::string type_str = is_primary ? "primary" : "supplementary";
+            // std::string strand_str = region.strand ? "forward" : "reverse";
+            // fail_str = "ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1)) + " with type: " + type_str + " and strand: " + strand_str;
+            // printError(fail_str);
+            // printError("ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos) + "-" + std::to_string(bam_endpos(bam1)));
+            continue;
+        }
+    }
+
+    // Check if the alignment was found
+    if (!success) {
+        printError("ERROR: Failed to find alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? "forward" : "reverse"));
+        // printError(fail_str);
         hts_itr_destroy(itr);
-        printError("ERROR: failed to read alignment");
+        bam_destroy1(bam1);
         return;
     }
 
@@ -177,7 +213,18 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
             // Compare the two sequences and update the mismatch map
             for (int j = 0; j < op_len; j++) {
                 if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
-                    match_map[query_pos + j] = MISMATCH;
+                    try {
+                        match_map.at(query_pos + j) = MISMATCH;
+                    } catch (const std::out_of_range& e) {
+                        printError("ERROR: Out of range exception for query position: " + std::to_string(query_pos + j) + " with read length: " + std::to_string(bam1->core.l_qseq) + " and array size: " + std::to_string(match_map.size()) + " for CIGAR operation: " + std::to_string(op) + " with length: " + std::to_string(op_len));
+
+                        // Exit the program
+                        hts_itr_destroy(itr);
+                        bam_destroy1(bam1);
+                        
+                        return;
+                    }
+                    // match_map[query_pos + j] = MISMATCH;
                 } else {
                     match_map[query_pos + j] = MATCH;
                 }
@@ -211,8 +258,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
 
 void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map)
 {
-    printMemoryUsage("Before detecting SVs from CIGAR strings, ");
-
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
@@ -237,68 +282,15 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
         if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
             continue;
         }
-        // const std::string qname = bam_get_qname(bam1);  // Query template name
 
         // Process the alignment
         bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
         this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map);
-        // if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-
-        //     // Get the primary alignment information
-        //     // std::string chr = bamHdr->target_name[bam1->core.tid];
-        //     // uint32_t start = (uint32_t)bam1->core.pos;
-        //     // uint32_t end = (uint32_t)bam_endpos(bam1);  // This is the first position after the alignment
-        //     // end--;  // Adjust to the last position of the alignment
-        //     // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
-
-        //     // Call SVs directly from the CIGAR string
-        //     // std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
-        //     this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true, pos_depth_map);
-        //     // std::tuple<std::vector<int>, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true);
-        //     // const std::vector<int>& match_map = std::get<0>(query_info);
-        //     // uint32_t query_start = std::get<1>(query_info);
-        //     // uint32_t query_end = std::get<2>(query_info);
-
-        //     // Add the primary alignment to the map
-        //     // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
-        //     // primary_alignments[qname] = alignment;
-
-        //     // Add the iterator to the primary map
-        //     // primary_map[qname] = itr;
-
-        // // Process supplementary alignments
-        // } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-
-        //     // Get the supplementary alignment information
-        //     // std::string chr = bamHdr->target_name[bam1->core.tid];
-        //     // uint32_t start = bam1->core.pos;
-        //     // uint32_t end = bam_endpos(bam1);  // This is the first position after the alignment
-        //     // end--;  // Adjust to the last position of the alignment
-        //     // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE);
-
-        //     // Get CIGAR string information, but don't call SVs
-        //     // std::tuple<std::vector<int>, int32_t, int32_t> query_info =
-        //     // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false);
-        //     // std::tuple<std::vector<int>, uint32_t, uint32_t> query_info;
-        //     this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false, pos_depth_map);
-        //     // const std::vector<int>& match_map = std::get<0>(query_info);
-        //     // uint32_t query_start = std::get<1>(query_info);
-        //     // uint32_t query_end = std::get<2>(query_info);
-
-        //     // Add the supplementary alignment to the map
-        //     // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand);
-        //     // supplementary_alignments[qname].emplace_back(alignment);
-
-        //     // Add the iterator to the supplementary map
-        //     // supplementary_map[qname].push_back(itr);
-        // }
     }
 
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-
-    printMemoryUsage("After detecting SVs from CIGAR strings, ");
 }
 
 // double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map,
@@ -624,14 +616,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
 
 void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls)
 {
-    printMemoryUsage("Before opening BAM file, ");
     // Open the BAM file
     std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (!fp_in) {
         throw std::runtime_error("ERROR: failed to open " + bam_filepath);
     }
-    printMemoryUsage("After opening BAM file, ");
 
     // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
@@ -647,7 +637,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         sam_close(fp_in);
         throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
     }
-    printMemoryUsage("After loading index, ");
 
     // Split the chromosome into chunks for memory efficiency
     std::vector<std::string> region_chunks;
@@ -677,16 +666,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         }
         printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "...");
     }
-    printMemoryUsage("After splitting chromosome into chunks, ");
 
     // Load chromosome data for copy number predictions
     // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
     printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
-    printMemoryUsage("Before calculating mean chromosome coverage (top), ");
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
     double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map);
-    printMemoryUsage("After calculating mean chromosome coverage (top), ");
     if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
@@ -758,8 +744,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
 void SVCaller::run()
 {
-    printMemoryUsage("Before running SV caller, ");
-    // Get the chromosomes to process
+    // Get the chromosomes
     std::vector<std::string> chromosomes;
     if (this->input_data.getChromosome() != "") {
         chromosomes.push_back(this->input_data.getChromosome());
@@ -767,12 +752,10 @@ void SVCaller::run()
         chromosomes = this->input_data.getRefGenomeChromosomes();
     }
     
-    printMemoryUsage("After getting chromosomes, ");
     // Read the HMM from the file
     std::string hmm_filepath = this->input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
-    printMemoryUsage("After reading HMM, ");
 
     // Use multi-threading across chromosomes unless a single chromosome is
     // specified
@@ -845,36 +828,17 @@ void SVCaller::run()
 // Detect SVs from split read alignments
 void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
 {
-    printMemoryUsage("Before detecting SVs from split reads, ");
-
-
     printMessage("Getting split alignments...");
-    // std::map<std::string, hts_itr_t*> primary_map;
-    // std::map<std::string, std::vector<hts_itr_t*>> supp_map;
     std::unordered_map<std::string, GenomicRegion> primary_map;
     std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
     this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
 
-    printMessage("[TEST] Primary map size: " + std::to_string(primary_map.size()));
-    printMessage("[TEST] Supplementary map size: " + std::to_string(supp_map.size()));
-
     // Find split-read SV evidence
     int sv_count = 0;
     uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
     for (auto& entry : primary_map) {
-        // std::string qname = entry.first;
         const std::string& qname = entry.first;
         GenomicRegion& primary_region = entry.second;
-        // AlignmentData primary_alignment = entry.second;
-        // std::string primary_chr = std::get<0>(primary_alignment);
-        // uint32_t primary_start = std::get<1>(primary_alignment);
-        // uint32_t primary_end = std::get<2>(primary_alignment);
-
-        // Get the primary alignment information
-        // std::string primary_chr = bamHdr->target_name[primary_bam1->core.tid];
-        // uint32_t primary_start = (uint32_t) primary_bam1->core.pos;
-        // uint32_t primary_end = (uint32_t) bam_endpos(primary_bam1) - 1;  // Last alignment position
-        // bool primary_fwd_strand = !(primary_bam1->core.flag & BAM_FREVERSE);
 
         // Skip primary alignments that do not have supplementary alignments
         if (supp_map.find(qname) == supp_map.end()) {
@@ -883,51 +847,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
         // Get the read match/mismatch map
         MismatchData primary_mismatches;
-        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches);
-        // std::vector<int> match_map(primary_region.end - primary_region.start
-        // + 1, 0);
-        // this->getMatchMismatchMap(fp_in, idx, bamHdr, primary_region, mismatch_data);
-
-        // std::pair<uint32_t, uint32_t> query_info = generateMatchMismatchMap(fp_in, idx, bamHdr, primary_itr, match_map);
-
-        // Find the largest supplementary alignment, and also identify
-        // inversions
-        // printMessage("Finding largest supplementary alignment...");
-        // AlignmentData largest_supp_region = supp_map[qname][0];
-        // hts_itr_t* largest_supp_itr = supp_map[qname][0];
+        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true);
         GenomicRegion largest_supp_region = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
         const std::string& primary_chr = bamHdr->target_name[primary_region.tid];
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
             GenomicRegion& supp_region = *it;
-            // Get the supplementary alignment information
-            // bam1_t* supp_bam1 = bam_init1();
-            // if (!supp_bam1) {
-            //     throw std::runtime_error("ERROR: failed to initialize BAM record");
-            // }
-            // if (sam_itr_next(fp_in, *it, supp_bam1) < 0) {
-            //     bam_destroy1(supp_bam1);
-            //     throw std::runtime_error("ERROR: failed to read alignment");
-            // }
 
             // Skip if not on the primary chromosome
             if (primary_region.tid != supp_region.tid) {
                 continue;
             }
 
-            // std::string supp_chr = bamHdr->target_name[supp_bam1->core.tid];
-            // uint32_t supp_start = (uint32_t) supp_bam1->core.pos;
-            // uint32_t supp_end = (uint32_t) bam_endpos(supp_bam1) - 1;  // Last alignment position
-            // uint32_t supp_length = supp_end - supp_start + 1;
-
-            // const auto& supp_chr = std::get<0>(*it);
-            // if (primary_chr != supp_chr) {
-            //     continue;  // Skip supplementary alignments on different chromosomes
-            // }
-            // uint32_t supp_start = std::get<1>(*it);
-            // uint32_t supp_end = std::get<2>(*it);
-            // uint32_t supp_length = supp_end - supp_start + 1;
-
             // Get the supplementary alignment information
             uint32_t supp_start = (uint32_t) supp_region.start;
             uint32_t supp_end = (uint32_t) supp_region.end;
@@ -938,9 +869,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             }
 
             // Inversion detection
-            // bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it);
-            // bool supp_fwd_strand = !(supp_bam1->core.flag & BAM_FREVERSE);
-            // bool is_opposite_strand = primary_fwd_strand != supp_fwd_strand;
             bool is_opposite_strand = primary_region.strand != supp_region.strand;
             if (is_opposite_strand) {
                 if (supp_length >= min_cnv_length) {
@@ -978,37 +906,17 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
         // Trim overlapping alignments
         MismatchData supp_mismatches;
-        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches);
-        // uint32_t supp_start = std::get<1>(largest_supp_region);
-        // uint32_t supp_end = std::get<2>(largest_supp_region);
-        // bool primary_before_supp = primary_start < supp_start;
-        // trimOverlappingAlignments(primary_alignment, largest_supp_region);
+        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false);
         trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
-
-        // Create the SV candidate using both alignments
-        // supp_start = std::get<1>(largest_supp_region);
-        // supp_end = std::get<2>(largest_supp_region);
-        // primary_start = std::get<1>(primary_alignment);
-        // primary_end = std::get<2>(primary_alignment);
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_region.start < largest_supp_region.start) {  // Primary before supp
-            // boundary_left = primary_start+1;
-            // boundary_right = std::max(primary_end, supp_end)+1;
-            // gap_left = primary_end+1;
-            // gap_right = supp_start+1;
-            // gap_exists = gap_left < gap_right;
             boundary_left = primary_region.start + 1;
             boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1;
             gap_left = primary_region.end + 1;
             gap_right = largest_supp_region.start + 1;
             gap_exists = gap_left < gap_right;
         } else {
-            // boundary_left = supp_start+1;
-            // boundary_right = std::max(primary_end, supp_end)+1;
-            // gap_left = supp_end+1;
-            // gap_right = primary_start+1;
-            // gap_exists = gap_left < gap_right;
             boundary_left = largest_supp_region.start + 1;
             boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1;
             gap_left = largest_supp_region.end + 1;
@@ -1026,9 +934,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 continue;
             }
 
-            // printMessage("Running copy number prediction on boundary: " +
-            // primary_chr + ":" + std::to_string(boundary_left) + "-" +
-            // std::to_string(boundary_right));
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -1046,9 +951,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     continue;
                 }
 
-                // printMessage("Running copy number prediction on gap: " +
-                // primary_chr + ":" + std::to_string(gap_left) + "-" +
-                // std::to_string(gap_right));
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
@@ -1075,8 +977,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             }
         }
     }
-
-    printMemoryUsage("After detecting SVs from split reads, ");
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls)

From d1756826fff65f900fc9e08ee1b9e5045f587c21 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 16 Dec 2024 11:28:18 -0500
Subject: [PATCH 054/134] work on error handling

---
 include/sv_caller.h |   2 -
 src/cnv_caller.cpp  |   1 -
 src/khmm.cpp        |  40 +++----
 src/sv_caller.cpp   | 246 ++++++--------------------------------------
 src/sv_object.cpp   |   8 +-
 5 files changed, 52 insertions(+), 245 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index f1028b70..11b3919f 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -61,8 +61,6 @@ class SVCaller {
         // sequence
         double calculateMismatchRate(const MismatchData& mismatch_data) const;
 
-        std::pair<uint32_t, uint32_t> generateMatchMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, hts_itr_t *itr, std::vector<int>& match_map) const;
-
         void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls);
 
         void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const;
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index b380c150..c0811d51 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -89,7 +89,6 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // Check that the start position is less than the end position
     if (start_pos >= end_pos)
     {
-        // throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
     }
diff --git a/src/khmm.cpp b/src/khmm.cpp
index b5b5bf02..fcc4899d 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -425,7 +425,6 @@ CHMM ReadCHMM(const std::string filename)
 	std::ifstream file(filename);
 	if (!file.is_open())
 	{
-		// throw std::runtime_error("Error opening file");
 		printError("Error opening file");
 		return CHMM();
 	}
@@ -437,7 +436,6 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (sscanf(line.c_str(), "M=%d", &hmm.M) != 1)
 	{
-		// throw std::runtime_error("Error reading M");
 		printError("Error reading M");
 		return CHMM();
 	}
@@ -446,7 +444,6 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (sscanf(line.c_str(), "N=%d", &hmm.N) != 1)
 	{
-		// throw std::runtime_error("Error reading N");
 		printError("Error reading N");
 		return CHMM();
 	}
@@ -455,14 +452,12 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "A:")
 	{
-		// throw std::runtime_error("Error reading A");
 		printError("Error reading A");
 		return CHMM();
 	}
 	hmm.A = readMatrix(file, hmm.N, hmm.N);
 	if (hmm.A.size() != (size_t)hmm.N || hmm.A[0].size() != (size_t)hmm.N)
 	{
-		// throw std::runtime_error("Error reading A");
 		printError("Error reading A");
 		return CHMM();
 	}
@@ -471,14 +466,12 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "B:")
 	{
-		// throw std::runtime_error("Error reading B");
 		printError("Error reading B");
 		return CHMM();
 	}
 	hmm.B = readMatrix(file, hmm.N, hmm.M);
 	if (hmm.B.size() != (size_t)hmm.N || hmm.B[0].size() != (size_t)hmm.M)
 	{
-		// throw std::runtime_error("Error reading B");
 		printError("Error reading B");
 		return CHMM();
 	}
@@ -487,14 +480,12 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "pi:")
 	{
-		// throw std::runtime_error("Error reading pi");
 		printError("Error reading pi");
 		return CHMM();
 	}
 	hmm.pi = readVector(file, hmm.N);
 	if (hmm.pi.size() != (size_t)hmm.N)
 	{
-		// throw std::runtime_error("Error reading pi");
 		printError("Error reading pi");
 		return CHMM();
 	}
@@ -503,14 +494,12 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "B1_mean:")
 	{
-		// throw std::runtime_error("Error reading B1_mean");
 		printError("Error reading B1_mean");
 		return CHMM();
 	}
 	hmm.B1_mean = readVector(file, hmm.N);
 	if (hmm.B1_mean.size() != (size_t)hmm.N)
 	{
-		// throw std::runtime_error("Error reading B1_mean");
 		printError("Error reading B1_mean");
 		return CHMM();
 	}
@@ -519,14 +508,12 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "B1_sd:")
 	{
-		// throw std::runtime_error("Error reading B1_sd");
 		printError("Error reading B1_sd");
 		return CHMM();
 	}
 	hmm.B1_sd = readVector(file, hmm.N);
 	if (hmm.B1_sd.size() != (size_t)hmm.N)
 	{
-		// throw std::runtime_error("Error reading B1_sd");
 		printError("Error reading B1_sd");
 		return CHMM();
 	}
@@ -535,7 +522,6 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "B1_uf:")
 	{
-		// throw std::runtime_error("Error reading B1_uf");
 		printError("Error reading B1_uf");
 		return CHMM();
 	}
@@ -543,7 +529,6 @@ CHMM ReadCHMM(const std::string filename)
 	try {
 		hmm.B1_uf = std::stod(line);
 	} catch (const std::invalid_argument& e) {
-		// throw std::runtime_error("Error reading B1_uf");
 		printError("Error reading B1_uf");
 		return CHMM();
 	}
@@ -552,37 +537,44 @@ CHMM ReadCHMM(const std::string filename)
 	std::getline(file, line);
 	if (line != "B2_mean:")
 	{
-		throw std::runtime_error("Error reading B2_mean");
+		printError("Error reading B2_mean");
+		return CHMM();
 	}
 	hmm.B2_mean = readVector(file, 5);
 	if (hmm.B2_mean.size() != (size_t)5)
 	{
-		throw std::runtime_error("Error reading B2_mean");
+		printError("Error reading B2_mean");
+		return CHMM();
 	}
 
 	// Read B2_sd
 	std::getline(file, line);
 	if (line != "B2_sd:")
 	{
-		throw std::runtime_error("Error reading B2_sd");
+		printError("Error reading B2_sd");
+		return CHMM();
 	}
 	hmm.B2_sd = readVector(file, 5);
 	if (hmm.B2_sd.size() != (size_t)5)
 	{
-		throw std::runtime_error("Error reading B2_sd");
+		printError("Error reading B2_sd");
+		return CHMM();
+
 	}
 
 	// Read B2_uf
 	std::getline(file, line);
 	if (line != "B2_uf:")
 	{
-		throw std::runtime_error("Error reading B2_uf");
+		printError("Error reading B2_uf");
+		return CHMM();
 	}
 	std::getline(file, line);
 	try {
 		hmm.B2_uf = std::stod(line);
 	} catch (const std::invalid_argument& e) {
-		throw std::runtime_error("Error reading B2_uf");
+		printError("Error reading B2_uf");
+		return CHMM();
 	}
 
 	return hmm;
@@ -597,7 +589,8 @@ std::vector<std::vector<double>> readMatrix(std::ifstream &file, int rows, int c
 		{
 			if (!(file >> matrix[i][j]))
 			{
-				throw std::runtime_error("Error reading matrix");
+				printError("Error reading matrix");
+				return std::vector<std::vector<double>>();
 			}
 		}
 	}
@@ -612,7 +605,8 @@ std::vector<double> readVector(std::ifstream &file, int size)
 	{
 		if (!(file >> vector[i]))
 		{
-			throw std::runtime_error("Error reading vector");
+			printError("Error reading vector");
+			return std::vector<double>();
 		}
 	}
 	file.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 928e11b3..df54f4ef 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -47,7 +47,8 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to initialize BAM record");
+        printError("ERROR: failed to initialize BAM record");
+        return;
     }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
     if (!itr) {
@@ -55,7 +56,8 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to query region " + region);
+        printError("ERROR: failed to query region " + region);
+        return;
     }
 
     uint32_t primary_count = 0;
@@ -104,7 +106,8 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to initialize BAM record");
+        printError("ERROR: failed to initialize BAM record");
+        return;
     }
     // hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end);
     hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end);
@@ -113,17 +116,10 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end));
+        printError("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end));
+        return;
     }
 
-    // // Read the alignment
-    // if (readNextAlignment(fp_in, itr, bam1) < 0) {
-    //     bam_destroy1(bam1);
-    //     hts_itr_destroy(itr);
-    //     printError("ERROR: failed to read alignment");
-    //     return;
-    // }
-
     // Find the correct alignment
     bool success = false;
     std::string fail_str = "";
@@ -147,11 +143,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
             success = true;
             break;
         } else {
-            // std::string type_str = is_primary ? "primary" : "supplementary";
-            // std::string strand_str = region.strand ? "forward" : "reverse";
-            // fail_str = "ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1)) + " with type: " + type_str + " and strand: " + strand_str;
-            // printError(fail_str);
-            // printError("ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos) + "-" + std::to_string(bam_endpos(bam1)));
             continue;
         }
     }
@@ -159,7 +150,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
     // Check if the alignment was found
     if (!success) {
         printError("ERROR: Failed to find alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? "forward" : "reverse"));
-        // printError(fail_str);
         hts_itr_destroy(itr);
         bam_destroy1(bam1);
         return;
@@ -207,7 +197,10 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
 
             // Check that the two sequence lengths are equal
             if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
-                throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
+                printError("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
+                hts_itr_destroy(itr);
+                bam_destroy1(bam1);
+                return;
             }
 
             // Compare the two sequences and update the mismatch map
@@ -238,9 +231,10 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
         // https://samtools.github.io/hts-specs/SAMv1.pdf
         if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             pos += op_len;
+        }
 
         // Update the query position
-        } else if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
+        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             query_pos += op_len;
         }
     }
@@ -264,7 +258,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to initialize BAM record");
+        printError("ERROR: failed to initialize BAM record");
+        return;
     }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
     if (!itr) {
@@ -272,7 +267,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to query region " + region);
+        printError("ERROR: failed to query region " + region);
+        return;
     }
 
     // Main loop to process the alignments
@@ -293,8 +289,6 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
     bam_destroy1(bam1);
 }
 
-// double SVCaller::calculateMismatchRate(const std::vector<int>& mismatch_map,
-// int32_t start, int32_t end)
 double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const
 {
     int start = mismatch_data.query_start;
@@ -323,115 +317,6 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const
 
     return mismatch_rate;
 }
-// {
-//     start = std::max(start, 0);
-//     end = std::min(end, (int32_t)mismatch_map.size() - 1);
-//     int match_count = 0;
-//     int mismatch_count = 0;
-//     int MATCH = 1;
-//     int MISMATCH = -1;
-//     for (int i = start; i <= end; i++) {
-//         if (mismatch_map[i] == MATCH) {
-//             match_count++;
-//         } else if (mismatch_map[i] == MISMATCH) {
-//             mismatch_count++;
-//         }
-//     }
-
-//     // Avoid division by zero
-//     if (match_count + mismatch_count == 0) {
-//         return 0.0;
-//     }
-
-//     double mismatch_rate = static_cast<double>(mismatch_count) / static_cast<double>(match_count + mismatch_count);
-
-//     return mismatch_rate;
-// }
-
-std::pair<uint32_t, uint32_t> SVCaller::generateMatchMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_itr_t* itr, std::vector<int>& match_map) const
-{
-    // Create a read and iterator for the region
-    bam1_t *bam1 = bam_init1();
-    if (!bam1) {
-        printError("ERROR: failed to initialize BAM record");
-        return std::make_pair(0, 0);
-    }
-
-    // Read the alignment
-    if (readNextAlignment(fp_in, itr, bam1) < 0) {
-        bam_destroy1(bam1);
-        printError("ERROR: failed to read alignment");
-        return std::make_pair(0, 0);
-    }
-
-    // Main loop to process the alignments
-    std::string chr = bamHdr->target_name[bam1->core.tid];  // Chromosome name
-    uint32_t pos = (uint32_t)bam1->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
-    uint32_t query_pos = 0;
-    uint32_t query_start = 0;
-    uint32_t query_end = 0;
-    bool first_op = true;
-
-    // Get the CIGAR string
-    uint32_t* cigar = bam_get_cigar(bam1);  // CIGAR array
-    int cigar_len = bam1->core.n_cigar;
-    for (int i = 0; i < cigar_len; i++) {
-        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
-        int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
-        
-        // Update match/mismatch query map
-        int MATCH = 1;
-        int MISMATCH = -1;
-        if (op == BAM_CEQUAL) {
-            for (int j = 0; j < op_len; j++) {
-                match_map[query_pos + j] = MATCH;
-            }
-        } else if (op == BAM_CDIFF) {
-            for (int j = 0; j < op_len; j++) {
-                match_map[query_pos + j] = MISMATCH;
-            }
-        } else if (op == BAM_CMATCH) {
-            // Get the read sequence
-            uint8_t* seq_ptr = bam_get_seq(bam1);
-            std::string cmatch_seq_str = "";
-            for (int j = 0; j < op_len; j++) {
-                cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
-            }
-
-            // Get the corresponding reference sequence
-            int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-            // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
-            std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
-
-            // Check that the two sequence lengths are equal
-            if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
-                throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
-            }
-
-            // Compare the two sequences and update the mismatch map
-            for (int j = 0; j < op_len; j++) {
-                if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
-                    match_map[query_pos + j] = MISMATCH;
-                } else {
-                    match_map[query_pos + j] = MATCH;
-                }
-            }
-        } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) {
-            query_start = query_pos + op_len;
-            first_op = false;
-        }
-        
-        // Update the query position
-        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            query_pos += op_len;
-        }
-    }
-    query_end = query_pos;
-    
-    bam_destroy1(bam1);  // Clean up the alignment
-
-    return std::make_pair(query_start, query_end);
-}
 
 void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map)
 {
@@ -440,19 +325,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
     uint32_t query_pos = 0;
-    // std::unordered_map<int, int> query_match_map;  // Query position to
-    // match/mismatch (1/0) map
-    // std::vector<int> query_match_map(alignment->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
 
     // Loop through the CIGAR string, process operations, detect SVs (primary
-    // only), update clipped base support, calculate sequence identity for
-    // potential duplications (primary only), and calculate
-    // the clipped base support and mismatch rate
+    // only), and calculate sequence identity for potential duplications (primary only)
     uint32_t ref_pos;
     uint32_t ref_end;
-    // uint32_t query_start = 0;  // First alignment position in the query
-    // uint32_t query_end = 0;    // Last alignment position in the query
-    // bool first_op = false;  // First alignment operation for the query
     double default_lh = 0.0;
     for (int i = 0; i < cigar_len; i++) {
 
@@ -478,8 +355,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 bool is_duplication = false;
                 int ins_ref_pos;
                 uint32_t dup_start = std::max(0, (int)pos - op_len);
-                // int dup_start = std::max(0, pos - op_len);
-                // for (int j = pos - op_len; j <= pos; j++) {
                 for (uint32_t j = dup_start; j <= pos; j++) {
 
                     // Get the string for the window (1-based coordinates)
@@ -537,81 +412,19 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
                 addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
             }
-
-        // Check if the CIGAR operation is a clipped base
-        } else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) {
-
-            // sv_calls.updateClippedBaseSupport(chr, pos);  // Update clipped base support
-
-            // Update the query alignment start position
-            // if (!first_op) {
-            //     query_start = query_pos + op_len;
-            //     first_op = true;
-            // }
         }
 
-        // // Update match/mismatch query map
-        // int MATCH = 1;
-        // int MISMATCH = -1;
-        // if (op == BAM_CEQUAL) {
-        //     for (int j = 0; j < op_len; j++) {
-        //         query_match_map[query_pos + j] = MATCH;
-        //     }
-        // } else if (op == BAM_CDIFF) {
-        //     for (int j = 0; j < op_len; j++) {
-        //         query_match_map[query_pos + j] = MISMATCH;
-        //     }
-        // } else if (op == BAM_CMATCH) {
-        //     // Get the read sequence
-        //     uint8_t* seq_ptr = bam_get_seq(alignment);
-        //     std::string cmatch_seq_str = "";
-        //     for (int j = 0; j < op_len; j++) {
-        //         cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
-        //     }
-
-        //     // Get the corresponding reference sequence
-        //     int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-        //     // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
-        //     std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
-
-        //     // Check that the two sequence lengths are equal
-        //     if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
-        //         throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
-        //     }
-
-        //     // Compare the two sequences and update the mismatch map
-        //     for (int j = 0; j < op_len; j++) {
-        //         if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
-        //             query_match_map[query_pos + j] = MISMATCH;
-        //         } else {
-        //             query_match_map[query_pos + j] = MATCH;
-        //         }
-        //     }
-        // }
-
-        // Update the reference coordinate based on the CIGAR operation
+        // Update the reference position
         // https://samtools.github.io/hts-specs/SAMv1.pdf
         if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             pos += op_len;
-        } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
-            // Do nothing
-        } else {
-            throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
         }
-
-        // Update the query position based on the CIGAR operation (M, I, S, H)
+        
+        // Update the query position
         if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             query_pos += op_len;
-        } else if (op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) {
-            // Do nothing
-        } else {
-            throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op));
         }
     }
-
-    // query_end = query_pos;  // Last alignment position in the query
-
-    // query_info = std::tuple<std::vector<int>, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end);
 }
 
 void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls)
@@ -620,14 +433,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     std::string bam_filepath = this->input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (!fp_in) {
-        throw std::runtime_error("ERROR: failed to open " + bam_filepath);
+        printError("ERROR: failed to open " + bam_filepath);
+        return;
     }
 
     // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
     if (!bamHdr) {
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to read header from " + bam_filepath);
+        printError("ERROR: failed to read header from " + bam_filepath);
+        return;
     }
 
     // Load the index
@@ -635,7 +450,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     if (!idx) {
         bam_hdr_destroy(bamHdr);
         sam_close(fp_in);
-        throw std::runtime_error("ERROR: failed to load index for " + bam_filepath);
+        printError("ERROR: failed to load index for " + bam_filepath);
+        return;
     }
 
     // Split the chromosome into chunks for memory efficiency
@@ -651,7 +467,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         int region_end = region.second;
         std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
         region_chunks.push_back(chunk);
-        // std::cout << "Using specified region " << chunk << "..." << std::endl;
         
     } else {
         int chunk_size = std::ceil((double)chr_len / chunk_count);
@@ -668,7 +483,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     }
 
     // Load chromosome data for copy number predictions
-    // std::cout << "Loading chromosome data for copy number predictions..." << std::endl;
     printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
@@ -987,8 +801,10 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::cout << "Writing VCF file to " << output_vcf << std::endl;
 	std::ofstream vcf_stream(output_vcf);
     if (!vcf_stream.is_open()) {
-        throw std::runtime_error("Failed to open VCF file for writing.");
+        printError("Failed to open VCF file for writing.");
+        return;
     }
+    
     std::string sample_name = "SAMPLE";
 
     std::cout << "Getting reference genome filepath..." << std::endl;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index aedb187c..7ef70223 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -21,8 +21,7 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
         return;
     }
 
-    // Set the alt allele to <DUP> or <DEL> if the SV type is DUP or DEL, throw
-    // an error otherwise
+    // Set the alt allele to <DUP> or <DEL> if the SV type is DUP or DEL
     if (sv_type == "DUP" && alt_allele == ".") {
         printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
         alt_allele = "<DUP>";
@@ -32,7 +31,8 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     }
     
     if (start >= end) {
-        throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
+        printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
+        return;
     }
 
     // Insert the SV call in sorted order
@@ -69,7 +69,7 @@ void updateSVType(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, s
         it->genotype = genotype;
         it->hmm_likelihood = hmm_likelihood;
     } else {
-        throw std::runtime_error("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end));
+        printError("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end));
     }
 }
 

From 6b611174fab649c4c0ecf7364d21185447007ebe Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 17 Dec 2024 07:26:55 -0500
Subject: [PATCH 055/134] Fix off-by-one breakpoint error

---
 include/sv_caller.h |  2 --
 src/cnv_caller.cpp  |  7 ++---
 src/fasta_query.cpp | 14 +--------
 src/input_data.cpp  | 10 +++++-
 src/sv_caller.cpp   | 77 +++++++++++----------------------------------
 5 files changed, 31 insertions(+), 79 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 11b3919f..70be7e33 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -65,8 +65,6 @@ class SVCaller {
 
         void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const;
 
-        // void trimOverlappingAlignments(uint32_t& primary_start, uint32_t& primary_end, uint32_t& supp_start, uint32_t& supp_end, const std::vector<int>& primary_match_map, const std::vector<int>& supp_match_map);
-
         // Calculate the read depth (INFO/DP) for a region
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c0811d51..20e7189f 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -326,10 +326,8 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map)
 {
     {
-        // Lock the bam file
-        std::lock_guard<std::mutex> lock(this->bam_file_mtx);
-
         // Open the BAM file
+        std::lock_guard<std::mutex> lock(this->bam_file_mtx);  // Lock the BAM file
         std::string bam_filepath = this->input_data.getShortReadBam();
         samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
         if (!bam_file)
@@ -398,7 +396,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             
             // Parse the CIGAR string to get the depth (match, sequence match, and
             // mismatch)
-            uint32_t pos = bam_record->core.pos + 1;  // 0-based to 1-based
+            uint32_t pos = (uint32_t)bam_record->core.pos + 1;  // 0-based to 1-based
             uint32_t ref_pos = pos;
             uint32_t cigar_len = bam_record->core.n_cigar;
             uint32_t *cigar = bam_get_cigar(bam_record);
@@ -457,7 +455,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
         mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
     }
 
-    // return std::make_pair(mean_chr_cov, chr_pos_depth_map);
     return mean_chr_cov;
 }
 
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 212343f0..9a705a9a 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -104,26 +104,14 @@ std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, u
     pos_start--;
     pos_end--;
 
-    // Ensure that the start position is not negative, and the end position is
-    // not larger than the chromosome length
-    if (pos_start < 0)
-    {
-        return "";
-    }
-    // if (pos_end >= (uint32_t)this->chr_to_seq[chr].length())
+    // Ensure that the end position is not larger than the chromosome length
     if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length())
     {
         return "";
     }
 
     uint32_t length = pos_end - pos_start + 1;
-    
-    // Get the sequence
     const std::string& sequence = this->chr_to_seq.at(chr);
-    // const std::string& sequence = this->chr_to_seq[chr];
-
-    // Get the substring
-    // std::string subsequence = sequence.substr(pos_start, length);
 
     // If the subsequence is empty, return empty string
     if (sequence.substr(pos_start, length).empty())
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 381d5ac5..952329d1 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -306,7 +306,15 @@ std::string InputData::getAlleleFreqFilepath(std::string chr) const
     {
         chr = chr.substr(3, chr.size() - 3);
     }
-    return this->pfb_filepaths.at(chr);
+
+    try
+    {
+        return this->pfb_filepaths.at(chr);
+    }
+    catch (const std::out_of_range& e)
+    {
+        return "";
+    }
 }
 
 void InputData::setThreadCount(int thread_count)
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index df54f4ef..61a236b4 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -485,8 +485,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // Load chromosome data for copy number predictions
     printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->input_data);
+    printMessage(chr + ": LENGTH: " + std::to_string(chr_len));
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
     double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map);
+    printMessage(chr + ": POSDEPTH: " + std::to_string(chr_pos_depth_map.size()));
     if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
         hts_idx_destroy(idx);
         bam_hdr_destroy(bamHdr);
@@ -529,7 +531,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         // std::cout << "Detecting copy number variants from split reads..." << std::endl;
         printMessage(chr + ": Split read SVs...");
         this->detectSVsFromSplitReads(sub_region, fp_in, idx, bamHdr, subregion_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
-        // this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
 
         // Merge the SV calls from the current region
         // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
@@ -689,31 +690,31 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
                     // Print error if the start position is greater than the end
                     // position
-                    if (supp_start+1 > supp_end+1) {
-                        printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1));
+                    if (supp_start > supp_end) {
+                        printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
                         continue;
                     }
 
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map);
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
                     }
 
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
-                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
                     if (supp_type == SVType::NEUTRAL) {
-                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
+                        addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
                         
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
-                        addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
+                        addSVCall(sv_calls, supp_start, supp_end, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
                     }
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
-                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1);
-                    addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
+                    addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
                 }
             }
         }
@@ -725,16 +726,16 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_region.start < largest_supp_region.start) {  // Primary before supp
-            boundary_left = primary_region.start + 1;
-            boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1;
-            gap_left = primary_region.end + 1;
-            gap_right = largest_supp_region.start + 1;
+            boundary_left = primary_region.start;
+            boundary_right = std::max(primary_region.end, largest_supp_region.end);
+            gap_left = primary_region.end;
+            gap_right = largest_supp_region.start;
             gap_exists = gap_left < gap_right;
         } else {
-            boundary_left = largest_supp_region.start + 1;
-            boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1;
-            gap_left = largest_supp_region.end + 1;
-            gap_right = primary_region.start + 1;
+            boundary_left = largest_supp_region.start;
+            boundary_right = std::max(primary_region.end, largest_supp_region.end);
+            gap_left = largest_supp_region.end;
+            gap_right = primary_region.start;
             gap_exists = gap_left < gap_right;
         }
         
@@ -966,21 +967,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
 void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const
 {
-    // Get the start and end read positions for the primary and supplementary
-    // alignments
-    // uint32_t primary_alignment_start = std::get<1>(primary_alignment);
-    // uint32_t primary_alignment_end = std::get<2>(primary_alignment);
-    // uint32_t supp_alignment_start = std::get<1>(supp_alignment);
-    // uint32_t supp_alignment_end = std::get<2>(supp_alignment);
-    // uint32_t primary_query_start = std::get<3>(primary_alignment);
-    // uint32_t primary_query_end = std::get<4>(primary_alignment);
-    // uint32_t supp_query_start = std::get<3>(supp_alignment);
-    // uint32_t supp_query_end = std::get<4>(supp_alignment);
-    // const std::vector<int>& primary_match_map = std::get<5>(primary_alignment);
-    // const std::vector<int>& supp_match_map = std::get<5>(supp_alignment);
 
     // Check for overlapping read alignments
-    // bool primary_before_supp = primary_query_start < supp_query_start;
     if (primary_mismatches.query_start < supp_mismatches.query_start) {
         // Primary before supplementary in the query
 
@@ -989,74 +977,47 @@ void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, Genom
             // Calculate the mismatch rates at the overlapping region
             double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches);
             double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches);
-            // uint32_t overlap_length = primary_query_end - supp_query_start +
-            // 1;
             hts_pos_t overlap_length = primary_mismatches.query_end - supp_mismatches.query_start + 1;
 
             // Trim the ailgnment with the higher mismatch rate
             if (primary_mismatch_rate > supp_mismatch_rate) {
                 // Trim the end of the primary alignment, ensuring that the new
                 // end is not less than the start
-                // if (primary_alignment_end > overlap_length &&
-                // (primary_alignment_end - overlap_length) >
-                // primary_alignment_start) {
                 if (primary_alignment.end > overlap_length && (primary_alignment.end - overlap_length) > primary_alignment.start) {
                     // Trim the end of the primary alignment
-                    // uint32_t new_end = primary_alignment_end - overlap_length;
-                    // std::get<2>(primary_alignment) = new_end;
                     primary_alignment.end = primary_alignment.end - overlap_length;
                 }
             } else {
                 // Trim the beginning of the supplementary alignment, ensuring
                 // that the new start is not greater than the end
-                // if (supp_alignment_start + overlap_length <
-                // supp_alignment_end) {
                 if (supp_alignment.start + overlap_length < supp_alignment.end) {
                     // Trim the beginning of the supplementary alignment
-                    // uint32_t new_start = supp_alignment_start + overlap_length;
-                    // std::get<1>(supp_alignment) = new_start;
                     supp_alignment.start = supp_alignment.start + overlap_length;
                 }
             }
         }
 
-    // } else if (supp_mismatches.query_end >= primary_mismatches.query_start) {
     } else {
         // Supplementary before primary in the query
         if (primary_mismatches.query_start <= supp_mismatches.query_end) {
             // Calculate the mismatch rates at the overlapping region
-            // double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end);
-            // double supp_mismatch_rate =
-            // this->calculateMismatchRate(supp_match_map, primary_query_start,
-            // supp_query_end);
             double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches);
             double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches);
-            // hts_pos_t overlap_length = supp_query_end - primary_query_start +
-            // 1;
             hts_pos_t overlap_length = supp_mismatches.query_end - primary_mismatches.query_start + 1;
 
             // Trim the ailgnment with the higher mismatch rate
             if (supp_mismatch_rate > primary_mismatch_rate) {
                 // Trim the end of the supplementary alignment, ensuring that
                 // the new end is not less than the start
-                // if (supp_alignment_end > overlap_length &&
-                // (supp_alignment_end - overlap_length) > supp_alignment_start)
-                // {
                 if (supp_alignment.end > overlap_length && (supp_alignment.end - overlap_length) > supp_alignment.start) {
                     // Trim the end of the supplementary alignment
-                    // uint32_t new_end = supp_alignment_end - overlap_length;
-                    // std::get<2>(supp_alignment) = new_end;
                     supp_alignment.end = supp_alignment.end - overlap_length;
                 }
             } else {
                 // Trim the beginning of the primary alignment, ensuring that
                 // the new start is not greater than the end
-                // if (primary_alignment_start + overlap_length <
-                // primary_alignment_end) {
                 if (primary_alignment.start + overlap_length < primary_alignment.end) {
                     // Trim the beginning of the primary alignment
-                    // uint32_t new_start = primary_alignment_start + overlap_length;
-                    // std::get<1>(primary_alignment) = new_start;
                     primary_alignment.start = primary_alignment.start + overlap_length;
                 }
             }
@@ -1077,7 +1038,7 @@ int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uin
         // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
         read_depth += pos_depth_map.at(end);
     } catch (const std::out_of_range& e) {
-        std::cerr << "Warning: End position " << end << " not found in depth map." << std::endl;
+        std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl;
     }
     // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
     return read_depth;

From b35c1077fd2ee4aa67d84a6ebda04b8f47307ef5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 19 Dec 2024 10:31:24 -0500
Subject: [PATCH 056/134] thread safe ref genome

---
 Makefile-cpp             |   7 +
 include/cnv_caller.h     |  15 ++-
 include/contextsv.h      |   8 +-
 include/fasta_query.h    |   2 +
 include/input_data.h     |  23 +---
 include/sv_caller.h      |  21 +--
 include/swig_interface.h |   2 +-
 include/utils.h          |  50 +++++++
 src/cnv_caller.cpp       | 176 +++++++++++--------------
 src/contextsv.cpp        |  10 +-
 src/fasta_query.cpp      |   7 +-
 src/input_data.cpp       |  35 ++---
 src/sv_caller.cpp        | 273 +++++++++++++++++++--------------------
 src/sv_object.cpp        |  15 ---
 src/swig_interface.cpp   |   7 +-
 15 files changed, 321 insertions(+), 330 deletions(-)

diff --git a/Makefile-cpp b/Makefile-cpp
index 55630e9b..3babecb3 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -22,6 +22,13 @@ CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedan
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 LDLIBS := -lhts  # Link with libhts.a or libhts.so
 
+# Enable thread sanitizer (TSan)
+# ifeq ($(TSAN),1)
+# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g
+# CXXFLAGS += $(TSAN_FLAGS)
+# LDFLAGS += $(TSAN_FLAGS)
+# endif
+
 # Sources and Output
 # SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
 SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp))  # Filter out the SWIG wrapper from the sources
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 12bbce80..0417b8ae 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -45,7 +45,6 @@ struct SNPData {
 // CNVCaller: Detect CNVs and return the state sequence by SNP position
 class CNVCaller {
     private:
-        const InputData& input_data;
         mutable std::mutex snp_file_mtx;  // SNP file mutex
         mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
         mutable std::mutex bam_file_mtx;  // BAM file mutex
@@ -76,26 +75,28 @@ class CNVCaller {
         void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
 
         // Query a region for SNPs and return the SNP data
-        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const;
+        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const;
 
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
 
     public:
-        explicit CNVCaller(const InputData& input_data);
+        // explicit CNVCaller(const InputData& input_data);
+        // Constructor with no arguments
+        CNVCaller() = default;
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map) const;
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
+        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
-        double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map);
+        double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const;
 
         void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp) const;
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const;
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
diff --git a/include/contextsv.h b/include/contextsv.h
index 97d7bce9..890748da 100644
--- a/include/contextsv.h
+++ b/include/contextsv.h
@@ -10,14 +10,12 @@
 
 
 class ContextSV {
-	private:
-		InputData& input_data;
-
 	public:
-		explicit ContextSV(InputData& input_data);
+		// explicit ContextSV(InputData& input_data);
+		ContextSV() = default;
 
 		// Entry point
-		int run();
+		int run(const InputData& input_data) const;
 };
 
 #endif  // CONTEXTSV_H
diff --git a/include/fasta_query.h b/include/fasta_query.h
index b130117a..75259441 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -8,6 +8,7 @@
 #include <map>
 #include <unordered_map>
 #include <vector>
+#include <mutex>
 /// @endcond
 
 class ReferenceGenome {
@@ -15,6 +16,7 @@ class ReferenceGenome {
         std::string fasta_filepath;
         std::vector<std::string> chromosomes;
         std::unordered_map<std::string, std::string> chr_to_seq;
+        mutable std::mutex mtx;
 
     public:
         int setFilepath(std::string fasta_filepath);
diff --git a/include/input_data.h b/include/input_data.h
index 43a9790b..72bca5af 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -36,26 +36,14 @@ class InputData {
         std::string getHMMFilepath() const;
 
         // Set the filepath to the reference genome FASTA file.
-		void setRefGenome(std::string fasta_filepath);
-
-        // Return a reference to the ReferenceGenome object.
-        const ReferenceGenome& getRefGenome() const;
-        std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
-
-        // Get the chromosomes in the reference genome.
-        std::vector<std::string> getRefGenomeChromosomes() const;
-
-        // Get a chromosome's length in the reference genome.
-        uint32_t getRefGenomeChromosomeLength(std::string chr) const;
+		void setRefGenome(std::string filepath);
+        std::string getRefGenome() const;
 
         // Set the filepath to the text file containing the locations of the
         // VCF files with population frequencies for each chromosome.
         void setAlleleFreqFilepaths(std::string filepath);
         std::string getAlleleFreqFilepath(std::string chr) const;
 
-        // Get the population frequency map.
-        // PFBMap getPFBMap();
-
         // Set the filepath to the VCF file with SNP calls used for CNV
         // detection with the HMM.
         void setSNPFilepath(std::string filepath);
@@ -71,11 +59,12 @@ class InputData {
 
         // Set the minimum CNV length to use for copy number predictions.
         void setMinCNVLength(int min_cnv_length);
-        int getMinCNVLength() const;
+        uint32_t getMinCNVLength() const;
 
         // Set the chromosome to analyze.
         void setChromosome(std::string chr);
         std::string getChromosome() const;
+        bool isSingleChr() const;
 
         // Set the region to analyze.
         void setRegion(std::string region);
@@ -106,10 +95,9 @@ class InputData {
         std::string snp_vcf_filepath;
         std::string ethnicity;
         std::unordered_map<std::string, std::string> pfb_filepaths;  // Map of population frequency VCF filepaths by chromosome
-        ReferenceGenome fasta_query;
         std::string output_dir;
         int sample_size;
-        int min_cnv_length;
+        uint32_t min_cnv_length;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
         bool region_set;  // True if a region is set
@@ -118,6 +106,7 @@ class InputData {
         std::string cnv_filepath;
         bool verbose;  // True if verbose output is enabled
         bool save_cnv_data;  // True if SNP CNV regions should be extended around SV breakpoints, and saved to a TSV file (Large performance hit)
+        bool single_chr;
 };
 
 #endif // INPUT_DATA_H
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 70be7e33..9d967510 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -33,46 +33,47 @@ class SVCaller {
     private:
         int min_sv_size = 50;       // Minimum SV size to be considered
         int min_mapq = 20;          // Minimum mapping quality to be considered
-        const InputData& input_data;
 
-        void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const;
+        // void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const;
 
         void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const;
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map);
+        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const;
 
-        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls);
+        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map);
+        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const;
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const;
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map);
+        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
         double calculateMismatchRate(const MismatchData& mismatch_data) const;
 
-        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls);
+        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const;
 
         void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const;
 
         // Calculate the read depth (INFO/DP) for a region
-        int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
+        int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end) const;
 
     public:
-        explicit SVCaller(InputData& input_data);
+        // explicit SVCaller(InputData& input_data);
+        // Constructor with no arguments
+        SVCaller() = default;
 
         // Detect SVs and predict SV type from long read alignments and CNV calls
-        void run();
+        void run(const InputData& input_data);
 };
 
 #endif // SV_CALLER_H
diff --git a/include/swig_interface.h b/include/swig_interface.h
index c7f163ae..578f4653 100644
--- a/include/swig_interface.h
+++ b/include/swig_interface.h
@@ -12,6 +12,6 @@
 #include <string>
 /// @endcond
 
-int run(InputData input_data);
+int run(const InputData& input_data);
 
 #endif // SWIG_INTERFACE_H
diff --git a/include/utils.h b/include/utils.h
index 7311efbc..2fb4a3b1 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -3,12 +3,62 @@
 #ifndef UTILS_H
 #define UTILS_H
 
+#include <htslib/sam.h>
+#include <htslib/synced_bcf_reader.h>
+
 /// @cond
 #include <string>
 #include <mutex>
 #include <chrono>
 /// @endcond
 
+
+// Guard for BAM handles: its destructor destroys the index and header and closes the file, skipping null pointers
+struct BamFileGuard {
+    samFile* fp_in;  // File handle; passed to sam_close() by the destructor
+    hts_idx_t* idx;  // Index; passed to hts_idx_destroy() by the destructor
+    bam_hdr_t* bamHdr;  // Header; passed to bam_hdr_destroy() by the destructor
+
+    BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr)
+        : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {}
+
+    ~BamFileGuard() {  // Releases in this order: index, header, then the file itself
+        if (idx) {
+            hts_idx_destroy(idx);
+        }
+        if (bamHdr) {
+            bam_hdr_destroy(bamHdr);
+        }
+        if (fp_in) {
+            sam_close(fp_in);
+        }
+    }
+
+    BamFileGuard(const BamFileGuard&) = delete;  // Non-copyable: a copy would release the same handles a second time
+    BamFileGuard& operator=(const BamFileGuard&) = delete;  // Non-assignable, for the same reason
+};
+
+// Guard for BCF handles: its destructor destroys the header and then the reader, skipping null pointers
+struct BcfFileGuard {
+    bcf_srs_t* reader;  // Synced reader; passed to bcf_sr_destroy() by the destructor
+    bcf_hdr_t* hdr;  // Header; passed to bcf_hdr_destroy(). NOTE(review): assumes hdr is not owned by reader - confirm at call sites
+
+    BcfFileGuard(bcf_srs_t* reader, bcf_hdr_t* hdr)
+        : reader(reader), hdr(hdr) {}
+
+    ~BcfFileGuard() {  // Releases the header first, then the reader
+        if (hdr) {
+            bcf_hdr_destroy(hdr);
+        }
+        if (reader) {
+            bcf_sr_destroy(reader);
+        }
+    }
+
+    BcfFileGuard(const BcfFileGuard&) = delete;  // Non-copyable: a copy would release the same handles a second time
+    BcfFileGuard& operator=(const BcfFileGuard&) = delete;  // Non-assignable, for the same reason
+};
+
 // Print the progress of a task
 void printProgress(int progress, int total);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 20e7189f..cc30ccae 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -34,10 +34,6 @@
 
 using namespace sv_types;
 
-CNVCaller::CNVCaller(const InputData& input_data)
-    : input_data(input_data)  // Initialize the input data
-{
-}
 
 // Function to call the Viterbi algorithm for the CHMM
 void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const
@@ -52,25 +48,22 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
 }
 
 // Function to obtain SNP information for a region
-void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const
+void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const
 {
-    // uint32_t window_size = (uint32_t)this->input_data.getWindowSize();
-
     // Initialize the SNP data with default values and sample size length
-    int sample_size = this->input_data.getSampleSize();
+    int sample_size = input_data.getSampleSize();
     int region_length = (int) (end_pos - start_pos + 1);
     if (region_length < sample_size)
     {
         sample_size = region_length;
     }
 
-    // std::set<uint32_t> snp_pos(sample_size);
     std::vector<uint32_t> snp_pos(sample_size, 0);
     std::vector<double> snp_baf(sample_size, -1.0);
     std::vector<double> snp_pfb(sample_size, 0.5);
     std::vector<double> snp_log2_cov(sample_size, 0.0);
     std::vector<bool> is_snp(sample_size, false);
-    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp);
+    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
@@ -84,7 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map) const
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Check that the start position is less than the end position
     if (start_pos >= end_pos)
@@ -97,7 +90,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // Only extend the region if "save CNV data" is enabled
     uint32_t snp_start_pos = start_pos;
     uint32_t snp_end_pos = end_pos;
-    if (this->input_data.getSaveCNVData())
+    if (input_data.getSaveCNVData())
     {
         uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
         snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
@@ -106,7 +99,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Query the SNP region for the SV candidate
     SNPData snp_data;
-    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data);
+    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
     // Run the Viterbi algorithm
     std::pair<std::vector<int>, double> prediction;
@@ -160,10 +153,12 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Save the SV calls as a TSV file if enabled
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
-    if (this->input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000)
+    // if (save_cnv_data && copy_number_change && (end_pos - start_pos) > 10000)
+    if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000)
     {
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
-        std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
+        const std::string output_dir = input_data.getOutputDir();
+        std::string sv_filename = output_dir + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
         printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "...");
         this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
     }
@@ -172,7 +167,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall> &sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Map with counts for each CNV type
     std::map<int, int> cnv_type_counts;
@@ -182,7 +177,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
     }
     
     // Loop through each SV candidate and predict the copy number state
-    int min_length = this->input_data.getMinCNVLength();
     for (auto& sv_call : sv_candidates)
     {
 
@@ -193,31 +187,23 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         // Error if start > end
         if (start_pos >= end_pos)
         {
-        	std::cerr << "Position error for CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl;
+            printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         	continue;
         }
 
         // Skip if not the minimum length for CNV predictions
-        if ((end_pos - start_pos) < (uint32_t) min_length)
+        if ((end_pos - start_pos) < input_data.getMinCNVLength())
         {
             continue;
         }
 
         // Only extend the region if "save CNV data" is enabled
-        uint32_t snp_start_pos = start_pos;
-        uint32_t snp_end_pos = end_pos;
-        if (this->input_data.getSaveCNVData())
-        {
-            uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-            snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
-            snp_end_pos = end_pos + sv_half_length;
-        }
         SNPData snp_data;
-        this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data);
+        this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
         // Run the Viterbi algorithm
         if (snp_data.pos.size() == 0) {
-        	std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl;
+            printError("ERROR: No SNP data found for Viterbi algorithm for CIGAR SV at " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         	continue;
         }
         
@@ -260,13 +246,14 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Update the SV calls with the CNV type and genotype
         SVType updated_sv_type = getSVTypeFromCNState(max_state);
-        std::string genotype = cnv_genotype_map[max_state];
+        std::string genotype = cnv_genotype_map.at(max_state);
 
         // Determine the SV calling method used to call the SV
         std::string data_type;
         data_type = "HMM";
 
         // Update the SV genotype if known
+        // printMessage("Updating SV call for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
         if (updated_sv_type != SVType::UNKNOWN)
         {
             sv_call.genotype = genotype;
@@ -280,20 +267,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             std::string sv_type_str = getSVTypeString(updated_sv_type);
             sv_call.sv_type = sv_type_str;
         }
-
-        // Save the SV calls as a TSV file if enabled, if the SV type is
-        // known, and the length is greater than 10 kb
-        if (this->input_data.getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000)
-        {
-            // Add the state sequence to the SNP data (avoid copying the data)
-            snp_data.state_sequence = std::move(state_sequence);
-
-            // Save the SV calls as a TSV file
-            std::string cnv_type_str = getSVTypeString(updated_sv_type);
-            std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv";
-            printMessage("Saving SV CIGAR copy number predictions to " + sv_filename);
-            this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
-        }
     }
 }
 
@@ -323,12 +296,11 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 }
 
 // Calculate the mean chromosome coverage
-double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map)
+double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const
 {
     {
         // Open the BAM file
         std::lock_guard<std::mutex> lock(this->bam_file_mtx);  // Lock the BAM file
-        std::string bam_filepath = this->input_data.getShortReadBam();
         samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
         if (!bam_file)
         {
@@ -337,11 +309,10 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
         }
 
         // Enable multi-threading if running on a single chromosome
-        if (this->input_data.getChromosome() != "")
+        if (single_chr)
         {
-            hts_set_threads(bam_file, this->input_data.getThreadCount());
+            hts_set_threads(bam_file, thread_count);
         }
-        // hts_set_threads(bam_file, this->input_data.getThreadCount());
 
         // Read the header
         bam_hdr_t *bam_header = sam_hdr_read(bam_file);
@@ -361,14 +332,12 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             printError("ERROR: Could not load index for BAM file: " + bam_filepath);
             return 0.0;
         }
+        BamFileGuard bam_guard(bam_file, bam_index, bam_header);  // Guard to close the BAM file
 
         // Create an iterator for the chromosome
         hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str());
         if (!bam_iter)
         {
-            hts_idx_destroy(bam_index);
-            bam_hdr_destroy(bam_header);
-            sam_close(bam_file);
             printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file.");
             return 0.0;
         }
@@ -378,9 +347,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
         if (!bam_record)
         {
             hts_itr_destroy(bam_iter);
-            hts_idx_destroy(bam_index);
-            bam_hdr_destroy(bam_header);
-            sam_close(bam_file);
             printError("ERROR: Could not initialize BAM record.");
             return 0.0;
         }
@@ -432,9 +398,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
         // Clean up
         bam_destroy1(bam_record);
         hts_itr_destroy(bam_iter);
-        hts_idx_destroy(bam_index);
-        bam_hdr_destroy(bam_header);
-        sam_close(bam_file);
     }
 
     // Calculate the mean chromosome coverage for positions with non-zero depth
@@ -481,11 +444,10 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
     }
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp) const
-{    
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
+{
     // --------- SNP file ---------
-    // Get the SNP file path
-    std::string snp_filepath = this->input_data.getSNPFilepath();
+    const std::string snp_filepath = input_data.getSNPFilepath();
     if (snp_filepath.empty())
     {
         printError("ERROR: SNP file path is empty.");
@@ -502,10 +464,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     snp_reader->require_index = 1;
 
     // Set multi-threading if running on a single chromosome
-    if (this->input_data.getChromosome() != "")
+    int thread_count = input_data.getThreadCount();
+    if (input_data.isSingleChr())
     {
-        int thread_count = this->input_data.getThreadCount() - 1;  // Leave one thread for the main thread
-        printMessage("Setting SNP reader threads to " + std::to_string(thread_count / 2));
+        printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2)));
         bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2));
     }
 
@@ -518,35 +480,38 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
 
     // Get the header
-    bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0);
-    if (!snp_header)
-    {
-        bcf_sr_destroy(snp_reader);
-        printError("ERROR: Could not get header for SNP reader.");
-        return;
-    }
+    // bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0);
+    // if (!snp_header)
+    // {
+    //     bcf_sr_destroy(snp_reader);
+    //     printError("ERROR: Could not get header for SNP reader.");
+    //     return;
+    // }
+    // BcfFileGuard snp_guard(snp_reader, snp_header);  // Guard to close the SNP file
 
     // --------- Population allele frequency file ---------
 
     // Get the population allele frequency file path
     bool use_pfb = true;
-    std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr);
+    const std::string pfb_filepath = input_data.getAlleleFreqFilepath(chr);
     if (pfb_filepath.empty())
     {
         use_pfb = false;
-        printMessage("WARNING: No population allele frequency file provided for chromosome " + chr);
+        // printMessage("WARNING: No population allele frequency file provided for chromosome " + chr);
     }
     
     bcf_srs_t *pfb_reader = bcf_sr_init();
     std::string chr_gnomad;
     std::string AF_key;
+    // BcfFileGuard pfb_guard(nullptr, nullptr);  // Guard to close the population allele frequency file
     if (use_pfb)
     {
         // Determine the ethnicity-specific allele frequency key
         AF_key = "AF";
-        if (this->input_data.getEthnicity() != "")
+        const std::string eth = input_data.getEthnicity();
+        if (eth != "")
         {
-            AF_key += "_" + this->input_data.getEthnicity();
+            AF_key += "_" + eth;
         }
 
         // Check if the filepath uses the 'chr' prefix notations based on the
@@ -571,38 +536,43 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         // Initialize the population allele frequency reader
         if (!pfb_reader)
         {
-            bcf_sr_destroy(snp_reader);
             printError("ERROR: Could not initialize population allele frequency reader.");
+
+            // Clean up
+            // bcf_hdr_destroy(snp_header);
+            bcf_sr_destroy(snp_reader);
             return;
         }
         pfb_reader->require_index = 1;
 
         // Set multi-threading if running on a single chromosome
-        if (this->input_data.getChromosome() != "")
+        if (input_data.isSingleChr())
         {
-            int thread_count = this->input_data.getThreadCount() - 1;  // Leave one thread for the main thread
-            printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2));
+            printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2)));
             bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2));
         }
 
         // Add the population allele frequency file to the reader
         if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
         {
-            bcf_sr_destroy(snp_reader);
-            bcf_sr_destroy(pfb_reader);
             printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
-            return;
-        }
 
-        // Get the header
-        bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
-        if (!pfb_header)
-        {
-            bcf_sr_destroy(snp_reader);
+            // Clean up
             bcf_sr_destroy(pfb_reader);
-            printError("ERROR: Could not get header for population allele frequency reader.");
+            // bcf_hdr_destroy(snp_header);
+            bcf_sr_destroy(snp_reader);
             return;
         }
+
+        // Get the header
+        // bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
+        // if (!pfb_header)
+        // {
+        //     bcf_sr_destroy(pfb_reader);
+        //     printError("ERROR: Could not get header for population allele frequency reader.");
+        //     return;
+        // }
+        // pfb_guard = BcfFileGuard(pfb_reader, pfb_header);  // Guard to close the population allele frequency file
     }
 
     // Split the region into samples
@@ -617,7 +587,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     {
         current_region++;
         // Lock during reading
-        // std::lock_guard<std::mutex> lock(this->snp_file_mtx);
+        std::lock_guard<std::mutex> lock(this->snp_file_mtx);
 
         // Read the SNP data ----------------------------------------------
 
@@ -625,9 +595,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         std::string region_str = region_chunks[i];
         if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
         {
-            bcf_sr_destroy(snp_reader);
             printError("ERROR: Could not set region for SNP reader: " + region_str);
-            return;
+            break;
         }
 
         // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp...");
@@ -659,7 +628,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 int32_t *dp = 0;
                 // int dp_values[2];
                 int dp_count = 0;
-                int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count);
+                // int dp_ret = bcf_get_format_int32(snp_header, snp_record,
+                // "DP", &dp, &dp_count);
+                int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count);
                 if (dp_ret < 0 || dp[0] <= 10)
                 {
                     continue;
@@ -667,7 +638,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 free(dp);
 
                 // Skip if the SNP does not pass the filter
-                if (bcf_has_filter(snp_header, snp_record, const_cast<char*>("PASS")) != 1)
+                // if (bcf_has_filter(snp_header, snp_record,
+                // const_cast<char*>("PASS")) != 1)
+                if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast<char*>("PASS")) != 1)
                 {
                     continue;
                 }
@@ -676,7 +649,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 int32_t *ad = 0;
                 // int ad_values[2];
                 int ad_count = 0;
-                int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count);
+                // int ad_ret = bcf_get_format_int32(snp_header, snp_record,
+                // "AD", &ad, &ad_count);
+                int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count);
                 // int ad_ret = bcf_get_format_int32(snp_header, snp_record,
                 // "AD", &ad, &ad_count);
                 if (ad_ret < 0 || ad_count < 2)
@@ -719,10 +694,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
             if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
             {
-                bcf_sr_destroy(snp_reader);
-                bcf_sr_destroy(pfb_reader);
                 printError("ERROR: Could not set region for population allele frequency reader: " + region_str);
-                return;
+                break;
             }
 
             // Find the SNP position in the population allele frequency file
@@ -732,9 +705,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 {
                     continue;
                 }
-                // pfb_record = bcf_sr_get_line(pfb_reader, 0);
                 bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
-                // Do something with the record
                 if (pfb_record)
                 {
                     // Skip if not a SNP
@@ -778,6 +749,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             }
         }
     }
+
+    // Clean up
+    // bcf_hdr_destroy(snp_header);
     bcf_sr_destroy(snp_reader);
     bcf_sr_destroy(pfb_reader);
 }
diff --git a/src/contextsv.cpp b/src/contextsv.cpp
index 01329c2c..3957fe95 100644
--- a/src/contextsv.cpp
+++ b/src/contextsv.cpp
@@ -12,15 +12,11 @@
 #include "utils.h"
 /// @endcond
 
-ContextSV::ContextSV(InputData& input_data)
-    : input_data(input_data)  // Initialize the input data
-{
-}
 
-int ContextSV::run()
+int ContextSV::run(const InputData& input_data) const
 {
-    SVCaller sv_caller(this->input_data); 
-    sv_caller.run();
+    SVCaller sv_caller;
+    sv_caller.run(input_data);
 
     return 0;
 }
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 9a705a9a..3cde27d3 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -99,7 +99,9 @@ std::string ReferenceGenome::getFilepath() const
 
 // Function to get the reference sequence at a given position range
 std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
-{    
+{
+    std::lock_guard<std::mutex> lock(this->mtx);
+    
     // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
     pos_start--;
     pos_end--;
@@ -125,6 +127,7 @@ std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, u
 // Function to get the chromosome contig lengths in VCF header format
 std::string ReferenceGenome::getContigHeader() const
 {
+    std::lock_guard<std::mutex> lock(this->mtx);
     std::string contig_header = "";
 
     // Sort the chromosomes
@@ -151,10 +154,12 @@ std::string ReferenceGenome::getContigHeader() const
 
 std::vector<std::string> ReferenceGenome::getChromosomes() const
 {
+    std::lock_guard<std::mutex> lock(this->mtx);
     return this->chromosomes;
 }
 
 uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
 {
+    std::lock_guard<std::mutex> lock(this->mtx);
     return this->chr_to_seq.at(chr).length();
 }
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 952329d1..a24efb9e 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -30,6 +30,7 @@ InputData::InputData()
     this->hmm_filepath = "data/wgs.hmm";
     this->verbose = false;
     this->save_cnv_data = false;
+    this->single_chr = false;
 }
 
 std::string InputData::getShortReadBam() const
@@ -84,30 +85,14 @@ void InputData::setLongReadBam(std::string filepath)
     }
 }
 
-void InputData::setRefGenome(std::string fasta_filepath)
+void InputData::setRefGenome(std::string filepath)
 {
-    // Set the reference genome
-    this->fasta_query.setFilepath(fasta_filepath);
+    this->ref_filepath = filepath;
 }
 
-const ReferenceGenome& InputData::getRefGenome() const
+std::string InputData::getRefGenome() const
 {
-    return this->fasta_query;
-}
-
-std::string InputData::queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
-{
-    return this->fasta_query.query(chr, pos_start, pos_end);
-}
-
-std::vector<std::string> InputData::getRefGenomeChromosomes() const
-{
-    return this->fasta_query.getChromosomes();
-}
-
-uint32_t InputData::getRefGenomeChromosomeLength(std::string chr) const
-{
-    return this->fasta_query.getChromosomeLength(chr);
+    return this->ref_filepath;
 }
 
 std::string InputData::getOutputDir() const
@@ -154,19 +139,20 @@ void InputData::setEthnicity(std::string ethnicity)
     this->ethnicity = ethnicity;
 }
 
-int InputData::getMinCNVLength() const
+uint32_t InputData::getMinCNVLength() const
 {
     return this->min_cnv_length;
 }
 
 void InputData::setMinCNVLength(int min_cnv_length)
 {
-    this->min_cnv_length = min_cnv_length;
+    this->min_cnv_length = (uint32_t) min_cnv_length;
 }
 
 void InputData::setChromosome(std::string chr)
 {
     this->chr = chr;
+    this->single_chr = true;
 }
 
 std::string InputData::getChromosome() const
@@ -174,6 +160,11 @@ std::string InputData::getChromosome() const
     return this->chr;
 }
 
+bool InputData::isSingleChr() const
+{
+    return this->single_chr;
+}
+
 void InputData::setRegion(std::string region)
 {
     // Check if the region is valid
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 61a236b4..6fafaf31 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -24,14 +24,11 @@
 #include "utils.h"
 #include "sv_types.h"
 #include "version.h"
+#include "fasta_query.h"
 /// @endcond
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
 
-SVCaller::SVCaller(InputData &input_data)
-    : input_data(input_data)  // Initialize the input data
-{
-}
 
 int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const
 {
@@ -44,18 +41,12 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         printError("ERROR: failed to initialize BAM record");
         return;
     }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
     if (!itr) {
         bam_destroy1(bam1);
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         printError("ERROR: failed to query region " + region);
         return;
     }
@@ -98,6 +89,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
 }
 
+/*
 void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary) const
 {
     // Create a read and iterator for the region
@@ -192,7 +184,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
 
             // Get the corresponding reference sequence
             int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-            // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1));
             std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
 
             // Check that the two sequence lengths are equal
@@ -249,24 +240,19 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
     mismatch_data.query_end = query_end;
     mismatch_data.match_map = std::move(match_map);
 }
+*/
 
-void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map)
+void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         printError("ERROR: failed to initialize BAM record");
         return;
     }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
     if (!itr) {
         bam_destroy1(bam1);
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         printError("ERROR: failed to query region " + region);
         return;
     }
@@ -281,7 +267,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
 
         // Process the alignment
         bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
-        this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map);
+        this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome);
     }
 
     // Clean up the iterator and alignment
@@ -318,7 +304,7 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const
     return mismatch_rate;
 }
 
-void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map)
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
@@ -359,7 +345,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
 
                     // Get the string for the window (1-based coordinates)
                     ins_ref_pos = j + 1;
-                    std::string window_str = this->input_data.queryRefGenome(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
+                    // std::string window_str =
+                    // this->input_data.queryRefGenome(chr, ins_ref_pos,
+                    // ins_ref_pos + op_len - 1);
+                    std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
 
                     // Continue if the window string is empty (out-of-range)
                     if (window_str == "") {
@@ -392,11 +381,24 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
 
                 // Add to SV calls (1-based) with the appropriate SV type
                 ref_pos = pos+1;
-                ref_end = ref_pos + op_len -1;
-                int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
+
+                // For insertions, the reference end position is the same as the
+                // reference position
+                // For duplications, the reference end position is the same as
+                // the reference position plus the length of the insertion
+                ref_end = ref_pos + op_len - 1;
                 if (is_duplication) {
+                    // ref_end = std::min(ref_pos + op_len - 1,
+                    // ref_genome.getChromosomeLength(chr));
+                    uint32_t bp1 = ref_pos;
+                    uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
+                    int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
                     addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
                 } else {
+                    // ref_end = ref_pos;
+                    uint32_t bp1 = std::max(1, (int)ref_pos - 1);
+                    uint32_t bp2 = ref_pos;
+                    int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
                     addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
                 }
             }
@@ -409,6 +411,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
             {
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
+                // printMessage("Test2");
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
                 addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
             }
@@ -427,10 +430,13 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     }
 }
 
-void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls)
+void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome)
 {
+    int filter_threshold = 4;  // Minimum number of supporting reads for an SV call
+    bool single_chr = input_data.getChromosome() != "";
+
     // Open the BAM file
-    std::string bam_filepath = this->input_data.getLongReadBam();
+    std::string bam_filepath = input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
     if (!fp_in) {
         printError("ERROR: failed to open " + bam_filepath);
@@ -453,130 +459,91 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         printError("ERROR: failed to load index for " + bam_filepath);
         return;
     }
+    BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
 
-    // Split the chromosome into chunks for memory efficiency
-    std::vector<std::string> region_chunks;
-    //int chunk_count = 100;
-    int chunk_count = 1;
-    uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr);
-    if (this->input_data.isRegionSet()) {
+    // Set the region to process
+    std::string region = chr;
+    uint32_t chr_len = ref_genome.getChromosomeLength(chr);
+    if (input_data.isRegionSet()) {
 
         // Use one chunk for the specified region
-        std::pair<int32_t, int32_t> region = this->input_data.getRegion();
-        int region_start = region.first;
-        int region_end = region.second;
-        std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
-        region_chunks.push_back(chunk);
+        std::pair<int32_t, int32_t> region_data = input_data.getRegion();
+        int region_start = region_data.first;
+        int region_end = region_data.second;
+        region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
         
-    } else {
-        int chunk_size = std::ceil((double)chr_len / chunk_count);
-        for (int i = 0; i < chunk_count; i++) {
-            int start = i * chunk_size + 1;  // 1-based
-            int end = start + chunk_size;
-            if (i == chunk_count - 1) {
-                end = chr_len;
-            }
-            std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end);
-            region_chunks.push_back(chunk);
-        }
-        printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "...");
     }
 
     // Load chromosome data for copy number predictions
     printMessage(chr + ": Loading chromosome data...");
-    CNVCaller cnv_caller(this->input_data);
-    printMessage(chr + ": LENGTH: " + std::to_string(chr_len));
+    CNVCaller cnv_caller;
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
-    double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map);
-    printMessage(chr + ": POSDEPTH: " + std::to_string(chr_pos_depth_map.size()));
+    int thread_count = input_data.getThreadCount();
+    double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count, single_chr);
     if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         return;
     }
 
-    // Process each chunk one at a time
-    // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl;
-    int region_count = region_chunks.size();
-    int current_region = 0;
-    int filter_threshold = 4;
-    for (const auto& sub_region : region_chunks) {
-        current_region++;
-
-        // Detect SVs from the CIGAR strings
-        printMessage(chr + ": CIGAR SVs...");
-        std::vector<SVCall> subregion_sv_calls;        
-        this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, chr_pos_depth_map);
-
-        // std::set<SVCall>& subregion_sv_calls = std::get<0>(region_data);
-        // PrimaryMap& primary_map = std::get<1>(region_data);
-        // SuppMap& supp_map = std::get<2>(region_data);
-        // std::cout << " CIGAR SV calls from " << sub_region << "..." << std::endl;
-        printMessage(chr + ": Merging CIGAR...");
-        filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
-        mergeSVs(subregion_sv_calls);
-        int region_sv_count = getSVCount(subregion_sv_calls);
-        // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
-
-        // Run copy number variant predictions on the SVs detected from the
-        // CIGAR string, using a minimum CNV length threshold
-        if (region_sv_count > 0) {
-            // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl;
-            printMessage(chr + ": CIGAR predictions...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map);
-        }
-
-        // Run split-read SV and copy number variant predictions
-        // std::cout << "Detecting copy number variants from split reads..." << std::endl;
-        printMessage(chr + ": Split read SVs...");
-        this->detectSVsFromSplitReads(sub_region, fp_in, idx, bamHdr, subregion_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map);
-
-        // Merge the SV calls from the current region
-        // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl;
-        printMessage(chr + ": Merging split reads...");
-        filterSVsWithLowSupport(subregion_sv_calls, filter_threshold);
-        mergeSVs(subregion_sv_calls);
+    // Detect SVs from the CIGAR strings
+    printMessage(chr + ": CIGAR SVs...");
+    this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
+
+    /*
+    printMessage(chr + ": Merging CIGAR...");
+    filterSVsWithLowSupport(chr_sv_calls, filter_threshold);
+    mergeSVs(chr_sv_calls);
+    int region_sv_count = getSVCount(chr_sv_calls);
+    // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+
+    // Run copy number variant predictions on the SVs detected from the
+    // CIGAR string, using a minimum CNV length threshold
+    if (region_sv_count > 0) {
+        printMessage(chr + ": CIGAR predictions...");
+        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+    }
 
-        // Combine the SV calls from the current region
-        // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl;
-        printMessage(chr + ": Concatenating calls...");
-        concatenateSVCalls(combined_sv_calls, subregion_sv_calls);
+    // Run split-read SV and copy number variant predictions
+    printMessage(chr + ": Split read SVs...");
+    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
 
-        printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "...");
-    }
+    // Merge the SV calls from the current region
+    printMessage(chr + ": Merging split reads...");
+    filterSVsWithLowSupport(chr_sv_calls, filter_threshold);
+    mergeSVs(chr_sv_calls);
 
     // Run a final merge on the combined SV calls
     printMessage(chr + ": Merging final calls...");
-    mergeSVs(combined_sv_calls);
-    // filterSVsWithLowSupport(combined_sv_calls, filter_threshold);
-
-    // Clean up the BAM file, header, and index
-    hts_idx_destroy(idx);
-    bam_hdr_destroy(bamHdr);
-    sam_close(fp_in);
+    mergeSVs(chr_sv_calls);
+    */
+    printMessage("Completed chromosome " + chr);
 }
 
-void SVCaller::run()
+void SVCaller::run(const InputData& input_data)
 {
+    // Set up the reference genome
+    printMessage("Loading the reference genome...");
+    const std::string ref_filepath = input_data.getRefGenome();
+    ReferenceGenome ref_genome;
+    ref_genome.setFilepath(ref_filepath);
+
     // Get the chromosomes
     std::vector<std::string> chromosomes;
-    if (this->input_data.getChromosome() != "") {
-        chromosomes.push_back(this->input_data.getChromosome());
+    if (input_data.isSingleChr()) {
+        chromosomes.push_back(input_data.getChromosome());
     } else {
-        chromosomes = this->input_data.getRefGenomeChromosomes();
+        chromosomes = ref_genome.getChromosomes();
     }
     
     // Read the HMM from the file
-    std::string hmm_filepath = this->input_data.getHMMFilepath();
+    std::string hmm_filepath = input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
 
     // Use multi-threading across chromosomes unless a single chromosome is
     // specified
     int max_threads = 1;
-    if (this->input_data.getChromosome() == "") {
-        max_threads = this->input_data.getThreadCount();
+    if (!input_data.isSingleChr()) {
+        max_threads = input_data.getThreadCount();
         std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
     }
     ThreadPool pool(max_threads);
@@ -589,7 +556,8 @@ void SVCaller::run()
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
-            this->processChromosome(chr, hmm, sv_calls);
+            InputData chr_input_data = input_data;  // Use a thread-local copy
+            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome);
             {
                 std::lock_guard<std::mutex> lock(sv_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
@@ -636,22 +604,27 @@ void SVCaller::run()
 
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
-    this->saveToVCF(whole_genome_sv_calls);
+    const std::string output_dir = input_data.getOutputDir();
+    this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome);
 }
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map)
+void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
-    printMessage("Getting split alignments...");
+    printMessage(region + ": Getting split alignments...");
     std::unordered_map<std::string, GenomicRegion> primary_map;
     std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
     this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
 
     // Find split-read SV evidence
+    printMessage(region + ": Finding split-read SVs...");
     int sv_count = 0;
-    uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength();
+    int current_primary = 0;
+    int primary_count = primary_map.size();
+    uint32_t min_cnv_length = input_data.getMinCNVLength();
     for (auto& entry : primary_map) {
+        current_primary++;
         const std::string& qname = entry.first;
         GenomicRegion& primary_region = entry.second;
 
@@ -661,10 +634,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         }
 
         // Get the read match/mismatch map
-        MismatchData primary_mismatches;
-        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true);
+        // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
+        // MismatchData primary_mismatches;
+        // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true);
         GenomicRegion largest_supp_region = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
+
+        printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
         const std::string& primary_chr = bamHdr->target_name[primary_region.tid];
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
             GenomicRegion& supp_region = *it;
@@ -695,13 +671,15 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                         continue;
                     }
 
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map);
+                    printMessage(region + ": Running copy number prediction for inversion...");
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
                     }
 
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
+                    // printMessage("Test3");
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
                     if (supp_type == SVType::NEUTRAL) {
                         addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
@@ -713,6 +691,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 } else {
                     // Add the inversion without running copy number predictions
                     // (too small for predictions)
+                    // printMessage("Test4");
                     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
                     addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
                 }
@@ -720,9 +699,12 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         }
 
         // Trim overlapping alignments
-        MismatchData supp_mismatches;
-        this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false);
-        trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
+        // MismatchData supp_mismatches;
+        // printMessage(region + ": Getting mismatch map for supplementary alignments...");
+        // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false);
+
+        // printMessage(region + ": Trimming overlapping alignments...");
+        // trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_region.start < largest_supp_region.start) {  // Primary before supp
@@ -749,7 +731,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 continue;
             }
 
-            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map);
+            printMessage(region + ": Running copy number prediction for boundary...");
+            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
             }
@@ -766,7 +749,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     continue;
                 }
 
-                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map);
+                printMessage(region + ": Running copy number prediction for gap...");
+                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
                 }
@@ -775,17 +759,20 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
+                    // printMessage("Test5");
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
                     addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
+                    // printMessage("Test6");
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
                     addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
+                // printMessage("Test7");
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
                 addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
@@ -794,11 +781,10 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
     }
 }
 
-void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls)
+void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
 {
     std::cout << "Creating VCF writer..." << std::endl;
-    // std::string output_vcf = output_dir + "/output.vcf";
-    std::string output_vcf = this->input_data.getOutputDir() + "/output.vcf";
+    std::string output_vcf = output_dir + "/output.vcf";
     std::cout << "Writing VCF file to " << output_vcf << std::endl;
 	std::ofstream vcf_stream(output_vcf);
     if (!vcf_stream.is_open()) {
@@ -810,7 +796,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
     std::cout << "Getting reference genome filepath..." << std::endl;
     try {
-        std::string ref_fp = this->input_data.getRefGenome().getFilepath();
+        std::string ref_fp = ref_genome.getFilepath();
         std::cout << "Reference genome filepath: " << ref_fp << std::endl;
     } catch (const std::exception& e) {
         std::cerr << "Error: " << e.what() << std::endl;
@@ -819,9 +805,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
     // Set the header lines
     std::cout << "Getting reference genome header..." << std::endl;
-    const std::string contig_header = this->input_data.getRefGenome().getContigHeader();
+    const std::string contig_header = ref_genome.getContigHeader();
     std::vector<std::string> header_lines = {
-        std::string("##reference=") + this->input_data.getRefGenome().getFilepath(),
+        std::string("##reference=") + ref_genome.getFilepath(),
         contig_header,
         "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
@@ -901,7 +887,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 // Get the deleted sequence from the reference genome, also including the preceding base
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
                 // ref_allele = ref_genome.query(chr, preceding_pos, end);
-                ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, end);
+                // ref_allele = this->input_data.queryRefGenome(chr,
+                // preceding_pos, end);
+                ref_allele = ref_genome.query(chr, preceding_pos, end);
 
                 // Use the preceding base as the alternate allele 
                 if (ref_allele != "") {
@@ -919,7 +907,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             } else {
                 // Use the preceding base as the reference allele
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, preceding_pos);
+                // ref_allele = this->input_data.queryRefGenome(chr,
+                // preceding_pos, preceding_pos);
+                ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
 
                 // Update the start position to the preceding base
                 start = preceding_pos;
@@ -1025,20 +1015,23 @@ void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, Genom
     }
 }
 
-int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
+int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end) const
 {
     int read_depth = 0;
     try {
         // printMessage("Read depth at start: " + std::to_string(pos_depth_map.at(start)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
         read_depth += pos_depth_map.at(start);
     } catch (const std::out_of_range& e) {
-        std::cerr << "Warning: Start position " << start << " not found in depth map." << std::endl;
+        // std::cerr << "Warning: Start position " << start << " not found in
+        // depth map." << std::endl;
+        printError("Error: Start position " + std::to_string(start) + " not found in depth map.");
     }
     try {
         // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
         read_depth += pos_depth_map.at(end);
     } catch (const std::out_of_range& e) {
-        std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl;
+        printError("Error: End position " + std::to_string(end) + " not found in depth map.");
+        // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl;
     }
     // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
     return read_depth;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 7ef70223..1318f8d3 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -148,23 +148,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
 
 void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
 {
-    // int prev_size = sv_calls.size();
-
     // Filter SV calls with low read support
     sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {
         return sv_call.support < min_support;
     }), sv_calls.end());
-
-    // // Print read depth for each SV call
-    // for (const auto& sv_call : sv_calls) {
-    //     std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl;
-    // }
-
-    // // Remove SV calls with low read depth
-    // sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) {
-    //     return sv_call.read_depth < min_depth;
-    // }), sv_calls.end());
-
-    // int updated_size = sv_calls.size();
-    // printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth));
 }
diff --git a/src/swig_interface.cpp b/src/swig_interface.cpp
index 8d2e7a42..76eb2151 100644
--- a/src/swig_interface.cpp
+++ b/src/swig_interface.cpp
@@ -7,14 +7,13 @@
 
 
 // Run the CLI with the given parameters
-int run(InputData input_data)
+int run(const InputData& input_data)
 {
-
 	// Run ContextSV
-	ContextSV contextsv(input_data);
+	ContextSV contextsv;
 	try
 	{	
-		contextsv.run();
+		contextsv.run(input_data);
 	}
 
 	catch (std::exception& e)

From 11180be5563900a8d321e9f497794ee25d1dab37 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 20 Dec 2024 11:17:13 -0500
Subject: [PATCH 057/134] Fix SNP thread locks

---
 include/cnv_caller.h |  8 ++++----
 include/sv_caller.h  |  5 ++---
 src/cnv_caller.cpp   | 40 ++++++++++++----------------------------
 src/sv_caller.cpp    | 30 ++++++++++++++----------------
 4 files changed, 32 insertions(+), 51 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 0417b8ae..ab7bb147 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -75,7 +75,7 @@ class CNVCaller {
         void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
 
         // Query a region for SNPs and return the SNP data
-        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const;
+        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
 
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
@@ -87,16 +87,16 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
+        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
 
         double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const;
 
         void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const;
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 9d967510..2b035198 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -42,7 +42,7 @@ class SVCaller {
         // mismatch rate, and the start and end positions of the query sequence
         void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const;
 
-        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome);
+        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
@@ -53,7 +53,7 @@ class SVCaller {
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const;
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
+        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
@@ -68,7 +68,6 @@ class SVCaller {
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end) const;
 
     public:
-        // explicit SVCaller(InputData& input_data);
         // Constructor with no arguments
         SVCaller() = default;
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index cc30ccae..569717b4 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -48,7 +48,7 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
 }
 
 // Function to obtain SNP information for a region
-void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const
+void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
 {
     // Initialize the SNP data with default values and sample size length
     int sample_size = input_data.getSampleSize();
@@ -63,7 +63,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     std::vector<double> snp_pfb(sample_size, 0.5);
     std::vector<double> snp_log2_cov(sample_size, 0.0);
     std::vector<bool> is_snp(sample_size, false);
-    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data);
+    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data, snp_mutex, pfb_mutex);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
@@ -77,7 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
 {
     // Check that the start position is less than the end position
     if (start_pos >= end_pos)
@@ -99,7 +99,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Query the SNP region for the SV candidate
     SNPData snp_data;
-    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
+    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex);
 
     // Run the Viterbi algorithm
     std::pair<std::vector<int>, double> prediction;
@@ -167,7 +167,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
 {
     // Map with counts for each CNV type
     std::map<int, int> cnv_type_counts;
@@ -199,7 +199,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Only extend the region if "save CNV data" is enabled
         SNPData snp_data;
-        this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
+        this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex);
 
         // Run the Viterbi algorithm
         if (snp_data.pos.size() == 0) {
@@ -444,7 +444,7 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
     }
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
 {
     // --------- SNP file ---------
     const std::string snp_filepath = input_data.getSNPFilepath();
@@ -479,16 +479,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         return;
     }
 
-    // Get the header
-    // bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0);
-    // if (!snp_header)
-    // {
-    //     bcf_sr_destroy(snp_reader);
-    //     printError("ERROR: Could not get header for SNP reader.");
-    //     return;
-    // }
-    // BcfFileGuard snp_guard(snp_reader, snp_header);  // Guard to close the SNP file
-
     // --------- Population allele frequency file ---------
 
     // Get the population allele frequency file path
@@ -563,16 +553,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             bcf_sr_destroy(snp_reader);
             return;
         }
-
-        // Get the header
-        // bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0);
-        // if (!pfb_header)
-        // {
-        //     bcf_sr_destroy(pfb_reader);
-        //     printError("ERROR: Could not get header for population allele frequency reader.");
-        //     return;
-        // }
-        // pfb_guard = BcfFileGuard(pfb_reader, pfb_header);  // Guard to close the population allele frequency file
     }
 
     // Split the region into samples
@@ -587,7 +567,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     {
         current_region++;
         // Lock during reading
-        std::lock_guard<std::mutex> lock(this->snp_file_mtx);
+        // std::lock_guard<std::mutex> lock(this->snp_file_mtx);
+        std::lock_guard<std::mutex> lock(snp_mutex);
 
         // Read the SNP data ----------------------------------------------
 
@@ -689,6 +670,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         // Read the population allele frequency data ----------------------
         if (use_pfb)
         {
+            // Lock during reading
+            std::lock_guard<std::mutex> lock(pfb_mutex);
+
             // Set the region as the SNP position
             uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
             std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 6fafaf31..d057f89c 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -388,14 +388,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 // the reference position plus the length of the insertion
                 ref_end = ref_pos + op_len - 1;
                 if (is_duplication) {
-                    // ref_end = std::min(ref_pos + op_len - 1,
-                    // ref_genome.getChromosomeLength(chr));
                     uint32_t bp1 = ref_pos;
                     uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
                     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
                     addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
                 } else {
-                    // ref_end = ref_pos;
                     uint32_t bp1 = std::max(1, (int)ref_pos - 1);
                     uint32_t bp2 = ref_pos;
                     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
@@ -430,7 +427,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     }
 }
 
-void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome)
+void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex)
 {
     int filter_threshold = 4;  // Minimum number of supporting reads for an SV call
     bool single_chr = input_data.getChromosome() != "";
@@ -488,23 +485,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     printMessage(chr + ": CIGAR SVs...");
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
-    /*
     printMessage(chr + ": Merging CIGAR...");
     filterSVsWithLowSupport(chr_sv_calls, filter_threshold);
     mergeSVs(chr_sv_calls);
     int region_sv_count = getSVCount(chr_sv_calls);
-    // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+    printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
+    // Testing on HG002 whole genome
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
     if (region_sv_count > 0) {
         printMessage(chr + ": CIGAR predictions...");
-        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex);
     }
 
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
-    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex);
 
     // Merge the SV calls from the current region
     printMessage(chr + ": Merging split reads...");
@@ -514,7 +511,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // Run a final merge on the combined SV calls
     printMessage(chr + ": Merging final calls...");
     mergeSVs(chr_sv_calls);
-    */
     printMessage("Completed chromosome " + chr);
 }
 
@@ -551,18 +547,20 @@ void SVCaller::run(const InputData& input_data)
     // Shared resources
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     std::mutex sv_mutex;
+    std::mutex snp_mutex;
+    std::mutex pfb_mutex;
 
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
             InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome);
+            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, snp_mutex, pfb_mutex);
             {
                 std::lock_guard<std::mutex> lock(sv_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
             }
-            printMessage("Completed chromosome " + chr);
+            // printMessage("Completed chromosome " + chr);
         } catch (const std::exception& e) {
             printError("Error processing chromosome " + chr + ": " + e.what());
         } catch (...) {
@@ -610,7 +608,7 @@ void SVCaller::run(const InputData& input_data)
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
+void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
 {
     printMessage(region + ": Getting split alignments...");
     std::unordered_map<std::string, GenomicRegion> primary_map;
@@ -671,8 +669,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                         continue;
                     }
 
-                    printMessage(region + ": Running copy number prediction for inversion...");
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
+                    printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
                     }
@@ -732,7 +730,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             }
 
             printMessage(region + ": Running copy number prediction for boundary...");
-            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
+            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
             }
@@ -750,7 +748,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 }
 
                 printMessage(region + ": Running copy number prediction for gap...");
-                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
+                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
                 }

From d884e80f11af627261460428d4812e021b2bb71b Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 21 Dec 2024 10:43:52 -0500
Subject: [PATCH 058/134] Split read update

---
 src/sv_caller.cpp | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index d057f89c..b6794b16 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -83,6 +83,18 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
         num_alignments++;
     }
 
+    // Remove primary alignments without supplementary alignments
+    std::vector<std::string> to_remove;
+    for (const auto& entry : primary_map) {
+        const std::string& qname = entry.first;
+        if (supp_map.find(qname) == supp_map.end()) {
+            to_remove.push_back(qname);
+        }
+    }
+    for (const std::string& qname : to_remove) {
+        primary_map.erase(qname);
+    }
+
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
@@ -627,9 +639,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         GenomicRegion& primary_region = entry.second;
 
         // Skip primary alignments that do not have supplementary alignments
-        if (supp_map.find(qname) == supp_map.end()) {
-            continue;
-        }
+        // if (supp_map.find(qname) == supp_map.end()) {
+        //     continue;
+        // }
 
         // Get the read match/mismatch map
         // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
@@ -638,7 +650,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         GenomicRegion largest_supp_region = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
 
-        printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
+        // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
         const std::string& primary_chr = bamHdr->target_name[primary_region.tid];
         for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
             GenomicRegion& supp_region = *it;
@@ -669,7 +681,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                         continue;
                     }
 
-                    printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
+                    // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
                     std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
@@ -686,13 +698,14 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     } else if (supp_type == SVType::DUP) {
                         addSVCall(sv_calls, supp_start, supp_end, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
                     }
-                } else {
-                    // Add the inversion without running copy number predictions
-                    // (too small for predictions)
-                    // printMessage("Test4");
-                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
-                    addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
                 }
+                // } else {
+                //     // Add the inversion without running copy number predictions
+                //     // (too small for predictions)
+                //     // printMessage("Test4");
+                //     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
+                //     addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
+                // }
             }
         }
 
@@ -729,7 +742,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 continue;
             }
 
-            printMessage(region + ": Running copy number prediction for boundary...");
+            // printMessage(region + ": Running copy number prediction for boundary...");
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -747,7 +760,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     continue;
                 }
 
-                printMessage(region + ": Running copy number prediction for gap...");
+                // printMessage(region + ": Running copy number prediction for gap...");
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;

From 5d6a53c0c9bb8cf5652b4144fae83b17def0ebd1 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 29 Dec 2024 13:04:31 -0500
Subject: [PATCH 059/134] Use single mutex and update dup seqsim threshold

---
 include/cnv_caller.h  |  18 +--
 include/fasta_query.h |   5 +-
 include/sv_caller.h   |  22 +--
 include/sv_object.h   |   4 +-
 src/cnv_caller.cpp    |  24 ++--
 src/fasta_query.cpp   |   8 +-
 src/sv_caller.cpp     | 309 +++++++++++++++++++++++++++---------------
 src/sv_object.cpp     |  16 +--
 8 files changed, 248 insertions(+), 158 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index ab7bb147..bed2a347 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -45,9 +45,10 @@ struct SNPData {
 // CNVCaller: Detect CNVs and return the state sequence by SNP position
 class CNVCaller {
     private:
-        mutable std::mutex snp_file_mtx;  // SNP file mutex
-        mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
-        mutable std::mutex bam_file_mtx;  // BAM file mutex
+        //mutable std::mutex snp_file_mtx;  // SNP file mutex
+        //mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
+        //mutable std::mutex bam_file_mtx;  // BAM file mutex
+        std::mutex& shared_mutex;
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -75,7 +76,7 @@ class CNVCaller {
         void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
 
         // Query a region for SNPs and return the SNP data
-        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
+        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const;
 
         // Split a region into chunks for parallel processing
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
@@ -83,20 +84,21 @@ class CNVCaller {
     public:
         // explicit CNVCaller(const InputData& input_data);
         // Constructor with no arguments
-        CNVCaller() = default;
+        //CNVCaller() = default;
+	    CNVCaller(std::mutex& mtx) : shared_mutex(mtx) {}
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
+        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
-        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
+        void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const;
 
         void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const;
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
diff --git a/include/fasta_query.h b/include/fasta_query.h
index 75259441..b3cee253 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -16,9 +16,12 @@ class ReferenceGenome {
         std::string fasta_filepath;
         std::vector<std::string> chromosomes;
         std::unordered_map<std::string, std::string> chr_to_seq;
-        mutable std::mutex mtx;
+        //mutable std::mutex mtx;
+        std::mutex& shared_mutex;
 
     public:
+	    ReferenceGenome(std::mutex& mtx) : shared_mutex(mtx) {}
+    
         int setFilepath(std::string fasta_filepath);
         std::string getFilepath() const;
         std::string query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 2b035198..18c602c5 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -31,41 +31,41 @@ struct MismatchData {
 
 class SVCaller {
     private:
-        int min_sv_size = 50;       // Minimum SV size to be considered
         int min_mapq = 20;          // Minimum mapping quality to be considered
+        std::mutex shared_mutex;
 
-        // void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const;
+        void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary, const ReferenceGenome& ref_genome);
 
-        void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const;
+        void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map);
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
-        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const;
+        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
 
-        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex);
+        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const;
+        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
  
         // Read the next alignment from the BAM file in a thread-safe manner
-        int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const;
+        int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const;
+        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
-        double calculateMismatchRate(const MismatchData& mismatch_data) const;
+        double calculateMismatchRate(const MismatchData& mismatch_data);
 
         void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const;
 
-        void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const;
+        void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches);
 
         // Calculate the read depth (INFO/DP) for a region
-        int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end) const;
+        int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
 
     public:
         // Constructor with no arguments
diff --git a/include/sv_object.h b/include/sv_object.h
index e36e8624..3fedee73 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -24,11 +24,11 @@ struct SVCall {
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
+    SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
         start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
 };
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
 
 void mergeSVs(std::vector<SVCall>& sv_calls);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 569717b4..a1a93adf 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -48,7 +48,7 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::ve
 }
 
 // Function to obtain SNP information for a region
-void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
+void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const
 {
     // Initialize the SNP data with default values and sample size length
     int sample_size = input_data.getSampleSize();
@@ -63,7 +63,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     std::vector<double> snp_pfb(sample_size, 0.5);
     std::vector<double> snp_log2_cov(sample_size, 0.0);
     std::vector<bool> is_snp(sample_size, false);
-    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data, snp_mutex, pfb_mutex);
+    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
@@ -77,7 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
+std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Check that the start position is less than the end position
     if (start_pos >= end_pos)
@@ -99,7 +99,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Query the SNP region for the SV candidate
     SNPData snp_data;
-    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex);
+    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
     // Run the Viterbi algorithm
     std::pair<std::vector<int>, double> prediction;
@@ -124,7 +124,8 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Determine if there is a majority state within the SV region and if it
     // is greater than 75%
-    double pct_threshold = 0.75;
+    //double pct_threshold = 0.75;
+    double pct_threshold = 0.90;
     int max_state = 0;
     int max_count = 0;
 
@@ -167,7 +168,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 }
 
 
-void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
+void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Map with counts for each CNV type
     std::map<int, int> cnv_type_counts;
@@ -199,7 +200,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Only extend the region if "save CNV data" is enabled
         SNPData snp_data;
-        this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex);
+        this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
         // Run the Viterbi algorithm
         if (snp_data.pos.size() == 0) {
@@ -300,7 +301,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
 {
     {
         // Open the BAM file
-        std::lock_guard<std::mutex> lock(this->bam_file_mtx);  // Lock the BAM file
+        std::lock_guard<std::mutex> lock(this->shared_mutex);  // Lock the BAM file
         samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
         if (!bam_file)
         {
@@ -444,7 +445,7 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
     }
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
 {
     // --------- SNP file ---------
     const std::string snp_filepath = input_data.getSNPFilepath();
@@ -567,8 +568,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     {
         current_region++;
         // Lock during reading
-        // std::lock_guard<std::mutex> lock(this->snp_file_mtx);
-        std::lock_guard<std::mutex> lock(snp_mutex);
+        std::lock_guard<std::mutex> lock(this->shared_mutex);
 
         // Read the SNP data ----------------------------------------------
 
@@ -671,7 +671,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         if (use_pfb)
         {
             // Lock during reading
-            std::lock_guard<std::mutex> lock(pfb_mutex);
+            //std::lock_guard<std::mutex> lock(this->shared_mutex);
 
             // Set the region as the SNP position
             uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 3cde27d3..0f2ce105 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -100,7 +100,7 @@ std::string ReferenceGenome::getFilepath() const
 // Function to get the reference sequence at a given position range
 std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
 {
-    std::lock_guard<std::mutex> lock(this->mtx);
+    std::lock_guard<std::mutex> lock(this->shared_mutex);
     
     // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
     pos_start--;
@@ -127,7 +127,7 @@ std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, u
 // Function to get the chromosome contig lengths in VCF header format
 std::string ReferenceGenome::getContigHeader() const
 {
-    std::lock_guard<std::mutex> lock(this->mtx);
+    std::lock_guard<std::mutex> lock(this->shared_mutex);
     std::string contig_header = "";
 
     // Sort the chromosomes
@@ -154,12 +154,12 @@ std::string ReferenceGenome::getContigHeader() const
 
 std::vector<std::string> ReferenceGenome::getChromosomes() const
 {
-    std::lock_guard<std::mutex> lock(this->mtx);
+    std::lock_guard<std::mutex> lock(this->shared_mutex);
     return this->chromosomes;
 }
 
 uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
 {
-    std::lock_guard<std::mutex> lock(this->mtx);
+    std::lock_guard<std::mutex> lock(this->shared_mutex);
     return this->chr_to_seq.at(chr).length();
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b6794b16..18359b8a 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -27,16 +27,18 @@
 #include "fasta_query.h"
 /// @endcond
 
-# define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
+# define DUP_SEQSIM_THRESHOLD 0.99  // Sequence similarity threshold for duplication detection
 
+//std::mutex bam_mutex;
 
-int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const
+int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 {
+    std::lock_guard<std::mutex> lock(this->shared_mutex);
     int ret = sam_itr_next(fp_in, itr, bam1);
     return ret;
 }
 
-void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map) const
+void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -98,31 +100,33 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-    printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
+    printMessage(region + ": Processed " + std::to_string(primary_map.size()) + " primary alignments with " + std::to_string(supplementary_count) + " supplementary alignments");
+    // printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
 }
 
-/*
-void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary) const
+
+void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary, const ReferenceGenome& ref_genome)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         printError("ERROR: failed to initialize BAM record");
         return;
     }
-    // hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end);
+
+
+    //bam_mutex.lock();
+    this->shared_mutex.lock();
     hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end);
     if (!itr) {
+        this->shared_mutex.unlock();
         bam_destroy1(bam1);
-        hts_idx_destroy(idx);
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
         printError("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end));
         return;
     }
+    this->shared_mutex.unlock();
+    //bam_mutex.unlock();
+
 
     // Find the correct alignment
     bool success = false;
@@ -196,7 +200,7 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
 
             // Get the corresponding reference sequence
             int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-            std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1);
+            std::string cmatch_ref_str = ref_genome.query(chr, cmatch_pos, cmatch_pos + op_len - 1);
 
             // Check that the two sequence lengths are equal
             if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
@@ -252,9 +256,9 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t
     mismatch_data.query_end = query_end;
     mismatch_data.match_map = std::move(match_map);
 }
-*/
 
-void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const
+
+void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -287,7 +291,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
     bam_destroy1(bam1);
 }
 
-double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const
+double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data)
 {
     int start = mismatch_data.query_start;
     int end = mismatch_data.query_end;
@@ -316,7 +320,7 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const
     return mismatch_rate;
 }
 
-void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome) const
+void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
@@ -329,101 +333,177 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     uint32_t ref_pos;
     uint32_t ref_end;
     double default_lh = 0.0;
+    // List of ambiguous bases
+    const std::string amb_bases = "RYKMSWBDHV";
     for (int i = 0; i < cigar_len; i++) {
 
         int op = bam_cigar_op(cigar[i]);  // CIGAR operation
         int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
+
+        if (op_len == 0) {
+            printError("Warning: Encountered CIGAR operation with length 0 at position " + std::to_string(pos+1) + " in chromosome " + chr);
+            continue;
+        }
         
         // Process the CIGAR operation
         if (op == BAM_CINS && is_primary) {
-            if (op_len >= this->min_sv_size) {
 
-                // Get the sequence of the insertion from the query
-                std::string ins_seq_str(op_len, ' ');
-                for (int j = 0; j < op_len; j++) {
-                    ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
+            // Get the sequence of the insertion from the query
+            std::string ins_seq_str(op_len, ' ');
+            for (int j = 0; j < op_len; j++) {
+                // Replace ambiguous bases with N
+                char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
+                if (amb_bases.find(base) != std::string::npos) {
+                    ins_seq_str[j] = 'N';
+                } else {
+                    ins_seq_str[j] = base;
                 }
+                // Get the sequence character from the query
+                // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
+            }
 
-                // To determine whether the insertion is a duplication, check
-                // for sequence identity between the insertion and the
-                // reference genome (duplications are typically >= 90%):
-                // Loop through the reference sequence and calculate the
-                // sequence identity +/- insertion length from the insertion
-                // position.
-                bool is_duplication = false;
-                int ins_ref_pos;
-                uint32_t dup_start = std::max(0, (int)pos - op_len);
-                for (uint32_t j = dup_start; j <= pos; j++) {
-
-                    // Get the string for the window (1-based coordinates)
-                    ins_ref_pos = j + 1;
-                    // std::string window_str =
-                    // this->input_data.queryRefGenome(chr, ins_ref_pos,
-                    // ins_ref_pos + op_len - 1);
-                    std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
-
-                    // Continue if the window string is empty (out-of-range)
-                    if (window_str == "") {
+            // To determine whether the insertion is a duplication, check
+            // for sequence identity between the insertion and the
+            // reference genome (duplications are typically >= 90%):
+            // Loop through the reference sequence and calculate the
+            // sequence identity +/- insertion length from the insertion
+            // position.
+            // bool is_duplication = false;
+            // int ins_ref_pos;
+            // uint32_t dup_start = std::max(0, (int)pos - op_len);
+            // for (uint32_t j = dup_start; j <= pos; j++) {
+
+            //     // Get the string for the window (1-based coordinates)
+            //     ins_ref_pos = j + 1;
+            //     std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
+
+            //     // Continue if the window string is empty (out-of-range)
+            //     if (window_str == "") {
+            //         continue;
+            //     }
+
+            //     // Calculate the sequence identity
+            //     int num_matches = 0;
+            //     for (int k = 0; k < op_len; k++) {
+            //         if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') {
+            //             num_matches++;
+            //         }
+            //     }
+            //     float seq_identity = (float)num_matches / (float)op_len;
+
+            //     // Check if the target sequence identity is reached
+            //     if (seq_identity >= DUP_SEQSIM_THRESHOLD) {
+            //         is_duplication = true;
+            //         break;
+            //     }
+            // }
+
+            // Calculate the sequence identity at the insertion position +/-
+            // length
+            // Before the insertion
+            if (pos >= (uint32_t)op_len-1)
+            {
+                uint32_t bp1 = pos - (op_len - 1);
+                uint32_t bp2 = pos;
+                const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1);
+                if (window_str.length() > 0)
+                {
+                    int num_matches = 0;
+                    for (int k = 0; k < op_len; k++)
+                    {
+                        if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
+                        {
+                            num_matches++;
+                        }
+                    }
+                    float seq_identity = (float)num_matches / (float)op_len;
+                    if (seq_identity >= DUP_SEQSIM_THRESHOLD)
+                    {
+                        uint32_t dup_bp1 = bp1 + 1;
+                        uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
+                        int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2);
+                        addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "LSEQSIM", "./.", default_lh, read_depth);
+
+                        // Continue to the next CIGAR operation
                         continue;
                     }
+                }
+            }
 
-                    // Calculate the sequence identity
+            // After the insertion
+            if (pos + op_len < ref_genome.getChromosomeLength(chr))
+            {
+                uint32_t bp1 = pos + 1;
+                uint32_t bp2 = bp1 + op_len - 1;
+                const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1);
+                if (window_str.length() > 0)
+                {
                     int num_matches = 0;
-                    for (int k = 0; k < op_len; k++) {
-                        if (ins_seq_str[k] == window_str[k]) {
+                    for (int k = 0; k < op_len; k++)
+                    {
+                        if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
+                        {
                             num_matches++;
                         }
                     }
                     float seq_identity = (float)num_matches / (float)op_len;
-
-                    // Check if the target sequence identity is reached
-                    if (seq_identity >= DUP_SEQSIM_THRESHOLD) {
-                        is_duplication = true;
-                        break;
+                    if (seq_identity >= DUP_SEQSIM_THRESHOLD)
+                    {
+                        uint32_t dup_bp1 = bp1 + 1;
+                        uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
+                        int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2);
+                        addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "RSEQSIM", "./.", default_lh, read_depth);
+
+                        // Continue to the next CIGAR operation
+                        continue;
                     }
                 }
-
-                // Determine whether to use a symbolic allele (>50bp) or the
-                // actual sequence
-                if (op_len > 50) {
-                    ins_seq_str = "<INS>";
-                } else {
-                    ins_seq_str = ins_seq_str;
-                }
-
-                // Add to SV calls (1-based) with the appropriate SV type
-                ref_pos = pos+1;
-
-                // For insertions, the reference end position is the same as the
-                // reference position
-                // For duplications, the reference end position is the same as
-                // the reference position plus the length of the insertion
-                ref_end = ref_pos + op_len - 1;
-                if (is_duplication) {
-                    uint32_t bp1 = ref_pos;
-                    uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
-                    int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                    addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
-                } else {
-                    uint32_t bp1 = std::max(1, (int)ref_pos - 1);
-                    uint32_t bp2 = ref_pos;
-                    int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                    addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
-                }
             }
 
+            // Add as an insertion
+            // For read depth calculation, use the previous and current
+            // positions (1-based)
+            int read_depth = this->calculateReadDepth(pos_depth_map, std::max(1, (int)pos), pos + 1);
+            uint32_t ins_pos = pos + 1;
+            uint32_t ins_end = ins_pos + op_len - 1;
+            addSVCall(sv_calls, ins_pos, ins_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
+
+            // Determine whether to use a symbolic allele (>50bp) or the
+            // actual sequence
+            // if (op_len > 50) {
+            //     ins_seq_str = "<INS>";
+            // } else {
+            //     ins_seq_str = ins_seq_str;
+            // }
+
+            // Add to SV calls (1-based) with the appropriate SV type
+            // ref_pos = pos+1;
+
+            // // For insertions, the reference end position is the same as the
+            // // reference position
+            // // For duplications, the reference end position is the same as
+            // // the reference position plus the length of the insertion
+            // ref_end = ref_pos + op_len - 1;
+            // if (is_duplication) {
+            //     uint32_t bp1 = ref_pos;
+            //     uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
+            //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+            //     addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
+            // } else {
+            //     uint32_t bp1 = std::max(1, (int)ref_pos - 1);
+            //     uint32_t bp2 = ref_pos;
+            //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+            //     addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
+            // }
+
         // Check if the CIGAR operation is a deletion
         } else if (op == BAM_CDEL && is_primary) {
 
-            // Add the SV if greater than the minimum SV size
-            if (op_len >= this->min_sv_size)
-            {
-                ref_pos = pos+1;
-                ref_end = ref_pos + op_len -1;
-                // printMessage("Test2");
-                int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
-            }
+            ref_pos = pos+1;
+            ref_end = ref_pos + op_len -1;
+            // printMessage("Test2");
+            int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
+            addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
         }
 
         // Update the reference position
@@ -439,9 +519,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     }
 }
 
-void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex)
+void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome)
 {
-    int filter_threshold = 4;  // Minimum number of supporting reads for an SV call
+    // int filter_threshold = 4;  // Minimum number of supporting reads for an
+    // SV call
+    int filter_threshold = 10;  // Minimum number of supporting reads for an SV call
     bool single_chr = input_data.getChromosome() != "";
 
     // Open the BAM file
@@ -485,7 +567,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Load chromosome data for copy number predictions
     printMessage(chr + ": Loading chromosome data...");
-    CNVCaller cnv_caller;
+    CNVCaller cnv_caller(this->shared_mutex);
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
     int thread_count = input_data.getThreadCount();
     double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count, single_chr);
@@ -508,12 +590,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // CIGAR string, using a minimum CNV length threshold
     if (region_sv_count > 0) {
         printMessage(chr + ": CIGAR predictions...");
-        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex);
+        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
     }
 
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
-    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex);
+    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, ref_genome);
 
     // Merge the SV calls from the current region
     printMessage(chr + ": Merging split reads...");
@@ -531,7 +613,7 @@ void SVCaller::run(const InputData& input_data)
     // Set up the reference genome
     printMessage("Loading the reference genome...");
     const std::string ref_filepath = input_data.getRefGenome();
-    ReferenceGenome ref_genome;
+    ReferenceGenome ref_genome(this->shared_mutex);
     ref_genome.setFilepath(ref_filepath);
 
     // Get the chromosomes
@@ -558,18 +640,19 @@ void SVCaller::run(const InputData& input_data)
 
     // Shared resources
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
-    std::mutex sv_mutex;
-    std::mutex snp_mutex;
-    std::mutex pfb_mutex;
+    //std::mutex sv_mutex;
+    //std::mutex snp_mutex;
+    //std::mutex pfb_mutex;
 
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
             InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, snp_mutex, pfb_mutex);
+            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome);
             {
-                std::lock_guard<std::mutex> lock(sv_mutex);
+                //std::lock_guard<std::mutex> lock(sv_mutex);
+                std::lock_guard<std::mutex> lock(this->shared_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
             }
             // printMessage("Completed chromosome " + chr);
@@ -620,7 +703,7 @@ void SVCaller::run(const InputData& input_data)
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const
+void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome)
 {
     printMessage(region + ": Getting split alignments...");
     std::unordered_map<std::string, GenomicRegion> primary_map;
@@ -631,7 +714,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
     printMessage(region + ": Finding split-read SVs...");
     int sv_count = 0;
     int current_primary = 0;
-    int primary_count = primary_map.size();
+    //int primary_count = primary_map.size();
     uint32_t min_cnv_length = input_data.getMinCNVLength();
     for (auto& entry : primary_map) {
         current_primary++;
@@ -645,8 +728,10 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
         // Get the read match/mismatch map
         // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
-        // MismatchData primary_mismatches;
-        // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true);
+        //MismatchData primary_mismatches;
+        //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true, ref_genome);
+        
+      	// Find the largest supplementary alignment
         GenomicRegion largest_supp_region = supp_map[qname][0];
         uint32_t largest_supp_length = 0;
 
@@ -682,7 +767,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     }
 
                     // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
                     }
@@ -710,12 +795,12 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         }
 
         // Trim overlapping alignments
-        // MismatchData supp_mismatches;
-        // printMessage(region + ": Getting mismatch map for supplementary alignments...");
-        // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false);
+        //MismatchData supp_mismatches;
+        //printMessage(region + ": Getting mismatch map for supplementary alignments...");
+        //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false, ref_genome);
 
         // printMessage(region + ": Trimming overlapping alignments...");
-        // trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
+        //trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         if (primary_region.start < largest_supp_region.start) {  // Primary before supp
@@ -743,7 +828,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             }
 
             // printMessage(region + ": Running copy number prediction for boundary...");
-            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
+            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
             }
@@ -761,7 +846,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 }
 
                 // printMessage(region + ": Running copy number prediction for gap...");
-                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex);
+                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
                 }
@@ -966,7 +1051,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
 }
 
-void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const
+void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches)
 {
 
     // Check for overlapping read alignments
@@ -1026,7 +1111,7 @@ void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, Genom
     }
 }
 
-int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end) const
+int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
 {
     int read_depth = 0;
     try {
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 1318f8d3..dd896717 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -14,7 +14,7 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
 {
     // Ignore unknown SV types
     if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
@@ -22,13 +22,13 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     }
 
     // Set the alt allele to <DUP> or <DEL> if the SV type is DUP or DEL
-    if (sv_type == "DUP" && alt_allele == ".") {
-        printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
-        alt_allele = "<DUP>";
-    } else if (sv_type == "DEL" && alt_allele == ".") {
-        printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
-        alt_allele = "<DEL>";
-    }
+    // if (sv_type == "DUP" && alt_allele == ".") {
+    //     printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
+    //     alt_allele = "<DUP>";
+    // } else if (sv_type == "DEL" && alt_allele == ".") {
+    //     printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
+    //     alt_allele = "<DEL>";
+    // }
     
     if (start >= end) {
         printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));

From 917a68c975e2b61d8c620dddf1799f9c3f7cc7cb Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 29 Dec 2024 13:20:15 -0500
Subject: [PATCH 060/134] Update thresholds: CNV majority-state 0.90 -> 0.75, DUP sequence similarity 0.99 -> 0.90

---
 src/cnv_caller.cpp | 5 +++--
 src/sv_caller.cpp  | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index a1a93adf..44505994 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -124,8 +124,9 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
 
     // Determine if there is a majority state within the SV region and if it
     // is greater than 75%
-    //double pct_threshold = 0.75;
-    double pct_threshold = 0.90;
+    double pct_threshold = 0.75;
+    // double pct_threshold = 0.90;
+    // double pct_threshold = 0.80;
     int max_state = 0;
     int max_count = 0;
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 18359b8a..34200b34 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -27,7 +27,7 @@
 #include "fasta_query.h"
 /// @endcond
 
-# define DUP_SEQSIM_THRESHOLD 0.99  // Sequence similarity threshold for duplication detection
+# define DUP_SEQSIM_THRESHOLD 0.90  // Sequence similarity threshold for duplication detection
 
 //std::mutex bam_mutex;
 

From 5083aa21594e0fd176311bc787967f62f099d545 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 23 Jan 2025 17:36:31 -0500
Subject: [PATCH 061/134] Improve multi-threading and add min-reads parameter

---
 include/input_data.h |   5 +
 include/utils.h      |  21 ---
 src/cnv_caller.cpp   |  45 +++---
 src/input_data.cpp   |  22 ++-
 src/main.cpp         |   6 +
 src/sv_caller.cpp    | 329 +++++++++++++++++++++++--------------------
 src/sv_object.cpp    |   2 +-
 7 files changed, 229 insertions(+), 201 deletions(-)

diff --git a/include/input_data.h b/include/input_data.h
index 72bca5af..3960d362 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -61,6 +61,10 @@ class InputData {
         void setMinCNVLength(int min_cnv_length);
         uint32_t getMinCNVLength() const;
 
+        // Set the minimum number of reads supporting an SV for filtering steps.
+        void setMinReadSupport(int min_reads);
+        int getMinReadSupport() const;
+
         // Set the chromosome to analyze.
         void setChromosome(std::string chr);
         std::string getChromosome() const;
@@ -98,6 +102,7 @@ class InputData {
         std::string output_dir;
         int sample_size;
         uint32_t min_cnv_length;
+        int min_reads;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
         bool region_set;  // True if a region is set
diff --git a/include/utils.h b/include/utils.h
index 2fb4a3b1..6715b00e 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -38,27 +38,6 @@ struct BamFileGuard {
     BamFileGuard& operator=(const BamFileGuard&) = delete;  // Non-assignable
 };
 
-// Guard to close the BCF file
-struct BcfFileGuard {
-    bcf_srs_t* reader;
-    bcf_hdr_t* hdr;
-
-    BcfFileGuard(bcf_srs_t* reader, bcf_hdr_t* hdr)
-        : reader(reader), hdr(hdr) {}
-
-    ~BcfFileGuard() {
-        if (hdr) {
-            bcf_hdr_destroy(hdr);
-        }
-        if (reader) {
-            bcf_sr_destroy(reader);
-        }
-    }
-
-    BcfFileGuard(const BcfFileGuard&) = delete;  // Non-copyable
-    BcfFileGuard& operator=(const BcfFileGuard&) = delete;  // Non-assignable
-};
-
 // Print the progress of a task
 void printProgress(int progress, int total);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 44505994..a622325d 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -80,7 +80,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
 std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Check that the start position is less than the end position
-    if (start_pos >= end_pos)
+    if (start_pos > end_pos)
     {
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
@@ -187,7 +187,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         uint32_t end_pos = sv_call.end;
         
         // Error if start > end
-        if (start_pos >= end_pos)
+        if (start_pos > end_pos)
         {
             printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         	continue;
@@ -310,11 +310,12 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             return 0.0;
         }
 
-        // Enable multi-threading if running on a single chromosome
-        if (single_chr)
-        {
-            hts_set_threads(bam_file, thread_count);
-        }
+        // Enable multi-threading. This is possible here due to the lock
+        hts_set_threads(bam_file, thread_count);
+        // if (single_chr)
+        // {
+        //     hts_set_threads(bam_file, thread_count);
+        // }
 
         // Read the header
         bam_hdr_t *bam_header = sam_hdr_read(bam_file);
@@ -465,13 +466,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
     snp_reader->require_index = 1;
 
-    // Set multi-threading if running on a single chromosome
+    // Use multi-threading. This is possible here due to the lock
     int thread_count = input_data.getThreadCount();
-    if (input_data.isSingleChr())
-    {
-        printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2)));
-        bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2));
-    }
+    bcf_sr_set_threads(snp_reader, thread_count);
+    // if (input_data.isSingleChr())
+    // {
+    //     printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2)));
+    //     bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2));
+    // }
 
     // Add the SNP file to the reader
     if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
@@ -495,7 +497,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     bcf_srs_t *pfb_reader = bcf_sr_init();
     std::string chr_gnomad;
     std::string AF_key;
-    // BcfFileGuard pfb_guard(nullptr, nullptr);  // Guard to close the population allele frequency file
     if (use_pfb)
     {
         // Determine the ethnicity-specific allele frequency key
@@ -537,13 +538,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         }
         pfb_reader->require_index = 1;
 
-        // Set multi-threading if running on a single chromosome
-        if (input_data.isSingleChr())
-        {
-            printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2)));
-            bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2));
-        }
-
         // Add the population allele frequency file to the reader
         if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
         {
@@ -551,10 +545,17 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
             // Clean up
             bcf_sr_destroy(pfb_reader);
-            // bcf_hdr_destroy(snp_header);
             bcf_sr_destroy(snp_reader);
             return;
         }
+
+        // Use multi-threading. This is possible here due to the lock
+        bcf_sr_set_threads(pfb_reader, thread_count);
+        // if (input_data.isSingleChr())
+        // {
+        //     printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2)));
+        //     bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2));
+        // }
     }
 
     // Split the region into samples
diff --git a/src/input_data.cpp b/src/input_data.cpp
index a24efb9e..489a9894 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -26,6 +26,7 @@ InputData::InputData()
     this->output_dir = "";
     this->sample_size = 100;
     this->min_cnv_length = 1000;
+    this->min_reads = 5;
     this->thread_count = 1;
     this->hmm_filepath = "data/wgs.hmm";
     this->verbose = false;
@@ -103,10 +104,15 @@ std::string InputData::getOutputDir() const
 void InputData::setOutputDir(std::string dirpath)
 {
     this->output_dir = dirpath;
-
-    // Create the output directory
     std::string cmd = "mkdir -p " + output_dir;
-    system(cmd.c_str());
+    // Create the output directory. std::system() never throws; failure is
+    // reported through its return value, so check that status explicitly.
+    const int status = std::system(cmd.c_str());
+    if (status != 0)
+    {
+        std::cerr << "Error creating output directory: " << output_dir << " (status " << status << ")" << std::endl;
+        exit(1);
+    }
 }
 
 int InputData::getSampleSize() const
@@ -149,6 +155,16 @@ void InputData::setMinCNVLength(int min_cnv_length)
     this->min_cnv_length = (uint32_t) min_cnv_length;
 }
 
+void InputData::setMinReadSupport(int min_reads)
+{
+    this->min_reads = min_reads;
+}
+
+int InputData::getMinReadSupport() const
+{
+    return this->min_reads;
+}
+
 void InputData::setChromosome(std::string chr)
 {
     this->chr = chr;
diff --git a/src/main.cpp b/src/main.cpp
index bbdb8366..58d8fbdc 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -46,6 +46,9 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("min-cnv") != args.end()) {
         input_data.setMinCNVLength(std::stoi(args.at("min-cnv")));
     }
+    if (args.find("min-reads") != args.end()) {
+        input_data.setMinReadSupport(std::stoi(args.at("min-reads")));
+    }
     if (args.find("eth") != args.end()) {
         input_data.setEthnicity(args.at("eth"));
     }
@@ -76,6 +79,7 @@ void printUsage(const std::string& programName) {
                 << "  -h, --hmm <hmm_file>          HMM file\n"
                 << "  -n, --sample-size <size>      Sample size for HMM predictions\n"
                 << "     --min-cnv <min_length>     Minimum CNV length\n"
+                << "     --min-reads <min_reads>    Minimum read support\n"
                 << "  -e, --eth <eth_file>          ETH file\n"
                 << "  -p, --pfb <pfb_file>          PFB file\n"
                 << "     --save-cnv                 Save CNV data\n"
@@ -110,6 +114,8 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
             args["sample-size"] = argv[++i];
         } else if (arg == "--min-cnv" && i + 1 < argc) {
             args["min-cnv"] = argv[++i];
+        } else if (arg == "--min-reads" && i + 1 < argc) {
+            args["min-reads"] = argv[++i];
         } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) {
             args["eth"] = argv[++i];
         } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 34200b34..6180612f 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -27,7 +27,7 @@
 #include "fasta_query.h"
 /// @endcond
 
-# define DUP_SEQSIM_THRESHOLD 0.90  // Sequence similarity threshold for duplication detection
+# define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
 
 //std::mutex bam_mutex;
 
@@ -336,174 +336,188 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     // List of ambiguous bases
     const std::string amb_bases = "RYKMSWBDHV";
     for (int i = 0; i < cigar_len; i++) {
-
-        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
         int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
-
-        if (op_len == 0) {
-            printError("Warning: Encountered CIGAR operation with length 0 at position " + std::to_string(pos+1) + " in chromosome " + chr);
-            continue;
-        }
-        
-        // Process the CIGAR operation
-        if (op == BAM_CINS && is_primary) {
-
-            // Get the sequence of the insertion from the query
-            std::string ins_seq_str(op_len, ' ');
-            for (int j = 0; j < op_len; j++) {
-                // Replace ambiguous bases with N
-                char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
-                if (amb_bases.find(base) != std::string::npos) {
-                    ins_seq_str[j] = 'N';
-                } else {
-                    ins_seq_str[j] = base;
+        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
+        if (op_len >= 50) {
+            // Process SVs
+            
+            // Process the CIGAR operation
+            if (op == BAM_CINS && is_primary) {
+
+                // Get the sequence of the insertion from the query
+                std::string ins_seq_str(op_len, ' ');
+                for (int j = 0; j < op_len; j++) {
+                    // Replace ambiguous bases with N
+                    char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
+                    if (amb_bases.find(base) != std::string::npos) {
+                        ins_seq_str[j] = 'N';
+                    } else {
+                        ins_seq_str[j] = base;
+                    }
+                    // Get the sequence character from the query
+                    // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
                 }
-                // Get the sequence character from the query
-                // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
-            }
+                
+                // To determine whether the insertion is a duplication, check
+                // for sequence identity between the insertion and the
+                // reference genome (duplications are typically >= 90%):
+                // Loop through the reference sequence and calculate the
+                // sequence identity +/- insertion length from the insertion
+                // position.
+                // bool is_duplication = false;
+                // int ins_ref_pos;
+                // uint32_t dup_start = std::max(0, (int)pos - op_len);
+                // for (uint32_t j = dup_start; j <= pos; j++) {
+
+                //     // Get the string for the window (1-based coordinates)
+                //     ins_ref_pos = j + 1;
+                //     std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
+
+                //     // Continue if the window string is empty (out-of-range)
+                //     if (window_str == "") {
+                //         continue;
+                //     }
+
+                //     // Calculate the sequence identity
+                //     int num_matches = 0;
+                //     for (int k = 0; k < op_len; k++) {
+                //         if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') {
+                //             num_matches++;
+                //         }
+                //     }
+                //     float seq_identity = (float)num_matches / (float)op_len;
+
+                //     // Check if the target sequence identity is reached
+                //     if (seq_identity >= DUP_SEQSIM_THRESHOLD) {
+                //         is_duplication = true;
+                //         break;
+                //     }
+                // }
 
-            // To determine whether the insertion is a duplication, check
-            // for sequence identity between the insertion and the
-            // reference genome (duplications are typically >= 90%):
-            // Loop through the reference sequence and calculate the
-            // sequence identity +/- insertion length from the insertion
-            // position.
-            // bool is_duplication = false;
-            // int ins_ref_pos;
-            // uint32_t dup_start = std::max(0, (int)pos - op_len);
-            // for (uint32_t j = dup_start; j <= pos; j++) {
-
-            //     // Get the string for the window (1-based coordinates)
-            //     ins_ref_pos = j + 1;
-            //     std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
-
-            //     // Continue if the window string is empty (out-of-range)
-            //     if (window_str == "") {
-            //         continue;
-            //     }
-
-            //     // Calculate the sequence identity
-            //     int num_matches = 0;
-            //     for (int k = 0; k < op_len; k++) {
-            //         if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') {
-            //             num_matches++;
-            //         }
-            //     }
-            //     float seq_identity = (float)num_matches / (float)op_len;
-
-            //     // Check if the target sequence identity is reached
-            //     if (seq_identity >= DUP_SEQSIM_THRESHOLD) {
-            //         is_duplication = true;
-            //         break;
-            //     }
-            // }
-
-            // Calculate the sequence identity at the insertion position +/-
-            // length
-            // Before the insertion
-            if (pos >= (uint32_t)op_len-1)
-            {
-                uint32_t bp1 = pos - (op_len - 1);
-                uint32_t bp2 = pos;
-                const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1);
-                if (window_str.length() > 0)
+                // Calculate the sequence identity at the insertion position +/-
+                // length if >= 50bp
+                //if (op_len >= 50) {
+                
+                // Before the insertion
+                if (pos >= (uint32_t)op_len-1)
                 {
-                    int num_matches = 0;
-                    for (int k = 0; k < op_len; k++)
+                    uint32_t bp1 = pos - (op_len - 1) + 1;
+                    uint32_t bp2 = bp1 + op_len - 1; //pos + 1;
+                    const std::string& window_str = ref_genome.query(chr, bp1, bp2);
+                    if (window_str.length() > 0)
                     {
-                        if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
+                        int num_matches = 0;
+                        for (int k = 0; k < op_len; k++)
                         {
-                            num_matches++;
+                            if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
+                            {
+                                num_matches++;
+                            }
+                        }
+                        float seq_identity = (float)num_matches / (float)op_len;
+                        if (seq_identity >= DUP_SEQSIM_THRESHOLD)
+                        {
+                            //uint32_t dup_bp1 = bp1 + 1;
+                            //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
+                            int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                            //printMessage("TEST3");
+                            addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
+
+                            // Continue to the next CIGAR operation
+                            continue;
                         }
-                    }
-                    float seq_identity = (float)num_matches / (float)op_len;
-                    if (seq_identity >= DUP_SEQSIM_THRESHOLD)
-                    {
-                        uint32_t dup_bp1 = bp1 + 1;
-                        uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
-                        int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2);
-                        addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "LSEQSIM", "./.", default_lh, read_depth);
-
-                        // Continue to the next CIGAR operation
-                        continue;
                     }
                 }
-            }
 
-            // After the insertion
-            if (pos + op_len < ref_genome.getChromosomeLength(chr))
-            {
-                uint32_t bp1 = pos + 1;
-                uint32_t bp2 = bp1 + op_len - 1;
-                const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1);
-                if (window_str.length() > 0)
+                // After the insertion
+                if (pos + op_len < ref_genome.getChromosomeLength(chr))
                 {
-                    int num_matches = 0;
-                    for (int k = 0; k < op_len; k++)
+                    uint32_t bp1 = pos + 1;
+                    //uint32_t bp2 = std::min(bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
+                    uint32_t bp2 = bp1 + op_len - 1;
+                    const std::string& window_str = ref_genome.query(chr, bp1, bp2);
+                    if (window_str.length() > 0)
                     {
-                        if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
+                        int num_matches = 0;
+                        for (int k = 0; k < op_len; k++)
+                        {
+                            if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
+                            {
+                                num_matches++;
+                            }
+                        }
+                        float seq_identity = (float)num_matches / (float)op_len;
+                        if (seq_identity >= DUP_SEQSIM_THRESHOLD)
                         {
-                            num_matches++;
+                            //uint32_t dup_bp1 = bp1 + 1;
+                            //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
+                            int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                            //printMessage("TEST1");
+                            addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
+
+                            // Continue to the next CIGAR operation
+                            continue;
                         }
                     }
-                    float seq_identity = (float)num_matches / (float)op_len;
-                    if (seq_identity >= DUP_SEQSIM_THRESHOLD)
-                    {
-                        uint32_t dup_bp1 = bp1 + 1;
-                        uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
-                        int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2);
-                        addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "RSEQSIM", "./.", default_lh, read_depth);
+                }
 
-                        // Continue to the next CIGAR operation
-                        continue;
-                    }
+                // Add as an insertion
+                // For read depth calculation, use the previous and current
+                // positions (1-based)
+
+                //uint32_t ins_pos = pos;
+                //uint32_t ins_end = ins_pos + op_len -1;
+                uint32_t ins_pos = pos + 1;
+                uint32_t ins_end = ins_pos + op_len - 1;
+                int read_depth = this->calculateReadDepth(pos_depth_map, ins_pos-1, ins_pos);
+                //printMessage("TEST2: " + std::to_string(ins_pos) + ", " + std::to_string(ins_end) + ", OPLEN=" + std::to_string(op_len));
+                
+                // Determine the ALT allele format based on small vs. large insertion
+                std::string alt_allele = "<INS>";
+                if (op_len <= 50) {
+                    alt_allele = ins_seq_str;
                 }
-            }
+                
+                addSVCall(sv_calls, ins_pos, ins_end, "INS", alt_allele, "CIGARINS", "./.", default_lh, read_depth);
+
+                // Determine whether to use a symbolic allele (>50bp) or the
+                // actual sequence
+                // if (op_len > 50) {
+                //     ins_seq_str = "<INS>";
+                // } else {
+                //     ins_seq_str = ins_seq_str;
+                // }
+
+                // Add to SV calls (1-based) with the appropriate SV type
+                // ref_pos = pos+1;
+
+                // // For insertions, the reference end position is the same as the
+                // // reference position
+                // // For duplications, the reference end position is the same as
+                // // the reference position plus the length of the insertion
+                // ref_end = ref_pos + op_len - 1;
+                // if (is_duplication) {
+                //     uint32_t bp1 = ref_pos;
+                //     uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
+                //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                //     addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
+                // } else {
+                //     uint32_t bp1 = std::max(1, (int)ref_pos - 1);
+                //     uint32_t bp2 = ref_pos;
+                //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                //     addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
+                // }
+
+            // Check if the CIGAR operation is a deletion
+            } else if (op == BAM_CDEL && is_primary) {
 
-            // Add as an insertion
-            // For read depth calculation, use the previous and current
-            // positions (1-based)
-            int read_depth = this->calculateReadDepth(pos_depth_map, std::max(1, (int)pos), pos + 1);
-            uint32_t ins_pos = pos + 1;
-            uint32_t ins_end = ins_pos + op_len - 1;
-            addSVCall(sv_calls, ins_pos, ins_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
-
-            // Determine whether to use a symbolic allele (>50bp) or the
-            // actual sequence
-            // if (op_len > 50) {
-            //     ins_seq_str = "<INS>";
-            // } else {
-            //     ins_seq_str = ins_seq_str;
-            // }
-
-            // Add to SV calls (1-based) with the appropriate SV type
-            // ref_pos = pos+1;
-
-            // // For insertions, the reference end position is the same as the
-            // // reference position
-            // // For duplications, the reference end position is the same as
-            // // the reference position plus the length of the insertion
-            // ref_end = ref_pos + op_len - 1;
-            // if (is_duplication) {
-            //     uint32_t bp1 = ref_pos;
-            //     uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
-            //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-            //     addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
-            // } else {
-            //     uint32_t bp1 = std::max(1, (int)ref_pos - 1);
-            //     uint32_t bp2 = ref_pos;
-            //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-            //     addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
-            // }
-
-        // Check if the CIGAR operation is a deletion
-        } else if (op == BAM_CDEL && is_primary) {
-
-            ref_pos = pos+1;
-            ref_end = ref_pos + op_len -1;
-            // printMessage("Test2");
-            int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-            addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
+                ref_pos = pos+1;
+                ref_end = ref_pos + op_len -1;
+                // printMessage("Test2");
+                int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
+                //printMessage("TEST4: " + std::to_string(ref_pos) + ", " + std::to_string(ref_end) + ", OPLEN=" + std::to_string(op_len));
+                addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
+            }
         }
 
         // Update the reference position
@@ -521,9 +535,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
 
 void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome)
 {
-    // int filter_threshold = 4;  // Minimum number of supporting reads for an
+    // int filter_threshold = 4;  // Minimum number of supporting reads for an SV call
+    // int filter_threshold = 10;  // Minimum number of supporting reads for an
     // SV call
-    int filter_threshold = 10;  // Minimum number of supporting reads for an SV call
+    int filter_threshold = input_data.getMinReadSupport();  // Minimum number of supporting reads for an SV call
     bool single_chr = input_data.getChromosome() != "";
 
     // Open the BAM file
@@ -534,6 +549,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         return;
     }
 
+    // Set multi-threading
+    int num_threads = input_data.getThreadCount();
+    hts_set_threads(fp_in, num_threads);
+
     // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
     if (!bamHdr) {
@@ -962,10 +981,12 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             std::string data_type_str = sv_call.data_type;
             std::string alt_allele = sv_call.alt_allele;
             double hmm_likelihood = sv_call.hmm_likelihood;
-            int sv_length = end - start;
+            int sv_length = end - start + 1;
+            /*
             if (sv_type_str == "DEL") {
             	sv_length++;
         	}
+        	*/
             int read_depth = sv_call.read_depth;
             std::string ref_allele = ".";
             int support = sv_call.support;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index dd896717..16362ba1 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -30,7 +30,7 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     //     alt_allele = "<DEL>";
     // }
     
-    if (start >= end) {
+    if (start > end) {
         printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
         return;
     }

From b31875c16c6a459dfd37704f7119e654fc5154de Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 27 Jan 2025 20:58:16 -0500
Subject: [PATCH 062/134] efficiency updates

---
 Makefile-cpp          |   2 +-
 include/cnv_caller.h  |   2 +-
 include/fasta_query.h |   5 +-
 include/sv_caller.h   |   4 +-
 include/sv_object.h   |   2 +
 setup.py              |   2 +-
 src/cnv_caller.cpp    | 127 ++++++-----
 src/fasta_query.cpp   | 108 +++++++---
 src/input_data.cpp    |   5 +
 src/sv_caller.cpp     | 476 +++++++++++-------------------------------
 src/sv_object.cpp     |  18 +-
 11 files changed, 286 insertions(+), 465 deletions(-)

diff --git a/Makefile-cpp b/Makefile-cpp
index 3babecb3..e77cf0a8 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -18,7 +18,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 
 # Compiler and Flags
 CXX := g++
-CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
+CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 LDLIBS := -lhts  # Link with libhts.a or libhts.so
 
diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index bed2a347..055e3247 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -94,7 +94,7 @@ class CNVCaller {
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
-        double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const;
+        double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const;
 
         void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
diff --git a/include/fasta_query.h b/include/fasta_query.h
index b3cee253..a0697446 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -9,6 +9,7 @@
 #include <unordered_map>
 #include <vector>
 #include <mutex>
+#include <string_view>
 /// @endcond
 
 class ReferenceGenome {
@@ -16,6 +17,7 @@ class ReferenceGenome {
         std::string fasta_filepath;
         std::vector<std::string> chromosomes;
         std::unordered_map<std::string, std::string> chr_to_seq;
+        std::map<std::string, uint32_t> chr_to_length;
         //mutable std::mutex mtx;
         std::mutex& shared_mutex;
 
@@ -24,7 +26,8 @@ class ReferenceGenome {
     
         int setFilepath(std::string fasta_filepath);
         std::string getFilepath() const;
-        std::string query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
+        std::string_view query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const;
+        bool compare(const std::string& chr, uint32_t pos_start, uint32_t pos_end, const std::string& compare_seq, float match_threshold) const;
 
         // Get the chromosome contig lengths in VCF header format
         std::string getContigHeader() const;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 18c602c5..5cfbb956 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -34,8 +34,6 @@ class SVCaller {
         int min_mapq = 20;          // Minimum mapping quality to be considered
         std::mutex shared_mutex;
 
-        void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary, const ReferenceGenome& ref_genome);
-
         void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map);
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
@@ -53,7 +51,7 @@ class SVCaller {
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome);
+        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
diff --git a/include/sv_object.h b/include/sv_object.h
index 3fedee73..e3e3a8ba 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -34,6 +34,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls);
 
 void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth);
 
+void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth, const std::string& data_type);
+
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 
 void concatenateSVCalls(std::vector<SVCall>& sv_calls, const std::vector<SVCall>& sv_calls_update);
diff --git a/setup.py b/setup.py
index ce0c428f..c8591523 100644
--- a/setup.py
+++ b/setup.py
@@ -40,7 +40,7 @@
     name="_" + NAME,
     sources=SRC_FILES,
     include_dirs=[INCLUDE_DIR, conda_include_dir],
-    extra_compile_args=["-std=c++14"],
+    extra_compile_args=["-std=c++17"],
     language="c++",
     libraries=["hts"],
     library_dirs=[conda_lib_dir]
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index a622325d..344288c4 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -298,7 +298,7 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 }
 
 // Calculate the mean chromosome coverage
-double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const
+double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const
 {
     {
         // Open the BAM file
@@ -449,6 +449,9 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i
 
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
 {
+    // Lock during reading
+    std::lock_guard<std::mutex> lock(this->shared_mutex);
+
     // --------- SNP file ---------
     const std::string snp_filepath = input_data.getSNPFilepath();
     if (snp_filepath.empty())
@@ -458,6 +461,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
 
     // Initialize the SNP file reader
+    // printMessage("Initializing SNP reader...");
     bcf_srs_t *snp_reader = bcf_sr_init();
     if (!snp_reader)
     {
@@ -469,19 +473,16 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     // Use multi-threading. This is possible here due to the lock
     int thread_count = input_data.getThreadCount();
     bcf_sr_set_threads(snp_reader, thread_count);
-    // if (input_data.isSingleChr())
-    // {
-    //     printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2)));
-    //     bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2));
-    // }
 
     // Add the SNP file to the reader
+    // printMessage("Adding SNP file to reader...");
     if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
     {
         bcf_sr_destroy(snp_reader);
         printError("ERROR: Could not add SNP file to reader: " + snp_filepath);
         return;
     }
+    // printMessage("SNP file added to reader.");
 
     // --------- Population allele frequency file ---------
 
@@ -491,7 +492,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     if (pfb_filepath.empty())
     {
         use_pfb = false;
-        // printMessage("WARNING: No population allele frequency file provided for chromosome " + chr);
     }
     
     bcf_srs_t *pfb_reader = bcf_sr_init();
@@ -532,13 +532,13 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             printError("ERROR: Could not initialize population allele frequency reader.");
 
             // Clean up
-            // bcf_hdr_destroy(snp_header);
             bcf_sr_destroy(snp_reader);
             return;
         }
         pfb_reader->require_index = 1;
 
         // Add the population allele frequency file to the reader
+        // printMessage("Adding population allele frequency file to reader...");
         if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
         {
             printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
@@ -551,11 +551,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
         // Use multi-threading. This is possible here due to the lock
         bcf_sr_set_threads(pfb_reader, thread_count);
-        // if (input_data.isSingleChr())
-        // {
-        //     printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2)));
-        //     bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2));
-        // }
     }
 
     // Split the region into samples
@@ -564,25 +559,26 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
     // Loop through the samples and read the SNP data, storing the first
     // SNP position and BAF value for each sample
-    int print_count = 0;
+    // int print_count = 0;
     int current_region = 0;
     for (size_t i = 0; i < region_chunks.size(); ++i)
     {
         current_region++;
         // Lock during reading
-        std::lock_guard<std::mutex> lock(this->shared_mutex);
+        // std::lock_guard<std::mutex> lock(this->shared_mutex);
 
         // Read the SNP data ----------------------------------------------
 
         // Set the region
+        // printMessage("Setting region for SNP reader...");
         std::string region_str = region_chunks[i];
         if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
         {
             printError("ERROR: Could not set region for SNP reader: " + region_str);
             break;
         }
+        // printMessage("Region set for SNP reader, loading SNP data...");
 
-        // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp...");
         bool snp_found = false;
         while (bcf_sr_next_line(snp_reader) > 0)
         {
@@ -609,10 +605,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
                 // Extract DP from FORMAT field
                 int32_t *dp = 0;
-                // int dp_values[2];
                 int dp_count = 0;
-                // int dp_ret = bcf_get_format_int32(snp_header, snp_record,
-                // "DP", &dp, &dp_count);
                 int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count);
                 if (dp_ret < 0 || dp[0] <= 10)
                 {
@@ -621,8 +614,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 free(dp);
 
                 // Skip if the SNP does not pass the filter
-                // if (bcf_has_filter(snp_header, snp_record,
-                // const_cast<char*>("PASS")) != 1)
                 if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast<char*>("PASS")) != 1)
                 {
                     continue;
@@ -630,21 +621,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
                 // Extract AD from FORMAT field
                 int32_t *ad = 0;
-                // int ad_values[2];
                 int ad_count = 0;
-                // int ad_ret = bcf_get_format_int32(snp_header, snp_record,
-                // "AD", &ad, &ad_count);
                 int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count);
-                // int ad_ret = bcf_get_format_int32(snp_header, snp_record,
-                // "AD", &ad, &ad_count);
                 if (ad_ret < 0 || ad_count < 2)
                 {
                     continue;
                 }
 
                 // Calculate the B-allele frequency (BAF)
-                // double baf = (double) ad_values[1] / (double) (ad_values[0] +
-                // ad_values[1]);
                 double baf = (double) ad[1] / (double) (ad[0] + ad[1]);
                 free(ad);
 
@@ -672,10 +656,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         // Read the population allele frequency data ----------------------
         if (use_pfb)
         {
-            // Lock during reading
-            //std::lock_guard<std::mutex> lock(this->shared_mutex);
-
             // Set the region as the SNP position
+            // printMessage("Setting region for population allele frequency reader...");
             uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
             std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
             if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
@@ -683,61 +665,74 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 printError("ERROR: Could not set region for population allele frequency reader: " + region_str);
                 break;
             }
+            // printMessage("Region set for population allele frequency reader, loading population allele frequency data...");
 
             // Find the SNP position in the population allele frequency file
+            float *pfb_f = NULL;
+            int count = 0;
             while (bcf_sr_next_line(pfb_reader) > 0)
             {
-                if (!bcf_sr_has_line(pfb_reader, 0))
-                {
-                    continue;
-                }
+                // Get the SNP record and validate
                 bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
-                if (pfb_record)
+                if (!pfb_record || !bcf_is_snp(pfb_record))
                 {
-                    // Skip if not a SNP
-                    if (!bcf_is_snp(pfb_record))
-                    {
-                        continue;
-                    }
+                    continue;  // Skip if not a SNP
+                }
 
-                    // Get the population frequency for the SNP
-                    float *pfb_f = NULL;
-                    int count = 0;
-                    int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-                    if (pfb_status < 0 || count == 0)
-                    {
-                        continue;
-                    }
-                    double pfb = (double) pfb_f[0];
-                    free(pfb_f);
+                // if (!bcf_sr_has_line(pfb_reader, 0))
+                // {
+                //     continue;
+                // }
+                // bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
+                // if (pfb_record)
+                // {
+                //     // Skip if not a SNP
+                //     if (!bcf_is_snp(pfb_record))
+                //     {
+                //         continue;
+                //     }
+
+                // Get the population frequency for the SNP
+                // float *pfb_f = NULL;
+                // int count = 0;
+                int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+                if (pfb_status < 0 || count == 0)
+                {
+                    continue;
+                }
+                // double pfb = (double) pfb_f[0];
+                double pfb = static_cast<double>(pfb_f[0]);
+                // free(pfb_f);
 
-                    // Continue if the population frequency is outside the threshold
-                    if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-                    {
-                        continue;
-                    }
+                // Skip if outside the acceptable range
+                if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+                {
+                    continue;
+                }
 
-                    // Add the population frequency to the SNP data
-                    snp_pfb[i] = pfb;
+                // Add the population frequency to the SNP data
+                snp_pfb[i] = pfb;
 
-                    // Break after finding the SNP position
-                    break;
+                // Break after finding the SNP position
+                break;
 
-                    if (print_count < 20) {
-                        printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
-                        print_count++;
-                    }
-                }
+                // if (print_count < 20) {
+                //     printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
+                //     print_count++;
+                // }
             }
+            free(pfb_f);
+
+            // }
             if (pfb_reader->errnum)
             {
                 printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
             }
         }
+        // printMessage("SNP region " + std::to_string(current_region) + " of " + std::to_string(region_chunks.size()) + " completed.");
     }
 
     // Clean up
-    // bcf_hdr_destroy(snp_header);
     bcf_sr_destroy(snp_reader);
     bcf_sr_destroy(pfb_reader);
 }
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 0f2ce105..445643cc 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -12,6 +12,9 @@
 /// @endcond
 
 
+#include "utils.h"
+
+
 int ReferenceGenome::setFilepath(std::string fasta_filepath)
 {
     if (fasta_filepath == "")
@@ -31,8 +34,8 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
     }
 
     // Get the chromosomes and sequences
-    std::vector<std::string> chromosomes;
-    std::unordered_map<std::string, std::string> chr_to_seq;
+    // std::vector<std::string> chromosomes;
+    // std::unordered_map<std::string, std::string> chr_to_seq;
     std::string current_chr = "";
     std::string sequence = "";
     std::string line_str = "";
@@ -45,8 +48,11 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
             // Store the previous chromosome and sequence
             if (current_chr != "")
             {
-                chromosomes.push_back(current_chr);  // Add the chromosome to the list
-                chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
+                this->chromosomes.push_back(current_chr);  // Add the chromosome to the list
+                this->chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
+                this->chr_to_length[current_chr] = sequence.length();  // Add the sequence length to the map
+                // chromosomes.push_back(current_chr);  // Add the chromosome to the list
+                // chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
                 sequence = "";  // Reset the sequence
             }
 
@@ -61,11 +67,11 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
             }
 
             // Check if the chromosome is already in the map
-            if (chr_to_seq.find(current_chr) != chr_to_seq.end())
-            {
-                std::cerr << "Duplicate chromosome " << current_chr << std::endl;
-                exit(1);
-            }
+            // if (chr_to_seq.find(current_chr) != chr_to_seq.end())
+            // {
+            //     std::cerr << "Duplicate chromosome " << current_chr << std::endl;
+            //     exit(1);
+            // }
         } else {
             // Sequence line
             sequence += line_str;
@@ -75,19 +81,23 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
     // Add the last chromosome at the end of the file
     if (current_chr != "")
     {
-        chromosomes.push_back(current_chr);  // Add the chromosome to the list
-        chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
+        this->chromosomes.push_back(current_chr);  // Add the chromosome to the list
+        this->chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
+        this->chr_to_length[current_chr] = sequence.length();  // Add the sequence length to the map
+        // chromosomes.push_back(current_chr);  // Add the chromosome to the list
+        // chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
     }
 
     // Close the file
     fasta_file.close();
 
     // Sort the chromosomes
-    std::sort(chromosomes.begin(), chromosomes.end());
+    // std::sort(chromosomes.begin(), chromosomes.end());
+    std::sort(this->chromosomes.begin(), this->chromosomes.end());
 
     // Set the chromosomes and sequences
-    this->chromosomes = chromosomes;
-    this->chr_to_seq = chr_to_seq;
+    // this->chromosomes = chromosomes;
+    // this->chr_to_seq = chr_to_seq;
 
     return 0;
 }
@@ -98,30 +108,75 @@ std::string ReferenceGenome::getFilepath() const
 }
 
 // Function to get the reference sequence at a given position range
-std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
+std::string_view ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
 {
-    std::lock_guard<std::mutex> lock(this->shared_mutex);
+    // Returns a non-owning view into the sequence stored in chr_to_seq (no copy is made),
+    // so the result must not be used after that stored string is modified or destroyed.
     
     // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
     pos_start--;
     pos_end--;
 
     // Ensure that the end position is not larger than the chromosome length
-    if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length())
+    // and that the range is not inverted; at() throws std::out_of_range if chr is not in the map
+    const std::string& sequence = this->chr_to_seq.at(chr);
+    if (pos_end >= sequence.length() || pos_start > pos_end)
     {
-        return "";
+        return {};
     }
 
-    uint32_t length = pos_end - pos_start + 1;
-    const std::string& sequence = this->chr_to_seq.at(chr);
+    // From here on, pos_start <= pos_end < sequence.length() (0-based)
 
     // If the subsequence is empty, return empty string
-    if (sequence.substr(pos_start, length).empty())
+    // (no separate emptiness check is performed any more: the guard above
+    // already returned an empty view for every rejected range, and an
+    // accepted range always spans pos_end - pos_start + 1 >= 1 bases)
+    //
+
+    // Return a view of the inclusive range [pos_start, pos_end]
+    return std::string_view(sequence).substr(pos_start, (pos_end - pos_start) + 1);
+}
+
+// Return true if the fraction of positions at which compare_seq equals the reference range [pos_start, pos_end] (1-indexed, inclusive) is >= match_threshold
+bool ReferenceGenome::compare(const std::string& chr, uint32_t pos_start, uint32_t pos_end, const std::string& compare_seq, float match_threshold) const
+{
+    // Returns false for a rejected range or a length mismatch; at() throws std::out_of_range if chr is not in the map
+    
+    // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
+    pos_start--;
+    pos_end--;
+
+    // Ensure that the end position is not larger than the chromosome length
+    // and that the range is not inverted (a single-base range is accepted, matching query())
+    const std::string& sequence = this->chr_to_seq.at(chr);
+    if (pos_end >= sequence.length() || pos_start > pos_end)
+    {
+        return false;
+    }
+
+    // Get the subsequence
+    std::string_view subseq = std::string_view(sequence).substr(pos_start, pos_end - pos_start + 1);
+
+    // Ensure the lengths are equal
+    if (subseq.length() != compare_seq.length())
     {
-        return "";
+        printError("ERROR: Sequence lengths do not match for comparison");
+        return false;
     }
 
-    return sequence.substr(pos_start, length);
+    // Calculate the match rate
+    size_t num_matches = 0;
+    for (size_t i = 0; i < subseq.length(); i++)
+    {
+        if (subseq[i] == compare_seq[i])
+        {
+            num_matches++;
+        }
+    }
+    float match_rate = static_cast<float>(num_matches) / static_cast<float>(subseq.length());
+
+    // Check if the match rate is above the threshold
+    return match_rate >= match_threshold;
 }
 
 // Function to get the chromosome contig lengths in VCF header format
@@ -154,12 +209,13 @@ std::string ReferenceGenome::getContigHeader() const
 
 std::vector<std::string> ReferenceGenome::getChromosomes() const
 {
-    std::lock_guard<std::mutex> lock(this->shared_mutex);
+    // Returns a copy of the chromosome name list (sorted by setFilepath); no lock is taken here
     return this->chromosomes;
 }
 
 uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
 {
-    std::lock_guard<std::mutex> lock(this->shared_mutex);
-    return this->chr_to_seq.at(chr).length();
+    // Look up the length recorded in chr_to_length by setFilepath instead of measuring the
+    // stored sequence; no lock is taken, and at() throws std::out_of_range for an unknown chr.
+    return this->chr_to_length.at(chr);
 }
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 489a9894..40a640a2 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -157,6 +157,11 @@ void InputData::setMinCNVLength(int min_cnv_length)
 
 void InputData::setMinReadSupport(int min_reads)
 {
+    // Reject values below 1 by throwing std::runtime_error; the stored min_reads is left unchanged in that case
+    if (min_reads < 1)
+    {
+        throw std::runtime_error("Minimum read support must be an integer greater than 0");
+    }
     this->min_reads = min_reads;
 }
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 6180612f..03d3b718 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -19,6 +19,7 @@
 #include <algorithm>
 #include <fstream>
 #include <condition_variable>
+#include <bitset>
 
 #include "ThreadPool.h"
 #include "utils.h"
@@ -100,164 +101,11 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
-    printMessage(region + ": Processed " + std::to_string(primary_map.size()) + " primary alignments with " + std::to_string(supplementary_count) + " supplementary alignments");
+    printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
     // printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
 }
 
 
-void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary, const ReferenceGenome& ref_genome)
-{
-    // Create a read and iterator for the region
-    bam1_t *bam1 = bam_init1();
-    if (!bam1) {
-        printError("ERROR: failed to initialize BAM record");
-        return;
-    }
-
-
-    //bam_mutex.lock();
-    this->shared_mutex.lock();
-    hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end);
-    if (!itr) {
-        this->shared_mutex.unlock();
-        bam_destroy1(bam1);
-        printError("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end));
-        return;
-    }
-    this->shared_mutex.unlock();
-    //bam_mutex.unlock();
-
-
-    // Find the correct alignment
-    bool success = false;
-    std::string fail_str = "";
-    // printMessage("Looking for alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? "forward" : "reverse"));
-    while (readNextAlignment(fp_in, itr, bam1) >= 0) {
-        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
-        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
-            continue;
-        }
-
-        // Skip if not the correct type of alignment
-        if (is_primary && (bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-            continue;
-        } else if (!is_primary && !(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-            continue;
-        }
-
-        // Check the alignment start and end positions, and strand
-        if (bam1->core.pos+1 == region.start && bam_endpos(bam1) == region.end && !(bam1->core.flag & BAM_FREVERSE) == region.strand) {
-            // printMessage("SUCCESS: Found alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " at position: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1)));
-            success = true;
-            break;
-        } else {
-            continue;
-        }
-    }
-
-    // Check if the alignment was found
-    if (!success) {
-        printError("ERROR: Failed to find alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? "forward" : "reverse"));
-        hts_itr_destroy(itr);
-        bam_destroy1(bam1);
-        return;
-    }
-
-    // Main loop to process the alignments
-    std::vector<int> match_map(bam1->core.l_qseq, 0);  // Query position to match/mismatch (1/0) map
-    uint32_t query_start = 0;
-    uint32_t query_end = 0;
-    uint32_t query_pos = 0;
-    bool first_op = true;
-
-    // Process mismatches in the CIGAR string
-    const std::string chr = bamHdr->target_name[bam1->core.tid];
-    hts_pos_t pos = bam1->core.pos;  // 0-based position
-    uint32_t* cigar = bam_get_cigar(bam1);  // CIGAR array
-    int cigar_len = bam1->core.n_cigar;
-    for (int i = 0; i < cigar_len; i++) {
-        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
-        int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
-        
-        // Update match/mismatch query map
-        int MATCH = 1;
-        int MISMATCH = -1;
-        if (op == BAM_CEQUAL) {
-            for (int j = 0; j < op_len; j++) {
-                match_map[query_pos + j] = MATCH;
-            }
-        } else if (op == BAM_CDIFF) {
-            for (int j = 0; j < op_len; j++) {
-                match_map[query_pos + j] = MISMATCH;
-            }
-        } else if (op == BAM_CMATCH) {
-            // Get the read sequence
-            uint8_t* seq_ptr = bam_get_seq(bam1);
-            std::string cmatch_seq_str = "";
-            for (int j = 0; j < op_len; j++) {
-                cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)];
-            }
-
-            // Get the corresponding reference sequence
-            int cmatch_pos = pos + 1;  // Querying the reference genome is 1-based
-            std::string cmatch_ref_str = ref_genome.query(chr, cmatch_pos, cmatch_pos + op_len - 1);
-
-            // Check that the two sequence lengths are equal
-            if (cmatch_seq_str.length() != cmatch_ref_str.length()) {
-                printError("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op));
-                hts_itr_destroy(itr);
-                bam_destroy1(bam1);
-                return;
-            }
-
-            // Compare the two sequences and update the mismatch map
-            for (int j = 0; j < op_len; j++) {
-                if (cmatch_seq_str[j] != cmatch_ref_str[j]) {
-                    try {
-                        match_map.at(query_pos + j) = MISMATCH;
-                    } catch (const std::out_of_range& e) {
-                        printError("ERROR: Out of range exception for query position: " + std::to_string(query_pos + j) + " with read length: " + std::to_string(bam1->core.l_qseq) + " and array size: " + std::to_string(match_map.size()) + " for CIGAR operation: " + std::to_string(op) + " with length: " + std::to_string(op_len));
-
-                        // Exit the program
-                        hts_itr_destroy(itr);
-                        bam_destroy1(bam1);
-                        
-                        return;
-                    }
-                    // match_map[query_pos + j] = MISMATCH;
-                } else {
-                    match_map[query_pos + j] = MATCH;
-                }
-            }
-        } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) {
-            query_start = query_pos + op_len;
-            first_op = false;
-        }
-        
-        // Update the reference position
-        // https://samtools.github.io/hts-specs/SAMv1.pdf
-        if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            pos += op_len;
-        }
-
-        // Update the query position
-        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            query_pos += op_len;
-        }
-    }
-    query_end = query_pos;
-    
-    // Clean up the iterator and alignment
-    hts_itr_destroy(itr);
-    bam_destroy1(bam1);
-
-    // Update the mismatch data
-    mismatch_data.query_start = query_start;
-    mismatch_data.query_end = query_end;
-    mismatch_data.match_map = std::move(match_map);
-}
-
-
 void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
 {
     // Create a read and iterator for the region
@@ -333,13 +181,15 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     uint32_t ref_pos;
     uint32_t ref_end;
     double default_lh = 0.0;
-    // List of ambiguous bases
-    const std::string amb_bases = "RYKMSWBDHV";
+    const std::string amb_bases = "RYKMSWBDHV";  // Ambiguous bases
+    std::bitset<256> amb_bases_bitset;
+    for (char base : amb_bases) {
+        amb_bases_bitset.set(base);
+    }
     for (int i = 0; i < cigar_len; i++) {
         int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
         int op = bam_cigar_op(cigar[i]);  // CIGAR operation
         if (op_len >= 50) {
-            // Process SVs
             
             // Process the CIGAR operation
             if (op == BAM_CINS && is_primary) {
@@ -349,83 +199,24 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 for (int j = 0; j < op_len; j++) {
                     // Replace ambiguous bases with N
                     char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
-                    if (amb_bases.find(base) != std::string::npos) {
+                    if (amb_bases_bitset.test(base)) {
                         ins_seq_str[j] = 'N';
                     } else {
                         ins_seq_str[j] = base;
                     }
-                    // Get the sequence character from the query
-                    // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
                 }
                 
-                // To determine whether the insertion is a duplication, check
-                // for sequence identity between the insertion and the
-                // reference genome (duplications are typically >= 90%):
-                // Loop through the reference sequence and calculate the
-                // sequence identity +/- insertion length from the insertion
-                // position.
-                // bool is_duplication = false;
-                // int ins_ref_pos;
-                // uint32_t dup_start = std::max(0, (int)pos - op_len);
-                // for (uint32_t j = dup_start; j <= pos; j++) {
-
-                //     // Get the string for the window (1-based coordinates)
-                //     ins_ref_pos = j + 1;
-                //     std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1);
-
-                //     // Continue if the window string is empty (out-of-range)
-                //     if (window_str == "") {
-                //         continue;
-                //     }
-
-                //     // Calculate the sequence identity
-                //     int num_matches = 0;
-                //     for (int k = 0; k < op_len; k++) {
-                //         if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') {
-                //             num_matches++;
-                //         }
-                //     }
-                //     float seq_identity = (float)num_matches / (float)op_len;
-
-                //     // Check if the target sequence identity is reached
-                //     if (seq_identity >= DUP_SEQSIM_THRESHOLD) {
-                //         is_duplication = true;
-                //         break;
-                //     }
-                // }
-
-                // Calculate the sequence identity at the insertion position +/-
-                // length if >= 50bp
-                //if (op_len >= 50) {
-                
                 // Before the insertion
                 if (pos >= (uint32_t)op_len-1)
                 {
                     uint32_t bp1 = pos - (op_len - 1) + 1;
                     uint32_t bp2 = bp1 + op_len - 1; //pos + 1;
-                    const std::string& window_str = ref_genome.query(chr, bp1, bp2);
-                    if (window_str.length() > 0)
+
+                    if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
-                        int num_matches = 0;
-                        for (int k = 0; k < op_len; k++)
-                        {
-                            if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
-                            {
-                                num_matches++;
-                            }
-                        }
-                        float seq_identity = (float)num_matches / (float)op_len;
-                        if (seq_identity >= DUP_SEQSIM_THRESHOLD)
-                        {
-                            //uint32_t dup_bp1 = bp1 + 1;
-                            //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
-                            int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                            //printMessage("TEST3");
-                            addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
-
-                            // Continue to the next CIGAR operation
-                            continue;
-                        }
+                        int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                        addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
+                        continue;
                     }
                 }
 
@@ -433,44 +224,22 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 if (pos + op_len < ref_genome.getChromosomeLength(chr))
                 {
                     uint32_t bp1 = pos + 1;
-                    //uint32_t bp2 = std::min(bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
                     uint32_t bp2 = bp1 + op_len - 1;
-                    const std::string& window_str = ref_genome.query(chr, bp1, bp2);
-                    if (window_str.length() > 0)
+
+                    if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
-                        int num_matches = 0;
-                        for (int k = 0; k < op_len; k++)
-                        {
-                            if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N')
-                            {
-                                num_matches++;
-                            }
-                        }
-                        float seq_identity = (float)num_matches / (float)op_len;
-                        if (seq_identity >= DUP_SEQSIM_THRESHOLD)
-                        {
-                            //uint32_t dup_bp1 = bp1 + 1;
-                            //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr));
-                            int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                            //printMessage("TEST1");
-                            addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
-
-                            // Continue to the next CIGAR operation
-                            continue;
-                        }
+                        int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                        addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
+                        continue;
                     }
                 }
 
                 // Add as an insertion
                 // For read depth calculation, use the previous and current
                 // positions (1-based)
-
-                //uint32_t ins_pos = pos;
-                //uint32_t ins_end = ins_pos + op_len -1;
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ins_pos-1, ins_pos);
-                //printMessage("TEST2: " + std::to_string(ins_pos) + ", " + std::to_string(ins_end) + ", OPLEN=" + std::to_string(op_len));
                 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
@@ -480,42 +249,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 
                 addSVCall(sv_calls, ins_pos, ins_end, "INS", alt_allele, "CIGARINS", "./.", default_lh, read_depth);
 
-                // Determine whether to use a symbolic allele (>50bp) or the
-                // actual sequence
-                // if (op_len > 50) {
-                //     ins_seq_str = "<INS>";
-                // } else {
-                //     ins_seq_str = ins_seq_str;
-                // }
-
-                // Add to SV calls (1-based) with the appropriate SV type
-                // ref_pos = pos+1;
-
-                // // For insertions, the reference end position is the same as the
-                // // reference position
-                // // For duplications, the reference end position is the same as
-                // // the reference position plus the length of the insertion
-                // ref_end = ref_pos + op_len - 1;
-                // if (is_duplication) {
-                //     uint32_t bp1 = ref_pos;
-                //     uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr));
-                //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                //     addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth);
-                // } else {
-                //     uint32_t bp1 = std::max(1, (int)ref_pos - 1);
-                //     uint32_t bp2 = ref_pos;
-                //     int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                //     addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth);
-                // }
-
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                // printMessage("Test2");
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                //printMessage("TEST4: " + std::to_string(ref_pos) + ", " + std::to_string(ref_end) + ", OPLEN=" + std::to_string(op_len));
                 addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
             }
         }
@@ -538,8 +277,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // int filter_threshold = 4;  // Minimum number of supporting reads for an SV call
     // int filter_threshold = 10;  // Minimum number of supporting reads for an
     // SV call
-    int filter_threshold = input_data.getMinReadSupport();  // Minimum number of supporting reads for an SV call
-    bool single_chr = input_data.getChromosome() != "";
+    int cigar_sv_support_threshold = input_data.getMinReadSupport();  // Minimum number of supporting reads for a CIGAR-based SV call
+    int split_sv_support_threshold = 4;  // Minimum number of supporting reads for a split-read SV call
+    // printMessage("Processing chromosome " + chr + " with filter threshold: " + std::to_string(cigar_sv_support_threshold));
 
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -589,7 +329,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     CNVCaller cnv_caller(this->shared_mutex);
     std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
     int thread_count = input_data.getThreadCount();
-    double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count, single_chr);
+    double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count);
     if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
         return;
     }
@@ -599,7 +339,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
     printMessage(chr + ": Merging CIGAR...");
-    filterSVsWithLowSupport(chr_sv_calls, filter_threshold);
+    filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
     mergeSVs(chr_sv_calls);
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
@@ -614,11 +354,11 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
-    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, ref_genome);
+    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
 
     // Merge the SV calls from the current region
     printMessage(chr + ": Merging split reads...");
-    filterSVsWithLowSupport(chr_sv_calls, filter_threshold);
+    filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT");
     mergeSVs(chr_sv_calls);
 
     // Run a final merge on the combined SV calls
@@ -659,9 +399,6 @@ void SVCaller::run(const InputData& input_data)
 
     // Shared resources
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
-    //std::mutex sv_mutex;
-    //std::mutex snp_mutex;
-    //std::mutex pfb_mutex;
 
     // Lambda to process a chromosome
     auto process_chr = [&](const std::string& chr) {
@@ -670,7 +407,6 @@ void SVCaller::run(const InputData& input_data)
             InputData chr_input_data = input_data;  // Use a thread-local copy
             this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome);
             {
-                //std::lock_guard<std::mutex> lock(sv_mutex);
                 std::lock_guard<std::mutex> lock(this->shared_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
             }
@@ -692,10 +428,13 @@ void SVCaller::run(const InputData& input_data)
     }
 
     // Wait for all tasks to complete
+    int total_chr_count = futures.size();
+    int current_chr = 0;
     for (auto& future : futures) {
         try {
+            current_chr++;
             future.get();
-            printMessage("Chromosome task completed.");
+            printMessage("Chromosome task "+ std::to_string(current_chr) + " of " + std::to_string(total_chr_count) + " completed.");
         } catch (const std::exception& e) {
             printError("Error processing chromosome task: " + std::string(e.what()));
         } catch (...) {
@@ -722,42 +461,44 @@ void SVCaller::run(const InputData& input_data)
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome)
+void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
 {
-    printMessage(region + ": Getting split alignments...");
+    // printMessage(region + ": Getting split alignments...");
     std::unordered_map<std::string, GenomicRegion> primary_map;
     std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
     this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
 
     // Find split-read SV evidence
-    printMessage(region + ": Finding split-read SVs...");
+    // printMessage(region + ": Finding split-read SVs...");
     int sv_count = 0;
     int current_primary = 0;
+    int primary_count = primary_map.size();
     //int primary_count = primary_map.size();
     uint32_t min_cnv_length = input_data.getMinCNVLength();
     for (auto& entry : primary_map) {
         current_primary++;
         const std::string& qname = entry.first;
         GenomicRegion& primary_region = entry.second;
-
-        // Skip primary alignments that do not have supplementary alignments
-        // if (supp_map.find(qname) == supp_map.end()) {
-        //     continue;
-        // }
-
-        // Get the read match/mismatch map
-        // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
-        //MismatchData primary_mismatches;
-        //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true, ref_genome);
         
       	// Find the largest supplementary alignment
-        GenomicRegion largest_supp_region = supp_map[qname][0];
-        uint32_t largest_supp_length = 0;
+        auto& supp_regions = supp_map[qname];
+        GenomicRegion largest_supp_region = supp_regions[0];
+        auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) {
+            return a.end - a.start < b.end - b.start;
+        });
+        if (it != supp_regions.end()) {
+            largest_supp_region = *it;
+        }
+        
+        // GenomicRegion largest_supp_region = supp_map[qname][0];
+        // uint32_t largest_supp_length = 0;
 
         // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
         const std::string& primary_chr = bamHdr->target_name[primary_region.tid];
-        for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) {
-            GenomicRegion& supp_region = *it;
+        // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();
+        // ++it) {
+        for (auto& supp_region : supp_regions) {
+            // GenomicRegion& supp_region = *it;
 
             // Skip if not on the primary chromosome
             if (primary_region.tid != supp_region.tid) {
@@ -765,28 +506,33 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             }
 
             // Get the supplementary alignment information
-            uint32_t supp_start = (uint32_t) supp_region.start;
-            uint32_t supp_end = (uint32_t) supp_region.end;
-            uint32_t supp_length = supp_end - supp_start + 1;
-            if (supp_length > largest_supp_length) {
-                largest_supp_length = supp_length;
-                largest_supp_region = *it;
-            }
+            // uint32_t supp_start = (uint32_t) supp_region.start;
+            // uint32_t supp_end = (uint32_t) supp_region.end;
+            // uint32_t supp_length = supp_end - supp_start + 1;
+            // if (supp_length > largest_supp_length) {
+            //     largest_supp_length = supp_length;
+            //     largest_supp_region = *it;
+            // }
 
             // Inversion detection
             bool is_opposite_strand = primary_region.strand != supp_region.strand;
             if (is_opposite_strand) {
-                if (supp_length >= min_cnv_length) {
+                // if (supp_length >= min_cnv_length) {
+                if (supp_region.end - supp_region.start >= min_cnv_length) {
 
                     // Print error if the start position is greater than the end
                     // position
-                    if (supp_start > supp_end) {
-                        printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
+                    // if (supp_start > supp_end) {
+                    if (supp_region.start > supp_region.end) {
+                        printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end));
+                        // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
                         continue;
                     }
 
                     // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
+                    // std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
+                    // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start));
+                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data);
                     if (std::get<1>(result) == SVType::UNKNOWN) {
                         continue;
                     }
@@ -794,47 +540,44 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     double supp_lh = std::get<0>(result);
                     SVType supp_type = std::get<1>(result);
                     // printMessage("Test3");
-                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
+                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end);
+                    // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
                     if (supp_type == SVType::NEUTRAL) {
-                        addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "HMM", "./.", supp_lh, read_depth);
+                        // addSVCall(sv_calls, supp_start, supp_end, "INV",
+                        // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                        addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
                         
                         sv_count++;
                     } else if (supp_type == SVType::DUP) {
-                        addSVCall(sv_calls, supp_start, supp_end, "INVDUP", "<INV>", "HMM", "./.", supp_lh, read_depth);
+                        // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
+                        // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                        addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
                     }
                 }
-                // } else {
-                //     // Add the inversion without running copy number predictions
-                //     // (too small for predictions)
-                //     // printMessage("Test4");
-                //     int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
-                //     addSVCall(sv_calls, supp_start, supp_end, "INV", "<INV>", "REV", "./.", 0.0, read_depth);
-                // }
             }
         }
 
-        // Trim overlapping alignments
-        //MismatchData supp_mismatches;
-        //printMessage(region + ": Getting mismatch map for supplementary alignments...");
-        //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false, ref_genome);
-
-        // printMessage(region + ": Trimming overlapping alignments...");
-        //trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches);
+        // Analyze split-read evidence for deletions and duplications
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
-        if (primary_region.start < largest_supp_region.start) {  // Primary before supp
-            boundary_left = primary_region.start;
-            boundary_right = std::max(primary_region.end, largest_supp_region.end);
-            gap_left = primary_region.end;
-            gap_right = largest_supp_region.start;
-            gap_exists = gap_left < gap_right;
-        } else {
-            boundary_left = largest_supp_region.start;
-            boundary_right = std::max(primary_region.end, largest_supp_region.end);
-            gap_left = largest_supp_region.end;
-            gap_right = primary_region.start;
-            gap_exists = gap_left < gap_right;
-        }
+        boundary_left = std::min(primary_region.start, largest_supp_region.start);
+        boundary_right = std::max(primary_region.end, largest_supp_region.end);
+        gap_left = std::min(primary_region.end, largest_supp_region.start);
+        gap_right = std::max(primary_region.start, largest_supp_region.end);
+        gap_exists = gap_left < gap_right;
+        // if (primary_region.start < largest_supp_region.start) {  // Primary before supp
+        //     boundary_left = primary_region.start;
+        //     boundary_right = std::max(primary_region.end, largest_supp_region.end);
+        //     gap_left = primary_region.end;
+        //     gap_right = largest_supp_region.start;
+        //     gap_exists = gap_left < gap_right;
+        // } else {
+        //     boundary_left = largest_supp_region.start;
+        //     boundary_right = std::max(primary_region.end, largest_supp_region.end);
+        //     gap_left = largest_supp_region.end;
+        //     gap_right = primary_region.start;
+        //     gap_exists = gap_left < gap_right;
+        // }
         
         // Run copy number variant predictions on the boundary if large enough
         if (boundary_right - boundary_left >= min_cnv_length) {
@@ -846,7 +589,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 continue;
             }
 
-            // printMessage(region + ": Running copy number prediction for boundary...");
+            // printMessage(region + ": Running copy number prediction for
+            // boundary...");
+            // printMessage("Running copy number prediction, length: " + std::to_string(boundary_right - boundary_left));
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -864,7 +609,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     continue;
                 }
 
-                // printMessage(region + ": Running copy number prediction for gap...");
+                // printMessage(region + ": Running copy number prediction for
+                // gap...");
+                // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left));
                 std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
                 if (std::get<1>(gap_result) == SVType::UNKNOWN) {
                     continue;
@@ -874,25 +621,27 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
                 // If higher likelihood than the boundary, add the gap as the SV call
                 if (gap_lh > bd_lh) {
-                    // printMessage("Test5");
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth);
+                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "SPLIT", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
-                    // printMessage("Test6");
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
+                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
-                // printMessage("Test7");
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth);
+                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth);
             }
         }
+
+        // Print progress every 1000 primary alignments
+        if (current_primary % 1000 == 0) {
+            printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
+        }
     }
 }
 
@@ -1050,6 +799,11 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 }
             }
 
+            // // Print the REF allele if SVTYPE = DUP
+            // if (sv_type_str == "DUP") {
+            //     printMessage("REF allele for DUP at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + ": " + ref_allele + ", ALT allele: " + alt_allele);
+            // }
+
             // Create the VCF parameter strings
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
@@ -1067,6 +821,16 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         	}
         }
     }
+    vcf_stream.close();
+
+    std::cout << "Saved SV calls to " << output_vcf << std::endl;
+
+    // Create a compressed and indexed VCF file
+    std::cout << "Creating compressed and indexed VCF file..." << std::endl;
+    std::string bgzip_cmd = "bgzip -f " + output_vcf;
+    std::string tabix_cmd = "tabix -p vcf " + output_vcf + ".gz";
+    std::system(bgzip_cmd.c_str());
+    std::system(tabix_cmd.c_str());
 
     // Print the number of SV calls skipped
     std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 16362ba1..6a953ec6 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -20,15 +20,6 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
         return;
     }
-
-    // Set the alt allele to <DUP> or <DEL> if the SV type is DUP or DEL
-    // if (sv_type == "DUP" && alt_allele == ".") {
-    //     printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
-    //     alt_allele = "<DUP>";
-    // } else if (sv_type == "DEL" && alt_allele == ".") {
-    //     printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele);
-    //     alt_allele = "<DEL>";
-    // }
     
     if (start > end) {
         printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
@@ -44,7 +35,6 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     if (it != sv_calls.end() && it->start == start && it->end == end)
     {
         it->support += 1;  // Update the read support
-        // printMessage("Updating SV call with length " + std::to_string(end - start) + " and type " + sv_type + " and support " + std::to_string(it->support));
         if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
         {
             // Update the SV call
@@ -153,3 +143,11 @@ void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
         return sv_call.support < min_support;
     }), sv_calls.end());
 }
+
+void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_support, const std::string &data_type)
+{
+    // Filter SV calls with low read support only for the specified data type, keeping the rest
+    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support, data_type](const SVCall& sv_call) {
+        return sv_call.support < min_support && sv_call.data_type == data_type;
+    }), sv_calls.end());
+}

From 249cb6465c56c83fa3b617b42bf43a7b12e5f260 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 28 Jan 2025 22:13:49 -0500
Subject: [PATCH 063/134] resolve primary overlaps to increase speed

---
 include/sv_object.h |   8 +-
 src/sv_caller.cpp   | 292 ++++++++++++++++++++++++++++++--------------
 src/sv_object.cpp   |  17 ++-
 3 files changed, 221 insertions(+), 96 deletions(-)

diff --git a/include/sv_object.h b/include/sv_object.h
index e3e3a8ba..0b2a489c 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -19,13 +19,17 @@ struct SVCall {
     double hmm_likelihood = 0.0;
     int read_depth = 0;  // Breakpoint depth
     int support = 0;  // Number of supporting reads
+    int cluster_size = 0;  // Number of SV calls in the cluster
 
     // Comparison operator for std::set
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
+    SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
+        
+    // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
+    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
 };
 
 void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 03d3b718..2e24c2cf 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -20,6 +20,7 @@
 #include <fstream>
 #include <condition_variable>
 #include <bitset>
+#include <unordered_set>
 
 #include "ThreadPool.h"
 #include "utils.h"
@@ -58,6 +59,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     uint32_t supplementary_count = 0;
 
     // Main loop to process the alignments
+    std::unordered_map<std::string, uint8_t> primary_map_qual;
     uint32_t num_alignments = 0;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
@@ -71,15 +73,16 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
             // primary_map[qname] = itr;
             // Store chromosome (TID), start, and end positions (1-based) of the
-            // primary alignment, and the strand
+            // primary alignment, and the strand (true for forward, false for reverse)
             primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)};
+            primary_map_qual[qname] = bam1->core.qual;
             primary_count++;
 
         // Process supplementary alignments
         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
             // supp_map[qname].push_back(itr);
             // Store chromosome (TID), start, and end positions (1-based) of the
-            // supplementary alignment, and the strand
+            // supplementary alignment, and the strand (true for forward, false for reverse)
             supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)});
             supplementary_count++;
         }
@@ -102,7 +105,46 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
     printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
-    // printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments...");
+
+    // Filter overlapping primary alignments and keep the one with the highest mapping
+    // quality
+    // std::vector<std::string> to_remove_overlapping;
+    std::unordered_set<std::string> to_remove_overlapping;
+    for (const auto& entry1 : primary_map) {
+        const std::string& qname1 = entry1.first;
+        const GenomicRegion& primary1 = entry1.second;
+        for (const auto& entry2 : primary_map) {
+            const std::string& qname2 = entry2.first;
+            if (qname1 == qname2) {
+                continue;
+            }
+            const GenomicRegion& primary2 = entry2.second;
+            if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) {
+                // Overlapping primary alignments
+                // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2]));
+                if (primary_map_qual[qname1] < primary_map_qual[qname2]) {
+                    // to_remove_overlapping.push_back(qname1);
+                    to_remove_overlapping.insert(qname1);
+                } else {
+                    // If equal, remove the shorter alignment
+                    if (primary1.end - primary1.start < primary2.end - primary2.start) {
+                        // to_remove_overlapping.push_back(qname1);
+                        to_remove_overlapping.insert(qname1);
+                    } else {
+                        // to_remove_overlapping.push_back(qname2);
+                        to_remove_overlapping.insert(qname2);
+                    }
+                }
+            }
+        }
+    }
+
+    for (const std::string& qname : to_remove_overlapping) {
+        primary_map.erase(qname);
+        supp_map.erase(qname);
+    }
+    printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments");
+    printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supp_map.size()) + " supplementary alignments after filtering");
 }
 
 
@@ -278,7 +320,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // int filter_threshold = 10;  // Minimum number of supporting reads for an
     // SV call
     int cigar_sv_support_threshold = input_data.getMinReadSupport();  // Minimum number of supporting reads for an SV call
-    int split_sv_support_threshold = 4;  // Minimum number of supporting reads for an SV call
+    // int split_sv_support_threshold = 4;  // Minimum number of supporting
+    // reads for an SV call
+    int split_sv_support_threshold = input_data.getMinReadSupport();
     // printMessage("Processing chromosome " + chr + " with filter threshold: " + std::to_string(filter_threshold));
 
     // Open the BAM file
@@ -357,13 +401,14 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
 
     // Merge the SV calls from the current region
-    printMessage(chr + ": Merging split reads...");
-    filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT");
-    mergeSVs(chr_sv_calls);
+    // printMessage(chr + ": Merging split reads...");
+    // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold);
+    // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT");
+    // mergeSVs(chr_sv_calls);
 
     // Run a final merge on the combined SV calls
-    printMessage(chr + ": Merging final calls...");
-    mergeSVs(chr_sv_calls);
+    // printMessage(chr + ": Merging final calls...");
+    // mergeSVs(chr_sv_calls);
     printMessage("Completed chromosome " + chr);
 }
 
@@ -470,7 +515,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
     // Find split-read SV evidence
     // printMessage(region + ": Finding split-read SVs...");
-    int sv_count = 0;
     int current_primary = 0;
     int primary_count = primary_map.size();
     //int primary_count = primary_map.size();
@@ -478,104 +522,170 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
     for (auto& entry : primary_map) {
         current_primary++;
         const std::string& qname = entry.first;
-        GenomicRegion& primary_region = entry.second;
-        
+        GenomicRegion& primary = entry.second;
+        const std::string& primary_chr = bamHdr->target_name[primary.tid];
+
       	// Find the largest supplementary alignment
         auto& supp_regions = supp_map[qname];
-        GenomicRegion largest_supp_region = supp_regions[0];
+        // GenomicRegion largest_supp = supp_regions[0];
         auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) {
             return a.end - a.start < b.end - b.start;
         });
-        if (it != supp_regions.end()) {
-            largest_supp_region = *it;
-        }
-        
-        // GenomicRegion largest_supp_region = supp_map[qname][0];
-        // uint32_t largest_supp_length = 0;
+        GenomicRegion largest_supp = *it;
+
+        // If on a different chromosome, label as a translocation
+        if (primary.tid != largest_supp.tid) {
+            // Note that these do not currently have a likelihood score or read depth
+            // Create two BND records for the translocation
+            // Create the alternate allele format for the first BND record
+            const std::string& supp_chr = bamHdr->target_name[largest_supp.tid];
+            std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "[";
+            if (largest_supp.strand == false) {
+                // Reverse-oriented relative to the reference
+                alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
+            }
+            addSVCall(sv_calls, primary.start, primary.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0);
 
-        // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
-        const std::string& primary_chr = bamHdr->target_name[primary_region.tid];
-        // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();
-        // ++it) {
-        for (auto& supp_region : supp_regions) {
-            // GenomicRegion& supp_region = *it;
-
-            // Skip if not on the primary chromosome
-            if (primary_region.tid != supp_region.tid) {
-                continue;
+            // Create the alternate allele format for the second BND record
+            alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
+            if (primary.strand == false) {
+                // Reverse-oriented relative to the reference
+                alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
             }
+            addSVCall(sv_calls, largest_supp.start, largest_supp.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0);
 
-            // Get the supplementary alignment information
-            // uint32_t supp_start = (uint32_t) supp_region.start;
-            // uint32_t supp_end = (uint32_t) supp_region.end;
-            // uint32_t supp_length = supp_end - supp_start + 1;
-            // if (supp_length > largest_supp_length) {
-            //     largest_supp_length = supp_length;
-            //     largest_supp_region = *it;
-            // }
-
-            // Inversion detection
-            bool is_opposite_strand = primary_region.strand != supp_region.strand;
-            if (is_opposite_strand) {
-                // if (supp_length >= min_cnv_length) {
-                if (supp_region.end - supp_region.start >= min_cnv_length) {
-
-                    // Print error if the start position is greater than the end
-                    // position
-                    // if (supp_start > supp_end) {
-                    if (supp_region.start > supp_region.end) {
-                        printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end));
-                        // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
-                        continue;
-                    }
+            continue;
+        }
 
-                    // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
-                    // std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
-                    // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start));
-                    std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data);
-                    if (std::get<1>(result) == SVType::UNKNOWN) {
-                        continue;
-                    }
+        // Inversion detection
+        bool is_opposite_strand = primary.strand != largest_supp.strand;
+        if (is_opposite_strand) {
+            // if (supp_length >= min_cnv_length) {
+            if (largest_supp.end - largest_supp.start >= min_cnv_length) {
 
-                    double supp_lh = std::get<0>(result);
-                    SVType supp_type = std::get<1>(result);
-                    // printMessage("Test3");
-                    int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end);
-                    // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
-                    if (supp_type == SVType::NEUTRAL) {
-                        // addSVCall(sv_calls, supp_start, supp_end, "INV",
-                        // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                        addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                        
-                        sv_count++;
-                    } else if (supp_type == SVType::DUP) {
-                        // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
-                        // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                        addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    }
+                // Print error if the start position is greater than the end
+                // position
+                // if (supp_start > supp_end) {
+                if (largest_supp.start > largest_supp.end) {
+                    printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end));
+                    // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
+                    continue;
+                }
+
+                // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
+                // std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
+                // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start));
+                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data);
+                if (std::get<1>(result) == SVType::UNKNOWN) {
+                    continue;
+                }
+
+                double supp_lh = std::get<0>(result);
+                SVType supp_type = std::get<1>(result);
+                // printMessage("Test3");
+                int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end);
+                // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
+                if (supp_type == SVType::NEUTRAL) {
+                    // addSVCall(sv_calls, supp_start, supp_end, "INV",
+                    // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INV", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    continue;
+                    
+                } else if (supp_type == SVType::DUP) {
+                    // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
+                    // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INVDUP", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    continue;
                 }
             }
         }
+        
+        // GenomicRegion largest_supp_region = supp_map[qname][0];
+        // uint32_t largest_supp_length = 0;
+
+        // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
+        // const std::string& primary_chr = bamHdr->target_name[primary.tid];
+        // // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();
+        // // ++it) {
+        // for (auto& supp_region : supp_regions) {
+        //     // GenomicRegion& supp_region = *it;
+
+        //     // Skip if not on the primary chromosome
+        //     if (primary.tid != supp_region.tid) {
+        //         continue;
+        //     }
+
+        //     // Get the supplementary alignment information
+        //     // uint32_t supp_start = (uint32_t) supp_region.start;
+        //     // uint32_t supp_end = (uint32_t) supp_region.end;
+        //     // uint32_t supp_length = supp_end - supp_start + 1;
+        //     // if (supp_length > largest_supp_length) {
+        //     //     largest_supp_length = supp_length;
+        //     //     largest_supp_region = *it;
+        //     // }
+
+        //     // Inversion detection
+        //     bool is_opposite_strand = primary.strand != supp_region.strand;
+        //     if (is_opposite_strand) {
+        //         // if (supp_length >= min_cnv_length) {
+        //         if (supp_region.end - supp_region.start >= min_cnv_length) {
+
+        //             // Print error if the start position is greater than the end
+        //             // position
+        //             // if (supp_start > supp_end) {
+        //             if (supp_region.start > supp_region.end) {
+        //                 printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end));
+        //                 // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
+        //                 continue;
+        //             }
+
+        //             // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
+        //             // std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
+        //             // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start));
+        //             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data);
+        //             if (std::get<1>(result) == SVType::UNKNOWN) {
+        //                 continue;
+        //             }
+
+        //             double supp_lh = std::get<0>(result);
+        //             SVType supp_type = std::get<1>(result);
+        //             // printMessage("Test3");
+        //             int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end);
+        //             // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
+        //             if (supp_type == SVType::NEUTRAL) {
+        //                 // addSVCall(sv_calls, supp_start, supp_end, "INV",
+        //                 // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+        //                 addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                        
+        //                 sv_count++;
+        //             } else if (supp_type == SVType::DUP) {
+        //                 // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
+        //                 // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+        //                 addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+        //             }
+        //         }
+        //     }
+        // }
 
         // Analyze split-read evidence for deletions and duplications
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
-        boundary_left = std::min(primary_region.start, largest_supp_region.start);
-        boundary_right = std::max(primary_region.end, largest_supp_region.end);
-        gap_left = std::min(primary_region.end, largest_supp_region.start);
-        gap_right = std::max(primary_region.start, largest_supp_region.end);
+        boundary_left = std::min(primary.start, largest_supp.start);
+        boundary_right = std::max(primary.end, largest_supp.end);
+        gap_left = std::min(primary.end, largest_supp.start);
+        gap_right = std::max(primary.start, largest_supp.end);
         gap_exists = gap_left < gap_right;
-        // if (primary_region.start < largest_supp_region.start) {  // Primary before supp
-        //     boundary_left = primary_region.start;
-        //     boundary_right = std::max(primary_region.end, largest_supp_region.end);
-        //     gap_left = primary_region.end;
+        // if (primary.start < largest_supp_region.start) {  // Primary before supp
+        //     boundary_left = primary.start;
+        //     boundary_right = std::max(primary.end, largest_supp_region.end);
+        //     gap_left = primary.end;
         //     gap_right = largest_supp_region.start;
         //     gap_exists = gap_left < gap_right;
         // } else {
         //     boundary_left = largest_supp_region.start;
-        //     boundary_right = std::max(primary_region.end, largest_supp_region.end);
+        //     boundary_right = std::max(primary.end, largest_supp_region.end);
         //     gap_left = largest_supp_region.end;
-        //     gap_right = primary_region.start;
+        //     gap_right = primary.start;
         //     gap_exists = gap_left < gap_right;
         // }
         
@@ -680,6 +790,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Feature used to identify the structural variant\">",
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
         "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
+        "##INFO=<ID=CLUSTER,Number=1,Type=Integer,Description=\"Cluster size\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
         "##FILTER=<ID=LowQual,Description=\"Low quality\">",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
@@ -731,6 +842,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             std::string alt_allele = sv_call.alt_allele;
             double hmm_likelihood = sv_call.hmm_likelihood;
             int sv_length = end - start + 1;
+            int cluster_size = sv_call.cluster_size;
             /*
             if (sv_type_str == "DEL") {
             	sv_length++;
@@ -799,15 +911,15 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 }
             }
 
-            // // Print the REF allele if SVTYPE = DUP
-            // if (sv_type_str == "DUP") {
-            //     printMessage("REF allele for DUP at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + ": " + ref_allele + ", ALT allele: " + alt_allele);
-            // }
+            // Print the REF allele if SVTYPE = DUP and if it is empty or "." (symbolic)
+            if (sv_type_str == "DUP" && (ref_allele == "" || ref_allele == ".")) {
+                printMessage("REF allele for DUP at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + ": " + ref_allele + ", ALT allele: " + alt_allele);
+            }
 
             // Create the VCF parameter strings
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
-                ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support);
+                ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
                 
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 6a953ec6..db2b3955 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -27,7 +27,9 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     }
 
     // Insert the SV call in sorted order
-    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1};
+    // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype,
+    // hmm_likelihood, read_depth, 1};
+    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
 
     // Update the SV type if the SV call already exists (if likelihood is
@@ -51,7 +53,9 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
 void updateSVType(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood)
 {
     // Update the SV type for an existing SV call
-    auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall{start, end, "", "", "", "", 0.0, 0, 0});
+    // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(),
+    // SVCall{start, end, "", "", "", "", 0.0, 0, 0});
+    auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall(start, end, "", "", "", "", 0.0, 0, 0, 0));
     if (it != sv_calls.end() && it->start == start && it->end == end)
     {
         it->sv_type = sv_type;
@@ -103,14 +107,17 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
                 //XprintMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction));
                 // Keep the SV call with the higher read support
                 if (next.support > current_merge.support) {
+                    next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
                     current_merge = next;
                 } else if (next.support == current_merge.support) {
                     // Keep the SV call with the higher likelihood
                     if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) {
+                        next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
                         current_merge = next;
                     } else if (next.hmm_likelihood == current_merge.hmm_likelihood) {
                         // Keep the SV call with the higher read depth
                         if (next.read_depth > current_merge.read_depth) {
+                            next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
                             current_merge = next;
                         }
                     }
@@ -120,10 +127,12 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
 				uint32_t current_length = current_merge.end - current_merge.start;
 				uint32_t next_length = next.end - next.start;
 				if (next_length > current_length) {  // And support meets threshold
+                    next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
 					current_merge = next;
 				}
             }
         } else {
+            // Store the merged SV call and move to the next SV call
             merged_sv_calls.push_back(current_merge);
             current_merge = next;
         }
@@ -138,9 +147,9 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
 
 void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
 {
-    // Filter SV calls with low read support
+    // Remove SV calls only when both read support and cluster size are below min_support
     sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {
-        return sv_call.support < min_support;
+        return sv_call.support < min_support && sv_call.cluster_size < min_support;
     }), sv_calls.end());
 }
 

From 29333bbae64388fadfbbb58878955581bb591de9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 29 Jan 2025 12:29:44 -0500
Subject: [PATCH 064/134] fix errors preventing 1 recall for ins dup

---
 include/sv_object.h | 11 ++++++++---
 include/sv_types.h  | 12 ++++++++++++
 src/cnv_caller.cpp  | 48 +++++++++++++++++++++++++++++----------------
 src/sv_caller.cpp   | 29 ++++++++++++++-------------
 src/sv_object.cpp   | 24 +++++------------------
 5 files changed, 71 insertions(+), 53 deletions(-)

diff --git a/include/sv_object.h b/include/sv_object.h
index 0b2a489c..43267e0e 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -8,11 +8,16 @@
 #include <stdexcept>
 #include <unordered_map>
 
+#include "sv_types.h"
+
+using namespace sv_types;
+
 // Struct to represent a structural variant call
 struct SVCall {
     uint32_t start;
     uint32_t end;
-    std::string sv_type = "NA";
+    // std::string sv_type = "NA";
+    SVType sv_type = SVType::UNKNOWN;
     std::string alt_allele = ".";
     std::string data_type = "NA";
     std::string genotype = "./.";
@@ -25,14 +30,14 @@ struct SVCall {
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
+    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
         start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
         
     // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
     //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
 };
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
 
 void mergeSVs(std::vector<SVCall>& sv_calls);
 
diff --git a/include/sv_types.h b/include/sv_types.h
index 60471a01..c0e1fabf 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -57,6 +57,18 @@ namespace sv_types {
     inline SVType getSVTypeFromCNState(int cn_state) {
         return CNVTypeMap.at(cn_state);
     }
+
+    // Function to check if an SV type is a valid update from copy number predictions
+    inline bool isValidCopyNumberUpdate(SVType sv_type, SVType updated_sv_type) {
+        if (updated_sv_type == SVType::UNKNOWN) {
+            return false;
+        } else if (sv_type == SVType::DEL && updated_sv_type != SVType::DEL) {
+            return false;
+        } else if (sv_type == SVType::INS && updated_sv_type != SVType::DUP) {
+            return false;
+        }
+        return true;
+    }
 }
 
 #endif // SV_TYPES_H
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 344288c4..16d62d4f 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -246,29 +246,43 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             max_state = 0;
         }
 
-        // Update the SV calls with the CNV type and genotype
+        // Update the SV information if it does not conflict with the current SV type
         SVType updated_sv_type = getSVTypeFromCNState(max_state);
-        std::string genotype = cnv_genotype_map.at(max_state);
-
-        // Determine the SV calling method used to call the SV
-        std::string data_type;
-        data_type = "HMM";
-
-        // Update the SV genotype if known
-        // printMessage("Updating SV call for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "...");
-        if (updated_sv_type != SVType::UNKNOWN)
+        bool is_valid_update = isValidCopyNumberUpdate(sv_call.sv_type, updated_sv_type);
+        // if (updated_sv_type != SVType::UNKNOWN && updated_sv_type !=
+        // SVType::NEUTRAL)
+        if (is_valid_update)
         {
+            std::string genotype = cnv_genotype_map.at(max_state);
+            std::string data_type = "CIGAR+HMM";
+            // std::string sv_type_str = getSVTypeString(updated_sv_type);
+            sv_call.sv_type = updated_sv_type;
+            sv_call.hmm_likelihood = likelihood;
             sv_call.genotype = genotype;
             sv_call.data_type = data_type;
-            sv_call.hmm_likelihood = likelihood;
         }
 
-        // Update the SV type if known
-        if (updated_sv_type != SVType::UNKNOWN && updated_sv_type != SVType::NEUTRAL)
-        {
-            std::string sv_type_str = getSVTypeString(updated_sv_type);
-            sv_call.sv_type = sv_type_str;
-        }
+        // Update the SV genotype if known
+        // printMessage("Updating SV call for " + chr + ":" +
+        // std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) +
+        // "...");
+        // std::string genotype = cnv_genotype_map.at(max_state);
+        // std::string data_type = "CIGAR+HMM";
+        // if (updated_sv_type != SVType::UNKNOWN)
+        // {
+        //     sv_call.genotype = genotype;
+        //     sv_call.data_type = data_type;
+        //     sv_call.hmm_likelihood = likelihood;
+        // }
+
+        // Update the SV type if known and it does not conflict with the current
+        // SV type
+        // SVType updated_sv_type = getSVTypeFromCNState(max_state);
+        // if (updated_sv_type != SVType::UNKNOWN && updated_sv_type != SVType::NEUTRAL)
+        // {
+        //     std::string sv_type_str = getSVTypeString(updated_sv_type);
+        //     sv_call.sv_type = sv_type_str;
+        // }
     }
 }
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 2e24c2cf..3da0dbe6 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -257,7 +257,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
+                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
                         continue;
                     }
                 }
@@ -271,7 +271,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, "DUP", "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
+                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
                         continue;
                     }
                 }
@@ -289,7 +289,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     alt_allele = ins_seq_str;
                 }
                 
-                addSVCall(sv_calls, ins_pos, ins_end, "INS", alt_allele, "CIGARINS", "./.", default_lh, read_depth);
+                addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -297,7 +297,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                addSVCall(sv_calls, ref_pos, ref_end, "DEL", "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
+                addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
             }
         }
 
@@ -383,8 +383,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
     printMessage(chr + ": Merging CIGAR...");
-    filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
-    mergeSVs(chr_sv_calls);
+    // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
+    // mergeSVs(chr_sv_calls);
+    // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
@@ -544,7 +545,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
             }
-            addSVCall(sv_calls, primary.start, primary.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0);
+            addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
 
             // Create the alternate allele format for the second BND record
             alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
@@ -552,7 +553,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
             }
-            addSVCall(sv_calls, largest_supp.start, largest_supp.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0);
+            addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
 
             continue;
         }
@@ -588,13 +589,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (supp_type == SVType::NEUTRAL) {
                     // addSVCall(sv_calls, supp_start, supp_end, "INV",
                     // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INV", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
                     continue;
                     
                 } else if (supp_type == SVType::DUP) {
                     // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
                     // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INVDUP", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
                     continue;
                 }
             }
@@ -733,18 +734,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-                    addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "SPLIT", "./.", gap_lh, read_depth);
+                    addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                    addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth);
+                    addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth);
+                addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
             }
         }
 
@@ -836,7 +837,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Get the SV candidate and SV info
             uint32_t start = sv_call.start;
             uint32_t end = sv_call.end;
-            std::string sv_type_str = sv_call.sv_type;
+            std::string sv_type_str = getSVTypeString(sv_call.sv_type);
             std::string genotype = sv_call.genotype;
             std::string data_type_str = sv_call.data_type;
             std::string alt_allele = sv_call.alt_allele;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index db2b3955..2c980e60 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -14,10 +14,13 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
 {
     // Ignore unknown SV types
-    if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
+    // if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
+    //     return;
+    // }
+    if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
         return;
     }
     
@@ -50,23 +53,6 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std:
     }
 }
 
-void updateSVType(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood)
-{
-    // Update the SV type for an existing SV call
-    // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(),
-    // SVCall{start, end, "", "", "", "", 0.0, 0, 0});
-    auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall(start, end, "", "", "", "", 0.0, 0, 0, 0));
-    if (it != sv_calls.end() && it->start == start && it->end == end)
-    {
-        it->sv_type = sv_type;
-        it->data_type = data_type;
-        it->genotype = genotype;
-        it->hmm_likelihood = hmm_likelihood;
-    } else {
-        printError("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end));
-    }
-}
-
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls)
 {
     return (uint32_t) sv_calls.size();

From 0b3d205d4601cfcb779567b4549580ef7c3a23f0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 30 Jan 2025 00:11:43 -0500
Subject: [PATCH 065/134] high recall merging

---
 include/sv_caller.h |   1 +
 include/sv_object.h |   9 ++--
 src/sv_caller.cpp   |  48 +++++++++++------
 src/sv_object.cpp   | 125 ++++++++++++++++++++++++--------------------
 4 files changed, 106 insertions(+), 77 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 5cfbb956..1824015c 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -21,6 +21,7 @@ struct GenomicRegion {
     hts_pos_t start;
     hts_pos_t end;
     bool strand;
+    uint8_t qual;
 };
 
 struct MismatchData {
diff --git a/include/sv_object.h b/include/sv_object.h
index 43267e0e..8ccbf79f 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -25,19 +25,22 @@ struct SVCall {
     int read_depth = 0;  // Breakpoint depth
     int support = 0;  // Number of supporting reads
     int cluster_size = 0;  // Number of SV calls in the cluster
+    uint8_t qual = 0;  // Alignment quality score
 
     // Comparison operator for std::set
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
+    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size, uint8_t qual) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size), qual(qual) {}
+    // SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
+    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
         
     // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
     //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
 };
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual);
 
 void mergeSVs(std::vector<SVCall>& sv_calls);
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3da0dbe6..0839e310 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -59,7 +59,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     uint32_t supplementary_count = 0;
 
     // Main loop to process the alignments
-    std::unordered_map<std::string, uint8_t> primary_map_qual;
+    // std::unordered_map<std::string, uint8_t> primary_map_qual;
     uint32_t num_alignments = 0;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
@@ -68,14 +68,15 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
             continue;
         }
         const std::string qname = bam_get_qname(bam1);  // Query template name
+        uint8_t mapq = bam1->core.qual;  // Mapping quality
 
         // Process primary alignments
         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
             // primary_map[qname] = itr;
             // Store chromosome (TID), start, and end positions (1-based) of the
             // primary alignment, and the strand (true for forward, false for reverse)
-            primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)};
-            primary_map_qual[qname] = bam1->core.qual;
+            primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq};
+            // primary_map_qual[qname] = bam1->core.qual;
             primary_count++;
 
         // Process supplementary alignments
@@ -83,7 +84,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
             // supp_map[qname].push_back(itr);
             // Store chromosome (TID), start, and end positions (1-based) of the
             // supplementary alignment, and the strand (true for forward, false for reverse)
-            supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)});
+            supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq});
             supplementary_count++;
         }
         num_alignments++;
@@ -122,7 +123,8 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
             if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) {
                 // Overlapping primary alignments
                 // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2]));
-                if (primary_map_qual[qname1] < primary_map_qual[qname2]) {
+                // if (primary_map_qual[qname1] < primary_map_qual[qname2]) {
+                if (primary1.qual < primary2.qual) {
                     // to_remove_overlapping.push_back(qname1);
                     to_remove_overlapping.insert(qname1);
                 } else {
@@ -217,6 +219,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
     uint32_t query_pos = 0;
+    uint8_t qual = alignment->core.qual;
 
     // Loop through the CIGAR string, process operations, detect SVs (primary
     // only), and calculate sequence identity for potential duplications (primary only)
@@ -257,7 +260,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
+                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, qual);
                         continue;
                     }
                 }
@@ -271,7 +274,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
+                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, qual);
                         continue;
                     }
                 }
@@ -289,7 +292,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     alt_allele = ins_seq_str;
                 }
                 
-                addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth);
+                addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, qual);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -297,7 +300,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
+                addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth, qual);
+
+                // Print if the ref pos is within the range 44007800-44007930
+                if (ref_pos >= 44007800 && ref_pos <= 44007930) {
+                    printMessage("DEL: " + chr + ":" + std::to_string(ref_pos) + "-" + std::to_string(ref_end) + " (LENGTH " + std::to_string(op_len) + ")");
+                }
             }
         }
 
@@ -384,7 +392,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     printMessage(chr + ": Merging CIGAR...");
     // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
-    // mergeSVs(chr_sv_calls);
+    mergeSVs(chr_sv_calls);
     // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
@@ -401,6 +409,11 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     printMessage(chr + ": Split read SVs...");
     this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
 
+    // Sort the SV calls by start position
+    std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+        return a.start < b.start;
+    });
+
     // Merge the SV calls from the current region
     // printMessage(chr + ": Merging split reads...");
     // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold);
@@ -545,7 +558,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
             }
-            addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
+            addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual);
 
             // Create the alternate allele format for the second BND record
             alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
@@ -553,7 +566,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
             }
-            addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
+            addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual);
 
             continue;
         }
@@ -589,13 +602,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (supp_type == SVType::NEUTRAL) {
                     // addSVCall(sv_calls, supp_start, supp_end, "INV",
                     // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
                     continue;
                     
                 } else if (supp_type == SVType::DUP) {
                     // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
                     // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
                     continue;
                 }
             }
@@ -669,6 +682,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         // }
 
         // Analyze split-read evidence for deletions and duplications
+        uint8_t mean_qual = (primary.qual + largest_supp.qual) / 2;
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         boundary_left = std::min(primary.start, largest_supp.start);
@@ -734,18 +748,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-                    addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth);
+                    addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual);
                 } else {
                     // Add the boundary as the SV call
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                    addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
+                    addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
                 }
             } else {
                 // Add the boundary as the SV call
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
+                addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
             }
         }
 
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 2c980e60..748fe01d 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -6,6 +6,7 @@
 #include <cmath>
 #include <stdexcept>
 #include <iostream>
+#include <numeric>
 
 #include "utils.h"
 
@@ -14,7 +15,7 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual)
 {
     // Ignore unknown SV types
     // if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
@@ -32,14 +33,15 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVTy
     // Insert the SV call in sorted order
     // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype,
     // hmm_likelihood, read_depth, 1};
-    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1};
+    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
 
-    // Update the SV type if the SV call already exists (if likelihood is
-    // higher)
+    // Determine if the SV call already exists
     if (it != sv_calls.end() && it->start == start && it->end == end)
     {
         it->support += 1;  // Update the read support
+
+        // Update SV type if likelihood is higher
         if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
         {
             // Update the SV call
@@ -47,6 +49,7 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVTy
             it->data_type = data_type;
             it->genotype = genotype;
             it->hmm_likelihood = hmm_likelihood;
+            it->qual = qual;
         }
     } else {
         sv_calls.insert(it, sv_call);  // Insert the new SV call
@@ -68,67 +71,75 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
     if (sv_calls.size() < 2) {
         return;
     }
+    int initial_size = sv_calls.size();
 
-    // Merge SV calls if they overlap
-    // int initial_size = sv_calls.size();
-    
-    // Merge any SV calls that have >90% reciprocal overlap
+    std::vector<bool> merged(sv_calls.size(), false);
     std::vector<SVCall> merged_sv_calls;
-    SVCall current_merge = sv_calls[0];
-    for (size_t i = 1; i < sv_calls.size(); i++) {
-        SVCall& next = sv_calls[i];
-        // Check for overlap
-        if (next.start <= current_merge.end) {
-            //XprintMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")");
-            
-            // if (current_merge.start <= next.end && next.start <= current_merge.end) {
-            // Calculate reciprocal overlap
-            uint32_t overlap = std::max(0, (int)std::min(current_merge.end, next.end) - (int)std::max(current_merge.start, next.start));
-            uint32_t union_length = std::max(current_merge.end, next.end) - std::min(current_merge.start, next.start);
-            double overlap_fraction = static_cast<double>(overlap) / union_length;
-            //XprintMessage("Overlap fraction: " + std::to_string(overlap_fraction));
-
-            // Merge if reciprocal overlap is >90%
-            if (overlap_fraction > 0.90) {
-                //XprintMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction));
-                // Keep the SV call with the higher read support
-                if (next.support > current_merge.support) {
-                    next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
-                    current_merge = next;
-                } else if (next.support == current_merge.support) {
-                    // Keep the SV call with the higher likelihood
-                    if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) {
-                        next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
-                        current_merge = next;
-                    } else if (next.hmm_likelihood == current_merge.hmm_likelihood) {
-                        // Keep the SV call with the higher read depth
-                        if (next.read_depth > current_merge.read_depth) {
-                            next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
-                            current_merge = next;
-                        }
-                    }
+
+    // Sort SVs by start position to improve efficiency
+    std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+        return a.start < b.start;
+    });
+
+    for (size_t i = 0; i < sv_calls.size(); i++) {
+        if (merged[i]) continue;
+
+        size_t best_index = i;
+        int total_cluster_size = sv_calls[i].cluster_size;  // Track total cluster size
+
+        for (size_t j = i + 1; j < sv_calls.size(); j++) {
+            if (merged[j]) continue;
+
+            // Compute overlap
+            uint32_t overlap_start = std::max(sv_calls[i].start, sv_calls[j].start);
+            uint32_t overlap_end = std::min(sv_calls[i].end, sv_calls[j].end);
+            uint32_t overlap_length = (overlap_end > overlap_start) ? (overlap_end - overlap_start) : 0;
+
+            // Compute union length correctly
+            uint32_t union_start = std::min(sv_calls[i].start, sv_calls[j].start);
+            uint32_t union_end = std::max(sv_calls[i].end, sv_calls[j].end);
+            uint32_t union_length = union_end - union_start;  // No +1 to prevent off-by-one errors
+
+            double overlap_fraction = (union_length > 0) ? (static_cast<double>(overlap_length) / union_length) : 0.0;
+
+            // Throw error if fraction > 1
+            if (overlap_fraction > 1.0) {
+                throw std::runtime_error("Error: Overlap fraction = " + std::to_string(overlap_fraction) + " > 1.0");
+            }
+
+            // if (overlap_fraction > 0.5) {
+            if (overlap_fraction > 0.5) {  // Changed from 0.5
+                total_cluster_size += sv_calls[j].cluster_size;
+                if (sv_calls[j].support > sv_calls[best_index].support) {
+                    best_index = j;
                 }
-            } else {
-            	// Continue with the larger length
-				uint32_t current_length = current_merge.end - current_merge.start;
-				uint32_t next_length = next.end - next.start;
-				if (next_length > current_length) {  // And support meets threshold
-                    next.cluster_size = current_merge.cluster_size + 1;  // Update the cluster size
-					current_merge = next;
-				}
+                merged[j] = true;  // Mark SV as merged
             }
-        } else {
-            // Store the merged SV call and move to the next SV call
-            merged_sv_calls.push_back(current_merge);
-            current_merge = next;
         }
+
+        sv_calls[best_index].cluster_size = total_cluster_size; // Update best SV with total size
+        merged_sv_calls.push_back(sv_calls[best_index]); // Keep the strongest SV
     }
 
-    merged_sv_calls.push_back(current_merge);  // Add the last SV call
-    sv_calls = merged_sv_calls;  // Update the SV calls
+    // Filter out merged SVs with low support or cluster size
+    // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) {
+    //     return sv_call.support < 2 && sv_call.cluster_size < 10;  // Adjust thresholds as needed
+    // }), merged_sv_calls.end());
+    // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) {
+    //     return sv_call.support < 2 && sv_call.cluster_size < 3;  // Adjust thresholds as needed
+    // }), merged_sv_calls.end());
+
+    sv_calls = std::move(merged_sv_calls); // Replace with filtered list
+
+    // Print SVs that have length 2039
+    for (const auto& sv_call : sv_calls) {
+        if (sv_call.end - sv_call.start == 2039) {
+            printMessage("Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")");
+        }
+    }
 
-    // int updated_size = sv_calls.size();
-    // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl;
+    int updated_size = sv_calls.size();
+    printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
 }
 
 void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)

From 8e4da06e32d71a93886c7761aa1786466145ceb2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 30 Jan 2025 18:43:51 -0500
Subject: [PATCH 066/134] improved merging to obtain 1 recall for sv types in
 chr21

---
 src/sv_object.cpp | 82 +++++++++++++++++++++++++++--------------------
 1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 748fe01d..5e31f3e8 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -31,8 +31,6 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVTy
     }
 
     // Insert the SV call in sorted order
-    // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype,
-    // hmm_likelihood, read_depth, 1};
     SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
 
@@ -84,57 +82,71 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
     for (size_t i = 0; i < sv_calls.size(); i++) {
         if (merged[i]) continue;
 
-        size_t best_index = i;
-        int total_cluster_size = sv_calls[i].cluster_size;  // Track total cluster size
+        std::vector<SVCall> cluster;
+        cluster.push_back(sv_calls[i]);
+        merged[i] = true;
 
+        // Use 10% of the length of the first SV as the threshold
+        uint32_t sv_a_window = (uint32_t) std::ceil((double) (sv_calls[i].end - sv_calls[i].start + 1) * 0.1);
+
+        // Find SVs that have start or end positions within 10% of each other's length
         for (size_t j = i + 1; j < sv_calls.size(); j++) {
             if (merged[j]) continue;
 
-            // Compute overlap
-            uint32_t overlap_start = std::max(sv_calls[i].start, sv_calls[j].start);
-            uint32_t overlap_end = std::min(sv_calls[i].end, sv_calls[j].end);
-            uint32_t overlap_length = (overlap_end > overlap_start) ? (overlap_end - overlap_start) : 0;
+            // Check if the SVs are within 10% of the largest SV's length
+            uint32_t sv_b_window = (uint32_t) std::ceil((double) (sv_calls[j].end - sv_calls[j].start + 1) * 0.1);
+            uint32_t sv_window = std::max(sv_a_window, sv_b_window);
+            bool start_within_window = std::abs((int) sv_calls[j].start - (int) sv_calls[i].start) <= (int) sv_window;
+            bool end_within_window = std::abs((int) sv_calls[j].end - (int) sv_calls[i].end) <= (int) sv_window;
+            if (start_within_window && end_within_window) {
+                cluster.push_back(sv_calls[j]);
+                merged[j] = true;
+            }
+        }
 
-            // Compute union length correctly
-            uint32_t union_start = std::min(sv_calls[i].start, sv_calls[j].start);
-            uint32_t union_end = std::max(sv_calls[i].end, sv_calls[j].end);
-            uint32_t union_length = union_end - union_start;  // No +1 to prevent off-by-one errors
+        // Remove clusters with single SVs that have low support
+        if (cluster.size() < 2 && cluster[0].support < 2) {
+            continue;
+        }
 
-            double overlap_fraction = (union_length > 0) ? (static_cast<double>(overlap_length) / union_length) : 0.0;
+        std::vector<SVCall> filtered_cluster = cluster;
 
-            // Throw error if fraction > 1
-            if (overlap_fraction > 1.0) {
-                throw std::runtime_error("Error: Overlap fraction = " + std::to_string(overlap_fraction) + " > 1.0");
+        // If any SV length equals 2039, print all the SV calls in the cluster
+        bool found_2039 = false;
+        for (const auto& sv : filtered_cluster) {
+            if (sv.end - sv.start == 2039) {
+                printMessage("[TEST] Found SV with length 2039 at " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ")");
+                found_2039 = true;
             }
-
-            // if (overlap_fraction > 0.5) {
-            if (overlap_fraction > 0.5) {  // Changed from 0.5
-                total_cluster_size += sv_calls[j].cluster_size;
-                if (sv_calls[j].support > sv_calls[best_index].support) {
-                    best_index = j;
-                }
-                merged[j] = true;  // Mark SV as merged
+        }
+        if (found_2039) {
+            std::cout << "[TEST] Cluster of SVs with size " << filtered_cluster.size() << ":" << std::endl;
+            for (const auto& sv : filtered_cluster) {
+                printMessage("SV: " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ", LEN=" + std::to_string(sv.end - sv.start) + ")");
             }
         }
 
-        sv_calls[best_index].cluster_size = total_cluster_size; // Update best SV with total size
-        merged_sv_calls.push_back(sv_calls[best_index]); // Keep the strongest SV
-    }
+        // Find the median-length SV in the cluster and use it as the merged SV
+        // Sort the cluster by length
+        std::sort(filtered_cluster.begin(), filtered_cluster.end(), [](const SVCall& a, const SVCall& b) {
+            return (a.end - a.start) < (b.end - b.start);
+        });
 
-    // Filter out merged SVs with low support or cluster size
-    // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) {
-    //     return sv_call.support < 2 && sv_call.cluster_size < 10;  // Adjust thresholds as needed
-    // }), merged_sv_calls.end());
-    // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) {
-    //     return sv_call.support < 2 && sv_call.cluster_size < 3;  // Adjust thresholds as needed
-    // }), merged_sv_calls.end());
+        // Get the median SV
+        size_t median_index = filtered_cluster.size() / 2;
+        SVCall median_sv = filtered_cluster[median_index];
 
+        median_sv.cluster_size = (int) cluster.size();
+        
+        // Add the merged SV to the list
+        merged_sv_calls.push_back(median_sv);
+    }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
 
     // Print SVs that have length 2039
     for (const auto& sv_call : sv_calls) {
         if (sv_call.end - sv_call.start == 2039) {
-            printMessage("Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")");
+            printMessage("[TEST] Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")");
         }
     }
 

From 07551d975e6765089cfe936b2307d75a3f16025f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 31 Jan 2025 20:16:52 -0500
Subject: [PATCH 067/134] dbscan in cpp

---
 include/dbscan.h    |  31 +++++++++
 include/sv_object.h |   5 --
 src/dbscan.cpp      |  79 +++++++++++++++++++++++
 src/sv_caller.cpp   |   1 -
 src/sv_object.cpp   | 149 ++++++++++++++++++++++++--------------------
 5 files changed, 192 insertions(+), 73 deletions(-)
 create mode 100644 include/dbscan.h
 create mode 100644 src/dbscan.cpp

diff --git a/include/dbscan.h b/include/dbscan.h
new file mode 100644
index 00000000..923570c5
--- /dev/null
+++ b/include/dbscan.h
@@ -0,0 +1,31 @@
+#ifndef DBSCAN_H
+#define DBSCAN_H
+
+#include <vector>
+#include <utility>
+#include <cmath>
+#include <algorithm>
+
+#include "sv_object.h"
+
+class DBSCAN {
+    public:
+        DBSCAN(double epsilon, int minPts) : epsilon(epsilon), minPts(minPts) {}
+
+        void fit(const std::vector<SVCall>& sv_calls);
+
+        const std::vector<int>& getClusters() const;
+
+    private:
+        double epsilon;
+        int minPts;
+        std::vector<int> clusters;
+
+        bool expandCluster(const std::vector<SVCall>& sv_calls, size_t pointIdx, int clusterId);
+
+        std::vector<size_t> regionQuery(const std::vector<SVCall>& sv_calls, size_t pointIdx) const;
+
+        double distance(const SVCall& a, const SVCall& b) const;
+};
+
+#endif // DBSCAN_H
diff --git a/include/sv_object.h b/include/sv_object.h
index 8ccbf79f..cf52cac6 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -33,11 +33,6 @@ struct SVCall {
     // Constructor with parameters for all fields
     SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size, uint8_t qual) :
         start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size), qual(qual) {}
-    // SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
-    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
-        
-    // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) :
-    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {}
 };
 
 void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual);
diff --git a/src/dbscan.cpp b/src/dbscan.cpp
new file mode 100644
index 00000000..d7cc3aeb
--- /dev/null
+++ b/src/dbscan.cpp
@@ -0,0 +1,79 @@
+#include "dbscan.h"
+
+#include <vector>
+#include <utility>
+#include <cmath>
+#include <algorithm>
+
+
+void DBSCAN::fit(const std::vector<SVCall>& sv_calls) {  // labels every call in `clusters`
+    int clusterId = 0;
+    // Reset the labels: one entry per SV call, all starting out unclassified
+    clusters.assign(sv_calls.size(), -1); // -1 means unclassified
+
+    // Visit calls in order; ones already labelled by an earlier expansion are skipped
+    for (size_t i = 0; i < sv_calls.size(); ++i) {
+        if (clusters[i] == -1) { // if point is not yet classified
+            // Advance the id only when a cluster was actually formed (noise returns false)
+            if (expandCluster(sv_calls, i, clusterId)) {
+                ++clusterId;
+            }
+        }
+    }
+}
+
+const std::vector<int>& DBSCAN::getClusters() const {
+    return clusters;  // labels written by fit(): one per SV call, negative = not in a cluster
+}
+
+// Try to grow cluster `clusterId` outward from sv_calls[pointIdx]. Returns false (after
+// marking the point -2) when its neighborhood holds fewer than minPts calls.
+bool DBSCAN::expandCluster(const std::vector<SVCall>& sv_calls, size_t pointIdx, int clusterId) {
+    std::vector<size_t> seeds = regionQuery(sv_calls, pointIdx);
+    if (static_cast<int>(seeds.size()) < minPts) {
+        clusters[pointIdx] = -2; // mark as noise
+        return false;
+    }
+
+    for (size_t seedIdx : seeds) {
+        clusters[seedIdx] = clusterId;  // overwrites whatever label the neighbor had before
+    }
+
+    seeds.erase(std::remove(seeds.begin(), seeds.end(), pointIdx), seeds.end());  // origin was already queried
+
+    while (!seeds.empty()) {
+        size_t currentPoint = seeds.back();
+        seeds.pop_back();
+
+        std::vector<size_t> result = regionQuery(sv_calls, currentPoint);
+        if (static_cast<int>(result.size()) >= minPts) {  // only dense points extend the cluster
+            for (size_t resultPoint : result) {
+                if (clusters[resultPoint] == -1 || clusters[resultPoint] == -2) {
+                    if (clusters[resultPoint] == -1) {  // noise (-2) is absorbed but not expanded
+                        seeds.push_back(resultPoint);
+                    }
+                    clusters[resultPoint] = clusterId;
+                }
+            }
+        }
+    }
+
+    return true;
+}
+
+std::vector<size_t> DBSCAN::regionQuery(const std::vector<SVCall>& sv_calls, size_t pointIdx) const {
+    std::vector<size_t> neighbors;  // indices into sv_calls that lie within epsilon of pointIdx
+    for (size_t i = 0; i < sv_calls.size(); ++i) {  // linear scan over every call
+        if (distance(sv_calls[pointIdx], sv_calls[i]) <= epsilon) {  // i == pointIdx is tested too
+            neighbors.push_back(i);
+        }
+    }
+    return neighbors;
+}
+
+double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const {
+    // return std::sqrt(std::pow(point1.first - point2.first, 2) +
+    // std::pow(point1.second - point2.second, 2));
+    return std::sqrt(std::pow(static_cast<double>(point1.start) - static_cast<double>(point2.start), 2) +
+                     std::pow(static_cast<double>(point1.end) - static_cast<double>(point2.end), 2));
+}
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 0839e310..d7f79d01 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -373,7 +373,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         int region_start = region_data.first;
         int region_end = region_data.second;
         region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
-        
     }
 
     // Load chromosome data for copy number predictions
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 5e31f3e8..0d4cec97 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -8,6 +8,7 @@
 #include <iostream>
 #include <numeric>
 
+#include "dbscan.h"
 #include "utils.h"
 
 bool SVCall::operator<(const SVCall & other) const
@@ -71,84 +72,98 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
     }
     int initial_size = sv_calls.size();
 
-    std::vector<bool> merged(sv_calls.size(), false);
+    // Cluster SVs using DBSCAN for each SV type
     std::vector<SVCall> merged_sv_calls;
 
-    // Sort SVs by start position to improve efficiency
-    std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-        return a.start < b.start;
-    });
-
-    for (size_t i = 0; i < sv_calls.size(); i++) {
-        if (merged[i]) continue;
-
-        std::vector<SVCall> cluster;
-        cluster.push_back(sv_calls[i]);
-        merged[i] = true;
-
-        // Use 10% of the length of the first SV as the threshold
-        uint32_t sv_a_window = (uint32_t) std::ceil((double) (sv_calls[i].end - sv_calls[i].start + 1) * 0.1);
-
-        // Find SVs that have start or end positions within 10% of each other's length
-        for (size_t j = i + 1; j < sv_calls.size(); j++) {
-            if (merged[j]) continue;
-
-            // Check if the SVs are within 10% of the largest SV's length
-            uint32_t sv_b_window = (uint32_t) std::ceil((double) (sv_calls[j].end - sv_calls[j].start + 1) * 0.1);
-            uint32_t sv_window = std::max(sv_a_window, sv_b_window);
-            bool start_within_window = std::abs((int) sv_calls[j].start - (int) sv_calls[i].start) <= (int) sv_window;
-            bool end_within_window = std::abs((int) sv_calls[j].end - (int) sv_calls[i].end) <= (int) sv_window;
-            if (start_within_window && end_within_window) {
-                cluster.push_back(sv_calls[j]);
-                merged[j] = true;
-            }
-        }
+    // Create a set of size intervals and corresponding DBSCAN epsilons
+    std::map<std::pair<int, int>, double> size_to_eps;
+    size_to_eps[{0, 1000}] = 200;
+    size_to_eps[{1000, 5000}] = 500;
+    size_to_eps[{5000, 10000}] = 3000;
+    size_to_eps[{10000, 50000}] = 4000;
+    size_to_eps[{50000, 100000}] = 5000;
+    size_to_eps[{100000, 500000}] = 10000;
+    size_to_eps[{500000, 1000000}] = 20000;
 
-        // Remove clusters with single SVs that have low support
-        if (cluster.size() < 2 && cluster[0].support < 2) {
-            continue;
-        }
+    for (auto& size_interval : size_to_eps) {
 
-        std::vector<SVCall> filtered_cluster = cluster;
+        // Calculate epsilon as 20% of the size interval
+        double epsilon = 0.2 * (size_interval.first.second - size_interval.first.first);
+        printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon));
 
-        // If any SV length equals 2039, print all the SV calls in the cluster
-        bool found_2039 = false;
-        for (const auto& sv : filtered_cluster) {
-            if (sv.end - sv.start == 2039) {
-                printMessage("[TEST] Found SV with length 2039 at " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ")");
-                found_2039 = true;
-            }
-        }
-        if (found_2039) {
-            std::cout << "[TEST] Cluster of SVs with size " << filtered_cluster.size() << ":" << std::endl;
-            for (const auto& sv : filtered_cluster) {
-                printMessage("SV: " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ", LEN=" + std::to_string(sv.end - sv.start) + ")");
+        DBSCAN dbscan(size_interval.second, 2);
+        
+        for ( const auto& sv_type : {
+            SVType::DEL,
+            SVType::DUP,
+            SVType::INV,
+            SVType::INS,
+            SVType::BND,
+            SVType::INV_DUP
+        })
+        {
+            // DBSCAN dbscan(1000, 2);
+
+            // Create a vector of SV calls for the current SV type and size interval
+            std::vector<SVCall> sv_type_calls;
+            
+            // If the final size interval, then don't set an upper bound
+            int lower_bound = size_interval.first.first;
+            int upper_bound = size_interval.first.second;
+            if (lower_bound == 500000)
+            {
+                std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound](const SVCall& sv_call) {
+                    return sv_call.sv_type == sv_type && static_cast<int>(sv_call.end - sv_call.start) >= lower_bound;
+                });
+                // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
+                //     return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound;
+                // });
+            } else {
+                std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound, upper_bound](const SVCall& sv_call) {
+                    return sv_call.sv_type == sv_type && static_cast<int>(sv_call.end - sv_call.start) >= lower_bound && static_cast<int>(sv_call.end - sv_call.start) <= upper_bound;
+                });
+                // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, &size_interval](const SVCall& sv_call) {
+                //     return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound && (sv_call.end - sv_call.start) <= upper_bound;
+                // });
             }
-        }
+            // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
+            //     return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= size_interval.first && (sv_call.end - sv_call.start) <= size_interval.second;
+            // });
 
-        // Find the median-length SV in the cluster and use it as the merged SV
-        // Sort the cluster by length
-        std::sort(filtered_cluster.begin(), filtered_cluster.end(), [](const SVCall& a, const SVCall& b) {
-            return (a.end - a.start) < (b.end - b.start);
-        });
-
-        // Get the median SV
-        size_t median_index = filtered_cluster.size() / 2;
-        SVCall median_sv = filtered_cluster[median_index];
+            if (sv_type_calls.size() < 2) {
+                continue;
+            }
 
-        median_sv.cluster_size = (int) cluster.size();
-        
-        // Add the merged SV to the list
-        merged_sv_calls.push_back(median_sv);
-    }
-    sv_calls = std::move(merged_sv_calls); // Replace with filtered list
+            dbscan.fit(sv_type_calls);
+            const std::vector<int>& clusters = dbscan.getClusters();
+            std::map<int, std::vector<SVCall>> cluster_map;
+            for (size_t i = 0; i < clusters.size(); ++i) {
+                cluster_map[clusters[i]].push_back(sv_type_calls[i]);
+            }
 
-    // Print SVs that have length 2039
-    for (const auto& sv_call : sv_calls) {
-        if (sv_call.end - sv_call.start == 2039) {
-            printMessage("[TEST] Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")");
+            // Merge SVs in each cluster
+            for (auto& cluster : cluster_map) {
+                int cluster_id = cluster.first;
+                // const std::vector<SVCall>& cluster_sv_calls = cluster.second;
+                std::vector<SVCall>& cluster_sv_calls = cluster.second;
+                if (cluster_id < 0) {
+                    continue;  // Skip noise and unclassified points
+                } else {
+                    // Use the median length SV
+                    // Sort the SVs in the cluster by their length
+                    std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                        return (a.end - a.start) < (b.end - b.start);
+                    });
+                    int median_index = cluster_sv_calls.size() / 2;
+                    SVCall median_sv_call = cluster_sv_calls[median_index];
+                    median_sv_call.cluster_size = (int) cluster_sv_calls.size();
+                    merged_sv_calls.push_back(median_sv_call);
+                }
+            }
+            printMessage("Completed clustering for " + getSVTypeString(sv_type));
         }
     }
+    sv_calls = std::move(merged_sv_calls); // Replace with filtered list
 
     int updated_size = sv_calls.size();
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");

From 4793bbfc694a779150f10814de622d8a63ea25bd Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sun, 2 Feb 2025 20:27:54 -0500
Subject: [PATCH 068/134] update dbscan

---
 include/sv_object.h |   2 +
 src/dbscan.cpp      |  17 +++-
 src/sv_caller.cpp   | 110 ++++--------------------
 src/sv_object.cpp   | 199 ++++++++++++++++++++++----------------------
 4 files changed, 132 insertions(+), 196 deletions(-)

diff --git a/include/sv_object.h b/include/sv_object.h
index cf52cac6..da544571 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -47,4 +47,6 @@ uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 
 void concatenateSVCalls(std::vector<SVCall>& sv_calls, const std::vector<SVCall>& sv_calls_update);
 
+void mergeSVs(std::vector<SVCall> &sv_calls, double epsilon, int min_pts);
+
 #endif // SV_OBJECT_H
diff --git a/src/dbscan.cpp b/src/dbscan.cpp
index d7cc3aeb..d5310292 100644
--- a/src/dbscan.cpp
+++ b/src/dbscan.cpp
@@ -74,6 +74,19 @@ std::vector<size_t> DBSCAN::regionQuery(const std::vector<SVCall>& sv_calls, siz
 double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const {
     // return std::sqrt(std::pow(point1.first - point2.first, 2) +
     // std::pow(point1.second - point2.second, 2));
-    return std::sqrt(std::pow(static_cast<double>(point1.start) - static_cast<double>(point2.start), 2) +
-                     std::pow(static_cast<double>(point1.end) - static_cast<double>(point2.end), 2));
+    // Minimum reciprocal-overlap distance between the spans (length = end - start) of
+    // two SV calls: 0.0 means identical spans, 1.0 means no overlap.
+    // https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02840-6
+    // https://link.springer.com/article/10.1186/gb-2009-10-10-r119
+    const double start1 = static_cast<double>(point1.start);
+    const double end1 = static_cast<double>(point1.end);
+    const double start2 = static_cast<double>(point2.start);
+    const double end2 = static_cast<double>(point2.end);
+    const double longest = std::max(end1 - start1, end2 - start2);
+    if (std::min(end1 - start1, end2 - start2) <= 0.0) {
+        // Empty span: overlap / length would be 0/0 (NaN), and NaN never satisfies
+        // `<= epsilon`. Treat only exact duplicates as identical.
+        return (start1 == start2 && end1 == end2) ? 0.0 : 1.0;
+    }
+    return 1.0 - std::max(0.0, std::min(end1, end2) - std::max(start1, start2)) / longest;
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index d7f79d01..6c137c8c 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -391,7 +391,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     printMessage(chr + ": Merging CIGAR...");
     // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
-    mergeSVs(chr_sv_calls);
+    mergeSVs(chr_sv_calls, 0.8, 5);
     // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
@@ -417,11 +417,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // printMessage(chr + ": Merging split reads...");
     // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold);
     // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT");
-    // mergeSVs(chr_sv_calls);
 
     // Run a final merge on the combined SV calls
     // printMessage(chr + ": Merging final calls...");
-    // mergeSVs(chr_sv_calls);
     printMessage("Completed chromosome " + chr);
 }
 
@@ -528,9 +526,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
     // Find split-read SV evidence
     // printMessage(region + ": Finding split-read SVs...");
+    std::vector<SVCall> split_sv_calls;
     int current_primary = 0;
     int primary_count = primary_map.size();
-    //int primary_count = primary_map.size();
     uint32_t min_cnv_length = input_data.getMinCNVLength();
     for (auto& entry : primary_map) {
         current_primary++;
@@ -557,7 +555,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
             }
-            addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual);
+            addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual);
 
             // Create the alternate allele format for the second BND record
             alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
@@ -565,7 +563,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
             }
-            addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual);
+            addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual);
 
             continue;
         }
@@ -585,9 +583,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                     continue;
                 }
 
-                // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
-                // std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
-                // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start));
                 std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data);
                 if (std::get<1>(result) == SVType::UNKNOWN) {
                     continue;
@@ -601,84 +596,17 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (supp_type == SVType::NEUTRAL) {
                     // addSVCall(sv_calls, supp_start, supp_end, "INV",
                     // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
+                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
                     continue;
                     
                 } else if (supp_type == SVType::DUP) {
                     // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
                     // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
+                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
                     continue;
                 }
             }
         }
-        
-        // GenomicRegion largest_supp_region = supp_map[qname][0];
-        // uint32_t largest_supp_length = 0;
-
-        // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
-        // const std::string& primary_chr = bamHdr->target_name[primary.tid];
-        // // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();
-        // // ++it) {
-        // for (auto& supp_region : supp_regions) {
-        //     // GenomicRegion& supp_region = *it;
-
-        //     // Skip if not on the primary chromosome
-        //     if (primary.tid != supp_region.tid) {
-        //         continue;
-        //     }
-
-        //     // Get the supplementary alignment information
-        //     // uint32_t supp_start = (uint32_t) supp_region.start;
-        //     // uint32_t supp_end = (uint32_t) supp_region.end;
-        //     // uint32_t supp_length = supp_end - supp_start + 1;
-        //     // if (supp_length > largest_supp_length) {
-        //     //     largest_supp_length = supp_length;
-        //     //     largest_supp_region = *it;
-        //     // }
-
-        //     // Inversion detection
-        //     bool is_opposite_strand = primary.strand != supp_region.strand;
-        //     if (is_opposite_strand) {
-        //         // if (supp_length >= min_cnv_length) {
-        //         if (supp_region.end - supp_region.start >= min_cnv_length) {
-
-        //             // Print error if the start position is greater than the end
-        //             // position
-        //             // if (supp_start > supp_end) {
-        //             if (supp_region.start > supp_region.end) {
-        //                 printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end));
-        //                 // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
-        //                 continue;
-        //             }
-
-        //             // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")...");
-        //             // std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data);
-        //             // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start));
-        //             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data);
-        //             if (std::get<1>(result) == SVType::UNKNOWN) {
-        //                 continue;
-        //             }
-
-        //             double supp_lh = std::get<0>(result);
-        //             SVType supp_type = std::get<1>(result);
-        //             // printMessage("Test3");
-        //             int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end);
-        //             // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
-        //             if (supp_type == SVType::NEUTRAL) {
-        //                 // addSVCall(sv_calls, supp_start, supp_end, "INV",
-        //                 // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-        //                 addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                        
-        //                 sv_count++;
-        //             } else if (supp_type == SVType::DUP) {
-        //                 // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
-        //                 // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-        //                 addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-        //             }
-        //         }
-        //     }
-        // }
 
         // Analyze split-read evidence for deletions and duplications
         uint8_t mean_qual = (primary.qual + largest_supp.qual) / 2;
@@ -689,19 +617,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         gap_left = std::min(primary.end, largest_supp.start);
         gap_right = std::max(primary.start, largest_supp.end);
         gap_exists = gap_left < gap_right;
-        // if (primary.start < largest_supp_region.start) {  // Primary before supp
-        //     boundary_left = primary.start;
-        //     boundary_right = std::max(primary.end, largest_supp_region.end);
-        //     gap_left = primary.end;
-        //     gap_right = largest_supp_region.start;
-        //     gap_exists = gap_left < gap_right;
-        // } else {
-        //     boundary_left = largest_supp_region.start;
-        //     boundary_right = std::max(primary.end, largest_supp_region.end);
-        //     gap_left = largest_supp_region.end;
-        //     gap_right = primary.start;
-        //     gap_exists = gap_left < gap_right;
-        // }
         
         // Run copy number variant predictions on the boundary if large enough
         if (boundary_right - boundary_left >= min_cnv_length) {
@@ -747,18 +662,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-                    addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual);
+                    addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual);
                 } else {
                     // Add the boundary as the SV call
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                    addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
+                    addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
                 }
             } else {
                 // Add the boundary as the SV call
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
+                addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
             }
         }
 
@@ -767,6 +682,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
         }
     }
+
+    // Merge the split-read SV calls
+    printMessage(region + ": Merging split-read SVs...");
+    mergeSVs(split_sv_calls, 0.1, 2);
+
+    // Unify the SV calls
+    sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 0d4cec97..bf4e44a5 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -34,25 +34,26 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVTy
     // Insert the SV call in sorted order
     SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
-
-    // Determine if the SV call already exists
-    if (it != sv_calls.end() && it->start == start && it->end == end)
-    {
-        it->support += 1;  // Update the read support
-
-        // Update SV type if likelihood is higher
-        if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
-        {
-            // Update the SV call
-            it->sv_type = sv_type;
-            it->data_type = data_type;
-            it->genotype = genotype;
-            it->hmm_likelihood = hmm_likelihood;
-            it->qual = qual;
-        }
-    } else {
-        sv_calls.insert(it, sv_call);  // Insert the new SV call
-    }
+    sv_calls.insert(it, sv_call);
+
+    // // Determine if the SV call already exists
+    // if (it != sv_calls.end() && it->start == start && it->end == end)
+    // {
+    //     it->support += 1;  // Update the read support
+
+    //     // Update SV type if likelihood is higher
+    //     if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
+    //     {
+    //         // Update the SV call
+    //         it->sv_type = sv_type;
+    //         it->data_type = data_type;
+    //         it->genotype = genotype;
+    //         it->hmm_likelihood = hmm_likelihood;
+    //         it->qual = qual;
+    //     }
+    // } else {
+    //     sv_calls.insert(it, sv_call);  // Insert the new SV call
+    // }
 }
 
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls)
@@ -65,7 +66,7 @@ void concatenateSVCalls(std::vector<SVCall> &target, const std::vector<SVCall>&
     target.insert(target.end(), source.begin(), source.end());
 }
 
-void mergeSVs(std::vector<SVCall>& sv_calls)
+void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
 {
     if (sv_calls.size() < 2) {
         return;
@@ -77,91 +78,89 @@ void mergeSVs(std::vector<SVCall>& sv_calls)
 
     // Create a set of size intervals and corresponding DBSCAN epsilons
     std::map<std::pair<int, int>, double> size_to_eps;
-    size_to_eps[{0, 1000}] = 200;
-    size_to_eps[{1000, 5000}] = 500;
-    size_to_eps[{5000, 10000}] = 3000;
-    size_to_eps[{10000, 50000}] = 4000;
-    size_to_eps[{50000, 100000}] = 5000;
-    size_to_eps[{100000, 500000}] = 10000;
-    size_to_eps[{500000, 1000000}] = 20000;
-
-    for (auto& size_interval : size_to_eps) {
-
-        // Calculate epsilon as 20% of the size interval
-        double epsilon = 0.2 * (size_interval.first.second - size_interval.first.first);
-        printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon));
-
-        DBSCAN dbscan(size_interval.second, 2);
-        
-        for ( const auto& sv_type : {
-            SVType::DEL,
-            SVType::DUP,
-            SVType::INV,
-            SVType::INS,
-            SVType::BND,
-            SVType::INV_DUP
-        })
-        {
-            // DBSCAN dbscan(1000, 2);
-
-            // Create a vector of SV calls for the current SV type and size interval
-            std::vector<SVCall> sv_type_calls;
-            
-            // If the final size interval, then don't set an upper bound
-            int lower_bound = size_interval.first.first;
-            int upper_bound = size_interval.first.second;
-            if (lower_bound == 500000)
-            {
-                std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound](const SVCall& sv_call) {
-                    return sv_call.sv_type == sv_type && static_cast<int>(sv_call.end - sv_call.start) >= lower_bound;
-                });
-                // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
-                //     return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound;
-                // });
-            } else {
-                std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound, upper_bound](const SVCall& sv_call) {
-                    return sv_call.sv_type == sv_type && static_cast<int>(sv_call.end - sv_call.start) >= lower_bound && static_cast<int>(sv_call.end - sv_call.start) <= upper_bound;
-                });
-                // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, &size_interval](const SVCall& sv_call) {
-                //     return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound && (sv_call.end - sv_call.start) <= upper_bound;
-                // });
-            }
-            // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
-            //     return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= size_interval.first && (sv_call.end - sv_call.start) <= size_interval.second;
-            // });
+    // size_to_eps[{0, 1000}] = 200;
+    // size_to_eps[{1000, 5000}] = 500;
+    // size_to_eps[{5000, 10000}] = 3000;
+    // size_to_eps[{10000, 50000}] = 4000;
+    // size_to_eps[{50000, 100000}] = 5000;
+    // size_to_eps[{100000, 500000}] = 10000;
+    // size_to_eps[{500000, 1000000}] = 20000;
+
+    // Small SVs
+    // size_to_eps[{50, 200}] = 50;
+
+    // // Medium SVs
+    // size_to_eps[{200, 1000}] = 200;
+
+    // // Large SVs
+    // size_to_eps[{1000, 10000}] = 1000;
+
+    // // Very large SVs
+    // size_to_eps[{10000, 100000}] = 10000;
+
+    // // Extreme SVs
+    // size_to_eps[{100000, 1000000}] = 20000;
+
+    // std::vector<double> epsilons = {50, 200, 1000, 10000, 100000, 500000};
+    // std::vector<double> epsilons = {0.2};
+
+    // double epsilon = size_interval.second;
+    // // Calculate epsilon as 20% of the largest size in the interval
+    // double epsilon = 0.1 * size_interval.first.second;
+    // printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon));
+    // int min_pts = 2;
+    // int min_pts = 2;
+    DBSCAN dbscan(epsilon, min_pts);
+    // DBSCAN dbscan(size_interval.second, 2);
+    
+    for ( const auto& sv_type : {
+        SVType::DEL,
+        SVType::DUP,
+        SVType::INV,
+        SVType::INS,
+        SVType::BND,
+        SVType::INV_DUP
+    })
+    {
+        // DBSCAN dbscan(1000, 2);
 
-            if (sv_type_calls.size() < 2) {
-                continue;
-            }
+        // Create a vector of SV calls for the current SV type and size interval
+        std::vector<SVCall> sv_type_calls;
+        std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
+            return sv_call.sv_type == sv_type;
+        });
 
-            dbscan.fit(sv_type_calls);
-            const std::vector<int>& clusters = dbscan.getClusters();
-            std::map<int, std::vector<SVCall>> cluster_map;
-            for (size_t i = 0; i < clusters.size(); ++i) {
-                cluster_map[clusters[i]].push_back(sv_type_calls[i]);
-            }
+        if (sv_type_calls.size() < 2) {
+            continue;
+        }
 
-            // Merge SVs in each cluster
-            for (auto& cluster : cluster_map) {
-                int cluster_id = cluster.first;
-                // const std::vector<SVCall>& cluster_sv_calls = cluster.second;
-                std::vector<SVCall>& cluster_sv_calls = cluster.second;
-                if (cluster_id < 0) {
-                    continue;  // Skip noise and unclassified points
-                } else {
-                    // Use the median length SV
-                    // Sort the SVs in the cluster by their length
-                    std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                        return (a.end - a.start) < (b.end - b.start);
-                    });
-                    int median_index = cluster_sv_calls.size() / 2;
-                    SVCall median_sv_call = cluster_sv_calls[median_index];
-                    median_sv_call.cluster_size = (int) cluster_sv_calls.size();
-                    merged_sv_calls.push_back(median_sv_call);
-                }
+        dbscan.fit(sv_type_calls);
+        const std::vector<int>& clusters = dbscan.getClusters();
+        std::map<int, std::vector<SVCall>> cluster_map;
+        for (size_t i = 0; i < clusters.size(); ++i) {
+            cluster_map[clusters[i]].push_back(sv_type_calls[i]);
+        }
+
+        // Merge SVs in each cluster
+        int cluster_count = 0;
+        for (auto& cluster : cluster_map) {
+            int cluster_id = cluster.first;
+            std::vector<SVCall>& cluster_sv_calls = cluster.second;
+            if (cluster_id < 0) {
+                continue;  // Skip noise and unclassified points
+            } else {
+                // Use the median length SV
+                std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                    return (a.end - a.start) < (b.end - b.start);
+                });
+                int median_index = cluster_sv_calls.size() / 2;
+                SVCall median_sv_call = cluster_sv_calls[median_index];
+                median_sv_call.cluster_size = (int) cluster_sv_calls.size();
+                merged_sv_calls.push_back(median_sv_call);
+                cluster_count++;
             }
-            printMessage("Completed clustering for " + getSVTypeString(sv_type));
         }
+        printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type));
     }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
 

From 0c329a3aa8946eeff00356f7149da169fa96c881 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 3 Feb 2025 16:51:42 -0500
Subject: [PATCH 069/134] add dbscan parameters

---
 include/input_data.h | 10 ++++++++++
 src/input_data.cpp   | 22 ++++++++++++++++++++++
 src/main.cpp         | 15 +++++++++++++++
 src/sv_caller.cpp    |  7 +++++--
 src/sv_object.cpp    | 37 +------------------------------------
 5 files changed, 53 insertions(+), 38 deletions(-)

diff --git a/include/input_data.h b/include/input_data.h
index 3960d362..d88426f3 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -65,6 +65,14 @@ class InputData {
         void setMinReadSupport(int min_reads);
         int getMinReadSupport() const;
 
+        // Set the epsilon parameter for DBSCAN clustering.
+        void setDBSCAN_Epsilon(double epsilon);
+        double getDBSCAN_Epsilon() const;
+
+        // Set the minimum number of points in a cluster for DBSCAN.
+        void setDBSCAN_MinPts(int min_pts);
+        int getDBSCAN_MinPts() const;
+
         // Set the chromosome to analyze.
         void setChromosome(std::string chr);
         std::string getChromosome() const;
@@ -103,6 +111,8 @@ class InputData {
         int sample_size;
         uint32_t min_cnv_length;
         int min_reads;
+        double dbscan_epsilon;
+        int dbscan_min_pts;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
         bool region_set;  // True if a region is set
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 40a640a2..649e8b1c 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -27,6 +27,8 @@ InputData::InputData()
     this->sample_size = 100;
     this->min_cnv_length = 1000;
     this->min_reads = 5;
+    this->dbscan_epsilon = 0.5;
+    this->dbscan_min_pts = 5;
     this->thread_count = 1;
     this->hmm_filepath = "data/wgs.hmm";
     this->verbose = false;
@@ -170,6 +172,26 @@ int InputData::getMinReadSupport() const
     return this->min_reads;
 }
 
+void InputData::setDBSCAN_Epsilon(double epsilon)
+{
+    this->dbscan_epsilon = epsilon;
+}
+
+double InputData::getDBSCAN_Epsilon() const
+{
+    return this->dbscan_epsilon;
+}
+
+void InputData::setDBSCAN_MinPts(int min_pts)
+{
+    this->dbscan_min_pts = min_pts;
+}
+
+int InputData::getDBSCAN_MinPts() const
+{
+    return this->dbscan_min_pts;
+}
+
 void InputData::setChromosome(std::string chr)
 {
     this->chr = chr;
diff --git a/src/main.cpp b/src/main.cpp
index 58d8fbdc..5275f368 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -62,6 +62,15 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
         input_data.setVerbose(true);
     }
 
+    // DBSCAN parameters
+    if (args.find("epsilon") != args.end()) {
+        input_data.setDBSCAN_Epsilon(std::stod(args.at("epsilon")));
+    }
+
+    if (args.find("min-pts") != args.end()) {
+        input_data.setDBSCAN_MinPts(std::stoi(args.at("min-pts")));
+    }
+
     // Run ContextSV
     run(input_data);
 }
@@ -80,6 +89,8 @@ void printUsage(const std::string& programName) {
                 << "  -n, --sample-size <size>      Sample size for HMM predictions\n"
                 << "     --min-cnv <min_length>     Minimum CNV length\n"
                 << "     --min-reads <min_reads>    Minimum read support\n"
+                << "     --eps <epsilon>             DBSCAN epsilon\n"
+                << "     --min-pts <min_pts>         DBSCAN minimum points\n"
                 << "  -e, --eth <eth_file>          ETH file\n"
                 << "  -p, --pfb <pfb_file>          PFB file\n"
                 << "     --save-cnv                 Save CNV data\n"
@@ -116,6 +127,10 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
             args["min-cnv"] = argv[++i];
         } else if (arg == "--min-reads" && i + 1 < argc) {
             args["min-reads"] = argv[++i];
+        } else if (arg == "--eps" && i + 1 < argc) {
+            args["epsilon"] = argv[++i];
+        } else if (arg == "--min-pts" && i + 1 < argc) {
+            args["min-pts"] = argv[++i];
         } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) {
             args["eth"] = argv[++i];
         } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 6c137c8c..1d8115f5 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -331,7 +331,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // int split_sv_support_threshold = 4;  // Minimum number of supporting
     // reads for an SV call
     int split_sv_support_threshold = input_data.getMinReadSupport();
-    // printMessage("Processing chromosome " + chr + " with filter threshold: " + std::to_string(filter_threshold));
+    // printMessage("Processing chromosome " + chr + " with filter threshold: "
+    // + std::to_string(filter_threshold));
+    double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
+    int dbscan_min_pts = input_data.getDBSCAN_MinPts();
 
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -391,7 +394,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     printMessage(chr + ": Merging CIGAR...");
     // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
-    mergeSVs(chr_sv_calls, 0.8, 5);
+    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
     // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index bf4e44a5..0dd23e79 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -77,42 +77,9 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     std::vector<SVCall> merged_sv_calls;
 
     // Create a set of size intervals and corresponding DBSCAN epsilons
+    printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
     std::map<std::pair<int, int>, double> size_to_eps;
-    // size_to_eps[{0, 1000}] = 200;
-    // size_to_eps[{1000, 5000}] = 500;
-    // size_to_eps[{5000, 10000}] = 3000;
-    // size_to_eps[{10000, 50000}] = 4000;
-    // size_to_eps[{50000, 100000}] = 5000;
-    // size_to_eps[{100000, 500000}] = 10000;
-    // size_to_eps[{500000, 1000000}] = 20000;
-
-    // Small SVs
-    // size_to_eps[{50, 200}] = 50;
-
-    // // Medium SVs
-    // size_to_eps[{200, 1000}] = 200;
-
-    // // Large SVs
-    // size_to_eps[{1000, 10000}] = 1000;
-
-    // // Very large SVs
-    // size_to_eps[{10000, 100000}] = 10000;
-
-    // // Extreme SVs
-    // size_to_eps[{100000, 1000000}] = 20000;
-
-    // std::vector<double> epsilons = {50, 200, 1000, 10000, 100000, 500000};
-    // std::vector<double> epsilons = {0.2};
-
-    // double epsilon = size_interval.second;
-    // // Calculate epsilon as 20% of the largest size in the interval
-    // double epsilon = 0.1 * size_interval.first.second;
-    // printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon));
-    // int min_pts = 2;
-    // int min_pts = 2;
     DBSCAN dbscan(epsilon, min_pts);
-    // DBSCAN dbscan(size_interval.second, 2);
-    
     for ( const auto& sv_type : {
         SVType::DEL,
         SVType::DUP,
@@ -122,8 +89,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         SVType::INV_DUP
     })
     {
-        // DBSCAN dbscan(1000, 2);
-
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
         std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {

From 686fe9965fc1fd643fea46a4a39c17567f1ba437 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 6 Feb 2025 20:13:01 -0500
Subject: [PATCH 070/134] cluster primary alignments

---
 .gitignore                   |   1 +
 include/dbscan.h             |   4 +
 include/sv_caller.h          |   2 -
 python/plot_distributions.py |  14 ++-
 src/sv_caller.cpp            | 206 +++++++++++++++++++----------------
 src/sv_object.cpp            |  15 ++-
 6 files changed, 139 insertions(+), 103 deletions(-)

diff --git a/.gitignore b/.gitignore
index 343adf0f..9f6f43d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -87,6 +87,7 @@ data/hg19ToHg38.over.chain.gz
 # Test images
 python/dbscan_clustering*.png
 python/dist_plots
+upset_plot*.png
 
 # Temporary files
 lib/.nfs*
diff --git a/include/dbscan.h b/include/dbscan.h
index 923570c5..9826a144 100644
--- a/include/dbscan.h
+++ b/include/dbscan.h
@@ -12,6 +12,7 @@ class DBSCAN {
     public:
         DBSCAN(double epsilon, int minPts) : epsilon(epsilon), minPts(minPts) {}
 
+        // Fit the DBSCAN algorithm to SV calls
         void fit(const std::vector<SVCall>& sv_calls);
 
         const std::vector<int>& getClusters() const;
@@ -21,10 +22,13 @@ class DBSCAN {
         int minPts;
         std::vector<int> clusters;
 
+        // Expand the cluster for a given SV call
         bool expandCluster(const std::vector<SVCall>& sv_calls, size_t pointIdx, int clusterId);
 
+        // Find the region query for a given SV call
         std::vector<size_t> regionQuery(const std::vector<SVCall>& sv_calls, size_t pointIdx) const;
 
+        // Calculate the distance between two SV calls
         double distance(const SVCall& a, const SVCall& b) const;
 };
 
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 1824015c..5eb96e17 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -61,8 +61,6 @@ class SVCaller {
 
         void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const;
 
-        void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches);
-
         // Calculate the read depth (INFO/DP) for a region
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
 
diff --git a/python/plot_distributions.py b/python/plot_distributions.py
index 7db7cb2a..8766a157 100644
--- a/python/plot_distributions.py
+++ b/python/plot_distributions.py
@@ -192,13 +192,21 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
     # Add the bin edges to the x-axis ticks as a range
     fig.update_xaxes(tickvals=x_values, ticktext=bin_labels)
 
-    # Move the legend to the top right inside the plot
+    # # Move the legend to the top right inside the plot
+    # fig.update_layout(legend=dict(
+    #     orientation='v',
+    #     yanchor='top',
+    #     y=0.75,
+    #     xanchor='right',
+    #     x=0.75,
+    # ))
+    # Move the legend to the bottom right outside the plot
     fig.update_layout(legend=dict(
         orientation='v',
         yanchor='top',
-        y=0.75,
+        y=1.0,
         xanchor='right',
-        x=0.75,
+        x=1.15,
     ))
 
     # Set a larger font size for all text in the plot
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 1d8115f5..e6ff793a 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -27,6 +27,7 @@
 #include "sv_types.h"
 #include "version.h"
 #include "fasta_query.h"
+#include "dbscan.h"
 /// @endcond
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
@@ -107,45 +108,106 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     bam_destroy1(bam1);
     printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
 
-    // Filter overlapping primary alignments and keep the one with the highest mapping
-    // quality
-    // std::vector<std::string> to_remove_overlapping;
-    std::unordered_set<std::string> to_remove_overlapping;
-    for (const auto& entry1 : primary_map) {
-        const std::string& qname1 = entry1.first;
-        const GenomicRegion& primary1 = entry1.second;
-        for (const auto& entry2 : primary_map) {
-            const std::string& qname2 = entry2.first;
-            if (qname1 == qname2) {
-                continue;
-            }
-            const GenomicRegion& primary2 = entry2.second;
-            if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) {
-                // Overlapping primary alignments
-                // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2]));
-                // if (primary_map_qual[qname1] < primary_map_qual[qname2]) {
-                if (primary1.qual < primary2.qual) {
-                    // to_remove_overlapping.push_back(qname1);
-                    to_remove_overlapping.insert(qname1);
-                } else {
-                    // If equal, remove the shorter alignment
-                    if (primary1.end - primary1.start < primary2.end - primary2.start) {
-                        // to_remove_overlapping.push_back(qname1);
-                        to_remove_overlapping.insert(qname1);
-                    } else {
-                        // to_remove_overlapping.push_back(qname2);
-                        to_remove_overlapping.insert(qname2);
-                    }
-                }
+    // Create a set of dummy SVs from the primary alignments for each chromosome
+    // and run DBSCAN to merge them
+    std::unordered_map<std::string, std::vector<SVCall>> dummy_sv_map;
+    std::unordered_map<std::string, std::vector<std::string>> dummy_sv_qnames;
+    for (const auto& entry : primary_map) {
+        const std::string& chrom = bamHdr->target_name[entry.second.tid];
+        uint32_t start = entry.second.start;
+        uint32_t end = entry.second.end;
+        const std::string& qname = entry.first;
+        SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0, 0);
+        dummy_sv_map[chrom].emplace_back(sv_call);
+        dummy_sv_qnames[chrom].emplace_back(entry.first);
+    }
+
+    // Run DBSCAN to merge the dummy SVs
+    double epsilon = 0.65;
+    int min_pts = 2;
+    std::unordered_set<std::string> qnames_to_keep;
+    for (const auto& entry : dummy_sv_map) {
+        const std::string& chrom = entry.first;
+        const std::vector<SVCall>& sv_calls = entry.second;
+        DBSCAN dbscan(epsilon, min_pts);
+        dbscan.fit(sv_calls);
+        const std::vector<int>& clusters = dbscan.getClusters();
+        std::map<int, std::vector<SVCall>> cluster_map;
+        for (size_t i = 0; i < clusters.size(); ++i) {
+            cluster_map[clusters[i]].push_back(sv_calls[i]);
+        }
+
+        // Merge the SVs in each cluster, using the median of the start and end
+        // positions of the SVs in each cluster
+        for (auto& cluster : cluster_map) {
+            int cluster_id = cluster.first;
+            std::vector<SVCall>& cluster_sv_calls = cluster.second;
+            if (cluster_id < 0) {
+                continue;  // Skip noise and unclassified points
             }
+            
+            // Use the median length SV as the representative SV
+            std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                return (a.end - a.start) < (b.end - b.start);
+            });
+            SVCall median_sv = cluster_sv_calls[cluster_sv_calls.size() / 2];
+            const std::string& qname = median_sv.data_type;
+            qnames_to_keep.insert(qname);
         }
     }
 
-    for (const std::string& qname : to_remove_overlapping) {
+    // Remove the SVs that are not in the qnames_to_keep set
+    std::unordered_set<std::string> qnames_to_remove;
+    for (const auto& entry : primary_map) {
+        const std::string& qname = entry.first;
+        if (qnames_to_keep.find(qname) == qnames_to_keep.end()) {
+            qnames_to_remove.insert(qname);
+        }
+    }
+
+    for (const std::string& qname : qnames_to_remove) {
         primary_map.erase(qname);
         supp_map.erase(qname);
     }
-    printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments");
+
+    // Filter overlapping primary alignments and keep the one with the highest mapping
+    // quality
+
+    // for (const auto& entry1 : primary_map) {
+    //     const std::string& qname1 = entry1.first;
+    //     const GenomicRegion& primary1 = entry1.second;
+    //     for (const auto& entry2 : primary_map) {
+    //         const std::string& qname2 = entry2.first;
+    //         if (qname1 == qname2) {
+    //             continue;
+    //         }
+    //         const GenomicRegion& primary2 = entry2.second;
+    //         if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) {
+    //             // Overlapping primary alignments
+    //             // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2]));
+    //             // if (primary_map_qual[qname1] < primary_map_qual[qname2]) {
+    //             if (primary1.qual < primary2.qual) {
+    //                 // to_remove_overlapping.push_back(qname1);
+    //                 to_remove_overlapping.insert(qname1);
+    //             } else {
+    //                 // If equal, remove the shorter alignment
+    //                 if (primary1.end - primary1.start < primary2.end - primary2.start) {
+    //                     // to_remove_overlapping.push_back(qname1);
+    //                     to_remove_overlapping.insert(qname1);
+    //                 } else {
+    //                     // to_remove_overlapping.push_back(qname2);
+    //                     to_remove_overlapping.insert(qname2);
+    //                 }
+    //             }
+    //         }
+    //     }
+    // }
+
+    // for (const std::string& qname : to_remove_overlapping) {
+    //     primary_map.erase(qname);
+    //     supp_map.erase(qname);
+    // }
+    // printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments");
     printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supp_map.size()) + " supplementary alignments after filtering");
 }
 
@@ -230,6 +292,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     std::bitset<256> amb_bases_bitset;
     for (char base : amb_bases) {
         amb_bases_bitset.set(base);
+        amb_bases_bitset.set(std::tolower(base));
     }
     for (int i = 0; i < cigar_len; i++) {
         int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
@@ -686,9 +749,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         }
     }
 
-    // Merge the split-read SV calls
-    printMessage(region + ": Merging split-read SVs...");
-    mergeSVs(split_sv_calls, 0.1, 2);
+    // // Merge the split-read SV calls
+    // printMessage(region + ": Merging split-read SVs...");
+    // mergeSVs(split_sv_calls, 0.1, 2);
 
     // Unify the SV calls
     sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
@@ -855,6 +918,19 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 printMessage("REF allele for DUP at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + ": " + ref_allele + ", ALT allele: " + alt_allele);
             }
 
+            // Fix ambiguous bases in the reference allele
+            const std::string amb_bases = "RYKMSWBDHV";  // Ambiguous bases
+            std::bitset<256> amb_bases_bitset;
+            for (char base : amb_bases) {
+                amb_bases_bitset.set(base);
+                amb_bases_bitset.set(std::tolower(base));
+            }
+            for (char& base : ref_allele) {
+                if (amb_bases_bitset.test(base)) {
+                    base = 'N';
+                }
+            }
+
             // Create the VCF parameter strings
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
@@ -887,66 +963,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
 }
 
-void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches)
-{
-
-    // Check for overlapping read alignments
-    if (primary_mismatches.query_start < supp_mismatches.query_start) {
-        // Primary before supplementary in the query
-
-        // if (primary_query_end >= supp_query_start) {
-        if (primary_mismatches.query_end >= supp_mismatches.query_start) {
-            // Calculate the mismatch rates at the overlapping region
-            double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches);
-            double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches);
-            hts_pos_t overlap_length = primary_mismatches.query_end - supp_mismatches.query_start + 1;
-
-            // Trim the ailgnment with the higher mismatch rate
-            if (primary_mismatch_rate > supp_mismatch_rate) {
-                // Trim the end of the primary alignment, ensuring that the new
-                // end is not less than the start
-                if (primary_alignment.end > overlap_length && (primary_alignment.end - overlap_length) > primary_alignment.start) {
-                    // Trim the end of the primary alignment
-                    primary_alignment.end = primary_alignment.end - overlap_length;
-                }
-            } else {
-                // Trim the beginning of the supplementary alignment, ensuring
-                // that the new start is not greater than the end
-                if (supp_alignment.start + overlap_length < supp_alignment.end) {
-                    // Trim the beginning of the supplementary alignment
-                    supp_alignment.start = supp_alignment.start + overlap_length;
-                }
-            }
-        }
-
-    } else {
-        // Supplementary before primary in the query
-        if (primary_mismatches.query_start <= supp_mismatches.query_end) {
-            // Calculate the mismatch rates at the overlapping region
-            double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches);
-            double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches);
-            hts_pos_t overlap_length = supp_mismatches.query_end - primary_mismatches.query_start + 1;
-
-            // Trim the ailgnment with the higher mismatch rate
-            if (supp_mismatch_rate > primary_mismatch_rate) {
-                // Trim the end of the supplementary alignment, ensuring that
-                // the new end is not less than the start
-                if (supp_alignment.end > overlap_length && (supp_alignment.end - overlap_length) > supp_alignment.start) {
-                    // Trim the end of the supplementary alignment
-                    supp_alignment.end = supp_alignment.end - overlap_length;
-                }
-            } else {
-                // Trim the beginning of the primary alignment, ensuring that
-                // the new start is not greater than the end
-                if (primary_alignment.start + overlap_length < primary_alignment.end) {
-                    // Trim the beginning of the primary alignment
-                    primary_alignment.start = primary_alignment.start + overlap_length;
-                }
-            }
-        }
-    }
-}
-
 int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
 {
     int read_depth = 0;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 0dd23e79..fb3a17d9 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -77,9 +77,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     std::vector<SVCall> merged_sv_calls;
 
     // Create a set of size intervals and corresponding DBSCAN epsilons
-    printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
-    std::map<std::pair<int, int>, double> size_to_eps;
-    DBSCAN dbscan(epsilon, min_pts);
+    // printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
+    // DBSCAN dbscan(epsilon, min_pts);
     for ( const auto& sv_type : {
         SVType::DEL,
         SVType::DUP,
@@ -89,6 +88,16 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         SVType::INV_DUP
     })
     {
+        // Create a DBSCAN object for the current SV type
+        if (sv_type == SVType::DEL) {
+            epsilon = 0.45;
+            min_pts = 16;
+        } else {
+            epsilon = 0.65;
+            min_pts = 15;
+        }
+        DBSCAN dbscan(epsilon, min_pts);
+
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
         std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {

From d6f1b6f4635ec24b9ad868d0b2c649c4410010d8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 8 Feb 2025 18:10:39 -0500
Subject: [PATCH 071/134] add invdel

---
 include/sv_caller.h |   2 +-
 include/sv_object.h |   7 +-
 include/sv_types.h  |   6 +-
 src/sv_caller.cpp   |  80 +++++++++++------------
 src/sv_object.cpp   | 153 ++++++++++++++++++++++++++++++--------------
 5 files changed, 154 insertions(+), 94 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 5eb96e17..fedd172f 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -52,7 +52,7 @@ class SVCaller {
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data);
+        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
diff --git a/include/sv_object.h b/include/sv_object.h
index da544571..58ccd5dc 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -25,17 +25,16 @@ struct SVCall {
     int read_depth = 0;  // Breakpoint depth
     int support = 0;  // Number of supporting reads
     int cluster_size = 0;  // Number of SV calls in the cluster
-    uint8_t qual = 0;  // Alignment quality score
 
     // Comparison operator for std::set
     bool operator<(const SVCall& other) const;
 
     // Constructor with parameters for all fields
-    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size, uint8_t qual) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size), qual(qual) {}
+    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
 };
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual);
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
 
 void mergeSVs(std::vector<SVCall>& sv_calls);
 
diff --git a/include/sv_types.h b/include/sv_types.h
index c0e1fabf..26415935 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -20,8 +20,9 @@ namespace sv_types {
         INS = 3,
         BND = 4,
         NEUTRAL = 5,  // Neutral copy number with unknown type
-        INV_DUP = 6,  // Inversion duplication
-        COMPLEX = 7  // Complex SV
+        INV_DUP = 6,  // Inverted duplication
+        INV_DEL = 7,  // Inverted deletion
+        COMPLEX = 8  // Complex SV
     };
 
     // Mapping of SV types to strings
@@ -34,6 +35,7 @@ namespace sv_types {
         {SVType::BND, "BND"},
         {SVType::NEUTRAL, "NEUTRAL"},
         {SVType::INV_DUP, "INVDUP"},
+        {SVType::INV_DEL, "INVDEL"},
         {SVType::COMPLEX, "COMPLEX"}
     };
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index e6ff793a..a8e25c5f 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -117,7 +117,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
         uint32_t start = entry.second.start;
         uint32_t end = entry.second.end;
         const std::string& qname = entry.first;
-        SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0, 0);
+        SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0);
         dummy_sv_map[chrom].emplace_back(sv_call);
         dummy_sv_qnames[chrom].emplace_back(entry.first);
     }
@@ -323,7 +323,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, qual);
+                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
                         continue;
                     }
                 }
@@ -337,7 +337,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, qual);
+                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
                         continue;
                     }
                 }
@@ -355,7 +355,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     alt_allele = ins_seq_str;
                 }
                 
-                addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, qual);
+                addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -363,7 +363,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth, qual);
+                addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
 
                 // Print if the ref pos is within the range 44007800-44007930
                 if (ref_pos >= 44007800 && ref_pos <= 44007930) {
@@ -394,8 +394,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // int split_sv_support_threshold = 4;  // Minimum number of supporting
     // reads for an SV call
     int split_sv_support_threshold = input_data.getMinReadSupport();
-    // printMessage("Processing chromosome " + chr + " with filter threshold: "
-    // + std::to_string(filter_threshold));
     double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
     int dbscan_min_pts = input_data.getDBSCAN_MinPts();
 
@@ -456,13 +454,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
     printMessage(chr + ": Merging CIGAR...");
-    // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
-    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
-    // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold);
+    double cigar_epsilon = 0.45;
+    int cigar_min_pts = 15;
+    mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts);
+
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
-    // Testing on HG002 whole genome
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
     if (region_sv_count > 0) {
@@ -472,7 +470,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
-    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+    std::vector<SVCall> split_sv_calls;
+    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+
+    // Merge the split-read SVs separately
+    printMessage(chr + ": Merging split reads...");
+    double split_epsilon = 0.45;
+    int split_min_pts = 2;
+    mergeSVs(split_sv_calls, split_epsilon, split_min_pts);
+
+    // Unify the SV calls
+    printMessage(chr + ": Unifying SVs...");
+    chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
+
+    // printMessage(chr + ": Final merge...");
+    // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
+
+    // TODO: Merge subsets based on highest HMM likelihood
 
     // Sort the SV calls by start position
     std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
@@ -583,7 +597,7 @@ void SVCaller::run(const InputData& input_data)
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
+void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
 {
     // printMessage(region + ": Getting split alignments...");
     std::unordered_map<std::string, GenomicRegion> primary_map;
@@ -592,7 +606,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
     // Find split-read SV evidence
     // printMessage(region + ": Finding split-read SVs...");
-    std::vector<SVCall> split_sv_calls;
+    // std::vector<SVCall> split_sv_calls;
     int current_primary = 0;
     int primary_count = primary_map.size();
     uint32_t min_cnv_length = input_data.getMinCNVLength();
@@ -621,7 +635,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
             }
-            addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual);
+            addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
 
             // Create the alternate allele format for the second BND record
             alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
@@ -629,7 +643,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 // Reverse-oriented relative to the reference
                 alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
             }
-            addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual);
+            addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
 
             continue;
         }
@@ -637,15 +651,12 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         // Inversion detection
         bool is_opposite_strand = primary.strand != largest_supp.strand;
         if (is_opposite_strand) {
-            // if (supp_length >= min_cnv_length) {
             if (largest_supp.end - largest_supp.start >= min_cnv_length) {
 
                 // Print error if the start position is greater than the end
                 // position
-                // if (supp_start > supp_end) {
                 if (largest_supp.start > largest_supp.end) {
                     printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end));
-                    // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end));
                     continue;
                 }
 
@@ -656,26 +667,22 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
                 double supp_lh = std::get<0>(result);
                 SVType supp_type = std::get<1>(result);
-                // printMessage("Test3");
                 int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end);
-                // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end);
                 if (supp_type == SVType::NEUTRAL) {
-                    // addSVCall(sv_calls, supp_start, supp_end, "INV",
-                    // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
+                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
                     continue;
                     
                 } else if (supp_type == SVType::DUP) {
-                    // addSVCall(sv_calls, supp_start, supp_end, "INVDUP",
-                    // "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual);
+                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+                    continue;
+                } else if (supp_type == SVType::DEL) {
+                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
                     continue;
                 }
             }
         }
 
         // Analyze split-read evidence for deletions and duplications
-        uint8_t mean_qual = (primary.qual + largest_supp.qual) / 2;
         bool gap_exists = false;
         uint32_t boundary_left, boundary_right, gap_left, gap_right;
         boundary_left = std::min(primary.start, largest_supp.start);
@@ -693,10 +700,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
                 continue;
             }
-
-            // printMessage(region + ": Running copy number prediction for
-            // boundary...");
-            // printMessage("Running copy number prediction, length: " + std::to_string(boundary_right - boundary_left));
+            
             std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
             if (std::get<1>(bd_result) == SVType::UNKNOWN) {
                 continue;
@@ -728,18 +732,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 if (gap_lh > bd_lh) {
                     int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
                     std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-                    addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual);
+                    addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth);
                 } else {
                     // Add the boundary as the SV call
                     int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                     std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                    addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
+                    addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
                 }
             } else {
                 // Add the boundary as the SV call
                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual);
+                addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
             }
         }
 
@@ -749,12 +753,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         }
     }
 
-    // // Merge the split-read SV calls
-    // printMessage(region + ": Merging split-read SVs...");
-    // mergeSVs(split_sv_calls, 0.1, 2);
-
     // Unify the SV calls
-    sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
+    // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index fb3a17d9..e46afb77 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -16,12 +16,8 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual)
+void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
 {
-    // Ignore unknown SV types
-    // if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") {
-    //     return;
-    // }
     if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
         return;
     }
@@ -32,28 +28,9 @@ void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVTy
     }
 
     // Insert the SV call in sorted order
-    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual};
+    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
     sv_calls.insert(it, sv_call);
-
-    // // Determine if the SV call already exists
-    // if (it != sv_calls.end() && it->start == start && it->end == end)
-    // {
-    //     it->support += 1;  // Update the read support
-
-    //     // Update SV type if likelihood is higher
-    //     if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood)
-    //     {
-    //         // Update the SV call
-    //         it->sv_type = sv_type;
-    //         it->data_type = data_type;
-    //         it->genotype = genotype;
-    //         it->hmm_likelihood = hmm_likelihood;
-    //         it->qual = qual;
-    //     }
-    // } else {
-    //     sv_calls.insert(it, sv_call);  // Insert the new SV call
-    // }
 }
 
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls)
@@ -68,6 +45,8 @@ void concatenateSVCalls(std::vector<SVCall> &target, const std::vector<SVCall>&
 
 void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
 {
+    printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
+    
     if (sv_calls.size() < 2) {
         return;
     }
@@ -76,9 +55,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     // Cluster SVs using DBSCAN for each SV type
     std::vector<SVCall> merged_sv_calls;
 
-    // Create a set of size intervals and corresponding DBSCAN epsilons
-    // printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
-    // DBSCAN dbscan(epsilon, min_pts);
+    // Cluster SVs using DBSCAN for each SV type
+    DBSCAN dbscan(epsilon, min_pts);
     for ( const auto& sv_type : {
         SVType::DEL,
         SVType::DUP,
@@ -89,14 +67,22 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     })
     {
         // Create a DBSCAN object for the current SV type
-        if (sv_type == SVType::DEL) {
-            epsilon = 0.45;
-            min_pts = 16;
-        } else {
-            epsilon = 0.65;
-            min_pts = 15;
-        }
-        DBSCAN dbscan(epsilon, min_pts);
+        // epsilon = 0.45;
+        // min_pts = 15;
+        // if (sv_type == SVType::DEL) {
+        //     epsilon = 0.45;
+        //     min_pts = 16;
+        // } else {
+        //     // epsilon = 0.65;
+        //     // min_pts = 15;
+        //     // epsilon = 0.45;
+        //     // min_pts = 16;
+        //     // epsilon = 0.45;
+        //     // min_pts = 2;
+        //     // epsilon = 0.45;
+        //     // min_pts = 15;
+        // }
+        // DBSCAN dbscan(epsilon, min_pts);
 
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
@@ -120,22 +106,95 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         for (auto& cluster : cluster_map) {
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
-            if (cluster_id < 0) {
-                continue;  // Skip noise and unclassified points
-            } else {
-                // Use the median length SV
-                std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                    return (a.end - a.start) < (b.end - b.start);
-                });
-                int median_index = cluster_sv_calls.size() / 2;
-                SVCall median_sv_call = cluster_sv_calls[median_index];
-                median_sv_call.cluster_size = (int) cluster_sv_calls.size();
-                merged_sv_calls.push_back(median_sv_call);
+            // if (cluster_id < 0) {
+            //     continue;  // Skip noise and unclassified points
+            // } else {
+            if (true) {
+                // Use the highest HMM likelihood normalized by SV size as the
+                // representative SV (if any non-zero likelihoods exist)
+                bool has_nonzero_likelihood = false;
+                if (cluster_sv_calls.size() > 0) {
+                    for (const auto& sv_call : cluster_sv_calls) {
+
+                        // Check if any SV has a non-zero likelihood
+                        if (sv_call.hmm_likelihood != 0.0) {
+                            has_nonzero_likelihood = true;
+                            break;
+                        }
+                    }
+                }
+
+                // [TEST] Check if any SV has a length greater than 600kb
+                bool found_large_sv = false;
+                for (const auto& sv_call : cluster_sv_calls) {
+                    if (sv_call.end - sv_call.start > 600000) {
+                        found_large_sv = true;
+                        break;
+                    }
+                }
+                if (found_large_sv) {
+                    printMessage("Found large SV with length greater than 600kb");
+                    printMessage("Found " + std::to_string(cluster_sv_calls.size()) + " SVs in cluster " + std::to_string(cluster_id) + " of type " + getSVTypeString(sv_type) + ", with epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
+                }
+                
+                SVCall merged_sv_call = cluster_sv_calls[0];
+                if (has_nonzero_likelihood) {
+                    // Use the highest HMM likelihood normalized by SV size as the
+                    // representative SV
+                    // std::vector<double> likelihoods;
+                    // Default very low log-likelihood for zero likelihoods
+                    std::vector<double> likelihoods(cluster_sv_calls.size(), -std::numeric_limits<double>::infinity());
+                    // for (const auto& sv_call : cluster_sv_calls) {
+                    int i = 0;
+                    for (const auto& sv_call : cluster_sv_calls) {
+                        if (sv_call.hmm_likelihood != 0.0) {
+                            uint32_t sv_size = (uint32_t) (sv_call.end - sv_call.start);
+                            if (sv_size > 0) {
+                                likelihoods[i] = sv_call.hmm_likelihood / sv_size;
+                                // likelihoods.push_back(sv_call.hmm_likelihood / sv_size);
+                            }
+                        }
+
+                        // Print the SV length, likelihood, and normalized
+                        // likelihood
+                        if (found_large_sv) {
+                            printMessage("Start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", length: " + std::to_string(sv_call.end - sv_call.start));
+                            // printMessage("SV length: " + std::to_string(sv_call.end - sv_call.start) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end));
+                        }
+                        i++;
+                    }
+                    
+                    // Find the index of the maximum element in the likelihoods
+                    // vector
+                    auto max_likelihood_it = std::max_element(likelihoods.begin(), likelihoods.end());
+                    int max_likelihood_index = std::distance(likelihoods.begin(), max_likelihood_it);
+                    merged_sv_call = cluster_sv_calls[max_likelihood_index];
+                    printMessage("Merged SV with highest normalized likelihood: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(merged_sv_call.hmm_likelihood / (merged_sv_call.end - merged_sv_call.start)) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start));
+
+                } else {
+                    // Use the median length SV
+                    std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                        return (a.end - a.start) < (b.end - b.start);
+                    });
+                    int median_index = cluster_sv_calls.size() / 2;
+                    merged_sv_call = cluster_sv_calls[median_index];
+                    printMessage("Merged SV with median length: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start));
+                
+                if (cluster_id < 0) {
+                    merged_sv_call.cluster_size = cluster_id;
+                } else {
+                    merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
+                }
+                // merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
+                merged_sv_calls.push_back(merged_sv_call);
                 cluster_count++;
+                }
             }
         }
         printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type));
     }
+
+    printMessage("[TEST] Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(merged_sv_calls.size()) + " SV calls");
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
 
     int updated_size = sv_calls.size();

From ddf6d50f9b5bdee21879916f5d75d31a9d21bb37 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 10 Feb 2025 20:24:20 -0500
Subject: [PATCH 072/134] improve split read breakpoints

---
 include/dbscan1d.h  |  34 +++
 include/sv_caller.h |  46 +++-
 include/sv_object.h |   5 +-
 src/dbscan1d.cpp    |  90 +++++++
 src/sv_caller.cpp   | 618 ++++++++++++++++++++++++++------------------
 src/sv_object.cpp   | 134 +++++-----
 6 files changed, 591 insertions(+), 336 deletions(-)
 create mode 100644 include/dbscan1d.h
 create mode 100644 src/dbscan1d.cpp

diff --git a/include/dbscan1d.h b/include/dbscan1d.h
new file mode 100644
index 00000000..07692e65
--- /dev/null
+++ b/include/dbscan1d.h
@@ -0,0 +1,34 @@
+#ifndef DBSCAN1D_H
+#define DBSCAN1D_H
+
+
+#include <vector>
+#include <utility>
+#include <cmath>
+#include <algorithm>
+
+// DBSCAN clustering of one-dimensional integer values (absolute-difference distance).
+class DBSCAN1D {
+    public:
+        DBSCAN1D(double epsilon, int minPts) : epsilon(epsilon), minPts(minPts) {}
+        // Label every point: cluster IDs count up from 0, -2 marks noise.
+        void fit(const std::vector<int>& points);
+        // Per-point labels from the most recent fit(), in input order.
+        const std::vector<int>& getClusters() const;
+        // Values of the most populated cluster; pass the same vector that was given to fit().
+        std::vector<int> getLargestCluster(const std::vector<int> &points);
+
+    private:
+        double epsilon;  // Maximum distance for two points to count as neighbors
+        int minPts;  // Minimum regionQuery() result size needed to seed or extend a cluster
+        std::vector<int> clusters;  // One label per point: -1 unclassified, -2 noise, >= 0 cluster ID
+        // Grow cluster clusterId from pointIdx; returns false if the point was marked as noise.
+        bool expandCluster(const std::vector<int>& points, size_t pointIdx, int clusterId);
+        // Indices of all points whose distance to points[pointIdx] is <= epsilon.
+        std::vector<size_t> regionQuery(const std::vector<int>& points, size_t pointIdx) const;
+        // Absolute difference between two values.
+        double distance(int a, int b) const;
+
+};
+
+#endif  // DBSCAN1D_H
diff --git a/include/sv_caller.h b/include/sv_caller.h
index fedd172f..1998547f 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -22,8 +22,50 @@ struct GenomicRegion {
     hts_pos_t end;
     bool strand;
     uint8_t qual;
+    int cluster_size;  // Number of alignments used for this region
 };
 
+// Node of an unbalanced binary search tree of regions, ordered by region.start
+struct IntervalNode {
+    GenomicRegion region;  // Interval stored at this node
+    std::string qname;  // Name stored with the interval; this is what findOverlaps reports
+    hts_pos_t max_end;  // Largest region.end in this node's subtree, used to prune queries
+    IntervalNode* left;  // Subtree of regions with a smaller start
+    IntervalNode* right;  // Subtree of regions with an equal or larger start
+
+    IntervalNode(GenomicRegion r, std::string name)
+        : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
+};
+
+// Insert region (keyed by start) and return the subtree root. `inline`: this definition lives in a header.
+inline IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string qname) {
+    if (!root)
+        return new IntervalNode(region, qname);  // Allocated with new; nothing in this header frees it
+    if (region.start < root->region.start)
+        root->left = insert(root->left, region, qname);
+    else
+        root->right = insert(root->right, region, qname);
+
+    // Every ancestor of the new node must cover its end so queries can prune by max_end
+    root->max_end = std::max(root->max_end, region.end);
+    return root;
+}
+
+// Append the qname of every stored region overlapping query (inclusive ends; tid is not compared).
+inline void findOverlaps(IntervalNode* root, GenomicRegion query, std::vector<std::string>& result) {
+    if (!root) return;
+    // If overlapping, add to result
+    if (query.start <= root->region.end && query.end >= root->region.start)
+        result.push_back(root->qname);
+    // If left subtree may have overlaps, search left
+    if (root->left && root->left->max_end >= query.start)
+        findOverlaps(root->left, query, result);
+
+    // Right-subtree starts are >= this node's start, so a query ending before it cannot overlap there
+    if (root->right && query.end >= root->region.start && root->right->max_end >= query.start)
+        findOverlaps(root->right, query, result);
+}
+
 struct MismatchData {
     uint32_t query_start;
     uint32_t query_end;
@@ -35,7 +77,7 @@ class SVCaller {
         int min_mapq = 20;          // Minimum mapping quality to be considered
         std::mutex shared_mutex;
 
-        void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map);
+        std::vector<SVCall> getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map);
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
@@ -64,6 +106,8 @@ class SVCaller {
         // Calculate the read depth (INFO/DP) for a region
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
 
+        bool regionOverlaps(const GenomicRegion& a, const GenomicRegion& b);
+
     public:
         // Constructor with no arguments
         SVCaller() = default;
diff --git a/include/sv_object.h b/include/sv_object.h
index 58ccd5dc..fc090166 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -34,10 +34,13 @@ struct SVCall {
         start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
 };
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
+// void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
+void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
 
 void mergeSVs(std::vector<SVCall>& sv_calls);
 
+void mergeSVSubsets(std::vector<SVCall>& sv_calls);
+
 void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth);
 
 void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth, const std::string& data_type);
diff --git a/src/dbscan1d.cpp b/src/dbscan1d.cpp
new file mode 100644
index 00000000..90fc9458
--- /dev/null
+++ b/src/dbscan1d.cpp
@@ -0,0 +1,90 @@
+#include "dbscan1d.h"
+
+#include <vector>
+#include <cmath>
+#include <algorithm>
+#include <map>
+
+void DBSCAN1D::fit(const std::vector<int>& points) {
+    // Reset all labels to -1 (unclassified), then try to grow a cluster from each such point
+    clusters.assign(points.size(), -1);
+    int nextClusterId = 0;
+
+    for (size_t idx = 0; idx < points.size(); ++idx) {
+        if (clusters[idx] != -1) continue;  // Already labelled as noise or as a cluster member
+
+        // Consume a cluster ID only when expansion actually produced a cluster
+        if (expandCluster(points, idx, nextClusterId)) ++nextClusterId;
+    }
+}
+// Labels from the last fit(), one per input point (>= 0 cluster ID, -2 noise).
+const std::vector<int>& DBSCAN1D::getClusters() const {
+    return clusters;
+}
+// Try to grow cluster clusterId from pointIdx; returns false (point marked noise) if it has too few neighbors.
+bool DBSCAN1D::expandCluster(const std::vector<int>& points, size_t pointIdx, int clusterId) {
+    std::vector<size_t> seeds = regionQuery(points, pointIdx);
+    if (static_cast<int>(seeds.size()) < minPts) {
+        clusters[pointIdx] = -2; // mark as noise
+        return false;
+    }
+    // NOTE(review): relabels every neighbor, including any already assigned to an earlier cluster - intended?
+    for (size_t seedIdx : seeds) {
+        clusters[seedIdx] = clusterId;
+    }
+    // The starting point's neighborhood is already in seeds, so it does not need to be queried again
+    seeds.erase(std::remove(seeds.begin(), seeds.end(), pointIdx), seeds.end());
+    // Work through the remaining seeds (last in, first out) until none are left
+    while (!seeds.empty()) {
+        size_t currentPoint = seeds.back();
+        seeds.pop_back();
+        // Only points with at least minPts neighbors pull further points into the cluster
+        std::vector<size_t> result = regionQuery(points, currentPoint);
+        if (static_cast<int>(result.size()) >= minPts) {
+            for (size_t resultPoint : result) {
+                if (clusters[resultPoint] == -1 || clusters[resultPoint] == -2) {
+                    if (clusters[resultPoint] == -1) {  // Noise points join the cluster but are not queued
+                        seeds.push_back(resultPoint);
+                    }
+                    clusters[resultPoint] = clusterId;
+                }
+            }
+        }
+    }
+
+    return true;
+}
+// Brute-force neighborhood lookup: indices of all points within epsilon of points[pointIdx].
+std::vector<size_t> DBSCAN1D::regionQuery(const std::vector<int>& points, size_t pointIdx) const {
+    std::vector<size_t> hits;
+    size_t candidate = 0;
+    for (const int value : points) {
+        if (distance(points[pointIdx], value) <= epsilon) hits.push_back(candidate);
+        ++candidate;
+    }
+    return hits;
+}
+// Absolute difference, computed in double so widely separated ints cannot overflow the subtraction.
+double DBSCAN1D::distance(int point1, int point2) const {
+    return std::abs(static_cast<double>(point1) - static_cast<double>(point2));
+}
+// Values of the most populated cluster from the last fit(); empty if there is none. Ties keep the lowest ID.
+std::vector<int> DBSCAN1D::getLargestCluster(const std::vector<int> &points)
+{
+    // Group values by label. Bounding by both sizes avoids reading past the end
+    // of points when it is not the vector the labels were computed from.
+    std::map<int, std::vector<int>> cluster_map;
+    const size_t count = std::min(clusters.size(), points.size());
+    for (size_t i = 0; i < count; ++i) {
+        if (clusters[i] >= 0) {  // Negative labels (noise, unclassified) can never be selected
+            cluster_map[clusters[i]].push_back(points[i]);
+        }
+    }
+    const std::vector<int>* largest = nullptr;
+    for (const auto &entry : cluster_map) {
+        if (largest == nullptr || entry.second.size() > largest->size()) {
+            largest = &entry.second;
+        }
+    }
+    return largest != nullptr ? *largest : std::vector<int>();
+}
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index a8e25c5f..98877f3f 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -28,6 +28,7 @@
 #include "version.h"
 #include "fasta_query.h"
 #include "dbscan.h"
+#include "dbscan1d.h"
 /// @endcond
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
@@ -41,19 +42,19 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
     return ret;
 }
 
-void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map)
+std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
         printError("ERROR: failed to initialize BAM record");
-        return;
+        return {};
     }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
     if (!itr) {
         bam_destroy1(bam1);
         printError("ERROR: failed to query region " + region);
-        return;
+        return {};
     }
 
     uint32_t primary_count = 0;
@@ -73,11 +74,9 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
 
         // Process primary alignments
         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-            // primary_map[qname] = itr;
             // Store chromosome (TID), start, and end positions (1-based) of the
             // primary alignment, and the strand (true for forward, false for reverse)
-            primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq};
-            // primary_map_qual[qname] = bam1->core.qual;
+            primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0};
             primary_count++;
 
         // Process supplementary alignments
@@ -85,7 +84,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
             // supp_map[qname].push_back(itr);
             // Store chromosome (TID), start, and end positions (1-based) of the
             // supplementary alignment, and the strand (true for forward, false for reverse)
-            supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq});
+            supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0});
             supplementary_count++;
         }
         num_alignments++;
@@ -109,106 +108,157 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam
     printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
 
     // Create a set of dummy SVs from the primary alignments for each chromosome
-    // and run DBSCAN to merge them
-    std::unordered_map<std::string, std::vector<SVCall>> dummy_sv_map;
-    std::unordered_map<std::string, std::vector<std::string>> dummy_sv_qnames;
+    // and run DBSCAN to cluster them
+    // std::vector<SVCall> dummy_sv_map;
+    // std::vector<std::string> dummy_sv_qnames;
+    // for (const auto& entry : primary_map) {
+    //     const std::string& chrom = bamHdr->target_name[entry.second.tid];
+    //     if (chrom != region) {
+    //         continue;  // Skip alignments not in the same chromosome
+    //     }
+    //     uint32_t start = entry.second.start;
+    //     uint32_t end = entry.second.end;
+    //     const std::string& qname = entry.first;
+    //     SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0);
+    //     dummy_sv_map.emplace_back(sv_call);
+    //     dummy_sv_qnames.emplace_back(entry.first);
+    // }
+
+    // // Run DBSCAN to merge the dummy SVs
+    // // double epsilon = 0.65;
+    // double epsilon = 0.45;
+    // int min_pts = 2;
+    // std::vector<std::vector<std::string>> primary_clusters;
+    // DBSCAN dbscan(epsilon, min_pts);
+    // dbscan.fit(dummy_sv_map);
+    // const std::vector<int>& cluster_ids = dbscan.getClusters();
+    
+    // // Create the 2D vector of clusters
+    // for (int cluster_id : cluster_ids) {
+    //     if (cluster_id < 0) {
+    //         continue;  // Skip noise and unclassified points
+    //     }
+    //     std::vector<std::string> cluster;
+    //     for (size_t i = 0; i < cluster_ids.size(); ++i) {
+    //         if (cluster_ids[i] == cluster_id) {
+    //             cluster.push_back(dummy_sv_qnames[i]);
+    //         }
+    //     }
+    //     primary_clusters.push_back(cluster);
+    // }
+
+    // Identify overlapping primary alignments and then cluster their primary
+    // start, end vs. supplementary alignment start, end positions, keeping the
+    // median of the largest cluster for the primary and supplementary positions
+    // as the final genome coordinates of the SV
+    IntervalNode* root = nullptr;
     for (const auto& entry : primary_map) {
-        const std::string& chrom = bamHdr->target_name[entry.second.tid];
-        uint32_t start = entry.second.start;
-        uint32_t end = entry.second.end;
         const std::string& qname = entry.first;
-        SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0);
-        dummy_sv_map[chrom].emplace_back(sv_call);
-        dummy_sv_qnames[chrom].emplace_back(entry.first);
-    }
-
-    // Run DBSCAN to merge the dummy SVs
-    double epsilon = 0.65;
-    int min_pts = 2;
-    std::unordered_set<std::string> qnames_to_keep;
-    for (const auto& entry : dummy_sv_map) {
-        const std::string& chrom = entry.first;
-        const std::vector<SVCall>& sv_calls = entry.second;
-        DBSCAN dbscan(epsilon, min_pts);
-        dbscan.fit(sv_calls);
-        const std::vector<int>& clusters = dbscan.getClusters();
-        std::map<int, std::vector<SVCall>> cluster_map;
-        for (size_t i = 0; i < clusters.size(); ++i) {
-            cluster_map[clusters[i]].push_back(sv_calls[i]);
-        }
-
-        // Merge the SVs in each cluster, using the median of the start and end
-        // positions of the SVs in each cluster
-        for (auto& cluster : cluster_map) {
-            int cluster_id = cluster.first;
-            std::vector<SVCall>& cluster_sv_calls = cluster.second;
-            if (cluster_id < 0) {
-                continue;  // Skip noise and unclassified points
-            }
-            
-            // Use the median length SV as the representative SV
-            std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                return (a.end - a.start) < (b.end - b.start);
-            });
-            SVCall median_sv = cluster_sv_calls[cluster_sv_calls.size() / 2];
-            const std::string& qname = median_sv.data_type;
-            qnames_to_keep.insert(qname);
-        }
+        const GenomicRegion& region = entry.second;
+        root = insert(root, region, qname);
     }
+    std::vector<std::vector<std::string>> primary_clusters;
+    std::set<std::string> processed;
 
-    // Remove the SVs that are not in the qnames_to_keep set
-    std::unordered_set<std::string> qnames_to_remove;
     for (const auto& entry : primary_map) {
         const std::string& qname = entry.first;
-        if (qnames_to_keep.find(qname) == qnames_to_keep.end()) {
-            qnames_to_remove.insert(qname);
+        if (processed.find(qname) != processed.end()) {
+            continue;  // Skip already processed primary alignments
+        }
+        const GenomicRegion& region = entry.second;
+        std::vector<std::string> overlap_group;
+        findOverlaps(root, region, overlap_group);
+        for (const std::string& qname : overlap_group) {
+            processed.insert(qname);
+        }
+        if (overlap_group.size() > 1) {
+            primary_clusters.push_back(overlap_group);
         }
     }
+    printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
+
+    // For each group of overlapping primary alignments, cluster the primary and
+    // supplementary start/end positions and keep the median of the largest cluster
+    std::vector<SVCall> sv_candidates;
+    int current_group = 0;
+    int min_length = 2000;
+    int max_length = 1000000;
+    for (const auto& primary_group : primary_clusters) {
+        // Use DBSCAN to cluster primary alignment start, end positions
+        DBSCAN1D dbscan(100, 5);
+        current_group++;
+        std::vector<int> starts;
+        std::vector<int> ends;
+        for (const std::string& qname : primary_group) {
+            const GenomicRegion& region = primary_map[qname];
+            starts.push_back(region.start);
+            ends.push_back(region.end);
+        }
 
-    for (const std::string& qname : qnames_to_remove) {
-        primary_map.erase(qname);
-        supp_map.erase(qname);
-    }
+        // Get the largest cluster of primary alignment start positions
+        dbscan.fit(starts);
+        std::vector<int> primary_start_cluster = dbscan.getLargestCluster(starts);
+
+        // Get the largest cluster of primary alignment end positions
+        dbscan.fit(ends);
+        std::vector<int> primary_end_cluster = dbscan.getLargestCluster(ends);
+
+        // Get the supplementary alignment positions
+        std::vector<int> supp_starts;
+        std::vector<int> supp_ends;
+        for (const std::string& qname : primary_group) {
+            const std::vector<GenomicRegion>& regions = supp_map[qname];
+            for (const GenomicRegion& region : regions) {
+                supp_starts.push_back(region.start);
+                supp_ends.push_back(region.end);
+            }
+        }
 
-    // Filter overlapping primary alignments and keep the one with the highest mapping
-    // quality
+        // Get the largest cluster of supplementary alignment start positions
+        dbscan.fit(supp_starts);
+        std::vector<int> supp_start_cluster = dbscan.getLargestCluster(supp_starts);
+
+        // Get the largest cluster of supplementary alignment end positions
+        dbscan.fit(supp_ends);
+        std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
+
+        // Take the median of whichever cluster (start or end) is larger as the
+        // breakpoint. NOTE(review): equal sizes (including both empty) leave
+        // the position at -1, so the whole group is skipped below - intended?
+        int primary_pos = -1;
+        if (primary_start_cluster.size() > primary_end_cluster.size()) {
+            std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+            primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+        } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
+            std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+            primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
+        }
 
-    // for (const auto& entry1 : primary_map) {
-    //     const std::string& qname1 = entry1.first;
-    //     const GenomicRegion& primary1 = entry1.second;
-    //     for (const auto& entry2 : primary_map) {
-    //         const std::string& qname2 = entry2.first;
-    //         if (qname1 == qname2) {
-    //             continue;
-    //         }
-    //         const GenomicRegion& primary2 = entry2.second;
-    //         if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) {
-    //             // Overlapping primary alignments
-    //             // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2]));
-    //             // if (primary_map_qual[qname1] < primary_map_qual[qname2]) {
-    //             if (primary1.qual < primary2.qual) {
-    //                 // to_remove_overlapping.push_back(qname1);
-    //                 to_remove_overlapping.insert(qname1);
-    //             } else {
-    //                 // If equal, remove the shorter alignment
-    //                 if (primary1.end - primary1.start < primary2.end - primary2.start) {
-    //                     // to_remove_overlapping.push_back(qname1);
-    //                     to_remove_overlapping.insert(qname1);
-    //                 } else {
-    //                     // to_remove_overlapping.push_back(qname2);
-    //                     to_remove_overlapping.insert(qname2);
-    //                 }
-    //             }
-    //         }
-    //     }
-    // }
+        // Pick the supplementary breakpoint the same way (median of the larger cluster)
+        int supp_pos = -1;
+        if (supp_start_cluster.size() > supp_end_cluster.size()) {
+            std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+            supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+        } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
+            std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+            supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
+        }
 
-    // for (const std::string& qname : to_remove_overlapping) {
-    //     primary_map.erase(qname);
-    //     supp_map.erase(qname);
-    // }
-    // printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments");
-    printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supp_map.size()) + " supplementary alignments after filtering");
+        if (primary_pos == -1 || supp_pos == -1) {
+            continue;
+        }
+
+        // Store the SV candidate if the length is between 2kb and 1Mb
+        int sv_start = std::min(primary_pos, supp_pos);
+        int sv_end = std::max(primary_pos, supp_pos);
+        int sv_length = sv_end - sv_start + 1;
+        if (sv_length >= min_length && sv_length <= max_length) {
+            SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "NA", "./.", 0.0, 0, 0, 0);
+            sv_candidates.push_back(sv_candidate);
+        }
+    }
+
+    return sv_candidates;
 }
 
 
@@ -323,7 +373,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth);
+                        // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>",
+                        // "LSEQSIM", "./.", default_lh, read_depth);
+                        SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, 1, 0);
+                        addSVCall(sv_calls, sv_call);
                         continue;
                     }
                 }
@@ -337,7 +390,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
+                        SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, 1, 0);
+                        addSVCall(sv_calls, sv_call);
+                        // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
                         continue;
                     }
                 }
@@ -354,8 +409,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                
-                addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0);
+                addSVCall(sv_calls, sv_call);                
+                // addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -363,7 +419,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth);
+                // addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>",
+                // "CIGARDEL", "./.", default_lh, read_depth);
+                SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
+                addSVCall(sv_calls, sv_call);
 
                 // Print if the ref pos is within the range 44007800-44007930
                 if (ref_pos >= 44007800 && ref_pos <= 44007930) {
@@ -450,23 +509,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     }
 
     // Detect SVs from the CIGAR strings
-    printMessage(chr + ": CIGAR SVs...");
-    this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
+    // printMessage(chr + ": CIGAR SVs...");
+    // this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
-    printMessage(chr + ": Merging CIGAR...");
-    double cigar_epsilon = 0.45;
-    int cigar_min_pts = 15;
-    mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts);
+    // printMessage(chr + ": Merging CIGAR...");
+    // double cigar_epsilon = 0.45;
+    // int cigar_min_pts = 15;
+    // mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts);
 
-    int region_sv_count = getSVCount(chr_sv_calls);
-    printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+    // int region_sv_count = getSVCount(chr_sv_calls);
+    // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    if (region_sv_count > 0) {
-        printMessage(chr + ": CIGAR predictions...");
-        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
-    }
+    // if (region_sv_count > 0) {
+    //     printMessage(chr + ": CIGAR predictions...");
+    //     cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+    // }
 
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
@@ -479,27 +538,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     int split_min_pts = 2;
     mergeSVs(split_sv_calls, split_epsilon, split_min_pts);
 
-    // Unify the SV calls
     printMessage(chr + ": Unifying SVs...");
     chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 
-    // printMessage(chr + ": Final merge...");
-    // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
-
-    // TODO: Merge subsets based on highest HMM likelihood
+    mergeSVSubsets(chr_sv_calls);
 
     // Sort the SV calls by start position
     std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
         return a.start < b.start;
     });
 
-    // Merge the SV calls from the current region
-    // printMessage(chr + ": Merging split reads...");
-    // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold);
-    // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT");
-
-    // Run a final merge on the combined SV calls
-    // printMessage(chr + ": Merging final calls...");
     printMessage("Completed chromosome " + chr);
 }
 
@@ -602,156 +650,205 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
     // printMessage(region + ": Getting split alignments...");
     std::unordered_map<std::string, GenomicRegion> primary_map;
     std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
-    this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
-
-    // Find split-read SV evidence
-    // printMessage(region + ": Finding split-read SVs...");
-    // std::vector<SVCall> split_sv_calls;
-    int current_primary = 0;
-    int primary_count = primary_map.size();
-    uint32_t min_cnv_length = input_data.getMinCNVLength();
-    for (auto& entry : primary_map) {
-        current_primary++;
-        const std::string& qname = entry.first;
-        GenomicRegion& primary = entry.second;
-        const std::string& primary_chr = bamHdr->target_name[primary.tid];
-
-      	// Find the largest supplementary alignment
-        auto& supp_regions = supp_map[qname];
-        // GenomicRegion largest_supp = supp_regions[0];
-        auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) {
-            return a.end - a.start < b.end - b.start;
-        });
-        GenomicRegion largest_supp = *it;
-
-        // If on a different chromosome, label as a translocation
-        if (primary.tid != largest_supp.tid) {
-            // Note that these do not currently have a likelihood score or read depth
-            // Create two BND records for the translocation
-            // Create the alternate allele format for the first BND record
-            const std::string& supp_chr = bamHdr->target_name[largest_supp.tid];
-            std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "[";
-            if (largest_supp.strand == false) {
-                // Reverse-oriented relative to the reference
-                alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
-            }
-            addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
-
-            // Create the alternate allele format for the second BND record
-            alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
-            if (primary.strand == false) {
-                // Reverse-oriented relative to the reference
-                alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
-            }
-            addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
+    std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
+
+    // Run copy number predictions on the SVs detected from the split reads
+    printMessage(region + ": Split read predictions...");
+    int current_sv = 0;
+    int total_svs = sv_candidates.size();
+    int min_cnv_length = input_data.getMinCNVLength();
+    for (auto& sv_candidate : sv_candidates) {
+        // Skip if the SV is too small
+        if ((int)sv_candidate.end - (int)sv_candidate.start <= min_cnv_length) {
+            continue;
+        }
 
+        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
+        if (std::get<1>(result) == SVType::UNKNOWN) {
             continue;
         }
 
-        // Inversion detection
-        bool is_opposite_strand = primary.strand != largest_supp.strand;
-        if (is_opposite_strand) {
-            if (largest_supp.end - largest_supp.start >= min_cnv_length) {
+        double supp_lh = std::get<0>(result);
+        SVType supp_type = std::get<1>(result);
+        // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+        if (supp_type != SVType::NEUTRAL) {
+            int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+            std::string alt_allele = supp_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(supp_type) + ">";
+            SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", "./.", supp_lh, read_depth, 1, sv_candidate.cluster_size);
+            addSVCall(split_sv_calls, sv_call);
+        }
+        current_sv++;
+        if (current_sv % 1000 == 0) {
+            printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
+        }
+    }
 
-                // Print error if the start position is greater than the end
-                // position
-                if (largest_supp.start > largest_supp.end) {
-                    printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end));
-                    continue;
-                }
+    // // Find split-read SV evidence
+    // // printMessage(region + ": Finding split-read SVs...");
+    // // std::vector<SVCall> split_sv_calls;
+    // int current_primary = 0;
+    // int primary_count = primary_map.size();
+    // uint32_t min_cnv_length = input_data.getMinCNVLength();
+    // for (auto& entry : primary_map) {
+    //     current_primary++;
+    //     const std::string& qname = entry.first;
+    //     GenomicRegion& primary = entry.second;
+    //     const std::string& primary_chr = bamHdr->target_name[primary.tid];
+    //     int primary_cluster_size = primary.cluster_size;
+
+    //   	// Find the largest supplementary alignment
+    //     auto& supp_regions = supp_map[qname];
+    //     // GenomicRegion largest_supp = supp_regions[0];
+    //     auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) {
+    //         return a.end - a.start < b.end - b.start;
+    //     });
+    //     GenomicRegion largest_supp = *it;
+
+    //     // If on a different chromosome, label as a translocation
+    //     if (primary.tid != largest_supp.tid) {
+    //         // Note that these do not currently have a likelihood score or read depth
+    //         // Create two BND records for the translocation
+    //         // Create the alternate allele format for the first BND record
+    //         const std::string& supp_chr = bamHdr->target_name[largest_supp.tid];
+    //         std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "[";
+    //         if (largest_supp.strand == false) {
+    //             // Reverse-oriented relative to the reference
+    //             alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
+    //         }
+    //         // addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
+    //         SVCall sv_call1(primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size);
+    //         addSVCall(split_sv_calls, sv_call1);
+
+    //         // Create the alternate allele format for the second BND record
+    //         alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
+    //         if (primary.strand == false) {
+    //             // Reverse-oriented relative to the reference
+    //             alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
+    //         }
+    //         // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end,
+    //         // SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
+    //         SVCall sv_call2(largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size);
+    //         addSVCall(split_sv_calls, sv_call2);
 
-                std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data);
-                if (std::get<1>(result) == SVType::UNKNOWN) {
-                    continue;
-                }
+    //         continue;
+    //     }
+
+    //     // Inversion detection
+    //     bool is_opposite_strand = primary.strand != largest_supp.strand;
+    //     if (is_opposite_strand) {
+    //         if (largest_supp.end - largest_supp.start >= min_cnv_length) {
+
+    //             // Print error if the start position is greater than the end
+    //             // position
+    //             if (largest_supp.start > largest_supp.end) {
+    //                 printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end));
+    //                 continue;
+    //             }
 
-                double supp_lh = std::get<0>(result);
-                SVType supp_type = std::get<1>(result);
-                int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end);
-                if (supp_type == SVType::NEUTRAL) {
-                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    continue;
+    //             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data);
+    //             if (std::get<1>(result) == SVType::UNKNOWN) {
+    //                 continue;
+    //             }
+
+    //             double supp_lh = std::get<0>(result);
+    //             SVType supp_type = std::get<1>(result);
+    //             int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end);
+    //             if (supp_type == SVType::NEUTRAL) {
+    //                 SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size);
+    //                 addSVCall(split_sv_calls, sv_call);
+    //                 // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+    //                 continue;
                     
-                } else if (supp_type == SVType::DUP) {
-                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    continue;
-                } else if (supp_type == SVType::DEL) {
-                    addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-                    continue;
-                }
-            }
-        }
+    //             } else if (supp_type == SVType::DUP) {
+    //                 SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size);
+    //                 addSVCall(split_sv_calls, sv_call);
+    //                 // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+    //                 continue;
+    //             } else if (supp_type == SVType::DEL) {
+    //                 SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DEL, "<INV>", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size);
+    //                 addSVCall(split_sv_calls, sv_call);
+    //                 // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
+    //                 continue;
+    //             }
+    //         }
+    //     }
 
-        // Analyze split-read evidence for deletions and duplications
-        bool gap_exists = false;
-        uint32_t boundary_left, boundary_right, gap_left, gap_right;
-        boundary_left = std::min(primary.start, largest_supp.start);
-        boundary_right = std::max(primary.end, largest_supp.end);
-        gap_left = std::min(primary.end, largest_supp.start);
-        gap_right = std::max(primary.start, largest_supp.end);
-        gap_exists = gap_left < gap_right;
+    //     // Analyze split-read evidence for deletions and duplications
+    //     bool gap_exists = false;
+    //     uint32_t boundary_left, boundary_right, gap_left, gap_right;
+    //     boundary_left = std::min(primary.start, largest_supp.start);
+    //     boundary_right = std::max(primary.end, largest_supp.end);
+    //     gap_left = std::min(primary.end, largest_supp.start);
+    //     gap_right = std::max(primary.start, largest_supp.end);
+    //     gap_exists = gap_left < gap_right;
         
-        // Run copy number variant predictions on the boundary if large enough
-        if (boundary_right - boundary_left >= min_cnv_length) {
+    //     // Run copy number variant predictions on the boundary if large enough
+    //     if (boundary_right - boundary_left >= min_cnv_length) {
 
-            // Print error if the start position is greater than the end
-            // position
-            if (boundary_left > boundary_right) {
-                printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
-                continue;
-            }
+    //         // Print error if the start position is greater than the end
+    //         // position
+    //         if (boundary_left > boundary_right) {
+    //             printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
+    //             continue;
+    //         }
             
-            std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
-            if (std::get<1>(bd_result) == SVType::UNKNOWN) {
-                continue;
-            }
-            double bd_lh = std::get<0>(bd_result);
-            SVType bd_type = std::get<1>(bd_result);
+    //         std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
+    //         if (std::get<1>(bd_result) == SVType::UNKNOWN) {
+    //             continue;
+    //         }
+    //         double bd_lh = std::get<0>(bd_result);
+    //         SVType bd_type = std::get<1>(bd_result);
 
-            // Run copy number variant predictions on the gap if it exists
-            if (gap_exists && gap_right - gap_left >= min_cnv_length) {
+    //         // Run copy number variant predictions on the gap if it exists
+    //         if (gap_exists && gap_right - gap_left >= min_cnv_length) {
 
-                // Print error if the start position is greater than the end
-                // position
-                if (gap_left > gap_right) {
-                    printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
-                    continue;
-                }
+    //             // Print error if the start position is greater than the end
+    //             // position
+    //             if (gap_left > gap_right) {
+    //                 printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
+    //                 continue;
+    //             }
 
-                // printMessage(region + ": Running copy number prediction for
-                // gap...");
-                // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left));
-                std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
-                if (std::get<1>(gap_result) == SVType::UNKNOWN) {
-                    continue;
-                }
-                double gap_lh = std::get<0>(gap_result);
-                SVType gap_type = std::get<1>(gap_result);
-
-                // If higher likelihood than the boundary, add the gap as the SV call
-                if (gap_lh > bd_lh) {
-                    int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
-                    std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-                    addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth);
-                } else {
-                    // Add the boundary as the SV call
-                    int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
-                    std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                    addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
-                }
-            } else {
-                // Add the boundary as the SV call
-                int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
-                std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-                addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
-            }
-        }
+    //             // printMessage(region + ": Running copy number prediction for
+    //             // gap...");
+    //             // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left));
+    //             std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
+    //             if (std::get<1>(gap_result) == SVType::UNKNOWN) {
+    //                 continue;
+    //             }
+    //             double gap_lh = std::get<0>(gap_result);
+    //             SVType gap_type = std::get<1>(gap_result);
+
+    //             // If higher likelihood than the boundary, add the gap as the SV call
+    //             if (gap_lh > bd_lh) {
+    //                 int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
+    //                 std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
+    //                 SVCall sv_call(gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, 1, primary_cluster_size);
+    //                 addSVCall(split_sv_calls, sv_call);
+    //                 // addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth);
+    //             } else {
+    //                 // Add the boundary as the SV call
+    //                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
+    //                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
+    //                 SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size);
+    //                 addSVCall(split_sv_calls, sv_call);
+    //                 // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
+    //             }
+    //         } else {
+    //             // Add the boundary as the SV call
+    //             int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
+    //             std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
+    //             SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size);
+    //             addSVCall(split_sv_calls, sv_call);
+    //             // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
+    //         }
+    //     }
 
-        // Print progress every 1000 primary alignments
-        if (current_primary % 1000 == 0) {
-            printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
-        }
-    }
+    //     // Print progress every 1000 primary alignments
+    //     if (current_primary % 1000 == 0) {
+    //         printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
+    //     }
+    // }
 
     // Unify the SV calls
     // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
@@ -984,3 +1081,8 @@ int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uin
     // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
     return read_depth;
 }
+
+bool SVCaller::regionOverlaps(const GenomicRegion &a, const GenomicRegion &b)
+{
+    return a.tid == b.tid && a.start <= b.end && b.start <= a.end;
+}
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index e46afb77..32738adc 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -16,21 +16,26 @@ bool SVCall::operator<(const SVCall & other) const
 	return start < other.start || (start == other.start && end < other.end);
 }
 
-void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth)
+void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
-    if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
+    if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) {
         return;
     }
-    
-    if (start > end) {
-        printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end));
+
+    // Check if the SV call is valid
+    if (sv_call.start > sv_call.end) {
+        printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end));
         return;
     }
 
     // Insert the SV call in sorted order
-    SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1};
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
     sv_calls.insert(it, sv_call);
+
+    // Insert the SV call in sorted order
+    // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1};
+    // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
+    // sv_calls.insert(it, sv_call);
 }
 
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls)
@@ -66,24 +71,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         SVType::INV_DUP
     })
     {
-        // Create a DBSCAN object for the current SV type
-        // epsilon = 0.45;
-        // min_pts = 15;
-        // if (sv_type == SVType::DEL) {
-        //     epsilon = 0.45;
-        //     min_pts = 16;
-        // } else {
-        //     // epsilon = 0.65;
-        //     // min_pts = 15;
-        //     // epsilon = 0.45;
-        //     // min_pts = 16;
-        //     // epsilon = 0.45;
-        //     // min_pts = 2;
-        //     // epsilon = 0.45;
-        //     // min_pts = 15;
-        // }
-        // DBSCAN dbscan(epsilon, min_pts);
-
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
         std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
@@ -110,8 +97,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
             //     continue;  // Skip noise and unclassified points
             // } else {
             if (true) {
-                // Use the highest HMM likelihood normalized by SV size as the
-                // representative SV (if any non-zero likelihoods exist)
+                // Check if any SV has a non-zero likelihood
                 bool has_nonzero_likelihood = false;
                 if (cluster_sv_calls.size() > 0) {
                     for (const auto& sv_call : cluster_sv_calls) {
@@ -123,53 +109,20 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                         }
                     }
                 }
-
-                // [TEST] Check if any SV has a length greater than 600kb
-                bool found_large_sv = false;
-                for (const auto& sv_call : cluster_sv_calls) {
-                    if (sv_call.end - sv_call.start > 600000) {
-                        found_large_sv = true;
-                        break;
-                    }
-                }
-                if (found_large_sv) {
-                    printMessage("Found large SV with length greater than 600kb");
-                    printMessage("Found " + std::to_string(cluster_sv_calls.size()) + " SVs in cluster " + std::to_string(cluster_id) + " of type " + getSVTypeString(sv_type) + ", with epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
-                }
                 
                 SVCall merged_sv_call = cluster_sv_calls[0];
                 if (has_nonzero_likelihood) {
-                    // Use the highest HMM likelihood normalized by SV size as the
-                    // representative SV
-                    // std::vector<double> likelihoods;
-                    // Default very low log-likelihood for zero likelihoods
-                    std::vector<double> likelihoods(cluster_sv_calls.size(), -std::numeric_limits<double>::infinity());
-                    // for (const auto& sv_call : cluster_sv_calls) {
-                    int i = 0;
-                    for (const auto& sv_call : cluster_sv_calls) {
-                        if (sv_call.hmm_likelihood != 0.0) {
-                            uint32_t sv_size = (uint32_t) (sv_call.end - sv_call.start);
-                            if (sv_size > 0) {
-                                likelihoods[i] = sv_call.hmm_likelihood / sv_size;
-                                // likelihoods.push_back(sv_call.hmm_likelihood / sv_size);
-                            }
-                        }
+                    // These are detected from split reads, choose the one with
+                    // the highest non-zero likelihood
+                    std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                        return a.hmm_likelihood > b.hmm_likelihood;
+                    });
 
-                        // Print the SV length, likelihood, and normalized
-                        // likelihood
-                        if (found_large_sv) {
-                            printMessage("Start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", length: " + std::to_string(sv_call.end - sv_call.start));
-                            // printMessage("SV length: " + std::to_string(sv_call.end - sv_call.start) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end));
-                        }
-                        i++;
-                    }
-                    
-                    // Find the index of the maximum element in the likelihoods
-                    // vector
-                    auto max_likelihood_it = std::max_element(likelihoods.begin(), likelihoods.end());
-                    int max_likelihood_index = std::distance(likelihoods.begin(), max_likelihood_it);
-                    merged_sv_call = cluster_sv_calls[max_likelihood_index];
-                    printMessage("Merged SV with highest normalized likelihood: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(merged_sv_call.hmm_likelihood / (merged_sv_call.end - merged_sv_call.start)) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start));
+                    // Obtain the highest non-zero likelihood
+                    auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
+                        return sv_call.hmm_likelihood != 0.0;
+                    });
+                    merged_sv_call = *it;
 
                 } else {
                     // Use the median length SV
@@ -178,30 +131,59 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                     });
                     int median_index = cluster_sv_calls.size() / 2;
                     merged_sv_call = cluster_sv_calls[median_index];
-                    printMessage("Merged SV with median length: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start));
-                
+                }
+
                 if (cluster_id < 0) {
                     merged_sv_call.cluster_size = cluster_id;
                 } else {
                     merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
                 }
-                // merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
                 merged_sv_calls.push_back(merged_sv_call);
                 cluster_count++;
-                }
             }
         }
         printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type));
     }
-
-    printMessage("[TEST] Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(merged_sv_calls.size()) + " SV calls");
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
 
     int updated_size = sv_calls.size();
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
 }
 
-void filterSVsWithLowSupport(std::vector<SVCall>& sv_calls, int min_support)
+void mergeSVSubsets(std::vector<SVCall> &sv_calls)
+{
+    // Sort the SV calls by start position
+    int initial_size = sv_calls.size();
+    std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+        return a.start < b.start;
+    });
+
+    // Remove SVs that are subsets of other SVs
+    std::vector<SVCall> filtered_sv_calls;
+    // Since the input SV calls are sorted by start position, we can iterate
+    // through them in order and only keep the SVs that are not subsets of
+    // others
+    for (const auto& sv_call : sv_calls) {
+        // Check if the current SV call is a subset of any previously added
+        // SV call
+        bool is_subset = false;
+        for (const auto& filtered_sv_call : filtered_sv_calls) {
+            if (sv_call.start >= filtered_sv_call.start && sv_call.end <= filtered_sv_call.end) {
+                is_subset = true;
+                break;
+            }
+        }
+        // If it's not a subset, add it to the filtered list
+        if (!is_subset) {
+            filtered_sv_calls.push_back(sv_call);
+        }
+    }
+    sv_calls = std::move(filtered_sv_calls); // Replace with filtered list
+    int updated_size = sv_calls.size();
+    printMessage("Filtered SV calls to remove subsets, from " + std::to_string(initial_size) + " to " + std::to_string(updated_size));
+}
+
+void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_support)
 {
     // Filter SV calls with low read support or low cluster size
     sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {

From 3341a9295016509419f07a9e61cd6fb85f87c0fd Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 10 Feb 2025 20:39:40 -0500
Subject: [PATCH 073/134] remove test code

---
 src/sv_caller.cpp | 66 +++++++++--------------------------------------
 1 file changed, 12 insertions(+), 54 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 98877f3f..362413f2 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -61,7 +61,6 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx,
     uint32_t supplementary_count = 0;
 
     // Main loop to process the alignments
-    // std::unordered_map<std::string, uint8_t> primary_map_qual;
     uint32_t num_alignments = 0;
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
@@ -81,7 +80,6 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx,
 
         // Process supplementary alignments
         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-            // supp_map[qname].push_back(itr);
             // Store chromosome (TID), start, and end positions (1-based) of the
             // supplementary alignment, and the strand (true for forward, false for reverse)
             supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0});
@@ -107,46 +105,6 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx,
     bam_destroy1(bam1);
     printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
 
-    // Create a set of dummy SVs from the primary alignments for each chromosome
-    // and run DBSCAN to cluster them
-    // std::vector<SVCall> dummy_sv_map;
-    // std::vector<std::string> dummy_sv_qnames;
-    // for (const auto& entry : primary_map) {
-    //     const std::string& chrom = bamHdr->target_name[entry.second.tid];
-    //     if (chrom != region) {
-    //         continue;  // Skip alignments not in the same chromosome
-    //     }
-    //     uint32_t start = entry.second.start;
-    //     uint32_t end = entry.second.end;
-    //     const std::string& qname = entry.first;
-    //     SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0);
-    //     dummy_sv_map.emplace_back(sv_call);
-    //     dummy_sv_qnames.emplace_back(entry.first);
-    // }
-
-    // // Run DBSCAN to merge the dummy SVs
-    // // double epsilon = 0.65;
-    // double epsilon = 0.45;
-    // int min_pts = 2;
-    // std::vector<std::vector<std::string>> primary_clusters;
-    // DBSCAN dbscan(epsilon, min_pts);
-    // dbscan.fit(dummy_sv_map);
-    // const std::vector<int>& cluster_ids = dbscan.getClusters();
-    
-    // // Create the 2D vector of clusters
-    // for (int cluster_id : cluster_ids) {
-    //     if (cluster_id < 0) {
-    //         continue;  // Skip noise and unclassified points
-    //     }
-    //     std::vector<std::string> cluster;
-    //     for (size_t i = 0; i < cluster_ids.size(); ++i) {
-    //         if (cluster_ids[i] == cluster_id) {
-    //             cluster.push_back(dummy_sv_qnames[i]);
-    //         }
-    //     }
-    //     primary_clusters.push_back(cluster);
-    // }
-
     // Identify overlapping primary alignments and then cluster their primary
     // start, end vs. supplementary alignment start, end positions, keeping the
     // median of the largest cluster for the primary and supplementary positions
@@ -509,23 +467,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     }
 
     // Detect SVs from the CIGAR strings
-    // printMessage(chr + ": CIGAR SVs...");
-    // this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
+    printMessage(chr + ": CIGAR SVs...");
+    this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
-    // printMessage(chr + ": Merging CIGAR...");
-    // double cigar_epsilon = 0.45;
-    // int cigar_min_pts = 15;
-    // mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts);
+    printMessage(chr + ": Merging CIGAR...");
+    double cigar_epsilon = 0.45;
+    int cigar_min_pts = 15;
+    mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts);
 
-    // int region_sv_count = getSVCount(chr_sv_calls);
-    // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+    int region_sv_count = getSVCount(chr_sv_calls);
+    printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    // if (region_sv_count > 0) {
-    //     printMessage(chr + ": CIGAR predictions...");
-    //     cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
-    // }
+    if (region_sv_count > 0) {
+        printMessage(chr + ": CIGAR predictions...");
+        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+    }
 
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");

From 12ae0f76a7ba85f909abfb312ad067aab7a856b9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 15 Feb 2025 14:00:30 -0500
Subject: [PATCH 074/134] get chromosomes from bam

---
 include/sv_caller.h |   2 +
 src/sv_caller.cpp   | 105 +++++++++++++++++++++++++++++++++-----------
 2 files changed, 82 insertions(+), 25 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 1998547f..e0b477e6 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -77,6 +77,8 @@ class SVCaller {
         int min_mapq = 20;          // Minimum mapping quality to be considered
         std::mutex shared_mutex;
 
+        std::vector<std::string> getChromosomes(const std::string& bam_filepath);
+
         std::vector<SVCall> getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map);
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 362413f2..8449930e 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -42,7 +42,31 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
     return ret;
 }
 
-std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map)
+// Collect the target (chromosome) names stored in the header of the BAM file
+// at `bam_filepath`, in index order. If the file or its header cannot be read,
+// the failure is reported through printError and an empty list is returned.
+std::vector<std::string> SVCaller::getChromosomes(const std::string &bam_filepath)
+{
+    samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
+    if (bam_file == nullptr) {
+        printError("ERROR: failed to open BAM file " + bam_filepath);
+        return {};
+    }
+
+    bam_hdr_t *header = sam_hdr_read(bam_file);
+    if (header == nullptr) {
+        sam_close(bam_file);
+        printError("ERROR: failed to read header from " + bam_filepath);
+        return {};
+    }
+
+    // One entry per header target; the names are copied before the header is freed
+    char **names_begin = header->target_name;
+    std::vector<std::string> names(names_begin, names_begin + header->n_targets);
+    bam_hdr_destroy(header);
+    sam_close(bam_file);
+    return names;
+}
+
+std::vector<SVCall> SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string &region, std::unordered_map<std::string, GenomicRegion> &primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>> &supp_map)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -142,15 +166,34 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx,
     int min_length = 2000;
     int max_length = 1000000;
     for (const auto& primary_group : primary_clusters) {
+        // Determine if the primary alignments are mostly on opposite strands to
+        // the corresponding supplementary alignments (potential inversions)
+        bool inversion = false;
+        for (const std::string& qname : primary_group) {
+            const std::vector<GenomicRegion>& regions = supp_map[qname];
+            int num_supp = (int) regions.size();
+            int num_opposite_strand = 0;
+            for (const GenomicRegion& region : regions) {
+                if (region.strand != primary_map[qname].strand) {
+                    num_opposite_strand++;
+                }
+            }
+            if (static_cast<double>(num_opposite_strand) / static_cast<double>(num_supp) > 0.5) {
+                inversion = true;
+            }
+        }
+
         // Use DBSCAN to cluster primary alignment start, end positions
         DBSCAN1D dbscan(100, 5);
         current_group++;
         std::vector<int> starts;
         std::vector<int> ends;
+        std::vector<bool> primary_strands;
         for (const std::string& qname : primary_group) {
             const GenomicRegion& region = primary_map[qname];
             starts.push_back(region.start);
             ends.push_back(region.end);
+            primary_strands.push_back(region.strand);
         }
 
         // Get the largest cluster of primary alignment start positions
@@ -164,11 +207,13 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx,
         // Get the supplementary alignment positions
         std::vector<int> supp_starts;
         std::vector<int> supp_ends;
+        std::vector<bool> supp_strands;
         for (const std::string& qname : primary_group) {
             const std::vector<GenomicRegion>& regions = supp_map[qname];
             for (const GenomicRegion& region : regions) {
                 supp_starts.push_back(region.start);
                 supp_ends.push_back(region.end);
+                supp_strands.push_back(region.strand);
             }
         }
 
@@ -206,12 +251,13 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx,
             continue;
         }
 
-        // Store the SV candidate if the length is between 2kb and 1Mb
+        // Store the SV candidate if the length is within the specified range
         int sv_start = std::min(primary_pos, supp_pos);
         int sv_end = std::max(primary_pos, supp_pos);
         int sv_length = sv_end - sv_start + 1;
+        SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
         if (sv_length >= min_length && sv_length <= max_length) {
-            SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "NA", "./.", 0.0, 0, 0, 0);
+            SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "NA", "./.", 0.0, 0, 0, 0);
             sv_candidates.push_back(sv_candidate);
         }
     }
@@ -289,7 +335,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
     uint32_t query_pos = 0;
-    uint8_t qual = alignment->core.qual;
 
     // Loop through the CIGAR string, process operations, detect SVs (primary
     // only), and calculate sequence identity for potential duplications (primary only)
@@ -404,13 +449,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
 
 void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome)
 {
-    // int filter_threshold = 4;  // Minimum number of supporting reads for an SV call
-    // int filter_threshold = 10;  // Minimum number of supporting reads for an
-    // SV call
-    int cigar_sv_support_threshold = input_data.getMinReadSupport();  // Minimum number of supporting reads for an SV call
-    // int split_sv_support_threshold = 4;  // Minimum number of supporting
-    // reads for an SV call
-    int split_sv_support_threshold = input_data.getMinReadSupport();
     double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
     int dbscan_min_pts = input_data.getDBSCAN_MinPts();
 
@@ -447,6 +485,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // Set the region to process
     std::string region = chr;
     uint32_t chr_len = ref_genome.getChromosomeLength(chr);
+    // uint32_t chr_len = bamHdr->target_len[bam_name2id(bamHdr, chr.c_str())];
     if (input_data.isRegionSet()) {
 
         // Use one chunk for the specified region
@@ -471,9 +510,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
     printMessage(chr + ": Merging CIGAR...");
-    double cigar_epsilon = 0.45;
-    int cigar_min_pts = 15;
-    mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts);
+    // double cigar_epsilon = 0.45;
+    // int cigar_min_pts = 15;
+    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
 
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
@@ -509,6 +548,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     printMessage("Completed chromosome " + chr);
 }
 
+
+
 void SVCaller::run(const InputData& input_data)
 {
     // Set up the reference genome
@@ -520,9 +561,12 @@ void SVCaller::run(const InputData& input_data)
     // Get the chromosomes
     std::vector<std::string> chromosomes;
     if (input_data.isSingleChr()) {
+        // Get the chromosome from the user input argument
         chromosomes.push_back(input_data.getChromosome());
     } else {
-        chromosomes = ref_genome.getChromosomes();
+        // chromosomes = ref_genome.getChromosomes();
+        // Get the chromosomes from the input BAM file
+        chromosomes = this->getChromosomes(input_data.getLongReadBam());
     }
     
     // Read the HMM from the file
@@ -614,12 +658,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
     printMessage(region + ": Split read predictions...");
     int current_sv = 0;
     int total_svs = sv_candidates.size();
-    int min_cnv_length = input_data.getMinCNVLength();
     for (auto& sv_candidate : sv_candidates) {
-        // Skip if the SV is too small
-        if ((int)sv_candidate.end - (int)sv_candidate.start <= min_cnv_length) {
-            continue;
-        }
+        bool is_inversion = sv_candidate.sv_type == SVType::INV;
 
         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         if (std::get<1>(result) == SVType::UNKNOWN) {
@@ -628,12 +668,27 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
 
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
+        std::string genotype = std::get<2>(result);
+
         // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-        if (supp_type != SVType::NEUTRAL) {
-            int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-            std::string alt_allele = supp_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(supp_type) + ">";
-            SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", "./.", supp_lh, read_depth, 1, sv_candidate.cluster_size);
-            addSVCall(split_sv_calls, sv_call);
+        if (supp_type != SVType::UNKNOWN) {
+            if (is_inversion) {
+                if (supp_type == SVType::DEL) {
+                    supp_type = SVType::INV_DEL;
+                } else if (supp_type == SVType::DUP) {
+                    supp_type = SVType::INV_DUP;
+                } else if (supp_type == SVType::NEUTRAL) {
+                    supp_type = SVType::INV;
+                }
+                printMessage("Inversion detected: " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " (LENGTH " + std::to_string(sv_candidate.end - sv_candidate.start + 1) + ")");
+            }
+            
+            if (supp_type != SVType::NEUTRAL) {
+                int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+                std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
+                SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+                addSVCall(split_sv_calls, sv_call);
+            }
         }
         current_sv++;
         if (current_sv % 1000 == 0) {

From f39fd8a6a3ca3f8a0ed2f2c1811ac7c63d4ad492 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 26 Feb 2025 19:47:06 -0500
Subject: [PATCH 075/134] Fix large cnv detection errors

---
 src/dbscan.cpp    |   2 -
 src/sv_caller.cpp | 319 ++++++++++++++--------------------------------
 src/sv_object.cpp |   3 +-
 3 files changed, 98 insertions(+), 226 deletions(-)

diff --git a/src/dbscan.cpp b/src/dbscan.cpp
index d5310292..6fe97563 100644
--- a/src/dbscan.cpp
+++ b/src/dbscan.cpp
@@ -26,8 +26,6 @@ const std::vector<int>& DBSCAN::getClusters() const {
     return clusters;
 }
 
-// bool DBSCAN::expandCluster(const std::vector<std::pair<double, double>>&
-// points, size_t pointIdx, int clusterId) {
 bool DBSCAN::expandCluster(const std::vector<SVCall>& sv_calls, size_t pointIdx, int clusterId) {
     std::vector<size_t> seeds = regionQuery(sv_calls, pointIdx);
     if (static_cast<int>(seeds.size()) < minPts) {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 8449930e..fddc04d3 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -204,6 +204,11 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx,
         dbscan.fit(ends);
         std::vector<int> primary_end_cluster = dbscan.getLargestCluster(ends);
 
+        // Continue if no clusters were found
+        if (primary_start_cluster.empty() && primary_end_cluster.empty()) {
+            continue;
+        }
+
         // Get the supplementary alignment positions
         std::vector<int> supp_starts;
         std::vector<int> supp_ends;
@@ -225,26 +230,61 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx,
         dbscan.fit(supp_ends);
         std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
 
+        // Continue if no clusters were found
+        if (supp_start_cluster.empty() && supp_end_cluster.empty()) {
+            continue;
+        }
+
         // Use the median of the largest cluster of primary and supplementary
         // alignment start, end positions as the final genome coordinates of the
         // SV
         int primary_pos = -1;
+        int primary_pos2 = -1;
         if (primary_start_cluster.size() > primary_end_cluster.size()) {
             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
         } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
             primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
+        } else {
+            // Use both positions
+            std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+            std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+            primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+            primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
         }
 
         // Get the supplementary alignment positions
         int supp_pos = -1;
+        int supp_pos2 = -1;
         if (supp_start_cluster.size() > supp_end_cluster.size()) {
             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
         } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
             supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
+        } else {
+            // Use both positions. This has been shown to occur in nested SVs
+            std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+            std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+            supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+            supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
+        }
+
+        // If two of either were found, use the larger SV candidate
+        if (primary_pos2 != -1) {
+            int sv_length1 = std::abs(primary_pos - supp_pos);
+            int sv_length2 = std::abs(primary_pos2 - supp_pos);
+            if (sv_length2 > sv_length1) {
+                primary_pos = primary_pos2;
+            }
+        }
+        if (supp_pos2 != -1) {
+            int sv_length1 = std::abs(primary_pos - supp_pos);
+            int sv_length2 = std::abs(primary_pos - supp_pos2);
+            if (sv_length2 > sv_length1) {
+                supp_pos = supp_pos2;
+            }
         }
 
         if (primary_pos == -1 || supp_pos == -1) {
@@ -509,9 +549,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     printMessage(chr + ": CIGAR SVs...");
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
+    // Calculate the mean read depth across the SV calls (0 when there are none).
+    // NOTE: the variable keeps its original 'median' name; the value is a mean.
+    uint64_t cumulative_depth = 0;  // 64-bit so large call sets cannot wrap the sum
+    for (const auto& sv_call : chr_sv_calls) {
+        cumulative_depth += sv_call.read_depth;
+    }
+    double median_sv_depth = chr_sv_calls.empty() ? 0.0 : static_cast<double>(cumulative_depth) / static_cast<double>(chr_sv_calls.size());
+    printMessage(chr + ": Mean SV read depth: " + std::to_string(median_sv_depth));
+
     printMessage(chr + ": Merging CIGAR...");
-    // double cigar_epsilon = 0.45;
-    // int cigar_min_pts = 15;
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
 
     int region_sv_count = getSVCount(chr_sv_calls);
@@ -529,16 +576,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     std::vector<SVCall> split_sv_calls;
     this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
 
-    // Merge the split-read SVs separately
+    // // Merge the split-read SVs separately
     printMessage(chr + ": Merging split reads...");
     double split_epsilon = 0.45;
-    int split_min_pts = 2;
+    int split_min_pts = 2;  // This is low since split alignments were already previously merged
     mergeSVs(split_sv_calls, split_epsilon, split_min_pts);
 
     printMessage(chr + ": Unifying SVs...");
     chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 
-    mergeSVSubsets(chr_sv_calls);
+    // mergeSVSubsets(chr_sv_calls);
 
     // Sort the SV calls by start position
     std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
@@ -548,8 +595,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     printMessage("Completed chromosome " + chr);
 }
 
-
-
 void SVCaller::run(const InputData& input_data)
 {
     // Set up the reference genome
@@ -680,7 +725,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
                 } else if (supp_type == SVType::NEUTRAL) {
                     supp_type = SVType::INV;
                 }
-                printMessage("Inversion detected: " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " (LENGTH " + std::to_string(sv_candidate.end - sv_candidate.start + 1) + ")");
             }
             
             if (supp_type != SVType::NEUTRAL) {
@@ -695,176 +739,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
             printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
         }
     }
-
-    // // Find split-read SV evidence
-    // // printMessage(region + ": Finding split-read SVs...");
-    // // std::vector<SVCall> split_sv_calls;
-    // int current_primary = 0;
-    // int primary_count = primary_map.size();
-    // uint32_t min_cnv_length = input_data.getMinCNVLength();
-    // for (auto& entry : primary_map) {
-    //     current_primary++;
-    //     const std::string& qname = entry.first;
-    //     GenomicRegion& primary = entry.second;
-    //     const std::string& primary_chr = bamHdr->target_name[primary.tid];
-    //     int primary_cluster_size = primary.cluster_size;
-
-    //   	// Find the largest supplementary alignment
-    //     auto& supp_regions = supp_map[qname];
-    //     // GenomicRegion largest_supp = supp_regions[0];
-    //     auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) {
-    //         return a.end - a.start < b.end - b.start;
-    //     });
-    //     GenomicRegion largest_supp = *it;
-
-    //     // If on a different chromosome, label as a translocation
-    //     if (primary.tid != largest_supp.tid) {
-    //         // Note that these do not currently have a likelihood score or read depth
-    //         // Create two BND records for the translocation
-    //         // Create the alternate allele format for the first BND record
-    //         const std::string& supp_chr = bamHdr->target_name[largest_supp.tid];
-    //         std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "[";
-    //         if (largest_supp.strand == false) {
-    //             // Reverse-oriented relative to the reference
-    //             alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]";
-    //         }
-    //         // addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
-    //         SVCall sv_call1(primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size);
-    //         addSVCall(split_sv_calls, sv_call1);
-
-    //         // Create the alternate allele format for the second BND record
-    //         alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "[";
-    //         if (primary.strand == false) {
-    //             // Reverse-oriented relative to the reference
-    //             alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]";
-    //         }
-    //         // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end,
-    //         // SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0);
-    //         SVCall sv_call2(largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size);
-    //         addSVCall(split_sv_calls, sv_call2);
-
-    //         continue;
-    //     }
-
-    //     // Inversion detection
-    //     bool is_opposite_strand = primary.strand != largest_supp.strand;
-    //     if (is_opposite_strand) {
-    //         if (largest_supp.end - largest_supp.start >= min_cnv_length) {
-
-    //             // Print error if the start position is greater than the end
-    //             // position
-    //             if (largest_supp.start > largest_supp.end) {
-    //                 printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end));
-    //                 continue;
-    //             }
-
-    //             std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data);
-    //             if (std::get<1>(result) == SVType::UNKNOWN) {
-    //                 continue;
-    //             }
-
-    //             double supp_lh = std::get<0>(result);
-    //             SVType supp_type = std::get<1>(result);
-    //             int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end);
-    //             if (supp_type == SVType::NEUTRAL) {
-    //                 SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size);
-    //                 addSVCall(split_sv_calls, sv_call);
-    //                 // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-    //                 continue;
-                    
-    //             } else if (supp_type == SVType::DUP) {
-    //                 SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size);
-    //                 addSVCall(split_sv_calls, sv_call);
-    //                 // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-    //                 continue;
-    //             } else if (supp_type == SVType::DEL) {
-    //                 SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DEL, "<INV>", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size);
-    //                 addSVCall(split_sv_calls, sv_call);
-    //                 // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "<INV>", "SPLIT", "./.", supp_lh, read_depth);
-    //                 continue;
-    //             }
-    //         }
-    //     }
-
-    //     // Analyze split-read evidence for deletions and duplications
-    //     bool gap_exists = false;
-    //     uint32_t boundary_left, boundary_right, gap_left, gap_right;
-    //     boundary_left = std::min(primary.start, largest_supp.start);
-    //     boundary_right = std::max(primary.end, largest_supp.end);
-    //     gap_left = std::min(primary.end, largest_supp.start);
-    //     gap_right = std::max(primary.start, largest_supp.end);
-    //     gap_exists = gap_left < gap_right;
-        
-    //     // Run copy number variant predictions on the boundary if large enough
-    //     if (boundary_right - boundary_left >= min_cnv_length) {
-
-    //         // Print error if the start position is greater than the end
-    //         // position
-    //         if (boundary_left > boundary_right) {
-    //             printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right));
-    //             continue;
-    //         }
-            
-    //         std::tuple<double, SVType, std::string, bool> bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data);
-    //         if (std::get<1>(bd_result) == SVType::UNKNOWN) {
-    //             continue;
-    //         }
-    //         double bd_lh = std::get<0>(bd_result);
-    //         SVType bd_type = std::get<1>(bd_result);
-
-    //         // Run copy number variant predictions on the gap if it exists
-    //         if (gap_exists && gap_right - gap_left >= min_cnv_length) {
-
-    //             // Print error if the start position is greater than the end
-    //             // position
-    //             if (gap_left > gap_right) {
-    //                 printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right));
-    //                 continue;
-    //             }
-
-    //             // printMessage(region + ": Running copy number prediction for
-    //             // gap...");
-    //             // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left));
-    //             std::tuple<double, SVType, std::string, bool> gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data);
-    //             if (std::get<1>(gap_result) == SVType::UNKNOWN) {
-    //                 continue;
-    //             }
-    //             double gap_lh = std::get<0>(gap_result);
-    //             SVType gap_type = std::get<1>(gap_result);
-
-    //             // If higher likelihood than the boundary, add the gap as the SV call
-    //             if (gap_lh > bd_lh) {
-    //                 int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right);
-    //                 std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">";
-    //                 SVCall sv_call(gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, 1, primary_cluster_size);
-    //                 addSVCall(split_sv_calls, sv_call);
-    //                 // addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth);
-    //             } else {
-    //                 // Add the boundary as the SV call
-    //                 int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
-    //                 std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-    //                 SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size);
-    //                 addSVCall(split_sv_calls, sv_call);
-    //                 // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
-    //             }
-    //         } else {
-    //             // Add the boundary as the SV call
-    //             int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right);
-    //             std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">";
-    //             SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size);
-    //             addSVCall(split_sv_calls, sv_call);
-    //             // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth);
-    //         }
-    //     }
-
-    //     // Print progress every 1000 primary alignments
-    //     if (current_primary % 1000 == 0) {
-    //         printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments...");
-    //     }
-    // }
-
-    // Unify the SV calls
-    // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
@@ -897,6 +771,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         contig_header,
         "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
+        "##INFO=<ID=SVTYPE2,Number=1,Type=String,Description=\"Type of structural variant (if more than one)\">",
         "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
         "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
         "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Feature used to identify the structural variant\">",
@@ -938,7 +813,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE";
     vcf_stream << header_line << std::endl;
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
-    int skip_count = 0;
     int total_count = 0;
     for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
@@ -948,37 +822,39 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Get the SV candidate and SV info
             uint32_t start = sv_call.start;
             uint32_t end = sv_call.end;
-            std::string sv_type_str = getSVTypeString(sv_call.sv_type);
+            SVType sv_type = sv_call.sv_type;
             std::string genotype = sv_call.genotype;
             std::string data_type_str = sv_call.data_type;
             std::string alt_allele = sv_call.alt_allele;
             double hmm_likelihood = sv_call.hmm_likelihood;
             int sv_length = end - start + 1;
             int cluster_size = sv_call.cluster_size;
-            /*
-            if (sv_type_str == "DEL") {
-            	sv_length++;
-        	}
-        	*/
             int read_depth = sv_call.read_depth;
             std::string ref_allele = ".";
             int support = sv_call.support;
 
-            // If the SV type is unknown, skip it
-            if (sv_type_str == "UNKNOWN" || sv_type_str == "NEUTRAL") {
-                skip_count += 1;
-                continue;
+            // If the SV type is unknown or neutral, print a warning (the call is no longer skipped, only left uncounted)
+            if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
+                std::cerr << "Warning: Unknown SV type for SV at " << chr << ":" << start << "-" << end << std::endl;
             } else {
                 total_count += 1;
             }
 
+            // For complex SVs, split the SV into multiple types (SVTYPE +
+            // SVTYPE2)
+            SVType sv_type2 = SVType::UNKNOWN;
+            if (sv_type == SVType::INV_DEL) {
+                sv_type = SVType::DEL;
+                sv_type2 = SVType::INV;
+            } else if (sv_type == SVType::INV_DUP) {
+                sv_type = SVType::DUP;
+                sv_type2 = SVType::INV;
+            }
+
             // Deletion
-            if (sv_type_str == "DEL") {
+            if (sv_type == SVType::DEL) {
                 // Get the deleted sequence from the reference genome, also including the preceding base
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                // ref_allele = ref_genome.query(chr, preceding_pos, end);
-                // ref_allele = this->input_data.queryRefGenome(chr,
-                // preceding_pos, end);
                 ref_allele = ref_genome.query(chr, preceding_pos, end);
 
                 // Use the preceding base as the alternate allele 
@@ -990,44 +866,28 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 }
 
                 sv_length = -1 * sv_length;  // Negative length for deletions
-
                 start = preceding_pos;  // Update the position to the preceding base
 
             // Other types (duplications, insertions, inversions)
             } else {
-                // Use the preceding base as the reference allele
+                // Update the position to the preceding base
                 int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                // ref_allele = this->input_data.queryRefGenome(chr,
-                // preceding_pos, preceding_pos);
                 ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
-
-                // Update the start position to the preceding base
                 start = preceding_pos;
 
                 // Update the end position to the same base for duplications and insertions
-                if (sv_type_str == "DUP" || sv_type_str == "INS") {
+                if (sv_type == SVType::DUP || sv_type == SVType::INS) {
                     end = start;
                 }
 
-                if (sv_type_str == "INS") {
-                    // Check if in symbolic form
+                if (sv_type == SVType::INS) {
                     if (alt_allele != "<INS>") {
-                        // Use the insertion sequence as the alternate allele
+                        // Insert the reference allele before the insertion
                         alt_allele.insert(0, ref_allele);
                     }
-                    // start = preceding_pos;  // Update the position to the preceding base
-
-                    // // Update the end position to the start position to change from
-                    // // query to reference coordinates for insertions
-                    // end = start;
                 }
             }
 
-            // Print the REF allele if SVTYPE = DUP and if it is empty or "." (symbolic)
-            if (sv_type_str == "DUP" && (ref_allele == "" || ref_allele == ".")) {
-                printMessage("REF allele for DUP at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + ": " + ref_allele + ", ALT allele: " + alt_allele);
-            }
-
             // Fix ambiguous bases in the reference allele
             const std::string amb_bases = "RYKMSWBDHV";  // Ambiguous bases
             std::bitset<256> amb_bases_bitset;
@@ -1042,9 +902,18 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             }
 
             // Create the VCF parameter strings
+            std::string sv_type_str = getSVTypeString(sv_type);
+            std::string sv_type2_str = ".";
+            if (sv_type2 != SVType::UNKNOWN) {
+                sv_type2_str = getSVTypeString(sv_type2);
+            }
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-                ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
+                ";SVTYPE2=" + sv_type2_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
                 ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
+
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + 
+            //     ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + 
+            //     ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
                 
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
@@ -1059,7 +928,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         }
     }
     vcf_stream.close();
-
     std::cout << "Saved SV calls to " << output_vcf << std::endl;
 
     // Create a compressed and indexed VCF file
@@ -1068,9 +936,12 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::string tabix_cmd = "tabix -p vcf " + output_vcf + ".gz";
     std::system(bgzip_cmd.c_str());
     std::system(tabix_cmd.c_str());
+    output_vcf += ".gz";
+    std::cout << "VCF file created: " << output_vcf << std::endl;
+    std::cout << "Index file created: " << output_vcf + ".tbi" << std::endl;
 
     // Print the number of SV calls skipped
-    std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl;
+    std::cout << "Finished writing VCF file. Total records: " << total_count << std::endl;
 }
 
 int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
@@ -1084,13 +955,15 @@ int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uin
         // depth map." << std::endl;
         printError("Error: Start position " + std::to_string(start) + " not found in depth map.");
     }
-    try {
-        // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
-        read_depth += pos_depth_map.at(end);
-    } catch (const std::out_of_range& e) {
-        printError("Error: End position " + std::to_string(end) + " not found in depth map.");
-        // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl;
-    }
+
+    // UPDATE: Only use the start position for the read depth calculation
+    // try {
+    //     // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
+    //     read_depth += pos_depth_map.at(end);
+    // } catch (const std::out_of_range& e) {
+    //     printError("Error: End position " + std::to_string(end) + " not found in depth map.");
+    //     // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl;
+    // }
     // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
     return read_depth;
 }
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 32738adc..ef12b2f6 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -68,7 +68,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         SVType::INV,
         SVType::INS,
         SVType::BND,
-        SVType::INV_DUP
+        SVType::INV_DUP,
+        SVType::INV_DEL,
     })
     {
         // Create a vector of SV calls for the current SV type and size interval

From 2c0a8a09c2999db1b256dd5738ee9a5f557e0dfa Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 1 Mar 2025 00:56:35 -0500
Subject: [PATCH 076/134] use pct mean coverage for minpts

---
 include/input_data.h         | 10 ++++---
 python/plot_distributions.py | 54 ++++++++++++++++++++++++------------
 src/input_data.cpp           | 30 +++++++++-----------
 src/main.cpp                 | 11 +++++---
 src/sv_caller.cpp            |  7 +++++
 src/sv_object.cpp            | 17 ++++--------
 6 files changed, 75 insertions(+), 54 deletions(-)

diff --git a/include/input_data.h b/include/input_data.h
index d88426f3..106c70b6 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -61,10 +61,6 @@ class InputData {
         void setMinCNVLength(int min_cnv_length);
         uint32_t getMinCNVLength() const;
 
-        // Set the minimum number of reads supporting an SV for filtering steps.
-        void setMinReadSupport(int min_reads);
-        int getMinReadSupport() const;
-
         // Set the epsilon parameter for DBSCAN clustering.
         void setDBSCAN_Epsilon(double epsilon);
         double getDBSCAN_Epsilon() const;
@@ -73,6 +69,11 @@ class InputData {
         void setDBSCAN_MinPts(int min_pts);
         int getDBSCAN_MinPts() const;
 
+        // Set the fraction of mean chromosome coverage (e.g. 0.1 = 10%) used to
+        // derive the DBSCAN minimum points; 0 (the default) disables the override.
+        void setDBSCAN_MinPtsPct(double min_pts_pct);
+        double getDBSCAN_MinPtsPct() const;
+
         // Set the chromosome to analyze.
         void setChromosome(std::string chr);
         std::string getChromosome() const;
@@ -113,6 +114,7 @@ class InputData {
         int min_reads;
         double dbscan_epsilon;
         int dbscan_min_pts;
+        double dbscan_min_pts_pct;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
         bool region_set;  // True if a region is set
diff --git a/python/plot_distributions.py b/python/plot_distributions.py
index 8766a157..37eb1638 100644
--- a/python/plot_distributions.py
+++ b/python/plot_distributions.py
@@ -26,11 +26,28 @@
 
 def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
     # Read VCF file into a pandas DataFrame
-    vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \
-                         names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE'], \
-                            dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \
-                                   'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE': str})
-
+    try:
+        vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \
+                            names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE'], \
+                                dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \
+                                    'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE': str})
+    except Exception as e:
+        try:
+            print("[DEBUG] Caught TypeError")
+            # Truvari merged VCF format with different columns
+            vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \
+                                names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE', 'SAMPLE2'], \
+                                    dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \
+                                        'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE': str, 'SAMPLE2': str})
+        except Exception as e:
+            print("[DEBUG] Caught Exception")
+            # Platinum pedigree VCF format with different columns
+            vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \
+                                names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE', 'SAMPLE2', 'SAMPLE3', 'SAMPLE4', 'SAMPLE5', 'SAMPLE6', 'SAMPLE7'], \
+                                    dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \
+                                        'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE1': str, 'SAMPLE2': str, 'SAMPLE3': str, 'SAMPLE4': str, \
+                                            'SAMPLE5': str, 'SAMPLE6': str, 'SAMPLE7': str})
+            
     # Initialize dictionaries to store SV sizes for each type of SV
     sv_sizes = {}
 
@@ -61,6 +78,7 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
         # Continue if SV type is BND (no SV size)
         if sv_type == "BND":
             continue
+
         # If the SV caller is DELLY, then we use the second SV size for non-INS
         # (they don't have SVLEN) and the first SV size for INS
         sv_size = None
@@ -71,7 +89,9 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
 
         # If the plot title is GIAB, then we need to convert INS to DUP if
         # INFO/SVTYPE is INS and INFO/REPTYPE is DUP
-        if plot_title == "GIAB" and sv_type == "INS":
+        # if plot_title == "GIAB" and sv_type == "INS":
+        # Check if GIAB is a substring of the plot title
+        if "GIAB" in plot_title and sv_type == "INS":
             if 'REPTYPE=DUP' in record['INFO']:
                 sv_type = "DUP"
 
@@ -192,22 +212,22 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
     # Add the bin edges to the x-axis ticks as a range
     fig.update_xaxes(tickvals=x_values, ticktext=bin_labels)
 
-    # # Move the legend to the top right inside the plot
-    # fig.update_layout(legend=dict(
-    #     orientation='v',
-    #     yanchor='top',
-    #     y=0.75,
-    #     xanchor='right',
-    #     x=0.75,
-    # ))
-    # Move the legend to the bottom right outside the plot
+    # Move the legend to the top right inside the plot
     fig.update_layout(legend=dict(
         orientation='v',
         yanchor='top',
-        y=1.0,
+        y=0.75,
         xanchor='right',
-        x=1.15,
+        x=0.75,
     ))
+    # # Move the legend to the bottom right outside the plot
+    # fig.update_layout(legend=dict(
+    #     orientation='v',
+    #     yanchor='top',
+    #     y=1.0,
+    #     xanchor='right',
+    #     x=1.15,
+    # ))
 
     # Set a larger font size for all text in the plot
     fig.update_layout(font=dict(size=26))
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 649e8b1c..7a073dae 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -27,8 +27,9 @@ InputData::InputData()
     this->sample_size = 100;
     this->min_cnv_length = 1000;
     this->min_reads = 5;
-    this->dbscan_epsilon = 0.5;
-    this->dbscan_min_pts = 5;
+    this->dbscan_epsilon = 0.99;
+    this->dbscan_min_pts = 15;
+    this->dbscan_min_pts_pct = 0.0;
     this->thread_count = 1;
     this->hmm_filepath = "data/wgs.hmm";
     this->verbose = false;
@@ -157,21 +158,6 @@ void InputData::setMinCNVLength(int min_cnv_length)
     this->min_cnv_length = (uint32_t) min_cnv_length;
 }
 
-void InputData::setMinReadSupport(int min_reads)
-{
-    // Ensure that the minimum read support is an integer and greater than 0
-    if (min_reads < 1)
-    {
-        throw std::runtime_error("Minimum read support must be an integer greater than 0");
-    }
-    this->min_reads = min_reads;
-}
-
-int InputData::getMinReadSupport() const
-{
-    return this->min_reads;
-}
-
 void InputData::setDBSCAN_Epsilon(double epsilon)
 {
     this->dbscan_epsilon = epsilon;
@@ -192,6 +178,16 @@ int InputData::getDBSCAN_MinPts() const
     return this->dbscan_min_pts;
 }
 
+void InputData::setDBSCAN_MinPtsPct(double min_pts_pct)
+{
+    this->dbscan_min_pts_pct = min_pts_pct;
+}
+
+double InputData::getDBSCAN_MinPtsPct() const
+{
+    return this->dbscan_min_pts_pct;
+}
+
 void InputData::setChromosome(std::string chr)
 {
     this->chr = chr;
diff --git a/src/main.cpp b/src/main.cpp
index 5275f368..e493cd4e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -46,9 +46,6 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("min-cnv") != args.end()) {
         input_data.setMinCNVLength(std::stoi(args.at("min-cnv")));
     }
-    if (args.find("min-reads") != args.end()) {
-        input_data.setMinReadSupport(std::stoi(args.at("min-reads")));
-    }
     if (args.find("eth") != args.end()) {
         input_data.setEthnicity(args.at("eth"));
     }
@@ -71,6 +68,10 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
         input_data.setDBSCAN_MinPts(std::stoi(args.at("min-pts")));
     }
 
+    if (args.find("min-pts-pct") != args.end()) {
+        input_data.setDBSCAN_MinPtsPct(std::stod(args.at("min-pts-pct")));
+    }
+
     // Run ContextSV
     run(input_data);
 }
@@ -88,9 +89,9 @@ void printUsage(const std::string& programName) {
                 << "  -h, --hmm <hmm_file>          HMM file\n"
                 << "  -n, --sample-size <size>      Sample size for HMM predictions\n"
                 << "     --min-cnv <min_length>     Minimum CNV length\n"
-                << "     --min-reads <min_reads>    Minimum read support\n"
                 << "     --eps <epsilon>             DBSCAN epsilon\n"
                 << "     --min-pts <min_pts>         DBSCAN minimum points\n"
+                << "     --min-pts-pct <min_pts_pct> Percentage of mean chr. coverage to use for DBSCAN minimum points\n"
                 << "  -e, --eth <eth_file>          ETH file\n"
                 << "  -p, --pfb <pfb_file>          PFB file\n"
                 << "     --save-cnv                 Save CNV data\n"
@@ -131,6 +132,8 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
             args["epsilon"] = argv[++i];
         } else if (arg == "--min-pts" && i + 1 < argc) {
             args["min-pts"] = argv[++i];
+        } else if (arg == "--min-pts-pct" && i + 1 < argc) {
+            args["min-pts-pct"] = argv[++i];
         } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) {
             args["eth"] = argv[++i];
         } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index fddc04d3..4875bbc2 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -545,6 +545,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         return;
     }
 
+    // Estimate DBSCAN minimum points
+    double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct();
+    if (dbscan_min_pts_pct > 0.0) {
+        dbscan_min_pts = (int)std::ceil(mean_chr_cov * dbscan_min_pts_pct);
+        printMessage(chr + ": Mean chr. cov.: " + std::to_string(mean_chr_cov) + " (DBSCAN min. pts.= " + std::to_string(dbscan_min_pts) + ", min. pts. pct.= " + std::to_string(dbscan_min_pts_pct) + ")");
+    }
+
     // Detect SVs from the CIGAR strings
     printMessage(chr + ": CIGAR SVs...");
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index ef12b2f6..41d787fc 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -31,11 +31,6 @@ void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
     // Insert the SV call in sorted order
     auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
     sv_calls.insert(it, sv_call);
-
-    // Insert the SV call in sorted order
-    // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1};
-    // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call);
-    // sv_calls.insert(it, sv_call);
 }
 
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls)
@@ -55,12 +50,10 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     if (sv_calls.size() < 2) {
         return;
     }
-    int initial_size = sv_calls.size();
 
     // Cluster SVs using DBSCAN for each SV type
+    int initial_size = sv_calls.size();
     std::vector<SVCall> merged_sv_calls;
-
-    // Cluster SVs using DBSCAN for each SV type
     DBSCAN dbscan(epsilon, min_pts);
     for ( const auto& sv_type : {
         SVType::DEL,
@@ -94,10 +87,10 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         for (auto& cluster : cluster_map) {
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
-            // if (cluster_id < 0) {
-            //     continue;  // Skip noise and unclassified points
-            // } else {
-            if (true) {
+            if (cluster_id < 0) {
+                continue;  // Skip noise and unclassified points
+            } else {
+            // if (true) {
                 // Check if any SV has a non-zero likelihood
                 bool has_nonzero_likelihood = false;
                 if (cluster_sv_calls.size() > 0) {

From f282196c9c3a0010865bc27e31bfcfd3326ec1a8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 1 Mar 2025 16:19:29 -0500
Subject: [PATCH 077/134] remove unused code

---
 src/sv_caller.cpp | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 4875bbc2..cc72a247 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -556,15 +556,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     printMessage(chr + ": CIGAR SVs...");
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
-    // Calculate the median read depth across the SV calls
-    printMessage(chr + ": Calculating median SV read depth...");
-    uint32_t cumulative_depth = 0;
-    for (auto& sv_call : chr_sv_calls) {
-        cumulative_depth += sv_call.read_depth;
-    }
-    double median_sv_depth = (double)cumulative_depth / (double)chr_sv_calls.size();
-    printMessage("Median SV read depth: " + std::to_string(median_sv_depth));
-
     printMessage(chr + ": Merging CIGAR...");
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
 

From 972a27126760c960d99dd418a1c1f3ad721dae0c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 5 Mar 2025 15:03:47 -0500
Subject: [PATCH 078/134] fix memory leaks

---
 include/cnv_caller.h  |  13 +-
 include/fasta_query.h |   8 +-
 include/sv_caller.h   |  48 ++--
 include/utils.h       |   3 +
 src/cnv_caller.cpp    | 643 +++++++++++++++++++++++++++---------------
 src/fasta_query.cpp   |  57 +---
 src/sv_caller.cpp     | 276 +++++++++++++-----
 7 files changed, 659 insertions(+), 389 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 055e3247..c2961cc8 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -14,7 +14,8 @@
 #include <vector>
 #include <unordered_map>
 #include <set>
-#include <mutex>
+// #include <mutex>
+#include <shared_mutex>
 #include <future>
 
 /// @endcond
@@ -48,7 +49,8 @@ class CNVCaller {
         //mutable std::mutex snp_file_mtx;  // SNP file mutex
         //mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
         //mutable std::mutex bam_file_mtx;  // BAM file mutex
-        std::mutex& shared_mutex;
+        // std::mutex& shared_mutex;
+        std::shared_mutex& shared_mutex;
 
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
@@ -82,10 +84,7 @@ class CNVCaller {
         std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
 
     public:
-        // explicit CNVCaller(const InputData& input_data);
-        // Constructor with no arguments
-        //CNVCaller() = default;
-	    CNVCaller(std::mutex& mtx) : shared_mutex(mtx) {}
+	    CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {}
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
@@ -96,7 +95,7 @@ class CNVCaller {
 
         double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const;
 
-        void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
+        // void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
         void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const;
 
diff --git a/include/fasta_query.h b/include/fasta_query.h
index a0697446..4486bdb0 100644
--- a/include/fasta_query.h
+++ b/include/fasta_query.h
@@ -8,7 +8,8 @@
 #include <map>
 #include <unordered_map>
 #include <vector>
-#include <mutex>
+// #include <mutex>
+#include <shared_mutex>
 #include <string_view>
 /// @endcond
 
@@ -18,11 +19,10 @@ class ReferenceGenome {
         std::vector<std::string> chromosomes;
         std::unordered_map<std::string, std::string> chr_to_seq;
         std::map<std::string, uint32_t> chr_to_length;
-        //mutable std::mutex mtx;
-        std::mutex& shared_mutex;
+        std::shared_mutex& shared_mutex;
 
     public:
-	    ReferenceGenome(std::mutex& mtx) : shared_mutex(mtx) {}
+	    ReferenceGenome(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {}
     
         int setFilepath(std::string fasta_filepath);
         std::string getFilepath() const;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index e0b477e6..d795f44b 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -11,7 +11,8 @@
 #include <htslib/sam.h>
 
 /// @cond
-#include <mutex>
+// #include <mutex>
+#include <shared_mutex>
 #include <unordered_map>
 #include <future>
 /// @endcond
@@ -30,28 +31,41 @@ struct IntervalNode {
     GenomicRegion region;
     std::string qname;
     hts_pos_t max_end;  // To optimize queries
-    IntervalNode* left;
-    IntervalNode* right;
+    // IntervalNode* left;
+    // IntervalNode* right;
+    std::unique_ptr<IntervalNode> left;
+    std::unique_ptr<IntervalNode> right;
 
     IntervalNode(GenomicRegion r, std::string name)
         : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
 };
 
-IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string qname) {
-    if (!root)
-        return new IntervalNode(region, qname);
+// IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string
+// qname) {
+void insert(std::unique_ptr<IntervalNode>& root, GenomicRegion region, std::string qname) {
+    if (!root) {
+        // return new IntervalNode(region, qname);
+        root = std::make_unique<IntervalNode>(region, qname);
+        return;
+    }
 
     if (region.start < root->region.start)
-        root->left = insert(root->left, region, qname);
-    else
-        root->right = insert(root->right, region, qname);
+    {
+        // root->left = insert(root->left, region, qname);
+        insert(root->left, region, qname);
+    } else {
+        // root->right = insert(root->right, region, qname);
+        insert(root->right, region, qname);
+    }
 
     // Update max_end
     root->max_end = std::max(root->max_end, region.end);
-    return root;
+    // return root;
 }
 
-void findOverlaps(IntervalNode* root, GenomicRegion query, std::vector<std::string>& result) {
+// void findOverlaps(IntervalNode* root, GenomicRegion query,
+// std::vector<std::string>& result) {
+void findOverlaps(const std::unique_ptr<IntervalNode>& root, GenomicRegion query, std::vector<std::string>& result) {
     if (!root) return;
 
     // If overlapping, add to result
@@ -75,17 +89,17 @@ struct MismatchData {
 class SVCaller {
     private:
         int min_mapq = 20;          // Minimum mapping quality to be considered
-        std::mutex shared_mutex;
+        mutable std::shared_mutex shared_mutex;  // Shared mutex for thread safety
 
         std::vector<std::string> getChromosomes(const std::string& bam_filepath);
 
-        std::vector<SVCall> getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map<std::string, GenomicRegion>& primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>>& supp_map);
+        void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls);
 
         // Detect SVs from the CIGAR string of a read alignment, and return the
         // mismatch rate, and the start and end positions of the query sequence
         void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
 
-        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome);
+        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
@@ -96,14 +110,16 @@ class SVCaller {
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
         // Detect SVs from split alignments
-        void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data);
+        // void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data);
 
         // Calculate the mismatch rate given a map of query positions to
         // match/mismatch (1/0) values within a specified range of the query
         // sequence
         double calculateMismatchRate(const MismatchData& mismatch_data);
 
-        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const;
+        void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector<uint32_t> &pos_depth_map, const InputData &input_data);
+
+        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome) const;
 
         // Calculate the read depth (INFO/DP) for a region
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
diff --git a/include/utils.h b/include/utils.h
index 6715b00e..6ec95610 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -25,12 +25,15 @@ struct BamFileGuard {
     ~BamFileGuard() {
         if (idx) {
             hts_idx_destroy(idx);
+            idx = nullptr;
         }
         if (bamHdr) {
             bam_hdr_destroy(bamHdr);
+            bamHdr = nullptr;
         }
         if (fp_in) {
             sam_close(fp_in);
+            fp_in = nullptr;
         }
     }
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 16d62d4f..d1a711e9 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -24,6 +24,9 @@
 #include <string>
 #include <algorithm>  // std::max
 #include <utility>    // std::pair
+#include <unordered_set>
+#include <execution>  // std::execution::par
+// #include <omp.h>
 
 #include "utils.h"
 #include "sv_types.h"
@@ -52,28 +55,127 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
 {
     // Initialize the SNP data with default values and sample size length
     int sample_size = input_data.getSampleSize();
-    int region_length = (int) (end_pos - start_pos + 1);
-    if (region_length < sample_size)
-    {
-        sample_size = region_length;
-    }
-
-    std::vector<uint32_t> snp_pos(sample_size, 0);
-    std::vector<double> snp_baf(sample_size, -1.0);
-    std::vector<double> snp_pfb(sample_size, 0.5);
-    std::vector<double> snp_log2_cov(sample_size, 0.0);
-    std::vector<bool> is_snp(sample_size, false);
+    // int region_length = (int) (end_pos - start_pos + 1);
+    // if (region_length < sample_size)
+    // {
+    //     sample_size = region_length;
+    // }
+
+    // std::vector<uint32_t> snp_pos(sample_size, 0);
+    // std::vector<double> snp_baf(sample_size, -1.0);
+    // std::vector<double> snp_pfb(sample_size, 0.5);
+    // std::vector<double> snp_log2_cov(sample_size, 0.0);
+    // std::vector<bool> is_snp(sample_size, false);
+    std::vector<uint32_t> snp_pos;
+    std::vector<double> snp_baf;
+    std::vector<double> snp_pfb;
+    std::vector<bool> is_snp;
     this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
-    this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov);
+    // this->calculateSNPLog2Ratios(snp_pos, snp_log2_cov, pos_depth_map,
+    // mean_chr_cov, input_data);
+    sample_size = std::max((int) snp_pos.size(), sample_size);
+    // std::vector<uint32_t> snp_pos_hmm(sample_size, 0);
+    // std::vector<double> snp_baf_hmm(sample_size, -1.0);
+    // std::vector<double> snp_pfb_hmm(sample_size, 0.5);
+    // std::vector<double> snp_log2_hmm(sample_size, 0.0);
+    // std::vector<bool> is_snp_hmm(sample_size, false);
+    std::vector<uint32_t> snp_pos_hmm;
+    std::vector<double> snp_baf_hmm;
+    std::vector<double> snp_pfb_hmm;
+    std::vector<double> snp_log2_hmm;
+    std::vector<bool> is_snp_hmm;
+
+    // Loop through evenly spaced positions in the region and get the log2 ratio
+    double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size;
+    // Map each SNP position to its index in snp_pos/snp_baf/snp_pfb so that a
+    // window position can be matched to the BAF/PFB of the SNP located there
+    std::unordered_map<uint32_t, size_t> snp_index_by_pos;
+    snp_index_by_pos.reserve(snp_pos.size());
+    for (size_t k = 0; k < snp_pos.size(); k++)
+    {
+        snp_index_by_pos.emplace(snp_pos[k], k);  // Keeps the first index for a repeated position
+    }
+
+    for (int i = 0; i < sample_size; i++)
+    {
+        // Calculate the mean depth for the window
+        double cov_sum = 0.0;
+        int pos_count = 0;
+        for (int j = 0; j < pos_step; j++)
+        {
+            uint32_t pos = (uint32_t) (start_pos + i * pos_step + j);
+            if (pos > end_pos)
+            {
+                break;
+            }
+            try
+            {
+                cov_sum += pos_depth_map.at(pos);
+                pos_count++;
+            }
+            catch (const std::out_of_range& e)
+            {
+                // Ignore out of range errors
+            }
+        }
+        double log2_cov = 0.0;
+        // Leave the ratio at 0.0 when the window or chromosome has no coverage,
+        // since log2 of zero (or a division by zero) would yield -inf/NaN
+        if (pos_count > 0 && cov_sum > 0.0 && mean_chr_cov > 0.0)
+        {
+            log2_cov = log2((cov_sum / (double) pos_count) / mean_chr_cov);
+        }
+
+        // Collect every SNP that falls inside this window
+        bool snp_found_in_sample = false;
+        for (int j = 0; j < pos_step; j++)
+        {
+            uint32_t pos = (uint32_t) (start_pos + i * pos_step + j);
+            if (pos > end_pos)
+            {
+                break;
+            }
+
+            // Check if the position is a SNP
+            const auto snp_it = snp_index_by_pos.find(pos);
+            if (snp_it != snp_index_by_pos.end())
+            {
+                // Update the SNP data using the SNP's own index (not the window index)
+                const size_t snp_idx = snp_it->second;
+                snp_pos_hmm.push_back(pos);
+                snp_baf_hmm.push_back(snp_baf[snp_idx]);
+                snp_pfb_hmm.push_back(snp_pfb[snp_idx]);
+                snp_log2_hmm.push_back(log2_cov);
+                is_snp_hmm.push_back(true);
+                snp_found_in_sample = true;
+            }
+        }
+
+        // If no SNP was found in the sample, then use the middle of the window
+        // as a placeholder
+        // This is to ensure that the HMM has a value for every position in the
+        // sample
+        if (!snp_found_in_sample)
+        {
+            uint32_t pos = (uint32_t) (start_pos + (i * pos_step) + (pos_step / 2.0));
+            snp_pos_hmm.push_back(pos);
+            snp_baf_hmm.push_back(-1.0);
+            snp_pfb_hmm.push_back(0.5);
+            snp_log2_hmm.push_back(log2_cov);
+            is_snp_hmm.push_back(false);
+        }
+    }
+    // this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov);
 
     // Update the SNP data with all information
-    snp_data.pos = std::move(snp_pos);
-    snp_data.baf = std::move(snp_baf);
-    snp_data.pfb = std::move(snp_pfb);
-    snp_data.log2_cov = std::move(snp_log2_cov);
-    snp_data.is_snp = std::move(is_snp);
+    // Publish the per-window vectors built above so that all fields have the same length
+    snp_data.pos = std::move(snp_pos_hmm);
+    snp_data.baf = std::move(snp_baf_hmm);
+    snp_data.pfb = std::move(snp_pfb_hmm);
+    snp_data.log2_cov = std::move(snp_log2_hmm);
+    snp_data.is_snp = std::move(is_snp_hmm);
 }
 
@@ -249,8 +340,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         // Update the SV information if it does not conflict with the current SV type
         SVType updated_sv_type = getSVTypeFromCNState(max_state);
         bool is_valid_update = isValidCopyNumberUpdate(sv_call.sv_type, updated_sv_type);
-        // if (updated_sv_type != SVType::UNKNOWN && updated_sv_type !=
-        // SVType::NEUTRAL)
         if (is_valid_update)
         {
             std::string genotype = cnv_genotype_map.at(max_state);
@@ -261,28 +350,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
             sv_call.genotype = genotype;
             sv_call.data_type = data_type;
         }
-
-        // Update the SV genotype if known
-        // printMessage("Updating SV call for " + chr + ":" +
-        // std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) +
-        // "...");
-        // std::string genotype = cnv_genotype_map.at(max_state);
-        // std::string data_type = "CIGAR+HMM";
-        // if (updated_sv_type != SVType::UNKNOWN)
-        // {
-        //     sv_call.genotype = genotype;
-        //     sv_call.data_type = data_type;
-        //     sv_call.hmm_likelihood = likelihood;
-        // }
-
-        // Update the SV type if known and it does not conflict with the current
-        // SV type
-        // SVType updated_sv_type = getSVTypeFromCNState(max_state);
-        // if (updated_sv_type != SVType::UNKNOWN && updated_sv_type != SVType::NEUTRAL)
-        // {
-        //     std::string sv_type_str = getSVTypeString(updated_sv_type);
-        //     sv_call.sv_type = sv_type_str;
-        // }
     }
 }
 
@@ -316,7 +383,9 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
 {
     {
         // Open the BAM file
-        std::lock_guard<std::mutex> lock(this->shared_mutex);  // Lock the BAM file
+        // std::shared_lock<std::shared_mutex> lock(this->shared_mutex);  //
+        // Lock the BAM file
+        printMessage("Opening BAM file: " + bam_filepath);
         samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
         if (!bam_file)
         {
@@ -324,12 +393,8 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             return 0.0;
         }
 
-        // Enable multi-threading. This is possible here due to the lock
+        // Enable multi-threading while opening the BAM file
         hts_set_threads(bam_file, thread_count);
-        // if (single_chr)
-        // {
-        //     hts_set_threads(bam_file, thread_count);
-        // }
 
         // Read the header
         bam_hdr_t *bam_header = sam_hdr_read(bam_file);
@@ -368,14 +433,25 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             return 0.0;
         }
 
+        // Set threading back to 1 for reading the BAM file
+        // printMessage("Setting threads to 1 for reading BAM file...");
+        // hts_set_threads(bam_file, 1);
+        // printMessage("Threads set to 1 for reading BAM file.");
+
         // Iterate through the chromosome and update the depth map
+        printMessage("Iterating through BAM file reads...");
         while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
         {
             // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
-            if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP)
+            uint16_t flag = bam_record->core.flag;
+            if (flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP))
             {
                 continue;
             }
+            // if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP)
+            // {
+            //     continue;
+            // }
             
             // Parse the CIGAR string to get the depth (match, sequence match, and
             // mismatch)
@@ -392,11 +468,17 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
                     // Update the depth for each position in the alignment
                     for (uint32_t j = 0; j < op_len; j++)
                     {
-                        try {
-                            chr_pos_depth_map[ref_pos + j]++;
-                        } catch (const std::out_of_range& oor) {
-                            printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j));
+                        if (ref_pos + j >= chr_pos_depth_map.size())
+                        {
+                            printError("ERROR: Reference position out of range for " + chr + ":" + std::to_string(ref_pos+j));
+                            continue;
                         }
+                        chr_pos_depth_map[ref_pos + j]++;
+                        // try {
+                        //     chr_pos_depth_map[ref_pos + j]++;
+                        // } catch (const std::out_of_range& oor) {
+                        //     printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j));
+                        // }
                     }
                 }
                 
@@ -416,55 +498,111 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
         bam_destroy1(bam_record);
         hts_itr_destroy(bam_iter);
     }
-
-    // Calculate the mean chromosome coverage for positions with non-zero depth
-    uint64_t cum_depth = 0;
-    uint32_t pos_count = 0;
-    for (const auto& pos_depth : chr_pos_depth_map)
-    {
-        if (pos_depth > 0)
-        {
-            cum_depth += pos_depth;
-            pos_count++;
-        }
-    }
-
-    double mean_chr_cov = 0.0;
-    if (pos_count > 0)
-    {
-        mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
-    }
+    printMessage("Finished reading BAM file, calculating mean chromosome coverage...");
+
+    // // Calculate the mean chromosome coverage for positions with non-zero depth
+    // uint64_t cum_depth = 0;
+    // uint32_t pos_count = 0;
+    // for (const auto& pos_depth : chr_pos_depth_map)
+    // {
+    //     if (pos_depth > 0)
+    //     {
+    //         cum_depth += pos_depth;
+    //         pos_count++;
+    //     }
+    // }
+
+    // double mean_chr_cov = 0.0;
+    // if (pos_count > 0)
+    // {
+    //     mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
+    // }
+    // printMessage("Completed calculating mean chromosome coverage: " +
+    // std::to_string(mean_chr_cov));
+    
+    // Parallel sum of the depth map
+    uint64_t cum_depth = std::reduce(
+        std::execution::par,
+        chr_pos_depth_map.begin(),
+        chr_pos_depth_map.end(),
+        0ULL
+    );
+
+    // Parallel count of the non-zero depth positions
+    uint32_t pos_count = std::count_if(
+        std::execution::par,
+        chr_pos_depth_map.begin(),
+        chr_pos_depth_map.end(),
+        [](uint32_t depth) { return depth > 0; }
+    );
+
+    printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count));
+    printMessage("Total depth: " + std::to_string(cum_depth));
+
+    double mean_chr_cov = (pos_count > 0) ? static_cast<double>(cum_depth) / static_cast<double>(pos_count) : 0.0;
+    printMessage("Completed calculating mean chromosome coverage: " + std::to_string(mean_chr_cov));
 
     return mean_chr_cov;
 }
 
-void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& log2_region) const
-{
-    uint32_t region_length = end_pos - start_pos + 1;
-    for (int i = 0; i < sample_size; i++)
-    {
-        uint32_t pos = start_pos + ((double)region_length / sample_size) * i;
-        try {
-            uint32_t depth = pos_depth_map.at(pos);
-
-            // Calculate the log2 ratio for the position
-            if (depth == 0)
-            {
-                log2_region[i] = 0.0;
-            } else {
-                log2_region[i] = log2((double) depth / mean_chr_cov);
-            }
-
-        } catch (const std::out_of_range& e) {
-            log2_region[i] = 0.0;
-        }
-    }
-}
+// void CNVCaller::calculateSNPLog2Ratios(const std::vector<uint32_t>& snp_pos, const std::vector<double>& snp_log2_cov, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov) const
+// {
+//     // Calculate the log2 ratio for each SNP position
+//     for (size_t i = 0; i < snp_pos.size(); i++)
+//     {
+//         uint32_t pos = snp_pos[i];
+//         try {
+//             uint32_t depth = pos_depth_map.at(pos);
+
+//             // Calculate the log2 ratio for the position
+//             if (depth == 0)
+//             {
+//                 snp_log2_cov[i] = 0.0;
+//             } else {
+//                 snp_log2_cov[i] = log2((double) depth / mean_chr_cov);
+//             }
+
+//         } catch (const std::out_of_range& e) {
+//             snp_log2_cov[i] = 0.0;
+//         }
+//     }
+// }
+
+// void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& log2_region, std::vector<uint32_t>& snp_pos) const
+// {
+//     uint32_t region_length = end_pos - start_pos + 1;
+//     double step_size = (double) region_length / sample_size;
+//     std::set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
+
+//     // Loop through each interval in the region and calculate the log2 ratio
+//     for (int i = 0; i < sample_size; i++)
+//     {
+//         uint32_t pos = start_pos + (uint32_t) (i * step_size);
+//         if (pos > end_pos)
+//         {
+//             pos = end_pos;
+//         }
+//         try {
+//             uint32_t depth = pos_depth_map.at(pos);
+
+//             // Calculate the log2 ratio for the position
+//             if (depth == 0)
+//             {
+//                 log2_region[i] = 0.0;
+//             } else {
+//                 log2_region[i] = log2((double) depth / mean_chr_cov);
+//             }
+
+//         } catch (const std::out_of_range& e) {
+//             log2_region[i] = 0.0;
+//         }
+//     }
+// }
 
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
 {
     // Lock during reading
-    std::lock_guard<std::mutex> lock(this->shared_mutex);
+    std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
 
     // --------- SNP file ---------
     const std::string snp_filepath = input_data.getSNPFilepath();
@@ -475,7 +613,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
 
     // Initialize the SNP file reader
-    // printMessage("Initializing SNP reader...");
     bcf_srs_t *snp_reader = bcf_sr_init();
     if (!snp_reader)
     {
@@ -484,19 +621,23 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     }
     snp_reader->require_index = 1;
 
-    // Use multi-threading. This is possible here due to the lock
+    // Use all configured threads (the halving for multi-chromosome runs below is disabled)
     int thread_count = input_data.getThreadCount();
+    // if (!input_data.isSingleChr())
+    // {
+    //     // Use half of the threads for SNP reading
+    //     thread_count = std::max(1, input_data.getThreadCount() / 2);
+    // }
+    printMessage("Using " + std::to_string(thread_count) + " threads for SNP reading...");
     bcf_sr_set_threads(snp_reader, thread_count);
 
     // Add the SNP file to the reader
-    // printMessage("Adding SNP file to reader...");
     if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0)
     {
         bcf_sr_destroy(snp_reader);
         printError("ERROR: Could not add SNP file to reader: " + snp_filepath);
         return;
     }
-    // printMessage("SNP file added to reader.");
 
     // --------- Population allele frequency file ---------
 
@@ -507,7 +648,16 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     {
         use_pfb = false;
     }
-    
+
+    // Ensure the file exists (an std::ifstream that fails to open evaluates
+    // to false; it does not throw by default)
+    std::ifstream pfb_file(pfb_filepath);
+    if (!pfb_file)
+    {
+        use_pfb = false;
+    }
+    pfb_file.close();
+
     bcf_srs_t *pfb_reader = bcf_sr_init();
     std::string chr_gnomad;
     std::string AF_key;
@@ -552,7 +702,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         pfb_reader->require_index = 1;
 
         // Add the population allele frequency file to the reader
-        // printMessage("Adding population allele frequency file to reader...");
         if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0)
         {
             printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath);
@@ -563,188 +712,212 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             return;
         }
 
-        // Use multi-threading. This is possible here due to the lock
+        // Use all configured threads (the halving for multi-chromosome runs below is disabled)
+        int thread_count = input_data.getThreadCount();
+        // if (!input_data.isSingleChr())
+        // {
+        //     // Use half of the threads for population allele frequency reading
+        //     thread_count = std::max(1, input_data.getThreadCount() / 2);
+        // }
+        printMessage("Using " + std::to_string(thread_count) + " threads for population allele frequency reading...");
         bcf_sr_set_threads(pfb_reader, thread_count);
     }
 
     // Split the region into samples
-    int sample_size = snp_pos.size();
-    std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size);
+    // int sample_size = snp_pos.size();
+    // std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size);
 
     // Loop through the samples and read the SNP data, storing the first
     // SNP position and BAF value for each sample
-    // int print_count = 0;
     int current_region = 0;
-    for (size_t i = 0; i < region_chunks.size(); ++i)
-    {
-        current_region++;
-        // Lock during reading
-        // std::lock_guard<std::mutex> lock(this->shared_mutex);
+    // for (size_t i = 0; i < region_chunks.size(); ++i)
+    // {
+    //     current_region++;
+
+    // Read the SNP data ----------------------------------------------
 
-        // Read the SNP data ----------------------------------------------
+    // Set the region
+    printMessage("Setting region for SNP reader...");
+    // std::string region_str = region_chunks[i];
+    if (bcf_sr_set_regions(snp_reader, chr.c_str(), 0) < 0)
+    {
+        printError("ERROR: Could not set region for SNP reader: " + chr);
+        bcf_sr_destroy(snp_reader);
+        bcf_sr_destroy(pfb_reader);
+        return;
+    }
 
-        // Set the region
-        // printMessage("Setting region for SNP reader...");
-        std::string region_str = region_chunks[i];
-        if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)
+    printMessage("Region set for SNP reader, loading SNP data...");
+    bool snp_found = false;
+    while (bcf_sr_next_line(snp_reader) > 0)
+    {
+        if (!bcf_sr_has_line(snp_reader, 0))
         {
-            printError("ERROR: Could not set region for SNP reader: " + region_str);
-            break;
+            continue;
         }
-        // printMessage("Region set for SNP reader, loading SNP data...");
-
-        bool snp_found = false;
-        while (bcf_sr_next_line(snp_reader) > 0)
+        bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0);
+        if (snp_record)
         {
-            if (!bcf_sr_has_line(snp_reader, 0))
+            uint32_t pos = (uint32_t)snp_record->pos + 1;
+
+            // Skip if not a SNP
+            if (!bcf_is_snp(snp_record))
             {
                 continue;
             }
-            bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0);
-            if (snp_record)
-            {
-                uint32_t pos = (uint32_t)snp_record->pos + 1;
-
-                // Skip if not a SNP
-                if (!bcf_is_snp(snp_record))
-                {
-                    continue;
-                }
 
-                // Get the QUAL, DP, and AD values
-                if (bcf_float_is_missing(snp_record->qual) || snp_record->qual <= 30)
-                {
-                    continue;
-                }
+            // Get the QUAL, DP, and AD values
+            if (bcf_float_is_missing(snp_record->qual) || snp_record->qual <= 30)
+            {
+                continue;
+            }
 
-                // Extract DP from FORMAT field
-                int32_t *dp = 0;
-                int dp_count = 0;
-                int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count);
-                if (dp_ret < 0 || dp[0] <= 10)
-                {
-                    continue;
-                }
-                free(dp);
+            // Extract DP from FORMAT field
+            int32_t *dp = 0;
+            int dp_count = 0;
+            int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count);
+            if (dp_ret < 0 || dp[0] <= 10)
+            {
+                continue;
+            }
+            free(dp);
 
-                // Skip if the SNP does not pass the filter
-                if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast<char*>("PASS")) != 1)
-                {
-                    continue;
-                }
+            // Skip if the SNP does not pass the filter
+            if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast<char*>("PASS")) != 1)
+            {
+                continue;
+            }
 
-                // Extract AD from FORMAT field
-                int32_t *ad = 0;
-                int ad_count = 0;
-                int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count);
-                if (ad_ret < 0 || ad_count < 2)
-                {
-                    continue;
-                }
+            // Extract AD from FORMAT field
+            int32_t *ad = 0;
+            int ad_count = 0;
+            int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count);
+            if (ad_ret < 0 || ad_count < 2)
+            {
+                continue;
+            }
 
-                // Calculate the B-allele frequency (BAF)
-                double baf = (double) ad[1] / (double) (ad[0] + ad[1]);
-                free(ad);
+            // Calculate the B-allele frequency (BAF)
+            double baf = (double) ad[1] / (double) (ad[0] + ad[1]);
+            free(ad);
+
+            // Add the SNP position and BAF information
+            snp_pos.push_back(pos);
+            snp_baf.push_back(baf);
+            // is_snp.push_back(true);
+            snp_pfb.push_back(0.5);
+            // snp_pos[i] = pos;
+            // snp_baf[i] = baf;
+            // is_snp[i] = true;
+            snp_found = true;
+            // break;  // Only one SNP per region
+        }
+    }
 
-                // Add the SNP position and BAF information
-                snp_pos[i] = pos;
-                snp_baf[i] = baf;
-                is_snp[i] = true;
-                snp_found = true;
+    if (snp_reader->errnum)
+    {
+        printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum)));
+    }
 
-                break;  // Only one SNP per region
-            }
-        }
+    // Return early (after releasing both readers) if no SNP was found
+    if (!snp_found)
+    {
+        printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos));
+        bcf_sr_destroy(snp_reader);
+        bcf_sr_destroy(pfb_reader);
+        return;
+    }
 
-        if (snp_reader->errnum)
+    // Read the population allele frequency data ----------------------
+    // Get the minimum and maximum SNP positions
+    uint32_t min_snp_pos = *std::min_element(snp_pos.begin(), snp_pos.end());
+    uint32_t max_snp_pos = *std::max_element(snp_pos.begin(), snp_pos.end());
+    std::unordered_set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
+    std::unordered_map<uint32_t, double> snp_index_map;
+    for (size_t i = 0; i < snp_pos.size(); i++)
+    {
+        snp_index_map[snp_pos[i]] = i;
+    }
+    if (use_pfb)
+    {
+        // Set the region for the population allele frequency reader
+        std::string pfb_region_str = chr_gnomad + ":" + std::to_string(min_snp_pos) + "-" + std::to_string(max_snp_pos);
+        printMessage("Setting region for population allele frequency reader: " + pfb_region_str);
+        if (bcf_sr_set_regions(pfb_reader, pfb_region_str.c_str(), 0) < 0)
         {
-            printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum)));
+            printError("ERROR: Could not set region for population allele frequency reader: " + pfb_region_str);
         }
 
-        // Continue if no SNP was found in the region
-        if (!snp_found)
-        {
-            continue;
-        }
+        printMessage("Loading population allele frequency data...");
+        // for (size_t i = 0; i < snp_pos.size(); ++i)
+        // {
+        // Set the region as the SNP position
+        // printMessage("Setting region for population allele frequency reader...");
+        // uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
+        // std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
+        // if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
+        // {
+        //     printError("ERROR: Could not set region for population allele frequency reader: " + snp_region_str);
+        // }
+        // printMessage("Region set for population allele frequency reader, loading population allele frequency data...");
 
-        // Read the population allele frequency data ----------------------
-        if (use_pfb)
+        // Find the SNP position in the population allele frequency file
+        float *pfb_f = NULL;
+        int count = 0;
+        while (bcf_sr_next_line(pfb_reader) > 0)
         {
-            // Set the region as the SNP position
-            // printMessage("Setting region for population allele frequency reader...");
-            uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
-            std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
-            if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
+            // Get the SNP record and validate
+            bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
+            if (!pfb_record || !bcf_is_snp(pfb_record))
             {
-                printError("ERROR: Could not set region for population allele frequency reader: " + region_str);
-                break;
+                continue;  // Skip if not a SNP
             }
-            // printMessage("Region set for population allele frequency reader, loading population allele frequency data...");
 
-            // Find the SNP position in the population allele frequency file
-            float *pfb_f = NULL;
-            int count = 0;
-            while (bcf_sr_next_line(pfb_reader) > 0)
+            // Get the SNP position
+            uint32_t pfb_pos = (uint32_t) pfb_record->pos + 1;
+            if (snp_pos_set.find(pfb_pos) == snp_pos_set.end())
             {
-                // Get the SNP record and validate
-                bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
-                if (!pfb_record || !bcf_is_snp(pfb_record))
-                {
-                    continue;  // Skip if not a SNP
-                }
+                continue;  // Skip if the SNP position is not in the set
+            }
 
-                // if (!bcf_sr_has_line(pfb_reader, 0))
-                // {
-                //     continue;
-                // }
-                // bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0);
-                // if (pfb_record)
-                // {
-                //     // Skip if not a SNP
-                //     if (!bcf_is_snp(pfb_record))
-                //     {
-                //         continue;
-                //     }
-
-                // Get the population frequency for the SNP
-                // float *pfb_f = NULL;
-                // int count = 0;
-                int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
-                if (pfb_status < 0 || count == 0)
-                {
-                    continue;
-                }
-                // double pfb = (double) pfb_f[0];
-                double pfb = static_cast<double>(pfb_f[0]);
-                // free(pfb_f);
+            // Get the SNP position index
+            size_t i = snp_index_map[pfb_pos];
 
-                // Skip if outside the acceptable range
-                if (pfb <= MIN_PFB || pfb >= MAX_PFB)
-                {
-                    continue;
-                }
+            // Get the population frequency for the SNP
+            int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
+            if (pfb_status < 0 || count == 0)
+            {
+                continue;
+            }
+            // double pfb = (double) pfb_f[0];
+            double pfb = static_cast<double>(pfb_f[0]);
+            // free(pfb_f);
 
-                // Add the population frequency to the SNP data
-                snp_pfb[i] = pfb;
+            // Skip if outside the acceptable range
+            if (pfb <= MIN_PFB || pfb >= MAX_PFB)
+            {
+                continue;
+            }
 
-                // Break after finding the SNP position
-                break;
+            // Add the population frequency to the SNP data
+            snp_pfb[i] = pfb;
 
-                // if (print_count < 20) {
-                //     printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
-                //     print_count++;
-                // }
-            }
-            free(pfb_f);
+            break;  // Break after finding the SNP position
 
+            // if (print_count < 20) {
+            //     printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
+            //     print_count++;
             // }
-            if (pfb_reader->errnum)
-            {
-                printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
-            }
         }
-        // printMessage("SNP region " + std::to_string(current_region) + " of " + std::to_string(region_chunks.size()) + " completed.");
+        free(pfb_f);
+
+        // if (pfb_reader->errnum)
+        // {
+        //     printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
+        // }
+        // }
     }
+    // }
 
     // Clean up
     bcf_sr_destroy(snp_reader);
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 445643cc..84a2ae07 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -34,8 +34,6 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
     }
 
     // Get the chromosomes and sequences
-    // std::vector<std::string> chromosomes;
-    // std::unordered_map<std::string, std::string> chr_to_seq;
     std::string current_chr = "";
     std::string sequence = "";
     std::string line_str = "";
@@ -51,13 +49,10 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
                 this->chromosomes.push_back(current_chr);  // Add the chromosome to the list
                 this->chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
                 this->chr_to_length[current_chr] = sequence.length();  // Add the sequence length to the map
-                // chromosomes.push_back(current_chr);  // Add the chromosome to the list
-                // chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
                 sequence = "";  // Reset the sequence
             }
 
-            // Get the new chromosome
-            current_chr = line_str.substr(1);
+            current_chr = line_str.substr(1);  // Remove the '>' character
 
             // Remove the description
             size_t space_pos = current_chr.find(" ");
@@ -65,15 +60,7 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
             {
                 current_chr.erase(space_pos);
             }
-
-            // Check if the chromosome is already in the map
-            // if (chr_to_seq.find(current_chr) != chr_to_seq.end())
-            // {
-            //     std::cerr << "Duplicate chromosome " << current_chr << std::endl;
-            //     exit(1);
-            // }
         } else {
-            // Sequence line
             sequence += line_str;
         }
     }
@@ -84,21 +71,11 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath)
         this->chromosomes.push_back(current_chr);  // Add the chromosome to the list
         this->chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
         this->chr_to_length[current_chr] = sequence.length();  // Add the sequence length to the map
-        // chromosomes.push_back(current_chr);  // Add the chromosome to the list
-        // chr_to_seq[current_chr] = sequence;  // Add the sequence to the map
     }
 
-    // Close the file
     fasta_file.close();
-
-    // Sort the chromosomes
-    // std::sort(chromosomes.begin(), chromosomes.end());
     std::sort(this->chromosomes.begin(), this->chromosomes.end());
 
-    // Set the chromosomes and sequences
-    // this->chromosomes = chromosomes;
-    // this->chr_to_seq = chr_to_seq;
-
     return 0;
 }
 
@@ -109,55 +86,36 @@ std::string ReferenceGenome::getFilepath() const
 
 // Function to get the reference sequence at a given position range
 std::string_view ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const
-{
-    // printMessage("Querying reference genome");
-    // std::lock_guard<std::mutex> lock(this->shared_mutex);
-    
+{   
     // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
     pos_start--;
     pos_end--;
 
     // Ensure that the end position is not larger than the chromosome length
-    // if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length())
     const std::string& sequence = this->chr_to_seq.at(chr);
     if (pos_end >= sequence.length() || pos_start > pos_end)
     {
         return {};
     }
 
-    // uint32_t length = pos_end - pos_start + 1;
-
-    // If the subsequence is empty, return empty string
-    // if (sequence.substr(pos_start, length).empty())
-    // {
-    //     return "";
-    // }
-
-    // return sequence.substr(pos_start, length);
     return std::string_view(sequence).substr(pos_start, (pos_end - pos_start) + 1);
 }
 
 // Function to compare the reference sequence at a given position range
 bool ReferenceGenome::compare(const std::string& chr, uint32_t pos_start, uint32_t pos_end, const std::string& compare_seq, float match_threshold) const
-{
-    // std::lock_guard<std::mutex> lock(this->shared_mutex);
-    
+{    
     // Convert positions from 1-indexed (reference) to 0-indexed (string indexing)
     pos_start--;
     pos_end--;
 
     // Ensure that the end position is not larger than the chromosome length
-    // if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length())
     const std::string& sequence = this->chr_to_seq.at(chr);
     if (pos_end >= sequence.length() || pos_start >= pos_end)
     {
         return {};
     }
 
-    // Get the subsequence
     std::string_view subseq = std::string_view(sequence).substr(pos_start, pos_end - pos_start + 1);
-
-    // Ensure the lengths are equal
     if (subseq.length() != compare_seq.length())
     {
         printError("ERROR: Sequence lengths do not match for comparison");
@@ -175,14 +133,13 @@ bool ReferenceGenome::compare(const std::string& chr, uint32_t pos_start, uint32
     }
     float match_rate = (float)num_matches / (float)subseq.length();
 
-    // Check if the match rate is above the threshold
     return match_rate >= match_threshold;
 }
 
 // Function to get the chromosome contig lengths in VCF header format
 std::string ReferenceGenome::getContigHeader() const
 {
-    std::lock_guard<std::mutex> lock(this->shared_mutex);
+    std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
     std::string contig_header = "";
 
     // Sort the chromosomes
@@ -192,13 +149,10 @@ std::string ReferenceGenome::getContigHeader() const
         chromosomes.push_back(chr_seq.first);
     }
     std::sort(chromosomes.begin(), chromosomes.end());
-
-    // Iterate over the chromosomes and add them to the contig header
     for (auto const& chr : chromosomes)
     {
         // Add the contig header line
         contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq.at(chr).length()) + ">\n";
-        // contig_header += "##contig=<ID=" + chr + ",length=" + std::to_string(this->chr_to_seq[chr].length()) + ">\n";
     }
 
     // Remove the last newline character
@@ -209,13 +163,10 @@ std::string ReferenceGenome::getContigHeader() const
 
 std::vector<std::string> ReferenceGenome::getChromosomes() const
 {
-    // std::lock_guard<std::mutex> lock(this->shared_mutex);
     return this->chromosomes;
 }
 
 uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
 {
-    // std::lock_guard<std::mutex> lock(this->shared_mutex);
-    // return this->chr_to_seq.at(chr).length();
     return this->chr_to_length.at(chr);
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index cc72a247..d2e5fa31 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -37,7 +37,8 @@
 
 int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 {
-    std::lock_guard<std::mutex> lock(this->shared_mutex);
+    // std::lock_guard<std::mutex> lock(this->shared_mutex);
+    std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
     int ret = sam_itr_next(fp_in, itr, bam1);
     return ret;
 }
@@ -66,19 +67,22 @@ std::vector<std::string> SVCaller::getChromosomes(const std::string &bam_filepat
     return chromosomes;
 }
 
-std::vector<SVCall> SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string &region, std::unordered_map<std::string, GenomicRegion> &primary_map, std::unordered_map<std::string, std::vector<GenomicRegion>> &supp_map)
+void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string &region, std::vector<SVCall>& sv_calls)
 {
+    std::unordered_map<std::string, GenomicRegion> primary_map;
+    std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
         printError("ERROR: failed to initialize BAM record");
-        return {};
+        return;
     }
     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
     if (!itr) {
         bam_destroy1(bam1);
         printError("ERROR: failed to query region " + region);
-        return {};
+        return;
     }
 
     uint32_t primary_count = 0;
@@ -133,11 +137,13 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx,
     // start, end vs. supplementary alignment start, end positions, keeping the
     // median of the largest cluster for the primary and supplementary positions
     // as the final genome coordinates of the SV
-    IntervalNode* root = nullptr;
+    // IntervalNode* root = nullptr;
+    std::unique_ptr<IntervalNode> root = nullptr;
     for (const auto& entry : primary_map) {
         const std::string& qname = entry.first;
         const GenomicRegion& region = entry.second;
-        root = insert(root, region, qname);
+        // root = insert(root, region, qname);
+        insert(root, region, qname);
     }
     std::vector<std::vector<std::string>> primary_clusters;
     std::set<std::string> processed;
@@ -302,7 +308,7 @@ std::vector<SVCall> SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx,
         }
     }
 
-    return sv_candidates;
+    // return sv_candidates;
 }
 
 
@@ -416,8 +422,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
-                        // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>",
-                        // "LSEQSIM", "./.", default_lh, read_depth);
                         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, 1, 0);
                         addSVCall(sv_calls, sv_call);
                         continue;
@@ -435,7 +439,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                         int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
                         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, 1, 0);
                         addSVCall(sv_calls, sv_call);
-                        // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth);
                         continue;
                     }
                 }
@@ -454,7 +457,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 }
                 SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0);
                 addSVCall(sv_calls, sv_call);                
-                // addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -462,15 +464,8 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
-                // addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "<DEL>",
-                // "CIGARDEL", "./.", default_lh, read_depth);
                 SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
                 addSVCall(sv_calls, sv_call);
-
-                // Print if the ref pos is within the range 44007800-44007930
-                if (ref_pos >= 44007800 && ref_pos <= 44007930) {
-                    printMessage("DEL: " + chr + ":" + std::to_string(ref_pos) + "-" + std::to_string(ref_end) + " (LENGTH " + std::to_string(op_len) + ")");
-                }
             }
         }
 
@@ -487,7 +482,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     }
 }
 
-void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome)
+void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls)
 {
     double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
     int dbscan_min_pts = input_data.getDBSCAN_MinPts();
@@ -500,9 +495,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         return;
     }
 
-    // Set multi-threading
-    int num_threads = input_data.getThreadCount();
-    hts_set_threads(fp_in, num_threads);
+    // Use multi-threading for the BAM file
+    // int thread_count = input_data.getThreadCount();
+    // // if (!input_data.isSingleChr()) {
+    // //     // Use half the threads for chromosomes, the other half for file I/O
+    // //     thread_count = std::max(1, thread_count / 2);
+    // // }
+    // printMessage("Using " + std::to_string(thread_count) + " threads for BAM file I/O");
+    // // int num_threads = input_data.getThreadCount();
+    // hts_set_threads(fp_in, thread_count);
+    hts_set_threads(fp_in, 1);  // Disable multi-threading for now
 
     // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
@@ -524,7 +526,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Set the region to process
     std::string region = chr;
-    uint32_t chr_len = ref_genome.getChromosomeLength(chr);
+    // uint32_t chr_len = ref_genome.getChromosomeLength(chr);
     // uint32_t chr_len = bamHdr->target_len[bam_name2id(bamHdr, chr.c_str())];
     if (input_data.isRegionSet()) {
 
@@ -536,14 +538,15 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     }
 
     // Load chromosome data for copy number predictions
-    printMessage(chr + ": Loading chromosome data...");
+    // printMessage(chr + ": Loading chromosome data...");
     CNVCaller cnv_caller(this->shared_mutex);
-    std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
-    int thread_count = input_data.getThreadCount();
-    double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count);
-    if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
-        return;
-    }
+    // std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
+
+    // // Use only half the threads for chromosomes, the other half for file I/O
+    // double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count);
+    // if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
+    //     return;
+    // }
 
     // Estimate DBSCAN minimum points
     double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct();
@@ -564,26 +567,30 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    if (region_sv_count > 0) {
-        printMessage(chr + ": CIGAR predictions...");
-        cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
-    }
+    // if (region_sv_count > 0) {
+    //     printMessage(chr + ": CIGAR predictions...");
+    //     cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+    // }
 
+    // [TEST] The code before this section has no memory leaks
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
-    std::vector<SVCall> split_sv_calls;
-    this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
-
-    // // Merge the split-read SVs separately
-    printMessage(chr + ": Merging split reads...");
-    double split_epsilon = 0.45;
-    int split_min_pts = 2;  // This is low since split alignments were already previously merged
-    mergeSVs(split_sv_calls, split_epsilon, split_min_pts);
-
-    printMessage(chr + ": Unifying SVs...");
-    chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
-
-    // mergeSVSubsets(chr_sv_calls);
+    // std::unordered_map<std::string, GenomicRegion> primary_map;
+    // std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+    // std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx,
+    // bamHdr, region, primary_map, supp_map);
+    this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls);
+    // std::vector<SVCall> split_sv_calls;
+    // this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
+
+    // // // Merge the split-read SVs separately
+    // printMessage(chr + ": Merging split reads...");
+    // double split_epsilon = 0.45;
+    // int split_min_pts = 2;  // This is low since split alignments were already previously merged
+    // mergeSVs(split_sv_calls, split_epsilon, split_min_pts);
+
+    // printMessage(chr + ": Unifying SVs...");
+    // chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 
     // Sort the SV calls by start position
     std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
@@ -598,7 +605,9 @@ void SVCaller::run(const InputData& input_data)
     // Set up the reference genome
     printMessage("Loading the reference genome...");
     const std::string ref_filepath = input_data.getRefGenome();
-    ReferenceGenome ref_genome(this->shared_mutex);
+    std::shared_mutex ref_mutex;  // Dummy mutex (remove later)
+    // ReferenceGenome ref_genome(this->shared_mutex);
+    ReferenceGenome ref_genome(ref_mutex);
     ref_genome.setFilepath(ref_filepath);
 
     // Get the chromosomes
@@ -611,33 +620,75 @@ void SVCaller::run(const InputData& input_data)
         // Get the chromosomes from the input BAM file
         chromosomes = this->getChromosomes(input_data.getLongReadBam());
     }
+
+    // [TEST] Keep only the first 4 chromosomes
+    chromosomes.resize(4);
+    // Remove the first chromosome
+    chromosomes.erase(chromosomes.begin());
+    printMessage("Chromosomes: " + std::to_string(chromosomes.size()));
+    for (const auto& chr : chromosomes) {
+        printMessage("  " + chr);
+    }
     
     // Read the HMM from the file
     std::string hmm_filepath = input_data.getHMMFilepath();
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
 
-    // Use multi-threading across chromosomes unless a single chromosome is
-    // specified
-    int max_threads = 1;
-    if (!input_data.isSingleChr()) {
-        max_threads = input_data.getThreadCount();
-        std::cout << "Using " << max_threads << " threads for processing..." << std::endl;
+    // Calculate the mean chromosome coverage and generate the position depth
+    // maps for each chromosome (I/O is multi-threaded, which is more efficient
+    // than per-chromosome multi-threading in this case)
+    std::shared_mutex shared_mutex;
+    CNVCaller cnv_caller(shared_mutex);
+    std::unordered_map<std::string, std::vector<uint32_t>> chr_pos_depth_map;
+    std::unordered_map<std::string, double> chr_mean_cov_map;
+    const std::string bam_filepath = input_data.getLongReadBam();
+    int chr_thread_count = input_data.getThreadCount();
+    std::cout << "Reading chromosome coverage..." << std::endl;
+    std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl;
+    int current_chr = 0;
+    int total_chr_count = chromosomes.size();
+    for (const auto& chr : chromosomes) {
+        current_chr++;
+        uint32_t chr_len = ref_genome.getChromosomeLength(chr);
+        if (chr_len == 0) {
+            printError("ERROR: chromosome " + chr + " not found in reference genome");
+            return;
+        }
+        printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "...");
+        std::vector<uint32_t> pos_depth_map(chr_len+1, 0);  // 1-based index
+        double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count);
+        if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) {
+            printError("ERROR: failed to calculate mean chromosome coverage for " + chr);
+            return;
+        }
+        chr_pos_depth_map[chr] = std::move(pos_depth_map);
+        chr_mean_cov_map[chr] = mean_chr_cov;
+        printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov));
     }
-    ThreadPool pool(max_threads);
+    printMessage("Completed reading chromosome coverage.");
 
-    // Shared resources
+    // Use multi-threading across chromosomes. If a single chromosome is
+    // specified, use a single main thread (multi-threading is used for file I/O)
+    int thread_count = 1;
+    if (!input_data.isSingleChr()) {
+        thread_count = input_data.getThreadCount();
+        std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
+    }
+    ThreadPool pool(thread_count);
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
-
-    // Lambda to process a chromosome
+    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
+            std::vector<SVCall> split_sv_calls;
             InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome);
+            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
             {
-                std::lock_guard<std::mutex> lock(this->shared_mutex);
+                // std::lock_guard<std::mutex> lock(this->shared_mutex);
+                std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
+                whole_genome_split_sv_calls[chr] = std::move(split_sv_calls);
             }
             // printMessage("Completed chromosome " + chr);
         } catch (const std::exception& e) {
@@ -657,8 +708,7 @@ void SVCaller::run(const InputData& input_data)
     }
 
     // Wait for all tasks to complete
-    int total_chr_count = futures.size();
-    int current_chr = 0;
+    current_chr = 0;
     for (auto& future : futures) {
         try {
             current_chr++;
@@ -672,6 +722,42 @@ void SVCaller::run(const InputData& input_data)
     }
     printMessage("All tasks have finished.");
 
+    // Run copy number variant predictions on the SVs detected from the
+    // CIGAR string, using a minimum CNV length threshold
+    printMessage("Running copy number predictions on CIGAR SVs...");
+    for (auto& entry : whole_genome_sv_calls) {
+        const std::string& chr = entry.first;
+        std::vector<SVCall>& sv_calls = entry.second;
+        if (sv_calls.size() > 0) {
+            printMessage("Running copy number predictions on " + chr + "...");
+            cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+        }
+    }
+
+    printMessage("Running copy number predictions on split-read SVs...");
+    for (auto& entry : whole_genome_split_sv_calls) {
+        const std::string& chr = entry.first;
+        std::vector<SVCall>& sv_calls = entry.second;
+        if (sv_calls.size() > 0) {
+            printMessage("Running copy number predictions on " + chr + "...");
+            this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+
+            // Merge the split-read SVs separately
+            printMessage(chr + ": Merging split reads...");
+            double split_epsilon = 0.45;
+            int split_min_pts = 2;  // This is low since split alignments were already previously merged
+            mergeSVs(sv_calls, split_epsilon, split_min_pts);
+        }
+    }
+
+    printMessage("Unifying SVs...");
+    for (auto& entry : whole_genome_split_sv_calls) {
+        const std::string& chr = entry.first;
+        std::vector<SVCall>& sv_calls = entry.second;
+        whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
+    }
+    // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
+
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
     for (const auto& entry : whole_genome_sv_calls) {
@@ -690,21 +776,64 @@ void SVCaller::run(const InputData& input_data)
 
 
 // Detect SVs from split read alignments
-void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
-{
-    // printMessage(region + ": Getting split alignments...");
-    std::unordered_map<std::string, GenomicRegion> primary_map;
-    std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
-    std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
+// void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
+// {
+//     // printMessage(region + ": Getting split alignments...");
+//     // std::unordered_map<std::string, GenomicRegion> primary_map;
+//     // std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+//     std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
+
+//     // Run copy number predictions on the SVs detected from the split reads
+//     printMessage(region + ": Split read predictions...");
+//     int current_sv = 0;
+//     int total_svs = sv_candidates.size();
+//     for (auto& sv_candidate : sv_candidates) {
+//         bool is_inversion = sv_candidate.sv_type == SVType::INV;
+
+//         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
+//         if (std::get<1>(result) == SVType::UNKNOWN) {
+//             continue;
+//         }
+
+//         double supp_lh = std::get<0>(result);
+//         SVType supp_type = std::get<1>(result);
+//         std::string genotype = std::get<2>(result);
+//         if (supp_type != SVType::UNKNOWN) {
+//             if (is_inversion) {
+//                 if (supp_type == SVType::DEL) {
+//                     supp_type = SVType::INV_DEL;
+//                 } else if (supp_type == SVType::DUP) {
+//                     supp_type = SVType::INV_DUP;
+//                 } else if (supp_type == SVType::NEUTRAL) {
+//                     supp_type = SVType::INV;
+//                 }
+//             }
+            
+//             if (supp_type != SVType::NEUTRAL) {
+//                 int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+//                 std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
+//                 SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+//                 addSVCall(split_sv_calls, sv_call);
+//             }
+//         }
+//         current_sv++;
+//         if (current_sv % 1000 == 0) {
+//             printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
+//         }
+//     }
+// }
 
+// Run copy number predictions on the SVs detected from split read alignments
+void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
+{
     // Run copy number predictions on the SVs detected from the split reads
-    printMessage(region + ": Split read predictions...");
+    printMessage("Split read predictions...");
     int current_sv = 0;
-    int total_svs = sv_candidates.size();
-    for (auto& sv_candidate : sv_candidates) {
+    int total_svs = split_sv_calls.size();
+    for (auto& sv_candidate : split_sv_calls) {
         bool is_inversion = sv_candidate.sv_type == SVType::INV;
 
-        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
+        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         if (std::get<1>(result) == SVType::UNKNOWN) {
             continue;
         }
@@ -712,8 +841,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         std::string genotype = std::get<2>(result);
-
-        // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
         if (supp_type != SVType::UNKNOWN) {
             if (is_inversion) {
                 if (supp_type == SVType::DEL) {
@@ -764,6 +891,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     // Set the header lines
     std::cout << "Getting reference genome header..." << std::endl;
     const std::string contig_header = ref_genome.getContigHeader();
+    std::cout << "Formatting VCF header..." << std::endl;
     std::vector<std::string> header_lines = {
         std::string("##reference=") + ref_genome.getFilepath(),
         contig_header,

From f0f84275d2fd2246d16b8f5a9a8b9451e2d049eb Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 6 Mar 2025 09:05:44 -0500
Subject: [PATCH 079/134] Fix inversion detection

---
 include/cnv_caller.h |   3 +-
 src/cnv_caller.cpp   | 207 ++++++++++++++++++++++---------------------
 src/sv_caller.cpp    | 119 ++++++++++++++++++-------
 3 files changed, 196 insertions(+), 133 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index c2961cc8..6b10bc29 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -93,7 +93,8 @@ class CNVCaller {
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
-        double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const;
+        // double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const;
+        void calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const;
 
         // void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index d1a711e9..45f59ec4 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -77,6 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     // this->calculateSNPLog2Ratios(snp_pos, snp_log2_cov, pos_depth_map,
     // mean_chr_cov, input_data);
     sample_size = std::max((int) snp_pos.size(), sample_size);
+    //printMessage("Sample size: " + std::to_string(sample_size));
     // std::vector<uint32_t> snp_pos_hmm(sample_size, 0);
     // std::vector<double> snp_baf_hmm(sample_size, -1.0);
     // std::vector<double> snp_pfb_hmm(sample_size, 0.5);
@@ -161,11 +162,11 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     // this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov);
 
     // Update the SNP data with all information
-    snp_data.pos = std::move(snp_pos);
-    snp_data.baf = std::move(snp_baf);
-    snp_data.pfb = std::move(snp_pfb);
+    snp_data.pos = std::move(snp_pos_hmm);
+    snp_data.baf = std::move(snp_baf_hmm);
+    snp_data.pfb = std::move(snp_pfb_hmm);
     snp_data.log2_cov = std::move(snp_log2_hmm);
-    snp_data.is_snp = std::move(is_snp);
+    snp_data.is_snp = std::move(is_snp_hmm);
 }
 
 std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
@@ -379,67 +380,70 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 }
 
 // Calculate the mean chromosome coverage
-double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const
+// double CNVCaller::calculateMeanChromosomeCoverage(std::string chr,
+// std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath,
+// int thread_count) const
+void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const
 {
+    // Open the BAM file
+    // std::shared_lock<std::shared_mutex> lock(this->shared_mutex);  // Lock the BAM file
+    printMessage("Opening BAM file: " + bam_filepath);
+    samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
+    if (!bam_file)
     {
-        // Open the BAM file
-        // std::shared_lock<std::shared_mutex> lock(this->shared_mutex);  //
-        // Lock the BAM file
-        printMessage("Opening BAM file: " + bam_filepath);
-        samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
-        if (!bam_file)
-        {
-            printError("ERROR: Could not open BAM file: " + bam_filepath);
-            return 0.0;
-        }
+        printError("ERROR: Could not open BAM file: " + bam_filepath);
+        return;
+    }
 
-        // Enable multi-threading while opening the BAM file
-        hts_set_threads(bam_file, thread_count);
+    // Enable multi-threading while opening the BAM file
+    hts_set_threads(bam_file, thread_count);
 
-        // Read the header
-        bam_hdr_t *bam_header = sam_hdr_read(bam_file);
-        if (!bam_header)
-        {
-            sam_close(bam_file);
-            printError("ERROR: Could not read header from BAM file: " + bam_filepath);
-            return 0.0;
-        }
+    // Read the header
+    bam_hdr_t *bam_header = sam_hdr_read(bam_file);
+    if (!bam_header)
+    {
+        sam_close(bam_file);
+        printError("ERROR: Could not read header from BAM file: " + bam_filepath);
+        return;
+    }
 
-        // Load the index
-        hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str());
-        if (!bam_index)
-        {
-            bam_hdr_destroy(bam_header);
-            sam_close(bam_file);
-            printError("ERROR: Could not load index for BAM file: " + bam_filepath);
-            return 0.0;
-        }
-        BamFileGuard bam_guard(bam_file, bam_index, bam_header);  // Guard to close the BAM file
+    // Load the index
+    hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str());
+    if (!bam_index)
+    {
+        bam_hdr_destroy(bam_header);
+        sam_close(bam_file);
+        printError("ERROR: Could not load index for BAM file: " + bam_filepath);
+        return;
+    }
+    BamFileGuard bam_guard(bam_file, bam_index, bam_header);  // Guard to close the BAM file
+
+    // Initialize the record
+    bam1_t *bam_record = bam_init1();
+    if (!bam_record)
+    {
+        // Clean up the BAM file and index
+        bam_hdr_destroy(bam_header);
+        sam_close(bam_file);
+        printError("ERROR: Could not initialize BAM record.");
+        return;
+    }
 
+    // Iterate through each chromosome and update the depth map
+    int current_chr = 0;
+    int total_chr_count = chromosomes.size();
+    for (const std::string& chr : chromosomes)
+    {
         // Create an iterator for the chromosome
         hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str());
         if (!bam_iter)
         {
             printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file.");
-            return 0.0;
-        }
-
-        // Initialize the record
-        bam1_t *bam_record = bam_init1();
-        if (!bam_record)
-        {
-            hts_itr_destroy(bam_iter);
-            printError("ERROR: Could not initialize BAM record.");
-            return 0.0;
+            continue;
         }
 
-        // Set threading back to 1 for reading the BAM file
-        // printMessage("Setting threads to 1 for reading BAM file...");
-        // hts_set_threads(bam_file, 1);
-        // printMessage("Threads set to 1 for reading BAM file.");
-
-        // Iterate through the chromosome and update the depth map
-        printMessage("Iterating through BAM file reads...");
+        printMessage("(" + std::to_string(++current_chr) + "/" + std::to_string(total_chr_count) + ") Reading BAM file for chromosome: " + chr);
+        std::vector<uint32_t>& pos_depth_map = chr_pos_depth_map[chr];
         while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
         {
             // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
@@ -468,12 +472,12 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
                     // Update the depth for each position in the alignment
                     for (uint32_t j = 0; j < op_len; j++)
                     {
-                        if (ref_pos + j >= chr_pos_depth_map.size())
+                        if (ref_pos + j >= pos_depth_map.size())
                         {
                             printError("ERROR: Reference position out of range for " + chr + ":" + std::to_string(ref_pos+j));
                             continue;
                         }
-                        chr_pos_depth_map[ref_pos + j]++;
+                        pos_depth_map[ref_pos + j]++;
                         // try {
                         //     chr_pos_depth_map[ref_pos + j]++;
                         // } catch (const std::out_of_range& oor) {
@@ -494,55 +498,58 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector<u
             }
         }
 
-        // Clean up
-        bam_destroy1(bam_record);
+        // Clean up the iterator
         hts_itr_destroy(bam_iter);
-    }
-    printMessage("Finished reading BAM file, calculating mean chromosome coverage...");
 
-    // // Calculate the mean chromosome coverage for positions with non-zero depth
-    // uint64_t cum_depth = 0;
-    // uint32_t pos_count = 0;
-    // for (const auto& pos_depth : chr_pos_depth_map)
-    // {
-    //     if (pos_depth > 0)
-    //     {
-    //         cum_depth += pos_depth;
-    //         pos_count++;
-    //     }
-    // }
+        printMessage("Finished reading BAM file, calculating mean chromosome coverage...");
 
-    // double mean_chr_cov = 0.0;
-    // if (pos_count > 0)
-    // {
-    //     mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
-    // }
-    // printMessage("Completed calculating mean chromosome coverage: " +
-    // std::to_string(mean_chr_cov));
-    
-    // Parallel sum of the depth map
-    uint64_t cum_depth = std::reduce(
-        std::execution::par,
-        chr_pos_depth_map.begin(),
-        chr_pos_depth_map.end(),
-        0ULL
-    );
-
-    // Parallel count of the non-zero depth positions
-    uint32_t pos_count = std::count_if(
-        std::execution::par,
-        chr_pos_depth_map.begin(),
-        chr_pos_depth_map.end(),
-        [](uint32_t depth) { return depth > 0; }
-    );
-
-    printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count));
-    printMessage("Total depth: " + std::to_string(cum_depth));
-
-    double mean_chr_cov = (pos_count > 0) ? static_cast<double>(cum_depth) / static_cast<double>(pos_count) : 0.0;
-    printMessage("Completed calculating mean chromosome coverage: " + std::to_string(mean_chr_cov));
-
-    return mean_chr_cov;
+        // // Calculate the mean chromosome coverage for positions with non-zero depth
+        // uint64_t cum_depth = 0;
+        // uint32_t pos_count = 0;
+        // for (const auto& pos_depth : chr_pos_depth_map)
+        // {
+        //     if (pos_depth > 0)
+        //     {
+        //         cum_depth += pos_depth;
+        //         pos_count++;
+        //     }
+        // }
+
+        // double mean_chr_cov = 0.0;
+        // if (pos_count > 0)
+        // {
+        //     mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
+        // }
+        // printMessage("Completed calculating mean chromosome coverage: " +
+        // std::to_string(mean_chr_cov));
+        
+        // Parallel sum of the depth map
+        uint64_t cum_depth = std::reduce(
+            std::execution::par,
+            pos_depth_map.begin(),
+            pos_depth_map.end(),
+            0ULL
+        );
+
+        // Parallel count of the non-zero depth positions
+        uint32_t pos_count = std::count_if(
+            std::execution::par,
+            pos_depth_map.begin(),
+            pos_depth_map.end(),
+            [](uint32_t depth) { return depth > 0; }
+        );
+
+        printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count));
+        printMessage("Total depth: " + std::to_string(cum_depth));
+
+        double mean_chr_cov = (pos_count > 0) ? static_cast<double>(cum_depth) / static_cast<double>(pos_count) : 0.0;
+        chr_mean_cov_map[chr] = mean_chr_cov;
+
+        printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
+    }
+
+    // Clean up
+    // sam_close(bam_file);
 }
 
 // void CNVCaller::calculateSNPLog2Ratios(const std::vector<uint32_t>& snp_pos, const std::vector<double>& snp_log2_cov, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov) const
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index d2e5fa31..fcfded24 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -167,7 +167,7 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam
 
     // For each primary alignment cluster the supplementary alignment start and
     // end positions, keeping the median of the largest cluster
-    std::vector<SVCall> sv_candidates;
+    // std::vector<SVCall> sv_candidates;
     int current_group = 0;
     int min_length = 2000;
     int max_length = 1000000;
@@ -304,7 +304,7 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam
         SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
         if (sv_length >= min_length && sv_length <= max_length) {
             SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "NA", "./.", 0.0, 0, 0, 0);
-            sv_candidates.push_back(sv_candidate);
+            sv_calls.push_back(sv_candidate);
         }
     }
 
@@ -592,10 +592,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // printMessage(chr + ": Unifying SVs...");
     // chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
 
-    // Sort the SV calls by start position
-    std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-        return a.start < b.start;
-    });
+    // // Sort the SV calls by start position
+    // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+    //     return a.start < b.start;
+    // });
 
     printMessage("Completed chromosome " + chr);
 }
@@ -622,13 +622,13 @@ void SVCaller::run(const InputData& input_data)
     }
 
     // [TEST] Keep only the first 6 chromosomes
-    chromosomes.resize(4);
-    // Remove the first chromosome
-    chromosomes.erase(chromosomes.begin());
-    printMessage("Chromosomes: " + std::to_string(chromosomes.size()));
-    for (const auto& chr : chromosomes) {
-        printMessage("  " + chr);
-    }
+    // chromosomes.resize(4);
+    // // Remove the first chromosome
+    // chromosomes.erase(chromosomes.begin());
+    // printMessage("Chromosomes: " + std::to_string(chromosomes.size()));
+    // for (const auto& chr : chromosomes) {
+    //     printMessage("  " + chr);
+    // }
     
     // Read the HMM from the file
     std::string hmm_filepath = input_data.getHMMFilepath();
@@ -644,29 +644,63 @@ void SVCaller::run(const InputData& input_data)
     std::unordered_map<std::string, double> chr_mean_cov_map;
     const std::string bam_filepath = input_data.getLongReadBam();
     int chr_thread_count = input_data.getThreadCount();
-    std::cout << "Reading chromosome coverage..." << std::endl;
-    std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl;
-    int current_chr = 0;
-    int total_chr_count = chromosomes.size();
+
+    // Initialize the chromosome position depth map and mean coverage map
     for (const auto& chr : chromosomes) {
-        current_chr++;
         uint32_t chr_len = ref_genome.getChromosomeLength(chr);
         if (chr_len == 0) {
-            printError("ERROR: chromosome " + chr + " not found in reference genome");
-            return;
+            printError("Chromosome " + chr + " not found in reference genome");
+            continue;
         }
-        printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "...");
-        std::vector<uint32_t> pos_depth_map(chr_len+1, 0);  // 1-based index
-        double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count);
-        if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) {
-            printError("ERROR: failed to calculate mean chromosome coverage for " + chr);
-            return;
+        chr_pos_depth_map[chr] = std::vector<uint32_t>(chr_len+1, 0);  // 1-based index
+        chr_mean_cov_map[chr] = 0.0;
+    }
+    cnv_caller.calculateMeanChromosomeCoverage(chromosomes, chr_pos_depth_map, chr_mean_cov_map, bam_filepath, chr_thread_count);
+
+    // Remove chromosomes with no reads (mean coverage is zero)
+    std::vector<std::string> null_chr;
+    for (const auto& chr : chromosomes) {
+        if (chr_mean_cov_map[chr] == 0.0) {
+            null_chr.push_back(chr);
         }
-        chr_pos_depth_map[chr] = std::move(pos_depth_map);
-        chr_mean_cov_map[chr] = mean_chr_cov;
-        printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov));
     }
-    printMessage("Completed reading chromosome coverage.");
+    for (const auto& chr : null_chr) {
+        printMessage("Removing chromosome " + chr + " with no reads...");
+        chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
+    }
+    // std::cout << "Reading chromosome coverage..." << std::endl;
+    // std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl;
+    // int current_chr = 0;
+    // int total_chr_count = chromosomes.size();
+    // std::vector<std::string> null_chr;
+    // for (const auto& chr : chromosomes) {
+    //     current_chr++;
+    //     uint32_t chr_len = ref_genome.getChromosomeLength(chr);
+    //     if (chr_len == 0) {
+    //         printError("ERROR: chromosome " + chr + " not found in reference genome");
+    //         return;
+    //     }
+    //     printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "...");
+    //     std::vector<uint32_t> pos_depth_map(chr_len+1, 0);  // 1-based index
+    //     double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count);
+    //     if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) {
+    //         // No reads, continue to the next chromosome
+    //         null_chr.push_back(chr);
+    //         continue;
+    //         // printError("ERROR: failed to calculate mean chromosome coverage for " + chr);
+    //         // return;
+    //     }
+    //     chr_pos_depth_map[chr] = std::move(pos_depth_map);
+    //     chr_mean_cov_map[chr] = mean_chr_cov;
+    //     printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov));
+    // }
+    // printMessage("Completed reading chromosome coverage.");
+
+    // Remove chromosomes with no reads
+    // for (const auto& chr : null_chr) {
+    //     printMessage("Removing chromosome " + chr + " with no reads...");
+    //     chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
+    // }
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
@@ -708,7 +742,8 @@ void SVCaller::run(const InputData& input_data)
     }
 
     // Wait for all tasks to complete
-    current_chr = 0;
+    int current_chr = 0;
+    int total_chr_count = chromosomes.size();
     for (auto& future : futures) {
         try {
             current_chr++;
@@ -756,7 +791,21 @@ void SVCaller::run(const InputData& input_data)
         std::vector<SVCall>& sv_calls = entry.second;
         whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
     }
-    // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
+    // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(),
+    // split_sv_calls.end());
+    
+
+    // Sort the SV calls by start position
+    for (auto& entry : whole_genome_sv_calls) {
+        std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) {
+            return a.start < b.start;
+        });
+    }
+        // // Sort the SV calls by start position
+        // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+        //     return a.start < b.start;
+        // });
+
 
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
@@ -858,6 +907,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
                 addSVCall(split_sv_calls, sv_call);
             }
+        } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) {
+            // Inversion with no CNV prediction
+            int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+            std::string alt_allele = "<INV>";
+            SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+            addSVCall(split_sv_calls, sv_call);
         }
         current_sv++;
         if (current_sv % 1000 == 0) {

From 96f18df80c3e230ba2bcb2e86d58b91fda067d8c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 6 Mar 2025 16:16:16 -0500
Subject: [PATCH 080/134] inversion fixes

---
 src/cnv_caller.cpp |  29 ---------
 src/sv_caller.cpp  | 147 ++++++++++++++++++++++++++++++---------------
 src/sv_object.cpp  |   6 +-
 3 files changed, 101 insertions(+), 81 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 45f59ec4..6db7a78b 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -630,12 +630,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
     // Use multi-threading if not threading by chromosome
     int thread_count = input_data.getThreadCount();
-    // if (!input_data.isSingleChr())
-    // {
-    //     // Use half of the threads for SNP reading
-    //     thread_count = std::max(1, input_data.getThreadCount() / 2);
-    // }
-    printMessage("Using " + std::to_string(thread_count) + " threads for SNP reading...");
     bcf_sr_set_threads(snp_reader, thread_count);
 
     // Add the SNP file to the reader
@@ -721,31 +715,11 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
         // Use multi-threading if not threading by chromosome
         int thread_count = input_data.getThreadCount();
-        // if (!input_data.isSingleChr())
-        // {
-        //     // Use half of the threads for population allele frequency reading
-        //     thread_count = std::max(1, input_data.getThreadCount() / 2);
-        // }
-        printMessage("Using " + std::to_string(thread_count) + " threads for population allele frequency reading...");
         bcf_sr_set_threads(pfb_reader, thread_count);
     }
 
-    // Split the region into samples
-    // int sample_size = snp_pos.size();
-    // std::vector<std::string> region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size);
-
-    // Loop through the samples and read the SNP data, storing the first
-    // SNP position and BAF value for each sample
-    int current_region = 0;
-    // for (size_t i = 0; i < region_chunks.size(); ++i)
-    // {
-    //     current_region++;
-
     // Read the SNP data ----------------------------------------------
-
     // Set the region
-    printMessage("Setting region for SNP reader...");
-    // std::string region_str = region_chunks[i];
     if (bcf_sr_set_regions(snp_reader, chr.c_str(), 0) < 0)
     {
         printError("ERROR: Could not set region for SNP reader: " + chr);
@@ -754,7 +728,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         return;
     }
 
-    printMessage("Region set for SNP reader, loading SNP data...");
     bool snp_found = false;
     while (bcf_sr_next_line(snp_reader) > 0)
     {
@@ -849,13 +822,11 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     {
         // Set the region for the population allele frequency reader
         std::string pfb_region_str = chr_gnomad + ":" + std::to_string(min_snp_pos) + "-" + std::to_string(max_snp_pos);
-        printMessage("Setting region for population allele frequency reader: " + pfb_region_str);
         if (bcf_sr_set_regions(pfb_reader, pfb_region_str.c_str(), 0) < 0)
         {
             printError("ERROR: Could not set region for population allele frequency reader: " + pfb_region_str);
         }
 
-        printMessage("Loading population allele frequency data...");
         // for (size_t i = 0; i < snp_pos.size(); ++i)
         // {
         // Set the region as the SNP position
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index fcfded24..1dae1e66 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -246,35 +246,43 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam
         // SV
         int primary_pos = -1;
         int primary_pos2 = -1;
+        int primary_cluster_size = 0;
         if (primary_start_cluster.size() > primary_end_cluster.size()) {
             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+            primary_cluster_size = primary_start_cluster.size();
         } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
             primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
+            primary_cluster_size = primary_end_cluster.size();
         } else {
             // Use both positions
             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
             primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
+            primary_cluster_size = primary_start_cluster.size();
         }
 
         // Get the supplementary alignment positions
         int supp_pos = -1;
         int supp_pos2 = -1;
+        int supp_cluster_size = 0;
         if (supp_start_cluster.size() > supp_end_cluster.size()) {
             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+            supp_cluster_size = supp_start_cluster.size();
         } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
             supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
+            supp_cluster_size = supp_end_cluster.size();
         } else {
             // Use both positions. This has been shown to occur in nested SVs
             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
             supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
+            supp_cluster_size = supp_start_cluster.size();
         }
 
         // If two of either were found, use the larger SV candidate
@@ -301,13 +309,34 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam
         int sv_start = std::min(primary_pos, supp_pos);
         int sv_end = std::max(primary_pos, supp_pos);
         int sv_length = sv_end - sv_start + 1;
+        int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
         SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
         if (sv_length >= min_length && sv_length <= max_length) {
-            SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "NA", "./.", 0.0, 0, 0, 0);
-            sv_calls.push_back(sv_candidate);
+            SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+            addSVCall(sv_calls, sv_candidate);
+            // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
         }
     }
 
+    // Combine SVs with identical start and end positions, and sum the cluster
+    // sizes
+    std::vector<SVCall> combined_sv_calls;
+    std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+        return a.start < b.start || (a.start == b.start && a.end < b.end);
+    });
+    int merge_count = 0;
+    for (size_t i = 0; i < sv_calls.size(); i++) {
+        SVCall& sv_call = sv_calls[i];
+        if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) {
+            sv_calls[i - 1].cluster_size += sv_call.cluster_size;
+            merge_count++;
+        } else {
+            combined_sv_calls.push_back(sv_call);
+        }
+    }
+    sv_calls = std::move(combined_sv_calls);
+    printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
+
     // return sv_candidates;
 }
 
@@ -456,7 +485,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                     alt_allele = ins_seq_str;
                 }
                 SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0);
-                addSVCall(sv_calls, sv_call);                
+                addSVCall(sv_calls, sv_call);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -565,13 +594,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
-    // Run copy number variant predictions on the SVs detected from the
-    // CIGAR string, using a minimum CNV length threshold
-    // if (region_sv_count > 0) {
-    //     printMessage(chr + ": CIGAR predictions...");
-    //     cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
-    // }
-
     // [TEST] Before this section has no memory leaks
     // Run split-read SV and copy number variant predictions
     printMessage(chr + ": Split read SVs...");
@@ -580,6 +602,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     // std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx,
     // bamHdr, region, primary_map, supp_map);
     this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls);
+
     // std::vector<SVCall> split_sv_calls;
     // this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
 
@@ -773,15 +796,18 @@ void SVCaller::run(const InputData& input_data)
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
+
         if (sv_calls.size() > 0) {
-            printMessage("Running copy number predictions on " + chr + "...");
+            printMessage("Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
             this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
 
             // Merge the split-read SVs separately
-            printMessage(chr + ": Merging split reads...");
-            double split_epsilon = 0.45;
-            int split_min_pts = 2;  // This is low since split alignments were already previously merged
-            mergeSVs(sv_calls, split_epsilon, split_min_pts);
+            // printMessage(chr + ": Merging split reads...");
+            // double split_epsilon = 0.45;
+            // // int split_min_pts = 2;  // This is low since split alignments
+            // // were already previously merged
+            // int split_min_pts = 1;
+            // mergeSVs(sv_calls, split_epsilon, split_min_pts);
         }
     }
 
@@ -796,11 +822,25 @@ void SVCaller::run(const InputData& input_data)
     
 
     // Sort the SV calls by start position
-    for (auto& entry : whole_genome_sv_calls) {
-        std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) {
-            return a.start < b.start;
-        });
-    }
+    // printMessage("Sorting SVs...");
+    // for (auto& entry : whole_genome_sv_calls) {
+    //     std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) {
+    //         return a.start < b.start || (a.start == b.start && a.end < b.end);
+    //     });
+
+    //     // Check that the SVs are sorted
+    //     bool unsorted = false;
+    //     for (size_t i = 1; i < entry.second.size(); i++) {
+    //         if (entry.second[i].start < entry.second[i-1].start || (entry.second[i].start == entry.second[i-1].start && entry.second[i].end < entry.second[i-1].end)) {
+    //             printError("ERROR: SVs are not sorted for chromosome " + entry.first);
+    //             unsorted = true;
+    //             break;
+    //         }
+    //     }
+    //     if (!unsorted) {
+    //         printMessage("SVs are sorted for chromosome " + entry.first);
+    //     }
+    // }
         // // Sort the SV calls by start position
         // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
         //     return a.start < b.start;
@@ -883,10 +923,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
         bool is_inversion = sv_candidate.sv_type == SVType::INV;
 
         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
-        if (std::get<1>(result) == SVType::UNKNOWN) {
-            continue;
-        }
-
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         std::string genotype = std::get<2>(result);
@@ -905,6 +941,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
                 std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
                 SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+                // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
                 addSVCall(split_sv_calls, sv_call);
             }
         } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) {
@@ -912,12 +949,13 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
             int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
             std::string alt_allele = "<INV>";
             SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+            // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
             addSVCall(split_sv_calls, sv_call);
         }
         current_sv++;
-        if (current_sv % 1000 == 0) {
-            printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
-        }
+        // if (current_sv % 1000 == 0) {
+        //     printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
+        // }
     }
 }
 
@@ -1017,6 +1055,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
                 std::cerr << "Warning: Unknown SV type for SV at " << chr << ":" << start << "-" << end << std::endl;
+                continue;
             } else {
                 total_count += 1;
             }
@@ -1035,7 +1074,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Deletion
             if (sv_type == SVType::DEL) {
                 // Get the deleted sequence from the reference genome, also including the preceding base
-                int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                uint32_t preceding_pos = (uint32_t) std::max(1, (int) start-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, end);
 
                 // Use the preceding base as the alternate allele 
@@ -1051,22 +1090,30 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Other types (duplications, insertions, inversions)
             } else {
-                // Update the position to the preceding base
-                int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
-                start = preceding_pos;
-
-                // Update the end position to the same base for duplications and insertions
-                if (sv_type == SVType::DUP || sv_type == SVType::INS) {
-                    end = start;
-                }
 
                 if (sv_type == SVType::INS) {
+                    // Update the position to the preceding base
+                    int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                    ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
+                    start = preceding_pos;
+
                     if (alt_allele != "<INS>") {
                         // Insert the reference allele before the insertion
                         alt_allele.insert(0, ref_allele);
                     }
+                } else {
+                    ref_allele = "N";  // Convention for INV and DUP
                 }
+                // // Update the position to the preceding base
+                // int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                // ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
+                // start = preceding_pos;
+
+                end = start;  // Update the end position to the same base
+                // // Update the end position to the same base for duplications and insertions
+                // if (sv_type == SVType::DUP || sv_type == SVType::INS) {
+                //     end = start;
+                // }
             }
 
             // Fix ambiguous bases in the reference allele
@@ -1102,24 +1149,26 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES)
             vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
-            if (total_count % 1000 == 0)
-            {
-            	std::cout << "Wrote SV at " << chr << ": " << start << ", total=" << total_count << std::endl;
-        	}
+            
+            // std::cout << "Wrote SV at " << chr << ": " << start << ", " << end << std::endl;
+            // if (total_count % 1000 == 0)
+            // {
+            // 	std::cout << "Wrote SV at " << chr << ": " << start << ", total=" << total_count << std::endl;
+        	// }
         }
     }
     vcf_stream.close();
     std::cout << "Saved SV calls to " << output_vcf << std::endl;
 
     // Create a compressed and indexed VCF file
-    std::cout << "Creating compressed and indexed VCF file..." << std::endl;
-    std::string bgzip_cmd = "bgzip -f " + output_vcf;
-    std::string tabix_cmd = "tabix -p vcf " + output_vcf + ".gz";
-    std::system(bgzip_cmd.c_str());
-    std::system(tabix_cmd.c_str());
-    output_vcf += ".gz";
-    std::cout << "VCF file created: " << output_vcf << std::endl;
-    std::cout << "Index file created: " << output_vcf + ".tbi" << std::endl;
+    // std::cout << "Creating compressed and indexed VCF file..." << std::endl;
+    // std::string bgzip_cmd = "bgzip -f " + output_vcf;
+    // std::string tabix_cmd = "tabix -p vcf " + output_vcf + ".gz";
+    // std::system(bgzip_cmd.c_str());
+    // std::system(tabix_cmd.c_str());
+    // output_vcf += ".gz";
+    // std::cout << "VCF file created: " << output_vcf << std::endl;
+    // std::cout << "Index file created: " << output_vcf + ".tbi" << std::endl;
 
     // Print the number of SV calls skipped
     std::cout << "Finished writing VCF file. Total records: " << total_count << std::endl;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 41d787fc..371fa521 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -18,9 +18,9 @@ bool SVCall::operator<(const SVCall & other) const
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
-    if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) {
-        return;
-    }
+    // if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) {
+    //     return;
+    // }
 
     // Check if the SV call is valid
     if (sv_call.start > sv_call.end) {

From 766b220e478d432784d00f0ac2414e2e7837fc64 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 8 Mar 2025 22:13:39 -0500
Subject: [PATCH 081/134] improve insertion detection

---
 src/dbscan.cpp    |   6 --
 src/sv_caller.cpp | 233 ++++++++--------------------------------------
 src/sv_object.cpp |  85 ++++++++++++++++-
 3 files changed, 121 insertions(+), 203 deletions(-)

diff --git a/src/dbscan.cpp b/src/dbscan.cpp
index 6fe97563..d6c41346 100644
--- a/src/dbscan.cpp
+++ b/src/dbscan.cpp
@@ -70,11 +70,6 @@ std::vector<size_t> DBSCAN::regionQuery(const std::vector<SVCall>& sv_calls, siz
 }
 
 double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const {
-    // return std::sqrt(std::pow(point1.first - point2.first, 2) +
-    // std::pow(point1.second - point2.second, 2));
-    // return std::sqrt(std::pow(static_cast<double>(point1.start) - static_cast<double>(point2.start), 2) +
-    // std::pow(static_cast<double>(point1.end) -
-    // static_cast<double>(point2.end), 2));
     
     // Calculate reciprocal overlap-based distance
     // https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02840-6
@@ -85,6 +80,5 @@ double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const {
 
     // Minimum reciprocal overlap
     double distance = 1.0 - std::min(static_cast<double>(overlap) / static_cast<double>(length1), static_cast<double>(overlap) / static_cast<double>(length2));
-    // double distance = 1.0 - static_cast<double>(overlap) / std::min(length1, length2);
     return distance;  // 0.0 means identical, 1.0 means no overlap
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 1dae1e66..1669e093 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -335,9 +335,10 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam
         }
     }
     sv_calls = std::move(combined_sv_calls);
-    printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
 
-    // return sv_candidates;
+    // if (merge_count > 0) {
+    //     printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
+    // }
 }
 
 
@@ -441,6 +442,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
                         ins_seq_str[j] = base;
                     }
                 }
+                // std::string ins_seq_str_rc = reverseComplement(ins_seq_str);
                 
                 // Before the insertion
                 if (pos >= (uint32_t)op_len-1)
@@ -523,17 +525,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         printError("ERROR: failed to open " + bam_filepath);
         return;
     }
-
-    // Use multi-threading for the BAM file
-    // int thread_count = input_data.getThreadCount();
-    // // if (!input_data.isSingleChr()) {
-    // //     // Use half the threads for chromosomes, the other half for file I/O
-    // //     thread_count = std::max(1, thread_count / 2);
-    // // }
-    // printMessage("Using " + std::to_string(thread_count) + " threads for BAM file I/O");
-    // // int num_threads = input_data.getThreadCount();
-    // hts_set_threads(fp_in, thread_count);
-    hts_set_threads(fp_in, 1);  // Disable multi-threading for now
+    hts_set_threads(fp_in, 1);
 
     // Load the header
     bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
@@ -555,8 +547,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
 
     // Set the region to process
     std::string region = chr;
-    // uint32_t chr_len = ref_genome.getChromosomeLength(chr);
-    // uint32_t chr_len = bamHdr->target_len[bam_name2id(bamHdr, chr.c_str())];
     if (input_data.isRegionSet()) {
 
         // Use one chunk for the specified region
@@ -566,17 +556,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
     }
 
-    // Load chromosome data for copy number predictions
-    // printMessage(chr + ": Loading chromosome data...");
-    CNVCaller cnv_caller(this->shared_mutex);
-    // std::vector<uint32_t> chr_pos_depth_map(chr_len+1, 0);  // 1-based index
-
-    // // Use only half the threads for chromosomes, the other half for file I/O
-    // double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count);
-    // if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) {
-    //     return;
-    // }
-
     // Estimate DBSCAN minimum points
     double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct();
     if (dbscan_min_pts_pct > 0.0) {
@@ -584,6 +563,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
         printMessage(chr + ": Mean chr. cov.: " + std::to_string(mean_chr_cov) + " (DBSCAN min. pts.= " + std::to_string(dbscan_min_pts) + ", min. pts. pct.= " + std::to_string(dbscan_min_pts_pct) + ")");
     }
 
+
+    // -----------------------------------------------------------------------
     // Detect SVs from the CIGAR strings
     printMessage(chr + ": CIGAR SVs...");
     this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
@@ -594,33 +575,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
-    // [TEST] Before this section has no memory leaks
-    // Run split-read SV and copy number variant predictions
+    // -----------------------------------------------------------------------
+    // Detect SVs from the split reads
     printMessage(chr + ": Split read SVs...");
-    // std::unordered_map<std::string, GenomicRegion> primary_map;
-    // std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
-    // std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx,
-    // bamHdr, region, primary_map, supp_map);
     this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls);
-
-    // std::vector<SVCall> split_sv_calls;
-    // this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data);
-
-    // // // Merge the split-read SVs separately
-    // printMessage(chr + ": Merging split reads...");
-    // double split_epsilon = 0.45;
-    // int split_min_pts = 2;  // This is low since split alignments were already previously merged
-    // mergeSVs(split_sv_calls, split_epsilon, split_min_pts);
-
-    // printMessage(chr + ": Unifying SVs...");
-    // chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end());
-
-    // // Sort the SV calls by start position
-    // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-    //     return a.start < b.start;
-    // });
-
-    printMessage("Completed chromosome " + chr);
 }
 
 void SVCaller::run(const InputData& input_data)
@@ -643,15 +601,6 @@ void SVCaller::run(const InputData& input_data)
         // Get the chromosomes from the input BAM file
         chromosomes = this->getChromosomes(input_data.getLongReadBam());
     }
-
-    // [TEST] Keep only the first 6 chromosomes
-    // chromosomes.resize(4);
-    // // Remove the first chromosome
-    // chromosomes.erase(chromosomes.begin());
-    // printMessage("Chromosomes: " + std::to_string(chromosomes.size()));
-    // for (const auto& chr : chromosomes) {
-    //     printMessage("  " + chr);
-    // }
     
     // Read the HMM from the file
     std::string hmm_filepath = input_data.getHMMFilepath();
@@ -691,39 +640,6 @@ void SVCaller::run(const InputData& input_data)
         printMessage("Removing chromosome " + chr + " with no reads...");
         chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
     }
-    // std::cout << "Reading chromosome coverage..." << std::endl;
-    // std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl;
-    // int current_chr = 0;
-    // int total_chr_count = chromosomes.size();
-    // std::vector<std::string> null_chr;
-    // for (const auto& chr : chromosomes) {
-    //     current_chr++;
-    //     uint32_t chr_len = ref_genome.getChromosomeLength(chr);
-    //     if (chr_len == 0) {
-    //         printError("ERROR: chromosome " + chr + " not found in reference genome");
-    //         return;
-    //     }
-    //     printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "...");
-    //     std::vector<uint32_t> pos_depth_map(chr_len+1, 0);  // 1-based index
-    //     double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count);
-    //     if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) {
-    //         // No reads, continue to the next chromosome
-    //         null_chr.push_back(chr);
-    //         continue;
-    //         // printError("ERROR: failed to calculate mean chromosome coverage for " + chr);
-    //         // return;
-    //     }
-    //     chr_pos_depth_map[chr] = std::move(pos_depth_map);
-    //     chr_mean_cov_map[chr] = mean_chr_cov;
-    //     printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov));
-    // }
-    // printMessage("Completed reading chromosome coverage.");
-
-    // Remove chromosomes with no reads
-    // for (const auto& chr : null_chr) {
-    //     printMessage("Removing chromosome " + chr + " with no reads...");
-    //     chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
-    // }
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
@@ -742,7 +658,6 @@ void SVCaller::run(const InputData& input_data)
             InputData chr_input_data = input_data;  // Use a thread-local copy
             this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
             {
-                // std::lock_guard<std::mutex> lock(this->shared_mutex);
                 std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
                 whole_genome_split_sv_calls[chr] = std::move(split_sv_calls);
@@ -759,7 +674,7 @@ void SVCaller::run(const InputData& input_data)
     std::vector<std::future<void>> futures;
     for (const auto& chr : chromosomes) {
         futures.emplace_back(pool.enqueue([&, chr] {
-            printMessage("Processing chromosome " + chr);
+            // printMessage("Processing chromosome " + chr);
             process_chr(chr);
         }));
     }
@@ -771,7 +686,6 @@ void SVCaller::run(const InputData& input_data)
         try {
             current_chr++;
             future.get();
-            printMessage("Chromosome task "+ std::to_string(current_chr) + " of " + std::to_string(total_chr_count) + " completed.");
         } catch (const std::exception& e) {
             printError("Error processing chromosome task: " + std::string(e.what()));
         } catch (...) {
@@ -780,34 +694,34 @@ void SVCaller::run(const InputData& input_data)
     }
     printMessage("All tasks have finished.");
 
+    // -------------------------------------------------------
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
+    current_chr = 0;
     printMessage("Running copy number predictions on CIGAR SVs...");
     for (auto& entry : whole_genome_sv_calls) {
+        current_chr++;
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
         if (sv_calls.size() > 0) {
-            printMessage("Running copy number predictions on " + chr + "...");
+            // printMessage("Running copy number predictions on " + chr +
+            // "...");
+            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
             cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
         }
     }
+    // -------------------------------------------------------
 
     printMessage("Running copy number predictions on split-read SVs...");
+    current_chr = 0;
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
 
         if (sv_calls.size() > 0) {
-            printMessage("Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
+            current_chr++;
+            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
             this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
-
-            // Merge the split-read SVs separately
-            // printMessage(chr + ": Merging split reads...");
-            // double split_epsilon = 0.45;
-            // // int split_min_pts = 2;  // This is low since split alignments
-            // // were already previously merged
-            // int split_min_pts = 1;
-            // mergeSVs(sv_calls, split_epsilon, split_min_pts);
         }
     }
 
@@ -817,35 +731,6 @@ void SVCaller::run(const InputData& input_data)
         std::vector<SVCall>& sv_calls = entry.second;
         whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
     }
-    // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(),
-    // split_sv_calls.end());
-    
-
-    // Sort the SV calls by start position
-    // printMessage("Sorting SVs...");
-    // for (auto& entry : whole_genome_sv_calls) {
-    //     std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) {
-    //         return a.start < b.start || (a.start == b.start && a.end < b.end);
-    //     });
-
-    //     // Check that the SVs are sorted
-    //     bool unsorted = false;
-    //     for (size_t i = 1; i < entry.second.size(); i++) {
-    //         if (entry.second[i].start < entry.second[i-1].start || (entry.second[i].start == entry.second[i-1].start && entry.second[i].end < entry.second[i-1].end)) {
-    //             printError("ERROR: SVs are not sorted for chromosome " + entry.first);
-    //             unsorted = true;
-    //             break;
-    //         }
-    //     }
-    //     if (!unsorted) {
-    //         printMessage("SVs are sorted for chromosome " + entry.first);
-    //     }
-    // }
-        // // Sort the SV calls by start position
-        // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-        //     return a.start < b.start;
-        // });
-
 
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
@@ -853,9 +738,9 @@ void SVCaller::run(const InputData& input_data)
         std::string chr = entry.first;
         int sv_count = getSVCount(entry.second);
         total_sv_count += sv_count;
-        printMessage("Total SVs detected for chromosome " + chr + ": " + std::to_string(sv_count));
+        printMessage("Total SVs detected for " + chr + ": " + std::to_string(sv_count));
     }
-    printMessage("Total SVs detected for all chromosomes: " + std::to_string(total_sv_count));
+    printMessage("Total SVs detected: " + std::to_string(total_sv_count));
 
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
@@ -863,63 +748,14 @@ void SVCaller::run(const InputData& input_data)
     this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome);
 }
 
-
-// Detect SVs from split read alignments
-// void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
-// {
-//     // printMessage(region + ": Getting split alignments...");
-//     // std::unordered_map<std::string, GenomicRegion> primary_map;
-//     // std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
-//     std::vector<SVCall> sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map);
-
-//     // Run copy number predictions on the SVs detected from the split reads
-//     printMessage(region + ": Split read predictions...");
-//     int current_sv = 0;
-//     int total_svs = sv_candidates.size();
-//     for (auto& sv_candidate : sv_candidates) {
-//         bool is_inversion = sv_candidate.sv_type == SVType::INV;
-
-//         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
-//         if (std::get<1>(result) == SVType::UNKNOWN) {
-//             continue;
-//         }
-
-//         double supp_lh = std::get<0>(result);
-//         SVType supp_type = std::get<1>(result);
-//         std::string genotype = std::get<2>(result);
-//         if (supp_type != SVType::UNKNOWN) {
-//             if (is_inversion) {
-//                 if (supp_type == SVType::DEL) {
-//                     supp_type = SVType::INV_DEL;
-//                 } else if (supp_type == SVType::DUP) {
-//                     supp_type = SVType::INV_DUP;
-//                 } else if (supp_type == SVType::NEUTRAL) {
-//                     supp_type = SVType::INV;
-//                 }
-//             }
-            
-//             if (supp_type != SVType::NEUTRAL) {
-//                 int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-//                 std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
-//                 SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-//                 addSVCall(split_sv_calls, sv_call);
-//             }
-//         }
-//         current_sv++;
-//         if (current_sv % 1000 == 0) {
-//             printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
-//         }
-//     }
-// }
-
 // Detect SVs from split read alignments
 void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
 {
     // Run copy number predictions on the SVs detected from the split reads
-    printMessage("Split read predictions...");
-    int current_sv = 0;
-    int total_svs = split_sv_calls.size();
-    for (auto& sv_candidate : split_sv_calls) {
+    std::vector<SVCall> processed_calls;
+    for (const auto& sv_candidate : split_sv_calls) {
+        printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "...");
+
         bool is_inversion = sv_candidate.sv_type == SVType::INV;
 
         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
@@ -928,6 +764,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
         std::string genotype = std::get<2>(result);
         if (supp_type != SVType::UNKNOWN) {
             if (is_inversion) {
+            	// Add an additional inversion separately
+		        int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+		        std::string alt_allele = "<INV>";
+		        SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+                processed_calls.push_back(sv_call);
+                /*
                 if (supp_type == SVType::DEL) {
                     supp_type = SVType::INV_DEL;
                 } else if (supp_type == SVType::DUP) {
@@ -935,6 +777,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 } else if (supp_type == SVType::NEUTRAL) {
                     supp_type = SVType::INV;
                 }
+                */
             }
             
             if (supp_type != SVType::NEUTRAL) {
@@ -942,7 +785,8 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
                 SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
                 // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
-                addSVCall(split_sv_calls, sv_call);
+                // addSVCall(split_sv_calls, sv_call);
+                processed_calls.push_back(sv_call);
             }
         } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) {
             // Inversion with no CNV prediction
@@ -950,13 +794,16 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
             std::string alt_allele = "<INV>";
             SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
             // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
-            addSVCall(split_sv_calls, sv_call);
+            // addSVCall(split_sv_calls, sv_call);
+            processed_calls.push_back(sv_call);
         }
-        current_sv++;
         // if (current_sv % 1000 == 0) {
         //     printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
         // }
     }
+
+    // Replace with the processed calls
+    split_sv_calls = std::move(processed_calls);
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 371fa521..0c19468c 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -65,6 +65,11 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         SVType::INV_DEL,
     })
     {
+        // [TEST] Skip if not insertions
+        // if (sv_type != SVType::INS) {
+        //     continue;
+        // }
+
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
         std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
@@ -87,7 +92,49 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
         for (auto& cluster : cluster_map) {
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
+
+
+            // [TEST] If insertions, and if any SV has length between 9400 and
+            // 9500, print all SV coordinates in the cluster
+            bool print_all = false;
+            // if (sv_type == SVType::INS) {
+            //     for (const auto& sv_call : cluster_sv_calls) {
+            //         // printMessage("[TEST] SV call " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
+            //         // if (sv_call.end - sv_call.start >= 9400 && sv_call.end -
+            //         // sv_call.start <= 9500) {
+            //         // if (sv_call.end - sv_call.start >= 15100 && sv_call.end -
+            //         // sv_call.start <= 15200) {
+            //         // if (sv_call.end - sv_call.start >= 11200 && sv_call.end -
+            //         // sv_call.start <= 11300) {
+            //         // if (sv_call.end - sv_call.start >= 16800 && sv_call.end -
+            //         // sv_call.start <= 17000) {
+            //         // if (sv_call.end - sv_call.start >= 11300 && sv_call.end -
+            //         // sv_call.start <= 11400) {
+            //         // if (sv_call.end - sv_call.start >= 13100 && sv_call.end -
+            //         // sv_call.start <= 13200) {
+            //         if (sv_call.end - sv_call.start >= 28200 && sv_call.end - sv_call.start <= 28300) {
+            //             print_all = true;
+            //             break;
+            //         }
+            //     }
+            // }
+            if (print_all) {
+                printMessage("[TEST] Cluster " + std::to_string(cluster_id) + " has " + std::to_string(cluster_sv_calls.size()) + " SVs:");
+                for (const auto& sv_call : cluster_sv_calls) {
+                    printMessage("  " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
+                }
+            }
+
             if (cluster_id < 0) {
+                // Add all noise points to the merged list if >10 kb
+                // for (const auto& sv_call : cluster_sv_calls) {
+                //     if ((sv_call.end - sv_call.start)+1 >= 10000) {
+                //         SVCall noise_sv_call = sv_call;
+                //         noise_sv_call.cluster_size = cluster_id;
+                //         merged_sv_calls.push_back(noise_sv_call);
+                //         printMessage("[TEST] Adding noise SV " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
+                //     }
+                // }
                 continue;  // Skip noise and unclassified points
             } else {
             // if (true) {
@@ -118,13 +165,43 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                     });
                     merged_sv_call = *it;
 
+                    // [TEST]
+                    if (print_all) {
+                        printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
+                    }
+
                 } else {
-                    // Use the median length SV
+                    // Use the median length SV of the top 10% of the cluster
+                    // (shorter reads are often noise)
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                        return (a.end - a.start) < (b.end - b.start);
+                        return (a.end - a.start) > (b.end - b.start);
                     });
-                    int median_index = cluster_sv_calls.size() / 2;
-                    merged_sv_call = cluster_sv_calls[median_index];
+
+                    // Get the top 10% of the cluster
+                    size_t top_10_percent = std::max(1, (int) (cluster_sv_calls.size() * 0.1));
+                    std::vector<SVCall> top_10(cluster_sv_calls.begin(), cluster_sv_calls.begin() + top_10_percent);
+
+                    // Get the median SV for the top 10% of the cluster
+                    size_t median_index = top_10.size() / 2;
+                    merged_sv_call = top_10[median_index];
+
+                    // // Get the starting index of the top 10% of the cluster
+                    // // (Cluster is sorted by descending length)
+                    // size_t start_index = std::max(0, (int) (cluster_sv_calls.size() * 0.9));
+
+                    // // Get the top 10% of the cluster
+                    // std::vector<SVCall> top_half(cluster_sv_calls.begin() + start_index, cluster_sv_calls.end());
+
+                    // // Get the median SV for the top 50% of the cluster
+                    // size_t median_index = top_half.size() / 2;
+                    // merged_sv_call = top_half[median_index];
+                    // int median_index = cluster_sv_calls.size() / 2;
+                    // merged_sv_call = cluster_sv_calls[median_index];
+
+                    // [TEST]
+                    if (print_all) {
+                        printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with median SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
+                    }
                 }
 
                 if (cluster_id < 0) {

From 0776822eb6bbb43a271e9a9b43735ec8cb4241d0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 12 Mar 2025 23:06:39 -0400
Subject: [PATCH 082/134] fix insertions

---
 include/sv_caller.h |   66 ++-
 src/sv_caller.cpp   | 1335 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 1050 insertions(+), 351 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index d795f44b..1b420226 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -21,51 +21,70 @@ struct GenomicRegion {
     int tid;
     hts_pos_t start;
     hts_pos_t end;
+    int query_start;
+    int query_end;
     bool strand;
-    uint8_t qual;
     int cluster_size;  // Number of alignments used for this region
 };
 
+struct PrimaryAlignment {
+    hts_pos_t start;
+    hts_pos_t end;
+    int query_start;
+    int query_end;
+    bool strand;
+    int cluster_size;  // Number of alignments used for this region
+};
+
+struct SuppAlignment {
+    int tid;
+    hts_pos_t start;
+    hts_pos_t end;
+    int query_start;
+    int query_end;
+    bool strand;
+    int cluster_size;  // Number of alignments used for this region
+};
+
+struct SplitSignature {
+    int tid;
+    hts_pos_t start;
+    hts_pos_t end;
+    bool strand;
+    hts_pos_t query_start;
+    hts_pos_t query_end;
+};
+
 // Interval Tree Node
 struct IntervalNode {
-    GenomicRegion region;
+    PrimaryAlignment region;
     std::string qname;
     hts_pos_t max_end;  // To optimize queries
-    // IntervalNode* left;
-    // IntervalNode* right;
     std::unique_ptr<IntervalNode> left;
     std::unique_ptr<IntervalNode> right;
 
-    IntervalNode(GenomicRegion r, std::string name)
+    IntervalNode(PrimaryAlignment r, std::string name)
         : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
 };
 
-// IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string
-// qname) {
-void insert(std::unique_ptr<IntervalNode>& root, GenomicRegion region, std::string qname) {
+void insert(std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& region, std::string qname) {
     if (!root) {
-        // return new IntervalNode(region, qname);
         root = std::make_unique<IntervalNode>(region, qname);
         return;
     }
 
     if (region.start < root->region.start)
     {
-        // root->left = insert(root->left, region, qname);
         insert(root->left, region, qname);
     } else {
-        // root->right = insert(root->right, region, qname);
         insert(root->right, region, qname);
     }
 
     // Update max_end
     root->max_end = std::max(root->max_end, region.end);
-    // return root;
 }
 
-// void findOverlaps(IntervalNode* root, GenomicRegion query,
-// std::vector<std::string>& result) {
-void findOverlaps(const std::unique_ptr<IntervalNode>& root, GenomicRegion query, std::vector<std::string>& result) {
+void findOverlaps(const std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& query, std::vector<std::string>& result) {
     if (!root) return;
 
     // If overlapping, add to result
@@ -93,18 +112,23 @@ class SVCaller {
 
         std::vector<std::string> getChromosomes(const std::string& bam_filepath);
 
-        void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls);
+        // void findSplitCNVBreakpoints(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls);
+
+        void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data);
+
+        void findSplitReadSVs(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const ReferenceGenome& ref_genome, const InputData& input_data);
 
-        // Detect SVs from the CIGAR string of a read alignment, and return the
-        // mismatch rate, and the start and end positions of the query sequence
-        void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
+        // Process a single CIGAR record and find candidate SVs
+        void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
+
+        std::pair<int, int> getAlignmentReadPositions(bam1_t* alignment);
 
         void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls);
 
         // Detect SVs at a region from long read alignments. This is used for
         // whole genome analysis running in parallel.
         // RegionData detectSVsFromRegion(std::string region);
-        void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
+        void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
@@ -124,8 +148,6 @@ class SVCaller {
         // Calculate the read depth (INFO/DP) for a region
         int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
 
-        bool regionOverlaps(const GenomicRegion& a, const GenomicRegion& b);
-
     public:
         // Constructor with no arguments
         SVCaller() = default;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 1669e093..31b7270a 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -67,29 +67,355 @@ std::vector<std::string> SVCaller::getChromosomes(const std::string &bam_filepat
     return chromosomes;
 }
 
-void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string &region, std::vector<SVCall>& sv_calls)
+// void SVCaller::findSplitCNVBreakpoints(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string &region, std::vector<SVCall>& sv_calls)
+// {
+//     std::unordered_map<std::string, GenomicRegion> primary_map;
+//     std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+
+//     // Create a read and iterator for the region
+//     bam1_t *bam1 = bam_init1();
+//     if (!bam1) {
+//         printError("ERROR: failed to initialize BAM record");
+//         return;
+//     }
+//     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
+//     if (!itr) {
+//         bam_destroy1(bam1);
+//         printError("ERROR: failed to query region " + region);
+//         return;
+//     }
+
+//     uint32_t primary_count = 0;
+//     uint32_t supplementary_count = 0;
+
+//     // Main loop to process the alignments
+//     uint32_t num_alignments = 0;
+//     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
+
+//         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
+//         if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
+//             continue;
+//         }
+//         const std::string qname = bam_get_qname(bam1);  // Query template name
+
+//         // Process primary alignments
+//         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+//             // Store chromosome (TID), start, and end positions (1-based) of the
+//             // primary alignment, and the strand (true for forward, false for reverse)
+//             primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0};
+//             primary_count++;
+
+//         // Process supplementary alignments
+//         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
+//             // Store chromosome (TID), start, and end positions (1-based) of the
+//             // supplementary alignment, and the strand (true for forward, false for reverse)
+//             supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0});
+//             supplementary_count++;
+//         }
+//         num_alignments++;
+//     }
+
+//     // Remove primary alignments without supplementary alignments
+//     std::vector<std::string> to_remove;
+//     for (const auto& entry : primary_map) {
+//         const std::string& qname = entry.first;
+//         if (supp_map.find(qname) == supp_map.end()) {
+//             to_remove.push_back(qname);
+//         }
+//     }
+//     for (const std::string& qname : to_remove) {
+//         primary_map.erase(qname);
+//     }
+
+//     // // Clean up the iterator and alignment
+//     // hts_itr_destroy(itr);
+//     // bam_destroy1(bam1);
+//     // printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
+
+//     // Identify overlapping primary alignments and then cluster their primary
+//     // start, end vs. supplementary alignment start, end positions, keeping the
+//     // median of the largest cluster for the primary and supplementary positions
+//     // as the final genome coordinates of the SV
+//     // IntervalNode* root = nullptr;
+//     std::unique_ptr<IntervalNode> root = nullptr;
+//     for (const auto& entry : primary_map) {
+//         const std::string& qname = entry.first;
+//         const GenomicRegion& region = entry.second;
+//         // root = insert(root, region, qname);
+//         insert(root, region, qname);
+//     }
+//     std::vector<std::vector<std::string>> primary_clusters;
+//     std::set<std::string> processed;
+
+//     for (const auto& entry : primary_map) {
+//         const std::string& qname = entry.first;
+//         if (processed.find(qname) != processed.end()) {
+//             continue;  // Skip already processed primary alignments
+//         }
+//         const GenomicRegion& region = entry.second;
+//         std::vector<std::string> overlap_group;
+//         findOverlaps(root, region, overlap_group);
+//         for (const std::string& qname : overlap_group) {
+//             processed.insert(qname);
+//         }
+//         if (overlap_group.size() > 1) {
+//             primary_clusters.push_back(overlap_group);
+//         }
+//     }
+//     printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
+
+//     // For each primary alignment cluster the supplementary alignment start and
+//     // end positions, keeping the median of the largest cluster
+//     // std::vector<SVCall> sv_candidates;
+//     int current_group = 0;
+//     int min_length = 2000;
+//     int max_length = 1000000;
+//     for (const auto& primary_cluster : primary_clusters) {
+//         // Determine if the primary alignments are mostly on opposite strands to
+//         // the corresponding supplementary alignments (potential inversions)
+//         bool inversion = false;
+//         for (const std::string& qname : primary_cluster) {
+//             const std::vector<GenomicRegion>& supp_alns = supp_map[qname];
+//             int num_supp = (int) supp_alns.size();
+//             int num_opposite_strand = 0;
+//             for (const GenomicRegion& supp_aln : supp_alns) {
+//                 // Opposite-strand alignment on the same chromosome
+//                 // (Since the iterator is single-chromosome, this is the case)
+//                 if (supp_aln.strand != primary_map[qname].strand) {
+//                     num_opposite_strand++;
+//                 }
+//             }
+//             if (static_cast<double>(num_opposite_strand) / static_cast<double>(num_supp) > 0.5) {
+//                 inversion = true;
+//             }
+//         }
+
+//         // Use DBSCAN to cluster primary alignment start, end positions
+//         DBSCAN1D dbscan(100, 5);
+//         current_group++;
+//         std::vector<int> starts;
+//         std::vector<int> ends;
+//         std::vector<bool> primary_strands;
+//         for (const std::string& qname : primary_cluster) {
+//             const GenomicRegion& region = primary_map[qname];
+//             starts.push_back(region.start);
+//             ends.push_back(region.end);
+//             primary_strands.push_back(region.strand);
+//         }
+
+//         // Get the largest cluster of primary alignment start positions
+//         dbscan.fit(starts);
+//         std::vector<int> primary_start_cluster = dbscan.getLargestCluster(starts);
+
+//         // Get the largest cluster of primary alignment end positions
+//         dbscan.fit(ends);
+//         std::vector<int> primary_end_cluster = dbscan.getLargestCluster(ends);
+
+//         // Continue if no clusters were found
+//         if (primary_start_cluster.empty() && primary_end_cluster.empty()) {
+//             continue;
+//         }
+
+//         // Get the supplementary alignment positions
+//         std::vector<int> supp_starts;
+//         std::vector<int> supp_ends;
+//         std::vector<bool> supp_strands;
+//         for (const std::string& qname : primary_cluster) {
+//             const std::vector<GenomicRegion>& regions = supp_map[qname];
+//             for (const GenomicRegion& region : regions) {
+//                 supp_starts.push_back(region.start);
+//                 supp_ends.push_back(region.end);
+//                 supp_strands.push_back(region.strand);
+//             }
+//         }
+
+//         // Get the largest cluster of supplementary alignment start positions
+//         dbscan.fit(supp_starts);
+//         std::vector<int> supp_start_cluster = dbscan.getLargestCluster(supp_starts);
+
+//         // Get the largest cluster of supplementary alignment end positions
+//         dbscan.fit(supp_ends);
+//         std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
+
+//         // Continue if no clusters were found
+//         if (supp_start_cluster.empty() && supp_end_cluster.empty()) {
+//             continue;
+//         }
+
+//         // Use the median of the largest cluster of primary and supplementary
+//         // alignment start, end positions as the final genome coordinates of the
+//         // SV
+//         int primary_pos = -1;
+//         int primary_pos2 = -1;
+//         int primary_cluster_size = 0;
+//         if (primary_start_cluster.size() > primary_end_cluster.size()) {
+//             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+//             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+//             primary_cluster_size = primary_start_cluster.size();
+//         } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
+//             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+//             primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
+//             primary_cluster_size = primary_end_cluster.size();
+//         } else {
+//             // Use both positions
+//             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+//             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+//             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+//             primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
+//             primary_cluster_size = primary_start_cluster.size();
+//         }
+
+//         // Get the supplementary alignment positions
+//         int supp_pos = -1;
+//         int supp_pos2 = -1;
+//         int supp_cluster_size = 0;
+//         if (supp_start_cluster.size() > supp_end_cluster.size()) {
+//             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+//             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+//             supp_cluster_size = supp_start_cluster.size();
+//         } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
+//             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+//             supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
+//             supp_cluster_size = supp_end_cluster.size();
+//         } else {
+//             // Use both positions. This has been shown to occur in nested SVs
+//             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+//             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+//             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+//             supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
+//             supp_cluster_size = supp_start_cluster.size();
+//         }
+
+//         // If two of either were found, use the larger SV candidate
+//         if (primary_pos2 != -1) {
+//             int sv_length1 = std::abs(primary_pos - supp_pos);
+//             int sv_length2 = std::abs(primary_pos2 - supp_pos);
+//             if (sv_length2 > sv_length1) {
+//                 primary_pos = primary_pos2;
+//             }
+//         }
+//         if (supp_pos2 != -1) {
+//             int sv_length1 = std::abs(primary_pos - supp_pos);
+//             int sv_length2 = std::abs(primary_pos - supp_pos2);
+//             if (sv_length2 > sv_length1) {
+//                 supp_pos = supp_pos2;
+//             }
+//         }
+
+//         if (primary_pos == -1 || supp_pos == -1) {
+//             continue;
+//         }
+
+//         // Store the SV candidate if the length is within the specified range
+//         int sv_start = std::min(primary_pos, supp_pos);
+//         int sv_end = std::max(primary_pos, supp_pos);
+//         int sv_length = sv_end - sv_start + 1;
+//         int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
+        
+//         // Determine the SV type
+//         SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
+//         if (sv_length >= min_length && sv_length <= max_length) {
+//             SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+//             addSVCall(sv_calls, sv_candidate);
+//             // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
+//         }
+//     }
+
+//     // Combine SVs with identical start and end positions, and sum the cluster
+//     // sizes
+//     std::vector<SVCall> combined_sv_calls;
+//     std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+//         return a.start < b.start || (a.start == b.start && a.end < b.end);
+//     });
+//     int merge_count = 0;
+//     for (size_t i = 0; i < sv_calls.size(); i++) {
+//         SVCall& sv_call = sv_calls[i];
+//         if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) {
+//             sv_calls[i - 1].cluster_size += sv_call.cluster_size;
+//             merge_count++;
+//         } else {
+//             combined_sv_calls.push_back(sv_call);
+//         }
+//     }
+//     sv_calls = std::move(combined_sv_calls);
+
+//     // if (merge_count > 0) {
+//     //     printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
+//     // }
+// }
+
+void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data)
 {
-    std::unordered_map<std::string, GenomicRegion> primary_map;
-    std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+    // Open the BAM file
+    std::string bam_filepath = input_data.getLongReadBam();
+    samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
+    if (!fp_in) {
+        printError("ERROR: failed to open " + bam_filepath);
+        return;
+    }
+
+    // Set maximum thread count
+    int thread_count = input_data.getThreadCount();
+    hts_set_threads(fp_in, thread_count);
+    printMessage("Using " + std::to_string(thread_count) + " threads for split read analysis");
+
+    // Load the header
+    bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
+    if (!bamHdr) {
+        sam_close(fp_in);
+        printError("ERROR: failed to read header from " + bam_filepath);
+        return;
+    }
+
+    // Load the index
+    hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
+    if (!idx) {
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        printError("ERROR: failed to load index for " + bam_filepath);
+        return;
+    }
+    BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
+
+    // Alignment data structures
+    // std::unordered_map<std::string, GenomicRegion> primary_map;
+    // std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
+    std::unordered_map<int, std::unordered_map<std::string, PrimaryAlignment>> primary_map;  // TID-> qname -> primary alignment
+    std::unordered_map<std::string, std::vector<SuppAlignment>> supp_map;  // qname -> supplementary alignment
 
-    // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
     if (!bam1) {
         printError("ERROR: failed to initialize BAM record");
         return;
     }
-    hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
-    if (!itr) {
-        bam_destroy1(bam1);
-        printError("ERROR: failed to query region " + region);
-        return;
+    
+    // Set the region to the whole genome, or a user-specified chromosome
+    hts_itr_t *itr = nullptr;
+    if (input_data.isSingleChr()) {
+        std::string chr = input_data.getChromosome();
+        itr = sam_itr_querys(idx, bamHdr, chr.c_str());
+        if (!itr) {
+            bam_destroy1(bam1);
+            printError("ERROR: failed to create iterator for " + chr);
+            return;
+        }
+    } else {
+        itr = sam_itr_queryi(idx, HTS_IDX_START, 0, 0);
+        if (!itr) {
+            bam_destroy1(bam1);
+            printError("ERROR: failed to create iterator for the whole genome");
+            return;
+        }
     }
 
     uint32_t primary_count = 0;
     uint32_t supplementary_count = 0;
 
     // Main loop to process the alignments
+    printMessage("Processing alignments from " + bam_filepath);
     uint32_t num_alignments = 0;
+    std::unordered_set<int> alignment_tids;  // All unique chromosome IDs
+    std::unordered_set<std::string> supp_qnames;  // All unique query names
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
@@ -97,252 +423,404 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam
             continue;
         }
         const std::string qname = bam_get_qname(bam1);  // Query template name
-        uint8_t mapq = bam1->core.qual;  // Mapping quality
 
         // Process primary alignments
         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
             // Store chromosome (TID), start, and end positions (1-based) of the
-            // primary alignment, and the strand (true for forward, false for reverse)
-            primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0};
+            // primary alignment, its start and end positions on the read, and
+            // the strand (true for forward, false for reverse)
+            std::pair<int, int> qpos = getAlignmentReadPositions(bam1);
+
+            primary_map[bam1->core.tid][qname] = PrimaryAlignment{bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0};
+            // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0};
+            // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0};
+            alignment_tids.insert(bam1->core.tid);
             primary_count++;
 
         // Process supplementary alignments
         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
             // Store chromosome (TID), start, and end positions (1-based) of the
-            // supplementary alignment, and the strand (true for forward, false for reverse)
-            supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0});
+            // supplementary alignment, its start and end positions on the read,
+            // and the strand (true for forward, false for reverse)
+            std::pair<int, int> qpos = getAlignmentReadPositions(bam1);
+            supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0});
+            // supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0});
+            alignment_tids.insert(bam1->core.tid);
+            supp_qnames.insert(qname);
             supplementary_count++;
         }
         num_alignments++;
+
+        if (num_alignments % 1000000 == 0) {
+            printMessage("Processed " + std::to_string(num_alignments) + " alignments");
+        }
     }
 
     // Remove primary alignments without supplementary alignments
-    std::vector<std::string> to_remove;
-    for (const auto& entry : primary_map) {
-        const std::string& qname = entry.first;
-        if (supp_map.find(qname) == supp_map.end()) {
-            to_remove.push_back(qname);
+    std::unordered_map<int, std::unordered_set<std::string>> to_remove;
+    for (auto& chr_primary : primary_map) {
+        // Collect the qnames on this chromosome that have no supplementary alignments
+        std::unordered_set<std::string> qnames;
+        for (const auto& entry : chr_primary.second) {
+            if (supp_qnames.find(entry.first) == supp_qnames.end()) {
+                to_remove[chr_primary.first].insert(entry.first);
+            }
         }
     }
-    for (const std::string& qname : to_remove) {
-        primary_map.erase(qname);
-    }
 
-    // Clean up the iterator and alignment
-    hts_itr_destroy(itr);
-    bam_destroy1(bam1);
-    printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
-
-    // Identify overlapping primary alignments and then cluster their primary
-    // start, end vs. supplementary alignment start, end positions, keeping the
-    // median of the largest cluster for the primary and supplementary positions
-    // as the final genome coordinates of the SV
-    // IntervalNode* root = nullptr;
-    std::unique_ptr<IntervalNode> root = nullptr;
-    for (const auto& entry : primary_map) {
-        const std::string& qname = entry.first;
-        const GenomicRegion& region = entry.second;
-        // root = insert(root, region, qname);
-        insert(root, region, qname);
+    int total_removed = 0;
+    for (auto& chr_primary : primary_map) {
+        // Remove the qnames from the primary map
+        total_removed += to_remove[chr_primary.first].size();
+        for (const auto& qname : to_remove[chr_primary.first]) {
+            chr_primary.second.erase(qname);
+        }
     }
-    std::vector<std::vector<std::string>> primary_clusters;
-    std::set<std::string> processed;
+    printMessage("Removed " + std::to_string(total_removed) + " primary alignments without supplementary alignments");
+
+    // std::vector<std::string> to_remove;
+    // for (const auto& entry : primary_map) {
+    //     const std::string& qname = entry.first;
+    //     if (supp_map.find(qname) == supp_map.end()) {
+    //         to_remove.push_back(qname);
+    //     }
+    // }
+    // for (const std::string& qname : to_remove) {
+    //     primary_map.erase(qname);
+    // }
 
-    for (const auto& entry : primary_map) {
-        const std::string& qname = entry.first;
-        if (processed.find(qname) != processed.end()) {
-            continue;  // Skip already processed primary alignments
-        }
-        const GenomicRegion& region = entry.second;
-        std::vector<std::string> overlap_group;
-        findOverlaps(root, region, overlap_group);
-        for (const std::string& qname : overlap_group) {
-            processed.insert(qname);
+
+    for (const auto& chr_primary : primary_map) {
+        int primary_tid = chr_primary.first;
+        std::string chr_name = bamHdr->target_name[primary_tid];
+        printMessage("Processing chromosome " + chr_name + " with " + std::to_string(chr_primary.second.size()) + " primary alignments");
+
+        std::vector<SVCall> chr_sv_calls;
+
+        // std::unordered_map<int, std::unordered_map<std::string, PrimaryAlignment>> primary_map;  // TID-> qname -> primary alignment
+        // const std::unordered_map<std::string, std::vector<PrimaryAlignment>>&
+        // chr_primary_map = chr_primary.second;
+        const std::unordered_map<std::string, PrimaryAlignment>& chr_primary_map = chr_primary.second;
+
+        // Identify overlapping primary alignments and then cluster their primary
+        // start, end vs. supplementary alignment start, end positions, keeping the
+        // median of the largest cluster for the primary and supplementary positions
+        // as the final genome coordinates of the SV
+        // IntervalNode* root = nullptr;
+        std::unique_ptr<IntervalNode> root = nullptr;
+        for (const auto& entry : chr_primary_map) {
+            const std::string& qname = entry.first;
+            const PrimaryAlignment& region = entry.second;
+            insert(root, region, qname);
         }
-        if (overlap_group.size() > 1) {
-            primary_clusters.push_back(overlap_group);
+
+        std::vector<std::vector<std::string>> primary_clusters;
+        std::set<std::string> processed;
+        for (const auto& entry : chr_primary_map) {
+            const std::string& qname = entry.first;
+            if (processed.find(qname) != processed.end()) {
+                continue;  // Skip already processed primary alignments
+            }
+            const PrimaryAlignment& primary_aln = entry.second;
+            std::vector<std::string> overlap_group;
+            findOverlaps(root, primary_aln, overlap_group);
+            for (const std::string& qname : overlap_group) {
+                processed.insert(qname);
+            }
+            if (overlap_group.size() > 1) {
+                primary_clusters.push_back(overlap_group);
+            }
         }
-    }
-    printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
-
-    // For each primary alignment cluster the supplementary alignment start and
-    // end positions, keeping the median of the largest cluster
-    // std::vector<SVCall> sv_candidates;
-    int current_group = 0;
-    int min_length = 2000;
-    int max_length = 1000000;
-    for (const auto& primary_group : primary_clusters) {
-        // Determine if the primary alignments are mostly on opposite strands to
-        // the corresponding supplementary alignments (potential inversions)
-        bool inversion = false;
-        for (const std::string& qname : primary_group) {
-            const std::vector<GenomicRegion>& regions = supp_map[qname];
-            int num_supp = (int) regions.size();
-            int num_opposite_strand = 0;
-            for (const GenomicRegion& region : regions) {
-                if (region.strand != primary_map[qname].strand) {
-                    num_opposite_strand++;
+        printMessage(chr_name + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
+
+        // For each group of overlapping primary alignments, cluster the supplementary
+        // alignment start and end positions, keeping the median of the largest cluster
+        int current_group = 0;
+        int min_length = 2000;
+        int max_length = 1000000;
+        for (const auto& primary_cluster : primary_clusters) {
+            // Determine if the primary alignments are mostly on opposite strands to
+            // the corresponding supplementary alignments (potential inversions)
+            bool inversion = false;
+            int num_primary = (int) primary_cluster.size();
+            int num_supp_opposite_strand = 0;
+            for (const std::string& qname : primary_cluster) {
+                const std::vector<SuppAlignment>& supp_alns = supp_map[qname];
+                bool primary_strand = chr_primary_map.at(qname).strand;
+                bool has_opposite_strand = false;
+                for (const SuppAlignment& supp_aln : supp_alns) {
+                    // Analyze if on the same chromosome
+                    if (supp_aln.tid == primary_tid && supp_aln.strand != primary_strand) {
+                        has_opposite_strand = true;
+                    }
+                }
+                if (has_opposite_strand) {
+                    num_supp_opposite_strand++;
                 }
             }
-            if (static_cast<double>(num_opposite_strand) / static_cast<double>(num_supp) > 0.5) {
+            if (static_cast<double>(num_supp_opposite_strand) / static_cast<double>(num_primary) > 0.5) {
                 inversion = true;
             }
-        }
 
-        // Use DBSCAN to cluster primary alignment start, end positions
-        DBSCAN1D dbscan(100, 5);
-        current_group++;
-        std::vector<int> starts;
-        std::vector<int> ends;
-        std::vector<bool> primary_strands;
-        for (const std::string& qname : primary_group) {
-            const GenomicRegion& region = primary_map[qname];
-            starts.push_back(region.start);
-            ends.push_back(region.end);
-            primary_strands.push_back(region.strand);
-        }
+            // Use DBSCAN to cluster primary alignment start, end positions
+            DBSCAN1D dbscan(100, 5);
+            current_group++;
+            std::vector<int> starts;
+            std::vector<int> ends;
+            std::vector<bool> primary_strands;
+            for (const std::string& qname : primary_cluster) {
+                const PrimaryAlignment& primary_aln = chr_primary_map.at(qname);
+                starts.push_back(primary_aln.start);
+                ends.push_back(primary_aln.end);
+                primary_strands.push_back(primary_aln.strand);
+            }
 
-        // Get the largest cluster of primary alignment start positions
-        dbscan.fit(starts);
-        std::vector<int> primary_start_cluster = dbscan.getLargestCluster(starts);
+            // Get the largest cluster of primary alignment start positions
+            dbscan.fit(starts);
+            std::vector<int> primary_start_cluster = dbscan.getLargestCluster(starts);
 
-        // Get the largest cluster of primary alignment end positions
-        dbscan.fit(ends);
-        std::vector<int> primary_end_cluster = dbscan.getLargestCluster(ends);
+            // Get the largest cluster of primary alignment end positions
+            dbscan.fit(ends);
+            std::vector<int> primary_end_cluster = dbscan.getLargestCluster(ends);
 
-        // Continue if no clusters were found
-        if (primary_start_cluster.empty() && primary_end_cluster.empty()) {
-            continue;
-        }
+            // Continue if no clusters were found
+            if (primary_start_cluster.empty() && primary_end_cluster.empty()) {
+                continue;
+            }
 
-        // Get the supplementary alignment positions
-        std::vector<int> supp_starts;
-        std::vector<int> supp_ends;
-        std::vector<bool> supp_strands;
-        for (const std::string& qname : primary_group) {
-            const std::vector<GenomicRegion>& regions = supp_map[qname];
-            for (const GenomicRegion& region : regions) {
-                supp_starts.push_back(region.start);
-                supp_ends.push_back(region.end);
-                supp_strands.push_back(region.strand);
+            // Get the supplementary alignment positions, and also the distances
+            // between the primary and supplementary alignments on the read
+            std::vector<int> supp_starts;
+            std::vector<int> supp_ends;
+            std::vector<bool> supp_strands;
+            std::vector<int> split_distances;
+            for (const std::string& qname : primary_cluster) {
+                const PrimaryAlignment& primary_aln = chr_primary_map.at(qname);
+                const std::vector<SuppAlignment>& supp_alns = supp_map.at(qname);
+                for (const SuppAlignment& supp_aln : supp_alns) {
+                    if (supp_aln.tid == primary_tid) {
+                        // Same chromosome
+                        int distance = 0;
+                        supp_starts.push_back(supp_aln.start);
+                        supp_ends.push_back(supp_aln.end);
+                        supp_strands.push_back(supp_aln.strand);
+
+                        // Calculate the distance between the primary and supplementary
+                        // alignments on the read if on the same chromosome and same
+                        // strand
+                        if (supp_aln.strand == primary_aln.strand) {
+                            // Same strand
+                            // Calculate distance (negative if overlapping)
+                            if (primary_aln.query_start <= supp_aln.query_start) {
+                                distance = supp_aln.query_start - primary_aln.query_end;
+                            } else {
+                                distance = primary_aln.query_start - supp_aln.query_end;
+                            }
+                            split_distances.push_back(distance);
+                        } else {
+                            // TODO: INVERSIONS                       
+                        }
+                    } else {
+                        // TODO: TRANSLOCATIONS
+                    }
+                }
             }
-        }
 
-        // Get the largest cluster of supplementary alignment start positions
-        dbscan.fit(supp_starts);
-        std::vector<int> supp_start_cluster = dbscan.getLargestCluster(supp_starts);
+            // Get the largest cluster of supplementary alignment start positions
+            dbscan.fit(supp_starts);
+            std::vector<int> supp_start_cluster = dbscan.getLargestCluster(supp_starts);
 
-        // Get the largest cluster of supplementary alignment end positions
-        dbscan.fit(supp_ends);
-        std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
+            // Get the largest cluster of supplementary alignment end positions
+            dbscan.fit(supp_ends);
+            std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
 
-        // Continue if no clusters were found
-        if (supp_start_cluster.empty() && supp_end_cluster.empty()) {
-            continue;
-        }
+            // Get the largest cluster of split distances
+            dbscan.fit(split_distances);
+            std::vector<int> split_distance_cluster = dbscan.getLargestCluster(split_distances);
+            // printMessage("Found " + std::to_string(split_distance_cluster.size()) + " split distances (cluster size)");
 
-        // Use the median of the largest cluster of primary and supplementary
-        // alignment start, end positions as the final genome coordinates of the
-        // SV
-        int primary_pos = -1;
-        int primary_pos2 = -1;
-        int primary_cluster_size = 0;
-        if (primary_start_cluster.size() > primary_end_cluster.size()) {
-            std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-            primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
-            primary_cluster_size = primary_start_cluster.size();
-        } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
-            std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-            primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
-            primary_cluster_size = primary_end_cluster.size();
-        } else {
-            // Use both positions
-            std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-            std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-            primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
-            primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
-            primary_cluster_size = primary_start_cluster.size();
-        }
+            // Continue if no clusters were found
+            // if (supp_start_cluster.empty() && supp_end_cluster.empty()) {
+            if (supp_start_cluster.empty() && supp_end_cluster.empty() && split_distance_cluster.empty()) {
+                continue;
+            }
 
-        // Get the supplementary alignment positions
-        int supp_pos = -1;
-        int supp_pos2 = -1;
-        int supp_cluster_size = 0;
-        if (supp_start_cluster.size() > supp_end_cluster.size()) {
-            std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-            supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
-            supp_cluster_size = supp_start_cluster.size();
-        } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
-            std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-            supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
-            supp_cluster_size = supp_end_cluster.size();
-        } else {
-            // Use both positions. This has been shown to occur in nested SVs
-            std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-            std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-            supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
-            supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
-            supp_cluster_size = supp_start_cluster.size();
-        }
+            // Use the median of the largest cluster of primary and supplementary
+            // alignment start, end positions as the final genome coordinates of the
+            // SV
+            int primary_pos = -1;
+            int primary_pos2 = -1;
+            int primary_cluster_size = 0;
+            if (primary_start_cluster.size() > primary_end_cluster.size()) {
+                std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+                primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+                primary_cluster_size = primary_start_cluster.size();
+            } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
+                std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+                primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
+                primary_cluster_size = primary_end_cluster.size();
+            } else {
+                // Use both positions
+                std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+                std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+                primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+                primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
+                primary_cluster_size = primary_start_cluster.size();
+            }
 
-        // If two of either were found, use the larger SV candidate
-        if (primary_pos2 != -1) {
-            int sv_length1 = std::abs(primary_pos - supp_pos);
-            int sv_length2 = std::abs(primary_pos2 - supp_pos);
-            if (sv_length2 > sv_length1) {
-                primary_pos = primary_pos2;
+            // -------------------------------
+            // SPLIT INSERTION DETECTION
+            int read_distance = 0;
+            if (!split_distance_cluster.empty()) {
+                // Use the median of the largest cluster of split distances as the
+                // insertion size
+                std::sort(split_distance_cluster.begin(), split_distance_cluster.end());
+                read_distance = split_distance_cluster[split_distance_cluster.size() / 2];
+
+                // Add an insertion SV call at the primary position
+                if (primary_pos != -1 && read_distance > 2000) {
+                    if (primary_pos2 != -1) {
+                        // If two positions were found, use the 5'most position
+                        primary_pos = std::min(primary_pos, primary_pos2);
+                    }
+                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size);
+                    addSVCall(chr_sv_calls, sv_candidate);
+                    printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group));
+                    // continue;
+                }
             }
-        }
-        if (supp_pos2 != -1) {
-            int sv_length1 = std::abs(primary_pos - supp_pos);
-            int sv_length2 = std::abs(primary_pos - supp_pos2);
-            if (sv_length2 > sv_length1) {
-                supp_pos = supp_pos2;
+
+            // TODO: After this classify deletions if negative (keep the rest
+            // the same)
+
+            // --------------------------------
+
+            // Get the supplementary alignment positions
+            int supp_pos = -1;
+            int supp_pos2 = -1;
+            int supp_cluster_size = 0;
+            int supp_best_start = -1;
+            int supp_best_end = -1;
+            if (!supp_start_cluster.empty() && !supp_end_cluster.empty()) {
+                std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+                int supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2];
+                std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+                int supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2];
+                if (supp_start_cluster.size() > supp_end_cluster.size()) {
+                    // std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+                    // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+                    supp_pos = supp_best_start;
+                    supp_cluster_size = supp_start_cluster.size();
+                } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
+                    // std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+                    // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
+                    supp_pos = supp_best_end;
+                    supp_cluster_size = supp_end_cluster.size();
+                } else {
+                    // Use both positions. This has been shown to occur in nested SVs
+                    // std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+                    // std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+                    // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+                    // supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
+                    supp_pos = supp_best_start;
+                    supp_pos2 = supp_best_end;
+                    supp_cluster_size = supp_start_cluster.size();
+                }
+
+                // Store the inversion as the supplementary start and end positions
+                if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
+                    SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "<INV>", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size);
+                    addSVCall(chr_sv_calls, sv_candidate);
+                    printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group));
+                }
             }
-        }
 
-        if (primary_pos == -1 || supp_pos == -1) {
-            continue;
-        }
+            // If two of either were found, use the larger SV candidate
+            if (primary_pos2 != -1) {
+                int sv_length1 = std::abs(primary_pos - supp_pos);
+                int sv_length2 = std::abs(primary_pos2 - supp_pos);
+                if (sv_length2 > sv_length1) {
+                    primary_pos = primary_pos2;
+                }
+            }
+            if (supp_pos2 != -1) {
+                int sv_length1 = std::abs(primary_pos - supp_pos);
+                int sv_length2 = std::abs(primary_pos - supp_pos2);
+                if (sv_length2 > sv_length1) {
+                    supp_pos = supp_pos2;
+                }
+            }
 
-        // Store the SV candidate if the length is within the specified range
-        int sv_start = std::min(primary_pos, supp_pos);
-        int sv_end = std::max(primary_pos, supp_pos);
-        int sv_length = sv_end - sv_start + 1;
-        int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
-        SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
-        if (sv_length >= min_length && sv_length <= max_length) {
-            SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
-            addSVCall(sv_calls, sv_candidate);
-            // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
+            if (primary_pos == -1 || supp_pos == -1) {
+                continue;
+            }
+
+            // Store the SV candidate if the length is within the specified range
+            int sv_start = std::min(primary_pos, supp_pos);
+            int sv_end = std::max(primary_pos, supp_pos);
+            int sv_length = sv_end - sv_start + 1;
+            int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
+
+            // If the read distance is < 30bp while the SV is > 2kb, then this is a
+            // potential deletion
+            if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
+                printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group));
+                // printMessage("Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group));
+                // continue;
+                SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size);
+                addSVCall(chr_sv_calls, sv_candidate);
+            }
+
+            // Add a dummy SV call for CNV detection
+            else if (sv_length >= min_length && sv_length <= max_length) {
+                SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+                addSVCall(chr_sv_calls, sv_candidate);
+            }
+            
+            // Determine the SV type
+            // SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
+            // if (sv_length >= min_length && sv_length <= max_length) {
+            //     SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+            //     addSVCall(chr_sv_calls, sv_candidate);
+            //     // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
+            // }
         }
-    }
+        // Combine SVs with identical start and end positions, and sum the cluster
+        // sizes
+        printMessage("Combining SVs with identical start and end positions");
+        std::vector<SVCall> combined_sv_calls;
+        std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+            return a.start < b.start || (a.start == b.start && a.end < b.end);
+        });
+        int merge_count = 0;
+        for (size_t i = 0; i < chr_sv_calls.size(); i++) {
+            SVCall& sv_call = chr_sv_calls[i];
+            // SVCall& sv_call = sv_calls[i];
+            if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start && sv_call.end == chr_sv_calls[i - 1].end) {
+                chr_sv_calls[i - 1].cluster_size += sv_call.cluster_size;
+                merge_count++;
+            } else {
+                combined_sv_calls.push_back(sv_call);
+            }
+        }
+
+        // Add the combined SV calls to the main vector
+        sv_calls[chr_name] = std::move(combined_sv_calls);
 
-    // Combine SVs with identical start and end positions, and sum the cluster
-    // sizes
-    std::vector<SVCall> combined_sv_calls;
-    std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-        return a.start < b.start || (a.start == b.start && a.end < b.end);
-    });
-    int merge_count = 0;
-    for (size_t i = 0; i < sv_calls.size(); i++) {
-        SVCall& sv_call = sv_calls[i];
-        if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) {
-            sv_calls[i - 1].cluster_size += sv_call.cluster_size;
-            merge_count++;
-        } else {
-            combined_sv_calls.push_back(sv_call);
+        // Print the number of SV candidates found, and how many were merged
+        printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates");
+        
+        if (merge_count > 0) {
+            printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
         }
     }
-    sv_calls = std::move(combined_sv_calls);
 
     // if (merge_count > 0) {
     //     printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
     // }
 }
 
-
-void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
+void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -367,7 +845,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr,
 
         // Process the alignment
         bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
-        this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome);
+        this->processCIGARRecord(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome);
     }
 
     // Clean up the iterator and alignment
@@ -404,7 +882,181 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data)
     return mismatch_rate;
 }
 
-void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
+void SVCaller::findSplitReadSVs(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const ReferenceGenome &ref_genome, const InputData& input_data)
+{
+    // Open the BAM file
+    std::string bam_filepath = input_data.getLongReadBam();
+    samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
+    if (!fp_in) {
+        printError("ERROR: failed to open " + bam_filepath);
+        return;
+    }
+
+    // Set maximum thread count
+    int thread_count = input_data.getThreadCount();
+    hts_set_threads(fp_in, thread_count);
+    printMessage("Using " + std::to_string(thread_count) + " threads for split read analysis");
+
+    // Load the header
+    bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
+    if (!bamHdr) {
+        sam_close(fp_in);
+        printError("ERROR: failed to read header from " + bam_filepath);
+        return;
+    }
+
+    // Load the index
+    hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
+    if (!idx) {
+        bam_hdr_destroy(bamHdr);
+        sam_close(fp_in);
+        printError("ERROR: failed to load index for " + bam_filepath);
+        return;
+    }
+    BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
+
+    // Create a whole-genome iterator
+    hts_itr_t *itr = sam_itr_queryi(idx, HTS_IDX_START, 0, 0);
+    if (!itr) {
+        printError("ERROR: failed to query the whole genome");
+        return;
+    }
+
+    // Process the alignments
+    std::unordered_map<std::string, SplitSignature> primary_map;
+    std::unordered_map<std::string, std::vector<SplitSignature>> supp_map;
+    bam1_t *bam1 = bam_init1();
+    if (!bam1) {
+        printError("ERROR: failed to initialize BAM record");
+        return;
+    }
+    uint32_t primary_count = 0;
+    uint32_t supplementary_count = 0;
+    uint32_t num_alignments = 0;
+    printMessage("Processing split read alignment records...");
+    while (readNextAlignment(fp_in, itr, bam1) >= 0) {
+
+        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
+        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
+            continue;
+        }
+        const std::string qname = bam_get_qname(bam1);  // Query template name
+
+        // Process primary alignments
+        if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
+
+            // Get the start and end positions in the read sequence
+            uint32_t query_start = 0;
+            uint32_t query_end = 0;
+            uint32_t* cigar = bam_get_cigar(bam1);
+            int cigar_len = bam1->core.n_cigar;
+            for (int i = 0; i < cigar_len; i++) {
+                int op_len = bam_cigar_oplen(cigar[i]);
+                int op = bam_cigar_op(cigar[i]);
+
+                if (i == 0 && op == BAM_CSOFT_CLIP) {
+                    query_start = op_len;
+                }
+                
+                // https://github.com/samtools/htslib/blob/develop/htslib/sam.h:
+                // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
+                if (bam_cigar_type(op) & 1) {
+                    query_end += op_len;
+                }
+            }
+
+            // Store the SV signature
+            primary_map[qname] = SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end};
+            primary_count++;
+
+        // Process supplementary alignments
+        } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
+            // Get the start and end positions in the read sequence
+            uint32_t query_start = 0;
+            uint32_t query_end = 0;
+            uint32_t* cigar = bam_get_cigar(bam1);
+            int cigar_len = bam1->core.n_cigar;
+            for (int i = 0; i < cigar_len; i++) {
+                int op_len = bam_cigar_oplen(cigar[i]);
+                int op = bam_cigar_op(cigar[i]);
+
+                if (i == 0 && op == BAM_CSOFT_CLIP) {
+                    query_start = op_len;
+                }
+                
+                // https://github.com/samtools/htslib/blob/develop/htslib/sam.h:
+                // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
+                if (bam_cigar_type(op) & 1) {
+                    query_end += op_len;
+                }
+            }
+
+            // Store the SV signature
+            supp_map[qname].push_back(SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end});
+            supplementary_count++;
+        }
+        num_alignments++;
+
+        if (num_alignments % 100000 == 0) {
+            printMessage("Processed " + std::to_string(num_alignments) + " split read alignment records");
+        }
+    }
+
+    // Remove primary alignments without supplementary alignments
+    std::vector<std::string> to_remove;
+    for (const auto& entry : primary_map) {
+        const std::string& qname = entry.first;
+        if (supp_map.find(qname) == supp_map.end()) {
+            to_remove.push_back(qname);
+        }
+    }
+    for (const std::string& qname : to_remove) {
+        primary_map.erase(qname);
+    }
+
+    // Clean up the iterator and alignment
+    hts_itr_destroy(itr);
+    bam_destroy1(bam1);
+    printMessage("Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
+
+    // Find insertions by comparing the primary vs. supplementary alignment
+    // distances in the read vs. reference genome on the same chromosome
+    int ins_count = 0;
+    std::vector<SVCall> sv_candidates;
+    for (const auto& entry : primary_map) {
+        const std::string& qname = entry.first;
+        const SplitSignature& primary = entry.second;
+        const std::vector<SplitSignature>& supp_alns = supp_map[qname];
+
+        // TODO: Cluster positions for improved performance
+
+        for (const SplitSignature& supp : supp_alns) {
+            if (primary.tid == supp.tid) {
+                int ref_dist = std::abs(primary.start - supp.start);
+                int query_dist = std::abs(primary.query_start - supp.query_start);
+
+                // If the reads are within 100 bp of each other, and the
+                // reference distance is greater than 2kb, then it is likely an
+                // insertion
+                if (query_dist <= 100 && ref_dist >= 2000) {
+                    int sv_start = std::min(primary.start, supp.start);
+                    int sv_end = std::max(primary.start, supp.start);
+                    int sv_length = sv_end - sv_start + 1;
+                    int cluster_size = 1;
+                    printMessage("Found insertion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length));
+                    SVCall sv_candidate(sv_start, sv_end, SVType::INS, "<INS>", "SPLITINS", "./.", 0.0, 0, 0, cluster_size);
+                    std::string chr = bamHdr->target_name[primary.tid];
+                    sv_calls[chr].push_back(sv_candidate);
+                    ins_count++;
+                }
+            }
+        }
+    }
+
+    printMessage("Found " + std::to_string(ins_count) + " insertions");
+}
+
+void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, bool is_primary, const std::vector<uint32_t> &pos_depth_map, const ReferenceGenome &ref_genome)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
@@ -513,11 +1165,31 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec
     }
 }
 
-void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls)
+std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
 {
-    double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
-    int dbscan_min_pts = input_data.getDBSCAN_MinPts();
+    int query_start = 0;
+    int query_end = 0;
+    uint32_t* cigar = bam_get_cigar(alignment);
+    int cigar_len = alignment->core.n_cigar;
+    for (int i = 0; i < cigar_len; i++) {
+        int op_len = bam_cigar_oplen(cigar[i]);
+        int op = bam_cigar_op(cigar[i]);
+
+        if (i == 0 && op == BAM_CSOFT_CLIP) {
+            query_start = op_len;
+        }
+        
+        // https://github.com/samtools/htslib/blob/develop/htslib/sam.h:
+        // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
+        if (bam_cigar_type(op) & 1) {
+            query_end += op_len;
+        }
+    }
+    return std::make_pair(query_start, query_end);
+}
 
+void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls)
+{
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
     samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
@@ -545,40 +1217,31 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     }
     BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
 
-    // Set the region to process
-    std::string region = chr;
-    if (input_data.isRegionSet()) {
-
-        // Use one chunk for the specified region
-        std::pair<int32_t, int32_t> region_data = input_data.getRegion();
-        int region_start = region_data.first;
-        int region_end = region_data.second;
-        region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end);
-    }
-
-    // Estimate DBSCAN minimum points
+    // Get DBSCAN parameters
+    double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
+    int dbscan_min_pts = 5;
     double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct();
     if (dbscan_min_pts_pct > 0.0) {
         dbscan_min_pts = (int)std::ceil(mean_chr_cov * dbscan_min_pts_pct);
         printMessage(chr + ": Mean chr. cov.: " + std::to_string(mean_chr_cov) + " (DBSCAN min. pts.= " + std::to_string(dbscan_min_pts) + ", min. pts. pct.= " + std::to_string(dbscan_min_pts_pct) + ")");
-    }
-
+    } 
 
     // -----------------------------------------------------------------------
-    // Detect SVs from the CIGAR strings
-    printMessage(chr + ": CIGAR SVs...");
-    this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome);
+    // // Detect SVs from the CIGAR strings
+    // printMessage(chr + ": CIGAR SVs...");
+    // this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
-    printMessage(chr + ": Merging CIGAR...");
-    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
+    // printMessage(chr + ": Merging CIGAR...");
+    // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
 
-    int region_sv_count = getSVCount(chr_sv_calls);
-    printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+    // int region_sv_count = getSVCount(chr_sv_calls);
+    // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
 
     // -----------------------------------------------------------------------
-    // Detect SVs from the split reads
-    printMessage(chr + ": Split read SVs...");
-    this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls);
+
+    // // Detect SVs from the split reads
+    // printMessage(chr + ": Split read SVs...");
+    // this->findSplitCNVBreakpoints(fp_in, idx, bamHdr, chr, split_sv_calls);
 }
 
 void SVCaller::run(const InputData& input_data)
@@ -643,77 +1306,95 @@ void SVCaller::run(const InputData& input_data)
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
-    int thread_count = 1;
-    if (!input_data.isSingleChr()) {
-        thread_count = input_data.getThreadCount();
-        std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
-    }
-    ThreadPool pool(thread_count);
+    // int thread_count = 1;
+    // if (!input_data.isSingleChr()) {
+    //     thread_count = input_data.getThreadCount();
+    //     std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
+    // }
+    // ThreadPool pool(thread_count);
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
-    auto process_chr = [&](const std::string& chr) {
-        try {
-            std::vector<SVCall> sv_calls;
-            std::vector<SVCall> split_sv_calls;
-            InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
-            {
-                std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
-                whole_genome_sv_calls[chr] = std::move(sv_calls);
-                whole_genome_split_sv_calls[chr] = std::move(split_sv_calls);
-            }
-            // printMessage("Completed chromosome " + chr);
-        } catch (const std::exception& e) {
-            printError("Error processing chromosome " + chr + ": " + e.what());
-        } catch (...) {
-            printError("Unknown error processing chromosome " + chr);
-        }
-    };
-
-    // Submit tasks to the thread pool and track futures
-    std::vector<std::future<void>> futures;
-    for (const auto& chr : chromosomes) {
-        futures.emplace_back(pool.enqueue([&, chr] {
-            // printMessage("Processing chromosome " + chr);
-            process_chr(chr);
-        }));
-    }
+    // auto process_chr = [&](const std::string& chr) {
+    //     try {
+    //         std::vector<SVCall> sv_calls;
+    //         std::vector<SVCall> split_sv_calls;
+    //         InputData chr_input_data = input_data;  // Use a thread-local copy
+    //         this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
+    //         {
+    //             std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
+    //             whole_genome_sv_calls[chr] = std::move(sv_calls);
+    //             whole_genome_split_sv_calls[chr] = std::move(split_sv_calls);
+    //         }
+    //         // printMessage("Completed chromosome " + chr);
+    //     } catch (const std::exception& e) {
+    //         printError("Error processing chromosome " + chr + ": " + e.what());
+    //     } catch (...) {
+    //         printError("Unknown error processing chromosome " + chr);
+    //     }
+    // };
+
+    // // Submit tasks to the thread pool and track futures
+    // std::vector<std::future<void>> futures;
+    // for (const auto& chr : chromosomes) {
+    //     futures.emplace_back(pool.enqueue([&, chr] {
+    //         // printMessage("Processing chromosome " + chr);
+    //         process_chr(chr);
+    //     }));
+    // }
 
-    // Wait for all tasks to complete
-    int current_chr = 0;
-    int total_chr_count = chromosomes.size();
-    for (auto& future : futures) {
-        try {
-            current_chr++;
-            future.get();
-        } catch (const std::exception& e) {
-            printError("Error processing chromosome task: " + std::string(e.what()));
-        } catch (...) {
-            printError("Unknown error processing chromosome task.");
-        }
-    }
-    printMessage("All tasks have finished.");
+    // // Wait for all tasks to complete
+    // int current_chr = 0;
+    // int total_chr_count = chromosomes.size();
+    // for (auto& future : futures) {
+    //     try {
+    //         current_chr++;
+    //         future.get();
+    //     } catch (const std::exception& e) {
+    //         printError("Error processing chromosome task: " + std::string(e.what()));
+    //     } catch (...) {
+    //         printError("Unknown error processing chromosome task.");
+    //     }
+    // }
+    // printMessage("All tasks have finished.");
 
     // -------------------------------------------------------
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    current_chr = 0;
-    printMessage("Running copy number predictions on CIGAR SVs...");
-    for (auto& entry : whole_genome_sv_calls) {
-        current_chr++;
-        const std::string& chr = entry.first;
-        std::vector<SVCall>& sv_calls = entry.second;
-        if (sv_calls.size() > 0) {
-            // printMessage("Running copy number predictions on " + chr +
-            // "...");
-            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
-        }
-    }
+    // current_chr = 0;
+    // printMessage("Running copy number predictions on CIGAR SVs...");
+    // for (auto& entry : whole_genome_sv_calls) {
+    //     current_chr++;
+    //     const std::string& chr = entry.first;
+    //     std::vector<SVCall>& sv_calls = entry.second;
+    //     if (sv_calls.size() > 0) {
+    //         // printMessage("Running copy number predictions on " + chr +
+    //         // "...");
+    //         printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
+    //         cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+    //     }
+    // }
     // -------------------------------------------------------
 
+    // printMessage("Running copy number predictions on split-read SVs...");
+    // current_chr = 0;
+    // for (auto& entry : whole_genome_split_sv_calls) {
+    //     const std::string& chr = entry.first;
+    //     std::vector<SVCall>& sv_calls = entry.second;
+
+    //     if (sv_calls.size() > 0) {
+    //         current_chr++;
+    //         printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
+    //         this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+    //     }
+    // }
+
+    // Identify split-SV signatures
+    printMessage("Identifying split-SV signatures...");
+    this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
+
     printMessage("Running copy number predictions on split-read SVs...");
-    current_chr = 0;
+    int current_chr = 0;
+    int total_chr_count = whole_genome_split_sv_calls.size();
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
@@ -725,11 +1406,18 @@ void SVCaller::run(const InputData& input_data)
         }
     }
 
+    // Detect inversions, insertions, and translocations from the split read
+    // alignments (no copy number predictions)
+    // printMessage("Detecting inversions, insertions, and translocations from split read alignments...");
+    // std::unordered_map<std::string, std::vector<SVCall>> neutral_sv_calls;
+    // this->findSplitReadSVs(neutral_sv_calls, ref_genome, input_data);
+    
     printMessage("Unifying SVs...");
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
         whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
+        // whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), neutral_sv_calls[chr].begin(), neutral_sv_calls[chr].end());
     }
 
     // Print the total number of SVs detected for each chromosome
@@ -756,54 +1444,48 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     for (const auto& sv_candidate : split_sv_calls) {
         printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "...");
 
-        bool is_inversion = sv_candidate.sv_type == SVType::INV;
+        // bool is_inversion = sv_candidate.sv_type == SVType::INV;
 
         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         std::string genotype = std::get<2>(result);
-        if (supp_type != SVType::UNKNOWN) {
-            if (is_inversion) {
-            	// Add an additional inversion separately
-		        int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-		        std::string alt_allele = "<INV>";
-		        SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-                processed_calls.push_back(sv_call);
-                /*
-                if (supp_type == SVType::DEL) {
-                    supp_type = SVType::INV_DEL;
-                } else if (supp_type == SVType::DUP) {
-                    supp_type = SVType::INV_DUP;
-                } else if (supp_type == SVType::NEUTRAL) {
-                    supp_type = SVType::INV;
-                }
-                */
-            }
+        if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) {
+            // if (is_inversion) {
+            // 	// Add an additional inversion separately
+		    //     int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+		    //     std::string alt_allele = "<INV>";
+		    //     SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+            //     processed_calls.push_back(sv_call);
+            // }
             
-            if (supp_type != SVType::NEUTRAL) {
-                int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-                std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
-                SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-                // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
-                // addSVCall(split_sv_calls, sv_call);
-                processed_calls.push_back(sv_call);
-            }
-        } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) {
-            // Inversion with no CNV prediction
             int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-            std::string alt_allele = "<INV>";
-            SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+            std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
+            SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
             // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
             // addSVCall(split_sv_calls, sv_call);
             processed_calls.push_back(sv_call);
         }
+
+        // } else if (sv_candidate.sv_type == SVType::INV) {
+        //     // SV with no copy number prediction, but is a potential inversion or insertion
+        //     int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+        //     // std::string alt_allele = "<INV>";
+        //     std::string alt_allele = "<" + getSVTypeString(sv_candidate.sv_type) + ">";
+        //     SVCall sv_call(sv_candidate.start, sv_candidate.end, sv_candidate.sv_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
+        //     printMessage("[TEST-SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
+        //     processed_calls.push_back(sv_call);
+        // }
         // if (current_sv % 1000 == 0) {
         //     printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
         // }
     }
 
+    // Insert the copy number predictions back into the split SV calls
+    printMessage("Inserting CNV calls...");
+    split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end());
     // Replace with the processed calls
-    split_sv_calls = std::move(processed_calls);
+    // split_sv_calls = std::move(processed_calls);
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
@@ -1044,8 +1726,3 @@ int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uin
     // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
     return read_depth;
 }
-
-bool SVCaller::regionOverlaps(const GenomicRegion &a, const GenomicRegion &b)
-{
-    return a.tid == b.tid && a.start <= b.end && b.start <= a.end;
-}

From 84d1d6e60b49bcb56dbcd8381b5a0b759af011f2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 13 Mar 2025 16:58:08 -0400
Subject: [PATCH 083/134] fix sv duplicate merging

---
 include/sv_object.h |   3 +
 src/cnv_caller.cpp  |  10 +-
 src/sv_caller.cpp   | 290 +++++++++++++++++++++++++-------------------
 src/sv_object.cpp   |  29 +++++
 4 files changed, 201 insertions(+), 131 deletions(-)

diff --git a/include/sv_object.h b/include/sv_object.h
index fc090166..7c1f5410 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -39,6 +39,9 @@ void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
 
 void mergeSVs(std::vector<SVCall>& sv_calls);
 
+// Merge SVs with identical start positions, and sum the cluster sizes
+void mergeDuplicateSVs(std::vector<SVCall>& sv_calls);
+
 void mergeSVSubsets(std::vector<SVCall>& sv_calls);
 
 void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth);
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 6db7a78b..c23c146b 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -501,7 +501,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
         // Clean up the iterator
         hts_itr_destroy(bam_iter);
 
-        printMessage("Finished reading BAM file, calculating mean chromosome coverage...");
+        // printMessage("Finished reading BAM file, calculating mean chromosome coverage...");
 
         // // Calculate the mean chromosome coverage for positions with non-zero depth
         // uint64_t cum_depth = 0;
@@ -539,13 +539,13 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
             [](uint32_t depth) { return depth > 0; }
         );
 
-        printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count));
-        printMessage("Total depth: " + std::to_string(cum_depth));
+        // printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count));
+        // printMessage("Total depth: " + std::to_string(cum_depth));
 
         double mean_chr_cov = (pos_count > 0) ? static_cast<double>(cum_depth) / static_cast<double>(pos_count) : 0.0;
         chr_mean_cov_map[chr] = mean_chr_cov;
 
-        printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
+        // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
     }
 
     // Clean up
@@ -802,7 +802,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     // Continue if no SNP was found in the region
     if (!snp_found)
     {
-        printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos));
+        // printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos));
         bcf_sr_destroy(snp_reader);
         bcf_sr_destroy(pfb_reader);
         return;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 31b7270a..00a6f0ae 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -531,7 +531,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 primary_clusters.push_back(overlap_group);
             }
         }
-        printMessage(chr_name + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
+        // printMessage(chr_name + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
 
         // For each primary alignment cluster the supplementary alignment start and
         // end positions, keeping the median of the largest cluster
@@ -685,7 +685,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     }
                     SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
-                    printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group));
+                    // printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group));
                     // continue;
                 }
             }
@@ -731,7 +731,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
                     SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "<INV>", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
-                    printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group));
+                    // printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group));
                 }
             }
 
@@ -764,9 +764,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // If the read distance is < 30bp while the SV is > 2kb, then this is a
             // potential deletion
             if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-                printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group));
-                // printMessage("Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group));
-                // continue;
+                // printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group));
                 SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
             }
@@ -785,34 +783,47 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             //     // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
             // }
         }
+
         // Combine SVs with identical start and end positions, and sum the cluster
         // sizes
-        printMessage("Combining SVs with identical start and end positions");
-        std::vector<SVCall> combined_sv_calls;
+        printMessage("Combining SVs with identical start positions");
         std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
             return a.start < b.start || (a.start == b.start && a.end < b.end);
         });
-        int merge_count = 0;
-        for (size_t i = 0; i < chr_sv_calls.size(); i++) {
-            SVCall& sv_call = chr_sv_calls[i];
-            // SVCall& sv_call = sv_calls[i];
-            if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start && sv_call.end == chr_sv_calls[i - 1].end) {
-                chr_sv_calls[i - 1].cluster_size += sv_call.cluster_size;
-                merge_count++;
-            } else {
-                combined_sv_calls.push_back(sv_call);
-            }
-        }
+        
+        // Merge duplicate SV calls with identical start positions
+        mergeDuplicateSVs(chr_sv_calls);
+
+        // int initial_size = chr_sv_calls.size();
+        // std::vector<SVCall> combined_sv_calls;
+        // for (size_t i = 0; i < chr_sv_calls.size(); i++) {
+        //     SVCall& sv_call = chr_sv_calls[i];
+        //     if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start) {
+        //         // Keep the largest cluster size for the same start position
+        //         if (sv_call.cluster_size > chr_sv_calls[i - 1].cluster_size) {
+        //             combined_sv_calls.back() = sv_call;
+        //         }
+
+        //         // Combine cluster sizes
+        //         combined_sv_calls.back().cluster_size += sv_call.cluster_size;
+        //     } else {
+        //         // Add the SV call to the combined list
+        //         combined_sv_calls.push_back(sv_call);
+        //     }
+        // }
+        // int merge_count = initial_size - combined_sv_calls.size();
+        // printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start positions");
 
         // Add the combined SV calls to the main vector
-        sv_calls[chr_name] = std::move(combined_sv_calls);
+        // sv_calls[chr_name] = std::move(combined_sv_calls);
+        sv_calls[chr_name] = std::move(chr_sv_calls);
 
         // Print the number of merged SV calls
         printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates");
         
-        if (merge_count > 0) {
-            printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
-        }
+        // if (merge_count > 0) {
+        //     printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
+        // }
     }
 
     // if (merge_count > 0) {
@@ -1154,6 +1165,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
         // Update the reference position
         // https://samtools.github.io/hts-specs/SAMv1.pdf
+        // if (bam_cigar_type(op) & 2) {
+        //     // bit 2: consume reference
+        //     ref_pos += op_len;
+        // }
         if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             pos += op_len;
         }
@@ -1167,7 +1182,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
 std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
 {
-    int query_start = 0;
+    int query_start = -1;
     int query_end = 0;
     uint32_t* cigar = bam_get_cigar(alignment);
     int cigar_len = alignment->core.n_cigar;
@@ -1175,16 +1190,26 @@ std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
         int op_len = bam_cigar_oplen(cigar[i]);
         int op = bam_cigar_op(cigar[i]);
 
-        if (i == 0 && op == BAM_CSOFT_CLIP) {
-            query_start = op_len;
+        // if (i == 0 && op == BAM_CSOFT_CLIP) {
+        //     query_start = op_len;
+        // }
+        // Set the query start position to the first non-soft clip operation
+        if (query_start == -1 && (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CEQUAL || op == BAM_CDIFF)) {
+            query_start = query_end;  // First valid query position
         }
         
         // https://github.com/samtools/htslib/blob/develop/htslib/sam.h:
         // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
-        if (bam_cigar_type(op) & 1) {
+        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             query_end += op_len;
         }
     }
+
+    if (query_start == -1) {
+        // If no valid query start position was found, set it to 0
+        query_start = 0;
+    }
+
     return std::make_pair(query_start, query_end);
 }
 
@@ -1227,21 +1252,15 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v
     } 
 
     // -----------------------------------------------------------------------
-    // // Detect SVs from the CIGAR strings
-    // printMessage(chr + ": CIGAR SVs...");
-    // this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome);
-
-    // printMessage(chr + ": Merging CIGAR...");
-    // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
+    // Detect SVs from the CIGAR strings
+    printMessage(chr + ": CIGAR SVs...");
+    this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
-    // int region_sv_count = getSVCount(chr_sv_calls);
-    // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count));
+    printMessage(chr + ": Merging CIGAR...");
+    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
 
-    // -----------------------------------------------------------------------
-
-    // // Detect SVs from the split reads
-    // printMessage(chr + ": Split read SVs...");
-    // this->findSplitCNVBreakpoints(fp_in, idx, bamHdr, chr, split_sv_calls);
+    int region_sv_count = getSVCount(chr_sv_calls);
+    printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string");
 }
 
 void SVCaller::run(const InputData& input_data)
@@ -1250,7 +1269,6 @@ void SVCaller::run(const InputData& input_data)
     printMessage("Loading the reference genome...");
     const std::string ref_filepath = input_data.getRefGenome();
     std::shared_mutex ref_mutex;  // Dummy mutex (remove later)
-    // ReferenceGenome ref_genome(this->shared_mutex);
     ReferenceGenome ref_genome(ref_mutex);
     ref_genome.setFilepath(ref_filepath);
 
@@ -1260,7 +1278,6 @@ void SVCaller::run(const InputData& input_data)
         // Get the chromosome from the user input argument
         chromosomes.push_back(input_data.getChromosome());
     } else {
-        // chromosomes = ref_genome.getChromosomes();
         // Get the chromosomes from the input BAM file
         chromosomes = this->getChromosomes(input_data.getLongReadBam());
     }
@@ -1306,95 +1323,80 @@ void SVCaller::run(const InputData& input_data)
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
-    // int thread_count = 1;
-    // if (!input_data.isSingleChr()) {
-    //     thread_count = input_data.getThreadCount();
-    //     std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
-    // }
-    // ThreadPool pool(thread_count);
+    int thread_count = 1;
+    if (!input_data.isSingleChr()) {
+        thread_count = input_data.getThreadCount();
+        std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
+    }
+    ThreadPool pool(thread_count);
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
-    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
-    // auto process_chr = [&](const std::string& chr) {
-    //     try {
-    //         std::vector<SVCall> sv_calls;
-    //         std::vector<SVCall> split_sv_calls;
-    //         InputData chr_input_data = input_data;  // Use a thread-local copy
-    //         this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
-    //         {
-    //             std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
-    //             whole_genome_sv_calls[chr] = std::move(sv_calls);
-    //             whole_genome_split_sv_calls[chr] = std::move(split_sv_calls);
-    //         }
-    //         // printMessage("Completed chromosome " + chr);
-    //     } catch (const std::exception& e) {
-    //         printError("Error processing chromosome " + chr + ": " + e.what());
-    //     } catch (...) {
-    //         printError("Unknown error processing chromosome " + chr);
-    //     }
-    // };
-
-    // // Submit tasks to the thread pool and track futures
-    // std::vector<std::future<void>> futures;
-    // for (const auto& chr : chromosomes) {
-    //     futures.emplace_back(pool.enqueue([&, chr] {
-    //         // printMessage("Processing chromosome " + chr);
-    //         process_chr(chr);
-    //     }));
-    // }
+    auto process_chr = [&](const std::string& chr) {
+        try {
+            std::vector<SVCall> sv_calls;
+            std::vector<SVCall> split_sv_calls;
+            InputData chr_input_data = input_data;  // Use a thread-local copy
+            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
+            {
+                std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
+                whole_genome_sv_calls[chr] = std::move(sv_calls);
+            }
+            // printMessage("Completed chromosome " + chr);
+        } catch (const std::exception& e) {
+            printError("Error processing chromosome " + chr + ": " + e.what());
+        } catch (...) {
+            printError("Unknown error processing chromosome " + chr);
+        }
+    };
+
+    // Submit tasks to the thread pool and track futures
+    std::vector<std::future<void>> futures;
+    for (const auto& chr : chromosomes) {
+        futures.emplace_back(pool.enqueue([&, chr] {
+            // printMessage("Processing chromosome " + chr);
+            process_chr(chr);
+        }));
+    }
 
     // // Wait for all tasks to complete
-    // int current_chr = 0;
-    // int total_chr_count = chromosomes.size();
-    // for (auto& future : futures) {
-    //     try {
-    //         current_chr++;
-    //         future.get();
-    //     } catch (const std::exception& e) {
-    //         printError("Error processing chromosome task: " + std::string(e.what()));
-    //     } catch (...) {
-    //         printError("Unknown error processing chromosome task.");
-    //     }
-    // }
-    // printMessage("All tasks have finished.");
+    int current_chr = 0;
+    int total_chr_count = chromosomes.size();
+    for (auto& future : futures) {
+        try {
+            current_chr++;
+            future.get();
+        } catch (const std::exception& e) {
+            printError("Error processing chromosome task: " + std::string(e.what()));
+        } catch (...) {
+            printError("Unknown error processing chromosome task.");
+        }
+    }
+    printMessage("All tasks have finished.");
 
     // -------------------------------------------------------
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    // current_chr = 0;
-    // printMessage("Running copy number predictions on CIGAR SVs...");
-    // for (auto& entry : whole_genome_sv_calls) {
-    //     current_chr++;
-    //     const std::string& chr = entry.first;
-    //     std::vector<SVCall>& sv_calls = entry.second;
-    //     if (sv_calls.size() > 0) {
-    //         // printMessage("Running copy number predictions on " + chr +
-    //         // "...");
-    //         printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
-    //         cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
-    //     }
-    // }
+    current_chr = 0;
+    printMessage("Running copy number predictions on CIGAR SVs...");
+    for (auto& entry : whole_genome_sv_calls) {
+        current_chr++;
+        const std::string& chr = entry.first;
+        std::vector<SVCall>& sv_calls = entry.second;
+        if (sv_calls.size() > 0) {
+            // printMessage("Running copy number predictions on " + chr +
+            // "...");
+            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
+            cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+        }
+    }
     // -------------------------------------------------------
 
-    // printMessage("Running copy number predictions on split-read SVs...");
-    // current_chr = 0;
-    // for (auto& entry : whole_genome_split_sv_calls) {
-    //     const std::string& chr = entry.first;
-    //     std::vector<SVCall>& sv_calls = entry.second;
-
-    //     if (sv_calls.size() > 0) {
-    //         current_chr++;
-    //         printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
-    //         this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
-    //     }
-    // }
-
     // Identify split-SV signatures
     printMessage("Identifying split-SV signatures...");
+    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
     this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
 
     printMessage("Running copy number predictions on split-read SVs...");
-    int current_chr = 0;
-    int total_chr_count = whole_genome_split_sv_calls.size();
+    current_chr = 0;
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
@@ -1405,19 +1407,12 @@ void SVCaller::run(const InputData& input_data)
             this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
         }
     }
-
-    // Detect inversions, insertions, and translocations from the split read
-    // alignments (no copy number predictions)
-    // printMessage("Detecting inversions, insertions, and translocations from split read alignments...");
-    // std::unordered_map<std::string, std::vector<SVCall>> neutral_sv_calls;
-    // this->findSplitReadSVs(neutral_sv_calls, ref_genome, input_data);
     
     printMessage("Unifying SVs...");
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
         whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
-        // whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), neutral_sv_calls[chr].begin(), neutral_sv_calls[chr].end());
     }
 
     // Print the total number of SVs detected for each chromosome
@@ -1442,7 +1437,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     // Run copy number predictions on the SVs detected from the split reads
     std::vector<SVCall> processed_calls;
     for (const auto& sv_candidate : split_sv_calls) {
-        printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "...");
+        // printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "...");
 
         // bool is_inversion = sv_candidate.sv_type == SVType::INV;
 
@@ -1486,6 +1481,43 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end());
     // Replace with the processed calls
     // split_sv_calls = std::move(processed_calls);
+
+    mergeDuplicateSVs(split_sv_calls);
+
+    // Combine SVs with identical start positions, keeping the largest cluster size
+    // printMessage("[2] Combining SVs with identical start and end positions...");
+    // std::vector<SVCall> combined_calls;
+    // std::sort(split_sv_calls.begin(), split_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+    //     return (a.start < b.start) || (a.start == b.start && a.end < b.end);
+    // });
+    // int initial_size = split_sv_calls.size();
+    // for (size_t i = 0; i < split_sv_calls.size(); ++i) {
+    //     const SVCall& current_call = split_sv_calls[i];
+    //     printMessage("Current start: " + std::to_string(current_call.start) + ", previous start: " + (i > 0 ? std::to_string(split_sv_calls[i-1].start) : "N/A"));
+    //     if (i > 0 && current_call.start == split_sv_calls[i-1].start) {
+    //         printMessage("Found identical start position: " + std::to_string(current_call.start) + " with end: " + std::to_string(current_call.end));
+    //         // Keep the largest cluster size
+    //         if (current_call.cluster_size > split_sv_calls[i-1].cluster_size) {
+    //             combined_calls.back() = current_call;
+    //             printMessage("Replacing previous call with larger cluster size: " + std::to_string(current_call.cluster_size) + " > " + std::to_string(split_sv_calls[i-1].cluster_size));
+    //         }
+
+    //         // Merge the cluster sizes
+    //         combined_calls.back().cluster_size += current_call.cluster_size;
+    //         printMessage("Merged cluster size: " + std::to_string(combined_calls.back().cluster_size) + " (previous: " + std::to_string(split_sv_calls[i-1].cluster_size) + ")");
+    //     } else {
+    //         // Add the current call to the combined calls
+    //         combined_calls.push_back(current_call);
+    //     }
+    // }
+
+
+    // printMessage("Merged " + std::to_string(merge_count) + " SVs with identical start and end positions");
+    // // Replace the split SV calls with the combined calls
+    // printMessage("[TEST] Total SVs before merging: " + std::to_string(split_sv_calls.size()));
+    // split_sv_calls.clear();
+    // split_sv_calls.insert(split_sv_calls.end(), combined_calls.begin(), combined_calls.end());
+    // printMessage("[TEST] Total SVs after merging: " + std::to_string(split_sv_calls.size()));
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
@@ -1562,6 +1594,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     vcf_stream << header_line << std::endl;
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
     int total_count = 0;
+    int unclassified_svs = 0;
     for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
         const std::vector<SVCall>& sv_calls = pair.second;
@@ -1583,7 +1616,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
-                std::cerr << "Warning: Unknown SV type for SV at " << chr << ":" << start << "-" << end << std::endl;
+                // std::cerr << "Warning: Unknown SV type for SV at " << chr <<
+                // ":" << start << "-" << end << std::endl;
+                unclassified_svs += 1;
                 continue;
             } else {
                 total_count += 1;
@@ -1701,6 +1736,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
     // Print the number of SV calls skipped
     std::cout << "Finished writing VCF file. Total records: " << total_count << std::endl;
+    if (unclassified_svs > 0) {
+        std::cout << "Total unclassified SVs: " << unclassified_svs << std::endl;
+    }
 }
 
 int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 0c19468c..7eabc4c6 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -221,6 +221,35 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
 }
 
+void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
+{
+    // Merge calls sharing a start: keep the one with the largest cluster size and
+    // give it the group's summed size. Stable sort makes ties deterministic.
+    std::stable_sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+        return a.start < b.start;
+    });
+    std::vector<SVCall> combined_sv_calls;
+    int best_size = 0;  // Largest single cluster size seen in the current group
+    for (const SVCall& sv_call : sv_calls) {
+        if (!combined_sv_calls.empty() && sv_call.start == combined_sv_calls.back().start) {
+            const auto total_size = combined_sv_calls.back().cluster_size + sv_call.cluster_size;  // Sum first so a swap keeps the total
+            if (sv_call.cluster_size > best_size) {
+                combined_sv_calls.back() = sv_call;
+                best_size = sv_call.cluster_size;
+            }
+            combined_sv_calls.back().cluster_size = total_size;
+        } else {
+            combined_sv_calls.push_back(sv_call);
+            best_size = sv_call.cluster_size;
+        }
+    }
+    const size_t merge_count = sv_calls.size() - combined_sv_calls.size();
+    sv_calls = std::move(combined_sv_calls); // Replace with the merged list
+    if (merge_count > 0) {
+        printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start positions");
+    }
+}
+
 void mergeSVSubsets(std::vector<SVCall> &sv_calls)
 {
     // Sort the SV calls by start position

From 0f305baee2bbc98cb62045810960968d2a158f02 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 14 Mar 2025 11:51:39 -0400
Subject: [PATCH 084/134] improved ins and del

---
 src/sv_caller.cpp | 146 +++++++++++++++++-----------------------------
 src/sv_object.cpp |  34 +++++++++--
 2 files changed, 81 insertions(+), 99 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 00a6f0ae..d1244895 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -690,9 +690,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 }
             }
 
-            // TODO: After this classify deletions if negative (keep the rest
-            // the same)
-
             // --------------------------------
 
             // Get the supplementary alignment positions
@@ -701,33 +698,34 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             int supp_cluster_size = 0;
             int supp_best_start = -1;
             int supp_best_end = -1;
-            if (!supp_start_cluster.empty() && !supp_end_cluster.empty()) {
+            if (!supp_start_cluster.empty()) {
                 std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-                int supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2];
+                supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2];
+            }
+            if (!supp_end_cluster.empty()) {
                 std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-                int supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2];
-                if (supp_start_cluster.size() > supp_end_cluster.size()) {
-                    // std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-                    // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
-                    supp_pos = supp_best_start;
-                    supp_cluster_size = supp_start_cluster.size();
-                } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
-                    // std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-                    // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
-                    supp_pos = supp_best_end;
-                    supp_cluster_size = supp_end_cluster.size();
-                } else {
-                    // Use both positions. This has been shown to occur in nested SVs
-                    // std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-                    // std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-                    // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
-                    // supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
-                    supp_pos = supp_best_start;
-                    supp_pos2 = supp_best_end;
-                    supp_cluster_size = supp_start_cluster.size();
-                }
+                supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2];
+            }
+
+            if (supp_start_cluster.size() > supp_end_cluster.size()) {
+                // std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+                // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
+                supp_pos = supp_best_start;
+                supp_cluster_size = supp_start_cluster.size();
+            } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
+                // std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+                // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
+                supp_pos = supp_best_end;
+                supp_cluster_size = supp_end_cluster.size();
+            } else if (supp_best_end == -1 && supp_best_start == -1) {
+                // Use both positions. This has been shown to occur in some nested SVs
+                supp_pos = supp_best_start;
+                supp_pos2 = supp_best_end;
+                supp_cluster_size = supp_start_cluster.size();
+            }
 
-                // Store the inversion as the supplementary start and end positions
+            // Store the inversion as the supplementary start and end positions
+            if (supp_best_start != -1 && supp_best_end != -1) {
                 if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
                     SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "<INV>", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
@@ -764,14 +762,22 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // If the read distance is < 30bp while the SV is > 2kb, then this is a
             // potential deletion
             if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-                // printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group));
                 SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
+
+                // Add an inversion call if necessary
+                if (inversion) {
+                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, "<INV>", "INVDEL", "./.", 0.0, 0, 0, cluster_size);
+                    addSVCall(chr_sv_calls, sv_candidate);
+                }
             }
 
             // Add a dummy SV call for CNV detection
             else if (sv_length >= min_length && sv_length <= max_length) {
-                SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+                SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
+                std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
+                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+                // SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
             }
             
@@ -793,37 +799,15 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         
         // Merge duplicate SV calls with identical start positions
         mergeDuplicateSVs(chr_sv_calls);
-
-        // int initial_size = chr_sv_calls.size();
-        // std::vector<SVCall> combined_sv_calls;
-        // for (size_t i = 0; i < chr_sv_calls.size(); i++) {
-        //     SVCall& sv_call = chr_sv_calls[i];
-        //     if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start) {
-        //         // Keep the largest cluster size for the same start position
-        //         if (sv_call.cluster_size > chr_sv_calls[i - 1].cluster_size) {
-        //             combined_sv_calls.back() = sv_call;
-        //         }
-
-        //         // Combine cluster sizes
-        //         combined_sv_calls.back().cluster_size += sv_call.cluster_size;
-        //     } else {
-        //         // Add the SV call to the combined list
-        //         combined_sv_calls.push_back(sv_call);
-        //     }
-        // }
-        // int merge_count = initial_size - combined_sv_calls.size();
-        // printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start positions");
-
-        // Add the combined SV calls to the main vector
-        // sv_calls[chr_name] = std::move(combined_sv_calls);
         sv_calls[chr_name] = std::move(chr_sv_calls);
 
         // Print the number of merged SV calls
         printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates");
-        
-        // if (merge_count > 0) {
-        //     printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
-        // }
+
+        // Print all SV calls
+        for (const SVCall& sv_call : sv_calls[chr_name]) {
+            printMessage("SV: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", length: " + std::to_string(sv_call.end - sv_call.start + 1) + ", cluster size: " + std::to_string(sv_call.cluster_size) + ", group: " + std::to_string(current_group));
+        }
     }
 
     // if (merge_count > 0) {
@@ -1320,6 +1304,9 @@ void SVCaller::run(const InputData& input_data)
         printMessage("Removing chromosome " + chr + " with no reads...");
         chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
     }
+    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
+    int current_chr = 0;
+    int total_chr_count = chromosomes.size();
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
@@ -1329,7 +1316,6 @@ void SVCaller::run(const InputData& input_data)
         std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
     }
     ThreadPool pool(thread_count);
-    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
@@ -1358,8 +1344,6 @@ void SVCaller::run(const InputData& input_data)
     }
 
     // // Wait for all tasks to complete
-    int current_chr = 0;
-    int total_chr_count = chromosomes.size();
     for (auto& future : futures) {
         try {
             current_chr++;
@@ -1479,45 +1463,21 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     // Insert the copy number predictions back into the split SV calls
     printMessage("Inserting CNV calls...");
     split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end());
-    // Replace with the processed calls
-    // split_sv_calls = std::move(processed_calls);
-
     mergeDuplicateSVs(split_sv_calls);
 
-    // Combine SVs with identical start positions, keeping the largest cluster size
-    // printMessage("[2] Combining SVs with identical start and end positions...");
-    // std::vector<SVCall> combined_calls;
-    // std::sort(split_sv_calls.begin(), split_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-    //     return (a.start < b.start) || (a.start == b.start && a.end < b.end);
-    // });
-    // int initial_size = split_sv_calls.size();
-    // for (size_t i = 0; i < split_sv_calls.size(); ++i) {
-    //     const SVCall& current_call = split_sv_calls[i];
-    //     printMessage("Current start: " + std::to_string(current_call.start) + ", previous start: " + (i > 0 ? std::to_string(split_sv_calls[i-1].start) : "N/A"));
-    //     if (i > 0 && current_call.start == split_sv_calls[i-1].start) {
-    //         printMessage("Found identical start position: " + std::to_string(current_call.start) + " with end: " + std::to_string(current_call.end));
-    //         // Keep the largest cluster size
-    //         if (current_call.cluster_size > split_sv_calls[i-1].cluster_size) {
-    //             combined_calls.back() = current_call;
-    //             printMessage("Replacing previous call with larger cluster size: " + std::to_string(current_call.cluster_size) + " > " + std::to_string(split_sv_calls[i-1].cluster_size));
-    //         }
-
-    //         // Merge the cluster sizes
-    //         combined_calls.back().cluster_size += current_call.cluster_size;
-    //         printMessage("Merged cluster size: " + std::to_string(combined_calls.back().cluster_size) + " (previous: " + std::to_string(split_sv_calls[i-1].cluster_size) + ")");
+    // Remove any deletions with no HMM predictions (HMM likelihood is zero)
+    // int failed_del_count = 0;
+    // for (auto it = split_sv_calls.begin(); it != split_sv_calls.end();) {
+    //     if (it->hmm_likelihood == 0.0 && it->sv_type == SVType::DEL) {
+    //         it = split_sv_calls.erase(it);
+    //         failed_del_count++;
     //     } else {
-    //         // Add the current call to the combined calls
-    //         combined_calls.push_back(current_call);
+    //         ++it;
     //     }
     // }
-
-
-    // printMessage("Merged " + std::to_string(merge_count) + " SVs with identical start and end positions");
-    // // Replace the split SV calls with the combined calls
-    // printMessage("[TEST] Total SVs before merging: " + std::to_string(split_sv_calls.size()));
-    // split_sv_calls.clear();
-    // split_sv_calls.insert(split_sv_calls.end(), combined_calls.begin(), combined_calls.end());
-    // printMessage("[TEST] Total SVs after merging: " + std::to_string(split_sv_calls.size()));
+    // if (failed_del_count > 0) {
+    //     printMessage("Removed " + std::to_string(failed_del_count) + " failed deletion candidates with no HMM predictions");
+    // }
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 7eabc4c6..d316b726 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -225,19 +225,41 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
 {
     int initial_size = sv_calls.size();
     std::vector<SVCall> combined_sv_calls;
+    // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+    //     return a.start < b.start;
+    // });
+    // Sort first by start position, then by SV type
     std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-        return a.start < b.start;
+        return std::tie(a.start, a.sv_type) < std::tie(b.start, b.sv_type);
     });
     for (size_t i = 0; i < sv_calls.size(); i++) {
         SVCall& sv_call = sv_calls[i];
-        if (i > 0 && sv_call.start == sv_calls[i - 1].start) {
-            // Keep the larger cluster size for the same start position
-            if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
+        if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.sv_type == sv_calls[i - 1].sv_type) {
+            // Keep the SV call with a non-zero likelihood
+            // The HMM prediction is more reliable than the split read prediction
+            if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
+                combined_sv_calls.back() = sv_call;
+            }
+
+            // If the likelihoods are equal, keep the one with the larger cluster size
+            // This is to ensure that the SV call with more supporting reads is
+            // kept
+            else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
                 combined_sv_calls.back() = sv_call;
             }
+            // // Keep the larger cluster size for the same start position
+            // if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
+            //     combined_sv_calls.back() = sv_call;
+            // }
+
+            // // If cluster sizes are equal, keep the one with non-zero likelihood
+            // // The HMM prediction is more reliable than the split read prediction
+            // else if (sv_call.cluster_size == sv_calls[i - 1].cluster_size && sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
+            //     combined_sv_calls.back() = sv_call;
+            // }
 
-            // Combine cluster sizes
-            combined_sv_calls.back().cluster_size += sv_call.cluster_size;
+            // // Combine cluster sizes
+            // combined_sv_calls.back().cluster_size += sv_call.cluster_size;
         } else {
             // Add the SV call to the combined list
             combined_sv_calls.push_back(sv_call);

From 63ee46cba612bb383b36bb738bb7580600b3a0c0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 14 Mar 2025 13:13:14 -0400
Subject: [PATCH 085/134] remove comments and fix warnings

---
 include/sv_caller.h |  22 +-
 include/sv_object.h |  14 +-
 src/sv_caller.cpp   | 652 +-------------------------------------------
 src/sv_object.cpp   |  67 +----
 4 files changed, 15 insertions(+), 740 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 1b420226..6e446fa6 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -112,44 +112,28 @@ class SVCaller {
 
         std::vector<std::string> getChromosomes(const std::string& bam_filepath);
 
-        // void findSplitCNVBreakpoints(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls);
-
         void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data);
 
-        void findSplitReadSVs(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const ReferenceGenome& ref_genome, const InputData& input_data);
-
         // Process a single CIGAR record and find candidate SVs
         void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
 
         std::pair<int, int> getAlignmentReadPositions(bam1_t* alignment);
 
-        void processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls);
+        void processChromosome(const std::string& chr, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov);
 
-        // Detect SVs at a region from long read alignments. This is used for
-        // whole genome analysis running in parallel.
-        // RegionData detectSVsFromRegion(std::string region);
         void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
 
-        // Detect SVs from split alignments
-        // void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data);
-
-        // Calculate the mismatch rate given a map of query positions to
-        // match/mismatch (1/0) values within a specified range of the query
-        // sequence
-        double calculateMismatchRate(const MismatchData& mismatch_data);
-
         void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector<uint32_t> &pos_depth_map, const InputData &input_data);
 
         void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome) const;
 
-        // Calculate the read depth (INFO/DP) for a region
-        int calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end);
+        // Query the read depth (INFO/DP) at a position
+        int getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start);
 
     public:
-        // Constructor with no arguments
         SVCaller() = default;
 
         // Detect SVs and predict SV type from long read alignments and CNV calls
diff --git a/include/sv_object.h b/include/sv_object.h
index 7c1f5410..d838e968 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -12,11 +12,9 @@
 
 using namespace sv_types;
 
-// Struct to represent a structural variant call
 struct SVCall {
     uint32_t start;
     uint32_t end;
-    // std::string sv_type = "NA";
     SVType sv_type = SVType::UNKNOWN;
     std::string alt_allele = ".";
     std::string data_type = "NA";
@@ -26,32 +24,22 @@ struct SVCall {
     int support = 0;  // Number of supporting reads
     int cluster_size = 0;  // Number of SV calls in the cluster
 
-    // Comparison operator for std::set
     bool operator<(const SVCall& other) const;
 
-    // Constructor with parameters for all fields
     SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
         start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
 };
 
-// void addSVCall(std::vector<SVCall>& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth);
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
 
-void mergeSVs(std::vector<SVCall>& sv_calls);
-
 // Merge SVs with identical start positions, and sum the cluster sizes
 void mergeDuplicateSVs(std::vector<SVCall>& sv_calls);
 
-void mergeSVSubsets(std::vector<SVCall>& sv_calls);
-
-void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth);
-
-void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_depth, const std::string& data_type);
-
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 
 void concatenateSVCalls(std::vector<SVCall>& sv_calls, const std::vector<SVCall>& sv_calls_update);
 
+// Merge SVs using DBSCAN clustering
 void mergeSVs(std::vector<SVCall> &sv_calls, double epsilon, int min_pts);
 
 #endif // SV_OBJECT_H
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index d1244895..a58d985d 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -33,11 +33,8 @@
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
 
-//std::mutex bam_mutex;
-
 int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1)
 {
-    // std::lock_guard<std::mutex> lock(this->shared_mutex);
     std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
     int ret = sam_itr_next(fp_in, itr, bam1);
     return ret;
@@ -60,290 +57,12 @@ std::vector<std::string> SVCaller::getChromosomes(const std::string &bam_filepat
     std::vector<std::string> chromosomes;
     for (int i = 0; i < bamHdr->n_targets; i++) {
         chromosomes.push_back(bamHdr->target_name[i]);
-        // printMessage("Chromosome: " + std::string(bamHdr->target_name[i]));
     }
     bam_hdr_destroy(bamHdr);
     sam_close(fp_in);
     return chromosomes;
 }
 
-// void SVCaller::findSplitCNVBreakpoints(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string &region, std::vector<SVCall>& sv_calls)
-// {
-//     std::unordered_map<std::string, GenomicRegion> primary_map;
-//     std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
-
-//     // Create a read and iterator for the region
-//     bam1_t *bam1 = bam_init1();
-//     if (!bam1) {
-//         printError("ERROR: failed to initialize BAM record");
-//         return;
-//     }
-//     hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str());
-//     if (!itr) {
-//         bam_destroy1(bam1);
-//         printError("ERROR: failed to query region " + region);
-//         return;
-//     }
-
-//     uint32_t primary_count = 0;
-//     uint32_t supplementary_count = 0;
-
-//     // Main loop to process the alignments
-//     uint32_t num_alignments = 0;
-//     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
-
-//         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
-//         if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
-//             continue;
-//         }
-//         const std::string qname = bam_get_qname(bam1);  // Query template name
-
-//         // Process primary alignments
-//         if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-//             // Store chromosome (TID), start, and end positions (1-based) of the
-//             // primary alignment, and the strand (true for forward, false for reverse)
-//             primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0};
-//             primary_count++;
-
-//         // Process supplementary alignments
-//         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-//             // Store chromosome (TID), start, and end positions (1-based) of the
-//             // supplementary alignment, and the strand (true for forward, false for reverse)
-//             supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0});
-//             supplementary_count++;
-//         }
-//         num_alignments++;
-//     }
-
-//     // Remove primary alignments without supplementary alignments
-//     std::vector<std::string> to_remove;
-//     for (const auto& entry : primary_map) {
-//         const std::string& qname = entry.first;
-//         if (supp_map.find(qname) == supp_map.end()) {
-//             to_remove.push_back(qname);
-//         }
-//     }
-//     for (const std::string& qname : to_remove) {
-//         primary_map.erase(qname);
-//     }
-
-//     // // Clean up the iterator and alignment
-//     // hts_itr_destroy(itr);
-//     // bam_destroy1(bam1);
-//     // printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
-
-//     // Identify overlapping primary alignments and then cluster their primary
-//     // start, end vs. supplementary alignment start, end positions, keeping the
-//     // median of the largest cluster for the primary and supplementary positions
-//     // as the final genome coordinates of the SV
-//     // IntervalNode* root = nullptr;
-//     std::unique_ptr<IntervalNode> root = nullptr;
-//     for (const auto& entry : primary_map) {
-//         const std::string& qname = entry.first;
-//         const GenomicRegion& region = entry.second;
-//         // root = insert(root, region, qname);
-//         insert(root, region, qname);
-//     }
-//     std::vector<std::vector<std::string>> primary_clusters;
-//     std::set<std::string> processed;
-
-//     for (const auto& entry : primary_map) {
-//         const std::string& qname = entry.first;
-//         if (processed.find(qname) != processed.end()) {
-//             continue;  // Skip already processed primary alignments
-//         }
-//         const GenomicRegion& region = entry.second;
-//         std::vector<std::string> overlap_group;
-//         findOverlaps(root, region, overlap_group);
-//         for (const std::string& qname : overlap_group) {
-//             processed.insert(qname);
-//         }
-//         if (overlap_group.size() > 1) {
-//             primary_clusters.push_back(overlap_group);
-//         }
-//     }
-//     printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
-
-//     // For each primary alignment cluster the supplementary alignment start and
-//     // end positions, keeping the median of the largest cluster
-//     // std::vector<SVCall> sv_candidates;
-//     int current_group = 0;
-//     int min_length = 2000;
-//     int max_length = 1000000;
-//     for (const auto& primary_cluster : primary_clusters) {
-//         // Determine if the primary alignments are mostly on opposite strands to
-//         // the corresponding supplementary alignments (potential inversions)
-//         bool inversion = false;
-//         for (const std::string& qname : primary_cluster) {
-//             const std::vector<GenomicRegion>& supp_alns = supp_map[qname];
-//             int num_supp = (int) supp_alns.size();
-//             int num_opposite_strand = 0;
-//             for (const GenomicRegion& supp_aln : supp_alns) {
-//                 // Opposite-strand alignment on the same chromosome
-//                 // (Since the iterator is single-chromosome, this is the case)
-//                 if (supp_aln.strand != primary_map[qname].strand) {
-//                     num_opposite_strand++;
-//                 }
-//             }
-//             if (static_cast<double>(num_opposite_strand) / static_cast<double>(num_supp) > 0.5) {
-//                 inversion = true;
-//             }
-//         }
-
-//         // Use DBSCAN to cluster primary alignment start, end positions
-//         DBSCAN1D dbscan(100, 5);
-//         current_group++;
-//         std::vector<int> starts;
-//         std::vector<int> ends;
-//         std::vector<bool> primary_strands;
-//         for (const std::string& qname : primary_cluster) {
-//             const GenomicRegion& region = primary_map[qname];
-//             starts.push_back(region.start);
-//             ends.push_back(region.end);
-//             primary_strands.push_back(region.strand);
-//         }
-
-//         // Get the largest cluster of primary alignment start positions
-//         dbscan.fit(starts);
-//         std::vector<int> primary_start_cluster = dbscan.getLargestCluster(starts);
-
-//         // Get the largest cluster of primary alignment end positions
-//         dbscan.fit(ends);
-//         std::vector<int> primary_end_cluster = dbscan.getLargestCluster(ends);
-
-//         // Continue if no clusters were found
-//         if (primary_start_cluster.empty() && primary_end_cluster.empty()) {
-//             continue;
-//         }
-
-//         // Get the supplementary alignment positions
-//         std::vector<int> supp_starts;
-//         std::vector<int> supp_ends;
-//         std::vector<bool> supp_strands;
-//         for (const std::string& qname : primary_cluster) {
-//             const std::vector<GenomicRegion>& regions = supp_map[qname];
-//             for (const GenomicRegion& region : regions) {
-//                 supp_starts.push_back(region.start);
-//                 supp_ends.push_back(region.end);
-//                 supp_strands.push_back(region.strand);
-//             }
-//         }
-
-//         // Get the largest cluster of supplementary alignment start positions
-//         dbscan.fit(supp_starts);
-//         std::vector<int> supp_start_cluster = dbscan.getLargestCluster(supp_starts);
-
-//         // Get the largest cluster of supplementary alignment end positions
-//         dbscan.fit(supp_ends);
-//         std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
-
-//         // Continue if no clusters were found
-//         if (supp_start_cluster.empty() && supp_end_cluster.empty()) {
-//             continue;
-//         }
-
-//         // Use the median of the largest cluster of primary and supplementary
-//         // alignment start, end positions as the final genome coordinates of the
-//         // SV
-//         int primary_pos = -1;
-//         int primary_pos2 = -1;
-//         int primary_cluster_size = 0;
-//         if (primary_start_cluster.size() > primary_end_cluster.size()) {
-//             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-//             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
-//             primary_cluster_size = primary_start_cluster.size();
-//         } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
-//             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-//             primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
-//             primary_cluster_size = primary_end_cluster.size();
-//         } else {
-//             // Use both positions
-//             std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-//             std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-//             primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
-//             primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
-//             primary_cluster_size = primary_start_cluster.size();
-//         }
-
-//         // Get the supplementary alignment positions
-//         int supp_pos = -1;
-//         int supp_pos2 = -1;
-//         int supp_cluster_size = 0;
-//         if (supp_start_cluster.size() > supp_end_cluster.size()) {
-//             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-//             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
-//             supp_cluster_size = supp_start_cluster.size();
-//         } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
-//             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-//             supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
-//             supp_cluster_size = supp_end_cluster.size();
-//         } else {
-//             // Use both positions. This has been shown to occur in nested SVs
-//             std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-//             std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-//             supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
-//             supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2];
-//             supp_cluster_size = supp_start_cluster.size();
-//         }
-
-//         // If two of either were found, use the larger SV candidate
-//         if (primary_pos2 != -1) {
-//             int sv_length1 = std::abs(primary_pos - supp_pos);
-//             int sv_length2 = std::abs(primary_pos2 - supp_pos);
-//             if (sv_length2 > sv_length1) {
-//                 primary_pos = primary_pos2;
-//             }
-//         }
-//         if (supp_pos2 != -1) {
-//             int sv_length1 = std::abs(primary_pos - supp_pos);
-//             int sv_length2 = std::abs(primary_pos - supp_pos2);
-//             if (sv_length2 > sv_length1) {
-//                 supp_pos = supp_pos2;
-//             }
-//         }
-
-//         if (primary_pos == -1 || supp_pos == -1) {
-//             continue;
-//         }
-
-//         // Store the SV candidate if the length is within the specified range
-//         int sv_start = std::min(primary_pos, supp_pos);
-//         int sv_end = std::max(primary_pos, supp_pos);
-//         int sv_length = sv_end - sv_start + 1;
-//         int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
-        
-//         // Determine the SV type
-//         SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
-//         if (sv_length >= min_length && sv_length <= max_length) {
-//             SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
-//             addSVCall(sv_calls, sv_candidate);
-//             // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
-//         }
-//     }
-
-//     // Combine SVs with identical start and end positions, and sum the cluster
-//     // sizes
-//     std::vector<SVCall> combined_sv_calls;
-//     std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-//         return a.start < b.start || (a.start == b.start && a.end < b.end);
-//     });
-//     int merge_count = 0;
-//     for (size_t i = 0; i < sv_calls.size(); i++) {
-//         SVCall& sv_call = sv_calls[i];
-//         if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) {
-//             sv_calls[i - 1].cluster_size += sv_call.cluster_size;
-//             merge_count++;
-//         } else {
-//             combined_sv_calls.push_back(sv_call);
-//         }
-//     }
-//     sv_calls = std::move(combined_sv_calls);
-
-//     // if (merge_count > 0) {
-//     //     printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
-//     // }
-// }
-
 void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data)
 {
     // Open the BAM file
@@ -378,8 +97,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
 
     // Alignment data structures
-    // std::unordered_map<std::string, GenomicRegion> primary_map;
-    // std::unordered_map<std::string, std::vector<GenomicRegion>> supp_map;
     std::unordered_map<int, std::unordered_map<std::string, PrimaryAlignment>> primary_map;  // TID-> qname -> primary alignment
     std::unordered_map<std::string, std::vector<SuppAlignment>> supp_map;  // qname -> supplementary alignment
 
@@ -432,8 +149,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             std::pair<int, int> qpos = getAlignmentReadPositions(bam1);
 
             primary_map[bam1->core.tid][qname] = PrimaryAlignment{bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0};
-            // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0};
-            // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0};
             alignment_tids.insert(bam1->core.tid);
             primary_count++;
 
@@ -444,7 +159,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // for reverse)
             std::pair<int, int> qpos = getAlignmentReadPositions(bam1);
             supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0});
-            // supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0});
             alignment_tids.insert(bam1->core.tid);
             supp_qnames.insert(qname);
             supplementary_count++;
@@ -459,7 +173,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     // Remove primary alignments without supplementary alignments
     std::unordered_map<int, std::unordered_set<std::string>> to_remove;
     for (auto& chr_primary : primary_map) {
-        // Get the qnames for this chromosome
         std::unordered_set<std::string> qnames;
         for (const auto& entry : chr_primary.second) {
             if (supp_qnames.find(entry.first) == supp_qnames.end()) {
@@ -478,35 +191,15 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     }
     printMessage("Removed " + std::to_string(total_removed) + " primary alignments without supplementary alignments");
 
-    // std::vector<std::string> to_remove;
-    // for (const auto& entry : primary_map) {
-    //     const std::string& qname = entry.first;
-    //     if (supp_map.find(qname) == supp_map.end()) {
-    //         to_remove.push_back(qname);
-    //     }
-    // }
-    // for (const std::string& qname : to_remove) {
-    //     primary_map.erase(qname);
-    // }
-
-
     for (const auto& chr_primary : primary_map) {
         int primary_tid = chr_primary.first;
         std::string chr_name = bamHdr->target_name[primary_tid];
         printMessage("Processing chromosome " + chr_name + " with " + std::to_string(chr_primary.second.size()) + " primary alignments");
 
         std::vector<SVCall> chr_sv_calls;
-
-        // std::unordered_map<int, std::unordered_map<std::string, PrimaryAlignment>> primary_map;  // TID-> qname -> primary alignment
-        // const std::unordered_map<std::string, std::vector<PrimaryAlignment>>&
-        // chr_primary_map = chr_primary.second;
         const std::unordered_map<std::string, PrimaryAlignment>& chr_primary_map = chr_primary.second;
 
-        // Identify overlapping primary alignments and then cluster their primary
-        // start, end vs. supplementary alignment start, end positions, keeping the
-        // median of the largest cluster for the primary and supplementary positions
-        // as the final genome coordinates of the SV
-        // IntervalNode* root = nullptr;
+        // Identify overlapping primary alignments and cluster endpoints
         std::unique_ptr<IntervalNode> root = nullptr;
         for (const auto& entry : chr_primary_map) {
             const std::string& qname = entry.first;
@@ -531,7 +224,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 primary_clusters.push_back(overlap_group);
             }
         }
-        // printMessage(chr_name + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments");
 
         // For each primary alignment cluster the supplementary alignment start and
         // end positions, keeping the median of the largest cluster
@@ -637,10 +329,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // Get the largest cluster of split distances
             dbscan.fit(split_distances);
             std::vector<int> split_distance_cluster = dbscan.getLargestCluster(split_distances);
-            // printMessage("Found " + std::to_string(split_distance_cluster.size()) + " split distances (cluster size)");
 
             // Continue if no clusters were found
-            // if (supp_start_cluster.empty() && supp_end_cluster.empty()) {
             if (supp_start_cluster.empty() && supp_end_cluster.empty() && split_distance_cluster.empty()) {
                 continue;
             }
@@ -685,8 +375,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     }
                     SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
-                    // printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group));
-                    // continue;
                 }
             }
 
@@ -708,13 +396,9 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             }
 
             if (supp_start_cluster.size() > supp_end_cluster.size()) {
-                // std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-                // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2];
                 supp_pos = supp_best_start;
                 supp_cluster_size = supp_start_cluster.size();
             } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
-                // std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-                // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2];
                 supp_pos = supp_best_end;
                 supp_cluster_size = supp_end_cluster.size();
             } else if (supp_best_end == -1 && supp_best_start == -1) {
@@ -729,7 +413,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
                     SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "<INV>", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
-                    // printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group));
                 }
             }
 
@@ -777,17 +460,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
                 std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
                 SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
-                // SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
             }
-            
-            // Determine the SV type
-            // SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
-            // if (sv_length >= min_length && sv_length <= max_length) {
-            //     SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
-            //     addSVCall(chr_sv_calls, sv_candidate);
-            //     // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion));
-            // }
         }
 
         // Combine SVs with identical start and end positions, and sum the cluster
@@ -809,10 +483,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             printMessage("SV: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", length: " + std::to_string(sv_call.end - sv_call.start + 1) + ", cluster size: " + std::to_string(sv_call.cluster_size) + ", group: " + std::to_string(current_group));
         }
     }
-
-    // if (merge_count > 0) {
-    //     printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
-    // }
 }
 
 void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
@@ -848,209 +518,6 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c
     bam_destroy1(bam1);
 }
 
-double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data)
-{
-    int start = mismatch_data.query_start;
-    int end = mismatch_data.query_end;
-    const std::vector<int>& mismatch_map = mismatch_data.match_map;
-    start = std::max(start, 0);
-    end = std::min(end, (int32_t)mismatch_map.size() - 1);
-    int match_count = 0;
-    int mismatch_count = 0;
-    int MATCH = 1;
-    int MISMATCH = -1;
-    for (int i = start; i <= end; i++) {
-        if (mismatch_map[i] == MATCH) {
-            match_count++;
-        } else if (mismatch_map[i] == MISMATCH) {
-            mismatch_count++;
-        }
-    }
-
-    // Avoid division by zero
-    if (match_count + mismatch_count == 0) {
-        return 0.0;
-    }
-
-    double mismatch_rate = static_cast<double>(mismatch_count) / static_cast<double>(match_count + mismatch_count);
-
-    return mismatch_rate;
-}
-
-void SVCaller::findSplitReadSVs(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const ReferenceGenome &ref_genome, const InputData& input_data)
-{
-    // Open the BAM file
-    std::string bam_filepath = input_data.getLongReadBam();
-    samFile *fp_in = sam_open(bam_filepath.c_str(), "r");
-    if (!fp_in) {
-        printError("ERROR: failed to open " + bam_filepath);
-        return;
-    }
-
-    // Set maximum thread count
-    int thread_count = input_data.getThreadCount();
-    hts_set_threads(fp_in, thread_count);
-    printMessage("Using " + std::to_string(thread_count) + " threads for split read analysis");
-
-    // Load the header
-    bam_hdr_t *bamHdr = sam_hdr_read(fp_in);
-    if (!bamHdr) {
-        sam_close(fp_in);
-        printError("ERROR: failed to read header from " + bam_filepath);
-        return;
-    }
-
-    // Load the index
-    hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str());
-    if (!idx) {
-        bam_hdr_destroy(bamHdr);
-        sam_close(fp_in);
-        printError("ERROR: failed to load index for " + bam_filepath);
-        return;
-    }
-    BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
-
-    // Create a whole-genome iterator
-    hts_itr_t *itr = sam_itr_queryi(idx, HTS_IDX_START, 0, 0);
-    if (!itr) {
-        printError("ERROR: failed to query the whole genome");
-        return;
-    }
-
-    // Process the alignments
-    std::unordered_map<std::string, SplitSignature> primary_map;
-    std::unordered_map<std::string, std::vector<SplitSignature>> supp_map;
-    bam1_t *bam1 = bam_init1();
-    if (!bam1) {
-        printError("ERROR: failed to initialize BAM record");
-        return;
-    }
-    uint32_t primary_count = 0;
-    uint32_t supplementary_count = 0;
-    uint32_t num_alignments = 0;
-    printMessage("Processing split read alignment records...");
-    while (readNextAlignment(fp_in, itr, bam1) >= 0) {
-
-        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
-        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
-            continue;
-        }
-        const std::string qname = bam_get_qname(bam1);  // Query template name
-
-        // Process primary alignments
-        if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) {
-
-            // Get the start and end positions in the read sequence
-            uint32_t query_start = 0;
-            uint32_t query_end = 0;
-            uint32_t* cigar = bam_get_cigar(bam1);
-            int cigar_len = bam1->core.n_cigar;
-            for (int i = 0; i < cigar_len; i++) {
-                int op_len = bam_cigar_oplen(cigar[i]);
-                int op = bam_cigar_op(cigar[i]);
-
-                if (i == 0 && op == BAM_CSOFT_CLIP) {
-                    query_start = op_len;
-                }
-                
-                // https://github.com/samtools/htslib/blob/develop/htslib/sam.h:
-                // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
-                if (bam_cigar_type(op) & 1) {
-                    query_end += op_len;
-                }
-            }
-
-            // Store the SV signature
-            primary_map[qname] = SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end};
-            primary_count++;
-
-        // Process supplementary alignments
-        } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-            // Get the start and end positions in the read sequence
-            uint32_t query_start = 0;
-            uint32_t query_end = 0;
-            uint32_t* cigar = bam_get_cigar(bam1);
-            int cigar_len = bam1->core.n_cigar;
-            for (int i = 0; i < cigar_len; i++) {
-                int op_len = bam_cigar_oplen(cigar[i]);
-                int op = bam_cigar_op(cigar[i]);
-
-                if (i == 0 && op == BAM_CSOFT_CLIP) {
-                    query_start = op_len;
-                }
-                
-                // https://github.com/samtools/htslib/blob/develop/htslib/sam.h:
-                // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference
-                if (bam_cigar_type(op) & 1) {
-                    query_end += op_len;
-                }
-            }
-
-            // Store the SV signature
-            supp_map[qname].push_back(SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end});
-            supplementary_count++;
-        }
-        num_alignments++;
-
-        if (num_alignments % 100000 == 0) {
-            printMessage("Processed " + std::to_string(num_alignments) + " split read alignment records");
-        }
-    }
-
-    // Remove primary alignments without supplementary alignments
-    std::vector<std::string> to_remove;
-    for (const auto& entry : primary_map) {
-        const std::string& qname = entry.first;
-        if (supp_map.find(qname) == supp_map.end()) {
-            to_remove.push_back(qname);
-        }
-    }
-    for (const std::string& qname : to_remove) {
-        primary_map.erase(qname);
-    }
-
-    // Clean up the iterator and alignment
-    hts_itr_destroy(itr);
-    bam_destroy1(bam1);
-    printMessage("Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments");
-
-    // Find insertions by comparing the primary vs. supplementary alignment
-    // distances in the read vs. reference genome on the same chromosome
-    int ins_count = 0;
-    std::vector<SVCall> sv_candidates;
-    for (const auto& entry : primary_map) {
-        const std::string& qname = entry.first;
-        const SplitSignature& primary = entry.second;
-        const std::vector<SplitSignature>& supp_alns = supp_map[qname];
-
-        // TODO: Cluster positions for improved performance
-
-        for (const SplitSignature& supp : supp_alns) {
-            if (primary.tid == supp.tid) {
-                int ref_dist = std::abs(primary.start - supp.start);
-                int query_dist = std::abs(primary.query_start - supp.query_start);
-
-                // If the reads are within 100 bp of each other, and the
-                // reference distance is greater than 2kb, then it is likely an
-                // insertion
-                if (query_dist <= 100 && ref_dist >= 2000) {
-                    int sv_start = std::min(primary.start, supp.start);
-                    int sv_end = std::max(primary.start, supp.start);
-                    int sv_length = sv_end - sv_start + 1;
-                    int cluster_size = 1;
-                    printMessage("Found insertion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length));
-                    SVCall sv_candidate(sv_start, sv_end, SVType::INS, "<INS>", "SPLITINS", "./.", 0.0, 0, 0, cluster_size);
-                    std::string chr = bamHdr->target_name[primary.tid];
-                    sv_calls[chr].push_back(sv_candidate);
-                    ins_count++;
-                }
-            }
-        }
-    }
-
-    printMessage("Found " + std::to_string(ins_count) + " insertions");
-}
-
 void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, bool is_primary, const std::vector<uint32_t> &pos_depth_map, const ReferenceGenome &ref_genome)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
@@ -1089,7 +556,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                         ins_seq_str[j] = base;
                     }
                 }
-                // std::string ins_seq_str_rc = reverseComplement(ins_seq_str);
                 
                 // Before the insertion
                 if (pos >= (uint32_t)op_len-1)
@@ -1099,7 +565,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
-                        int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                        int read_depth = this->getReadDepth(pos_depth_map, bp1);
                         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, 1, 0);
                         addSVCall(sv_calls, sv_call);
                         continue;
@@ -1114,7 +580,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
                     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
                     {
-                        int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2);
+                        int read_depth = this->getReadDepth(pos_depth_map, bp1);
                         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, 1, 0);
                         addSVCall(sv_calls, sv_call);
                         continue;
@@ -1122,11 +588,9 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 }
 
                 // Add as an insertion
-                // For read depth calculation, use the previous and current
-                // positions (1-based)
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                int read_depth = this->calculateReadDepth(pos_depth_map, ins_pos-1, ins_pos);
+                int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1);
                 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
@@ -1141,7 +605,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end);
+                int read_depth = this->getReadDepth(pos_depth_map, ref_pos);
                 SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
                 addSVCall(sv_calls, sv_call);
             }
@@ -1149,10 +613,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
         // Update the reference position
         // https://samtools.github.io/hts-specs/SAMv1.pdf
-        // if (bam_cigar_type(op) & 2) {
-        //     // bit 2: consume reference
-        //     ref_pos += op_len;
-        // }
         if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
             pos += op_len;
         }
@@ -1174,9 +634,6 @@ std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
         int op_len = bam_cigar_oplen(cigar[i]);
         int op = bam_cigar_op(cigar[i]);
 
-        // if (i == 0 && op == BAM_CSOFT_CLIP) {
-        //     query_start = op_len;
-        // }
         // Set the query start position to the first non-soft clip operation
         if (query_start == -1 && (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CEQUAL || op == BAM_CDIFF)) {
             query_start = query_end;  // First valid query position
@@ -1190,14 +647,13 @@ std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
     }
 
     if (query_start == -1) {
-        // If no valid query start position was found, set it to 0
         query_start = 0;
     }
 
     return std::make_pair(query_start, query_end);
 }
 
-void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::vector<SVCall>& split_sv_calls)
+void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov)
 {
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -1321,12 +777,11 @@ void SVCaller::run(const InputData& input_data)
             std::vector<SVCall> sv_calls;
             std::vector<SVCall> split_sv_calls;
             InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls);
+            this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
             {
                 std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
                 whole_genome_sv_calls[chr] = std::move(sv_calls);
             }
-            // printMessage("Completed chromosome " + chr);
         } catch (const std::exception& e) {
             printError("Error processing chromosome " + chr + ": " + e.what());
         } catch (...) {
@@ -1366,8 +821,6 @@ void SVCaller::run(const InputData& input_data)
         const std::string& chr = entry.first;
         std::vector<SVCall>& sv_calls = entry.second;
         if (sv_calls.size() > 0) {
-            // printMessage("Running copy number predictions on " + chr +
-            // "...");
             printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
             cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
         }
@@ -1415,69 +868,27 @@ void SVCaller::run(const InputData& input_data)
     this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome);
 }
 
-// Detect SVs from split read alignments
+// Run copy number predictions on the SVs detected from the split reads
 void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
 {
-    // Run copy number predictions on the SVs detected from the split reads
     std::vector<SVCall> processed_calls;
     for (const auto& sv_candidate : split_sv_calls) {
-        // printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "...");
-
-        // bool is_inversion = sv_candidate.sv_type == SVType::INV;
-
         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         std::string genotype = std::get<2>(result);
         if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) {
-            // if (is_inversion) {
-            // 	// Add an additional inversion separately
-		    //     int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-		    //     std::string alt_allele = "<INV>";
-		    //     SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-            //     processed_calls.push_back(sv_call);
-            // }
-            
-            int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
+            int read_depth = this->getReadDepth(pos_depth_map, sv_candidate.start);
             std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
             SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-            // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
-            // addSVCall(split_sv_calls, sv_call);
             processed_calls.push_back(sv_call);
         }
-
-        // } else if (sv_candidate.sv_type == SVType::INV) {
-        //     // SV with no copy number prediction, but is a potential inversion or insertion
-        //     int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end);
-        //     // std::string alt_allele = "<INV>";
-        //     std::string alt_allele = "<" + getSVTypeString(sv_candidate.sv_type) + ">";
-        //     SVCall sv_call(sv_candidate.start, sv_candidate.end, sv_candidate.sv_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-        //     printMessage("[TEST-SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type));
-        //     processed_calls.push_back(sv_call);
-        // }
-        // if (current_sv % 1000 == 0) {
-        //     printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates");
-        // }
     }
 
     // Insert the copy number predictions back into the split SV calls
     printMessage("Inserting CNV calls...");
     split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end());
     mergeDuplicateSVs(split_sv_calls);
-
-    // Remove any deletions with no HMM predictions (HMM likelihood is zero)
-    // int failed_del_count = 0;
-    // for (auto it = split_sv_calls.begin(); it != split_sv_calls.end();) {
-    //     if (it->hmm_likelihood == 0.0 && it->sv_type == SVType::DEL) {
-    //         it = split_sv_calls.erase(it);
-    //         failed_del_count++;
-    //     } else {
-    //         ++it;
-    //     }
-    // }
-    // if (failed_del_count > 0) {
-    //     printMessage("Removed " + std::to_string(failed_del_count) + " failed deletion candidates with no HMM predictions");
-    // }
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
@@ -1576,8 +987,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
-                // std::cerr << "Warning: Unknown SV type for SV at " << chr <<
-                // ":" << start << "-" << end << std::endl;
                 unclassified_svs += 1;
                 continue;
             } else {
@@ -1628,16 +1037,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 } else {
                     ref_allele = "N";  // Convention for INV and DUP
                 }
-                // // Update the position to the preceding base
-                // int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                // ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
-                // start = preceding_pos;
-
                 end = start;  // Update the end position to the same base
-                // // Update the end position to the same base for duplications and insertions
-                // if (sv_type == SVType::DUP || sv_type == SVType::INS) {
-                //     end = start;
-                // }
             }
 
             // Fix ambiguous bases in the reference allele
@@ -1662,10 +1062,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
                 ";SVTYPE2=" + sv_type2_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
                 ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
-
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + 
-            //     ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + 
-            //     ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
                 
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
@@ -1673,27 +1069,11 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES)
             vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
-            
-            // std::cout << "Wrote SV at " << chr << ": " << start << ", " << end << std::endl;
-            // if (total_count % 1000 == 0)
-            // {
-            // 	std::cout << "Wrote SV at " << chr << ": " << start << ", total=" << total_count << std::endl;
-        	// }
         }
     }
     vcf_stream.close();
     std::cout << "Saved SV calls to " << output_vcf << std::endl;
 
-    // Create a compressed and indexed VCF file
-    // std::cout << "Creating compressed and indexed VCF file..." << std::endl;
-    // std::string bgzip_cmd = "bgzip -f " + output_vcf;
-    // std::string tabix_cmd = "tabix -p vcf " + output_vcf + ".gz";
-    // std::system(bgzip_cmd.c_str());
-    // std::system(tabix_cmd.c_str());
-    // output_vcf += ".gz";
-    // std::cout << "VCF file created: " << output_vcf << std::endl;
-    // std::cout << "Index file created: " << output_vcf + ".tbi" << std::endl;
-
     // Print the number of SV calls skipped
     std::cout << "Finished writing VCF file. Total records: " << total_count << std::endl;
     if (unclassified_svs > 0) {
@@ -1701,26 +1081,14 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     }
 }
 
-int SVCaller::calculateReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start, uint32_t end)
+int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start)
 {
     int read_depth = 0;
     try {
-        // printMessage("Read depth at start: " + std::to_string(pos_depth_map.at(start)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
         read_depth += pos_depth_map.at(start);
     } catch (const std::out_of_range& e) {
-        // std::cerr << "Warning: Start position " << start << " not found in
-        // depth map." << std::endl;
         printError("Error: Start position " + std::to_string(start) + " not found in depth map.");
     }
 
-    // UPDATE: Only use the start position for the read depth calculation
-    // try {
-    //     // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start));
-    //     read_depth += pos_depth_map.at(end);
-    // } catch (const std::out_of_range& e) {
-    //     printError("Error: End position " + std::to_string(end) + " not found in depth map.");
-    //     // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl;
-    // }
-    // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth));
     return read_depth;
 }
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index d316b726..e4bb4699 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -225,9 +225,7 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
 {
     int initial_size = sv_calls.size();
     std::vector<SVCall> combined_sv_calls;
-    // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-    //     return a.start < b.start;
-    // });
+
     // Sort first by start position, then by SV type
     std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
         return std::tie(a.start, a.sv_type) < std::tie(b.start, b.sv_type);
@@ -247,21 +245,7 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
             else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
                 combined_sv_calls.back() = sv_call;
             }
-            // // Keep the larger cluster size for the same start position
-            // if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
-            //     combined_sv_calls.back() = sv_call;
-            // }
-
-            // // If cluster sizes are equal, keep the one with non-zero likelihood
-            // // The HMM prediction is more reliable than the split read prediction
-            // else if (sv_call.cluster_size == sv_calls[i - 1].cluster_size && sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
-            //     combined_sv_calls.back() = sv_call;
-            // }
-
-            // // Combine cluster sizes
-            // combined_sv_calls.back().cluster_size += sv_call.cluster_size;
         } else {
-            // Add the SV call to the combined list
             combined_sv_calls.push_back(sv_call);
         }
     }
@@ -271,52 +255,3 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
         printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions");
     }
 }
-
-void mergeSVSubsets(std::vector<SVCall> &sv_calls)
-{
-    // Sort the SV calls by start position
-    int initial_size = sv_calls.size();
-    std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-        return a.start < b.start;
-    });
-
-    // Remove SVs that are subsets of other SVs
-    std::vector<SVCall> filtered_sv_calls;
-    // Since the input SV calls are sorted by start position, we can iterate
-    // through them in order and only keep the SVs that are not subsets of
-    // others
-    for (const auto& sv_call : sv_calls) {
-        // Check if the current SV call is a subset of any previously added
-        // SV call
-        bool is_subset = false;
-        for (const auto& filtered_sv_call : filtered_sv_calls) {
-            if (sv_call.start >= filtered_sv_call.start && sv_call.end <= filtered_sv_call.end) {
-                is_subset = true;
-                break;
-            }
-        }
-        // If it's not a subset, add it to the filtered list
-        if (!is_subset) {
-            filtered_sv_calls.push_back(sv_call);
-        }
-    }
-    sv_calls = std::move(filtered_sv_calls); // Replace with filtered list
-    int updated_size = sv_calls.size();
-    printMessage("Filtered SV calls to remove subsets, from " + std::to_string(initial_size) + " to " + std::to_string(updated_size));
-}
-
-void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_support)
-{
-    // Filter SV calls with low read support or low cluster size
-    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) {
-        return sv_call.support < min_support && sv_call.cluster_size < min_support;
-    }), sv_calls.end());
-}
-
-void filterSVsWithLowSupport(std::vector<SVCall> &sv_calls, int min_support, const std::string &data_type)
-{
-    // Filter SV calls with low read depth only for the specified data type, keeping the rest
-    sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support, data_type](const SVCall& sv_call) {
-        return sv_call.support < min_support && sv_call.data_type == data_type;
-    }), sv_calls.end());
-}

From c10a530c033a5639f4ef3e96135819c73f6eace2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 15 Mar 2025 09:18:04 -0400
Subject: [PATCH 086/134] cnv merge fix

---
 src/cnv_caller.cpp | 170 ++-------------------------------------------
 src/sv_caller.cpp  |  30 ++++----
 2 files changed, 22 insertions(+), 178 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c23c146b..f9f346a8 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -26,7 +26,6 @@
 #include <utility>    // std::pair
 #include <unordered_set>
 #include <execution>  // std::execution::par
-// #include <omp.h>
 
 #include "utils.h"
 #include "sv_types.h"
@@ -55,17 +54,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
 {
     // Initialize the SNP data with default values and sample size length
     int sample_size = input_data.getSampleSize();
-    // int region_length = (int) (end_pos - start_pos + 1);
-    // if (region_length < sample_size)
-    // {
-    //     sample_size = region_length;
-    // }
-
-    // std::vector<uint32_t> snp_pos(sample_size, 0);
-    // std::vector<double> snp_baf(sample_size, -1.0);
-    // std::vector<double> snp_pfb(sample_size, 0.5);
-    // std::vector<double> snp_log2_cov(sample_size, 0.0);
-    // std::vector<bool> is_snp(sample_size, false);
     std::vector<uint32_t> snp_pos;
     std::vector<double> snp_baf;
     std::vector<double> snp_pfb;
@@ -74,15 +62,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
-    // this->calculateSNPLog2Ratios(snp_pos, snp_log2_cov, pos_depth_map,
-    // mean_chr_cov, input_data);
     sample_size = std::max((int) snp_pos.size(), sample_size);
-    //printMessage("Sample size: " + std::to_string(sample_size));
-    // std::vector<uint32_t> snp_pos_hmm(sample_size, 0);
-    // std::vector<double> snp_baf_hmm(sample_size, -1.0);
-    // std::vector<double> snp_pfb_hmm(sample_size, 0.5);
-    // std::vector<double> snp_log2_hmm(sample_size, 0.0);
-    // std::vector<bool> is_snp_hmm(sample_size, false);
     std::vector<uint32_t> snp_pos_hmm;
     std::vector<double> snp_baf_hmm;
     std::vector<double> snp_pfb_hmm;
@@ -91,9 +71,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
 
     // Loop through evenly spaced positions in the region and get the log2 ratio
     double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size;
-    // Convert SNP positions for faster access (convert to a set)
     std::unordered_set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
-
     for (int i = 0; i < sample_size; i++)
     {
         // Calculate the mean depth for the window
@@ -145,10 +123,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
             }
         }
 
-        // If no SNP was found in the sample, then use the middle of the window
-        // as a placeholder
-        // This is to ensure that the HMM has a value for every position in the
-        // sample
+        // If no SNP was found in the sample, then use the center position
         if (!snp_found_in_sample)
         {
             uint32_t pos = (uint32_t) (start_pos + (i * pos_step) + (pos_step / 2.0));
@@ -159,7 +134,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
             is_snp_hmm.push_back(false);
         }
     }
-    // this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov);
 
     // Update the SNP data with all information
     snp_data.pos = std::move(snp_pos_hmm);
@@ -217,8 +191,6 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // Determine if there is a majority state within the SV region and if it
     // is greater than 75%
     double pct_threshold = 0.75;
-    // double pct_threshold = 0.90;
-    // double pct_threshold = 0.80;
     int max_state = 0;
     int max_count = 0;
 
@@ -345,7 +317,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         {
             std::string genotype = cnv_genotype_map.at(max_state);
             std::string data_type = "CIGAR+HMM";
-            // std::string sv_type_str = getSVTypeString(updated_sv_type);
             sv_call.sv_type = updated_sv_type;
             sv_call.hmm_likelihood = likelihood;
             sv_call.genotype = genotype;
@@ -380,9 +351,6 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 }
 
 // Calculate the mean chromosome coverage
-// double CNVCaller::calculateMeanChromosomeCoverage(std::string chr,
-// std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath,
-// int thread_count) const
 void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const
 {
     // Open the BAM file
@@ -452,11 +420,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
             {
                 continue;
             }
-            // if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP)
-            // {
-            //     continue;
-            // }
-            
+
             // Parse the CIGAR string to get the depth (match, sequence match, and
             // mismatch)
             uint32_t pos = (uint32_t)bam_record->core.pos + 1;  // 0-based to 1-based
@@ -478,11 +442,6 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
                             continue;
                         }
                         pos_depth_map[ref_pos + j]++;
-                        // try {
-                        //     chr_pos_depth_map[ref_pos + j]++;
-                        // } catch (const std::out_of_range& oor) {
-                        //     printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j));
-                        // }
                     }
                 }
                 
@@ -497,31 +456,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
                 }
             }
         }
-
-        // Clean up the iterator
         hts_itr_destroy(bam_iter);
-
-        // printMessage("Finished reading BAM file, calculating mean chromosome coverage...");
-
-        // // Calculate the mean chromosome coverage for positions with non-zero depth
-        // uint64_t cum_depth = 0;
-        // uint32_t pos_count = 0;
-        // for (const auto& pos_depth : chr_pos_depth_map)
-        // {
-        //     if (pos_depth > 0)
-        //     {
-        //         cum_depth += pos_depth;
-        //         pos_count++;
-        //     }
-        // }
-
-        // double mean_chr_cov = 0.0;
-        // if (pos_count > 0)
-        // {
-        //     mean_chr_cov = static_cast<double>(cum_depth) / static_cast<double>(pos_count);
-        // }
-        // printMessage("Completed calculating mean chromosome coverage: " +
-        // std::to_string(mean_chr_cov));
         
         // Parallel sum of the depth map
         uint64_t cum_depth = std::reduce(
@@ -539,73 +474,11 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
             [](uint32_t depth) { return depth > 0; }
         );
 
-        // printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count));
-        // printMessage("Total depth: " + std::to_string(cum_depth));
-
         double mean_chr_cov = (pos_count > 0) ? static_cast<double>(cum_depth) / static_cast<double>(pos_count) : 0.0;
         chr_mean_cov_map[chr] = mean_chr_cov;
-
-        // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov));
     }
-
-    // Clean up
-    // sam_close(bam_file);
 }
 
-// void CNVCaller::calculateSNPLog2Ratios(const std::vector<uint32_t>& snp_pos, const std::vector<double>& snp_log2_cov, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov) const
-// {
-//     // Calculate the log2 ratio for each SNP position
-//     for (size_t i = 0; i < snp_pos.size(); i++)
-//     {
-//         uint32_t pos = snp_pos[i];
-//         try {
-//             uint32_t depth = pos_depth_map.at(pos);
-
-//             // Calculate the log2 ratio for the position
-//             if (depth == 0)
-//             {
-//                 snp_log2_cov[i] = 0.0;
-//             } else {
-//                 snp_log2_cov[i] = log2((double) depth / mean_chr_cov);
-//             }
-
-//         } catch (const std::out_of_range& e) {
-//             snp_log2_cov[i] = 0.0;
-//         }
-//     }
-// }
-
-// void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& log2_region, std::vector<uint32_t>& snp_pos) const
-// {
-//     uint32_t region_length = end_pos - start_pos + 1;
-//     double step_size = (double) region_length / sample_size;
-//     std::set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
-
-//     // Loop through each interval in the region and calculate the log2 ratio
-//     for (int i = 0; i < sample_size; i++)
-//     {
-//         uint32_t pos = start_pos + (uint32_t) (i * step_size);
-//         if (pos > end_pos)
-//         {
-//             pos = end_pos;
-//         }
-//         try {
-//             uint32_t depth = pos_depth_map.at(pos);
-
-//             // Calculate the log2 ratio for the position
-//             if (depth == 0)
-//             {
-//                 log2_region[i] = 0.0;
-//             } else {
-//                 log2_region[i] = log2((double) depth / mean_chr_cov);
-//             }
-
-//         } catch (const std::out_of_range& e) {
-//             log2_region[i] = 0.0;
-//         }
-//     }
-// }
-
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
 {
     // Lock during reading
@@ -784,13 +657,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             // Add the SNP position and BAF information
             snp_pos.push_back(pos);
             snp_baf.push_back(baf);
-            // is_snp.push_back(true);
             snp_pfb.push_back(0.5);
-            // snp_pos[i] = pos;
-            // snp_baf[i] = baf;
-            // is_snp[i] = true;
             snp_found = true;
-            // break;  // Only one SNP per region
         }
     }
 
@@ -802,7 +670,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     // Continue if no SNP was found in the region
     if (!snp_found)
     {
-        // printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos));
         bcf_sr_destroy(snp_reader);
         bcf_sr_destroy(pfb_reader);
         return;
@@ -827,18 +694,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             printError("ERROR: Could not set region for population allele frequency reader: " + pfb_region_str);
         }
 
-        // for (size_t i = 0; i < snp_pos.size(); ++i)
-        // {
-        // Set the region as the SNP position
-        // printMessage("Setting region for population allele frequency reader...");
-        // uint32_t target_snp_pos = snp_pos[i];  // Already 1-based
-        // std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos);
-        // if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0)
-        // {
-        //     printError("ERROR: Could not set region for population allele frequency reader: " + snp_region_str);
-        // }
-        // printMessage("Region set for population allele frequency reader, loading population allele frequency data...");
-
         // Find the SNP position in the population allele frequency file
         float *pfb_f = NULL;
         int count = 0;
@@ -867,36 +722,19 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             {
                 continue;
             }
-            // double pfb = (double) pfb_f[0];
             double pfb = static_cast<double>(pfb_f[0]);
-            // free(pfb_f);
 
             // Skip if outside the acceptable range
             if (pfb <= MIN_PFB || pfb >= MAX_PFB)
             {
                 continue;
             }
-
-            // Add the population frequency to the SNP data
             snp_pfb[i] = pfb;
-
-            break;  // Break after finding the SNP position
-
-            // if (print_count < 20) {
-            //     printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")");
-            //     print_count++;
-            // }
+            break;
         }
         free(pfb_f);
-
-        // if (pfb_reader->errnum)
-        // {
-        //     printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum)));
-        // }
-        // }
     }
-    // }
-
+    
     // Clean up
     bcf_sr_destroy(snp_reader);
     bcf_sr_destroy(pfb_reader);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index a58d985d..e2d429bb 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -872,23 +872,26 @@ void SVCaller::run(const InputData& input_data)
 void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
 {
     std::vector<SVCall> processed_calls;
-    for (const auto& sv_candidate : split_sv_calls) {
+    for (auto& sv_candidate : split_sv_calls) {
         std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         std::string genotype = std::get<2>(result);
-        if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) {
-            int read_depth = this->getReadDepth(pos_depth_map, sv_candidate.start);
-            std::string alt_allele = "<" + getSVTypeString(supp_type) + ">";
-            SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size);
-            processed_calls.push_back(sv_call);
+
+        // For inversions with copy-neutral support, update the HMM likelihood
+        if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) {
+            sv_candidate.hmm_likelihood = supp_lh;
         }
-    }
 
-    // Insert the copy number predictions back into the split SV calls
-    printMessage("Inserting CNV calls...");
-    split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end());
-    mergeDuplicateSVs(split_sv_calls);
+        // Update the SV type if the support is not neutral or unknown
+        else if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) {
+            sv_candidate.sv_type = supp_type;
+            sv_candidate.alt_allele = "<" + getSVTypeString(supp_type) + ">";
+            sv_candidate.data_type += "+HMM";  // Update the data type to include HMM
+            sv_candidate.genotype = genotype;
+            sv_candidate.hmm_likelihood = supp_lh;
+        }
+    }
 }
 
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
@@ -997,9 +1000,11 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // SVTYPE2)
             SVType sv_type2 = SVType::UNKNOWN;
             if (sv_type == SVType::INV_DEL) {
+                printError("Warning: Inversion and deletion detected at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end));
                 sv_type = SVType::DEL;
                 sv_type2 = SVType::INV;
             } else if (sv_type == SVType::INV_DUP) {
+                printError("Warning: Inversion and duplication detected at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end));
                 sv_type = SVType::DUP;
                 sv_type2 = SVType::INV;
             }
@@ -1034,10 +1039,11 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                         // Insert the reference allele before the insertion
                         alt_allele.insert(0, ref_allele);
                     }
+                    end = start;  // Update the end position to the same base
+
                 } else {
                     ref_allele = "N";  // Convention for INV and DUP
                 }
-                end = start;  // Update the end position to the same base
             }
 
             // Fix ambiguous bases in the reference allele

From 62ba567b12e72efda47bba7de1afd93648e59521 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 18 Mar 2025 15:19:34 -0400
Subject: [PATCH 087/134] fix genotypes

---
 include/cnv_caller.h | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 6b10bc29..4a250692 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -57,21 +57,30 @@ class CNVCaller {
         // Each of the 6 state predictions corresponds to a copy number state
         // (0=No predicted state)
         // 0: Unknown (No predicted state)
-        // 1: 0/0 (Two copy loss: homozygous deletion, GT: 0/0)
-        // 2: 1/0 (One copy loss: heterozygous deletion, GT: 0/1)
-        // 3: 1/1 (Normal diploid: no copy number change, GT: 1/1)
-        // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1)
+        // 1: 1/1 (Two copy loss: homozygous deletion, GT: 1/1 for homozygous variant)
+        // 2: 0/1 (One copy loss: heterozygous deletion, GT: 0/1)
+        // 3: 0/0 (Normal diploid: no copy number change, GT: 0/0 for homozygous reference)
+        // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1 for homozygous variant)
         // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1)
         // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1)
-        std ::map<int, std::string> cnv_genotype_map = {
+        std::map<int, std::string> cnv_genotype_map = {
             {0, "./."},
-            {1, "0/0"},
+            {1, "1/1"},
             {2, "0/1"},
-            {3, "1/1"},
+            {3, "0/0"},
             {4, "1/1"},
             {5, "0/1"},
             {6, "1/1"}
         };
+        // std ::map<int, std::string> cnv_genotype_map = {
+        //     {0, "./."},
+        //     {1, "0/0"},
+        //     {2, "0/1"},
+        //     {3, "1/1"},
+        //     {4, "1/1"},
+        //     {5, "0/1"},
+        //     {6, "1/1"}
+        // };
 
         void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 

From 877d1ff7440a51557735aed989a9d80a93813004 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 17:02:03 -0400
Subject: [PATCH 088/134] clean up comments and save cnv in json format

---
 include/cnv_caller.h |  20 +----
 include/input_data.h |   4 +
 include/utils.h      |   2 +
 src/cnv_caller.cpp   | 175 +++++++++++++++++++++++++++++++++++++++----
 src/input_data.cpp   |  11 +++
 src/khmm.cpp         |  40 +---------
 src/main.cpp         |  13 ++++
 src/sv_caller.cpp    |   8 +-
 src/sv_object.cpp    |   8 +-
 src/utils.cpp        |  10 ++-
 10 files changed, 210 insertions(+), 81 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 4a250692..b424b6c2 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -46,10 +46,6 @@ struct SNPData {
 // CNVCaller: Detect CNVs and return the state sequence by SNP position
 class CNVCaller {
     private:
-        //mutable std::mutex snp_file_mtx;  // SNP file mutex
-        //mutable std::mutex pfb_file_mtx;  // Population frequency file mutex
-        //mutable std::mutex bam_file_mtx;  // BAM file mutex
-        // std::mutex& shared_mutex;
         std::shared_mutex& shared_mutex;
 
         // Define a map of CNV genotypes by HMM predicted state.
@@ -72,15 +68,6 @@ class CNVCaller {
             {5, "0/1"},
             {6, "1/1"}
         };
-        // std ::map<int, std::string> cnv_genotype_map = {
-        //     {0, "./."},
-        //     {1, "0/0"},
-        //     {2, "0/1"},
-        //     {3, "1/1"},
-        //     {4, "1/1"},
-        //     {5, "0/1"},
-        //     {6, "1/1"}
-        // };
 
         void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
 
@@ -102,15 +89,14 @@ class CNVCaller {
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
-        // double calculateMeanChromosomeCoverage(std::string chr, std::vector<uint32_t>& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const;
         void calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const;
 
-        // void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, std::vector<double>& pos_log2) const;
-
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const;
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, const InputData& input_data) const;
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
+
+        void saveSVCopyNumberToJSON(SNPData& before_sv, SNPData& after_sv, SNPData& snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const;
 };
 
 #endif // CNV_CALLER_H
diff --git a/include/input_data.h b/include/input_data.h
index 106c70b6..0687af76 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -100,6 +100,9 @@ class InputData {
         // (+/- 1/2 SV length), save a TSV file, and generate HTML reports.
         void saveCNVData(bool save_cnv_data);
         bool getSaveCNVData() const;
+
+        void setCNVOutputFile(std::string filepath);
+        std::string getCNVOutputFile() const;
         
     private:
         std::string short_read_bam;
@@ -124,6 +127,7 @@ class InputData {
         bool verbose;  // True if verbose output is enabled
         bool save_cnv_data;  // True if SNP CNV regions should be extended around SV breakpoints, and saved to a TSV file (Large performance hit)
         bool single_chr;
+        std::string cnv_output_file;
 };
 
 #endif // INPUT_DATA_H
diff --git a/include/utils.h b/include/utils.h
index 6ec95610..e7bf164a 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -59,4 +59,6 @@ std::string removeChrPrefix(std::string chr);
 
 void printMemoryUsage(const std::string &functionName);
 
+bool fileExists(const std::string &filepath);
+
 #endif // UTILS_H
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index f9f346a8..b5d61745 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -57,8 +57,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     std::vector<uint32_t> snp_pos;
     std::vector<double> snp_baf;
     std::vector<double> snp_pfb;
-    std::vector<bool> is_snp;
-    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data);
+    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
@@ -97,6 +96,11 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
         double log2_cov = 0.0;
         if (pos_count > 0)
         {
+            if (cov_sum == 0)
+            {
+                // Use a small value to avoid division by zero
+                cov_sum = 1e-9;
+            }
             log2_cov = log2((cov_sum / (double) pos_count) / mean_chr_cov);
         }
 
@@ -156,11 +160,24 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     // Only extend the region if "save CNV data" is enabled
     uint32_t snp_start_pos = start_pos;
     uint32_t snp_end_pos = end_pos;
+    SNPData before_sv;
+    SNPData after_sv;
     if (input_data.getSaveCNVData())
     {
         uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-        snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1;
-        snp_end_pos = end_pos + sv_half_length;
+        if (start_pos > 1)  // Query the flanking region before the SV (up to half the SV length)
+        {
+            uint32_t before_sv_start = (start_pos > sv_half_length) ? start_pos - sv_half_length : 1;  // Clamp to 1: the unsigned subtraction would wrap around when the half-length exceeds start_pos
+            uint32_t before_sv_end = start_pos - 1;  // Last position before the SV (cannot underflow: start_pos > 1)
+            querySNPRegion(chr, before_sv_start, before_sv_end, pos_depth_map, mean_chr_cov, before_sv, input_data);
+        }
+        uint32_t chr_last_index = pos_depth_map.size() - 1;
+        if (end_pos < chr_last_index)
+        {
+            uint32_t after_sv_start = end_pos + 1;
+            uint32_t after_sv_end = std::min(chr_last_index, end_pos + sv_half_length);
+            querySNPRegion(chr, after_sv_start, after_sv_end, pos_depth_map, mean_chr_cov, after_sv, input_data);
+        }
     }
 
     // Query the SNP region for the SV candidate
@@ -217,16 +234,17 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     }
     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
 
-    // Save the SV calls as a TSV file if enabled
+    // Save the SV calls if enabled
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
-    // if (save_cnv_data && copy_number_change && (end_pos - start_pos) > 10000)
-    if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000)
+    if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 50000)
     {
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
-        const std::string output_dir = input_data.getOutputDir();
-        std::string sv_filename = output_dir + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv";
-        printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "...");
-        this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood);
+        // const std::string output_dir = input_data.getOutputDir();
+        // std::string json_filepath = output_dir + "/CNVCalls.json";
+        std::string json_filepath = input_data.getCNVOutputFile();
+        printMessage("Saving SV copy number predictions to " + json_filepath + "...");
+
+        this->saveSVCopyNumberToJSON(before_sv, after_sv, snp_data, chr, start_pos, end_pos, cnv_type_str, likelihood, json_filepath);
     }
     
     return std::make_tuple(likelihood, predicted_cnv_type, genotype, true);
@@ -479,7 +497,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
     }
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, std::vector<bool>& is_snp, const InputData& input_data) const
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, const InputData& input_data) const
 {
     // Lock during reading
     std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
@@ -593,7 +611,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
     // Read the SNP data ----------------------------------------------
     // Set the region
-    if (bcf_sr_set_regions(snp_reader, chr.c_str(), 0) < 0)
+    std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
+    if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)  //chr.c_str(), 0) < 0)
     {
         printError("ERROR: Could not set region for SNP reader: " + chr);
         bcf_sr_destroy(snp_reader);
@@ -822,6 +841,136 @@ void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, s
     tsv_file.close();
 }
 
+void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SNPData &snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const
+{
+    // Append the SV information to the JSON file
+    std::ofstream json_file(filepath, std::ios::app);
+    if (!json_file.is_open())
+    {
+        std::cerr << "ERROR: Could not open JSON file for writing: " << filepath << std::endl;
+        exit(1);
+    }
+    json_file << "{\n";
+    json_file << "  \"chromosome\": \"" << chr << "\",\n";
+    json_file << "  \"start\": " << start << ",\n";
+    json_file << "  \"end\": " << end << ",\n";
+    json_file << "  \"sv_type\": \"" << sv_type << "\",\n";
+    json_file << "  \"likelihood\": " << likelihood << ",\n";
+    json_file << "  \"before_sv\": {\n";
+    json_file << "    \"positions\": [";
+        for (size_t i = 0; i < before_sv.pos.size(); ++i)
+        {
+            json_file << before_sv.pos[i];
+            if (i < before_sv.pos.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"b_allele_freq\": [";
+        for (size_t i = 0; i < before_sv.baf.size(); ++i)
+        {
+            json_file << before_sv.baf[i];
+            if (i < before_sv.baf.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"population_freq\": [";
+        for (size_t i = 0; i < before_sv.pfb.size(); ++i)
+        {
+            json_file << before_sv.pfb[i];
+            if (i < before_sv.pfb.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"log2_ratio\": [";
+        for (size_t i = 0; i < before_sv.log2_cov.size(); ++i)
+        {
+            json_file << before_sv.log2_cov[i];
+            if (i < before_sv.log2_cov.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "]\n";
+    json_file << "  },\n";
+    json_file << "  \"after_sv\": {\n";
+    json_file << "    \"positions\": [";
+        for (size_t i = 0; i < after_sv.pos.size(); ++i)
+        {
+            json_file << after_sv.pos[i];
+            if (i < after_sv.pos.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"b_allele_freq\": [";
+        for (size_t i = 0; i < after_sv.baf.size(); ++i)
+        {
+            json_file << after_sv.baf[i];
+            if (i < after_sv.baf.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"population_freq\": [";
+        for (size_t i = 0; i < after_sv.pfb.size(); ++i)
+        {
+            json_file << after_sv.pfb[i];
+            if (i < after_sv.pfb.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"log2_ratio\": [";
+        for (size_t i = 0; i < after_sv.log2_cov.size(); ++i)
+        {
+            json_file << after_sv.log2_cov[i];
+            if (i < after_sv.log2_cov.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "]\n";
+    json_file << "  },\n";
+    json_file << "  \"sv\": {\n";
+    json_file << "    \"positions\": [";
+        for (size_t i = 0; i < snp_data.pos.size(); ++i)
+        {
+            json_file << snp_data.pos[i];
+            if (i < snp_data.pos.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"b_allele_freq\": [";
+        for (size_t i = 0; i < snp_data.baf.size(); ++i)
+        {
+            json_file << snp_data.baf[i];
+            if (i < snp_data.baf.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"population_freq\": [";
+        for (size_t i = 0; i < snp_data.pfb.size(); ++i)
+        {
+            json_file << snp_data.pfb[i];
+            if (i < snp_data.pfb.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"log2_ratio\": [";
+        for (size_t i = 0; i < snp_data.log2_cov.size(); ++i)
+        {
+            json_file << snp_data.log2_cov[i];
+            if (i < snp_data.log2_cov.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "],\n";
+    json_file << "    \"states\": [";
+        for (size_t i = 0; i < snp_data.state_sequence.size(); ++i)
+        {
+            json_file << snp_data.state_sequence[i];
+            if (i < snp_data.state_sequence.size() - 1)
+                json_file << ", ";
+        }
+        json_file << "]\n";
+    json_file << "  }\n";
+    json_file << "}\n";
+    json_file.close();
+    printMessage("Saved copy number predictions for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + " to " + filepath);
+}
+
 void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp)
 {
     // Update the SNP data
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 7a073dae..5661eb3b 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -35,6 +35,7 @@ InputData::InputData()
     this->verbose = false;
     this->save_cnv_data = false;
     this->single_chr = false;
+    this->cnv_output_file = "";
 }
 
 std::string InputData::getShortReadBam() const
@@ -403,3 +404,13 @@ bool InputData::getSaveCNVData() const
 {
     return this->save_cnv_data;
 }
+
+void InputData::setCNVOutputFile(std::string filepath)
+{
+    this->cnv_output_file = filepath;
+}
+
+std::string InputData::getCNVOutputFile() const
+{
+    return this->cnv_output_file;
+}
diff --git a/src/khmm.cpp b/src/khmm.cpp
index fcc4899d..43ed3958 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -55,15 +55,8 @@ std::pair<std::vector<int>, double> testVit_CHMM(CHMM hmm, int T, std::vector<do
 	return state_sequence;
 }
 
-// double b1iot(int state, double *mean, double *sd, double uf, double o)
 double b1iot(int state, std::vector<double> mean, std::vector<double> sd, double uf, double o)
 {
-	// if (o < mean[1])
-	// {
-	// 	o = mean[1];
-	// }
-	// double p = uf + ((1 - uf) * pdf_normal(o, mean[state], sd[state]));
-
 	// Get the values (0-based indexing)
 	if (o < mean[0])
 	{
@@ -74,22 +67,8 @@ double b1iot(int state, std::vector<double> mean, std::vector<double> sd, double
 	return log(p);
 }
 
-// double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b)
 double b2iot(int state, const std::vector<double> mean, const std::vector<double> sd, double uf, double pfb, double b)
 {
-	// double p = 0;
-	// double mean0 = mean[1];  // mean[1] = 0
-	// double mean25 = mean[2];  // mean[2] = 0.25
-	// double mean33 = mean[3];  // mean[3] = 0.33
-	// double mean50 = mean[4];  // mean[4] = 0.5
-	// double mean50_state1 = mean[5];  // mean[5] = 0.5
-	// double sd0 = sd[1];  // sd[1] = 0
-	// double sd25 = sd[2];  // sd[2] = 0.25
-	// double sd33 = sd[3];  // sd[3] = 0.33
-	// double sd50 = sd[4];  // sd[4] = 0.5
-	// double sd50_state1 = sd[5];  // sd[5] = 0.5
-	// p = uf;  // UF = previous alpha (transition probability)
-
 	// Get the values (0-based indexing)
 	double p = 0;
 	double mean0 = mean[0];  // mean[0] = 0
@@ -275,7 +254,6 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 	{
 		for (j = 1; j <= hmm.N; j++)
 		{
-			// A1[i][j] = hmm.A[i][j];
 			// Update for 0-based indexing
 			A1[i][j] = hmm.A[i-1][j-1];
 		}
@@ -333,11 +311,7 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 
 	/* 1. Initialization  */
 	for (i = 1; i <= hmm.N; i++)
-	{
-		// delta[1][i] = hmm.pi[i] + biot[i][1];  // Initialize the delta matrix
-		// (log probability) to the initial state distribution + the emission
-		// probability
-		
+	{	
 		// Update to 0-based indexing
 		delta[1][i] = hmm.pi[i-1] + biot[i][1];  // Initialize the delta matrix
 		psi[1][i] = 0;  // Initialize the psi matrix (state sequence) to 0 (no state)
@@ -396,20 +370,8 @@ std::pair<std::vector<int>, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect
 		q[t] = psi[t + 1][q[t + 1]];
 	}
 
-	// // Print t, the state, delta, biot, and psi
-	// for (t = 1; t <= T; t++)
-	// {
-	// 	std::cout << "Time " << t << " with state " << q[t] << ":" << std::endl;
-	// 	for (i = 1; i <= hmm.N; i++)
-	// 	{
-	// 		std::cout << "State " << i << ": delta = " << delta[t][i] << ", biot = " << biot[i][t] << ", psi = " << psi[t][i] << ", LRR = " << O1[t-1] << ", BAF = " << O2[t-1] << std::endl;
-	// 	}
-	// 	std::cout << std::endl;
-	// }
-
 	for (i = 1; i <= hmm.N; i++)
 	{ /*recover the HMM model as original*/
-		// hmm.pi[i] = exp(hmm.pi[i]);
 		// Update to 0-based indexing
 		hmm.pi[i-1] = exp(hmm.pi[i-1]);
 	}
diff --git a/src/main.cpp b/src/main.cpp
index e493cd4e..b793619b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -72,6 +72,19 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
         input_data.setDBSCAN_MinPtsPct(std::stod(args.at("min-pts-pct")));
     }
 
+    // Set up the CNV JSON file if enabled
+    if (input_data.getSaveCNVData()) {
+        const std::string output_dir = input_data.getOutputDir();
+        std::string json_filepath = output_dir + "/CNVCalls.json";
+        int json_file_count = 1;
+        while (fileExists(json_filepath)) {
+            json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json";
+            json_file_count++;
+        }
+        input_data.setCNVOutputFile(json_filepath);
+        std::cout << "Saving CNV data to: " << json_filepath << std::endl;
+    }
+    
     // Run ContextSV
     run(input_data);
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index e2d429bb..3dc567bc 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -373,7 +373,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // If two positions were found, use the 5'most position
                         primary_pos = std::min(primary_pos, primary_pos2);
                     }
-                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size);
+                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITDIST1", "./.", 0.0, 0, 0, primary_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -445,12 +445,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // If the read distance is < 30bp while the SV is > 2kb, then this is a
             // potential deletion
             if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-                SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size);
+                SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
 
                 // Add an inversion call if necessary
                 if (inversion) {
-                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, "<INV>", "INVDEL", "./.", 0.0, 0, 0, cluster_size);
+                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, "<INV>", "SPLITINV", "./.", 0.0, 0, 0, cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -459,7 +459,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             else if (sv_length >= min_length && sv_length <= max_length) {
                 SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
                 std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
-                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size);
+                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "SPLIT", "./.", 0.0, 0, 0, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
             }
         }
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index e4bb4699..9934805a 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -18,10 +18,6 @@ bool SVCall::operator<(const SVCall & other) const
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
-    // if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) {
-    //     return;
-    // }
-
     // Check if the SV call is valid
     if (sv_call.start > sv_call.end) {
         printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end));
@@ -232,7 +228,9 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
     });
     for (size_t i = 0; i < sv_calls.size(); i++) {
         SVCall& sv_call = sv_calls[i];
-        if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.sv_type == sv_calls[i - 1].sv_type) {
+        // Treat SVs at the same start position as duplicates when their SV types
+        // match or when either call's type is UNKNOWN (the checks below pick which to keep)
+        if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) {
             // Keep the SV call with a non-zero likelihood
             // The HMM prediction is more reliable than the split read prediction
             if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
diff --git a/src/utils.cpp b/src/utils.cpp
index bb82abbc..884139a5 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -6,6 +6,7 @@
 #include <stdio.h>
 #include <string>
 #include <iostream>
+#include <fstream>
 /// @endcond
 
 
@@ -15,10 +16,7 @@ std::mutex print_mtx;
 // Print a progress bar
 void printProgress(int progress, int total)
 {
-    // Get the percentage
     float percent = (float)progress / (float)total * 100.0;
-
-    // Get the number of hashes
     int num_hashes = (int)(percent / 2.0);
 
     // Print the progress bar
@@ -121,3 +119,9 @@ void printMemoryUsage(const std::string& functionName) {
     std::cout << functionName << " memory usage: "
               << std::fixed << std::setprecision(2) << mem_usage_gb << " GB" << std::endl;
 }
+
+bool fileExists(const std::string &filepath)
+{
+    std::ifstream file(filepath);
+    return file.is_open();
+}

From 53cc23dc2a1a175aa603c4f29367e435c4a6b9e5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 17:43:47 -0400
Subject: [PATCH 089/134] fix json format

---
 include/utils.h          |  4 +++
 python/cnv_plots_json.py | 78 ++++++++++++++++++++++++++++++++++++++++
 src/sv_caller.cpp        | 11 +++++-
 src/utils.cpp            | 18 ++++++++++
 4 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100644 python/cnv_plots_json.py

diff --git a/include/utils.h b/include/utils.h
index e7bf164a..9f40d9d5 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -61,4 +61,8 @@ void printMemoryUsage(const std::string &functionName);
 
 bool fileExists(const std::string &filepath);
 
+void openJSON(const std::string & filepath);
+
+void closeJSON(const std::string & filepath);
+
 #endif // UTILS_H
diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py
new file mode 100644
index 00000000..5fad7319
--- /dev/null
+++ b/python/cnv_plots_json.py
@@ -0,0 +1,78 @@
+import plotly.graph_objs as go
+import json
+import argparse
+
+# Set up argument parser
+parser = argparse.ArgumentParser(description='Generate CNV plots from JSON data.')
+parser.add_argument('json_file', type=str, help='Path to the JSON file containing SV data')
+args = parser.parse_args()
+
+# Load your JSON data
+with open(args.json_file) as f:
+    sv_data = json.load(f)
+
+# Loop through each SV (assuming your JSON contains multiple SVs)
+for sv in sv_data:
+    print(type(sv))
+
+    # Extract data for plotting
+    positions_before = sv['before_sv']['positions']
+    b_allele_freq_before = sv['before_sv']['b_allele_freq']
+    positions_after = sv['after_sv']['positions']
+    b_allele_freq_after = sv['after_sv']['b_allele_freq']
+    
+    # Generate hover text (optional, can be customized)
+    hover_text_before = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_before, b_allele_freq_before)]
+    hover_text_after = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_after, b_allele_freq_after)]
+    
+    # Plotting data for 'before_sv' and 'after_sv'
+    baf_trace_before = go.Scatter(
+        x=positions_before,
+        y=b_allele_freq_before,
+        mode="markers+lines",
+        name="B-Allele Frequency (Before SV)",
+        text=hover_text_before,
+        marker=dict(
+            color='blue',
+            size=10
+        ),
+        line=dict(
+            color="black",
+            width=0
+        ),
+        showlegend=False
+    )
+
+    baf_trace_after = go.Scatter(
+        x=positions_after,
+        y=b_allele_freq_after,
+        mode="markers+lines",
+        name="B-Allele Frequency (After SV)",
+        text=hover_text_after,
+        marker=dict(
+            color='red',
+            size=10
+        ),
+        line=dict(
+            color="black",
+            width=0
+        ),
+        showlegend=False
+    )
+    
+    # Create layout for the plot
+    layout = go.Layout(
+        title=f"SV Plot: {sv['chromosome']} {sv['start']}-{sv['end']} ({sv['sv_type']})",
+        xaxis=dict(title="Position"),
+        yaxis=dict(title="B-Allele Frequency"),
+        hovermode='closest'
+    )
+    
+    # Create figure with data and layout
+    fig = go.Figure(data=[baf_trace_before, baf_trace_after], layout=layout)
+    
+    # Save the plot to an HTML file (use a unique filename per SV)
+    file_name = f"output/SV_{sv['chromosome']}_{sv['start']}_{sv['end']}.html"
+    fig.write_html(file_name)
+
+    print(f"Plot saved as {file_name}")
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3dc567bc..00aaff67 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -727,6 +727,12 @@ void SVCaller::run(const InputData& input_data)
     std::cout << "Reading HMM from file: " << hmm_filepath << std::endl;
     const CHMM& hmm = ReadCHMM(hmm_filepath.c_str());
 
+    // Set up the JSON output file for CNV data
+    const std::string& json_fp = input_data.getCNVOutputFile();
+    if (input_data.getSaveCNVData()) {
+        openJSON(json_fp);
+    }
+
     // Calculate the mean chromosome coverage and generate the position depth
     // maps for each chromosome (I/O is multi-threaded, which is more efficient
     // than per-chromosome multi-threading in this case)
@@ -844,7 +850,10 @@ void SVCaller::run(const InputData& input_data)
             this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
         }
     }
-    
+    if (input_data.getSaveCNVData()) {
+        closeJSON(json_fp);
+    }
+
     printMessage("Unifying SVs...");
     for (auto& entry : whole_genome_split_sv_calls) {
         const std::string& chr = entry.first;
diff --git a/src/utils.cpp b/src/utils.cpp
index 884139a5..553af91d 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -125,3 +125,21 @@ bool fileExists(const std::string &filepath)
     std::ifstream file(filepath);
     return file.is_open();
 }
+
+void openJSON(const std::string &filepath)
+{
+    // Create (or truncate) the file and write the opening "[" of the JSON array
+    std::ofstream json_file(filepath);
+    if (!json_file.is_open())  // Report the failure instead of silently writing nothing
+        std::cerr << "ERROR: Could not open JSON file for writing: " << filepath << std::endl;
+    json_file << "[\n";  // No-op on a failed stream; the destructor flushes and closes the file
+}
+
+void closeJSON(const std::string &filepath)
+{
+    // Append the closing "]" of the JSON array to the existing file
+    std::ofstream json_file(filepath, std::ios::app);
+    if (!json_file.is_open())  // Report the failure instead of silently writing nothing
+        std::cerr << "ERROR: Could not open JSON file for writing: " << filepath << std::endl;
+    json_file << "]";  // No-op on a failed stream; the destructor flushes and closes the file
+}

From 56fc8d2c7a34273519e24f2e4826535e7f2f393d Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 19:39:26 -0400
Subject: [PATCH 090/134] json plots

---
 python/cnv_plots_json.py | 239 +++++++++++++++++++++++++++++++--------
 src/cnv_caller.cpp       |  22 +++-
 2 files changed, 209 insertions(+), 52 deletions(-)

diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py
index 5fad7319..9190b53a 100644
--- a/python/cnv_plots_json.py
+++ b/python/cnv_plots_json.py
@@ -1,4 +1,5 @@
-import plotly.graph_objs as go
+import plotly
+from plotly.subplots import make_subplots
 import json
 import argparse
 
@@ -11,68 +12,206 @@
 with open(args.json_file) as f:
     sv_data = json.load(f)
 
+# State marker colors
+# https://community.plotly.com/t/plotly-colours-list/11730/6
+state_colors_dict = {
+    '1': 'red',
+    '2': 'darkred',
+    '3': 'darkgreen',
+    '4': 'green',
+    '5': 'darkblue',
+    '6': 'blue',
+}
+
+sv_type_dict = {
+    'DEL': 'Deletion',
+    'DUP': 'Duplication',
+    'INV': 'Inversion'
+}
+
 # Loop through each SV (assuming your JSON contains multiple SVs)
 for sv in sv_data:
-    print(type(sv))
 
     # Extract data for plotting
     positions_before = sv['before_sv']['positions']
     b_allele_freq_before = sv['before_sv']['b_allele_freq']
     positions_after = sv['after_sv']['positions']
     b_allele_freq_after = sv['after_sv']['b_allele_freq']
+
+    # Create a subplot for the CNV plot and the BAF plot.
+    fig = make_subplots(
+        rows=2,
+        cols=1,
+        shared_xaxes=True,
+        vertical_spacing=0.05,
+        subplot_titles=(r"SNP Log<sub>2</sub> Ratio", "SNP B-Allele Frequency")
+        )
+
+    # Get the chromosome, start, end, and sv_type from the SV data
+    chromosome = sv['chromosome']
+    start = sv['start']
+    end = sv['end']
+    sv_type = sv['sv_type']
+    likelihood = sv['likelihood']
+    sv_length = sv['size']
+
+    # Plot the data for 'before_sv', 'sv', and 'after_sv'
+    for section in ["before_sv", "sv", "after_sv"]:
+        positions = sv[section]['positions']
+        b_allele_freq = sv[section]['b_allele_freq']
+        population_freq = sv[section]['population_freq']
+        log2_ratio = sv[section]['log2_ratio']
+
+        if section == "sv":
+            is_snp = sv[section]['is_snp']
+            states = sv[section]['states']
+            state_colors = [state_colors_dict[str(state)] for state in states]
+            marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp]
+
+            # Set the hover text
+            hover_text = []
+            for i, position in enumerate(positions):
+                # Add hover text for each point
+                hover_text.append(
+                    f"Position: {position}<br>"
+                    f"State: {states[i]}<br>"
+                    f"Log2 Ratio: {log2_ratio[i]}<br>"
+                    f"SNP: {is_snp[i]}<br>"
+                    f"BAF: {b_allele_freq[i]}<br>"
+                    f"Population Frequency: {population_freq[i]}<br>"
+                )
+        else:
+            state_colors = ['black'] * len(positions)
+            marker_symbols = ['circle-open'] * len(positions)
+            hover_text = []
+            for i, position in enumerate(positions):
+                # Add hover text for each point
+                hover_text.append(
+                    f"Position: {position}<br>"
+                    f"Log2 Ratio: {log2_ratio[i]}<br>"
+                    f"BAF: {b_allele_freq[i]}<br>"
+                    f"Population Frequency: {population_freq[i]}<br>"
+                )
+
+        # Create the log2 trace
+        log2_trace = plotly.graph_objs.Scatter(
+            x=positions,
+            y=log2_ratio,
+            mode='markers+lines',
+            name=r'Log<sub>2</sub> Ratio',
+            text=hover_text,
+            hoverinfo='text',
+            marker=dict(
+                color=state_colors,
+                size=10,
+                symbol=marker_symbols,
+            ),
+            line=dict(
+                color='black',
+                width=0
+            ),
+            showlegend=False
+        )
+
+        # Create the BAF trace
+        baf_trace = plotly.graph_objs.Scatter(
+            x=positions,
+            y=b_allele_freq,
+            mode='markers+lines',
+            name='B-Allele Frequency',
+            text=hover_text,
+            hoverinfo='text',
+            marker=dict(
+                color=state_colors,
+                size=10,
+                symbol=marker_symbols,
+            ),
+            line=dict(
+                color='black',
+                width=0
+            ),
+            showlegend=False
+        )
+
+        if section == "sv":
+            # Create a shaded rectangle for the CNV, layering it below the CNV
+            # trace and labeling it with the CNV type.
+            fig.add_vrect(
+                x0 = start,
+                x1 = end,
+                fillcolor = "Black",
+                layer = "below",
+                line_width = 0,
+                opacity = 0.1,
+                annotation_text = '',
+                annotation_position = "top left",
+                annotation_font_size = 20,
+                annotation_font_color = "black"
+            )
+
+            # Add vertical lines at the start and end positions of the CNV.
+            fig.add_vline(
+                x = start,
+                line_width = 2,
+                line_color = "black",
+                layer = "below"
+            )
+
+            fig.add_vline(
+                x = end,
+                line_width = 2,
+                line_color = "black",
+                layer = "below"
+            )
+
+        # Add traces to the figure
+        fig.append_trace(log2_trace, row=1, col=1)
+        fig.append_trace(baf_trace, row=2, col=1)
     
-    # Generate hover text (optional, can be customized)
-    hover_text_before = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_before, b_allele_freq_before)]
-    hover_text_after = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_after, b_allele_freq_after)]
-    
-    # Plotting data for 'before_sv' and 'after_sv'
-    baf_trace_before = go.Scatter(
-        x=positions_before,
-        y=b_allele_freq_before,
-        mode="markers+lines",
-        name="B-Allele Frequency (Before SV)",
-        text=hover_text_before,
-        marker=dict(
-            color='blue',
-            size=10
-        ),
-        line=dict(
-            color="black",
-            width=0
-        ),
-        showlegend=False
+    # Set the x-axis title.
+    fig.update_xaxes(
+        title_text = "Chromosome Position",
+        row = 2,
+        col = 1
     )
 
-    baf_trace_after = go.Scatter(
-        x=positions_after,
-        y=b_allele_freq_after,
-        mode="markers+lines",
-        name="B-Allele Frequency (After SV)",
-        text=hover_text_after,
-        marker=dict(
-            color='red',
-            size=10
-        ),
-        line=dict(
-            color="black",
-            width=0
-        ),
-        showlegend=False
+    # Set the y-axis titles.
+    fig.update_yaxes(
+        title_text = r"Log<sub>2</sub> Ratio",
+        row = 1,
+        col = 1
     )
-    
-    # Create layout for the plot
-    layout = go.Layout(
-        title=f"SV Plot: {sv['chromosome']} {sv['start']}-{sv['end']} ({sv['sv_type']})",
-        xaxis=dict(title="Position"),
-        yaxis=dict(title="B-Allele Frequency"),
-        hovermode='closest'
+
+    fig.update_yaxes(
+        title_text = "B-Allele Frequency",
+        row = 2,
+        col = 1
     )
-    
-    # Create figure with data and layout
-    fig = go.Figure(data=[baf_trace_before, baf_trace_after], layout=layout)
-    
+
+    # Set the Y-axis range for the log2 ratio plot.
+    fig.update_yaxes(
+        range = [-2.0, 2.0],
+        row = 1,
+        col = 1
+    )
+
+    # Set the Y-axis range for the BAF plot.
+    fig.update_yaxes(
+        range = [-0.2, 1.2],
+        row = 2,
+        col = 1
+    )
+
+    # Set the plot title; SV types without a display name fall back to the raw type string.
+    fig.update_layout(
+        title_text = f"{sv_type_dict.get(sv_type, sv_type)} at {chromosome}:{start}-{end} ({sv_length} bp) (LLH={likelihood})",
+        title_x = 0.5,
+        showlegend = False,
+    )
+    #     height = 800,
+    #     width = 800
+    # )
     # Save the plot to an HTML file (use a unique filename per SV)
-    file_name = f"output/SV_{sv['chromosome']}_{sv['start']}_{sv['end']}.html"
+    file_name = f"output/SV_{chromosome}_{start}_{end}.html"
     fig.write_html(file_name)
-
     print(f"Plot saved as {file_name}")
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index b5d61745..7d6d6636 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -238,9 +238,18 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
     if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 50000)
     {
+        // Set B-allele and population frequency values to 0 for non-SNPs
+        for (size_t i = 0; i < snp_data.pos.size(); i++)
+        {
+            if (!snp_data.is_snp[i])
+            {
+                snp_data.baf[i] = 0.0;
+                snp_data.pfb[i] = 0.0;
+            }
+        }
+
+        // Save the SNP data to JSON
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
-        // const std::string output_dir = input_data.getOutputDir();
-        // std::string json_filepath = output_dir + "/CNVCalls.json";
         std::string json_filepath = input_data.getCNVOutputFile();
         printMessage("Saving SV copy number predictions to " + json_filepath + "...");
 
@@ -856,6 +865,7 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
     json_file << "  \"end\": " << end << ",\n";
     json_file << "  \"sv_type\": \"" << sv_type << "\",\n";
     json_file << "  \"likelihood\": " << likelihood << ",\n";
+    json_file << "  \"size\": " << (end - start + 1) << ",\n";
     json_file << "  \"before_sv\": {\n";
     json_file << "    \"positions\": [";
         for (size_t i = 0; i < before_sv.pos.size(); ++i)
@@ -964,6 +974,14 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
             if (i < snp_data.state_sequence.size() - 1)
                 json_file << ", ";
         }
+        json_file << "],\n";
+    json_file << "    \"is_snp\": [";
+        for (size_t i = 0; i < snp_data.is_snp.size(); ++i)
+        {
+            json_file << snp_data.is_snp[i];
+            if (i < snp_data.is_snp.size() - 1)
+                json_file << ", ";
+        }
         json_file << "]\n";
     json_file << "  }\n";
     json_file << "}\n";

From 2e23c50137c0f10ae4b2dd5361ea320e61193e09 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 20:43:02 -0400
Subject: [PATCH 091/134] fix multi-cnv json

---
 include/utils.h    |  2 +-
 src/cnv_caller.cpp | 13 ++++++++++++-
 src/sv_caller.cpp  |  6 +++---
 src/utils.cpp      | 14 ++++++--------
 4 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index 9f40d9d5..6eb1237d 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -61,7 +61,7 @@ void printMemoryUsage(const std::string &functionName);
 
 bool fileExists(const std::string &filepath);
 
-void openJSON(const std::string & filepath);
+bool isFileEmpty(const std::string &filepath);
 
 void closeJSON(const std::string & filepath);
 
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 7d6d6636..08e4fb4d 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -859,6 +859,17 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
         std::cerr << "ERROR: Could not open JSON file for writing: " << filepath << std::endl;
         exit(1);
     }
+
+    // If not the first record, write the closing bracket
+    // Check if file is empty
+    if (isFileEmpty(filepath))
+    {
+        json_file << "[\n";
+    } else {
+        // Close the previous JSON object
+        json_file << "},\n";
+    }
+
     json_file << "{\n";
     json_file << "  \"chromosome\": \"" << chr << "\",\n";
     json_file << "  \"start\": " << start << ",\n";
@@ -984,7 +995,7 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
         }
         json_file << "]\n";
     json_file << "  }\n";
-    json_file << "}\n";
+    // json_file << "},\n";
     json_file.close();
     printMessage("Saved copy number predictions for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + " to " + filepath);
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 00aaff67..5fee517e 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -729,9 +729,9 @@ void SVCaller::run(const InputData& input_data)
 
     // Set up the JSON output file for CNV data
     const std::string& json_fp = input_data.getCNVOutputFile();
-    if (input_data.getSaveCNVData()) {
-        openJSON(json_fp);
-    }
+    // if (input_data.getSaveCNVData()) {
+    //     openJSON(json_fp);
+    // }
 
     // Calculate the mean chromosome coverage and generate the position depth
     // maps for each chromosome (I/O is multi-threaded, which is more efficient
diff --git a/src/utils.cpp b/src/utils.cpp
index 553af91d..a27263b7 100644
--- a/src/utils.cpp
+++ b/src/utils.cpp
@@ -7,6 +7,7 @@
 #include <string>
 #include <iostream>
 #include <fstream>
+#include <filesystem>
 /// @endcond
 
 
@@ -126,20 +127,17 @@ bool fileExists(const std::string &filepath)
     return file.is_open();
 }
 
-void openJSON(const std::string &filepath)
+bool isFileEmpty(const std::string &filepath)
 {
-    // Add the initial [ and close
-    std::ofstream
-        json_file(filepath);
-    json_file << "[\n";
-    json_file.close();
+    return std::filesystem::file_size(filepath) == 0;
 }
 
 void closeJSON(const std::string &filepath)
 {
-    // Add the final ] and close
     std::ofstream
         json_file(filepath, std::ios::app);
-    json_file << "]";
+
+    json_file << "}\n";  // Close the last JSON object
+    json_file << "]";  // Close the JSON array
     json_file.close();
 }

From 347e5ad42077ff20c7c98108ade6672f5219d46f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 21:07:42 -0400
Subject: [PATCH 092/134] update plots

---
 python/cnv_plots_json.py |  4 +++-
 src/cnv_caller.cpp       | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py
index 9190b53a..31a59110 100644
--- a/python/cnv_plots_json.py
+++ b/python/cnv_plots_json.py
@@ -81,8 +81,10 @@
                     f"Population Frequency: {population_freq[i]}<br>"
                 )
         else:
+            is_snp = sv[section]['is_snp']
             state_colors = ['black'] * len(positions)
-            marker_symbols = ['circle-open'] * len(positions)
+            # marker_symbols = ['circle-open'] * len(positions)
+            marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp]
             hover_text = []
             for i, position in enumerate(positions):
                 # Add hover text for each point
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 08e4fb4d..b5301a1e 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -247,6 +247,22 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
                 snp_data.pfb[i] = 0.0;
             }
         }
+        for (size_t i = 0; i < before_sv.pos.size(); i++)
+        {
+            if (!before_sv.is_snp[i])
+            {
+                before_sv.baf[i] = 0.0;
+                before_sv.pfb[i] = 0.0;
+            }
+        }
+        for (size_t i = 0; i < after_sv.pos.size(); i++)
+        {
+            if (!after_sv.is_snp[i])
+            {
+                after_sv.baf[i] = 0.0;
+                after_sv.pfb[i] = 0.0;
+            }
+        }
 
         // Save the SNP data to JSON
         std::string cnv_type_str = getSVTypeString(predicted_cnv_type);
@@ -909,6 +925,14 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
             if (i < before_sv.log2_cov.size() - 1)
                 json_file << ", ";
         }
+        json_file << "],\n";
+    json_file << "    \"is_snp\": [";
+        for (size_t i = 0; i < snp_data.is_snp.size(); ++i)
+        {
+            json_file << snp_data.is_snp[i];
+            if (i < snp_data.is_snp.size() - 1)
+                json_file << ", ";
+        }
         json_file << "]\n";
     json_file << "  },\n";
     json_file << "  \"after_sv\": {\n";
@@ -943,6 +967,14 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
             if (i < after_sv.log2_cov.size() - 1)
                 json_file << ", ";
         }
+        json_file << "],\n";
+    json_file << "    \"is_snp\": [";
+        for (size_t i = 0; i < snp_data.is_snp.size(); ++i)
+        {
+            json_file << snp_data.is_snp[i];
+            if (i < snp_data.is_snp.size() - 1)
+                json_file << ", ";
+        }
         json_file << "]\n";
     json_file << "  },\n";
     json_file << "  \"sv\": {\n";

From 708e7823079740549a272d233b20c50f79f94be8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 22:47:30 -0400
Subject: [PATCH 093/134] simplify snp analysis

---
 include/cnv_caller.h |   2 +-
 src/cnv_caller.cpp   | 108 ++++++++++++++++++++---------------------
 src/sv_caller.cpp    | 111 +++++++++++++++++++++----------------------
 3 files changed, 110 insertions(+), 111 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index b424b6c2..609238ef 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -91,7 +91,7 @@ class CNVCaller {
 
         void calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const;
 
-        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, const InputData& input_data) const;
+        void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb, const InputData& input_data) const;
 
         // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions
         void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const;
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index b5301a1e..dc978c22 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -55,24 +55,23 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     // Initialize the SNP data with default values and sample size length
     int sample_size = input_data.getSampleSize();
     std::vector<uint32_t> snp_pos;
-    std::vector<double> snp_baf;
-    std::vector<double> snp_pfb;
-    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, input_data);
+    std::unordered_map<uint32_t, double> snp_baf_map;
+    std::unordered_map<uint32_t, double> snp_pfb_map;
+    this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf_map, snp_pfb_map, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
     sample_size = std::max((int) snp_pos.size(), sample_size);
-    std::vector<uint32_t> snp_pos_hmm;
-    std::vector<double> snp_baf_hmm;
-    std::vector<double> snp_pfb_hmm;
-    std::vector<double> snp_log2_hmm;
-    std::vector<bool> is_snp_hmm;
 
     // Loop through evenly spaced positions in the region and get the log2 ratio
     double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size;
     std::unordered_set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
+    std::unordered_map<std::string, double> window_log2_map;
     for (int i = 0; i < sample_size; i++)
     {
+        uint32_t window_start = (uint32_t) (start_pos + i * pos_step);
+        uint32_t window_end = (uint32_t) (start_pos + (i + 1) * pos_step);
+
         // Calculate the mean depth for the window
         double cov_sum = 0.0;
         int pos_count = 0;
@@ -83,15 +82,11 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
             {
                 break;
             }
-            try
-            {
-                cov_sum += pos_depth_map.at(pos);
+            if (pos < pos_depth_map.size()) {
+                cov_sum += pos_depth_map[pos];
                 pos_count++;
             }
-            catch (const std::out_of_range& e)
-            {
-                // Ignore out of range errors
-            }
+
         }
         double log2_cov = 0.0;
         if (pos_count > 0)
@@ -104,34 +99,47 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
             log2_cov = log2((cov_sum / (double) pos_count) / mean_chr_cov);
         }
 
-        // Loop through positions and get the log2 ratio
-        bool snp_found_in_sample = false;
-        for (int j = 0; j < pos_step; j++)
-        {
-            uint32_t pos = (uint32_t) (start_pos + i * pos_step + j);
-            if (pos > end_pos)
-            {
-                break;
-            }
+        // Store the log2 ratio for the window
+        std::string window_key = std::to_string(window_start) + "-" + std::to_string(window_end);
+        window_log2_map[window_key] = log2_cov;
+    }
 
-            // Check if the position is a SNP
-            if (snp_pos_set.find(pos) != snp_pos_set.end())
+    // Create new vectors for the SNP data
+    std::vector<uint32_t> snp_pos_hmm;
+    std::vector<double> snp_baf_hmm;
+    std::vector<double> snp_pfb_hmm;
+    std::vector<double> snp_log2_hmm;
+    std::vector<bool> is_snp_hmm;
+
+    // Loop through the window ranges and append all SNPs in the range, using
+    // the log2 ratio for the window
+    for (const auto& window : window_log2_map)
+    {
+        uint32_t window_start = std::stoi(window.first.substr(0, window.first.find('-')));
+        uint32_t window_end = std::stoi(window.first.substr(window.first.find('-') + 1));
+        double log2_cov = window.second;
+
+        // Loop through the SNP positions and add them to the SNP data
+        bool snp_found = false;
+        for (uint32_t pos : snp_pos)
+        {
+            if (pos >= window_start && pos <= window_end)
             {
-                // Update the SNP data
                 snp_pos_hmm.push_back(pos);
-                snp_baf_hmm.push_back(snp_baf[i]);
-                snp_pfb_hmm.push_back(snp_pfb[i]);
+                snp_baf_hmm.push_back(snp_baf_map[pos]);
+                snp_pfb_hmm.push_back(snp_pfb_map[pos]);
                 snp_log2_hmm.push_back(log2_cov);
                 is_snp_hmm.push_back(true);
-                snp_found_in_sample = true;
+                snp_found = true;
             }
         }
-
-        // If no SNP was found in the sample, then use the center position
-        if (!snp_found_in_sample)
+        if (!snp_found)
         {
-            uint32_t pos = (uint32_t) (start_pos + (i * pos_step) + (pos_step / 2.0));
-            snp_pos_hmm.push_back(pos);
+            // If no SNPs were found in the window, add a dummy SNP with the
+            // log2 ratio for the window, using the window center as the SNP
+            // position
+            uint32_t window_center = (window_start + window_end) / 2;
+            snp_pos_hmm.push_back(window_center);
             snp_baf_hmm.push_back(-1.0);
             snp_pfb_hmm.push_back(0.5);
             snp_log2_hmm.push_back(log2_cov);
@@ -522,7 +530,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
     }
 }
 
-void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::vector<double>& snp_baf, std::vector<double>& snp_pfb, const InputData& input_data) const
+void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb, const InputData& input_data) const
 {
     // Lock during reading
     std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
@@ -700,8 +708,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
             // Add the SNP position and BAF information
             snp_pos.push_back(pos);
-            snp_baf.push_back(baf);
-            snp_pfb.push_back(0.5);
+            snp_baf[pos] = baf;
+            printMessage("SNP found: " + chr + ":" + std::to_string(pos) + " BAF: " + std::to_string(baf));
             snp_found = true;
         }
     }
@@ -724,11 +732,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     uint32_t min_snp_pos = *std::min_element(snp_pos.begin(), snp_pos.end());
     uint32_t max_snp_pos = *std::max_element(snp_pos.begin(), snp_pos.end());
     std::unordered_set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
-    std::unordered_map<uint32_t, double> snp_index_map;
-    for (size_t i = 0; i < snp_pos.size(); i++)
-    {
-        snp_index_map[snp_pos[i]] = i;
-    }
     if (use_pfb)
     {
         // Set the region for the population allele frequency reader
@@ -757,9 +760,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
                 continue;  // Skip if the SNP position is not in the set
             }
 
-            // Get the SNP position index
-            size_t i = snp_index_map[pfb_pos];
-
             // Get the population frequency for the SNP
             int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count);
             if (pfb_status < 0 || count == 0)
@@ -773,7 +773,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             {
                 continue;
             }
-            snp_pfb[i] = pfb;
+            // snp_pfb[i] = pfb;
+            snp_pfb[pfb_pos] = pfb;
+            printMessage("Population frequency found: " + chr + ":" + std::to_string(pfb_pos) + " PFB: " + std::to_string(pfb));
             break;
         }
         free(pfb_f);
@@ -927,10 +929,10 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
         }
         json_file << "],\n";
     json_file << "    \"is_snp\": [";
-        for (size_t i = 0; i < snp_data.is_snp.size(); ++i)
+        for (size_t i = 0; i < before_sv.is_snp.size(); ++i)
         {
-            json_file << snp_data.is_snp[i];
-            if (i < snp_data.is_snp.size() - 1)
+            json_file << before_sv.is_snp[i];
+            if (i < before_sv.is_snp.size() - 1)
                 json_file << ", ";
         }
         json_file << "]\n";
@@ -969,10 +971,10 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
         }
         json_file << "],\n";
     json_file << "    \"is_snp\": [";
-        for (size_t i = 0; i < snp_data.is_snp.size(); ++i)
+        for (size_t i = 0; i < after_sv.is_snp.size(); ++i)
         {
-            json_file << snp_data.is_snp[i];
-            if (i < snp_data.is_snp.size() - 1)
+            json_file << after_sv.is_snp[i];
+            if (i < after_sv.is_snp.size() - 1)
                 json_file << ", ";
         }
         json_file << "]\n";
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 5fee517e..08530c16 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -729,9 +729,6 @@ void SVCaller::run(const InputData& input_data)
 
     // Set up the JSON output file for CNV data
     const std::string& json_fp = input_data.getCNVOutputFile();
-    // if (input_data.getSaveCNVData()) {
-    //     openJSON(json_fp);
-    // }
 
     // Calculate the mean chromosome coverage and generate the position depth
     // maps for each chromosome (I/O is multi-threaded, which is more efficient
@@ -772,65 +769,65 @@ void SVCaller::run(const InputData& input_data)
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
-    int thread_count = 1;
-    if (!input_data.isSingleChr()) {
-        thread_count = input_data.getThreadCount();
-        std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
-    }
-    ThreadPool pool(thread_count);
-    auto process_chr = [&](const std::string& chr) {
-        try {
-            std::vector<SVCall> sv_calls;
-            std::vector<SVCall> split_sv_calls;
-            InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
-            {
-                std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
-                whole_genome_sv_calls[chr] = std::move(sv_calls);
-            }
-        } catch (const std::exception& e) {
-            printError("Error processing chromosome " + chr + ": " + e.what());
-        } catch (...) {
-            printError("Unknown error processing chromosome " + chr);
-        }
-    };
-
-    // Submit tasks to the thread pool and track futures
-    std::vector<std::future<void>> futures;
-    for (const auto& chr : chromosomes) {
-        futures.emplace_back(pool.enqueue([&, chr] {
-            // printMessage("Processing chromosome " + chr);
-            process_chr(chr);
-        }));
-    }
+    // int thread_count = 1;
+    // if (!input_data.isSingleChr()) {
+    //     thread_count = input_data.getThreadCount();
+    //     std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
+    // }
+    // ThreadPool pool(thread_count);
+    // auto process_chr = [&](const std::string& chr) {
+    //     try {
+    //         std::vector<SVCall> sv_calls;
+    //         std::vector<SVCall> split_sv_calls;
+    //         InputData chr_input_data = input_data;  // Use a thread-local copy
+    //         this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
+    //         {
+    //             std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
+    //             whole_genome_sv_calls[chr] = std::move(sv_calls);
+    //         }
+    //     } catch (const std::exception& e) {
+    //         printError("Error processing chromosome " + chr + ": " + e.what());
+    //     } catch (...) {
+    //         printError("Unknown error processing chromosome " + chr);
+    //     }
+    // };
+
+    // // Submit tasks to the thread pool and track futures
+    // std::vector<std::future<void>> futures;
+    // for (const auto& chr : chromosomes) {
+    //     futures.emplace_back(pool.enqueue([&, chr] {
+    //         // printMessage("Processing chromosome " + chr);
+    //         process_chr(chr);
+    //     }));
+    // }
 
-    // // Wait for all tasks to complete
-    for (auto& future : futures) {
-        try {
-            current_chr++;
-            future.get();
-        } catch (const std::exception& e) {
-            printError("Error processing chromosome task: " + std::string(e.what()));
-        } catch (...) {
-            printError("Unknown error processing chromosome task.");
-        }
-    }
-    printMessage("All tasks have finished.");
+    // // // Wait for all tasks to complete
+    // for (auto& future : futures) {
+    //     try {
+    //         current_chr++;
+    //         future.get();
+    //     } catch (const std::exception& e) {
+    //         printError("Error processing chromosome task: " + std::string(e.what()));
+    //     } catch (...) {
+    //         printError("Unknown error processing chromosome task.");
+    //     }
+    // }
+    // printMessage("All tasks have finished.");
 
     // -------------------------------------------------------
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    current_chr = 0;
-    printMessage("Running copy number predictions on CIGAR SVs...");
-    for (auto& entry : whole_genome_sv_calls) {
-        current_chr++;
-        const std::string& chr = entry.first;
-        std::vector<SVCall>& sv_calls = entry.second;
-        if (sv_calls.size() > 0) {
-            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
-        }
-    }
+    // current_chr = 0;
+    // printMessage("Running copy number predictions on CIGAR SVs...");
+    // for (auto& entry : whole_genome_sv_calls) {
+    //     current_chr++;
+    //     const std::string& chr = entry.first;
+    //     std::vector<SVCall>& sv_calls = entry.second;
+    //     if (sv_calls.size() > 0) {
+    //         printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
+    //         cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+    //     }
+    // }
     // -------------------------------------------------------
 
     // Identify split-SV signatures

From c6fe7af61eea8052f445b07505d443ff9fc664fe Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 23:08:02 -0400
Subject: [PATCH 094/134] remove debug output

---
 src/cnv_caller.cpp | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index dc978c22..26bca566 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -709,7 +709,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             // Add the SNP position and BAF information
             snp_pos.push_back(pos);
             snp_baf[pos] = baf;
-            printMessage("SNP found: " + chr + ":" + std::to_string(pos) + " BAF: " + std::to_string(baf));
             snp_found = true;
         }
     }
@@ -773,9 +772,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
             {
                 continue;
             }
-            // snp_pfb[i] = pfb;
             snp_pfb[pfb_pos] = pfb;
-            printMessage("Population frequency found: " + chr + ":" + std::to_string(pfb_pos) + " PFB: " + std::to_string(pfb));
             break;
         }
         free(pfb_f);

From 255b9a3a8086a7153fa15cbc6066c57044078fef Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 19 Mar 2025 23:24:49 -0400
Subject: [PATCH 095/134] remove test code

---
 src/sv_caller.cpp | 110 +++++++++++++++++++++++-----------------------
 1 file changed, 55 insertions(+), 55 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 08530c16..0abc413a 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -769,65 +769,65 @@ void SVCaller::run(const InputData& input_data)
 
     // Use multi-threading across chromosomes. If a single chromosome is
     // specified, use a single main thread (multi-threading is used for file I/O)
-    // int thread_count = 1;
-    // if (!input_data.isSingleChr()) {
-    //     thread_count = input_data.getThreadCount();
-    //     std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
-    // }
-    // ThreadPool pool(thread_count);
-    // auto process_chr = [&](const std::string& chr) {
-    //     try {
-    //         std::vector<SVCall> sv_calls;
-    //         std::vector<SVCall> split_sv_calls;
-    //         InputData chr_input_data = input_data;  // Use a thread-local copy
-    //         this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
-    //         {
-    //             std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
-    //             whole_genome_sv_calls[chr] = std::move(sv_calls);
-    //         }
-    //     } catch (const std::exception& e) {
-    //         printError("Error processing chromosome " + chr + ": " + e.what());
-    //     } catch (...) {
-    //         printError("Unknown error processing chromosome " + chr);
-    //     }
-    // };
-
-    // // Submit tasks to the thread pool and track futures
-    // std::vector<std::future<void>> futures;
-    // for (const auto& chr : chromosomes) {
-    //     futures.emplace_back(pool.enqueue([&, chr] {
-    //         // printMessage("Processing chromosome " + chr);
-    //         process_chr(chr);
-    //     }));
-    // }
-
-    // // // Wait for all tasks to complete
-    // for (auto& future : futures) {
-    //     try {
-    //         current_chr++;
-    //         future.get();
-    //     } catch (const std::exception& e) {
-    //         printError("Error processing chromosome task: " + std::string(e.what()));
-    //     } catch (...) {
-    //         printError("Unknown error processing chromosome task.");
-    //     }
-    // }
-    // printMessage("All tasks have finished.");
+    int thread_count = 1;
+    if (!input_data.isSingleChr()) {
+        thread_count = input_data.getThreadCount();
+        std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
+    }
+    ThreadPool pool(thread_count);
+    auto process_chr = [&](const std::string& chr) {
+        try {
+            std::vector<SVCall> sv_calls;
+            std::vector<SVCall> split_sv_calls;
+            InputData chr_input_data = input_data;  // Use a thread-local copy
+            this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
+            {
+                std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
+                whole_genome_sv_calls[chr] = std::move(sv_calls);
+            }
+        } catch (const std::exception& e) {
+            printError("Error processing chromosome " + chr + ": " + e.what());
+        } catch (...) {
+            printError("Unknown error processing chromosome " + chr);
+        }
+    };
+
+    // Submit tasks to the thread pool and track futures
+    std::vector<std::future<void>> futures;
+    for (const auto& chr : chromosomes) {
+        futures.emplace_back(pool.enqueue([&, chr] {
+            // printMessage("Processing chromosome " + chr);
+            process_chr(chr);
+        }));
+    }
+
+    // Wait for all tasks to complete
+    for (auto& future : futures) {
+        try {
+            current_chr++;
+            future.get();
+        } catch (const std::exception& e) {
+            printError("Error processing chromosome task: " + std::string(e.what()));
+        } catch (...) {
+            printError("Unknown error processing chromosome task.");
+        }
+    }
+    printMessage("All tasks have finished.");
 
     // -------------------------------------------------------
     // Run copy number variant predictions on the SVs detected from the
     // CIGAR string, using a minimum CNV length threshold
-    // current_chr = 0;
-    // printMessage("Running copy number predictions on CIGAR SVs...");
-    // for (auto& entry : whole_genome_sv_calls) {
-    //     current_chr++;
-    //     const std::string& chr = entry.first;
-    //     std::vector<SVCall>& sv_calls = entry.second;
-    //     if (sv_calls.size() > 0) {
-    //         printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
-    //         cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
-    //     }
-    // }
+    current_chr = 0;
+    printMessage("Running copy number predictions on CIGAR SVs...");
+    for (auto& entry : whole_genome_sv_calls) {
+        current_chr++;
+        const std::string& chr = entry.first;
+        std::vector<SVCall>& sv_calls = entry.second;
+        if (sv_calls.size() > 0) {
+            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
+            cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+        }
+    }
     // -------------------------------------------------------
 
     // Identify split-SV signatures

From 57c6c0c751294fd7acafb5b2c17b9258117ffcda Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 21 Mar 2025 14:07:29 -0400
Subject: [PATCH 096/134] add clipped base insertions

---
 src/sv_caller.cpp | 29 ++++++++++++++++++++++++++++-
 1 file changed, 28 insertions(+), 1 deletion(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 0abc413a..04338063 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -537,6 +537,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
         amb_bases_bitset.set(base);
         amb_bases_bitset.set(std::tolower(base));
     }
+
     for (int i = 0; i < cigar_len; i++) {
         int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
         int op = bam_cigar_op(cigar[i]);  // CIGAR operation
@@ -599,6 +600,33 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 }
                 SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0);
                 addSVCall(sv_calls, sv_call);
+            
+            // Process clipped bases as potential insertions
+            } else if (op == BAM_CSOFT_CLIP && is_primary) {
+                // Get the soft-clipped sequence from the query (candidate insertion)
+                std::string ins_seq_str(op_len, ' ');
+                for (int j = 0; j < op_len; j++) {
+                    // Replace ambiguous bases with N
+                    char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
+                    if (amb_bases_bitset.test(base)) {
+                        ins_seq_str[j] = 'N';
+                    } else {
+                        ins_seq_str[j] = base;
+                    }
+                }
+
+                // Add as an insertion
+                uint32_t ins_pos = pos + 1;
+                uint32_t ins_end = ins_pos + op_len - 1;
+                int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1);
+                
+                // Determine the ALT allele format based on small vs. large insertion
+                std::string alt_allele = "<INS>";
+                if (op_len <= 50) {
+                    alt_allele = ins_seq_str;
+                }
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARCLIP", "./.", default_lh, read_depth, 1, 0);
+                addSVCall(sv_calls, sv_call);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL && is_primary) {
@@ -778,7 +806,6 @@ void SVCaller::run(const InputData& input_data)
     auto process_chr = [&](const std::string& chr) {
         try {
             std::vector<SVCall> sv_calls;
-            std::vector<SVCall> split_sv_calls;
             InputData chr_input_data = input_data;  // Use a thread-local copy
             this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
             {

From 190a699528e16da7bced58bce1c6ad52e82a677a Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 22 Mar 2025 15:22:44 -0400
Subject: [PATCH 097/134] improve split read merging

---
 include/sv_object.h |   2 +-
 src/dbscan.cpp      |   3 -
 src/sv_caller.cpp   | 206 ++++++++++++++++++++++----------------------
 src/sv_object.cpp   |  92 +++++++++++++-------
 4 files changed, 165 insertions(+), 138 deletions(-)

diff --git a/include/sv_object.h b/include/sv_object.h
index d838e968..155b50bf 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -40,6 +40,6 @@ uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 void concatenateSVCalls(std::vector<SVCall>& sv_calls, const std::vector<SVCall>& sv_calls_update);
 
 // Merge SVs using DBSCAN clustering
-void mergeSVs(std::vector<SVCall> &sv_calls, double epsilon, int min_pts);
+void mergeSVs(std::vector<SVCall> &sv_calls, double epsilon, int min_pts, bool keep_noise);
 
 #endif // SV_OBJECT_H
diff --git a/src/dbscan.cpp b/src/dbscan.cpp
index d6c41346..c1f3f314 100644
--- a/src/dbscan.cpp
+++ b/src/dbscan.cpp
@@ -8,13 +8,10 @@
 
 void DBSCAN::fit(const std::vector<SVCall>& sv_calls) {
     int clusterId = 0;
-    // clusters.assign(points.size(), -1); // -1 means unclassified
     clusters.assign(sv_calls.size(), -1); // -1 means unclassified
 
-    // for (size_t i = 0; i < points.size(); ++i) {
     for (size_t i = 0; i < sv_calls.size(); ++i) {
         if (clusters[i] == -1) { // if point is not yet classified
-            // if (expandCluster(points, i, clusterId)) {
             if (expandCluster(sv_calls, i, clusterId)) {
                 ++clusterId;
             }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 04338063..9969342f 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -445,13 +445,17 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // If the read distance is < 30bp while the SV is > 2kb, then this is a
             // potential deletion
             if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-                SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size);
-                addSVCall(chr_sv_calls, sv_candidate);
+                // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size);
+                // addSVCall(chr_sv_calls, sv_candidate);
 
                 // Add an inversion call if necessary
                 if (inversion) {
+                    // printMessage("[TEST] Found inversion at " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + ", length=" + std::to_string(sv_length));
                     SVCall sv_candidate(sv_start, sv_end, SVType::INV, "<INV>", "SPLITINV", "./.", 0.0, 0, 0, cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
+                } else {
+                    SVCall sv_candidate(sv_start, sv_end, SVType::DEL, "<DEL>", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size);
+                    addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
 
@@ -477,11 +481,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
         // Print the number of merged SV calls
         printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates");
-
-        // Print all SV calls
-        for (const SVCall& sv_call : sv_calls[chr_name]) {
-            printMessage("SV: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", length: " + std::to_string(sv_call.end - sv_call.start + 1) + ", cluster size: " + std::to_string(sv_call.cluster_size) + ", group: " + std::to_string(current_group));
-        }
     }
 }
 
@@ -725,7 +724,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome);
 
     printMessage(chr + ": Merging CIGAR...");
-    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts);
+    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
 
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string");
@@ -733,6 +732,9 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
 
 void SVCaller::run(const InputData& input_data)
 {
+    bool cigar_svs = true;
+    bool split_svs = true;
+
     // Set up the reference genome
     printMessage("Loading the reference genome...");
     const std::string ref_filepath = input_data.getRefGenome();
@@ -795,96 +797,109 @@ void SVCaller::run(const InputData& input_data)
     int current_chr = 0;
     int total_chr_count = chromosomes.size();
 
-    // Use multi-threading across chromosomes. If a single chromosome is
-    // specified, use a single main thread (multi-threading is used for file I/O)
-    int thread_count = 1;
-    if (!input_data.isSingleChr()) {
-        thread_count = input_data.getThreadCount();
-        std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
-    }
-    ThreadPool pool(thread_count);
-    auto process_chr = [&](const std::string& chr) {
-        try {
-            std::vector<SVCall> sv_calls;
-            InputData chr_input_data = input_data;  // Use a thread-local copy
-            this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
-            {
-                std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
-                whole_genome_sv_calls[chr] = std::move(sv_calls);
+    if (cigar_svs) {
+        // Use multi-threading across chromosomes. If a single chromosome is
+        // specified, use a single main thread (multi-threading is used for file I/O)
+        int thread_count = 1;
+        if (!input_data.isSingleChr()) {
+            thread_count = input_data.getThreadCount();
+            std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl;
+        }
+        ThreadPool pool(thread_count);
+        auto process_chr = [&](const std::string& chr) {
+            try {
+                std::vector<SVCall> sv_calls;
+                InputData chr_input_data = input_data;  // Use a thread-local copy
+                this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
+                {
+                    std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
+                    whole_genome_sv_calls[chr] = std::move(sv_calls);
+                }
+            } catch (const std::exception& e) {
+                printError("Error processing chromosome " + chr + ": " + e.what());
+            } catch (...) {
+                printError("Unknown error processing chromosome " + chr);
             }
-        } catch (const std::exception& e) {
-            printError("Error processing chromosome " + chr + ": " + e.what());
-        } catch (...) {
-            printError("Unknown error processing chromosome " + chr);
+        };
+
+        // Submit tasks to the thread pool and track futures
+        std::vector<std::future<void>> futures;
+        for (const auto& chr : chromosomes) {
+            futures.emplace_back(pool.enqueue([&, chr] {
+                // printMessage("Processing chromosome " + chr);
+                process_chr(chr);
+            }));
         }
-    };
 
-    // Submit tasks to the thread pool and track futures
-    std::vector<std::future<void>> futures;
-    for (const auto& chr : chromosomes) {
-        futures.emplace_back(pool.enqueue([&, chr] {
-            // printMessage("Processing chromosome " + chr);
-            process_chr(chr);
-        }));
-    }
-
-    // Wait for all tasks to complete
-    for (auto& future : futures) {
-        try {
+        // Wait for all tasks to complete
+        for (auto& future : futures) {
+            try {
+                current_chr++;
+                future.get();
+            } catch (const std::exception& e) {
+                printError("Error processing chromosome task: " + std::string(e.what()));
+            } catch (...) {
+                printError("Unknown error processing chromosome task.");
+            }
+        }
+        printMessage("All tasks have finished.");
+
+        // -------------------------------------------------------
+        // Run copy number variant predictions on the SVs detected from the
+        // CIGAR string, using a minimum CNV length threshold
+        current_chr = 0;
+        printMessage("Running copy number predictions on CIGAR SVs...");
+        for (auto& entry : whole_genome_sv_calls) {
             current_chr++;
-            future.get();
-        } catch (const std::exception& e) {
-            printError("Error processing chromosome task: " + std::string(e.what()));
-        } catch (...) {
-            printError("Unknown error processing chromosome task.");
+            const std::string& chr = entry.first;
+            std::vector<SVCall>& sv_calls = entry.second;
+            if (sv_calls.size() > 0) {
+                printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
+                cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+            }
         }
+        // -------------------------------------------------------
     }
-    printMessage("All tasks have finished.");
-
-    // -------------------------------------------------------
-    // Run copy number variant predictions on the SVs detected from the
-    // CIGAR string, using a minimum CNV length threshold
-    current_chr = 0;
-    printMessage("Running copy number predictions on CIGAR SVs...");
-    for (auto& entry : whole_genome_sv_calls) {
-        current_chr++;
-        const std::string& chr = entry.first;
-        std::vector<SVCall>& sv_calls = entry.second;
-        if (sv_calls.size() > 0) {
-            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
-            cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+    
+    if (split_svs) {
+        // Identify split-SV signatures
+        printMessage("Identifying split-SV signatures...");
+        std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
+        this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
+
+        printMessage("Running copy number predictions on split-read SVs...");
+        current_chr = 0;
+        for (auto& entry : whole_genome_split_sv_calls) {
+            const std::string& chr = entry.first;
+            std::vector<SVCall>& sv_calls = entry.second;
+
+            if (sv_calls.size() > 0) {
+                current_chr++;
+                printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
+                this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+            }
         }
-    }
-    // -------------------------------------------------------
-
-    // Identify split-SV signatures
-    printMessage("Identifying split-SV signatures...");
-    std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
-    this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
 
-    printMessage("Running copy number predictions on split-read SVs...");
-    current_chr = 0;
-    for (auto& entry : whole_genome_split_sv_calls) {
-        const std::string& chr = entry.first;
-        std::vector<SVCall>& sv_calls = entry.second;
+        printMessage("Merging split-read SVs...");
+        int min_pts = 2;
+        for (auto& entry : whole_genome_split_sv_calls) {
+            const std::string& chr = entry.first;
+            std::vector<SVCall>& sv_calls = entry.second;
+            mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, true);
+        }
 
-        if (sv_calls.size() > 0) {
-            current_chr++;
-            printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
-            this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+        printMessage("Unifying SVs...");
+        for (auto& entry : whole_genome_split_sv_calls) {
+            const std::string& chr = entry.first;
+            std::vector<SVCall>& sv_calls = entry.second;
+            whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
         }
     }
+
     if (input_data.getSaveCNVData()) {
         closeJSON(json_fp);
     }
 
-    printMessage("Unifying SVs...");
-    for (auto& entry : whole_genome_split_sv_calls) {
-        const std::string& chr = entry.first;
-        std::vector<SVCall>& sv_calls = entry.second;
-        whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end());
-    }
-
     // Print the total number of SVs detected for each chromosome
     uint32_t total_sv_count = 0;
     for (const auto& entry : whole_genome_sv_calls) {
@@ -958,7 +973,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         contig_header,
         "##INFO=<ID=END,Number=1,Type=Integer,Description=\"End position of the variant described in this record\">",
         "##INFO=<ID=SVTYPE,Number=1,Type=String,Description=\"Type of structural variant\">",
-        "##INFO=<ID=SVTYPE2,Number=1,Type=String,Description=\"Type of structural variant (if more than one)\">",
         "##INFO=<ID=SVLEN,Number=1,Type=Integer,Description=\"Difference in length between REF and ALT alleles\">",
         "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
         "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Feature used to identify the structural variant\">",
@@ -1029,19 +1043,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 total_count += 1;
             }
 
-            // For complex SVs, split the SV into multiple types (SVTYPE +
-            // SVTYPE2)
-            SVType sv_type2 = SVType::UNKNOWN;
-            if (sv_type == SVType::INV_DEL) {
-                printError("Warning: Inversion and deletion detected at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end));
-                sv_type = SVType::DEL;
-                sv_type2 = SVType::INV;
-            } else if (sv_type == SVType::INV_DUP) {
-                printError("Warning: Inversion and duplication detected at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end));
-                sv_type = SVType::DUP;
-                sv_type2 = SVType::INV;
-            }
-
             // Deletion
             if (sv_type == SVType::DEL) {
                 // Get the deleted sequence from the reference genome, also including the preceding base
@@ -1094,13 +1095,10 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
-            std::string sv_type2_str = ".";
-            if (sv_type2 != SVType::UNKNOWN) {
-                sv_type2_str = getSVTypeString(sv_type2);
-            }
-            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-                ";SVTYPE2=" + sv_type2_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
-                ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
+            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
+            //     ";SVTYPE2=" + sv_type2_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
+            //     ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
                 
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 9934805a..89c2b0c3 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -39,7 +39,7 @@ void concatenateSVCalls(std::vector<SVCall> &target, const std::vector<SVCall>&
     target.insert(target.end(), source.begin(), source.end());
 }
 
-void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
+void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool keep_noise)
 {
     printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
     
@@ -78,9 +78,19 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
 
         dbscan.fit(sv_type_calls);
         const std::vector<int>& clusters = dbscan.getClusters();
-        std::map<int, std::vector<SVCall>> cluster_map;
-        for (size_t i = 0; i < clusters.size(); ++i) {
-            cluster_map[clusters[i]].push_back(sv_type_calls[i]);
+        std::map<int, std::vector<SVCall>> cluster_map;  // Cluster ID to SV calls
+        // Create a map of cluster IDs to SV calls
+        if (sv_type == SVType::INS) {
+            // Add only non-CIGARCLIP SVs to the cluster map
+            for (size_t i = 0; i < clusters.size(); ++i) {
+                if (sv_type_calls[i].data_type != "CIGARCLIP") {
+                    cluster_map[clusters[i]].push_back(sv_type_calls[i]);
+                }
+            }
+        } else {
+            for (size_t i = 0; i < clusters.size(); ++i) {
+                cluster_map[clusters[i]].push_back(sv_type_calls[i]);
+            }
         }
 
         // Merge SVs in each cluster
@@ -89,7 +99,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
 
-
             // [TEST] If insertions, and if any SV has length between 9400 and
             // 9500, print all SV coordinates in the cluster
             bool print_all = false;
@@ -121,8 +130,14 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                 }
             }
 
-            if (cluster_id < 0) {
-                // Add all noise points to the merged list if >10 kb
+            if (cluster_id < 0 && keep_noise) {
+
+                // Add all unclustered points to the merged list
+                for (const auto& sv_call : cluster_sv_calls) {
+                    SVCall noise_sv_call = sv_call;
+                    merged_sv_calls.push_back(noise_sv_call);
+                }
+
                 // for (const auto& sv_call : cluster_sv_calls) {
                 //     if ((sv_call.end - sv_call.start)+1 >= 10000) {
                 //         SVCall noise_sv_call = sv_call;
@@ -131,9 +146,14 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                 //         printMessage("[TEST] Adding noise SV " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
                 //     }
                 // }
-                continue;  // Skip noise and unclassified points
+                // continue;  // Skip noise and unclassified points
             } else {
             // if (true) {
+
+                // ----------------------------
+                // HMM-BASED MERGING
+                // ----------------------------
+                
                 // Check if any SV has a non-zero likelihood
                 bool has_nonzero_likelihood = false;
                 if (cluster_sv_calls.size() > 0) {
@@ -159,14 +179,27 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                     auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
                         return sv_call.hmm_likelihood != 0.0;
                     });
+
+                    // Add SV call
                     merged_sv_call = *it;
+                    merged_sv_calls.push_back(merged_sv_call);
 
                     // [TEST]
-                    if (print_all) {
-                        printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
-                    }
+                    // print_all = true;
+                    // if (print_all) {
+                    //     printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
+                    //     printMessage("SV type: " + getSVTypeString(merged_sv_call.sv_type));
+                    //     printMessage("Cluster members:");
+                    //     for (const auto& sv_call : cluster_sv_calls) {
+                    //         printMessage("  " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
+                    //     }
+                    // }
+
+                // ----------------------------
+                // CIGAR-BASED MERGING
+                // ----------------------------
 
-                } else {
+                } else if (cluster_sv_calls.size() > 1) {  // Could be low if all CIGARCLIP
                     // Use the median length SV of the top 10% of the cluster
                     // (shorter reads are often noise)
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
@@ -181,18 +214,9 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                     size_t median_index = top_10.size() / 2;
                     merged_sv_call = top_10[median_index];
 
-                    // // Get the starting index of the top 10% of the cluster
-                    // // (Cluster is sorted by descending length)
-                    // size_t start_index = std::max(0, (int) (cluster_sv_calls.size() * 0.9));
-
-                    // // Get the top 10% of the cluster
-                    // std::vector<SVCall> top_half(cluster_sv_calls.begin() + start_index, cluster_sv_calls.end());
-
-                    // // Get the median SV for the top 50% of the cluster
-                    // size_t median_index = top_half.size() / 2;
-                    // merged_sv_call = top_half[median_index];
-                    // int median_index = cluster_sv_calls.size() / 2;
-                    // merged_sv_call = cluster_sv_calls[median_index];
+                    // Add SV call
+                    merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
+                    merged_sv_calls.push_back(merged_sv_call);
 
                     // [TEST]
                     if (print_all) {
@@ -200,12 +224,12 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
                     }
                 }
 
-                if (cluster_id < 0) {
-                    merged_sv_call.cluster_size = cluster_id;
-                } else {
-                    merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
-                }
-                merged_sv_calls.push_back(merged_sv_call);
+                // if (cluster_id < 0) {
+                //     merged_sv_call.cluster_size = cluster_id;
+                // } else {
+                //     merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
+                // }
+                // merged_sv_calls.push_back(merged_sv_call);
                 cluster_count++;
             }
         }
@@ -213,6 +237,14 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts)
     }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
 
+    // Print an error if any have CIGARCLIP data type
+    for (const auto& sv_call : sv_calls) {
+        if (sv_call.data_type == "CIGARCLIP") {
+            printError("[ERROR1] Found CIGARCLIP SV in merged SVs");
+            break;
+        }
+    }
+
     int updated_size = sv_calls.size();
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
 }

From c7f7f89e7c21bcd79190aa2fffaaff817d6d2b15 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 28 Mar 2025 16:16:19 -0400
Subject: [PATCH 098/134] fix split read merge and reduce svcall overhead

---
 include/cnv_caller.h         |  47 ++--
 include/sv_caller.h          | 233 +++++++++++--------
 include/sv_object.h          |  17 +-
 include/sv_types.h           |  69 ++++++
 python/plot_distributions.py |  27 +--
 src/cnv_caller.cpp           |  17 +-
 src/sv_caller.cpp            | 423 ++++++++++++++++++++++++++++-------
 src/sv_object.cpp            |  27 +--
 8 files changed, 611 insertions(+), 249 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 609238ef..87a9f011 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -48,6 +48,19 @@ class CNVCaller {
     private:
         std::shared_mutex& shared_mutex;
 
+        void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
+
+        void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
+
+        // Query a region for SNPs and return the SNP data
+        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const;
+
+        // Split a region into chunks for parallel processing
+        std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
+
+    public:
+	    CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {}
+
         // Define a map of CNV genotypes by HMM predicted state.
         // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output.
         // Each of the 6 state predictions corresponds to a copy number state
@@ -59,32 +72,24 @@ class CNVCaller {
         // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1 for homozygous variant)
         // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1)
         // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1)
-        std::map<int, std::string> cnv_genotype_map = {
-            {0, "./."},
-            {1, "1/1"},
-            {2, "0/1"},
-            {3, "0/0"},
-            {4, "1/1"},
-            {5, "0/1"},
-            {6, "1/1"}
+        const std::unordered_map<int, Genotype> StateGenotypeMap = {
+            {0, Genotype::UNKNOWN},
+            {1, Genotype::HOMOZYGOUS_ALT},
+            {2, Genotype::HETEROZYGOUS},
+            {3, Genotype::HOMOZYGOUS_REF},
+            {4, Genotype::HOMOZYGOUS_ALT},
+            {5, Genotype::HETEROZYGOUS},
+            {6, Genotype::HOMOZYGOUS_ALT}
         };
 
-        void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp);
-
-        void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair<std::vector<int>, double>& prediction) const;
-
-        // Query a region for SNPs and return the SNP data
-        void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector<uint32_t>& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const;
-
-        // Split a region into chunks for parallel processing
-        std::vector<std::string> splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const;
-
-    public:
-	    CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {}
+        // Function to get the Genotype enum value for an HMM copy number state
+        inline Genotype getGenotypeFromCNState(int cn_state) const {
+            return StateGenotypeMap.at(cn_state);
+        }
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, std::string, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
+        std::tuple<double, SVType, Genotype, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 6e446fa6..247adf8c 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -17,111 +17,157 @@
 #include <future>
 /// @endcond
 
-struct GenomicRegion {
-    int tid;
-    hts_pos_t start;
-    hts_pos_t end;
-    int query_start;
-    int query_end;
-    bool strand;
-    int cluster_size;  // Number of alignments used for this region
-};
-
-struct PrimaryAlignment {
-    hts_pos_t start;
-    hts_pos_t end;
-    int query_start;
-    int query_end;
-    bool strand;
-    int cluster_size;  // Number of alignments used for this region
-};
-
-struct SuppAlignment {
-    int tid;
-    hts_pos_t start;
-    hts_pos_t end;
-    int query_start;
-    int query_end;
-    bool strand;
-    int cluster_size;  // Number of alignments used for this region
-};
-
-struct SplitSignature {
-    int tid;
-    hts_pos_t start;
-    hts_pos_t end;
-    bool strand;
-    hts_pos_t query_start;
-    hts_pos_t query_end;
-};
-
-// Interval Tree Node
-struct IntervalNode {
-    PrimaryAlignment region;
-    std::string qname;
-    hts_pos_t max_end;  // To optimize queries
-    std::unique_ptr<IntervalNode> left;
-    std::unique_ptr<IntervalNode> right;
-
-    IntervalNode(PrimaryAlignment r, std::string name)
-        : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
-};
-
-void insert(std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& region, std::string qname) {
-    if (!root) {
-        root = std::make_unique<IntervalNode>(region, qname);
-        return;
-    }
-
-    if (region.start < root->region.start)
-    {
-        insert(root->left, region, qname);
-    } else {
-        insert(root->right, region, qname);
-    }
-
-    // Update max_end
-    root->max_end = std::max(root->max_end, region.end);
-}
-
-void findOverlaps(const std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& query, std::vector<std::string>& result) {
-    if (!root) return;
-
-    // If overlapping, add to result
-    if (query.start <= root->region.end && query.end >= root->region.start)
-        result.push_back(root->qname);
-
-    // If left subtree may have overlaps, search left
-    if (root->left && root->left->max_end >= query.start)
-        findOverlaps(root->left, query, result);
-
-    // Always check the right subtree
-    findOverlaps(root->right, query, result);
-}
-
-struct MismatchData {
-    uint32_t query_start;
-    uint32_t query_end;
-    std::vector<int> match_map;
-};
+// struct GenomicRegion {
+//     int tid;
+//     hts_pos_t start;
+//     hts_pos_t end;
+//     int query_start;
+//     int query_end;
+//     bool strand;
+//     int cluster_size;  // Number of alignments used for this region
+// };
+
+// struct PrimaryAlignment {
+//     hts_pos_t start;
+//     hts_pos_t end;
+//     int query_start;
+//     int query_end;
+//     bool strand;
+//     int cluster_size;  // Number of alignments used for this region
+// };
+
+// struct SuppAlignment {
+//     int tid;
+//     hts_pos_t start;
+//     hts_pos_t end;
+//     int query_start;
+//     int query_end;
+//     bool strand;
+//     int cluster_size;  // Number of alignments used for this region
+// };
+
+// struct SplitSignature {
+//     int tid;
+//     hts_pos_t start;
+//     hts_pos_t end;
+//     bool strand;
+//     hts_pos_t query_start;
+//     hts_pos_t query_end;
+// };
+
+// // Interval Tree Node
+// struct IntervalNode {
+//     PrimaryAlignment region;
+//     std::string qname;
+//     hts_pos_t max_end;  // To optimize queries
+//     std::unique_ptr<IntervalNode> left;
+//     std::unique_ptr<IntervalNode> right;
+
+//     IntervalNode(PrimaryAlignment r, std::string name)
+//         : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
+// };
+
+// void insert(std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& region, std::string qname) {
+//     if (!root) {
+//         root = std::make_unique<IntervalNode>(region, qname);
+//         return;
+//     }
+
+//     if (region.start < root->region.start)
+//     {
+//         insert(root->left, region, qname);
+//     } else {
+//         insert(root->right, region, qname);
+//     }
+
+//     // Update max_end
+//     root->max_end = std::max(root->max_end, region.end);
+// }
+
+// void findOverlaps(const std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& query, std::vector<std::string>& result) {
+//     if (!root) return;
+
+//     // If overlapping, add to result
+//     if (query.start <= root->region.end && query.end >= root->region.start)
+//         result.push_back(root->qname);
+
+//     // If left subtree may have overlaps, search left
+//     if (root->left && root->left->max_end >= query.start)
+//         findOverlaps(root->left, query, result);
+
+//     // Always check the right subtree
+//     findOverlaps(root->right, query, result);
+// }
 
 class SVCaller {
     private:
+        struct GenomicRegion {
+            int tid;
+            hts_pos_t start;
+            hts_pos_t end;
+            int query_start;
+            int query_end;
+            bool strand;
+            int cluster_size;  // Number of alignments used for this region
+        };
+
+        struct PrimaryAlignment {
+            hts_pos_t start;
+            hts_pos_t end;
+            int query_start;
+            int query_end;
+            bool strand;
+            int cluster_size;  // Number of alignments used for this region
+        };
+
+        struct SuppAlignment {
+            int tid;
+            hts_pos_t start;
+            hts_pos_t end;
+            int query_start;
+            int query_end;
+            bool strand;
+            int cluster_size;  // Number of alignments used for this region
+        };
+
+        struct SplitSignature {
+            int tid;
+            hts_pos_t start;
+            hts_pos_t end;
+            bool strand;
+            hts_pos_t query_start;
+            hts_pos_t query_end;
+        };
+
+        // Interval Tree Node
+        struct IntervalNode {
+            PrimaryAlignment region;
+            std::string qname;
+            hts_pos_t max_end;  // To optimize queries
+            std::unique_ptr<IntervalNode> left;
+            std::unique_ptr<IntervalNode> right;
+
+            IntervalNode(PrimaryAlignment r, std::string name)
+                : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
+        };
+
         int min_mapq = 20;          // Minimum mapping quality to be considered
         mutable std::shared_mutex shared_mutex;  // Shared mutex for thread safety
 
         std::vector<std::string> getChromosomes(const std::string& bam_filepath);
 
-        void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data);
+        void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, const ReferenceGenome& ref_genome);
 
         // Process a single CIGAR record and find candidate SVs
-        void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, bool is_primary, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
+        void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map<std::string, double>& read_mismatch_rates);
 
         std::pair<int, int> getAlignmentReadPositions(bam1_t* alignment);
 
-        void processChromosome(const std::string& chr, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov);
+        void processChromosome(const std::string& chr, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::unordered_map<std::string, double>& read_mismatch_rates);
+
+        void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map<std::string, double>& read_mismatch_rates);
 
-        void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome);
+        double getReadMismatchRate(bam1_t * alignment, const std::string& chr, const ReferenceGenome & ref_genome);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
@@ -138,6 +184,11 @@ class SVCaller {
 
         // Detect SVs and predict SV type from long read alignments and CNV calls
         void run(const InputData& input_data);
+
+        // Interval tree
+        void findOverlaps(const std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& query, std::vector<std::string>& result);
+
+        void insert(std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& region, std::string qname);
 };
 
 #endif // SV_CALLER_H
diff --git a/include/sv_object.h b/include/sv_object.h
index 155b50bf..08fe8f70 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -7,27 +7,30 @@
 #include <set>
 #include <stdexcept>
 #include <unordered_map>
+#include <string_view>
 
 #include "sv_types.h"
 
 using namespace sv_types;
 
 struct SVCall {
-    uint32_t start;
-    uint32_t end;
+    uint32_t start = 0;
+    uint32_t end = 0;
     SVType sv_type = SVType::UNKNOWN;
     std::string alt_allele = ".";
-    std::string data_type = "NA";
-    std::string genotype = "./.";
+    SVDataType data_type = SVDataType::UNKNOWN;
+    Genotype genotype = Genotype::UNKNOWN;
     double hmm_likelihood = 0.0;
     int read_depth = 0;  // Breakpoint depth
-    int support = 0;  // Number of supporting reads
+    double mismatch_rate = 0.0;  // Highest mismatch rate in reads used for the SV call
     int cluster_size = 0;  // Number of SV calls in the cluster
 
     bool operator<(const SVCall& other) const;
 
-    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {}
+    SVCall() = default;
+
+    SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {}
 };
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
diff --git a/include/sv_types.h b/include/sv_types.h
index 26415935..58f6063b 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -39,6 +39,60 @@ namespace sv_types {
         {SVType::COMPLEX, "COMPLEX"}
     };
 
+    // Mapping of SV types to symbols
+    const std::unordered_map<SVType, std::string> SVTypeSymbol = {
+        {SVType::UNKNOWN, "."},
+        {SVType::DEL, "<DEL>"},
+        {SVType::DUP, "<DUP>"},
+        {SVType::INV, "<INV>"},
+        {SVType::INS, "<INS>"},
+        {SVType::BND, "<BND>"},
+    };
+
+    // Define constants for genotypes
+    enum class Genotype {
+        HOMOZYGOUS_REF = 0,
+        HETEROZYGOUS = 1,
+        HOMOZYGOUS_ALT = 2,
+        UNKNOWN = 3
+    };
+
+    // Mapping of genotypes to strings
+    const std::unordered_map<Genotype, std::string> GenotypeString = {
+        {Genotype::HOMOZYGOUS_REF, "0/0"},
+        {Genotype::HETEROZYGOUS, "0/1"},
+        {Genotype::HOMOZYGOUS_ALT, "1/1"},
+        {Genotype::UNKNOWN, "./."}
+    };
+
+    // Define constants for SV data types (evidence types)
+    enum class SVDataType {
+        CIGARINS = 0,
+        CIGARDEL = 1,
+        CIGARCLIP = 2,
+        SPLIT = 3,
+        SPLITDIST1 = 4,
+        SPLITDIST2 = 5,
+        SPLITINV = 6,
+        SUPPINV = 7,
+        HMM = 8,
+        UNKNOWN = 9
+    };
+
+    // Mapping of SV data types to strings
+    const std::unordered_map<SVDataType, std::string> SVDataTypeString = {
+        {SVDataType::CIGARINS, "CIGARINS"},
+        {SVDataType::CIGARDEL, "CIGARDEL"},
+        {SVDataType::CIGARCLIP, "CIGARCLIP"},
+        {SVDataType::SPLIT, "SPLIT"},
+        {SVDataType::SPLITDIST1, "SPLITDIST1"},
+        {SVDataType::SPLITDIST2, "SPLITDIST2"},
+        {SVDataType::SPLITINV, "SPLITINV"},
+        {SVDataType::SUPPINV, "SUPPINV"},
+        {SVDataType::HMM, "HMM"},
+        {SVDataType::UNKNOWN, "UNKNOWN"}
+    };
+
     // Mapping of 6 copy number states to SV types
     const std::unordered_map<int, SVType> CNVTypeMap = {
         {0, SVType::UNKNOWN},
@@ -60,6 +114,21 @@ namespace sv_types {
         return CNVTypeMap.at(cn_state);
     }
 
+    // Function to get the genotype string
+    inline std::string getGenotypeString(Genotype genotype) {
+        return GenotypeString.at(genotype);
+    }
+
+    // Function to get the SV data type string
+    inline std::string getSVDataTypeString(SVDataType data_type) {
+        return SVDataTypeString.at(data_type);
+    }
+
+    // Function to get the SV type symbol
+    inline std::string getSVTypeSymbol(SVType sv_type) {
+        return SVTypeSymbol.at(sv_type);
+    }
+
     // Function to check if an SV type is a valid update from copy number predictions
     inline bool isValidCopyNumberUpdate(SVType sv_type, SVType updated_sv_type) {
         if (updated_sv_type == SVType::UNKNOWN) {
diff --git a/python/plot_distributions.py b/python/plot_distributions.py
index 37eb1638..c2644a8a 100644
--- a/python/plot_distributions.py
+++ b/python/plot_distributions.py
@@ -89,8 +89,6 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
 
         # If the plot title is GIAB, then we need to convert INS to DUP if
         # INFO/SVTYPE is INS and INFO/REPTYPE is DUP
-        # if plot_title == "GIAB" and sv_type == "INS":
-        # Check if GIAB is a substring of the plot title
         if "GIAB" in plot_title and sv_type == "INS":
             if 'REPTYPE=DUP' in record['INFO']:
                 sv_type = "DUP"
@@ -110,7 +108,6 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
 
     # Create a dictionary of SV types and their corresponding colors.
     # From: https://davidmathlogic.com/colorblind/
-    # sv_colors = {'DEL': '#D81B60', 'DUP': '#1E88E5', 'INV': '#FFC107', 'INS': '#004D40'}
     # WONG colors
     sv_colors = {'DEL': '#E69F00', 'DUP': '#56B4E9', 'INV': '#009E73', 'INS': '#F0E442', 'INVDUP': '#D55E00', 'COMPLEX': '#CC79A7'}
 
@@ -163,16 +160,16 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
         # Use a log scale for the y-axis
         axes[i].set_yscale('log')
 
-        # # In the same axis, plot a known duplication if within the range of the plot
-        if sv_type == 'DUP':
-            print("TEST: Found DUP")
-            cnv_size = 776237 / size_scale
-            x_min, x_max = axes[i].get_xlim()
-            if cnv_size > x_min and cnv_size < x_max:
-                axes[i].axvline(x=cnv_size, color='black', linestyle='--')
-            else:
-                # Print the values
-                print(f'CNV size: {cnv_size}, x_min: {x_min}, x_max: {x_max}')
+        # In the same axis, plot a known duplication if within the range of the plot
+        # if sv_type == 'DUP':
+        #     print("TEST: Found DUP")
+        #     cnv_size = 776237 / size_scale
+        #     x_min, x_max = axes[i].get_xlim()
+        #     if cnv_size > x_min and cnv_size < x_max:
+        #         axes[i].axvline(x=cnv_size, color='black', linestyle='--')
+        #     else:
+        #         # Print the values
+        #         print(f'CNV size: {cnv_size}, x_min: {x_min}, x_max: {x_max}')
 
         # Refresh the plot
         plt.draw()
@@ -216,9 +213,9 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"):
     fig.update_layout(legend=dict(
         orientation='v',
         yanchor='top',
-        y=0.75,
+        y=0.9,
         xanchor='right',
-        x=0.75,
+        x=0.9,
     ))
     # # Move the legend to the bottom right outside the plot
     # fig.update_layout(legend=dict(
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 26bca566..c293d7fb 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -155,13 +155,13 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp_hmm);
 }
 
-std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
+std::tuple<double, SVType, Genotype, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Check that the start position is less than the end position
     if (start_pos > end_pos)
     {
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
-        return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false);
     }
 
     // Run the Viterbi algorithm on SNPs in the SV region
@@ -197,7 +197,7 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     runViterbi(hmm, snp_data, prediction);
     if (prediction.first.size() == 0)
     {
-        return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false);
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false);
     }
 
     std::vector<int>& state_sequence = prediction.first;
@@ -233,12 +233,13 @@ std::tuple<double, SVType, std::string, bool> CNVCaller::runCopyNumberPrediction
     
     // Update SV type and genotype based on the majority state
     SVType predicted_cnv_type = SVType::UNKNOWN;
-    std::string genotype = "./.";
+    Genotype genotype = Genotype::UNKNOWN;
     int state_count = (int) sv_states.size();
     if ((double) max_count / (double) state_count > pct_threshold)
     {
         predicted_cnv_type = getSVTypeFromCNState(max_state);
-        genotype = cnv_genotype_map.at(max_state);
+        // genotype = cnv_genotype_map.at(max_state);
+        genotype = getGenotypeFromCNState(max_state);
     }
     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
 
@@ -360,18 +361,17 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         {
             max_state = 0;
         }
+        Genotype genotype = getGenotypeFromCNState(max_state);
 
         // Update the SV information if it does not conflict with the current SV type
         SVType updated_sv_type = getSVTypeFromCNState(max_state);
         bool is_valid_update = isValidCopyNumberUpdate(sv_call.sv_type, updated_sv_type);
         if (is_valid_update)
         {
-            std::string genotype = cnv_genotype_map.at(max_state);
-            std::string data_type = "CIGAR+HMM";
             sv_call.sv_type = updated_sv_type;
             sv_call.hmm_likelihood = likelihood;
             sv_call.genotype = genotype;
-            sv_call.data_type = data_type;
+            sv_call.data_type = SVDataType::HMM;
         }
     }
 }
@@ -1026,7 +1026,6 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN
         }
         json_file << "]\n";
     json_file << "  }\n";
-    // json_file << "},\n";
     json_file.close();
     printMessage("Saved copy number predictions for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + " to " + filepath);
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 9969342f..eac69390 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -63,7 +63,7 @@ std::vector<std::string> SVCaller::getChromosomes(const std::string &bam_filepat
     return chromosomes;
 }
 
-void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data)
+void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, const ReferenceGenome& ref_genome)
 {
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -133,6 +133,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     uint32_t num_alignments = 0;
     std::unordered_set<int> alignment_tids;  // All unique chromosome IDs
     std::unordered_set<std::string> supp_qnames;  // All unique query names
+    std::unordered_map<std::string, double> read_mismatch_rates;  // Query name -> mismatch rate
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
@@ -154,6 +155,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
         // Process supplementary alignments
         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
+            // Get the mismatch rate for the read
+            const std::string supp_chr = bamHdr->target_name[bam1->core.tid];
+            double mismatch_rate = getReadMismatchRate(bam1, supp_chr, ref_genome);
+            read_mismatch_rates[qname] = mismatch_rate;
+            // printMessage("[TEST] Mismatch rate for " + qname + ": " + std::to_string(mismatch_rate) + " at position " + std::to_string(bam1->core.pos + 1) + std::string(bam_endpos(bam1) > 0 ? "-" + std::to_string(bam_endpos(bam1)) : ""));
+
             // Store chromosome (TID), start, and end positions (1-based) of the
             // supplementary alignment, and the strand (true for forward, false
             // for reverse)
@@ -260,11 +267,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             std::vector<int> starts;
             std::vector<int> ends;
             std::vector<bool> primary_strands;
+            double min_mismatch_rate = 1.0;
             for (const std::string& qname : primary_cluster) {
                 const PrimaryAlignment& primary_aln = chr_primary_map.at(qname);
                 starts.push_back(primary_aln.start);
                 ends.push_back(primary_aln.end);
                 primary_strands.push_back(primary_aln.strand);
+                min_mismatch_rate = std::min(min_mismatch_rate, read_mismatch_rates[qname]);
+                // printMessage("[TEST-SPLIT] Mismatch rate for " + qname + ": " + std::to_string(read_mismatch_rates[qname]));
             }
 
             // Get the largest cluster of primary alignment start positions
@@ -309,9 +319,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                                 distance = primary_aln.query_start - supp_aln.query_end;
                             }
                             split_distances.push_back(distance);
-                        } else {
-                            // TODO: INVERSIONS                       
                         }
+
                     } else {
                         // TODO: TRANSLOCATIONS
                     }
@@ -373,7 +382,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // If two positions were found, use the 5'most position
                         primary_pos = std::min(primary_pos, primary_pos2);
                     }
-                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITDIST1", "./.", 0.0, 0, 0, primary_cluster_size);
+                    int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos);
+                    // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITDIST1", "./.", 0.0, read_depth, min_mismatch_rate, primary_cluster_size);
+                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, primary_cluster_size);
+                    
+                    // Print if end position = 162908547
+                    if (primary_pos + (read_distance - 1) == 162908547) {
+                        printMessage("[TEST] Adding insertion SV candidate at " + chr_name + ":" + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance - 1)) + " with length " + std::to_string(read_distance));
+                    }
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -411,7 +427,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // Store the inversion as the supplementary start and end positions
             if (supp_best_start != -1 && supp_best_end != -1) {
                 if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
-                    SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "<INV>", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size);
+                    int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end));
+
+                    // Print if end position = 162908547
+                    if (std::max(supp_best_start, supp_best_end) == 162908547) {
+                        printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(std::min(supp_best_start, supp_best_end)) + "-" + std::to_string(std::max(supp_best_start, supp_best_end)) + " with length " + std::to_string(std::abs(supp_best_start - supp_best_end)));
+                    }
+
+                    SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -445,16 +468,24 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // If the read distance is < 30bp while the SV is > 2kb, then this is a
             // potential deletion
             if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-                // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size);
-                // addSVCall(chr_sv_calls, sv_candidate);
 
                 // Add an inversion call if necessary
+                int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
                 if (inversion) {
-                    // printMessage("[TEST] Found inversion at " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + ", length=" + std::to_string(sv_length));
-                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, "<INV>", "SPLITINV", "./.", 0.0, 0, 0, cluster_size);
+                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
+
+                    // Print if end position = 162908547
+                    if (sv_end == 162908547) {
+                        printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length));
+                    }
                 } else {
-                    SVCall sv_candidate(sv_start, sv_end, SVType::DEL, "<DEL>", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size);
+                    SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size);
+                    
+                    // Print if end position = 162908547
+                    if (sv_end == 162908547) {
+                        printMessage("[TEST] Adding deletion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and mismatch rate " + std::to_string(min_mismatch_rate));
+                    }
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -463,7 +494,13 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             else if (sv_length >= min_length && sv_length <= max_length) {
                 SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
                 std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
-                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "SPLIT", "./.", 0.0, 0, 0, cluster_size);
+                int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
+
+                // Print if end position = 162908547
+                if (sv_end == 162908547) {
+                    printMessage("[TEST] Adding CNV SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and type " + getSVTypeSymbol(sv_type) + " and mismatch rate " + std::to_string(min_mismatch_rate));
+                }
+                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
             }
         }
@@ -477,6 +514,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         
         // Merge duplicate SV calls with identical start positions
         mergeDuplicateSVs(chr_sv_calls);
+
+        printMessage("Merged SVs:");
+        for (const auto& sv : chr_sv_calls) {
+            printMessage(" - " + getSVTypeSymbol(sv.sv_type) + " at " + chr_name + ":" + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " with length " + std::to_string(sv.end - sv.start + 1) + " and cluster size " + std::to_string(sv.cluster_size));
+        }
+
         sv_calls[chr_name] = std::move(chr_sv_calls);
 
         // Print the number of merged SV calls
@@ -484,7 +527,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     }
 }
 
-void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome)
+void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map<std::string, double>& read_mismatch_rates)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -502,14 +545,15 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c
     // Main loop to process the alignments
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
-        // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
-        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) {
+        // Skip secondary, supplementary, and unmapped alignments, duplicates,
+        // QC failures, and alignments with low mapping quality
+        if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq || bam1->core.flag & BAM_FSUPPLEMENTARY) {
             continue;
         }
 
         // Process the alignment
-        bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
-        this->processCIGARRecord(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome);
+        // bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
+        this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map, ref_genome, read_mismatch_rates);
     }
 
     // Clean up the iterator and alignment
@@ -517,10 +561,65 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c
     bam_destroy1(bam1);
 }
 
-void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, bool is_primary, const std::vector<uint32_t> &pos_depth_map, const ReferenceGenome &ref_genome)
+// Fraction of aligned (M/=/X) read bases that differ from the reference; 0.0 if none were compared.
+// NOTE(review): the base comparison is case-sensitive - confirm the reference is stored upper-case.
+double SVCaller::getReadMismatchRate(bam1_t *alignment, const std::string& chr, const ReferenceGenome & ref_genome)
+{
+    const uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
+    const uint8_t* read_seq = bam_get_seq(alignment);  // Packed read bases, fetched once instead of per base
+    const int cigar_len = alignment->core.n_cigar;
+    const uint32_t aln_start = static_cast<uint32_t>(alignment->core.pos);  // Leftmost reference position (0-based)
+    const uint32_t end = static_cast<uint32_t>(bam_endpos(alignment)) - 1;  // Rightmost reference position (0-based)
+    uint32_t pos = aln_start;
+    uint32_t query_pos = 0;
+
+    // Reference bases spanned by the alignment
+    const std::string_view ref_seq = ref_genome.query(chr, aln_start + 1, end + 1);
+
+    // Walk the CIGAR and count matching/mismatching aligned bases
+    int match_count = 0;
+    int mismatch_count = 0;
+    for (int i = 0; i < cigar_len; i++) {
+        const uint32_t op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
+        const int op = bam_cigar_op(cigar[i]);  // CIGAR operation
+        const bool aligned = (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF);
+        if (aligned) {
+            for (uint32_t j = 0; j < op_len; j++) {
+                // Stop rather than index past a reference slice shorter than the alignment
+                const size_t ref_idx = static_cast<size_t>(pos - aln_start) + j;
+                if (ref_idx >= ref_seq.size()) break;
+                if (seq_nt16_str[bam_seqi(read_seq, query_pos + j)] == ref_seq[ref_idx]) {
+                    match_count++;
+                } else {
+                    mismatch_count++;
+                }
+            }
+        }
+        // Operations that advance the reference position (SAM spec): M, D, N, =, X
+        if (aligned || op == BAM_CDEL || op == BAM_CREF_SKIP) {
+            pos += op_len;
+        }
+        // Operations that advance the query position (SAM spec): M, I, S, =, X
+        if (aligned || op == BAM_CINS || op == BAM_CSOFT_CLIP) {
+            query_pos += op_len;
+        }
+    }
+
+    // Avoid dividing by zero when no aligned base was compared
+    const int compared = match_count + mismatch_count;
+    return compared > 0 ? static_cast<double>(mismatch_count) / static_cast<double>(compared) : 0.0;
+}
+
+void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, const std::vector<uint32_t> &pos_depth_map, const ReferenceGenome &ref_genome, std::unordered_map<std::string, double> &read_mismatch_rates)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
-    uint32_t pos = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
+    uint32_t aln_start = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
+    uint32_t pos = aln_start;
+    uint32_t end = (uint32_t)bam_endpos(alignment) - 1;  // Rightmost position of the alignment in the reference genome (0-based)
+
+    // Get the reference sequence (used for mismatch rate)
+    std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1);
+
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
     uint32_t query_pos = 0;
@@ -537,13 +636,17 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
         amb_bases_bitset.set(std::tolower(base));
     }
 
+    int match_count = 0;
+    int mismatch_count = 0;
+    std::vector<SVCall> cigar_sv_calls;
+    cigar_sv_calls.reserve(1000);
     for (int i = 0; i < cigar_len; i++) {
         int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
         int op = bam_cigar_op(cigar[i]);  // CIGAR operation
         if (op_len >= 50) {
             
             // Process the CIGAR operation
-            if (op == BAM_CINS && is_primary) {
+            if (op == BAM_CINS) {
 
                 // Get the sequence of the insertion from the query
                 std::string ins_seq_str(op_len, ' ');
@@ -557,51 +660,57 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                     }
                 }
                 
-                // Before the insertion
-                if (pos >= (uint32_t)op_len-1)
-                {
-                    uint32_t bp1 = pos - (op_len - 1) + 1;
-                    uint32_t bp2 = bp1 + op_len - 1; //pos + 1;
-
-                    if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
-                    {
-                        int read_depth = this->getReadDepth(pos_depth_map, bp1);
-                        SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, 1, 0);
-                        addSVCall(sv_calls, sv_call);
-                        continue;
-                    }
-                }
-
-                // After the insertion
-                if (pos + op_len < ref_genome.getChromosomeLength(chr))
-                {
-                    uint32_t bp1 = pos + 1;
-                    uint32_t bp2 = bp1 + op_len - 1;
-
-                    if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
-                    {
-                        int read_depth = this->getReadDepth(pos_depth_map, bp1);
-                        SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, 1, 0);
-                        addSVCall(sv_calls, sv_call);
-                        continue;
-                    }
-                }
+                // // Before the insertion
+                // if (pos >= (uint32_t)op_len-1)
+                // {
+                //     uint32_t bp1 = pos - (op_len - 1) + 1;
+                //     uint32_t bp2 = bp1 + op_len - 1; //pos + 1;
+
+                //     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
+                //     {
+                //         int read_depth = this->getReadDepth(pos_depth_map, bp1);
+                //         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, 1, 0);
+                //         // addSVCall(sv_calls, sv_call);
+                //         addSVCall(cigar_sv_calls, sv_call);
+                //         continue;
+                //     }
+                // }
+
+                // // After the insertion
+                // if (pos + op_len < ref_genome.getChromosomeLength(chr))
+                // {
+                //     uint32_t bp1 = pos + 1;
+                //     uint32_t bp2 = bp1 + op_len - 1;
+
+                //     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
+                //     {
+                //         int read_depth = this->getReadDepth(pos_depth_map, bp1);
+                //         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, 1, 0);
+                //         // addSVCall(sv_calls, sv_call);
+                //         addSVCall(cigar_sv_calls, sv_call);
+                //         continue;
+                //     }
+                // }
 
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1);
+                int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
                 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0);
-                addSVCall(sv_calls, sv_call);
+                // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele,
+                // "CIGARINS", "./.", default_lh, read_depth, 1, 0);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, read_depth, 1, 0);
+                // addSVCall(sv_calls, sv_call);
+                // addSVCall(cigar_sv_calls, sv_call);
+                cigar_sv_calls.emplace_back(sv_call);
             
             // Process clipped bases as potential insertions
-            } else if (op == BAM_CSOFT_CLIP && is_primary) {
+            } else if (op == BAM_CSOFT_CLIP) {
                 // Get the sequence of the insertion from the query
                 std::string ins_seq_str(op_len, ' ');
                 for (int j = 0; j < op_len; j++) {
@@ -617,25 +726,51 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1);
+                int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
                 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARCLIP", "./.", default_lh, read_depth, 1, 0);
-                addSVCall(sv_calls, sv_call);
+                // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele,
+                // "CIGARCLIP", "./.", default_lh, read_depth, 0.0, 0);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, read_depth, 0.0, 0);
+                // addSVCall(sv_calls, sv_call);
+                cigar_sv_calls.emplace_back(sv_call);  // Commented for testing
+                // printMessage("Completed adding SV: " + std::to_string(ins_pos) + "-" + std::to_string(ins_end) + " " + alt_allele + ", RD=" + std::to_string(read_depth) + ", data type=" + sv_call.data_type);
 
             // Check if the CIGAR operation is a deletion
-            } else if (op == BAM_CDEL && is_primary) {
+            } else if (op == BAM_CDEL) {
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
                 int read_depth = this->getReadDepth(pos_depth_map, ref_pos);
-                SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>", "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
-                addSVCall(sv_calls, sv_call);
+                // SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>",
+                // "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
+                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, read_depth, 1, 0);
+                // addSVCall(sv_calls, sv_call);
+                // addSVCall(cigar_sv_calls, sv_call);
+                cigar_sv_calls.emplace_back(sv_call);
             }
+            
+            // For matches, calculate the sequence identity
+            // } else if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
+            //     if (ref_seq.size() < static_cast<size_t>(op_len)) {
+            //         printError("ERROR: reference sequence length is less than the CIGAR operation length");
+            //         continue;
+            //     }
+
+            //     // printMessage("Calculating sequence identity for matches");
+            //     for (int j = 0; j < op_len; j++) {
+            //         char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
+            //         if (base == ref_seq[pos - aln_start + j]) {
+            //             match_count++;
+            //         } else {
+            //             mismatch_count++;
+            //         }
+            //     }
+            // }
         }
 
         // Update the reference position
@@ -649,6 +784,27 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
             query_pos += op_len;
         }
     }
+
+    // Get the read name
+    // std::string read_name = bam_get_qname(alignment);
+
+    // If read name starts with c08844a5 then print the read name and the number
+    // of matches and mismatches
+    // if (read_name.find("c08844a5") != std::string::npos) {
+    //     printMessage(read_name + ": matches=" + std::to_string(match_count) + ", mismatches=" + std::to_string(mismatch_count) + ", mismatches/length=" + std::to_string((double)mismatch_count / (double)(match_count + mismatch_count)));
+    // }
+    // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count);
+    // if (mismatch_rate > 0) {
+    //     printMessage("Read name: " + read_name + ", mismatch rate: " + std::to_string(mismatch_rate) + ", matches: " + std::to_string(match_count) + ", mismatches: " + std::to_string(mismatch_count));
+    // }
+    // read_mismatch_rates[read_name] = mismatch_rate;
+    // printMessage("Completed processing read: " + read_name);
+
+    // Add the SV calls collected from this read (the mismatch-rate assignment is currently disabled)
+    for (SVCall& sv_call : cigar_sv_calls) {
+        // sv_call.mismatch_rate = mismatch_rate;
+        addSVCall(sv_calls, sv_call);
+    }
 }
 
 std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
@@ -680,7 +836,7 @@ std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
     return std::make_pair(query_start, query_end);
 }
 
-void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov)
+void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::unordered_map<std::string, double>& read_mismatch_rates)
 {
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -721,7 +877,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     // -----------------------------------------------------------------------
     // Detect SVs from the CIGAR strings
     printMessage(chr + ": CIGAR SVs...");
-    this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome);
+    this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome, read_mismatch_rates);
 
     printMessage(chr + ": Merging CIGAR...");
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
@@ -794,6 +950,7 @@ void SVCaller::run(const InputData& input_data)
         chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
     }
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
+    std::unordered_map<std::string, double> read_mismatch_rates;
     int current_chr = 0;
     int total_chr_count = chromosomes.size();
 
@@ -810,10 +967,18 @@ void SVCaller::run(const InputData& input_data)
             try {
                 std::vector<SVCall> sv_calls;
                 InputData chr_input_data = input_data;  // Use a thread-local copy
-                this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
+                std::unordered_map<std::string, double> chr_read_mismatch_rates;
+                this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], chr_read_mismatch_rates);
                 {
                     std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
+                    
+                    // Update the SV calls for the chromosome
                     whole_genome_sv_calls[chr] = std::move(sv_calls);
+
+                    // Update the mismatch rates for each read name
+                    for (const auto& entry : chr_read_mismatch_rates) {
+                        read_mismatch_rates[entry.first] = entry.second;
+                    }
                 }
             } catch (const std::exception& e) {
                 printError("Error processing chromosome " + chr + ": " + e.what());
@@ -865,7 +1030,7 @@ void SVCaller::run(const InputData& input_data)
         // Identify split-SV signatures
         printMessage("Identifying split-SV signatures...");
         std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
-        this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
+        this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data, chr_pos_depth_map, ref_genome);
 
         printMessage("Running copy number predictions on split-read SVs...");
         current_chr = 0;
@@ -883,7 +1048,6 @@ void SVCaller::run(const InputData& input_data)
         printMessage("Merging split-read SVs...");
         int min_pts = 2;
         for (auto& entry : whole_genome_split_sv_calls) {
-            const std::string& chr = entry.first;
             std::vector<SVCall>& sv_calls = entry.second;
             mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, true);
         }
@@ -916,15 +1080,49 @@ void SVCaller::run(const InputData& input_data)
     this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome);
 }
 
+// Pre-order search appending the qname of every node whose region overlaps [query.start, query.end].
+void SVCaller::findOverlaps(const std::unique_ptr<IntervalNode> &root, const PrimaryAlignment &query, std::vector<std::string> &result)
+{
+    if (!root) return;
+    // Closed-interval overlap test against this node
+    if (query.start <= root->region.end && query.end >= root->region.start)
+        result.push_back(root->qname);
+
+    // Descend left only if max_end says something there may end at or after query.start
+    if (root->left && root->left->max_end >= query.start)
+        findOverlaps(root->left, query, result);
+
+    // insert() routes starts >= this node's start to the right, so skip it when the query ends earlier
+    if (root->right && query.end >= root->region.start) findOverlaps(root->right, query, result);
+}
+
+// Insert `region`/`qname` into the start-ordered tree at `root`, raising max_end along the descent path.
+// Iterative, so depth on position-sorted input cannot exhaust the stack and qname is copied only once.
+void SVCaller::insert(std::unique_ptr<IntervalNode> &root, const PrimaryAlignment &region, std::string qname)
+{
+    // `slot` always points at the child link the new node would occupy
+    std::unique_ptr<IntervalNode> *slot = &root;
+    while (*slot) {
+        IntervalNode &node = **slot;
+
+        // Every node passed on the way down becomes an ancestor of the new interval
+        node.max_end = std::max(node.max_end, region.end);
+        // Smaller starts go left; equal or larger starts go right
+        slot = (region.start < node.region.start) ? &node.left : &node.right;
+    }
+
+    *slot = std::make_unique<IntervalNode>(region, qname);
+}
+
 // Run copy number predictions on the SVs detected from the split reads
 void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data)
 {
-    std::vector<SVCall> processed_calls;
+    std::vector<SVCall> additional_calls;
     for (auto& sv_candidate : split_sv_calls) {
-        std::tuple<double, SVType, std::string, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
+        std::tuple<double, SVType, Genotype, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
-        std::string genotype = std::get<2>(result);
+        Genotype genotype = std::get<2>(result);
 
         // For inversions with copy-neutral support, update the HMM likelihood
         if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) {
@@ -933,11 +1131,57 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
 
         // Update the SV type if the support is not neutral or unknown
         else if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) {
-            sv_candidate.sv_type = supp_type;
-            sv_candidate.alt_allele = "<" + getSVTypeString(supp_type) + ">";
-            sv_candidate.data_type += "+HMM";  // Update the data type to include HMM
-            sv_candidate.genotype = genotype;
-            sv_candidate.hmm_likelihood = supp_lh;
+            // Update information if the SV call is unknown
+            if (sv_candidate.sv_type == SVType::UNKNOWN) {
+                sv_candidate.sv_type = supp_type;
+                sv_candidate.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
+                sv_candidate.data_type = SVDataType::HMM;
+                sv_candidate.genotype = genotype;
+                sv_candidate.hmm_likelihood = supp_lh;
+
+                // Print if end position = 162908547
+                if (sv_candidate.end == 162908547) {
+                    printMessage("SV at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) +
+                        " updated to type " + getSVTypeSymbol(supp_type) +
+                        " with likelihood " + std::to_string(supp_lh) +
+                        " and genotype " + getGenotypeString(genotype));
+                }
+            // Add an additional SV call if the type is different
+            } else if (sv_candidate.sv_type != supp_type) {
+                SVCall new_sv_call = sv_candidate;  // Copy the original SV call
+                new_sv_call.sv_type = supp_type;
+                new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
+                new_sv_call.data_type = SVDataType::HMM;
+                new_sv_call.genotype = genotype;
+                new_sv_call.hmm_likelihood = supp_lh;
+
+                // Print if end position = 162908547
+                if (new_sv_call.end == 162908547) {
+                    printMessage("Additional SV at " + chr + ":" + std::to_string(new_sv_call.start) + "-" + std::to_string(new_sv_call.end) +
+                        " with type " + getSVTypeSymbol(supp_type) +
+                        " and likelihood " + std::to_string(supp_lh) +
+                        " and genotype " + getGenotypeString(genotype));
+                }
+                additional_calls.push_back(new_sv_call);
+            }
+        }
+    }
+
+    // Add the additional SV calls to the original list, replacing any existing
+    // ones
+    for (auto& new_sv_call : additional_calls) {
+        bool found = false;
+        for (auto& existing_sv_call : split_sv_calls) {
+            if (existing_sv_call.start == new_sv_call.start && existing_sv_call.end == new_sv_call.end &&
+                existing_sv_call.sv_type == new_sv_call.sv_type) {
+                // Update the existing SV call with the new one
+                existing_sv_call = new_sv_call;
+                found = true;
+                break;
+            }
+        }
+        if (!found) {
+            addSVCall(split_sv_calls, new_sv_call);  // Add as a new SV call
         }
     }
 }
@@ -979,6 +1223,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
         "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
         "##INFO=<ID=CLUSTER,Number=1,Type=Integer,Description=\"Cluster size\">",
+        "##INFO=<ID=MISMATCH,Number=1,Type=Float,Description=\"Mismatch rate\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
         "##FILTER=<ID=LowQual,Description=\"Low quality\">",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
@@ -1016,6 +1261,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     std::cout << "Saving SV calls to " << output_vcf << std::endl;
     int total_count = 0;
     int unclassified_svs = 0;
+    int filtered_svs = 0;
     for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
         const std::vector<SVCall>& sv_calls = pair.second;
@@ -1025,19 +1271,30 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             uint32_t start = sv_call.start;
             uint32_t end = sv_call.end;
             SVType sv_type = sv_call.sv_type;
-            std::string genotype = sv_call.genotype;
-            std::string data_type_str = sv_call.data_type;
+            // std::string genotype = sv_call.genotype;
+            // std::string data_type_str = sv_call.data_type;
+            // std::string alt_allele = sv_call.alt_allele;
+            std::string genotype = getGenotypeString(sv_call.genotype);
+            std::string data_type_str = getSVDataTypeString(sv_call.data_type);
             std::string alt_allele = sv_call.alt_allele;
             double hmm_likelihood = sv_call.hmm_likelihood;
             int sv_length = end - start + 1;
             int cluster_size = sv_call.cluster_size;
             int read_depth = sv_call.read_depth;
             std::string ref_allele = ".";
-            int support = sv_call.support;
+            double mismatch_rate = sv_call.mismatch_rate;
+
+            // Default to PASS; mark as LowQual if the mismatch rate is above the threshold
+            std::string filter = "PASS";
+            double max_mismatch_rate = 0.02;
+            if (mismatch_rate > max_mismatch_rate) {
+                filter = "LowQual";
+                filtered_svs += 1;
+            }
 
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
-                unclassified_svs += 1;
+                unclassified_svs += 1; 
                 continue;
             } else {
                 total_count += 1;
@@ -1095,17 +1352,16 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
-            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \
-            //     ";SVTYPE2=" + sv_type2_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \
-            //     ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(support) + ";CLUSTER=" + std::to_string(cluster_size);
-                
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size);                
+            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate);
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
             std::vector<std::string> samples = {sample_str};
 
-            // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES)
-            vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
+            // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL,
+            // FILTER, INFO, FORMAT, SAMPLES)
+            vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << filter << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
+            // vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
         }
     }
     vcf_stream.close();
@@ -1116,6 +1372,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     if (unclassified_svs > 0) {
         std::cout << "Total unclassified SVs: " << unclassified_svs << std::endl;
     }
+    printMessage("Total PASS filtered SVs: " + std::to_string(filtered_svs));
 }
 
 int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start)
@@ -1124,7 +1381,7 @@ int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t
     try {
         read_depth += pos_depth_map.at(start);
     } catch (const std::out_of_range& e) {
-        printError("Error: Start position " + std::to_string(start) + " not found in depth map.");
+        printError("Error: Start position " + std::to_string(start) + " not found in depth map of size " + std::to_string(pos_depth_map.size()) + ". Exception: " + e.what());
     }
 
     return read_depth;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 89c2b0c3..b94b6a78 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -83,7 +83,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         if (sv_type == SVType::INS) {
             // Add only non-CIGARCLIP SVs to the cluster map
             for (size_t i = 0; i < clusters.size(); ++i) {
-                if (sv_type_calls[i].data_type != "CIGARCLIP") {
+                if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) {
                     cluster_map[clusters[i]].push_back(sv_type_calls[i]);
                 }
             }
@@ -138,17 +138,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                     merged_sv_calls.push_back(noise_sv_call);
                 }
 
-                // for (const auto& sv_call : cluster_sv_calls) {
-                //     if ((sv_call.end - sv_call.start)+1 >= 10000) {
-                //         SVCall noise_sv_call = sv_call;
-                //         noise_sv_call.cluster_size = cluster_id;
-                //         merged_sv_calls.push_back(noise_sv_call);
-                //         printMessage("[TEST] Adding noise SV " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
-                //     }
-                // }
-                // continue;  // Skip noise and unclassified points
             } else {
-            // if (true) {
 
                 // ----------------------------
                 // HMM-BASED MERGING
@@ -170,9 +160,9 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 SVCall merged_sv_call = cluster_sv_calls[0];
                 if (has_nonzero_likelihood) {
                     // These are detected from split reads, choose the one with
-                    // the highest non-zero likelihood
+                    // the highest non-zero likelihood normalized by the length of the SV
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                        return a.hmm_likelihood > b.hmm_likelihood;
+                        return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1));
                     });
 
                     // Obtain the highest non-zero likelihood
@@ -236,15 +226,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type));
     }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
-
-    // Print an error if any have CIGARCLIP data type
-    for (const auto& sv_call : sv_calls) {
-        if (sv_call.data_type == "CIGARCLIP") {
-            printError("[ERROR1] Found CIGARCLIP SV in merged SVs");
-            break;
-        }
-    }
-
     int updated_size = sv_calls.size();
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
 }
@@ -272,7 +253,7 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
             // If the likelihoods are equal, keep the one with the larger cluster size
             // This is to ensure that the SV call with more supporting reads is
             // kept
-            else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
+            else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) {
                 combined_sv_calls.back() = sv_call;
             }
         } else {

From b1cf23f0bf349f19f8c50deba51c0beb44f71b6f Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 29 Mar 2025 19:40:58 -0400
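Subject: [PATCH 099/134] remove mismatch filter

Drop the LowQual filter in saveToVCF that was applied when an SV's
mismatch rate exceeded 0.02. Also skip soft-clipped insertion
candidates whose position falls outside the depth map, and reword the
out-of-range read depth message in getReadDepth as a warning.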

---
 src/sv_caller.cpp | 23 ++++++++++++-----------
 src/sv_object.cpp |  7 -------
 2 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index eac69390..7a7a3191 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -383,7 +383,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         primary_pos = std::min(primary_pos, primary_pos2);
                     }
                     int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos);
-                    // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "<INS>", "SPLITDIST1", "./.", 0.0, read_depth, min_mismatch_rate, primary_cluster_size);
                     SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, primary_cluster_size);
                     
                     // Print if end position = 162908547
@@ -711,6 +710,13 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
             
             // Process clipped bases as potential insertions
             } else if (op == BAM_CSOFT_CLIP) {
+                // Soft-clipped bases are considered as potential insertions
+                // Skip if the position exceeds the reference genome length
+                if (pos + 1 >= pos_depth_map.size()) {
+                    // printMessage("Skipping soft-clipped insertion at position " + std::to_string(pos + 1) + " as it exceeds the reference genome length");
+                    continue;
+                }
+
                 // Get the sequence of the insertion from the query
                 std::string ins_seq_str(op_len, ' ');
                 for (int j = 0; j < op_len; j++) {
@@ -722,12 +728,12 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                         ins_seq_str[j] = base;
                     }
                 }
-
+                
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
                 int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
-                
+
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
                 if (op_len <= 50) {
@@ -1283,14 +1289,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             int read_depth = sv_call.read_depth;
             std::string ref_allele = ".";
             double mismatch_rate = sv_call.mismatch_rate;
-
-            // Set PASS filter if mismatch rate is above threshold
             std::string filter = "PASS";
-            double max_mismatch_rate = 0.02;
-            if (mismatch_rate > max_mismatch_rate) {
-                filter = "LowQual";
-                filtered_svs += 1;
-            }
 
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
@@ -1381,7 +1380,9 @@ int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t
     try {
         read_depth += pos_depth_map.at(start);
     } catch (const std::out_of_range& e) {
-        printError("Error: Start position " + std::to_string(start) + " not found in depth map of size " + std::to_string(pos_depth_map.size()) + ". Exception: " + e.what());
+        // Occurs with clipped reads (insertion evidence) that are outside the
+        // range of the depth map
+        printError("Warning: Read depth for position " + std::to_string(start) + " is out of range of size " + std::to_string(pos_depth_map.size()));
     }
 
     return read_depth;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index b94b6a78..ed3f8802 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -213,13 +213,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                         printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with median SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
                     }
                 }
-
-                // if (cluster_id < 0) {
-                //     merged_sv_call.cluster_size = cluster_id;
-                // } else {
-                //     merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
-                // }
-                // merged_sv_calls.push_back(merged_sv_call);
                 cluster_count++;
             }
         }

From a0e713fece9c562c58f6a0914f3bc2de1faa5072 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 31 Mar 2025 15:31:31 -0400
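Subject: [PATCH 100/134] update read depth and mismatch rate

Stop looking up read depth while building SV candidates (a placeholder
depth of 0 is stored instead) and pass the per-chromosome depth map to
saveToVCF. Remove the per-read mismatch-rate map: the mismatch rate is
now stored on each supplementary alignment, and split-read candidates
carry the mean supplementary mismatch rate rather than the minimum
per-read rate across the primary cluster. Also remove commented-out
code and some [TEST] debug messages.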

---
 include/sv_caller.h |  93 ++--------------------------
 src/sv_caller.cpp   | 146 +++++++++++++-------------------------------
 2 files changed, 48 insertions(+), 191 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 247adf8c..4a30820c 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -17,87 +17,6 @@
 #include <future>
 /// @endcond
 
-// struct GenomicRegion {
-//     int tid;
-//     hts_pos_t start;
-//     hts_pos_t end;
-//     int query_start;
-//     int query_end;
-//     bool strand;
-//     int cluster_size;  // Number of alignments used for this region
-// };
-
-// struct PrimaryAlignment {
-//     hts_pos_t start;
-//     hts_pos_t end;
-//     int query_start;
-//     int query_end;
-//     bool strand;
-//     int cluster_size;  // Number of alignments used for this region
-// };
-
-// struct SuppAlignment {
-//     int tid;
-//     hts_pos_t start;
-//     hts_pos_t end;
-//     int query_start;
-//     int query_end;
-//     bool strand;
-//     int cluster_size;  // Number of alignments used for this region
-// };
-
-// struct SplitSignature {
-//     int tid;
-//     hts_pos_t start;
-//     hts_pos_t end;
-//     bool strand;
-//     hts_pos_t query_start;
-//     hts_pos_t query_end;
-// };
-
-// // Interval Tree Node
-// struct IntervalNode {
-//     PrimaryAlignment region;
-//     std::string qname;
-//     hts_pos_t max_end;  // To optimize queries
-//     std::unique_ptr<IntervalNode> left;
-//     std::unique_ptr<IntervalNode> right;
-
-//     IntervalNode(PrimaryAlignment r, std::string name)
-//         : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {}
-// };
-
-// void insert(std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& region, std::string qname) {
-//     if (!root) {
-//         root = std::make_unique<IntervalNode>(region, qname);
-//         return;
-//     }
-
-//     if (region.start < root->region.start)
-//     {
-//         insert(root->left, region, qname);
-//     } else {
-//         insert(root->right, region, qname);
-//     }
-
-//     // Update max_end
-//     root->max_end = std::max(root->max_end, region.end);
-// }
-
-// void findOverlaps(const std::unique_ptr<IntervalNode>& root, const PrimaryAlignment& query, std::vector<std::string>& result) {
-//     if (!root) return;
-
-//     // If overlapping, add to result
-//     if (query.start <= root->region.end && query.end >= root->region.start)
-//         result.push_back(root->qname);
-
-//     // If left subtree may have overlaps, search left
-//     if (root->left && root->left->max_end >= query.start)
-//         findOverlaps(root->left, query, result);
-
-//     // Always check the right subtree
-//     findOverlaps(root->right, query, result);
-// }
 
 class SVCaller {
     private:
@@ -127,7 +46,7 @@ class SVCaller {
             int query_start;
             int query_end;
             bool strand;
-            int cluster_size;  // Number of alignments used for this region
+            double mismatch_rate;  // Mismatch rate for this alignment
         };
 
         struct SplitSignature {
@@ -159,13 +78,13 @@ class SVCaller {
         void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, const ReferenceGenome& ref_genome);
 
         // Process a single CIGAR record and find candidate SVs
-        void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map<std::string, double>& read_mismatch_rates);
+        void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map);
 
         std::pair<int, int> getAlignmentReadPositions(bam1_t* alignment);
 
-        void processChromosome(const std::string& chr, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::unordered_map<std::string, double>& read_mismatch_rates);
+        void processChromosome(const std::string& chr, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov);
 
-        void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map<std::string, double>& read_mismatch_rates);
+        void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map);
 
         double getReadMismatchRate(bam1_t * alignment, const std::string& chr, const ReferenceGenome & ref_genome);
  
@@ -174,10 +93,10 @@ class SVCaller {
 
         void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector<uint32_t> &pos_depth_map, const InputData &input_data);
 
-        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome) const;
+        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const;
 
         // Query the read depth (INFO/DP) at a position
-        int getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start);
+        int getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start) const;
 
     public:
         SVCaller() = default;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 7a7a3191..b25991be 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -133,7 +133,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     uint32_t num_alignments = 0;
     std::unordered_set<int> alignment_tids;  // All unique chromosome IDs
     std::unordered_set<std::string> supp_qnames;  // All unique query names
-    std::unordered_map<std::string, double> read_mismatch_rates;  // Query name -> mismatch rate
     while (readNextAlignment(fp_in, itr, bam1) >= 0) {
 
         // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality
@@ -158,14 +157,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // Get the mismatch rate for the read
             const std::string supp_chr = bamHdr->target_name[bam1->core.tid];
             double mismatch_rate = getReadMismatchRate(bam1, supp_chr, ref_genome);
-            read_mismatch_rates[qname] = mismatch_rate;
-            // printMessage("[TEST] Mismatch rate for " + qname + ": " + std::to_string(mismatch_rate) + " at position " + std::to_string(bam1->core.pos + 1) + std::string(bam_endpos(bam1) > 0 ? "-" + std::to_string(bam_endpos(bam1)) : ""));
 
             // Store chromosome (TID), start, and end positions (1-based) of the
             // supplementary alignment, and the strand (true for forward, false
             // for reverse)
             std::pair<int, int> qpos = getAlignmentReadPositions(bam1);
-            supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0});
+            supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), mismatch_rate});
             alignment_tids.insert(bam1->core.tid);
             supp_qnames.insert(qname);
             supplementary_count++;
@@ -177,6 +174,10 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         }
     }
 
+    // Clean up the iterator and alignment
+    hts_itr_destroy(itr);
+    bam_destroy1(bam1);
+    
     // Remove primary alignments without supplementary alignments
     std::unordered_map<int, std::unordered_set<std::string>> to_remove;
     for (auto& chr_primary : primary_map) {
@@ -198,6 +199,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     }
     printMessage("Removed " + std::to_string(total_removed) + " primary alignments without supplementary alignments");
 
+    // Process the primary alignments and find SVs
     for (const auto& chr_primary : primary_map) {
         int primary_tid = chr_primary.first;
         std::string chr_name = bamHdr->target_name[primary_tid];
@@ -267,14 +269,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             std::vector<int> starts;
             std::vector<int> ends;
             std::vector<bool> primary_strands;
-            double min_mismatch_rate = 1.0;
             for (const std::string& qname : primary_cluster) {
                 const PrimaryAlignment& primary_aln = chr_primary_map.at(qname);
                 starts.push_back(primary_aln.start);
                 ends.push_back(primary_aln.end);
                 primary_strands.push_back(primary_aln.strand);
-                min_mismatch_rate = std::min(min_mismatch_rate, read_mismatch_rates[qname]);
-                // printMessage("[TEST-SPLIT] Mismatch rate for " + qname + ": " + std::to_string(read_mismatch_rates[qname]));
             }
 
             // Get the largest cluster of primary alignment start positions
@@ -296,6 +295,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             std::vector<int> supp_ends;
             std::vector<bool> supp_strands;
             std::vector<int> split_distances;
+            std::vector<double> supp_mismatch_rates;
             for (const std::string& qname : primary_cluster) {
                 const PrimaryAlignment& primary_aln = chr_primary_map.at(qname);
                 const std::vector<SuppAlignment>& supp_alns = supp_map.at(qname);
@@ -306,6 +306,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         supp_starts.push_back(supp_aln.start);
                         supp_ends.push_back(supp_aln.end);
                         supp_strands.push_back(supp_aln.strand);
+                        supp_mismatch_rates.push_back(supp_aln.mismatch_rate);
 
                         // Calculate the distance between the primary and supplementary
                         // alignments on the read if on the same chromosome and same
@@ -326,6 +327,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     }
                 }
             }
+            double mean_supp_mismatch_rate = 0.0;
+            for (double rate : supp_mismatch_rates) {
+                mean_supp_mismatch_rate += rate;
+            }
+            mean_supp_mismatch_rate /= (double)supp_mismatch_rates.size();
 
             // Get the largest cluster of supplementary alignment start positions
             dbscan.fit(supp_starts);
@@ -382,8 +388,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // If two positions were found, use the 5'most position
                         primary_pos = std::min(primary_pos, primary_pos2);
                     }
-                    int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos);
-                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, primary_cluster_size);
+                    //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos);
+                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, primary_cluster_size);
                     
                     // Print if end position = 162908547
                     if (primary_pos + (read_distance - 1) == 162908547) {
@@ -426,14 +432,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // Store the inversion as the supplementary start and end positions
             if (supp_best_start != -1 && supp_best_end != -1) {
                 if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
-                    int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end));
+                    //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end));
 
                     // Print if end position = 162908547
                     if (std::max(supp_best_start, supp_best_end) == 162908547) {
                         printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(std::min(supp_best_start, supp_best_end)) + "-" + std::to_string(std::max(supp_best_start, supp_best_end)) + " with length " + std::to_string(std::abs(supp_best_start - supp_best_end)));
                     }
 
-                    SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, supp_cluster_size);
+                    SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -469,22 +475,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
 
                 // Add an inversion call if necessary
-                int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
+                //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
                 if (inversion) {
-                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size);
+                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
-
-                    // Print if end position = 162908547
-                    if (sv_end == 162908547) {
-                        printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length));
-                    }
                 } else {
-                    SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size);
-                    
-                    // Print if end position = 162908547
-                    if (sv_end == 162908547) {
-                        printMessage("[TEST] Adding deletion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and mismatch rate " + std::to_string(min_mismatch_rate));
-                    }
+                    SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -493,13 +489,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             else if (sv_length >= min_length && sv_length <= max_length) {
                 SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
                 std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
-                int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
-
-                // Print if end position = 162908547
-                if (sv_end == 162908547) {
-                    printMessage("[TEST] Adding CNV SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and type " + getSVTypeSymbol(sv_type) + " and mismatch rate " + std::to_string(min_mismatch_rate));
-                }
-                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size);
+                //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
+                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size);
                 addSVCall(chr_sv_calls, sv_candidate);
             }
         }
@@ -526,7 +517,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     }
 }
 
-void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map<std::string, double>& read_mismatch_rates)
+void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map)
 {
     // Create a read and iterator for the region
     bam1_t *bam1 = bam_init1();
@@ -552,7 +543,7 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c
 
         // Process the alignment
         // bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
-        this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map, ref_genome, read_mismatch_rates);
+        this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map);
     }
 
     // Clean up the iterator and alignment
@@ -609,15 +600,12 @@ double SVCaller::getReadMismatchRate(bam1_t *alignment, const std::string& chr,
     return mismatch_rate;
 }
 
-void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, const std::vector<uint32_t> &pos_depth_map, const ReferenceGenome &ref_genome, std::unordered_map<std::string, double> &read_mismatch_rates)
+void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, const std::vector<uint32_t> &pos_depth_map)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t aln_start = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
     uint32_t pos = aln_start;
-    uint32_t end = (uint32_t)bam_endpos(alignment) - 1;  // Rightmost position of the alignment in the reference genome (0-based)
-
-    // Get the reference sequence (used for mismatch rate)
-    std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1);
+    // uint32_t end = (uint32_t)bam_endpos(alignment) - 1;  // Rightmost position of the alignment in the reference genome (0-based)
 
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
@@ -635,8 +623,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
         amb_bases_bitset.set(std::tolower(base));
     }
 
-    int match_count = 0;
-    int mismatch_count = 0;
     std::vector<SVCall> cigar_sv_calls;
     cigar_sv_calls.reserve(1000);
     for (int i = 0; i < cigar_len; i++) {
@@ -658,54 +644,18 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                         ins_seq_str[j] = base;
                     }
                 }
-                
-                // // Before the insertion
-                // if (pos >= (uint32_t)op_len-1)
-                // {
-                //     uint32_t bp1 = pos - (op_len - 1) + 1;
-                //     uint32_t bp2 = bp1 + op_len - 1; //pos + 1;
-
-                //     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
-                //     {
-                //         int read_depth = this->getReadDepth(pos_depth_map, bp1);
-                //         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "LSEQSIM", "./.", default_lh, read_depth, 1, 0);
-                //         // addSVCall(sv_calls, sv_call);
-                //         addSVCall(cigar_sv_calls, sv_call);
-                //         continue;
-                //     }
-                // }
-
-                // // After the insertion
-                // if (pos + op_len < ref_genome.getChromosomeLength(chr))
-                // {
-                //     uint32_t bp1 = pos + 1;
-                //     uint32_t bp2 = bp1 + op_len - 1;
-
-                //     if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD))
-                //     {
-                //         int read_depth = this->getReadDepth(pos_depth_map, bp1);
-                //         SVCall sv_call(bp1, bp2, SVType::DUP, "<DUP>", "RSEQSIM", "./.", default_lh, read_depth, 1, 0);
-                //         // addSVCall(sv_calls, sv_call);
-                //         addSVCall(cigar_sv_calls, sv_call);
-                //         continue;
-                //     }
-                // }
 
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
+                //int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
                 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele,
-                // "CIGARINS", "./.", default_lh, read_depth, 1, 0);
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, read_depth, 1, 0);
-                // addSVCall(sv_calls, sv_call);
-                // addSVCall(cigar_sv_calls, sv_call);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 1, 0);
                 cigar_sv_calls.emplace_back(sv_call);
             
             // Process clipped bases as potential insertions
@@ -713,7 +663,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 // Soft-clipped bases are considered as potential insertions
                 // Skip if the position exceeds the reference genome length
                 if (pos + 1 >= pos_depth_map.size()) {
-                    // printMessage("Skipping soft-clipped insertion at position " + std::to_string(pos + 1) + " as it exceeds the reference genome length");
                     continue;
                 }
 
@@ -732,29 +681,25 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
+                //int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele,
-                // "CIGARCLIP", "./.", default_lh, read_depth, 0.0, 0);
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, read_depth, 0.0, 0);
-                // addSVCall(sv_calls, sv_call);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0.0, 0);
                 cigar_sv_calls.emplace_back(sv_call);  // Commented for testing
-                // printMessage("Completed adding SV: " + std::to_string(ins_pos) + "-" + std::to_string(ins_end) + " " + alt_allele + ", RD=" + std::to_string(read_depth) + ", data type=" + sv_call.data_type);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL) {
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                int read_depth = this->getReadDepth(pos_depth_map, ref_pos);
+                //int read_depth = this->getReadDepth(pos_depth_map, ref_pos);
                 // SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>",
                 // "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
-                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, read_depth, 1, 0);
+                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 1, 0);
                 // addSVCall(sv_calls, sv_call);
                 // addSVCall(cigar_sv_calls, sv_call);
                 cigar_sv_calls.emplace_back(sv_call);
@@ -842,7 +787,7 @@ std::pair<int, int> SVCaller::getAlignmentReadPositions(bam1_t *alignment)
     return std::make_pair(query_start, query_end);
 }
 
-void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov, std::unordered_map<std::string, double>& read_mismatch_rates)
+void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& chr_sv_calls, const InputData& input_data, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov)
 {
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -883,7 +828,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     // -----------------------------------------------------------------------
     // Detect SVs from the CIGAR strings
     printMessage(chr + ": CIGAR SVs...");
-    this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome, read_mismatch_rates);
+    this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map);
 
     printMessage(chr + ": Merging CIGAR...");
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
@@ -956,7 +901,6 @@ void SVCaller::run(const InputData& input_data)
         chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
     }
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
-    std::unordered_map<std::string, double> read_mismatch_rates;
     int current_chr = 0;
     int total_chr_count = chromosomes.size();
 
@@ -973,18 +917,10 @@ void SVCaller::run(const InputData& input_data)
             try {
                 std::vector<SVCall> sv_calls;
                 InputData chr_input_data = input_data;  // Use a thread-local copy
-                std::unordered_map<std::string, double> chr_read_mismatch_rates;
-                this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], chr_read_mismatch_rates);
+                this->processChromosome(chr, sv_calls, chr_input_data, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
                 {
                     std::shared_lock<std::shared_mutex> lock(this->shared_mutex);
-                    
-                    // Update the SV calls for the chromosome
                     whole_genome_sv_calls[chr] = std::move(sv_calls);
-
-                    // Update the mismatch rates for each read name
-                    for (const auto& entry : chr_read_mismatch_rates) {
-                        read_mismatch_rates[entry.first] = entry.second;
-                    }
                 }
             } catch (const std::exception& e) {
                 printError("Error processing chromosome " + chr + ": " + e.what());
@@ -997,7 +933,6 @@ void SVCaller::run(const InputData& input_data)
         std::vector<std::future<void>> futures;
         for (const auto& chr : chromosomes) {
             futures.emplace_back(pool.enqueue([&, chr] {
-                // printMessage("Processing chromosome " + chr);
                 process_chr(chr);
             }));
         }
@@ -1083,7 +1018,7 @@ void SVCaller::run(const InputData& input_data)
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
     const std::string output_dir = input_data.getOutputDir();
-    this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome);
+    this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map);
 }
 
 void SVCaller::findOverlaps(const std::unique_ptr<IntervalNode> &root, const PrimaryAlignment &query, std::vector<std::string> &result)
@@ -1192,7 +1127,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     }
 }
 
-void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const
+void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const
 {
     std::cout << "Creating VCF writer..." << std::endl;
     std::string output_vcf = output_dir + "/output.vcf";
@@ -1286,7 +1221,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             double hmm_likelihood = sv_call.hmm_likelihood;
             int sv_length = end - start + 1;
             int cluster_size = sv_call.cluster_size;
-            int read_depth = sv_call.read_depth;
+            //int read_depth = sv_call.read_depth;
             std::string ref_allele = ".";
             double mismatch_rate = sv_call.mismatch_rate;
             std::string filter = "PASS";
@@ -1348,6 +1283,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                     base = 'N';
                 }
             }
+            
+            // Get read depth
+            int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr), start);
 
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
@@ -1374,7 +1312,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     printMessage("Total PASS filtered SVs: " + std::to_string(filtered_svs));
 }
 
-int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start)
+int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start) const
 {
     int read_depth = 0;
     try {

From b7eed21fc6816d12e6a96df8622ef430594ce4b9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 11 Apr 2025 11:24:54 -0400
Subject: [PATCH 101/134] add cn state and read distance features

---
 include/cnv_caller.h |   2 +-
 include/sv_caller.h  |   5 +-
 include/sv_object.h  |  12 +-
 src/cnv_caller.cpp   |  12 +-
 src/sv_caller.cpp    | 530 +++++++++++++++++++++++--------------------
 src/sv_object.cpp    |   2 +-
 6 files changed, 296 insertions(+), 267 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 87a9f011..03764da8 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -89,7 +89,7 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, Genotype, bool> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
+        std::tuple<double, SVType, Genotype, bool, int> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 4a30820c..4ef59700 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -46,7 +46,6 @@ class SVCaller {
             int query_start;
             int query_end;
             bool strand;
-            double mismatch_rate;  // Mismatch rate for this alignment
         };
 
         struct SplitSignature {
@@ -75,7 +74,7 @@ class SVCaller {
 
         std::vector<std::string> getChromosomes(const std::string& bam_filepath);
 
-        void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, const ReferenceGenome& ref_genome);
+        void findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData& input_data);
 
         // Process a single CIGAR record and find candidate SVs
         void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map);
@@ -85,8 +84,6 @@ class SVCaller {
         void processChromosome(const std::string& chr, std::vector<SVCall>& combined_sv_calls, const InputData& input_data, const std::vector<uint32_t>& chr_pos_depth_map, double mean_chr_cov);
 
         void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map);
-
-        double getReadMismatchRate(bam1_t * alignment, const std::string& chr, const ReferenceGenome & ref_genome);
  
         // Read the next alignment from the BAM file in a thread-safe manner
         int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1);
diff --git a/include/sv_object.h b/include/sv_object.h
index 08fe8f70..a99fb4fb 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -21,16 +21,20 @@ struct SVCall {
     SVDataType data_type = SVDataType::UNKNOWN;
     Genotype genotype = Genotype::UNKNOWN;
     double hmm_likelihood = 0.0;
-    int read_depth = 0;  // Breakpoint depth
-    double mismatch_rate = 0.0;  // Highest mismatch rate in reads used for the SV call
+    int cn_state = 0;  // Copy number state
+    int aln_offset = 0;  // Alignment offset (read vs. reference distance factor)
+    // int read_depth = 0;  // Breakpoint depth
+    // double mismatch_rate = 0.0;  // Highest mismatch rate in reads used for the SV call
     int cluster_size = 0;  // Number of SV calls in the cluster
 
     bool operator<(const SVCall& other) const;
 
     SVCall() = default;
 
-    SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {}
+    SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {}
+    // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) :
+    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {}
 };
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c293d7fb..c7832fd1 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -155,13 +155,13 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp_hmm);
 }
 
-std::tuple<double, SVType, Genotype, bool> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
+std::tuple<double, SVType, Genotype, bool, int> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Check that the start position is less than the end position
     if (start_pos > end_pos)
     {
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
-        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false);
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0);
     }
 
     // Run the Viterbi algorithm on SNPs in the SV region
@@ -197,7 +197,7 @@ std::tuple<double, SVType, Genotype, bool> CNVCaller::runCopyNumberPrediction(st
     runViterbi(hmm, snp_data, prediction);
     if (prediction.first.size() == 0)
     {
-        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false);
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0);
     }
 
     std::vector<int>& state_sequence = prediction.first;
@@ -238,7 +238,6 @@ std::tuple<double, SVType, Genotype, bool> CNVCaller::runCopyNumberPrediction(st
     if ((double) max_count / (double) state_count > pct_threshold)
     {
         predicted_cnv_type = getSVTypeFromCNState(max_state);
-        // genotype = cnv_genotype_map.at(max_state);
         genotype = getGenotypeFromCNState(max_state);
     }
     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
@@ -281,7 +280,7 @@ std::tuple<double, SVType, Genotype, bool> CNVCaller::runCopyNumberPrediction(st
         this->saveSVCopyNumberToJSON(before_sv, after_sv, snp_data, chr, start_pos, end_pos, cnv_type_str, likelihood, json_filepath);
     }
     
-    return std::make_tuple(likelihood, predicted_cnv_type, genotype, true);
+    return std::make_tuple(likelihood, predicted_cnv_type, genotype, true, max_state);
 }
 
 
@@ -369,9 +368,10 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         if (is_valid_update)
         {
             sv_call.sv_type = updated_sv_type;
+            sv_call.data_type = SVDataType::HMM;
             sv_call.hmm_likelihood = likelihood;
             sv_call.genotype = genotype;
-            sv_call.data_type = SVDataType::HMM;
+            sv_call.cn_state = max_state;
         }
     }
 }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b25991be..c5607e31 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -63,7 +63,7 @@ std::vector<std::string> SVCaller::getChromosomes(const std::string &bam_filepat
     return chromosomes;
 }
 
-void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, const ReferenceGenome& ref_genome)
+void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data)
 {
     // Open the BAM file
     std::string bam_filepath = input_data.getLongReadBam();
@@ -154,15 +154,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
         // Process supplementary alignments
         } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) {
-            // Get the mismatch rate for the read
-            const std::string supp_chr = bamHdr->target_name[bam1->core.tid];
-            double mismatch_rate = getReadMismatchRate(bam1, supp_chr, ref_genome);
-
             // Store chromosome (TID), start, and end positions (1-based) of the
             // supplementary alignment, and the strand (true for forward, false
             // for reverse)
             std::pair<int, int> qpos = getAlignmentReadPositions(bam1);
-            supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), mismatch_rate});
+            supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE)});
             alignment_tids.insert(bam1->core.tid);
             supp_qnames.insert(qname);
             supplementary_count++;
@@ -206,6 +202,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         printMessage("Processing chromosome " + chr_name + " with " + std::to_string(chr_primary.second.size()) + " primary alignments");
 
         std::vector<SVCall> chr_sv_calls;
+        chr_sv_calls.reserve(1000);
         const std::unordered_map<std::string, PrimaryAlignment>& chr_primary_map = chr_primary.second;
 
         // Identify overlapping primary alignments and cluster endpoints
@@ -294,32 +291,33 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             std::vector<int> supp_starts;
             std::vector<int> supp_ends;
             std::vector<bool> supp_strands;
-            std::vector<int> split_distances;
-            std::vector<double> supp_mismatch_rates;
+            std::vector<int> read_distances;
+            std::vector<int> ref_distances;
             for (const std::string& qname : primary_cluster) {
                 const PrimaryAlignment& primary_aln = chr_primary_map.at(qname);
                 const std::vector<SuppAlignment>& supp_alns = supp_map.at(qname);
                 for (const SuppAlignment& supp_aln : supp_alns) {
                     if (supp_aln.tid == primary_tid) {
                         // Same chromosome
-                        int distance = 0;
+                        int read_distance = 0;
+                        int ref_distance = 0;
                         supp_starts.push_back(supp_aln.start);
                         supp_ends.push_back(supp_aln.end);
                         supp_strands.push_back(supp_aln.strand);
-                        supp_mismatch_rates.push_back(supp_aln.mismatch_rate);
 
                         // Calculate the distance between the primary and supplementary
                         // alignments on the read if on the same chromosome and same
                         // strand
                         if (supp_aln.strand == primary_aln.strand) {
                             // Same strand
-                            // Calculate distance (negative if overlapping)
-                            if (primary_aln.query_start <= supp_aln.query_start) {
-                                distance = supp_aln.query_start - primary_aln.query_end;
-                            } else {
-                                distance = primary_aln.query_start - supp_aln.query_end;
-                            }
-                            split_distances.push_back(distance);
+                            // Calculate distance between alignments on the read
+                            read_distance = std::max(0, std::max(static_cast<int>(supp_aln.query_start), static_cast<int>(primary_aln.query_start)) - std::min(static_cast<int>(supp_aln.query_end), static_cast<int>(primary_aln.query_end)));
+
+                            // Calculate distance between alignments on the
+                            // reference
+                            ref_distance = std::max(0, std::max(static_cast<int>(supp_aln.start), static_cast<int>(primary_aln.start)) - std::min(static_cast<int>(supp_aln.end), static_cast<int>(primary_aln.end)));
+                            read_distances.push_back(read_distance);
+                            ref_distances.push_back(ref_distance);
                         }
 
                     } else {
@@ -327,11 +325,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     }
                 }
             }
-            double mean_supp_mismatch_rate = 0.0;
-            for (double rate : supp_mismatch_rates) {
-                mean_supp_mismatch_rate += rate;
-            }
-            mean_supp_mismatch_rate /= (double)supp_mismatch_rates.size();
 
             // Get the largest cluster of supplementary alignment start positions
             dbscan.fit(supp_starts);
@@ -341,158 +334,293 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             dbscan.fit(supp_ends);
             std::vector<int> supp_end_cluster = dbscan.getLargestCluster(supp_ends);
 
-            // Get the largest cluster of split distances
-            dbscan.fit(split_distances);
-            std::vector<int> split_distance_cluster = dbscan.getLargestCluster(split_distances);
+            // Get the largest cluster of read distances
+            dbscan.fit(read_distances);
+            std::vector<int> read_distance_cluster = dbscan.getLargestCluster(read_distances);
+
+            // Get the largest cluster of reference distances
+            dbscan.fit(ref_distances);
+            std::vector<int> ref_distance_cluster = dbscan.getLargestCluster(ref_distances);
 
             // Continue if no clusters were found
-            if (supp_start_cluster.empty() && supp_end_cluster.empty() && split_distance_cluster.empty()) {
+            if (supp_start_cluster.empty() && supp_end_cluster.empty() && read_distance_cluster.empty() && ref_distance_cluster.empty()) {
                 continue;
             }
 
             // Use the median of the largest cluster of primary and supplementary
             // alignment start, end positions as the final genome coordinates of the
             // SV
-            int primary_pos = -1;
-            int primary_pos2 = -1;
+            // int primary_pos = -1;
+            // int primary_pos2 = -1;
+            std::vector<int> primary_positions;
             int primary_cluster_size = 0;
-            if (primary_start_cluster.size() > primary_end_cluster.size()) {
+            if (!primary_start_cluster.empty()) {
                 std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-                primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+                primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
                 primary_cluster_size = primary_start_cluster.size();
-            } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
-                std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-                primary_pos = primary_end_cluster[primary_end_cluster.size() / 2];
-                primary_cluster_size = primary_end_cluster.size();
-            } else {
-                // Use both positions
-                std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+            }
+
+            if (!primary_end_cluster.empty()) {
                 std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-                primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
-                primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2];
-                primary_cluster_size = primary_start_cluster.size();
+                primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]);
+                primary_cluster_size = std::max(primary_cluster_size, (int) primary_end_cluster.size());
             }
+            // if (primary_start_cluster.size() > primary_end_cluster.size()) {
+            //     std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+            //     // primary_pos =
+            //     // primary_start_cluster[primary_start_cluster.size() / 2];
+            //     primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
+            //     primary_cluster_size = primary_start_cluster.size();
+            // } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
+            //     std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+            //     // primary_pos = primary_end_cluster[primary_end_cluster.size()
+            //     // / 2];
+            //     primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]);
+            //     primary_cluster_size = primary_end_cluster.size();
+            // } else {
+            //     // Use both positions
+            //     std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
+            //     std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
+            //     // primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
+            //     // primary_pos2 = primary_end_cluster[primary_end_cluster.size()
+            //     // / 2];
+            //     primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
+            //     primary_cluster_size = primary_start_cluster.size();
+            // }
 
             // -------------------------------
-            // SPLIT INSERTION DETECTION
+            // SPLIT INSERTION CALLS
             int read_distance = 0;
-            if (!split_distance_cluster.empty()) {
+            int ref_distance = 0;
+            if (!read_distance_cluster.empty() && !ref_distance_cluster.empty()) {
                 // Use the median of the largest cluster of split distances as the
                 // insertion size
-                std::sort(split_distance_cluster.begin(), split_distance_cluster.end());
-                read_distance = split_distance_cluster[split_distance_cluster.size() / 2];
+                std::sort(read_distance_cluster.begin(), read_distance_cluster.end());
+                read_distance = read_distance_cluster[read_distance_cluster.size() / 2];
+                
+                std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end());
+                ref_distance = ref_distance_cluster[ref_distance_cluster.size() / 2];
 
                 // Add an insertion SV call at the primary position
-                if (primary_pos != -1 && read_distance > 2000) {
-                    if (primary_pos2 != -1) {
-                        // If two positions were found, use the 5'most position
-                        primary_pos = std::min(primary_pos, primary_pos2);
-                    }
-                    //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos);
-                    SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, primary_cluster_size);
-                    
-                    // Print if end position = 162908547
-                    if (primary_pos + (read_distance - 1) == 162908547) {
-                        printMessage("[TEST] Adding insertion SV candidate at " + chr_name + ":" + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance - 1)) + " with length " + std::to_string(read_distance));
+                // Using a minimum read distance of 2000bp since most insertions
+                // < 2kb can be identified more accurately using CIGAR-based
+                // methods
+                // if (primary_pos != -1 && read_distance > 2000) {
+                // if (primary_pos != -1) {
+                //     if (primary_pos2 != -1) {
+                //         // If two positions were found, use the 5'most position
+                //         primary_pos = std::min(primary_pos, primary_pos2);
+                //     }
+                if (!primary_positions.empty()) {
+                    int aln_offset = static_cast<int>(ref_distance - read_distance);
+                    if (read_distance > ref_distance  && read_distance >= min_length && read_distance <= max_length) {
+                        // Add an insertion SV call at the primary positions
+                        SVType sv_type = SVType::INS;
+                        // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                        // addSVCall(chr_sv_calls, sv_candidate);
+                        for (int primary_pos : primary_positions) {
+                            SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                            addSVCall(chr_sv_calls, sv_candidate);
+                        }
+                    } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) {
+                        for (int primary_pos : primary_positions) {
+                            SVType sv_type = SVType::DEL;
+                            SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                            addSVCall(chr_sv_calls, sv_candidate);
+                        }
+                        // Add a deletion SV call at the primary position
+                        // SVType sv_type = SVType::DEL;
+                        // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                     }
-                    addSVCall(chr_sv_calls, sv_candidate);
                 }
+
+                    // if (ref_distance >= 50 && ref_distance < read_distance) {
+                    //     // Add an insertion SV call at the primary position
+                    //     SVType sv_type = SVType::INS;
+                    //     int aln_offset = static_cast<int>(ref_distance - read_distance);
+                    //     SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                    //     addSVCall(chr_sv_calls, sv_candidate);
+                    // }
+                    // SVType sv_type = SVType::INS;
+                    // int aln_offset = static_cast<int>(read_distance - ref_distance);
+                    // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                    // addSVCall(chr_sv_calls, sv_candidate);
             }
 
+            // if (!ref_distance_cluster.empty()) {
+            //     // Use the median of the largest cluster of split distances as the
+            //     // insertion size
+            //     std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end());
+            //     size_t median_index = ref_distance_cluster.size() / 2;
+            //     ref_distance = ref_distance_cluster[median_index];
+            //     read_distance = read_distance_cluster[median_index];
+
+            //     // Add a deletion SV call at the primary position
+            //     if (primary_pos != -1 && ref_distance >= 50 && ref_distance > read_distance) {
+            //         if (primary_pos2 != -1) {
+            //             // If two positions were found, use the 5'most position
+            //             primary_pos = std::min(primary_pos, primary_pos2);
+            //         }
+            //         SVType sv_type = SVType::DEL;
+            //         int aln_offset = static_cast<int>(ref_distance - read_distance);
+            //         SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+            //         addSVCall(chr_sv_calls, sv_candidate);
+
+            //         // Add an inversion if necessary (inverted deletion)
+            //         if (inversion) {
+            //             SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+            //             addSVCall(chr_sv_calls, sv_candidate);
+            //         }
+            //     }
+            // }
+
             // --------------------------------
 
             // Get the supplementary alignment positions
-            int supp_pos = -1;
-            int supp_pos2 = -1;
+            // int supp_pos = -1;
+            // int supp_pos2 = -1;
+            std::vector<int> supp_positions;
             int supp_cluster_size = 0;
-            int supp_best_start = -1;
-            int supp_best_end = -1;
+            // int supp_best_start = -1;
+            // int supp_best_end = -1;
+            // if (!supp_start_cluster.empty()) {
+            //     std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+            //     supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2];
+            // }
+            // if (!supp_end_cluster.empty()) {
+            //     std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+            //     supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2];
+            // }
+
             if (!supp_start_cluster.empty()) {
                 std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-                supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2];
+                supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
+                supp_cluster_size = supp_start_cluster.size();
             }
             if (!supp_end_cluster.empty()) {
                 std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-                supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2];
+                supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
+                supp_cluster_size = std::max(supp_cluster_size, (int) supp_end_cluster.size());
             }
 
-            if (supp_start_cluster.size() > supp_end_cluster.size()) {
-                supp_pos = supp_best_start;
-                supp_cluster_size = supp_start_cluster.size();
-            } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
-                supp_pos = supp_best_end;
-                supp_cluster_size = supp_end_cluster.size();
-            } else if (supp_best_end == -1 && supp_best_start == -1) {
-                // Use both positions. This has been shown to occur in some nested SVs
-                supp_pos = supp_best_start;
-                supp_pos2 = supp_best_end;
-                supp_cluster_size = supp_start_cluster.size();
-            }
+            // if (supp_start_cluster.size() > supp_end_cluster.size()) {
+            //     // supp_pos = supp_best_start;
+            //     std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+            //     supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
+            //     // supp_positions.push_back(supp_best_start);
+            //     supp_cluster_size = supp_start_cluster.size();
+            // } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
+            //     // supp_pos = supp_best_end;
+            //     // supp_positions.push_back(supp_best_end);
+            //     std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+            //     supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
+            //     supp_cluster_size = supp_end_cluster.size();
+            // } else if (supp_start_cluster.size() == supp_end_cluster.size() && !supp_start_cluster.empty() && !supp_end_cluster.empty()) {
+            //     // Use both positions. This has been shown to occur in some nested SVs
+            //     // supp_pos = supp_best_start;
+            //     // supp_pos2 = supp_best_end;
+            //     std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+            //     std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+            //     supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
+            //     supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
+            //     supp_cluster_size = supp_start_cluster.size();
+            //     // supp_positions.push_back(supp_best_start);
+            //     // supp_positions.push_back(supp_best_end);
+            //     supp_cluster_size = supp_start_cluster.size();
+            // }
 
             // Store the inversion as the supplementary start and end positions
-            if (supp_best_start != -1 && supp_best_end != -1) {
-                if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) {
-                    //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end));
-
-                    // Print if end position = 162908547
-                    if (std::max(supp_best_start, supp_best_end) == 162908547) {
-                        printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(std::min(supp_best_start, supp_best_end)) + "-" + std::to_string(std::max(supp_best_start, supp_best_end)) + " with length " + std::to_string(std::abs(supp_best_start - supp_best_end)));
-                    }
-
-                    SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, supp_cluster_size);
+            if (inversion && supp_positions.size() > 1) {
+                std::sort(supp_positions.begin(), supp_positions.end());
+                int supp_start = supp_positions.front();
+                int supp_end = supp_positions.back();
+                int sv_length = std::abs(supp_start - supp_end);
+                if (sv_length >= min_length && sv_length <= max_length) {
+                    SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
+                // int sv_length = std::abs(supp_best_start - supp_best_end);
+                // if (inversion && sv_length >= min_length && sv_length <= max_length) {
+                //     // SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
+                //     // addSVCall(chr_sv_calls, sv_candidate);
+                // }
             }
 
             // If two of either were found, use the larger SV candidate
-            if (primary_pos2 != -1) {
-                int sv_length1 = std::abs(primary_pos - supp_pos);
-                int sv_length2 = std::abs(primary_pos2 - supp_pos);
-                if (sv_length2 > sv_length1) {
-                    primary_pos = primary_pos2;
-                }
-            }
-            if (supp_pos2 != -1) {
-                int sv_length1 = std::abs(primary_pos - supp_pos);
-                int sv_length2 = std::abs(primary_pos - supp_pos2);
-                if (sv_length2 > sv_length1) {
-                    supp_pos = supp_pos2;
-                }
-            }
+            // if (primary_pos2 != -1) {
+            //     int sv_length1 = std::abs(primary_pos - supp_pos);
+            //     int sv_length2 = std::abs(primary_pos2 - supp_pos);
+            //     if (sv_length2 > sv_length1) {
+            //         primary_pos = primary_pos2;
+            //     }
+            // }
+            // if (supp_pos2 != -1) {
+            //     int sv_length1 = std::abs(primary_pos - supp_pos);
+            //     int sv_length2 = std::abs(primary_pos - supp_pos2);
+            //     if (sv_length2 > sv_length1) {
+            //         supp_pos = supp_pos2;
+            //     }
+            // }
 
-            if (primary_pos == -1 || supp_pos == -1) {
-                continue;
-            }
+            // if (primary_pos == -1 || supp_pos == -1) {
+            //     continue;
+            // }
 
             // Store the SV candidate if the length is within the specified range
-            int sv_start = std::min(primary_pos, supp_pos);
-            int sv_end = std::max(primary_pos, supp_pos);
-            int sv_length = sv_end - sv_start + 1;
-            int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
+            // int sv_start = std::min(primary_pos, supp_pos);
+            // int sv_end = std::max(primary_pos, supp_pos);
+            // int sv_length = sv_end - sv_start + 1;
+            // int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
 
             // If the read distance is < 30bp while the SV is > 2kb, then this is a
             // potential deletion
-            if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-
-                // Add an inversion call if necessary
-                //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
-                if (inversion) {
-                    SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size);
-                    addSVCall(chr_sv_calls, sv_candidate);
-                } else {
-                    SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size);
-                    addSVCall(chr_sv_calls, sv_candidate);
-                }
-            }
+            // if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
+
+            //     // Add an inversion call if necessary
+            //     if (inversion) {
+            //         for (int primary_pos : primary_positions) {
+            //             for (int supp_pos : supp_positions) {
+            //                 SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+            //                 addSVCall(chr_sv_calls, sv_candidate);
+            //             }
+            //         }
+            //         // SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+            //         // addSVCall(chr_sv_calls, sv_candidate);
+            //     } else {
+            //         for (int primary_pos : primary_positions) {
+            //             for (int supp_pos : supp_positions) {
+            //                 uint32_t sv_start = std::min(primary_pos, supp_pos);
+            //                 uint32_t sv_end = std::max(primary_pos, supp_pos);
+            //                 if (sv_end - sv_start + 1 >= 50) {
+            //                     SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+            //                     addSVCall(chr_sv_calls, sv_candidate);
+            //             }
+            //         }
+            //         // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+            //         // addSVCall(chr_sv_calls, sv_candidate);
+            //     }
+            // }
 
             // Add a dummy SV call for CNV detection
-            else if (sv_length >= min_length && sv_length <= max_length) {
-                SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
-                std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
-                //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start);
-                SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size);
-                addSVCall(chr_sv_calls, sv_candidate);
+            // if (sv_length >= min_length && sv_length <= max_length) {
+            int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
+            SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
+            std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
+            for (int primary_pos : primary_positions) {
+                for (int supp_pos : supp_positions) {
+                    int sv_start = std::min(primary_pos, supp_pos);
+                    int sv_end = std::max(primary_pos, supp_pos);
+                    int sv_length = sv_end - sv_start + 1;
+                    if (sv_length >= min_length && sv_length <= max_length) {
+                        // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size));
+                        SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+                        addSVCall(chr_sv_calls, sv_candidate);
+                    }
+                }
             }
+            // SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+            // addSVCall(chr_sv_calls, sv_candidate);
+            // }
         }
 
         // Combine SVs with identical start and end positions, and sum the cluster
@@ -505,10 +633,10 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         // Merge duplicate SV calls with identical start positions
         mergeDuplicateSVs(chr_sv_calls);
 
-        printMessage("Merged SVs:");
-        for (const auto& sv : chr_sv_calls) {
-            printMessage(" - " + getSVTypeSymbol(sv.sv_type) + " at " + chr_name + ":" + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " with length " + std::to_string(sv.end - sv.start + 1) + " and cluster size " + std::to_string(sv.cluster_size));
-        }
+        // printMessage("Merged SVs:");
+        // for (const auto& sv : chr_sv_calls) {
+        //     printMessage(" - " + getSVTypeSymbol(sv.sv_type) + " at " + chr_name + ":" + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " with length " + std::to_string(sv.end - sv.start + 1) + " and cluster size " + std::to_string(sv.cluster_size));
+        // }
 
         sv_calls[chr_name] = std::move(chr_sv_calls);
 
@@ -542,7 +670,6 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c
         }
 
         // Process the alignment
-        // bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY);
         this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map);
     }
 
@@ -551,61 +678,11 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c
     bam_destroy1(bam1);
 }
 
-double SVCaller::getReadMismatchRate(bam1_t *alignment, const std::string& chr, const ReferenceGenome & ref_genome)
-{
-    uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
-    int cigar_len = alignment->core.n_cigar;
-    uint32_t query_pos = 0;
-    uint32_t pos = (uint32_t)alignment->core.pos;
-    uint32_t aln_start = pos;
-    uint32_t end = (uint32_t)bam_endpos(alignment) - 1;  // Rightmost position of the alignment in the reference genome (0-based)
-
-    // Get the reference sequence
-    std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1);
-
-    // Loop through the CIGAR string and calculate the number of matches and
-    // mismatches
-    int match_count = 0;
-    int mismatch_count = 0;
-    for (int i = 0; i < cigar_len; i++) {
-        int op_len = bam_cigar_oplen(cigar[i]);  // CIGAR operation length
-        int op = bam_cigar_op(cigar[i]);  // CIGAR operation
-        if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            for (int j = 0; j < op_len; j++) {
-                char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
-                if (base == ref_seq[pos - aln_start + j]) {
-                    match_count++;
-                } else {
-                    mismatch_count++;
-                }
-            }
-        }
-        // Update the reference position
-        // https://samtools.github.io/hts-specs/SAMv1.pdf
-        if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            pos += op_len;
-        }
-        
-        // Update the query position
-        if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            query_pos += op_len;
-        }
-    }
-
-    // Calculate the mismatch rate
-    double mismatch_rate = 0.0;
-    if (match_count + mismatch_count > 0) {
-        mismatch_rate = static_cast<double>(mismatch_count) / static_cast<double>(match_count + mismatch_count);
-    }
-    return mismatch_rate;
-}
-
 void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector<SVCall> &sv_calls, const std::vector<uint32_t> &pos_depth_map)
 {
     std::string chr = header->target_name[alignment->core.tid];  // Chromosome name
     uint32_t aln_start = (uint32_t)alignment->core.pos;  // Leftmost position of the alignment in the reference genome (0-based)
     uint32_t pos = aln_start;
-    // uint32_t end = (uint32_t)bam_endpos(alignment) - 1;  // Rightmost position of the alignment in the reference genome (0-based)
 
     uint32_t* cigar = bam_get_cigar(alignment);  // CIGAR array
     int cigar_len = alignment->core.n_cigar;
@@ -648,7 +725,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                //int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
                 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
@@ -681,7 +757,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 // Add as an insertion
                 uint32_t ins_pos = pos + 1;
                 uint32_t ins_end = ins_pos + op_len - 1;
-                //int read_depth = this->getReadDepth(pos_depth_map, ins_pos);
 
                 // Determine the ALT allele format based on small vs. large insertion
                 std::string alt_allele = "<INS>";
@@ -689,39 +764,16 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                     alt_allele = ins_seq_str;
                 }
                 SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0.0, 0);
-                cigar_sv_calls.emplace_back(sv_call);  // Commented for testing
+                cigar_sv_calls.emplace_back(sv_call);
 
             // Check if the CIGAR operation is a deletion
             } else if (op == BAM_CDEL) {
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                //int read_depth = this->getReadDepth(pos_depth_map, ref_pos);
-                // SVCall sv_call(ref_pos, ref_end, SVType::DEL, "<DEL>",
-                // "CIGARDEL", "./.", default_lh, read_depth, 1, 0);
                 SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 1, 0);
-                // addSVCall(sv_calls, sv_call);
-                // addSVCall(cigar_sv_calls, sv_call);
                 cigar_sv_calls.emplace_back(sv_call);
             }
-            
-            // For matches, calculate the sequence identity
-            // } else if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) {
-            //     if (ref_seq.size() < static_cast<size_t>(op_len)) {
-            //         printError("ERROR: reference sequence length is less than the CIGAR operation length");
-            //         continue;
-            //     }
-
-            //     // printMessage("Calculating sequence identity for matches");
-            //     for (int j = 0; j < op_len; j++) {
-            //         char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)];
-            //         if (base == ref_seq[pos - aln_start + j]) {
-            //             match_count++;
-            //         } else {
-            //             mismatch_count++;
-            //         }
-            //     }
-            // }
         }
 
         // Update the reference position
@@ -736,24 +788,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
         }
     }
 
-    // Get the read name
-    // std::string read_name = bam_get_qname(alignment);
-
-    // If read name starts with c08844a5 then print the read name and the number
-    // of matches and mismatches
-    // if (read_name.find("c08844a5") != std::string::npos) {
-    //     printMessage(read_name + ": matches=" + std::to_string(match_count) + ", mismatches=" + std::to_string(mismatch_count) + ", mismatches/length=" + std::to_string((double)mismatch_count / (double)(match_count + mismatch_count)));
-    // }
-    // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count);
-    // if (mismatch_rate > 0) {
-    //     printMessage("Read name: " + read_name + ", mismatch rate: " + std::to_string(mismatch_rate) + ", matches: " + std::to_string(match_count) + ", mismatches: " + std::to_string(mismatch_count));
-    // }
-    // read_mismatch_rates[read_name] = mismatch_rate;
-    // printMessage("Completed processing read: " + read_name);
-
-    // Set the mismatch rate for all SVs from this read, and add the SV calls
     for (SVCall& sv_call : cigar_sv_calls) {
-        // sv_call.mismatch_rate = mismatch_rate;
         addSVCall(sv_calls, sv_call);
     }
 }
@@ -858,6 +893,9 @@ void SVCaller::run(const InputData& input_data)
         // Get the chromosomes from the input BAM file
         chromosomes = this->getChromosomes(input_data.getLongReadBam());
     }
+
+    // [TEST] Use only the last 6 chromosomes
+    // chromosomes = {"chr6", "chr7", "chr8", "chr9", "chr10", "chr11"};
     
     // Read the HMM from the file
     std::string hmm_filepath = input_data.getHMMFilepath();
@@ -971,7 +1009,7 @@ void SVCaller::run(const InputData& input_data)
         // Identify split-SV signatures
         printMessage("Identifying split-SV signatures...");
         std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
-        this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data, chr_pos_depth_map, ref_genome);
+        this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
 
         printMessage("Running copy number predictions on split-read SVs...");
         current_chr = 0;
@@ -1060,14 +1098,17 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
 {
     std::vector<SVCall> additional_calls;
     for (auto& sv_candidate : split_sv_calls) {
-        std::tuple<double, SVType, Genotype, bool> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
+        std::tuple<double, SVType, Genotype, bool, int> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         Genotype genotype = std::get<2>(result);
+        int cn_state = std::get<3>(result);
 
         // For inversions with copy-neutral support, update the HMM likelihood
         if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) {
             sv_candidate.hmm_likelihood = supp_lh;
+            sv_candidate.genotype = genotype;
+            sv_candidate.cn_state = cn_state;
         }
 
         // Update the SV type if the support is not neutral or unknown
@@ -1077,32 +1118,21 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 sv_candidate.sv_type = supp_type;
                 sv_candidate.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
                 sv_candidate.data_type = SVDataType::HMM;
-                sv_candidate.genotype = genotype;
                 sv_candidate.hmm_likelihood = supp_lh;
+                sv_candidate.genotype = genotype;
+                sv_candidate.cn_state = cn_state;
 
-                // Print if end position = 162908547
-                if (sv_candidate.end == 162908547) {
-                    printMessage("SV at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) +
-                        " updated to type " + getSVTypeSymbol(supp_type) +
-                        " with likelihood " + std::to_string(supp_lh) +
-                        " and genotype " + getGenotypeString(genotype));
-                }
             // Add an additional SV call if the type is different
             } else if (sv_candidate.sv_type != supp_type) {
                 SVCall new_sv_call = sv_candidate;  // Copy the original SV call
                 new_sv_call.sv_type = supp_type;
                 new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
                 new_sv_call.data_type = SVDataType::HMM;
-                new_sv_call.genotype = genotype;
                 new_sv_call.hmm_likelihood = supp_lh;
+                new_sv_call.genotype = genotype;
+                new_sv_call.cn_state = cn_state;
 
-                // Print if end position = 162908547
-                if (new_sv_call.end == 162908547) {
-                    printMessage("Additional SV at " + chr + ":" + std::to_string(new_sv_call.start) + "-" + std::to_string(new_sv_call.end) +
-                        " with type " + getSVTypeSymbol(supp_type) +
-                        " and likelihood " + std::to_string(supp_lh) +
-                        " and genotype " + getGenotypeString(genotype));
-                }
+                // Add the new SV call to the list
                 additional_calls.push_back(new_sv_call);
             }
         }
@@ -1164,7 +1194,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
         "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
         "##INFO=<ID=CLUSTER,Number=1,Type=Integer,Description=\"Cluster size\">",
-        "##INFO=<ID=MISMATCH,Number=1,Type=Float,Description=\"Mismatch rate\">",
+        "##INFO=<ID=CN,Number=1,Type=Integer,Description=\"Copy number state\">",
+        "##INFO=<ID=ALNOFFSET,Number=1,Type=Integer,Description=\"Read vs. reference alignment offset\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
         "##FILTER=<ID=LowQual,Description=\"Low quality\">",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
@@ -1208,23 +1239,21 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         const std::vector<SVCall>& sv_calls = pair.second;
         std::cout << "Saving SV calls for " << chr << "..." << std::endl;
         for (const auto& sv_call : sv_calls) {
-            // Get the SV candidate and SV info
             uint32_t start = sv_call.start;
             uint32_t end = sv_call.end;
+            int sv_length = end - start + 1;
+            std::string ref_allele = ".";
+            std::string alt_allele = sv_call.alt_allele;
             SVType sv_type = sv_call.sv_type;
-            // std::string genotype = sv_call.genotype;
-            // std::string data_type_str = sv_call.data_type;
-            // std::string alt_allele = sv_call.alt_allele;
             std::string genotype = getGenotypeString(sv_call.genotype);
             std::string data_type_str = getSVDataTypeString(sv_call.data_type);
-            std::string alt_allele = sv_call.alt_allele;
             double hmm_likelihood = sv_call.hmm_likelihood;
-            int sv_length = end - start + 1;
             int cluster_size = sv_call.cluster_size;
             //int read_depth = sv_call.read_depth;
-            std::string ref_allele = ".";
-            double mismatch_rate = sv_call.mismatch_rate;
+            // double mismatch_rate = sv_call.mismatch_rate;
             std::string filter = "PASS";
+            int aln_offset = sv_call.aln_offset;
+            int cn_state = sv_call.cn_state;
 
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
@@ -1289,8 +1318,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size);                
-            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate);
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate);
+            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state);
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
             std::vector<std::string> samples = {sample_str};
@@ -1298,7 +1327,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL,
             // FILTER, INFO, FORMAT, SAMPLES)
             vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << filter << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
-            // vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl;
         }
     }
     vcf_stream.close();
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index ed3f8802..7ee8f3bf 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -20,7 +20,7 @@ void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
     // Check if the SV call is valid
     if (sv_call.start > sv_call.end) {
-        printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end));
+        printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type));
         return;
     }
 

From 153db48e2e670922f6bd9ecb5142270860f191d4 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 14 Apr 2025 20:15:57 -0400
Subject: [PATCH 102/134] fix save cnv error

---
 include/cnv_caller.h |  9 +++++-
 include/sv_types.h   | 10 ++-----
 src/cnv_caller.cpp   | 45 +++++++++++++++++++----------
 src/fasta_query.cpp  | 10 ++++++-
 src/main.cpp         | 29 +++++++++++++++++++
 src/sv_caller.cpp    | 51 +++++++++++++++++++++-----------
 src/sv_object.cpp    | 69 ++++++++++----------------------------------
 7 files changed, 128 insertions(+), 95 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index 03764da8..deb9187d 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -8,6 +8,7 @@
 #include "input_data.h"
 #include "sv_types.h"
 #include "sv_object.h"
+#include "utils.h"
 
 /// @cond
 #include <string>
@@ -84,7 +85,13 @@ class CNVCaller {
 
         // Function to get the genotype string from the state
         inline Genotype getGenotypeFromCNState(int cn_state) const {
-            return StateGenotypeMap.at(cn_state);
+            // Unmapped CN states are reported and mapped to Genotype::UNKNOWN
+            const auto entry = StateGenotypeMap.find(cn_state);
+            if (entry == StateGenotypeMap.end()) {
+                printError("ERROR: Invalid CN state: " + std::to_string(cn_state));
+                return Genotype::UNKNOWN;
+            }
+            return entry->second;
         }
 
         // Run copy number prediction for a single SV candidate, returning the
diff --git a/include/sv_types.h b/include/sv_types.h
index 58f6063b..dd67c2a4 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -20,9 +20,7 @@ namespace sv_types {
         INS = 3,
         BND = 4,
         NEUTRAL = 5,  // Neutral copy number with unknown type
-        INV_DUP = 6,  // Inverted duplication
-        INV_DEL = 7,  // Inverted deletion
-        COMPLEX = 8  // Complex SV
+        LOH = 6  // Loss of heterozygosity
     };
 
     // Mapping of SV types to strings
@@ -34,9 +32,7 @@ namespace sv_types {
         {SVType::INS, "INS"},
         {SVType::BND, "BND"},
         {SVType::NEUTRAL, "NEUTRAL"},
-        {SVType::INV_DUP, "INVDUP"},
-        {SVType::INV_DEL, "INVDEL"},
-        {SVType::COMPLEX, "COMPLEX"}
+        {SVType::LOH, "LOH"}
     };
 
     // Mapping of SV types to symbols
@@ -99,7 +95,7 @@ namespace sv_types {
         {1, SVType::DEL},
         {2, SVType::DEL},
         {3, SVType::NEUTRAL},
-        {4, SVType::NEUTRAL},
+        {4, SVType::LOH},
         {5, SVType::DUP},
         {6, SVType::DUP}
     };
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c7832fd1..dd602c1c 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -57,15 +57,24 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     std::vector<uint32_t> snp_pos;
     std::unordered_map<uint32_t, double> snp_baf_map;
     std::unordered_map<uint32_t, double> snp_pfb_map;
+    printMessage("Reading SNP data for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
     this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf_map, snp_pfb_map, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
     // region
     sample_size = std::max((int) snp_pos.size(), sample_size);
 
+    // Print an error and return early if the end position is less than the
+    // start position
+    if (start_pos > end_pos)
+    {
+        printError("ERROR: Invalid SNP region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        return;
+    }
+
     // Loop through evenly spaced positions in the region and get the log2 ratio
-    double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size;
-    std::unordered_set<uint32_t> snp_pos_set(snp_pos.begin(), snp_pos.end());
+    double pos_step = static_cast<double>(end_pos - start_pos + 1) / static_cast<double>(sample_size);
+    // double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size;
     std::unordered_map<std::string, double> window_log2_map;
     for (int i = 0; i < sample_size; i++)
     {
@@ -166,31 +175,30 @@ std::tuple<double, SVType, Genotype, bool, int> CNVCaller::runCopyNumberPredicti
 
     // Run the Viterbi algorithm on SNPs in the SV region
     // Only extend the region if "save CNV data" is enabled
-    uint32_t snp_start_pos = start_pos;
-    uint32_t snp_end_pos = end_pos;
     SNPData before_sv;
     SNPData after_sv;
     if (input_data.getSaveCNVData())
     {
-        uint32_t sv_half_length = (end_pos - start_pos) / 2.0;
-        if (start_pos > 1)
+        int sv_half_length = (static_cast<int>(end_pos) - static_cast<int>(start_pos)) / 2;
+        int before_sv_start = std::max(1, static_cast<int>(start_pos) - sv_half_length);
+        int before_sv_end = std::max(1, static_cast<int>(start_pos) - 1);
+        if (before_sv_start < before_sv_end)
         {
-            uint32_t before_sv_start = std::max((uint32_t) 1, start_pos - sv_half_length);
-            uint32_t before_sv_end = start_pos - 1;
             querySNPRegion(chr, before_sv_start, before_sv_end, pos_depth_map, mean_chr_cov, before_sv, input_data);
         }
-        uint32_t chr_last_index = pos_depth_map.size() - 1;
-        if (end_pos < chr_last_index)
+
+        int chr_last_index = static_cast<int>(pos_depth_map.size()) - 1;
+        int after_sv_start = std::min(chr_last_index, static_cast<int>(end_pos) + 1);
+        int after_sv_end = std::min(chr_last_index, static_cast<int>(end_pos) + sv_half_length);
+        if (after_sv_start < after_sv_end)
         {
-            uint32_t after_sv_start = end_pos + 1;
-            uint32_t after_sv_end = std::min(chr_last_index, end_pos + sv_half_length);
             querySNPRegion(chr, after_sv_start, after_sv_end, pos_depth_map, mean_chr_cov, after_sv, input_data);
         }
     }
 
     // Query the SNP region for the SV candidate
     SNPData snp_data;
-    querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
+    querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
     // Run the Viterbi algorithm
     std::pair<std::vector<int>, double> prediction;
@@ -316,6 +324,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Only extend the region if "save CNV data" is enabled
         SNPData snp_data;
+        printMessage("Querying SNP region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
         // Run the Viterbi algorithm
@@ -324,6 +333,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         	continue;
         }
         
+        printMessage("Running Viterbi algorithm for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         std::pair<std::vector<int>, double> prediction;
         runViterbi(hmm, snp_data, prediction);
         std::vector<int>& state_sequence = prediction.first;
@@ -364,6 +374,11 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Update the SV information if it does not conflict with the current SV type
         SVType updated_sv_type = getSVTypeFromCNState(max_state);
+
+        // For LOH predictions, or predictions with the same type,
+        // update predicted information without changing the SV type
+        printMessage("Updating SV call for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " with predicted CNV type: " + getSVTypeString(updated_sv_type));
+        updated_sv_type = (updated_sv_type == SVType::LOH) ? sv_call.sv_type : updated_sv_type;
         bool is_valid_update = isValidCopyNumberUpdate(sv_call.sv_type, updated_sv_type);
         if (is_valid_update)
         {
@@ -584,7 +599,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
     pfb_file.close();
 
     bcf_srs_t *pfb_reader = bcf_sr_init();
-    std::string chr_gnomad;
+    std::string chr_gnomad = chr;
     std::string AF_key;
     if (use_pfb)
     {
@@ -598,7 +613,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
         // Check if the filepath uses the 'chr' prefix notations based on the
         // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
-        chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
+        // chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
         std::string chr_prefix = "chr";
         if (pfb_filepath.find(chr_prefix) == std::string::npos)
         {
diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp
index 84a2ae07..e4f0e1dc 100644
--- a/src/fasta_query.cpp
+++ b/src/fasta_query.cpp
@@ -168,5 +168,13 @@ std::vector<std::string> ReferenceGenome::getChromosomes() const
 
 uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const
 {
-    return this->chr_to_length.at(chr);
+    // Look up the chromosome without relying on an exception; a missing
+    // chromosome is reported and yields a length of 0
+    const auto entry = this->chr_to_length.find(chr);
+    if (entry == this->chr_to_length.end())
+    {
+        printError("Chromosome " + chr + " not found in reference genome");
+        return 0;
+    }
+    return entry->second;
 }
diff --git a/src/main.cpp b/src/main.cpp
index b793619b..64fc4e5f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -4,6 +4,11 @@
 /// @cond DOXYGEN_IGNORE
 #include <iostream>
 #include <string>
+
+// For signal handling
+#include <signal.h>
+#include <execinfo.h>
+
 // #include <optional>
 /// @endcond
 
@@ -12,8 +17,32 @@
 #include "utils.h"
 
 
+// Signal handler: dump up to 10 stack frames to stderr, then terminate.
+void printStackTrace(int sig)
+{
+    void *frames[10];
+    const int frame_count = backtrace(frames, 10);
+
+    // NOTE: fprintf is not async-signal-safe; kept for the signal number
+    fprintf(stderr, "Error: signal %d:\n", sig);
+    backtrace_symbols_fd(frames, frame_count, STDERR_FILENO);
+
+    // _exit is async-signal-safe; exit would run atexit handlers here
+    _exit(1);
+}
+
+
 void runContextSV(const std::unordered_map<std::string, std::string>& args)
 {
+    // Set up signal handling
+    signal(SIGSEGV, printStackTrace);
+    signal(SIGABRT, printStackTrace);
+    signal(SIGINT, printStackTrace);
+    signal(SIGTERM, printStackTrace);
+    signal(SIGILL, printStackTrace);
+    signal(SIGFPE, printStackTrace);
+    signal(SIGBUS, printStackTrace);
+
     // Placeholder for setting up input data and running ContextSV
     std::cout << "ContextSV version " << VERSION << std::endl;
     std::cout << "Input parameters:" << std::endl;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index c5607e31..e10b36e7 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -895,7 +895,7 @@ void SVCaller::run(const InputData& input_data)
     }
 
     // [TEST] Use only the last 6 chromosomes
-    // chromosomes = {"chr6", "chr7", "chr8", "chr9", "chr10", "chr11"};
+    //chromosomes = {"chr1", "chrX"};
     
     // Read the HMM from the file
     std::string hmm_filepath = input_data.getHMMFilepath();
@@ -920,7 +920,8 @@ void SVCaller::run(const InputData& input_data)
         uint32_t chr_len = ref_genome.getChromosomeLength(chr);
         if (chr_len == 0) {
             printError("Chromosome " + chr + " not found in reference genome");
-            continue;
+            return;
+            // continue;
         }
         chr_pos_depth_map[chr] = std::vector<uint32_t>(chr_len+1, 0);  // 1-based index
         chr_mean_cov_map[chr] = 0.0;
@@ -954,6 +955,7 @@ void SVCaller::run(const InputData& input_data)
         auto process_chr = [&](const std::string& chr) {
             try {
                 std::vector<SVCall> sv_calls;
+                sv_calls.reserve(1000);
                 InputData chr_input_data = input_data;  // Use a thread-local copy
                 this->processChromosome(chr, sv_calls, chr_input_data, chr_pos_depth_map[chr], chr_mean_cov_map[chr]);
                 {
@@ -1104,17 +1106,22 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
         Genotype genotype = std::get<2>(result);
         int cn_state = std::get<3>(result);
 
-        // For inversions with copy-neutral support, update the HMM likelihood
-        if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) {
-            sv_candidate.hmm_likelihood = supp_lh;
-            sv_candidate.genotype = genotype;
-            sv_candidate.cn_state = cn_state;
-        }
+        // // For inversions with copy-neutral support, update the HMM likelihood
+        // if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) {
+        //     sv_candidate.hmm_likelihood = supp_lh;
+        //     sv_candidate.genotype = genotype;
+        //     sv_candidate.cn_state = cn_state;
+        // }
 
-        // Update the SV type if the support is not neutral or unknown
-        else if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) {
-            // Update information if the SV call is unknown
-            if (sv_candidate.sv_type == SVType::UNKNOWN) {
+        // // Update the SV type if the state is not neutral or unknown
+        // else if (supp_type != SVType::UNKNOWN && supp_type !=
+        // SVType::NEUTRAL) {
+        
+        // Update the SV type if the predicted type is not unknown
+        if (supp_type != SVType::UNKNOWN) {
+            // Update all information if the current SV call is not known and
+            // there is a predicted CNV type
+            if (sv_candidate.sv_type == SVType::UNKNOWN && (supp_type == SVType::DEL || supp_type == SVType::DUP)) {
                 sv_candidate.sv_type = supp_type;
                 sv_candidate.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
                 sv_candidate.data_type = SVDataType::HMM;
@@ -1122,8 +1129,15 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 sv_candidate.genotype = genotype;
                 sv_candidate.cn_state = cn_state;
 
+            // For predictions with the same type, or LOH predictions, update the
+            // prediction information
+            } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) {
+                sv_candidate.hmm_likelihood = supp_lh;
+                sv_candidate.genotype = genotype;
+                sv_candidate.cn_state = cn_state;
+
             // Add an additional SV call if the type is different
-            } else if (sv_candidate.sv_type != supp_type) {
+            } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) {
                 SVCall new_sv_call = sv_candidate;  // Copy the original SV call
                 new_sv_call.sv_type = supp_type;
                 new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
@@ -1131,8 +1145,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 new_sv_call.hmm_likelihood = supp_lh;
                 new_sv_call.genotype = genotype;
                 new_sv_call.cn_state = cn_state;
-
-                // Add the new SV call to the list
                 additional_calls.push_back(new_sv_call);
             }
         }
@@ -1192,6 +1204,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=SVMETHOD,Number=1,Type=String,Description=\"Method used to call the structural variant\">",
         "##INFO=<ID=ALN,Number=1,Type=String,Description=\"Feature used to identify the structural variant\">",
         "##INFO=<ID=HMM,Number=1,Type=Float,Description=\"HMM likelihood\">",
+        "##INFO=<ID=LOH,Number=0,Type=Flag,Description=\"Site shows loss of heterozygosity\">",
         "##INFO=<ID=SUPPORT,Number=1,Type=Integer,Description=\"Number of reads supporting the variant\">",
         "##INFO=<ID=CLUSTER,Number=1,Type=Integer,Description=\"Cluster size\">",
         "##INFO=<ID=CN,Number=1,Type=Integer,Description=\"Copy number state\">",
@@ -1255,6 +1268,9 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             int aln_offset = sv_call.aln_offset;
             int cn_state = sv_call.cn_state;
 
+            SVType cn_type = getSVTypeFromCNState(cn_state);
+            std::string loh = (cn_type == SVType::LOH) ? ";LOH" : "";
+
             // If the SV type is unknown, print a warning and skip
             if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) {
                 unclassified_svs += 1; 
@@ -1266,7 +1282,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Deletion
             if (sv_type == SVType::DEL) {
                 // Get the deleted sequence from the reference genome, also including the preceding base
-                uint32_t preceding_pos = (uint32_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                uint32_t preceding_pos = (uint32_t) std::max(1, static_cast<int>(start)-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, end);
 
                 // Use the preceding base as the alternate allele 
@@ -1319,7 +1335,8 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
             // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate);
-            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state);
+            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state);
+            std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state) + loh;
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
             std::vector<std::string> samples = {sample_str};
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 7ee8f3bf..fc847f37 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -57,15 +57,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         SVType::INV,
         SVType::INS,
         SVType::BND,
-        SVType::INV_DUP,
-        SVType::INV_DEL,
     })
     {
-        // [TEST] Skip if not insertions
-        // if (sv_type != SVType::INS) {
-        //     continue;
-        // }
-
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
         std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
@@ -73,13 +66,19 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         });
 
         if (sv_type_calls.size() < 2) {
+            // Add all unclustered points to the merged list
+            for (const auto& sv_call : sv_type_calls) {
+                SVCall noise_sv_call = sv_call;
+                merged_sv_calls.push_back(noise_sv_call);
+            }
             continue;
         }
 
         dbscan.fit(sv_type_calls);
+
+        // Create a map of cluster IDs to SV calls
         const std::vector<int>& clusters = dbscan.getClusters();
         std::map<int, std::vector<SVCall>> cluster_map;  // Cluster ID to SV calls
-        // Create a map of cluster IDs to SV calls
         if (sv_type == SVType::INS) {
             // Add only non-CIGARCLIP SVs to the cluster map
             for (size_t i = 0; i < clusters.size(); ++i) {
@@ -99,37 +98,12 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
 
-            // [TEST] If insertions, and if any SV has length between 9400 and
-            // 9500, print all SV coordinates in the cluster
-            bool print_all = false;
-            // if (sv_type == SVType::INS) {
-            //     for (const auto& sv_call : cluster_sv_calls) {
-            //         // printMessage("[TEST] SV call " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
-            //         // if (sv_call.end - sv_call.start >= 9400 && sv_call.end -
-            //         // sv_call.start <= 9500) {
-            //         // if (sv_call.end - sv_call.start >= 15100 && sv_call.end -
-            //         // sv_call.start <= 15200) {
-            //         // if (sv_call.end - sv_call.start >= 11200 && sv_call.end -
-            //         // sv_call.start <= 11300) {
-            //         // if (sv_call.end - sv_call.start >= 16800 && sv_call.end -
-            //         // sv_call.start <= 17000) {
-            //         // if (sv_call.end - sv_call.start >= 11300 && sv_call.end -
-            //         // sv_call.start <= 11400) {
-            //         // if (sv_call.end - sv_call.start >= 13100 && sv_call.end -
-            //         // sv_call.start <= 13200) {
-            //         if (sv_call.end - sv_call.start >= 28200 && sv_call.end - sv_call.start <= 28300) {
-            //             print_all = true;
-            //             break;
-            //         }
-            //     }
-            // }
-            if (print_all) {
-                printMessage("[TEST] Cluster " + std::to_string(cluster_id) + " has " + std::to_string(cluster_sv_calls.size()) + " SVs:");
-                for (const auto& sv_call : cluster_sv_calls) {
-                    printMessage("  " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
-                }
+            // Continue if fewer than 2 SV calls in the cluster (due to CIGARCLIP filter)
+            if (cluster_sv_calls.size() < 2) {
+                continue;
             }
 
+            // Add unmerged SV calls
             if (cluster_id < 0 && keep_noise) {
 
                 // Add all unclustered points to the merged list
@@ -138,6 +112,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                     merged_sv_calls.push_back(noise_sv_call);
                 }
 
+            // Merge clustered SV calls
             } else {
 
                 // ----------------------------
@@ -174,22 +149,13 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                     merged_sv_call = *it;
                     merged_sv_calls.push_back(merged_sv_call);
 
-                    // [TEST]
-                    // print_all = true;
-                    // if (print_all) {
-                    //     printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
-                    //     printMessage("SV type: " + getSVTypeString(merged_sv_call.sv_type));
-                    //     printMessage("Cluster members:");
-                    //     for (const auto& sv_call : cluster_sv_calls) {
-                    //         printMessage("  " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1));
-                    //     }
-                    // }
-
                 // ----------------------------
                 // CIGAR-BASED MERGING
                 // ----------------------------
 
-                } else if (cluster_sv_calls.size() > 1) {  // Could be low if all CIGARCLIP
+                // } else if (cluster_sv_calls.size() > 1) {  // Could be low if
+                // all CIGARCLIP
+                } else {
                     // Use the median length SV of the top 10% of the cluster
                     // (shorter reads are often noise)
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
@@ -207,11 +173,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                     // Add SV call
                     merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
                     merged_sv_calls.push_back(merged_sv_call);
-
-                    // [TEST]
-                    if (print_all) {
-                        printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with median SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1));
-                    }
                 }
                 cluster_count++;
             }

From 883bfc7e1ffd66d4f014969e9494bf6777e2ed87 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 19 Apr 2025 21:15:03 -0400
Subject: [PATCH 103/134] improve cnv state predictions

---
 include/cnv_caller.h |   2 +-
 src/cnv_caller.cpp   | 107 ++++++++++++++++++++++++++++++++++---------
 src/main.cpp         |   8 ++++
 src/sv_caller.cpp    |  16 +------
 src/sv_object.cpp    |   3 ++
 5 files changed, 99 insertions(+), 37 deletions(-)

diff --git a/include/cnv_caller.h b/include/cnv_caller.h
index deb9187d..afdd78b3 100644
--- a/include/cnv_caller.h
+++ b/include/cnv_caller.h
@@ -96,7 +96,7 @@ class CNVCaller {
 
         // Run copy number prediction for a single SV candidate, returning the
         // likelihood, predicted CNV type, genotype, and whether SNPs were found
-        std::tuple<double, SVType, Genotype, bool, int> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
+        std::tuple<double, SVType, Genotype, int> runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
 
         // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings
         void runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall>& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const;
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index dd602c1c..6fd4ce64 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -57,7 +57,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     std::vector<uint32_t> snp_pos;
     std::unordered_map<uint32_t, double> snp_baf_map;
     std::unordered_map<uint32_t, double> snp_pfb_map;
-    printMessage("Reading SNP data for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+    // printMessage("Reading SNP data for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
     this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf_map, snp_pfb_map, input_data);
 
     // Get the log2 ratio for <sample_size> evenly spaced positions in the
@@ -164,13 +164,13 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
     snp_data.is_snp = std::move(is_snp_hmm);
 }
 
-std::tuple<double, SVType, Genotype, bool, int> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
+std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector<uint32_t>& pos_depth_map, const InputData& input_data) const
 {
     // Check that the start position is less than the end position
     if (start_pos > end_pos)
     {
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
-        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0);
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
     }
 
     // Run the Viterbi algorithm on SNPs in the SV region
@@ -205,50 +205,113 @@ std::tuple<double, SVType, Genotype, bool, int> CNVCaller::runCopyNumberPredicti
     runViterbi(hmm, snp_data, prediction);
     if (prediction.first.size() == 0)
     {
-        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0);
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
     }
 
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
 
+    // Check whether the start position begins with 225835
+    bool debug = false;
+    // std::string start_pos_str = std::to_string(start_pos);
+    // if (start_pos_str.find("225835") != std::string::npos)
+    // {
+    //     printMessage("Found 225835 in the start position: " + start_pos_str);
+    //     debug = true;
+    // }
+
+    // Print all states if debug is enabled
+    if (debug)
+    {
+        printMessage("State sequence length: " + std::to_string(state_sequence.size()));
+        printMessage("State sequence: ");
+        for (size_t i = 0; i < state_sequence.size(); i++)
+        {
+            printMessage(std::to_string(state_sequence[i]) + " ");
+        }
+        printMessage("");
+    }
+
     // Get all the states in the SV region
-    std::vector<int> sv_states;
-    for (size_t i = 0; i < state_sequence.size(); i++)
+    // std::vector<int> sv_states;
+    // for (size_t i = 0; i < state_sequence.size(); i++)
+    // {
+    //     if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos)
+    //     {
+    //         sv_states.push_back(state_sequence[i]);
+    //     }
+    // }
+
+    // Print all states in the SV region if debug is enabled
+    if (debug)
     {
-        if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos)
+        printMessage("SV state length: " + std::to_string(state_sequence.size()));
+        printMessage("SV states: ");
+        for (size_t i = 0; i < state_sequence.size(); i++)
         {
-            sv_states.push_back(state_sequence[i]);
+            printMessage(std::to_string(state_sequence[i]) + " ");
         }
+        printMessage("");
     }
 
-    // Determine if there is a majority state within the SV region and if it
-    // is greater than 75%
-    double pct_threshold = 0.75;
+    // Determine if there is a majority state within the SV region
+    // double pct_threshold = 0.75;
     int max_state = 0;
     int max_count = 0;
+    int non_normal_count = 0;
 
-    // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6
-    for (int i = 0; i < 6; i += 2)
+    std::vector<int> state_counts(6, 0);
+    for (int state : state_sequence)
     {
-        // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6
-        int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2);
-        if (state_count > max_count)
+        // Skip state 3 (normal state)
+        if (state != 3)
         {
-            max_state = i+1;  // Set the state to the first state in the pair (sequence remains intact)
-            max_count = state_count;
+            state_counts[state - 1]++;
+            non_normal_count++;
         }
+        // state_counts[state - 1]++;
+    }
+
+    // Determine the maximum state and count
+    int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end()));
+    max_state = max_state_index + 1;
+    max_count = state_counts[max_state_index];
+      
+    // Find the state with the maximum count
+    // for (int i = 0; i < 6; i += 2)
+    // {
+    //     // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6
+    //     int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2);
+    //     if (state_count > max_count)
+    //     {
+    //         max_state = i+1;  // Set the state to the first state in the pair (sequence remains intact)
+    //         max_count = state_count;
+    //     }
+    // }
+
+    int state_count = static_cast<int>(state_sequence.size());
+    if (debug)
+    {
+        printMessage("Max state: " + std::to_string(max_state));
+        printMessage("Max count: " + std::to_string(max_count));
+        printMessage("Max count percentage: " + std::to_string((double) max_count / (double) state_count));
+        printMessage("Non-normal count: " + std::to_string(non_normal_count));
+        printMessage("Non-normal count percentage: " + std::to_string((double) max_count / (double) non_normal_count));
+        printMessage("Predicted CNV type: " + getSVTypeString(getSVTypeFromCNState(max_state)));
     }
     
     // Update SV type and genotype based on the majority state
+    // SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
+    // Genotype genotype = getGenotypeFromCNState(max_state);
     SVType predicted_cnv_type = SVType::UNKNOWN;
     Genotype genotype = Genotype::UNKNOWN;
-    int state_count = (int) sv_states.size();
-    if ((double) max_count / (double) state_count > pct_threshold)
+    // int state_count = (int) sv_states.size();
+    if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5)
     {
         predicted_cnv_type = getSVTypeFromCNState(max_state);
         genotype = getGenotypeFromCNState(max_state);
+        snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
     }
-    snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
 
     // Save the SV calls if enabled
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
@@ -288,7 +351,7 @@ std::tuple<double, SVType, Genotype, bool, int> CNVCaller::runCopyNumberPredicti
         this->saveSVCopyNumberToJSON(before_sv, after_sv, snp_data, chr, start_pos, end_pos, cnv_type_str, likelihood, json_filepath);
     }
     
-    return std::make_tuple(likelihood, predicted_cnv_type, genotype, true, max_state);
+    return std::make_tuple(likelihood, predicted_cnv_type, genotype, max_state);
 }
 
 
diff --git a/src/main.cpp b/src/main.cpp
index 64fc4e5f..89a835c9 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -43,6 +43,14 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     signal(SIGFPE, printStackTrace);
     signal(SIGBUS, printStackTrace);
 
+    std::cout << R"(`
+    ___         _           _   _____   __
+    / __|___ _ _| |_ _____ _| |_/ __\ \ / /
+   | (__/ _ \ ' \  _/ -_) \ /  _\__ \\ V / 
+    \___\___/_||_\__\___/_\_\\__|___/ \_/  
+                                             
+    )" << std::endl;
+
     // Placeholder for setting up input data and running ContextSV
     std::cout << "ContextSV version " << VERSION << std::endl;
     std::cout << "Input parameters:" << std::endl;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index e10b36e7..10f2ca92 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -893,9 +893,6 @@ void SVCaller::run(const InputData& input_data)
         // Get the chromosomes from the input BAM file
         chromosomes = this->getChromosomes(input_data.getLongReadBam());
     }
-
-    // [TEST] Use only the last 6 chromosomes
-    //chromosomes = {"chr1", "chrX"};
     
     // Read the HMM from the file
     std::string hmm_filepath = input_data.getHMMFilepath();
@@ -1100,22 +1097,13 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
 {
     std::vector<SVCall> additional_calls;
     for (auto& sv_candidate : split_sv_calls) {
-        std::tuple<double, SVType, Genotype, bool, int> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
+        std::tuple<double, SVType, Genotype, int> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         Genotype genotype = std::get<2>(result);
         int cn_state = std::get<3>(result);
 
-        // // For inversions with copy-neutral support, update the HMM likelihood
-        // if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) {
-        //     sv_candidate.hmm_likelihood = supp_lh;
-        //     sv_candidate.genotype = genotype;
-        //     sv_candidate.cn_state = cn_state;
-        // }
-
-        // // Update the SV type if the state is not neutral or unknown
-        // else if (supp_type != SVType::UNKNOWN && supp_type !=
-        // SVType::NEUTRAL) {
+        // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type));
         
         // Update the SV type if the predicted type is not unknown
         if (supp_type != SVType::UNKNOWN) {
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index fc847f37..6c7890a7 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -18,6 +18,9 @@ bool SVCall::operator<(const SVCall & other) const
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
+    // Print the SV call
+    // printMessage("Adding SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with length " + std::to_string(sv_call.end - sv_call.start + 1) + " and cluster size " + std::to_string(sv_call.cluster_size) + " from data type " + getSVDataTypeString(sv_call.data_type) + " and type " + getSVTypeString(sv_call.sv_type));
+
     // Check if the SV call is valid
     if (sv_call.start > sv_call.end) {
         printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type));

From 98d41d49fb9212478d65c3441dcc975f6e5215e0 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 1 May 2025 16:43:04 -0400
Subject: [PATCH 104/134] remove test code

---
 src/cnv_caller.cpp |  71 +--------------
 src/main.cpp       |  32 ++++---
 src/sv_caller.cpp  | 216 ++++-----------------------------------------
 src/sv_object.cpp  |   5 --
 4 files changed, 36 insertions(+), 288 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 6fd4ce64..f04093af 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -211,51 +211,7 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     std::vector<int>& state_sequence = prediction.first;
     double likelihood = prediction.second;
 
-    // Check whether the start position begins with 225835
-    bool debug = false;
-    // std::string start_pos_str = std::to_string(start_pos);
-    // if (start_pos_str.find("225835") != std::string::npos)
-    // {
-    //     printMessage("Found 225835 in the start position: " + start_pos_str);
-    //     debug = true;
-    // }
-
-    // Print all states if debug is enabled
-    if (debug)
-    {
-        printMessage("State sequence length: " + std::to_string(state_sequence.size()));
-        printMessage("State sequence: ");
-        for (size_t i = 0; i < state_sequence.size(); i++)
-        {
-            printMessage(std::to_string(state_sequence[i]) + " ");
-        }
-        printMessage("");
-    }
-
-    // Get all the states in the SV region
-    // std::vector<int> sv_states;
-    // for (size_t i = 0; i < state_sequence.size(); i++)
-    // {
-    //     if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos)
-    //     {
-    //         sv_states.push_back(state_sequence[i]);
-    //     }
-    // }
-
-    // Print all states in the SV region if debug is enabled
-    if (debug)
-    {
-        printMessage("SV state length: " + std::to_string(state_sequence.size()));
-        printMessage("SV states: ");
-        for (size_t i = 0; i < state_sequence.size(); i++)
-        {
-            printMessage(std::to_string(state_sequence[i]) + " ");
-        }
-        printMessage("");
-    }
-
     // Determine if there is a majority state within the SV region
-    // double pct_threshold = 0.75;
     int max_state = 0;
     int max_count = 0;
     int non_normal_count = 0;
@@ -269,43 +225,18 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
             state_counts[state - 1]++;
             non_normal_count++;
         }
-        // state_counts[state - 1]++;
     }
 
     // Determine the maximum state and count
     int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end()));
     max_state = max_state_index + 1;
     max_count = state_counts[max_state_index];
-      
-    // Find the state with the maximum count
-    // for (int i = 0; i < 6; i += 2)
-    // {
-    //     // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6
-    //     int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2);
-    //     if (state_count > max_count)
-    //     {
-    //         max_state = i+1;  // Set the state to the first state in the pair (sequence remains intact)
-    //         max_count = state_count;
-    //     }
-    // }
-
-    int state_count = static_cast<int>(state_sequence.size());
-    if (debug)
-    {
-        printMessage("Max state: " + std::to_string(max_state));
-        printMessage("Max count: " + std::to_string(max_count));
-        printMessage("Max count percentage: " + std::to_string((double) max_count / (double) state_count));
-        printMessage("Non-normal count: " + std::to_string(non_normal_count));
-        printMessage("Non-normal count percentage: " + std::to_string((double) max_count / (double) non_normal_count));
-        printMessage("Predicted CNV type: " + getSVTypeString(getSVTypeFromCNState(max_state)));
-    }
-    
+
     // Update SV type and genotype based on the majority state
     // SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
     // Genotype genotype = getGenotypeFromCNState(max_state);
     SVType predicted_cnv_type = SVType::UNKNOWN;
     Genotype genotype = Genotype::UNKNOWN;
-    // int state_count = (int) sv_states.size();
     if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5)
     {
         predicted_cnv_type = getSVTypeFromCNState(max_state);
diff --git a/src/main.cpp b/src/main.cpp
index 89a835c9..6f2c1b69 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -4,6 +4,7 @@
 /// @cond DOXYGEN_IGNORE
 #include <iostream>
 #include <string>
+#include <ctime>
 
 // For signal handling
 #include <signal.h>
@@ -32,6 +33,18 @@ void printStackTrace(int sig)
 }
 
 
+void printBanner()
+{
+    std::time_t now = std::time(nullptr);
+    char date_str[100];
+    std::strftime(date_str, sizeof(date_str), "%Y-%m-%d", std::localtime(&now));
+    std::cout << "═══════════════════════════════════════════════════════════════" << std::endl;
+    std::cout << "  ContextSV - Long-read Structural Variant Caller" << std::endl;
+    std::cout << "      Version: " << VERSION << std::endl;
+    std::cout << "      Date: " << date_str << std::endl;
+    std::cout << "═══════════════════════════════════════════════════════════════" << std::endl;
+}
+
 void runContextSV(const std::unordered_map<std::string, std::string>& args)
 {
     // Set up signal handling
@@ -43,20 +56,13 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     signal(SIGFPE, printStackTrace);
     signal(SIGBUS, printStackTrace);
 
-    std::cout << R"(`
-    ___         _           _   _____   __
-    / __|___ _ _| |_ _____ _| |_/ __\ \ / /
-   | (__/ _ \ ' \  _/ -_) \ /  _\__ \\ V / 
-    \___\___/_||_\__\___/_\_\\__|___/ \_/  
-                                             
-    )" << std::endl;
-
     // Placeholder for setting up input data and running ContextSV
-    std::cout << "ContextSV version " << VERSION << std::endl;
-    std::cout << "Input parameters:" << std::endl;
-    for (const auto& arg : args) {
-        std::cout << arg.first << ": " << arg.second << std::endl;
-    }
+    // std::cout << "ContextSV version " << VERSION << std::endl;
+    // std::cout << "Input parameters:" << std::endl;
+    // for (const auto& arg : args) {
+    //     std::cout << arg.first << ": " << arg.second << std::endl;
+    // }
+    printBanner();
 
     // Set up input data
     InputData input_data;
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 10f2ca92..06504a77 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -350,8 +350,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // Use the median of the largest cluster of primary and supplementary
             // alignment start, end positions as the final genome coordinates of the
             // SV
-            // int primary_pos = -1;
-            // int primary_pos2 = -1;
             std::vector<int> primary_positions;
             int primary_cluster_size = 0;
             if (!primary_start_cluster.empty()) {
@@ -365,28 +363,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]);
                 primary_cluster_size = std::max(primary_cluster_size, (int) primary_end_cluster.size());
             }
-            // if (primary_start_cluster.size() > primary_end_cluster.size()) {
-            //     std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-            //     // primary_pos =
-            //     // primary_start_cluster[primary_start_cluster.size() / 2];
-            //     primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
-            //     primary_cluster_size = primary_start_cluster.size();
-            // } else if (primary_end_cluster.size() > primary_start_cluster.size()) {
-            //     std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-            //     // primary_pos = primary_end_cluster[primary_end_cluster.size()
-            //     // / 2];
-            //     primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]);
-            //     primary_cluster_size = primary_end_cluster.size();
-            // } else {
-            //     // Use both positions
-            //     std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
-            //     std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
-            //     // primary_pos = primary_start_cluster[primary_start_cluster.size() / 2];
-            //     // primary_pos2 = primary_end_cluster[primary_end_cluster.size()
-            //     // / 2];
-            //     primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
-            //     primary_cluster_size = primary_start_cluster.size();
-            // }
 
             // -------------------------------
             // SPLIT INSERTION CALLS
@@ -402,96 +378,29 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 ref_distance = ref_distance_cluster[ref_distance_cluster.size() / 2];
 
                 // Add an insertion SV call at the primary position
-                // Using a minimum read distance of 2000bp since most insertions
-                // < 2kb can be identified more accurately using CIGAR-based
-                // methods
-                // if (primary_pos != -1 && read_distance > 2000) {
-                // if (primary_pos != -1) {
-                //     if (primary_pos2 != -1) {
-                //         // If two positions were found, use the 5'most position
-                //         primary_pos = std::min(primary_pos, primary_pos2);
-                //     }
                 if (!primary_positions.empty()) {
                     int aln_offset = static_cast<int>(ref_distance - read_distance);
                     if (read_distance > ref_distance  && read_distance >= min_length && read_distance <= max_length) {
                         // Add an insertion SV call at the primary positions
                         SVType sv_type = SVType::INS;
-                        // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-                        // addSVCall(chr_sv_calls, sv_candidate);
                         for (int primary_pos : primary_positions) {
                             SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                             addSVCall(chr_sv_calls, sv_candidate);
                         }
                     } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) {
+                        // Add a deletion SV call at the primary positions
                         for (int primary_pos : primary_positions) {
                             SVType sv_type = SVType::DEL;
                             SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                             addSVCall(chr_sv_calls, sv_candidate);
                         }
-                        // Add a deletion SV call at the primary position
-                        // SVType sv_type = SVType::DEL;
-                        // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                     }
                 }
-
-                    // if (ref_distance >= 50 && ref_distance < read_distance) {
-                    //     // Add an insertion SV call at the primary position
-                    //     SVType sv_type = SVType::INS;
-                    //     int aln_offset = static_cast<int>(ref_distance - read_distance);
-                    //     SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-                    //     addSVCall(chr_sv_calls, sv_candidate);
-                    // }
-                    // SVType sv_type = SVType::INS;
-                    // int aln_offset = static_cast<int>(read_distance - ref_distance);
-                    // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-                    // addSVCall(chr_sv_calls, sv_candidate);
             }
 
-            // if (!ref_distance_cluster.empty()) {
-            //     // Use the median of the largest cluster of split distances as the
-            //     // insertion size
-            //     std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end());
-            //     size_t median_index = ref_distance_cluster.size() / 2;
-            //     ref_distance = ref_distance_cluster[median_index];
-            //     read_distance = read_distance_cluster[median_index];
-
-            //     // Add a deletion SV call at the primary position
-            //     if (primary_pos != -1 && ref_distance >= 50 && ref_distance > read_distance) {
-            //         if (primary_pos2 != -1) {
-            //             // If two positions were found, use the 5'most position
-            //             primary_pos = std::min(primary_pos, primary_pos2);
-            //         }
-            //         SVType sv_type = SVType::DEL;
-            //         int aln_offset = static_cast<int>(ref_distance - read_distance);
-            //         SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-            //         addSVCall(chr_sv_calls, sv_candidate);
-
-            //         // Add an inversion if necessary (inverted deletion)
-            //         if (inversion) {
-            //             SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-            //             addSVCall(chr_sv_calls, sv_candidate);
-            //         }
-            //     }
-            // }
-
-            // --------------------------------
-
             // Get the supplementary alignment positions
-            // int supp_pos = -1;
-            // int supp_pos2 = -1;
             std::vector<int> supp_positions;
             int supp_cluster_size = 0;
-            // int supp_best_start = -1;
-            // int supp_best_end = -1;
-            // if (!supp_start_cluster.empty()) {
-            //     std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-            //     supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2];
-            // }
-            // if (!supp_end_cluster.empty()) {
-            //     std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-            //     supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2];
-            // }
-
             if (!supp_start_cluster.empty()) {
                 std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
                 supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
@@ -503,32 +412,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 supp_cluster_size = std::max(supp_cluster_size, (int) supp_end_cluster.size());
             }
 
-            // if (supp_start_cluster.size() > supp_end_cluster.size()) {
-            //     // supp_pos = supp_best_start;
-            //     std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-            //     supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
-            //     // supp_positions.push_back(supp_best_start);
-            //     supp_cluster_size = supp_start_cluster.size();
-            // } else if (supp_end_cluster.size() > supp_start_cluster.size()) {
-            //     // supp_pos = supp_best_end;
-            //     // supp_positions.push_back(supp_best_end);
-            //     std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-            //     supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
-            //     supp_cluster_size = supp_end_cluster.size();
-            // } else if (supp_start_cluster.size() == supp_end_cluster.size() && !supp_start_cluster.empty() && !supp_end_cluster.empty()) {
-            //     // Use both positions. This has been shown to occur in some nested SVs
-            //     // supp_pos = supp_best_start;
-            //     // supp_pos2 = supp_best_end;
-            //     std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-            //     std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-            //     supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
-            //     supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
-            //     supp_cluster_size = supp_start_cluster.size();
-            //     // supp_positions.push_back(supp_best_start);
-            //     // supp_positions.push_back(supp_best_end);
-            //     supp_cluster_size = supp_start_cluster.size();
-            // }
-
             // Store the inversion as the supplementary start and end positions
             if (inversion && supp_positions.size() > 1) {
                 std::sort(supp_positions.begin(), supp_positions.end());
@@ -539,70 +422,9 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
-                // int sv_length = std::abs(supp_best_start - supp_best_end);
-                // if (inversion && sv_length >= min_length && sv_length <= max_length) {
-                //     // SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
-                //     // addSVCall(chr_sv_calls, sv_candidate);
-                // }
             }
 
-            // If two of either were found, use the larger SV candidate
-            // if (primary_pos2 != -1) {
-            //     int sv_length1 = std::abs(primary_pos - supp_pos);
-            //     int sv_length2 = std::abs(primary_pos2 - supp_pos);
-            //     if (sv_length2 > sv_length1) {
-            //         primary_pos = primary_pos2;
-            //     }
-            // }
-            // if (supp_pos2 != -1) {
-            //     int sv_length1 = std::abs(primary_pos - supp_pos);
-            //     int sv_length2 = std::abs(primary_pos - supp_pos2);
-            //     if (sv_length2 > sv_length1) {
-            //         supp_pos = supp_pos2;
-            //     }
-            // }
-
-            // if (primary_pos == -1 || supp_pos == -1) {
-            //     continue;
-            // }
-
-            // Store the SV candidate if the length is within the specified range
-            // int sv_start = std::min(primary_pos, supp_pos);
-            // int sv_end = std::max(primary_pos, supp_pos);
-            // int sv_length = sv_end - sv_start + 1;
-            // int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
-
-            // If the read distance is < 30bp while the SV is > 2kb, then this is a
-            // potential deletion
-            // if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) {
-
-            //     // Add an inversion call if necessary
-            //     if (inversion) {
-            //         for (int primary_pos : primary_positions) {
-            //             for (int supp_pos : supp_positions) {
-            //                 SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
-            //                 addSVCall(chr_sv_calls, sv_candidate);
-            //             }
-            //         }
-            //         // SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
-            //         // addSVCall(chr_sv_calls, sv_candidate);
-            //     } else {
-            //         for (int primary_pos : primary_positions) {
-            //             for (int supp_pos : supp_positions) {
-            //                 uint32_t sv_start = std::min(primary_pos, supp_pos);
-            //                 uint32_t sv_end = std::max(primary_pos, supp_pos);
-            //                 if (sv_end - sv_start + 1 >= 50) {
-            //                     SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
-            //                     addSVCall(chr_sv_calls, sv_candidate);
-            //             }
-            //         }
-            //         // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
-            //         // addSVCall(chr_sv_calls, sv_candidate);
-            //     }
-            // }
-
             // Add a dummy SV call for CNV detection
-            // if (sv_length >= min_length && sv_length <= max_length) {
             int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
             SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
             std::string alt = (sv_type == SVType::INV) ? "<INV>" : ".";
@@ -618,9 +440,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     }
                 }
             }
-            // SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
-            // addSVCall(chr_sv_calls, sv_candidate);
-            // }
         }
 
         // Combine SVs with identical start and end positions, and sum the cluster
@@ -632,12 +451,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         
         // Merge duplicate SV calls with identical start positions
         mergeDuplicateSVs(chr_sv_calls);
-
-        // printMessage("Merged SVs:");
-        // for (const auto& sv : chr_sv_calls) {
-        //     printMessage(" - " + getSVTypeSymbol(sv.sv_type) + " at " + chr_name + ":" + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " with length " + std::to_string(sv.end - sv.start + 1) + " and cluster size " + std::to_string(sv.cluster_size));
-        // }
-
         sv_calls[chr_name] = std::move(chr_sv_calls);
 
         // Print the number of merged SV calls
@@ -875,6 +688,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
 void SVCaller::run(const InputData& input_data)
 {
     bool cigar_svs = true;
+    bool cigar_cn = true;
     bool split_svs = true;
 
     // Set up the reference genome
@@ -987,18 +801,20 @@ void SVCaller::run(const InputData& input_data)
         }
         printMessage("All tasks have finished.");
 
-        // -------------------------------------------------------
-        // Run copy number variant predictions on the SVs detected from the
-        // CIGAR string, using a minimum CNV length threshold
-        current_chr = 0;
-        printMessage("Running copy number predictions on CIGAR SVs...");
-        for (auto& entry : whole_genome_sv_calls) {
-            current_chr++;
-            const std::string& chr = entry.first;
-            std::vector<SVCall>& sv_calls = entry.second;
-            if (sv_calls.size() > 0) {
-                printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
-                cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+        if (cigar_cn) {
+            // -------------------------------------------------------
+            // Run copy number variant predictions on the SVs detected from the
+            // CIGAR string, using a minimum CNV length threshold
+            current_chr = 0;
+            printMessage("Running copy number predictions on CIGAR SVs...");
+            for (auto& entry : whole_genome_sv_calls) {
+                current_chr++;
+                const std::string& chr = entry.first;
+                std::vector<SVCall>& sv_calls = entry.second;
+                if (sv_calls.size() > 0) {
+                    printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "...");
+                    cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
+                }
             }
         }
         // -------------------------------------------------------
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 6c7890a7..ddc0cf4b 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -18,9 +18,6 @@ bool SVCall::operator<(const SVCall & other) const
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
-    // Print the SV call
-    // printMessage("Adding SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with length " + std::to_string(sv_call.end - sv_call.start + 1) + " and cluster size " + std::to_string(sv_call.cluster_size) + " from data type " + getSVDataTypeString(sv_call.data_type) + " and type " + getSVTypeString(sv_call.sv_type));
-
     // Check if the SV call is valid
     if (sv_call.start > sv_call.end) {
         printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type));
@@ -156,8 +153,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 // CIGAR-BASED MERGING
                 // ----------------------------
 
-                // } else if (cluster_sv_calls.size() > 1) {  // Could be low if
-                // all CIGARCLIP
                 } else {
                     // Use the median length SV of the top 10% of the cluster
                     // (shorter reads are often noise)

From fe77d3d0d72b9e320aaf71c3b283a2bb03db8fcc Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 5 May 2025 16:45:33 -0400
Subject: [PATCH 105/134] fix false positive in split reads

---
 src/sv_caller.cpp | 74 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 57 insertions(+), 17 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 06504a77..684dcdf8 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -379,21 +379,23 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
                 // Add an insertion SV call at the primary position
                 if (!primary_positions.empty()) {
+                    std::sort(primary_positions.begin(), primary_positions.end());
+                    int sv_start = primary_positions[0];
                     int aln_offset = static_cast<int>(ref_distance - read_distance);
                     if (read_distance > ref_distance  && read_distance >= min_length && read_distance <= max_length) {
-                        // Add an insertion SV call at the primary positions
+                        // Add an insertion SV call at the 5'-most primary position
                         SVType sv_type = SVType::INS;
-                        for (int primary_pos : primary_positions) {
-                            SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-                            addSVCall(chr_sv_calls, sv_candidate);
-                        }
+                        // for (int primary_pos : primary_positions) {
+                        SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                        addSVCall(chr_sv_calls, sv_candidate);
+                        // }
                     } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) {
                         // Add a deletion SV call at the primary positions
-                        for (int primary_pos : primary_positions) {
-                            SVType sv_type = SVType::DEL;
-                            SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-                            addSVCall(chr_sv_calls, sv_candidate);
-                        }
+                        // for (int primary_pos : primary_positions) {
+                        SVType sv_type = SVType::DEL;
+                        SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                        addSVCall(chr_sv_calls, sv_candidate);
+                        // }
                     }
                 }
             }
@@ -1091,8 +1093,12 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
                 // Use the preceding base as the alternate allele 
                 if (ref_allele != "") {
+                    // The alt allele is the preceding base, and the reference
+                    // allele is the deleted sequence including the preceding base
                     alt_allele = ref_allele.at(0);
                 } else {
+                    // If the reference allele is empty, use a symbolic allele
+                    ref_allele = "N";  // Convention for DEL
                     alt_allele = "<DEL>";  // Symbolic allele
                     std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << start << "-" << end << std::endl;
                 }
@@ -1105,14 +1111,42 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
                 if (sv_type == SVType::INS) {
                     // Update the position to the preceding base
-                    int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                    ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
-                    start = preceding_pos;
-
-                    if (alt_allele != "<INS>") {
-                        // Insert the reference allele before the insertion
-                        alt_allele.insert(0, ref_allele);
+                    if (static_cast<int>(start) > 1) {
+                        uint32_t preceding_pos = start - 1;
+                        ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
+                        start = preceding_pos;
+                        if (ref_allele != "") {
+                            if (alt_allele != "<INS>") {
+                                // Insert the reference allele before the insertion
+                                alt_allele.insert(0, ref_allele);
+                            }
+                        } else {
+                            // If the reference allele is empty, use a symbolic allele
+                            ref_allele = "N";  // Convention for INS
+                            alt_allele = "<INS>";  // Symbolic allele
+                            std::cerr << "Warning: Reference allele is empty for insertion at " << chr << ":" << start << "-" << end << std::endl;
+                        }
+                    } else {
+                        // ref_allele = "N";  // No preceding base for the first
+                        // position
+                        // Report an error and skip the insertion if it is at the first position
+                        std::cerr << "Error: Insertion at the first position " << chr << ":" << start << "-" << end << std::endl;
+                        continue;
                     }
+                    // int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
+                    // ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
+                    // start = preceding_pos;
+                    // if (ref_allele != "") {
+                    //     if (alt_allele != "<INS>") {
+                    //         // Insert the reference allele before the insertion
+                    //         alt_allele.insert(0, ref_allele);
+                    //     }
+                    // } else {
+                    //     // If the reference allele is empty, use a symbolic allele
+                    //     ref_allele = "N";  // Convention for INS
+                    //     alt_allele = "<INS>";  // Symbolic allele
+                    //     std::cerr << "Warning: Reference allele is empty for insertion at " << chr << ":" << start << "-" << end << std::endl;
+                    // }
                     end = start;  // Update the end position to the same base
 
                 } else {
@@ -1136,6 +1170,12 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Get read depth
             int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr), start);
 
+            // If read depth equals zero, then set the filter to LowQual
+            if (read_depth == 0) {
+                filter = "LowQual";
+                filtered_svs += 1;
+            }
+
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
             // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate);

From fd75f7fed6ba8b8f72d74f71aac11d60d1b47b2e Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 12 May 2025 16:51:20 -0400
Subject: [PATCH 106/134] fix split sv detection errors

---
 __main__.py           |   1 -
 include/input_data.h  |  10 +-
 include/utils.h       |  52 +++++-----
 src/cnv_caller.cpp    |  88 ++++++++++++----
 src/input_data.cpp    |  49 +++------
 src/main.cpp          |  14 ---
 src/sv_caller.cpp     | 230 +++++++++++++++++++++++++++++++-----------
 tests/test_general.py |   1 -
 8 files changed, 279 insertions(+), 166 deletions(-)

diff --git a/__main__.py b/__main__.py
index a888cdbf..3821b8d1 100644
--- a/__main__.py
+++ b/__main__.py
@@ -214,7 +214,6 @@ def main():
     # Set input parameters
     input_data = contextsv.InputData()
     input_data.setVerbose(args.debug)
-    input_data.setShortReadBam(args.short_read)
     input_data.setLongReadBam(args.long_read)
     input_data.setRefGenome(args.reference)
     input_data.setSNPFilepath(args.snps)
diff --git a/include/input_data.h b/include/input_data.h
index 0687af76..452b5e6c 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -23,9 +23,7 @@ class InputData {
     public:
         InputData();
 
-        std::string getShortReadBam() const;
-
-        void setShortReadBam(std::string filepath);
+        void printParameters() const;
 
         std::string getLongReadBam() const;
 
@@ -65,10 +63,6 @@ class InputData {
         void setDBSCAN_Epsilon(double epsilon);
         double getDBSCAN_Epsilon() const;
 
-        // Set the minimum number of points in a cluster for DBSCAN.
-        void setDBSCAN_MinPts(int min_pts);
-        int getDBSCAN_MinPts() const;
-
         // Set the percentage of mean chromosome coverage to use for DBSCAN
         // minimum points.
         void setDBSCAN_MinPtsPct(double min_pts_pct);
@@ -105,7 +99,6 @@ class InputData {
         std::string getCNVOutputFile() const;
         
     private:
-        std::string short_read_bam;
         std::string long_read_bam;
         std::string ref_filepath;
         std::string snp_vcf_filepath;
@@ -116,7 +109,6 @@ class InputData {
         uint32_t min_cnv_length;
         int min_reads;
         double dbscan_epsilon;
-        int dbscan_min_pts;
         double dbscan_min_pts_pct;
         std::string chr;  // Chromosome to analyze
         std::pair<int32_t, int32_t> start_end;  // Region to analyze
diff --git a/include/utils.h b/include/utils.h
index 6eb1237d..d95f0a8a 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -14,32 +14,32 @@
 
 
 // Guard to close the BAM file
-struct BamFileGuard {
-    samFile* fp_in;
-    hts_idx_t* idx;
-    bam_hdr_t* bamHdr;
-
-    BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr)
-        : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {}
-
-    ~BamFileGuard() {
-        if (idx) {
-            hts_idx_destroy(idx);
-            idx = nullptr;
-        }
-        if (bamHdr) {
-            bam_hdr_destroy(bamHdr);
-            bamHdr = nullptr;
-        }
-        if (fp_in) {
-            sam_close(fp_in);
-            fp_in = nullptr;
-        }
-    }
-
-    BamFileGuard(const BamFileGuard&) = delete;  // Non-copyable
-    BamFileGuard& operator=(const BamFileGuard&) = delete;  // Non-assignable
-};
+// struct BamFileGuard {
+//     samFile* fp_in;
+//     hts_idx_t* idx;
+//     bam_hdr_t* bamHdr;
+
+//     BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr)
+//         : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {}
+
+//     ~BamFileGuard() {
+//         if (idx) {
+//             hts_idx_destroy(idx);
+//             idx = nullptr;
+//         }
+//         if (bamHdr) {
+//             bam_hdr_destroy(bamHdr);
+//             bamHdr = nullptr;
+//         }
+//         if (fp_in) {
+//             sam_close(fp_in);
+//             fp_in = nullptr;
+//         }
+//     }
+
+//     BamFileGuard(const BamFileGuard&) = delete;  // Non-copyable
+//     BamFileGuard& operator=(const BamFileGuard&) = delete;  // Non-assignable
+// };
 
 // Print the progress of a task
 void printProgress(int progress, int total);
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index f04093af..0222fb7c 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -318,7 +318,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Only extend the region if "save CNV data" is enabled
         SNPData snp_data;
-        printMessage("Querying SNP region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        // printMessage("Querying SNP region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
         // Run the Viterbi algorithm
@@ -327,7 +327,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         	continue;
         }
         
-        printMessage("Running Viterbi algorithm for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        // printMessage("Running Viterbi algorithm for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         std::pair<std::vector<int>, double> prediction;
         runViterbi(hmm, snp_data, prediction);
         std::vector<int>& state_sequence = prediction.first;
@@ -371,7 +371,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // For LOH predictions, or predictions with the same type,
         // update predicted information without changing the SV type
-        printMessage("Updating SV call for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " with predicted CNV type: " + getSVTypeString(updated_sv_type));
+        // printMessage("Updating SV call for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " with predicted CNV type: " + getSVTypeString(updated_sv_type));
         updated_sv_type = (updated_sv_type == SVType::LOH) ? sv_call.sv_type : updated_sv_type;
         bool is_valid_update = isValidCopyNumberUpdate(sv_call.sv_type, updated_sv_type);
         if (is_valid_update)
@@ -444,7 +444,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
         printError("ERROR: Could not load index for BAM file: " + bam_filepath);
         return;
     }
-    BamFileGuard bam_guard(bam_file, bam_index, bam_header);  // Guard to close the BAM file
+    // BamFileGuard bam_guard(bam_file, bam_index, bam_header);  // Guard to close the BAM file
 
     // Initialize the record
     bam1_t *bam_record = bam_init1();
@@ -472,6 +472,20 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
 
         printMessage("(" + std::to_string(++current_chr) + "/" + std::to_string(total_chr_count) + ") Reading BAM file for chromosome: " + chr);
         std::vector<uint32_t>& pos_depth_map = chr_pos_depth_map[chr];
+        int tid = bam_name2id(bam_header, chr.c_str());
+        if (tid < 0)
+        {
+            printError("ERROR: Could not find chromosome " + chr + " in BAM file.");
+            continue;
+        }
+        // Resize the depth map to the length of the chromosome
+        uint32_t chr_length = bam_header->target_len[tid] + 1;
+        if (pos_depth_map.size() != static_cast<size_t>(chr_length))
+        {
+            printError("ERROR: Chromosome length mismatch for " + chr + ": expected " + std::to_string(chr_length) + ", found " + std::to_string(pos_depth_map.size()) + ", resizing to " + std::to_string(chr_length));
+            // Resize the depth map to the length of the chromosome
+            pos_depth_map.resize(chr_length, 0);
+        }
         while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
         {
             // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads
@@ -517,26 +531,58 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
             }
         }
         hts_itr_destroy(bam_iter);
-        
-        // Parallel sum of the depth map
-        uint64_t cum_depth = std::reduce(
-            std::execution::par,
-            pos_depth_map.begin(),
-            pos_depth_map.end(),
-            0ULL
-        );
-
-        // Parallel count of the non-zero depth positions
-        uint32_t pos_count = std::count_if(
-            std::execution::par,
-            pos_depth_map.begin(),
-            pos_depth_map.end(),
-            [](uint32_t depth) { return depth > 0; }
-        );
 
+        // You can parallelize the depth map calculation here but first close the
+        // BAM file and index
+        // Bam cleanup (delete guard if using this)
+        // bam_destroy1(bam_record);
+        // bam_hdr_destroy(bam_header);
+        // sam_close(bam_file);
+        // bam_index_destroy(bam_index);
+        // bam_record = nullptr;
+        // bam_header = nullptr;
+        // bam_file = nullptr;
+        // bam_index = nullptr;
+        
+        // // Parallel sum of the depth map
+        // uint64_t cum_depth = std::reduce(
+        //     std::execution::par,
+        //     pos_depth_map.begin(),
+        //     pos_depth_map.end(),
+        //     0ULL
+        // );
+
+        // // Parallel count of the non-zero depth positions
+        // uint32_t pos_count = std::count_if(
+        //     std::execution::par,
+        //     pos_depth_map.begin(),
+        //     pos_depth_map.end(),
+        //     [](uint32_t depth) { return depth > 0; }
+        // );
+
+        // Sum without parallelization
+        uint64_t cum_depth = std::accumulate(pos_depth_map.begin(), pos_depth_map.end(), 0ULL);
+        uint32_t pos_count = std::count_if(pos_depth_map.begin(), pos_depth_map.end(), [](uint32_t depth) { return depth > 0; });
+
+        // Calculate the mean coverage for the chromosome
         double mean_chr_cov = (pos_count > 0) ? static_cast<double>(cum_depth) / static_cast<double>(pos_count) : 0.0;
-        chr_mean_cov_map[chr] = mean_chr_cov;
+        printMessage("Mean coverage for chromosome " + chr + ": " + std::to_string(mean_chr_cov));
+        if (mean_chr_cov != 0.0) {
+        	chr_mean_cov_map[chr] = mean_chr_cov;
+    	}
     }
+
+    // Clean up the BAM file and index
+    printMessage("Closing BAM file " + bam_filepath);
+    bam_destroy1(bam_record);
+    hts_idx_destroy(bam_index);
+    bam_hdr_destroy(bam_header);
+    sam_close(bam_file);
+    bam_record = nullptr;
+    bam_index = nullptr;
+    bam_header = nullptr;
+    bam_file = nullptr;
+    printMessage("BAM file closed.");
 }
 
 void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector<uint32_t>& snp_pos, std::unordered_map<uint32_t, double>& snp_baf, std::unordered_map<uint32_t, double>& snp_pfb, const InputData& input_data) const
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 5661eb3b..cd55f67e 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -16,7 +16,6 @@
 // Constructor
 InputData::InputData()
 {
-    this->short_read_bam = "";
     this->long_read_bam = "";
     this->ref_filepath = "";
     this->snp_vcf_filepath = "";
@@ -28,7 +27,6 @@ InputData::InputData()
     this->min_cnv_length = 1000;
     this->min_reads = 5;
     this->dbscan_epsilon = 0.99;
-    this->dbscan_min_pts = 15;
     this->dbscan_min_pts_pct = 0.0;
     this->thread_count = 1;
     this->hmm_filepath = "data/wgs.hmm";
@@ -38,29 +36,24 @@ InputData::InputData()
     this->cnv_output_file = "";
 }
 
-std::string InputData::getShortReadBam() const
+void InputData::printParameters() const
 {
-    return this->short_read_bam;
-}
-
-void InputData::setShortReadBam(std::string filepath)
-{
-    this->short_read_bam = filepath;
-
-    // Check if empty string
-    if (filepath.empty())
+    std::cout << "Input parameters:" << std::endl;
+    std::cout << "Long read BAM: " << this->long_read_bam << std::endl;
+    std::cout << "Reference genome: " << this->ref_filepath << std::endl;
+    std::cout << "SNP VCF: " << this->snp_vcf_filepath << std::endl;
+    std::cout << "Output directory: " << this->output_dir << std::endl;
+    std::cout << "Sample size: " << this->sample_size << std::endl;
+    std::cout << "Minimum CNV length: " << this->min_cnv_length << std::endl;
+    std::cout << "DBSCAN epsilon: " << this->dbscan_epsilon << std::endl;
+    std::cout << "DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f  << "%"  << std::endl;
+    if (this->region_set)
     {
-        return;
-        
-    } else {
-        // Check if the file exists
-        FILE *fp = fopen(filepath.c_str(), "r");
-        if (fp == NULL)
-        {
-            throw std::runtime_error("Short read BAM file does not exist: " + filepath);
-        } else {
-            fclose(fp);
-        }
+        std::cout << "Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second) + "\n";
+    }
+    else
+    {
+        std::cout << "Running on whole genome" << std::endl;
     }
 }
 
@@ -169,16 +162,6 @@ double InputData::getDBSCAN_Epsilon() const
     return this->dbscan_epsilon;
 }
 
-void InputData::setDBSCAN_MinPts(int min_pts)
-{
-    this->dbscan_min_pts = min_pts;
-}
-
-int InputData::getDBSCAN_MinPts() const
-{
-    return this->dbscan_min_pts;
-}
-
 void InputData::setDBSCAN_MinPtsPct(double min_pts_pct)
 {
     this->dbscan_min_pts_pct = min_pts_pct;
diff --git a/src/main.cpp b/src/main.cpp
index 6f2c1b69..5ba0fb1b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -56,18 +56,11 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     signal(SIGFPE, printStackTrace);
     signal(SIGBUS, printStackTrace);
 
-    // Placeholder for setting up input data and running ContextSV
-    // std::cout << "ContextSV version " << VERSION << std::endl;
-    // std::cout << "Input parameters:" << std::endl;
-    // for (const auto& arg : args) {
-    //     std::cout << arg.first << ": " << arg.second << std::endl;
-    // }
     printBanner();
 
     // Set up input data
     InputData input_data;
     input_data.setLongReadBam(args.at("bam-file"));
-    input_data.setShortReadBam(args.at("bam-file"));
     input_data.setRefGenome(args.at("ref-file"));
     input_data.setSNPFilepath(args.at("snps-file"));
     input_data.setOutputDir(args.at("output-dir"));
@@ -107,10 +100,6 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
         input_data.setDBSCAN_Epsilon(std::stod(args.at("epsilon")));
     }
 
-    if (args.find("min-pts") != args.end()) {
-        input_data.setDBSCAN_MinPts(std::stoi(args.at("min-pts")));
-    }
-
     if (args.find("min-pts-pct") != args.end()) {
         input_data.setDBSCAN_MinPtsPct(std::stod(args.at("min-pts-pct")));
     }
@@ -146,7 +135,6 @@ void printUsage(const std::string& programName) {
                 << "  -n, --sample-size <size>      Sample size for HMM predictions\n"
                 << "     --min-cnv <min_length>     Minimum CNV length\n"
                 << "     --eps <epsilon>             DBSCAN epsilon\n"
-                << "     --min-pts <min_pts>         DBSCAN minimum points\n"
                 << "     --min-pts-pct <min_pts_pct> Percentage of mean chr. coverage to use for DBSCAN minimum points\n"
                 << "  -e, --eth <eth_file>          ETH file\n"
                 << "  -p, --pfb <pfb_file>          PFB file\n"
@@ -186,8 +174,6 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
             args["min-reads"] = argv[++i];
         } else if (arg == "--eps" && i + 1 < argc) {
             args["epsilon"] = argv[++i];
-        } else if (arg == "--min-pts" && i + 1 < argc) {
-            args["min-pts"] = argv[++i];
         } else if (arg == "--min-pts-pct" && i + 1 < argc) {
             args["min-pts-pct"] = argv[++i];
         } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 684dcdf8..c08c3c32 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -94,7 +94,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         printError("ERROR: failed to load index for " + bam_filepath);
         return;
     }
-    BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
 
     // Alignment data structures
     std::unordered_map<int, std::unordered_map<std::string, PrimaryAlignment>> primary_map;  // TID-> qname -> primary alignment
@@ -173,6 +172,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
     // Clean up the iterator and alignment
     hts_itr_destroy(itr);
     bam_destroy1(bam1);
+
+    // Clean up the BAM file and index
+    sam_close(fp_in);
+    hts_idx_destroy(idx);
+    // bam_hdr_destroy(bamHdr);
     
     // Remove primary alignments without supplementary alignments
     std::unordered_map<int, std::unordered_set<std::string>> to_remove;
@@ -310,12 +314,35 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // strand
                         if (supp_aln.strand == primary_aln.strand) {
                             // Same strand
+
+                            // Check if the primary alignment is 5'-most
+                            bool primary_5p = false;
+                            if (primary_aln.start < supp_aln.start) {
+                                primary_5p = true;
+                            }
+
                             // Calculate distance between alignments on the read
                             read_distance = std::max(0, std::max(static_cast<int>(supp_aln.query_start), static_cast<int>(primary_aln.query_start)) - std::min(static_cast<int>(supp_aln.query_end), static_cast<int>(primary_aln.query_end)));
 
                             // Calculate distance between alignments on the
                             // reference
                             ref_distance = std::max(0, std::max(static_cast<int>(supp_aln.start), static_cast<int>(primary_aln.start)) - std::min(static_cast<int>(supp_aln.end), static_cast<int>(primary_aln.end)));
+                            
+                            // Throw an error if the read distance is negative
+                            if (read_distance < 0) {
+                                printError("ERROR: negative read distance between primary and supplementary alignments for " + qname);
+                            }
+                            // Throw an error if the reference distance is
+                            // negative
+                            if (ref_distance < 0) {
+                                printError("ERROR: negative reference distance between primary and supplementary alignments for " + qname);
+                            }
+
+                            // Use a negative read distance to indicate that the
+                            // primary alignment is not 5'-most
+                            if (!primary_5p) {
+                                read_distance = -read_distance;
+                            }
                             read_distances.push_back(read_distance);
                             ref_distances.push_back(ref_distance);
                         }
@@ -352,16 +379,52 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // SV
             std::vector<int> primary_positions;
             int primary_cluster_size = 0;
+            bool primary_start = false;
+            bool primary_end = false;
             if (!primary_start_cluster.empty()) {
                 std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
                 primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
                 primary_cluster_size = primary_start_cluster.size();
+                primary_start = true;
             }
 
             if (!primary_end_cluster.empty()) {
                 std::sort(primary_end_cluster.begin(), primary_end_cluster.end());
                 primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]);
                 primary_cluster_size = std::max(primary_cluster_size, (int) primary_end_cluster.size());
+                primary_end = true;
+            }
+
+            // Get the supplementary alignment positions
+            std::vector<int> supp_positions;
+            bool supp_start = false;
+            bool supp_end = false;
+            int supp_cluster_size = 0;
+            if (!supp_start_cluster.empty()) {
+                std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
+                supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
+                supp_cluster_size = supp_start_cluster.size();
+                supp_start = true;
+            }
+            if (!supp_end_cluster.empty()) {
+                std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
+                supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
+                supp_cluster_size = std::max(supp_cluster_size, (int) supp_end_cluster.size());
+                supp_end = true;
+            }
+
+            // Store the inversion as the supplementary start and end positions
+            if (inversion && supp_positions.size() > 1) {
+                std::sort(supp_positions.begin(), supp_positions.end());
+                int supp_start = supp_positions.front();
+                int supp_end = supp_positions.back();
+                int sv_length = std::abs(supp_start - supp_end);
+
+                // Use 50bp as the minimum length for an inversion
+                if (sv_length >= 50 && sv_length <= max_length) {
+                    SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
+                    addSVCall(chr_sv_calls, sv_candidate);
+                }
             }
 
             // -------------------------------
@@ -373,59 +436,76 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 // insertion size
                 std::sort(read_distance_cluster.begin(), read_distance_cluster.end());
                 read_distance = read_distance_cluster[read_distance_cluster.size() / 2];
+                bool primary_5p_most = read_distance > 0;
+                read_distance = std::abs(read_distance);
                 
                 std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end());
                 ref_distance = ref_distance_cluster[ref_distance_cluster.size() / 2];
 
                 // Add an insertion SV call at the primary position
-                if (!primary_positions.empty()) {
+
+                // bool print_debug = false;
+                bool print_debug = true;
+                
+                // int sv_start = primary_positions[0];
+                // Use the 3'-most primary position as the start position
+                int sv_start;
+                bool split_candidate_sv = false;
+                if (primary_5p_most && primary_end) {
                     std::sort(primary_positions.begin(), primary_positions.end());
-                    int sv_start = primary_positions[0];
+                    // Supplementary alignment is downstream with the
+                    // insertion sequence, starting at the 3'-most
+                    // primary position
+                    sv_start = primary_positions.back();
+
+                    // Print debug if SV start equals 223608935
+                    // if (sv_start == 223608936) {
+                    //     print_debug = true;
+                    // printMessage("DEBUG: SV start is" + std::to_string(sv_start) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance) + " and primary 5p_most is " + std::to_string(primary_5p_most) + " and primary positions are " + std::to_string(primary_positions[0]) + " and " + std::to_string(primary_positions.back()) + " and number of primary positions is " + std::to_string(primary_positions.size()) + " and start bool is " + std::to_string(primary_start) + " and end bool is " + std::to_string(primary_end));
+                    // }
+                    split_candidate_sv = true;
+                } else if (!primary_5p_most && supp_end) {
+                    
+                    // Supplementary alignment is upstream with the
+                    // insertion sequence, starting at the 5'-most
+                    // primary position
+                    // sv_start = primary_positions.front();
+                    std::sort(supp_positions.begin(), supp_positions.end());
+                    sv_start = supp_positions.back();
+
+                    // Print debug if SV start equals 223608935
+                    // if (sv_start == 223608936) {
+                    //     print_debug = true;
+                    // printMessage("DEBUG: SV start is " + std::to_string(sv_start) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance) + " and primary 5p_most is " + std::to_string(primary_5p_most) + " and primary positions are " + std::to_string(primary_positions[0]) + " and " + std::to_string(primary_positions.back()) + " and number of primary positions is " + std::to_string(primary_positions.size()) + " and start bool is " + std::to_string(primary_start) + " and end bool is " + std::to_string(primary_end));
+                    // }
+                    split_candidate_sv = true;
+                }
+                if (split_candidate_sv) {
                     int aln_offset = static_cast<int>(ref_distance - read_distance);
                     if (read_distance > ref_distance  && read_distance >= min_length && read_distance <= max_length) {
                         // Add an insertion SV call at the 5'-most primary position
                         SVType sv_type = SVType::INS;
-                        // for (int primary_pos : primary_positions) {
                         SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
                         // }
                     } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) {
                         // Add a deletion SV call at the primary positions
-                        // for (int primary_pos : primary_positions) {
                         SVType sv_type = SVType::DEL;
+
+                        // if (print_debug) {
+                        //     printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size));
+                        // }
+
+                        // Add a dummy SV call before and after the start
+                        // position for HMM predictions
+                        // SVType sv_type = SVType::UNKNOWN;
                         SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
-                        // }
+                        // SVCall sv_candidate2(sv_start + (ref_distance-1), sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                     }
                 }
             }
 
-            // Get the supplementary alignment positions
-            std::vector<int> supp_positions;
-            int supp_cluster_size = 0;
-            if (!supp_start_cluster.empty()) {
-                std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
-                supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
-                supp_cluster_size = supp_start_cluster.size();
-            }
-            if (!supp_end_cluster.empty()) {
-                std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
-                supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]);
-                supp_cluster_size = std::max(supp_cluster_size, (int) supp_end_cluster.size());
-            }
-
-            // Store the inversion as the supplementary start and end positions
-            if (inversion && supp_positions.size() > 1) {
-                std::sort(supp_positions.begin(), supp_positions.end());
-                int supp_start = supp_positions.front();
-                int supp_end = supp_positions.back();
-                int sv_length = std::abs(supp_start - supp_end);
-                if (sv_length >= min_length && sv_length <= max_length) {
-                    SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
-                    addSVCall(chr_sv_calls, sv_candidate);
-                }
-            }
-
             // Add a dummy SV call for CNV detection
             int cluster_size = std::max(primary_cluster_size, supp_cluster_size);
             SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN;
@@ -446,7 +526,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
         // Combine SVs with identical start and end positions, and sum the cluster
         // sizes
-        printMessage("Combining SVs with identical start positions");
+        // printMessage("Combining SVs with identical start positions");
         std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
             return a.start < b.start || (a.start == b.start && a.end < b.end);
         });
@@ -458,6 +538,9 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         // Print the number of merged SV calls
         printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates");
     }
+
+    // Clean up the BAM header
+    bam_hdr_destroy(bamHdr);
 }
 
 void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector<SVCall>& sv_calls, const std::vector<uint32_t>& pos_depth_map)
@@ -546,7 +629,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 1, 0);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 0, 0);
                 cigar_sv_calls.emplace_back(sv_call);
             
             // Process clipped bases as potential insertions
@@ -578,7 +661,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0.0, 0);
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0, 0);
                 cigar_sv_calls.emplace_back(sv_call);
 
             // Check if the CIGAR operation is a deletion
@@ -586,7 +669,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 1, 0);
+                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 0, 0);
                 cigar_sv_calls.emplace_back(sv_call);
             }
         }
@@ -664,7 +747,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
         printError("ERROR: failed to load index for " + bam_filepath);
         return;
     }
-    BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
+    // BamFileGuard bam_guard(fp_in, idx, bamHdr);  // Guard to close the BAM file
 
     // Get DBSCAN parameters
     double dbscan_epsilon = input_data.getDBSCAN_Epsilon();
@@ -680,6 +763,11 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     printMessage(chr + ": CIGAR SVs...");
     this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map);
 
+    // Clean up the BAM file and index
+    sam_close(fp_in);
+    hts_idx_destroy(idx);
+    bam_hdr_destroy(bamHdr);
+
     printMessage(chr + ": Merging CIGAR...");
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
 
@@ -693,6 +781,9 @@ void SVCaller::run(const InputData& input_data)
     bool cigar_cn = true;
     bool split_svs = true;
 
+    // Print the input data
+    input_data.printParameters();
+
     // Set up the reference genome
     printMessage("Loading the reference genome...");
     const std::string ref_filepath = input_data.getRefGenome();
@@ -742,16 +833,34 @@ void SVCaller::run(const InputData& input_data)
     cnv_caller.calculateMeanChromosomeCoverage(chromosomes, chr_pos_depth_map, chr_mean_cov_map, bam_filepath, chr_thread_count);
 
     // Remove chromosomes with no reads (mean coverage is zero)
-    std::vector<std::string> null_chr;
+    printMessage("Removing chromosomes with no reads...");
+    std::vector<std::string> valid_chr;
     for (const auto& chr : chromosomes) {
+    	if (chr_mean_cov_map.find(chr) != chr_mean_cov_map.end()) {
+    		valid_chr.push_back(chr);
+	}
+	chromosomes = valid_chr;
+    	/*
+        try {
+            if (chr_mean_cov_map.at(chr) == 0.0) {
+                printMessage("Chromosome " + chr + " has no reads");
+            }
+        } catch (const std::out_of_range& e) {
+            printError("Chromosome " + chr + " not found in mean coverage map: " + std::string(e.what()));
+        }*/
+        /*
+        // Check if the chromosome has no reads
         if (chr_mean_cov_map[chr] == 0.0) {
             null_chr.push_back(chr);
         }
+        */
     }
+	/*
+    printMessage("Removing " + std::to_string(null_chr.size()) + " chromosomes with no reads...");
     for (const auto& chr : null_chr) {
         printMessage("Removing chromosome " + chr + " with no reads...");
         chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
-    }
+    }*/
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     int current_chr = 0;
     int total_chr_count = chromosomes.size();
@@ -1017,6 +1126,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         "##INFO=<ID=ALNOFFSET,Number=1,Type=Integer,Description=\"Read vs. reference alignment offset\">",
         "##FILTER=<ID=PASS,Description=\"All filters passed\">",
         "##FILTER=<ID=LowQual,Description=\"Low quality\">",
+        "##FILTER=<ID=AssemblyGap,Description=\"Assembly gap\">",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">",
         "##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Read depth at the variant site (sum of start and end positions)\">",
     };
@@ -1053,6 +1163,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     int total_count = 0;
     int unclassified_svs = 0;
     int filtered_svs = 0;
+    int assembly_gaps = 0;
     for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
         const std::vector<SVCall>& sv_calls = pair.second;
@@ -1068,8 +1179,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             std::string data_type_str = getSVDataTypeString(sv_call.data_type);
             double hmm_likelihood = sv_call.hmm_likelihood;
             int cluster_size = sv_call.cluster_size;
-            //int read_depth = sv_call.read_depth;
-            // double mismatch_rate = sv_call.mismatch_rate;
             std::string filter = "PASS";
             int aln_offset = sv_call.aln_offset;
             int cn_state = sv_call.cn_state;
@@ -1091,8 +1200,21 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 uint32_t preceding_pos = (uint32_t) std::max(1, static_cast<int>(start)-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, end);
 
+
+
                 // Use the preceding base as the alternate allele 
                 if (ref_allele != "") {
+                    // If the sequence is >90% N, skip the SV call (assembly
+                    // gap)
+                    int allele_length_90pct = static_cast<int>(ref_allele.size() * 0.9);
+                    if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) {
+                        assembly_gaps += 1;
+                        // continue;
+
+                        // Don't skip but set the filter to assembly gap
+                        filter = "AssemblyGap";
+                    }
+
                     // The alt allele is the preceding base, and the reference
                     // allele is the deleted sequence including the preceding base
                     alt_allele = ref_allele.at(0);
@@ -1127,26 +1249,10 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                             std::cerr << "Warning: Reference allele is empty for insertion at " << chr << ":" << start << "-" << end << std::endl;
                         }
                     } else {
-                        // ref_allele = "N";  // No preceding base for the first
-                        // position
                         // Throw an error if the insertion is at the first position
                         std::cerr << "Error: Insertion at the first position " << chr << ":" << start << "-" << end << std::endl;
                         continue;
                     }
-                    // int64_t preceding_pos = (int64_t) std::max(1, (int) start-1);  // Make sure the position is not negative
-                    // ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos);
-                    // start = preceding_pos;
-                    // if (ref_allele != "") {
-                    //     if (alt_allele != "<INS>") {
-                    //         // Insert the reference allele before the insertion
-                    //         alt_allele.insert(0, ref_allele);
-                    //     }
-                    // } else {
-                    //     // If the reference allele is empty, use a symbolic allele
-                    //     ref_allele = "N";  // Convention for INS
-                    //     alt_allele = "<INS>";  // Symbolic allele
-                    //     std::cerr << "Warning: Reference allele is empty for insertion at " << chr << ":" << start << "-" << end << std::endl;
-                    // }
                     end = start;  // Update the end position to the same base
 
                 } else {
@@ -1171,10 +1277,11 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr), start);
 
             // If read depth equals zero, then set the filter to LowQual
-            if (read_depth == 0) {
-                filter = "LowQual";
-                filtered_svs += 1;
-            }
+            // if (read_depth == 0) {
+            //     printError("Warning: Read depth is zero for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end));
+            //     filter = "LowQual";
+            //     filtered_svs += 1;
+            // }
 
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
@@ -1199,6 +1306,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         std::cout << "Total unclassified SVs: " << unclassified_svs << std::endl;
     }
     printMessage("Total PASS filtered SVs: " + std::to_string(filtered_svs));
+    printMessage("Total filtered assembly gaps: " + std::to_string(assembly_gaps));
 }
 
 int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start) const
diff --git a/tests/test_general.py b/tests/test_general.py
index ac7d5d8d..ff65faba 100644
--- a/tests/test_general.py
+++ b/tests/test_general.py
@@ -40,7 +40,6 @@ def test_run():
     
     # Set input parameters.
     input_data = contextsv.InputData()
-    input_data.setShortReadBam(TEST_BAM_FILE)
     input_data.setLongReadBam(TEST_BAM_FILE)
     input_data.setRefGenome(TEST_REF_FILE)
     input_data.setSNPFilepath(TEST_SNPS_FILE)

From d86e1c7384c73aee0dcf230a8fb5874b1bfba1a7 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 16 May 2025 16:09:42 -0400
Subject: [PATCH 107/134] filter assembly gaps

---
 include/input_data.h |   5 +
 include/sv_caller.h  |   3 +-
 include/sv_object.h  |  14 +--
 include/sv_types.h   |  23 ++++-
 src/cnv_caller.cpp   |   3 +-
 src/input_data.cpp   |  29 ++++++
 src/main.cpp         |   5 +
 src/sv_caller.cpp    | 217 +++++++++++++++++++++++++++++++++++++------
 src/sv_object.cpp    |   8 +-
 9 files changed, 262 insertions(+), 45 deletions(-)

diff --git a/include/input_data.h b/include/input_data.h
index 452b5e6c..1e2c3c1e 100644
--- a/include/input_data.h
+++ b/include/input_data.h
@@ -51,6 +51,10 @@ class InputData {
         void setEthnicity(std::string ethnicity);
         std::string getEthnicity() const;
 
+        // Set the assembly gaps file.
+        void setAssemblyGaps(std::string filepath);
+        std::string getAssemblyGaps() const;
+
         // Set the sample size for HMM predictions.
         void setSampleSize(int sample_size);
         int getSampleSize() const;
@@ -116,6 +120,7 @@ class InputData {
         int thread_count;
         std::string hmm_filepath;
         std::string cnv_filepath;
+        std::string assembly_gaps;  // Assembly gaps file
         bool verbose;  // True if verbose output is enabled
         bool save_cnv_data;  // True if SNP CNV regions should be extended around SV breakpoints, and saved to a TSV file (Large performance hit)
         bool single_chr;
diff --git a/include/sv_caller.h b/include/sv_caller.h
index 4ef59700..997603ef 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -90,7 +90,8 @@ class SVCaller {
 
         void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector<SVCall>& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector<uint32_t> &pos_depth_map, const InputData &input_data);
 
-        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const;
+        void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const InputData &input_data, const ReferenceGenome &ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>> &chr_pos_depth_map) const;
+        // void saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const;
 
         // Query the read depth (INFO/DP) at a position
         int getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start) const;
diff --git a/include/sv_object.h b/include/sv_object.h
index a99fb4fb..4fd34c56 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -18,23 +18,23 @@ struct SVCall {
     uint32_t end = 0;
     SVType sv_type = SVType::UNKNOWN;
     std::string alt_allele = ".";
-    SVDataType data_type = SVDataType::UNKNOWN;
+    // SVDataType data_type = SVDataType::UNKNOWN;
+    SVEvidenceFlags aln_type;
     Genotype genotype = Genotype::UNKNOWN;
     double hmm_likelihood = 0.0;
     int cn_state = 0;  // Copy number state
     int aln_offset = 0;  // Alignment offset (read vs. reference distance factor)
-    // int read_depth = 0;  // Breakpoint depth
-    // double mismatch_rate = 0.0;  // Highest mismatch rate in reads used for the SV call
     int cluster_size = 0;  // Number of SV calls in the cluster
 
     bool operator<(const SVCall& other) const;
 
     SVCall() = default;
 
-    SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) :
-        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {}
-    // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) :
-    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {}
+    SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, SVEvidenceFlags aln_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) :
+        start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), aln_type(aln_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {}
+
+    // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) :
+    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {}
 };
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
diff --git a/include/sv_types.h b/include/sv_types.h
index dd67c2a4..359d0dc9 100644
--- a/include/sv_types.h
+++ b/include/sv_types.h
@@ -7,6 +7,7 @@
 #include <set>
 #include <unordered_map>
 #include <tuple>
+#include <bitset>
 /// @endcond
 
 namespace sv_types {
@@ -75,6 +76,8 @@ namespace sv_types {
         UNKNOWN = 9
     };
 
+    using SVEvidenceFlags = std::bitset<10>;  // Bitset for SV data types
+
     // Mapping of SV data types to strings
     const std::unordered_map<SVDataType, std::string> SVDataTypeString = {
         {SVDataType::CIGARINS, "CIGARINS"},
@@ -105,6 +108,20 @@ namespace sv_types {
         return SVTypeString.at(sv_type);
     }
 
+    // Function to get the SV alignment type string from the bitset
+    inline std::string getSVAlignmentTypeString(SVEvidenceFlags aln_type) {
+        std::string result;
+        for (size_t i = 0; i < SVDataTypeString.size(); ++i) {
+            if (aln_type.test(i)) {
+                result += SVDataTypeString.at(static_cast<SVDataType>(i)) + ",";
+            }
+        }
+        if (!result.empty()) {
+            result.pop_back();  // Remove the trailing comma
+        }
+        return result;
+    }
+
     // Function to get the SV type from the CNV state
     inline SVType getSVTypeFromCNState(int cn_state) {
         return CNVTypeMap.at(cn_state);
@@ -116,9 +133,9 @@ namespace sv_types {
     }
 
     // Function to get the SV data type string
-    inline std::string getSVDataTypeString(SVDataType data_type) {
-        return SVDataTypeString.at(data_type);
-    }
+    // inline std::string getSVDataTypeString(SVDataType data_type) {
+    //     return SVDataTypeString.at(data_type);
+    // }
 
     // Function to get the SV type symbol
     inline std::string getSVTypeSymbol(SVType sv_type) {
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 0222fb7c..60dca02f 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -377,7 +377,8 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         if (is_valid_update)
         {
             sv_call.sv_type = updated_sv_type;
-            sv_call.data_type = SVDataType::HMM;
+            // sv_call.data_type = SVDataType::HMM;
+            sv_call.aln_type.set(static_cast<size_t>(SVDataType::HMM));
             sv_call.hmm_likelihood = likelihood;
             sv_call.genotype = genotype;
             sv_call.cn_state = max_state;
diff --git a/src/input_data.cpp b/src/input_data.cpp
index cd55f67e..3e7ad69d 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -34,6 +34,7 @@ InputData::InputData()
     this->save_cnv_data = false;
     this->single_chr = false;
     this->cnv_output_file = "";
+    this->assembly_gaps = "";
 }
 
 void InputData::printParameters() const
@@ -142,6 +143,34 @@ void InputData::setEthnicity(std::string ethnicity)
     this->ethnicity = ethnicity;
 }
 
+void InputData::setAssemblyGaps(std::string filepath)
+{
+    // Check if the file exists
+    FILE *fp = fopen(filepath.c_str(), "r");
+    if (fp == NULL)
+    {
+        std::cerr << "Assembly gaps file does not exist: " << filepath << std::endl;
+        exit(1);
+    }
+
+    // Check if the file is a BED file
+    std::string ext = filepath.substr(filepath.find_last_of(".") + 1);
+    if (ext != "bed")
+    {
+        std::cerr << "Assembly gaps file is not a BED file: " << filepath << std::endl;
+        exit(1);
+    }
+    fclose(fp);
+
+    // Set the assembly gaps file
+    this->assembly_gaps = filepath;
+}
+
+std::string InputData::getAssemblyGaps() const
+{
+    return this->assembly_gaps;
+}
+
 uint32_t InputData::getMinCNVLength() const
 {
     return this->min_cnv_length;
diff --git a/src/main.cpp b/src/main.cpp
index 5ba0fb1b..f7f5db0a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -88,6 +88,9 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (args.find("pfb-file") != args.end()) {
         input_data.setAlleleFreqFilepaths(args.at("pfb-file"));
     }
+    if (args.find("assembly-gaps") != args.end()) {
+        input_data.setAssemblyGaps(args.at("assembly-gaps"));
+    }
     if (args.find("save-cnv") != args.end()) {
         input_data.saveCNVData(true);
     }
@@ -180,6 +183,8 @@ std::unordered_map<std::string, std::string> parseArguments(int argc, char* argv
             args["eth"] = argv[++i];
         } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) {
             args["pfb-file"] = argv[++i];
+        } else if (arg == "--assembly-gaps" && i + 1 < argc) {
+            args["assembly-gaps"] = argv[++i];
         } else if (arg == "--save-cnv") {
             args["save-cnv"] = "true";
         } else if (arg == "--debug") {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index c08c3c32..b1e4be60 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -21,6 +21,7 @@
 #include <condition_variable>
 #include <bitset>
 #include <unordered_set>
+#include <sstream>
 
 #include "ThreadPool.h"
 #include "utils.h"
@@ -379,13 +380,13 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             // SV
             std::vector<int> primary_positions;
             int primary_cluster_size = 0;
-            bool primary_start = false;
+            // bool primary_start = false;
             bool primary_end = false;
             if (!primary_start_cluster.empty()) {
                 std::sort(primary_start_cluster.begin(), primary_start_cluster.end());
                 primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]);
                 primary_cluster_size = primary_start_cluster.size();
-                primary_start = true;
+                // primary_start = true;
             }
 
             if (!primary_end_cluster.empty()) {
@@ -397,14 +398,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
             // Get the supplementary alignment positions
             std::vector<int> supp_positions;
-            bool supp_start = false;
+            // bool supp_start = false;
             bool supp_end = false;
             int supp_cluster_size = 0;
             if (!supp_start_cluster.empty()) {
                 std::sort(supp_start_cluster.begin(), supp_start_cluster.end());
                 supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]);
                 supp_cluster_size = supp_start_cluster.size();
-                supp_start = true;
+                // supp_start = true;
             }
             if (!supp_end_cluster.empty()) {
                 std::sort(supp_end_cluster.begin(), supp_end_cluster.end());
@@ -422,7 +423,10 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
                 // Use 50bp as the minimum length for an inversion
                 if (sv_length >= 50 && sv_length <= max_length) {
-                    SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
+                    SVEvidenceFlags aln_type;
+                    aln_type.set(static_cast<size_t>(SVDataType::SUPPINV));
+                    SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
+                    // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
                     addSVCall(chr_sv_calls, sv_candidate);
                 }
             }
@@ -480,17 +484,24 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     // }
                     split_candidate_sv = true;
                 }
+                SVEvidenceFlags aln_type;
+                aln_type.set(static_cast<size_t>(SVDataType::SPLITDIST1));
                 if (split_candidate_sv) {
                     int aln_offset = static_cast<int>(ref_distance - read_distance);
                     if (read_distance > ref_distance  && read_distance >= min_length && read_distance <= max_length) {
                         // Add an insertion SV call at the 5'-most primary position
                         SVType sv_type = SVType::INS;
-                        SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                        SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                        // SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
                         // }
                     } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) {
                         // Add a deletion SV call at the primary positions
-                        SVType sv_type = SVType::DEL;
+                        // SVType sv_type = SVType::DEL;
+
+                        // Set it to unknown, SV type will be determined by the
+                        // HMM prediction
+                        SVType sv_type = SVType::UNKNOWN;
 
                         // if (print_debug) {
                         //     printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size));
@@ -499,9 +510,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // Add a dummy SV call before and after the start
                         // position for HMM predictions
                         // SVType sv_type = SVType::UNKNOWN;
-                        SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
+                        // SVCall sv_candidate(sv_start, sv_start +
+                        // (ref_distance-1), sv_type, getSVTypeSymbol(sv_type),
+                        // SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0,
+                        // aln_offset, primary_cluster_size);
+                        SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
-                        // SVCall sv_candidate2(sv_start + (ref_distance-1), sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                     }
                 }
             }
@@ -517,7 +531,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     int sv_length = sv_end - sv_start + 1;
                     if (sv_length >= min_length && sv_length <= max_length) {
                         // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size));
-                        SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
+                        // SVCall sv_candidate(sv_start, sv_end, sv_type, alt,
+                        // SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0,
+                        // cluster_size);
+                        SVEvidenceFlags aln_type;
+                        aln_type.set(static_cast<size_t>(SVDataType::SPLIT));
+                        SVCall sv_candidate(sv_start, sv_end, sv_type, alt, aln_type, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
                     }
                 }
@@ -629,7 +648,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 0, 0);
+                SVEvidenceFlags aln_type;
+                aln_type.set(static_cast<size_t>(SVDataType::CIGARINS));
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, aln_type, Genotype::UNKNOWN, default_lh, 0, 0, 0);
+                // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 0, 0);
                 cigar_sv_calls.emplace_back(sv_call);
             
             // Process clipped bases as potential insertions
@@ -661,7 +683,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
                 if (op_len <= 50) {
                     alt_allele = ins_seq_str;
                 }
-                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0, 0);
+                SVEvidenceFlags aln_type;
+                aln_type.set(static_cast<size_t>(SVDataType::CIGARCLIP));
+                SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, aln_type, Genotype::UNKNOWN, default_lh, 0, 0, 0);
+                // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0, 0);
                 cigar_sv_calls.emplace_back(sv_call);
 
             // Check if the CIGAR operation is a deletion
@@ -669,7 +694,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec
 
                 ref_pos = pos+1;
                 ref_end = ref_pos + op_len -1;
-                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 0, 0);
+                SVEvidenceFlags aln_type;
+                aln_type.set(static_cast<size_t>(SVDataType::CIGARDEL));
+                SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), aln_type, Genotype::UNKNOWN, default_lh, 0, 0, 0);
+                // SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 0, 0);
                 cigar_sv_calls.emplace_back(sv_call);
             }
         }
@@ -965,6 +993,38 @@ void SVCaller::run(const InputData& input_data)
         }
     }
 
+    // Merge any duplicate SV calls from the CIGAR and split-read
+    // detections (same start positions)
+    printMessage("Merging CIGAR and split read SV calls...");
+    for (auto& entry : whole_genome_sv_calls) {
+        std::vector<SVCall>& sv_calls = entry.second;
+        // mergeDuplicateSVs(sv_calls);
+        // mergeSVs(sv_calls, 0.1, 2, false);
+
+        // [TEST 1] Keep noise and use the DBSCAN epsilon from the
+        // command line
+        // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, true);
+
+        // [TEST 2] Remove noise and use the DBSCAN epsilon from the
+        // command line (= really low recall, and low precision)
+        // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, false);
+
+        // [TEST 3] Remove noise and use a DBSCAN epsilon of 0.1 (low recall,
+        // higher precision)
+        // mergeSVs(sv_calls, 0.1, 2, false);
+
+        // [TEST 4] Keep noise and use a DBSCAN epsilon of 0.1 (slightly better
+        // recall)
+        // Using a more aggressive epsilon works better for the final merge
+        mergeSVs(sv_calls, 0.1, 2, true);
+
+        // [TEST 5] Keep noise and use a DBSCAN epsilon of 0.01 (1 more FP)
+        // mergeSVs(sv_calls, 0.01, 2, true);
+
+        // [TEST 6] do nothing (reduced precision, same recall as #4)
+        // continue;
+    }
+
     if (input_data.getSaveCNVData()) {
         closeJSON(json_fp);
     }
@@ -981,8 +1041,9 @@ void SVCaller::run(const InputData& input_data)
 
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
-    const std::string output_dir = input_data.getOutputDir();
-    this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map);
+    // const std::string output_dir = input_data.getOutputDir();
+    // this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map);
+    this->saveToVCF(whole_genome_sv_calls, input_data, ref_genome, chr_pos_depth_map);
 }
 
 void SVCaller::findOverlaps(const std::unique_ptr<IntervalNode> &root, const PrimaryAlignment &query, std::vector<std::string> &result)
@@ -1030,6 +1091,14 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
         Genotype genotype = std::get<2>(result);
         int cn_state = std::get<3>(result);
 
+        bool print_debug = false;
+        if (sv_candidate.start == 15287019) {
+        // if (true) {
+            print_debug = true;
+
+            printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
+        }
+
         // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type));
         
         // Update the SV type if the predicted type is not unknown
@@ -1039,11 +1108,16 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
             if (sv_candidate.sv_type == SVType::UNKNOWN && (supp_type == SVType::DEL || supp_type == SVType::DUP)) {
                 sv_candidate.sv_type = supp_type;
                 sv_candidate.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
-                sv_candidate.data_type = SVDataType::HMM;
+                sv_candidate.aln_type.set(static_cast<size_t>(SVDataType::HMM));
+                // sv_candidate.data_type = SVDataType::HMM;
                 sv_candidate.hmm_likelihood = supp_lh;
                 sv_candidate.genotype = genotype;
                 sv_candidate.cn_state = cn_state;
 
+                if (print_debug) {
+                    printMessage("DEBUG [1]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
+                }
+
             // For predictions with the same type, or LOH predictions, update the
             // prediction information
             } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) {
@@ -1051,16 +1125,25 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 sv_candidate.genotype = genotype;
                 sv_candidate.cn_state = cn_state;
 
+                if (print_debug) {
+                    printMessage("DEBUG [2]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
+                }
+
             // Add an additional SV call if the type is different
             } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) {
                 SVCall new_sv_call = sv_candidate;  // Copy the original SV call
                 new_sv_call.sv_type = supp_type;
                 new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
-                new_sv_call.data_type = SVDataType::HMM;
+                // new_sv_call.aln_type = SVDataType::HMM;
+                new_sv_call.aln_type.set(static_cast<size_t>(SVDataType::HMM));
                 new_sv_call.hmm_likelihood = supp_lh;
                 new_sv_call.genotype = genotype;
                 new_sv_call.cn_state = cn_state;
                 additional_calls.push_back(new_sv_call);
+
+                if (print_debug) {
+                    printMessage("DEBUG [3]: Adding additional SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
+                }
             }
         }
     }
@@ -1084,9 +1167,46 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     }
 }
 
-void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const
+// void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const
+void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData &input_data, const ReferenceGenome& ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const
 {
+    // Check if an assembly gap file was provided
+    std::string assembly_gap_file = input_data.getAssemblyGaps();
+    std::unordered_map<std::string, std::vector<std::pair<uint32_t, uint32_t>>> assembly_gaps;
+    if (!assembly_gap_file.empty()) {
+        std::cout << "Loading assembly gap file: " << assembly_gap_file << std::endl;
+        // Load the assembly gap file and process it
+        std::ifstream gap_stream(assembly_gap_file);
+        if (!gap_stream.is_open()) {
+            printError("Failed to open assembly gap file: " + assembly_gap_file);
+            return;
+        }
+        std::string line;
+        while (std::getline(gap_stream, line)) {
+            // Skip empty lines and comments
+            if (line.empty() || line[0] == '#') {
+                continue;
+            }
+
+            // Parse the line (assuming tab-separated values)
+            std::istringstream iss(line);
+            std::string chr;
+            uint32_t start, end;
+            if (!(iss >> chr >> start >> end)) {
+                printError("Failed to parse assembly gap file line: " + line);
+                continue;
+            }
+            // Add the assembly gap to the map
+            assembly_gaps[chr].emplace_back(start, end);
+            // Print the assembly gap information
+            // std::cout << "Assembly gap: " << chr << ":" << start << "-" << end << std::endl;
+        }
+        gap_stream.close();
+        std::cout << "Loaded " << assembly_gaps.size() << " assembly gaps." << std::endl;
+    }
+
     std::cout << "Creating VCF writer..." << std::endl;
+    std::string output_dir = input_data.getOutputDir();
     std::string output_vcf = output_dir + "/output.vcf";
     std::cout << "Writing VCF file to " << output_vcf << std::endl;
 	std::ofstream vcf_stream(output_vcf);
@@ -1163,7 +1283,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
     int total_count = 0;
     int unclassified_svs = 0;
     int filtered_svs = 0;
-    int assembly_gaps = 0;
+    int assembly_gap_filtered_svs = 0;
     for (const auto& pair : sv_calls) {
         std::string chr = pair.first;
         const std::vector<SVCall>& sv_calls = pair.second;
@@ -1176,7 +1296,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             std::string alt_allele = sv_call.alt_allele;
             SVType sv_type = sv_call.sv_type;
             std::string genotype = getGenotypeString(sv_call.genotype);
-            std::string data_type_str = getSVDataTypeString(sv_call.data_type);
+            std::string data_type_str = getSVAlignmentTypeString(sv_call.aln_type);
             double hmm_likelihood = sv_call.hmm_likelihood;
             int cluster_size = sv_call.cluster_size;
             std::string filter = "PASS";
@@ -1196,24 +1316,61 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
             // Deletion
             if (sv_type == SVType::DEL) {
+                // Check if the deletion is in an assembly gap (0-based)
+                if (assembly_gap_file != "") {
+                    bool in_assembly_gap = false;
+                    // if (assembly_gaps.find(chr) != assembly_gaps.end()) {
+                    auto it = assembly_gaps.find(chr);
+                    if (it != assembly_gaps.end()) {
+                        // Check if the deletion overlaps with any assembly gaps
+                        for (const auto& gap : assembly_gaps[chr]) {
+                            // Determine if the deletion overlaps with the
+                            // assembly gap by greater than 50%
+                            uint32_t overlap_start = std::max(start, gap.first + 1);  // Convert to 1-based
+                            uint32_t overlap_end = std::min(end, gap.second + 1);  // Convert to 1-based
+                            if (overlap_start <= overlap_end) {
+                                // Calculate the overlap length
+                                uint32_t overlap_length = overlap_end - overlap_start + 1;
+                                // Calculate the percentage of overlap
+                                double overlap_pct = static_cast<double>(overlap_length) / static_cast<double>(sv_length);
+                                if (overlap_pct > 0.2) {
+                                    in_assembly_gap = true;
+                                    break;
+                                }
+                            }
+                            // double overlap = 0.0;
+                            // uint32_t gap_start = gap.first + 1;  // Convert to 1-based
+                            // uint32_t gap_end = gap.second + 1;  // Convert to 1-based
+                            // overlap = static_cast<double>(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast<double>(sv_length);
+                            // if (overlap > 0.2) {
+                            //     std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl;
+                            //     in_assembly_gap = true;
+                            //     break;
+                            // }
+                        }
+                        if (in_assembly_gap) {
+                            filter = "AssemblyGap";
+                            assembly_gap_filtered_svs += 1;
+                        }
+                    }
+                }
+
                 // Get the deleted sequence from the reference genome, also including the preceding base
                 uint32_t preceding_pos = (uint32_t) std::max(1, static_cast<int>(start)-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, end);
 
-
-
                 // Use the preceding base as the alternate allele 
                 if (ref_allele != "") {
                     // If the sequence is >90% N, skip the SV call (assembly
                     // gap)
-                    int allele_length_90pct = static_cast<int>(ref_allele.size() * 0.9);
-                    if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) {
-                        assembly_gaps += 1;
-                        // continue;
+                    // int allele_length_90pct = static_cast<int>(ref_allele.size() * 0.9);
+                    // if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) {
+                    //     assembly_gaps += 1;
+                    //     // continue;
 
-                        // Don't skip but set the filter to assembly gap
-                        filter = "AssemblyGap";
-                    }
+                    //     // Don't skip but set the filter to assembly gap
+                    //     filter = "AssemblyGap";
+                    // }
 
                     // The alt allele is the preceding base, and the reference
                     // allele is the deleted sequence including the preceding base
@@ -1306,7 +1463,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
         std::cout << "Total unclassified SVs: " << unclassified_svs << std::endl;
     }
     printMessage("Total PASS filtered SVs: " + std::to_string(filtered_svs));
-    printMessage("Total filtered assembly gaps: " + std::to_string(assembly_gaps));
+    printMessage("Total filtered assembly gaps: " + std::to_string(assembly_gap_filtered_svs));
 }
 
 int SVCaller::getReadDepth(const std::vector<uint32_t>& pos_depth_map, uint32_t start) const
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index ddc0cf4b..2621b785 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -20,7 +20,7 @@ void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call)
 {
     // Check if the SV call is valid
     if (sv_call.start > sv_call.end) {
-        printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type));
+        printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVAlignmentTypeString(sv_call.aln_type));
         return;
     }
 
@@ -82,7 +82,9 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         if (sv_type == SVType::INS) {
             // Add only non-CIGARCLIP SVs to the cluster map
             for (size_t i = 0; i < clusters.size(); ++i) {
-                if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) {
+                // if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) {
+                // Use the SVEvidenceFlags to check for CIGARCLIP
+                if (!sv_type_calls[i].aln_type.test(static_cast<size_t>(SVDataType::CIGARCLIP))) {
                     cluster_map[clusters[i]].push_back(sv_type_calls[i]);
                 }
             }
@@ -175,7 +177,7 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 cluster_count++;
             }
         }
-        printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type));
+        printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs");
     }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
     int updated_size = sv_calls.size();

From 0acbf130daf2125b380e3a83f0ccb004b780cab4 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 16 May 2025 19:34:38 -0400
Subject: [PATCH 108/134] improve split sv merging

---
 src/sv_caller.cpp | 27 ++++-----------------
 src/sv_object.cpp | 60 ++++++++++++++++++++++++++++++++---------------
 2 files changed, 46 insertions(+), 41 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index b1e4be60..3e9f2ddc 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -514,6 +514,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // (ref_distance-1), sv_type, getSVTypeSymbol(sv_type),
                         // SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0,
                         // aln_offset, primary_cluster_size);
+                        // printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size) + " and 5p-most is " + std::to_string(primary_5p_most) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance));
                         SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
                     }
@@ -527,7 +528,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             for (int primary_pos : primary_positions) {
                 for (int supp_pos : supp_positions) {
                     int sv_start = std::min(primary_pos, supp_pos);
-                    int sv_end = std::max(primary_pos, supp_pos);
+                    int sv_end = std::max(primary_pos, supp_pos) - 1;
                     int sv_length = sv_end - sv_start + 1;
                     if (sv_length >= min_length && sv_length <= max_length) {
                         // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size));
@@ -550,7 +551,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             return a.start < b.start || (a.start == b.start && a.end < b.end);
         });
         
-        // Merge duplicate SV calls with identical start positions
+        // Merge duplicate SV calls with identical start and end positions, and sum the
+        // cluster sizes
         mergeDuplicateSVs(chr_sv_calls);
         sv_calls[chr_name] = std::move(chr_sv_calls);
 
@@ -868,27 +870,7 @@ void SVCaller::run(const InputData& input_data)
     		valid_chr.push_back(chr);
 	}
 	chromosomes = valid_chr;
-    	/*
-        try {
-            if (chr_mean_cov_map.at(chr) == 0.0) {
-                printMessage("Chromosome " + chr + " has no reads");
-            }
-        } catch (const std::out_of_range& e) {
-            printError("Chromosome " + chr + " not found in mean coverage map: " + std::string(e.what()));
-        }*/
-        /*
-        // Check if the chromosome has no reads
-        if (chr_mean_cov_map[chr] == 0.0) {
-            null_chr.push_back(chr);
-        }
-        */
     }
-	/*
-    printMessage("Removing " + std::to_string(null_chr.size()) + " chromosomes with no reads...");
-    for (const auto& chr : null_chr) {
-        printMessage("Removing chromosome " + chr + " with no reads...");
-        chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end());
-    }*/
     std::unordered_map<std::string, std::vector<SVCall>> whole_genome_sv_calls;
     int current_chr = 0;
     int total_chr_count = chromosomes.size();
@@ -1017,6 +999,7 @@ void SVCaller::run(const InputData& input_data)
         // recall)
         // Using a more aggressive epsilon works better for the final merge
         mergeSVs(sv_calls, 0.1, 2, true);
+        // continue;
 
         // [TEST 5] Keep noise and use a DBSCAN epsilon of 0.01 (1 more FP)
         // mergeSVs(sv_calls, 0.01, 2, true);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 2621b785..1fe60b31 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -136,13 +136,21 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 
                 SVCall merged_sv_call = cluster_sv_calls[0];
                 if (has_nonzero_likelihood) {
-                    // These are detected from split reads, choose the one with
-                    // the highest non-zero likelihood normalized by the length of the SV
+                    // // These are detected from split reads, choose the one with
+                    // // the highest non-zero likelihood normalized by the length of the SV
+                    // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                    //     return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1));
+                    // });
+
+                    // // Obtain the highest non-zero likelihood
+                    // auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
+                    //     return sv_call.hmm_likelihood != 0.0;
+                    // });
+
+                    // Choose the SV with the highest cluster size of all SVs with non-zero likelihood
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                        return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1));
+                        return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood);
                     });
-
-                    // Obtain the highest non-zero likelihood
                     auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
                         return sv_call.hmm_likelihood != 0.0;
                     });
@@ -195,24 +203,38 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
     });
     for (size_t i = 0; i < sv_calls.size(); i++) {
         SVCall& sv_call = sv_calls[i];
-        // For SVs at the same start position with the same SV type, keep the one
-        // with the highest likelihood
-        if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) {
-            // Keep the SV call with a non-zero likelihood
-            // The HMM prediction is more reliable than the split read prediction
-            if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
-                combined_sv_calls.back() = sv_call;
-            }
 
-            // If the likelihoods are equal, keep the one with the larger cluster size
-            // This is to ensure that the SV call with more supporting reads is
-            // kept
-            else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) {
-                combined_sv_calls.back() = sv_call;
-            }
+        // Merge cluster sizes if start and end positions are the same
+        if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) {
+            // Combine the cluster sizes
+            sv_call.cluster_size += sv_calls[i - 1].cluster_size;
+            combined_sv_calls.back() = sv_call;
         } else {
             combined_sv_calls.push_back(sv_call);
         }
+        // SVCall& sv_call = sv_calls[i];
+        // // For SVs at the same start position with the same SV type, keep the one
+        // // with the highest likelihood
+        // if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) {
+        //     // Keep the SV call with a non-zero likelihood
+        //     // The HMM prediction is more reliable than the split read prediction
+        //     if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
+        //         // Combine the cluster sizes
+        //         sv_call.cluster_size += sv_calls[i - 1].cluster_size;
+        //         combined_sv_calls.back() = sv_call;
+        //     }
+
+        //     // If the likelihoods are equal, keep the one with the larger cluster size
+        //     // This is to ensure that the SV call with more supporting reads is
+        //     // kept
+        //     else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) {
+        //         // Combine the cluster sizes
+        //         sv_call.cluster_size += sv_calls[i - 1].cluster_size;
+        //         combined_sv_calls.back() = sv_call;
+        //     }
+        // } else {
+        //     combined_sv_calls.push_back(sv_call);
+        // }
     }
     int merge_count = initial_size - combined_sv_calls.size();
     sv_calls = std::move(combined_sv_calls); // Replace with filtered list

From 5affdd3edeef080d15c88bd33fb2133c6d43f141 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 16 May 2025 23:00:57 -0400
Subject: [PATCH 109/134] fix dup prediction error

---
 src/cnv_caller.cpp | 29 ++++++++++++++++++++++++++++-
 src/khmm.cpp       | 10 ++++++++++
 src/main.cpp       | 13 +++++++++----
 src/sv_caller.cpp  | 16 +++++++++++-----
 4 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 60dca02f..52c6ea67 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -173,6 +173,13 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
     }
 
+    // bool print_debug = (start_pos == 62971016 || start_pos == 62971017);
+    bool print_debug = false;
+    if (print_debug)
+    {
+        printMessage("DEBUG: Running copy number prediction for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+    }
+
     // Run the Viterbi algorithm on SNPs in the SV region
     // Only extend the region if "save CNV data" is enabled
     SNPData before_sv;
@@ -200,6 +207,20 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     SNPData snp_data;
     querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
+    if (print_debug)
+    {
+        printMessage("DEBUG: SNP data size: " + std::to_string(snp_data.pos.size()));
+        printMessage("DEBUG: SNP data baf size: " + std::to_string(snp_data.baf.size()));
+        printMessage("DEBUG: SNP data pfb size: " + std::to_string(snp_data.pfb.size()));
+        printMessage("DEBUG: SNP data log2_cov size: " + std::to_string(snp_data.log2_cov.size()));
+        printMessage("DEBUG: mean_chr_cov: " + std::to_string(mean_chr_cov));
+        // Print all log2_cov values
+        for (size_t i = 0; i < snp_data.log2_cov.size(); i++)
+        {
+            printMessage("DEBUG: SNP data log2_cov[" + std::to_string(i) + "]: " + std::to_string(snp_data.log2_cov[i]));
+        }
+    }
+
     // Run the Viterbi algorithm
     std::pair<std::vector<int>, double> prediction;
     runViterbi(hmm, snp_data, prediction);
@@ -219,6 +240,11 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     std::vector<int> state_counts(6, 0);
     for (int state : state_sequence)
     {
+        if (print_debug)
+        {
+            printMessage("DEBUG: State: " + std::to_string(state));
+        }
+
         // Skip state 3 (normal state)
         if (state != 3)
         {
@@ -245,8 +271,9 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     }
 
     // Save the SV calls if enabled
+    uint32_t min_length = 30000;
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
-    if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 50000)
+    if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) >= min_length)
     {
         // Set B-allele and population frequency values to 0 for non-SNPs
         for (size_t i = 0; i < snp_data.pos.size(); i++)
diff --git a/src/khmm.cpp b/src/khmm.cpp
index 43ed3958..7dfce96e 100644
--- a/src/khmm.cpp
+++ b/src/khmm.cpp
@@ -58,12 +58,22 @@ std::pair<std::vector<int>, double> testVit_CHMM(CHMM hmm, int T, std::vector<do
 double b1iot(int state, std::vector<double> mean, std::vector<double> sd, double uf, double o)
 {
 	// Get the values (0-based indexing)
+
+	// Fix within the expected normalized coverage range
 	if (o < mean[0])
 	{
 		o = mean[0];
+	} else if (o > mean[5])
+	{
+		o = mean[5];
 	}
+
 	double p = uf + ((1 - uf) * pdf_normal(o, mean[state-1], sd[state-1]));
 
+	// Print the equation and the result
+	// printMessage("b1iot: state = " + std::to_string(state) + ", mean = " + std::to_string(mean[state-1]) + ", sd = " + std::to_string(sd[state-1]) + ", uf = " + std::to_string(uf) + ", o = " + std::to_string(o) + ", p = " + std::to_string(p));
+	// printMessage("Equation: b1iot = uf + ((1 - uf) * pdf_normal(o, mean[state-1], sd[state-1]))");
+
 	return log(p);
 }
 
diff --git a/src/main.cpp b/src/main.cpp
index f7f5db0a..4755a0e4 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -111,11 +111,16 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
     if (input_data.getSaveCNVData()) {
         const std::string output_dir = input_data.getOutputDir();
         std::string json_filepath = output_dir + "/CNVCalls.json";
-        int json_file_count = 1;
-        while (fileExists(json_filepath)) {
-            json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json";
-            json_file_count++;
+
+        // Remove the old JSON file if it exists
+        if (fileExists(json_filepath)) {
+            remove(json_filepath.c_str());
         }
+        // int json_file_count = 1;
+        // while (fileExists(json_filepath)) {
+        //     json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json";
+        //     json_file_count++;
+        // }
         input_data.setCNVOutputFile(json_filepath);
         std::cout << "Saving CNV data to: " << json_filepath << std::endl;
     }
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3e9f2ddc..bf3936b9 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -1068,6 +1068,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
 {
     std::vector<SVCall> additional_calls;
     for (auto& sv_candidate : split_sv_calls) {
+
+        // [TEST] Skip the SV start is not 62971016 or 62971017
+        // if (sv_candidate.start != 62971016 && sv_candidate.start != 62971017) {
+        //     continue;
+        // }
+
         std::tuple<double, SVType, Genotype, int> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
@@ -1075,12 +1081,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
         int cn_state = std::get<3>(result);
 
         bool print_debug = false;
-        if (sv_candidate.start == 15287019) {
-        // if (true) {
-            print_debug = true;
+        // if (sv_candidate.start == 15287019) {
+        // // if (true) {
+        //     print_debug = true;
 
-            printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
-        }
+        //     printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
+        // }
 
         // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type));
         

From 34f8a9a80ee8eba5fa597e07745a23eddba0fb4a Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Sat, 17 May 2025 14:20:28 -0400
Subject: [PATCH 110/134] reduce cnv false positives

---
 src/sv_caller.cpp | 53 +++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 44 insertions(+), 9 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index bf3936b9..55c2a663 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -1110,6 +1110,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
             // For predictions with the same type, or LOH predictions, update the
             // prediction information
             } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) {
+                sv_candidate.aln_type.set(static_cast<size_t>(SVDataType::HMM));
                 sv_candidate.hmm_likelihood = supp_lh;
                 sv_candidate.genotype = genotype;
                 sv_candidate.cn_state = cn_state;
@@ -1120,15 +1121,49 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
 
             // Add an additional SV call if the type is different
             } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) {
-                SVCall new_sv_call = sv_candidate;  // Copy the original SV call
-                new_sv_call.sv_type = supp_type;
-                new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
-                // new_sv_call.aln_type = SVDataType::HMM;
-                new_sv_call.aln_type.set(static_cast<size_t>(SVDataType::HMM));
-                new_sv_call.hmm_likelihood = supp_lh;
-                new_sv_call.genotype = genotype;
-                new_sv_call.cn_state = cn_state;
-                additional_calls.push_back(new_sv_call);
+                // For inversions, just update the alignment type, copy number
+                // state, and HMM likelihood. Coverage changes for these may be
+                // predicted as CNVs
+                if (sv_candidate.sv_type == SVType::INV) {
+                    sv_candidate.aln_type.set(static_cast<size_t>(SVDataType::HMM));
+                    sv_candidate.hmm_likelihood = supp_lh;
+                    sv_candidate.genotype = genotype;
+                    sv_candidate.cn_state = cn_state;
+                // For insertions predicted as duplications, update all information
+                } else if (sv_candidate.sv_type == SVType::INS && supp_type == SVType::DUP) {
+                    sv_candidate.sv_type = supp_type;
+                    sv_candidate.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
+                    sv_candidate.aln_type.set(static_cast<size_t>(SVDataType::HMM));
+                    sv_candidate.hmm_likelihood = supp_lh;
+                    sv_candidate.genotype = genotype;
+                    sv_candidate.cn_state = cn_state;
+                } else {
+                    // Add a new SV call with the conflicting type
+                    SVCall new_sv_call = sv_candidate;  // Copy the original SV call
+                    new_sv_call.sv_type = supp_type;
+                    new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
+                    new_sv_call.aln_type.set(static_cast<size_t>(SVDataType::HMM));
+                    new_sv_call.hmm_likelihood = supp_lh;
+                    new_sv_call.genotype = genotype;
+                    new_sv_call.cn_state = cn_state;
+                    additional_calls.push_back(new_sv_call);
+                }
+                // SVCall new_sv_call = sv_candidate;  // Copy the original SV call
+                // // new_sv_call.sv_type = supp_type;
+
+                // // Update the SV type unless the current type is inversion
+                // if (sv_candidate.sv_type != SVType::INV) {
+                //     new_sv_call.sv_type = supp_type;
+                //     new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
+                //     new_sv_call.genotype = genotype;
+                // }
+
+                // // new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
+                // new_sv_call.aln_type.set(static_cast<size_t>(SVDataType::HMM));
+                // new_sv_call.hmm_likelihood = supp_lh;
+                // // new_sv_call.genotype = genotype;
+                // new_sv_call.cn_state = cn_state;
+                // additional_calls.push_back(new_sv_call);
 
                 if (print_debug) {
                     printMessage("DEBUG [3]: Adding additional SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));

From 7948062a5269075219cf01b343d892705bee4484 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 19 May 2025 10:35:04 -0400
Subject: [PATCH 111/134] assembly gaps

---
 src/cnv_caller.cpp | 75 ++++++++++++++++++++++++++++++----------------
 src/sv_caller.cpp  | 74 ++++++++++++++++++++++-----------------------
 2 files changed, 87 insertions(+), 62 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 52c6ea67..3bc2016e 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -235,46 +235,71 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     // Determine if there is a majority state within the SV region
     int max_state = 0;
     int max_count = 0;
-    int non_normal_count = 0;
-
-    std::vector<int> state_counts(6, 0);
-    for (int state : state_sequence)
+    for (int i = 0; i < 6; i++)
     {
-        if (print_debug)
+        int state_count = std::count(state_sequence.begin(), state_sequence.end(), i+1);
+        if (state_count > max_count)
         {
-            printMessage("DEBUG: State: " + std::to_string(state));
+            max_state = i+1;
+            max_count = state_count;
         }
+    }
 
-        // Skip state 3 (normal state)
-        if (state != 3)
-        {
-            state_counts[state - 1]++;
-            non_normal_count++;
-        }
+    // If there is no majority state, then set the state to unknown
+    double pct_threshold = 0.50;
+    int state_count = (int) state_sequence.size();
+    if ((double) max_count / (double) state_count < pct_threshold)
+    {
+        max_state = 0;
     }
+    Genotype genotype = getGenotypeFromCNState(max_state);
+    SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
+    // snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
+
+    
+    // int non_normal_count = 0;
+
+    // std::vector<int> state_counts(6, 0);
+    // for (int state : state_sequence)
+    // {
+    //     if (print_debug)
+    //     {
+    //         printMessage("DEBUG: State: " + std::to_string(state));
+    //     }
+
+    //     // Skip state 3 (normal state)
+    //     if (state != 3)
+    //     {
+    //         state_counts[state - 1]++;
+    //         non_normal_count++;
+    //     }
+    // }
 
     // Determine the maximum state and count
-    int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end()));
-    max_state = max_state_index + 1;
-    max_count = state_counts[max_state_index];
+    // int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end()));
+    // max_state = max_state_index + 1;
+    // max_count = state_counts[max_state_index];
 
     // Update SV type and genotype based on the majority state
     // SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
     // Genotype genotype = getGenotypeFromCNState(max_state);
-    SVType predicted_cnv_type = SVType::UNKNOWN;
-    Genotype genotype = Genotype::UNKNOWN;
-    if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5)
-    {
-        predicted_cnv_type = getSVTypeFromCNState(max_state);
-        genotype = getGenotypeFromCNState(max_state);
-        snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
-    }
+    // SVType predicted_cnv_type = SVType::UNKNOWN;
+    // Genotype genotype = Genotype::UNKNOWN;
+    // if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5)
+    // {
+    //     predicted_cnv_type = getSVTypeFromCNState(max_state);
+    //     genotype = getGenotypeFromCNState(max_state);
+    //     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
+    // }
 
     // Save the SV calls if enabled
     uint32_t min_length = 30000;
     bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL);
     if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) >= min_length)
     {
+        // Move the state sequence to the SNP data
+        snp_data.state_sequence = std::move(state_sequence);
+
         // Set B-allele and population frequency values to 0 for non-SNPs
         for (size_t i = 0; i < snp_data.pos.size(); i++)
         {
@@ -371,7 +396,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         }
 
         // Determine if there is a majority state within the SV region and if it
-        // is greater than 75%
+        // is greater than 50%
         int max_state = 0;
         int max_count = 0;
         for (int i = 0; i < 6; i++)
@@ -385,7 +410,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
         }
 
         // If there is no majority state, then set the state to unknown
-        double pct_threshold = 0.75;
+        double pct_threshold = 0.50;
         int state_count = (int) sv_states.size();
         if ((double) max_count / (double) state_count < pct_threshold)
         {
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 55c2a663..1238f660 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -449,7 +449,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 // Add an insertion SV call at the primary position
 
                 // bool print_debug = false;
-                bool print_debug = true;
+                // bool print_debug = true;
                 
                 // int sv_start = primary_positions[0];
                 // Use the 3'-most primary position as the start position
@@ -1338,47 +1338,47 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                 total_count += 1;
             }
 
-            // Deletion
-            if (sv_type == SVType::DEL) {
-                // Check if the deletion is in an assembly gap (0-based)
-                if (assembly_gap_file != "") {
-                    bool in_assembly_gap = false;
-                    // if (assembly_gaps.find(chr) != assembly_gaps.end()) {
-                    auto it = assembly_gaps.find(chr);
-                    if (it != assembly_gaps.end()) {
-                        // Check if the deletion overlaps with any assembly gaps
-                        for (const auto& gap : assembly_gaps[chr]) {
-                            // Determine if the deletion overlaps with the
-                            // assembly gap by greater than 50%
-                            uint32_t overlap_start = std::max(start, gap.first + 1);  // Convert to 1-based
-                            uint32_t overlap_end = std::min(end, gap.second + 1);  // Convert to 1-based
-                            if (overlap_start <= overlap_end) {
-                                // Calculate the overlap length
-                                uint32_t overlap_length = overlap_end - overlap_start + 1;
-                                // Calculate the percentage of overlap
-                                double overlap_pct = static_cast<double>(overlap_length) / static_cast<double>(sv_length);
-                                if (overlap_pct > 0.2) {
-                                    in_assembly_gap = true;
-                                    break;
-                                }
+            // Check if the SV is in an assembly gap (0-based)
+            if (assembly_gap_file != "") {
+                bool in_assembly_gap = false;
+                // if (assembly_gaps.find(chr) != assembly_gaps.end()) {
+                auto it = assembly_gaps.find(chr);
+                if (it != assembly_gaps.end()) {
+                    // Check if the deletion overlaps with any assembly gaps
+                    for (const auto& gap : assembly_gaps[chr]) {
+                        // Determine if the deletion overlaps with the
+                        // assembly gap by greater than 50%
+                        uint32_t overlap_start = std::max(start, gap.first + 1);  // Convert to 1-based
+                        uint32_t overlap_end = std::min(end, gap.second + 1);  // Convert to 1-based
+                        if (overlap_start <= overlap_end) {
+                            // Calculate the overlap length
+                            uint32_t overlap_length = overlap_end - overlap_start + 1;
+                            // Calculate the percentage of overlap
+                            double overlap_pct = static_cast<double>(overlap_length) / static_cast<double>(sv_length);
+                            if (overlap_pct > 0.2) {
+                                in_assembly_gap = true;
+                                break;
                             }
-                            // double overlap = 0.0;
-                            // uint32_t gap_start = gap.first + 1;  // Convert to 1-based
-                            // uint32_t gap_end = gap.second + 1;  // Convert to 1-based
-                            // overlap = static_cast<double>(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast<double>(sv_length);
-                            // if (overlap > 0.2) {
-                            //     std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl;
-                            //     in_assembly_gap = true;
-                            //     break;
-                            // }
-                        }
-                        if (in_assembly_gap) {
-                            filter = "AssemblyGap";
-                            assembly_gap_filtered_svs += 1;
                         }
+                        // double overlap = 0.0;
+                        // uint32_t gap_start = gap.first + 1;  // Convert to 1-based
+                        // uint32_t gap_end = gap.second + 1;  // Convert to 1-based
+                        // overlap = static_cast<double>(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast<double>(sv_length);
+                        // if (overlap > 0.2) {
+                        //     std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl;
+                        //     in_assembly_gap = true;
+                        //     break;
+                        // }
+                    }
+                    if (in_assembly_gap) {
+                        filter = "AssemblyGap";
+                        assembly_gap_filtered_svs += 1;
                     }
                 }
+            }
 
+            // Deletion
+            if (sv_type == SVType::DEL) {
                 // Get the deleted sequence from the reference genome, also including the preceding base
                 uint32_t preceding_pos = (uint32_t) std::max(1, static_cast<int>(start)-1);  // Make sure the position is not negative
                 ref_allele = ref_genome.query(chr, preceding_pos, end);

From 32e14eacd9b481b7663ebcebf778526124316ad2 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 19 May 2025 13:27:32 -0400
Subject: [PATCH 112/134] add inversion hmm prediction and improve merging

---
 src/cnv_caller.cpp | 39 +++++++++++++++++----------------------
 src/sv_caller.cpp  | 34 +++++-----------------------------
 src/sv_object.cpp  |  8 ++++++--
 3 files changed, 28 insertions(+), 53 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 3bc2016e..7f14aec1 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -173,13 +173,6 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
     }
 
-    // bool print_debug = (start_pos == 62971016 || start_pos == 62971017);
-    bool print_debug = false;
-    if (print_debug)
-    {
-        printMessage("DEBUG: Running copy number prediction for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
-    }
-
     // Run the Viterbi algorithm on SNPs in the SV region
     // Only extend the region if "save CNV data" is enabled
     SNPData before_sv;
@@ -207,20 +200,6 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     SNPData snp_data;
     querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
-    if (print_debug)
-    {
-        printMessage("DEBUG: SNP data size: " + std::to_string(snp_data.pos.size()));
-        printMessage("DEBUG: SNP data baf size: " + std::to_string(snp_data.baf.size()));
-        printMessage("DEBUG: SNP data pfb size: " + std::to_string(snp_data.pfb.size()));
-        printMessage("DEBUG: SNP data log2_cov size: " + std::to_string(snp_data.log2_cov.size()));
-        printMessage("DEBUG: mean_chr_cov: " + std::to_string(mean_chr_cov));
-        // Print all log2_cov values
-        for (size_t i = 0; i < snp_data.log2_cov.size(); i++)
-        {
-            printMessage("DEBUG: SNP data log2_cov[" + std::to_string(i) + "]: " + std::to_string(snp_data.log2_cov[i]));
-        }
-    }
-
     // Run the Viterbi algorithm
     std::pair<std::vector<int>, double> prediction;
     runViterbi(hmm, snp_data, prediction);
@@ -245,6 +224,13 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         }
     }
 
+    bool print_debug = false;
+    if (start_pos == 70955983) // || start_pos == 70955984)
+    {
+        print_debug = true;
+        printMessage("Max state for " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " is " + std::to_string(max_state) + " with count " + std::to_string(max_count) + " of " + std::to_string(state_sequence.size()));
+    }
+
     // If there is no majority state, then set the state to unknown
     double pct_threshold = 0.50;
     int state_count = (int) state_sequence.size();
@@ -252,11 +238,20 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
     {
         max_state = 0;
     }
+
+    if (print_debug)
+    {
+        printMessage("Pct max count: " + std::to_string((double) max_count / (double) state_count));
+    }
+
     Genotype genotype = getGenotypeFromCNState(max_state);
     SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
     // snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
 
-    
+    if (print_debug)
+    {
+        printMessage("Predicted CNV type: " + getSVTypeString(predicted_cnv_type) + " with genotype " + getGenotypeString(genotype) + " and likelihood " + std::to_string(likelihood));
+    }
     // int non_normal_count = 0;
 
     // std::vector<int> state_counts(6, 0);
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 1238f660..3ba9628b 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -799,7 +799,8 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     bam_hdr_destroy(bamHdr);
 
     printMessage(chr + ": Merging CIGAR...");
-    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
+    // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
+    mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false);
 
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string");
@@ -980,32 +981,7 @@ void SVCaller::run(const InputData& input_data)
     printMessage("Merging CIGAR and split read SV calls...");
     for (auto& entry : whole_genome_sv_calls) {
         std::vector<SVCall>& sv_calls = entry.second;
-        // mergeDuplicateSVs(sv_calls);
-        // mergeSVs(sv_calls, 0.1, 2, false);
-
-        // [TEST 1] Keep noise and use the DBSCAN epsilon from the
-        // command line
-        // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, true);
-
-        // [TEST 2] Remove noise and use the DBSCAN epsilon from the
-        // command line (= really low recall, and low precision)
-        // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, false);
-
-        // [TEST 3] Remove noise and use a DBSCAN epsilon of 0.1 (low recall,
-        // higher precision)
-        // mergeSVs(sv_calls, 0.1, 2, false);
-
-        // [TEST 4] Keep noise and use a DBSCAN epsilon of 0.1 (slightly better
-        // recall)
-        // Using a more aggressive epsilon works better for the final merge
-        mergeSVs(sv_calls, 0.1, 2, true);
-        // continue;
-
-        // [TEST 5] Keep noise and use a DBSCAN epsilon of 0.01 (1 more FP)
-        // mergeSVs(sv_calls, 0.01, 2, true);
-
-        // [TEST 6] do nothing (reduced precision, same recall as #4)
-        // continue;
+        // mergeSVs(sv_calls, 0.1, 2, true);
     }
 
     if (input_data.getSaveCNVData()) {
@@ -1107,9 +1083,9 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                     printMessage("DEBUG [1]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
                 }
 
-            // For predictions with the same type, or LOH predictions, update the
+            // For predictions with the same type, or LOH, neutral predictions, update the
             // prediction information
-            } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) {
+            } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH || supp_type == SVType::NEUTRAL)) {
                 sv_candidate.aln_type.set(static_cast<size_t>(SVDataType::HMM));
                 sv_candidate.hmm_likelihood = supp_lh;
                 sv_candidate.genotype = genotype;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 1fe60b31..5a80e39d 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -147,10 +147,14 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                     //     return sv_call.hmm_likelihood != 0.0;
                     // });
 
-                    // Choose the SV with the highest cluster size of all SVs with non-zero likelihood
+                    // Choose the SV with the highest cluster size of all SVs
+                    // with non-zero likelihood (if equal, choose the larger SV)
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                        return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood);
+                        return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.end - a.start > b.end - b.start);
                     });
+                    // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+                    //     return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood);
+                    // });
                     auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
                         return sv_call.hmm_likelihood != 0.0;
                     });

From ab2743675334b49101ce6d51e951860ca35bbad4 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Mon, 19 May 2025 15:44:38 -0400
Subject: [PATCH 113/134] achieve highest recall for large svs

---
 src/sv_caller.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 3ba9628b..70d0e86e 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -965,7 +965,9 @@ void SVCaller::run(const InputData& input_data)
         int min_pts = 2;
         for (auto& entry : whole_genome_split_sv_calls) {
             std::vector<SVCall>& sv_calls = entry.second;
-            mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, true);
+            // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts,
+            // true);
+            mergeSVs(sv_calls, 0.1, min_pts, true);
         }
 
         printMessage("Unifying SVs...");
@@ -981,7 +983,7 @@ void SVCaller::run(const InputData& input_data)
     printMessage("Merging CIGAR and split read SV calls...");
     for (auto& entry : whole_genome_sv_calls) {
         std::vector<SVCall>& sv_calls = entry.second;
-        // mergeSVs(sv_calls, 0.1, 2, true);
+        mergeSVs(sv_calls, 0.1, 2, true);
     }
 
     if (input_data.getSaveCNVData()) {

From 940f61e86737dce87caa553bee7dfd9df3f0d6f5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Tue, 20 May 2025 21:27:26 -0400
Subject: [PATCH 114/134] revert merge

---
 src/sv_caller.cpp | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 70d0e86e..4c8aeee0 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -799,8 +799,9 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     bam_hdr_destroy(bamHdr);
 
     printMessage(chr + ": Merging CIGAR...");
-    // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
-    mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false);
+    mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
+    // mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false);
+    // mergeSVs(chr_sv_calls, 0.3, dbscan_min_pts, false);
 
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string");
@@ -962,12 +963,13 @@ void SVCaller::run(const InputData& input_data)
         }
 
         printMessage("Merging split-read SVs...");
-        int min_pts = 2;
+        // int min_pts = 2;
         for (auto& entry : whole_genome_split_sv_calls) {
             std::vector<SVCall>& sv_calls = entry.second;
             // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts,
             // true);
-            mergeSVs(sv_calls, 0.1, min_pts, true);
+            mergeSVs(sv_calls, 0.1, 2, true);
+            // mergeSVs(sv_calls, 0.3, min_pts, true);
         }
 
         printMessage("Unifying SVs...");
@@ -984,6 +986,7 @@ void SVCaller::run(const InputData& input_data)
     for (auto& entry : whole_genome_sv_calls) {
         std::vector<SVCall>& sv_calls = entry.second;
         mergeSVs(sv_calls, 0.1, 2, true);
+        // mergeSVs(sv_calls, 0.3, 2, true);
     }
 
     if (input_data.getSaveCNVData()) {

From 87be75f5eb9b51ef9af52a5f36378d85b92be4f6 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 21 May 2025 14:18:01 -0400
Subject: [PATCH 115/134] remove comments

---
 src/cnv_caller.cpp |  89 +-----------------------------
 src/sv_caller.cpp  | 131 +--------------------------------------------
 src/sv_object.cpp  |  34 ------------
 3 files changed, 3 insertions(+), 251 deletions(-)

diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index 7f14aec1..c7045955 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -74,7 +74,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end
 
     // Loop through evenly spaced positions in the region and get the log2 ratio
     double pos_step = static_cast<double>(end_pos - start_pos + 1) / static_cast<double>(sample_size);
-    // double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size;
     std::unordered_map<std::string, double> window_log2_map;
     for (int i = 0; i < sample_size; i++)
     {
@@ -224,13 +223,6 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         }
     }
 
-    bool print_debug = false;
-    if (start_pos == 70955983) // || start_pos == 70955984)
-    {
-        print_debug = true;
-        printMessage("Max state for " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " is " + std::to_string(max_state) + " with count " + std::to_string(max_count) + " of " + std::to_string(state_sequence.size()));
-    }
-
     // If there is no majority state, then set the state to unknown
     double pct_threshold = 0.50;
     int state_count = (int) state_sequence.size();
@@ -239,53 +231,8 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         max_state = 0;
     }
 
-    if (print_debug)
-    {
-        printMessage("Pct max count: " + std::to_string((double) max_count / (double) state_count));
-    }
-
     Genotype genotype = getGenotypeFromCNState(max_state);
     SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
-    // snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
-
-    if (print_debug)
-    {
-        printMessage("Predicted CNV type: " + getSVTypeString(predicted_cnv_type) + " with genotype " + getGenotypeString(genotype) + " and likelihood " + std::to_string(likelihood));
-    }
-    // int non_normal_count = 0;
-
-    // std::vector<int> state_counts(6, 0);
-    // for (int state : state_sequence)
-    // {
-    //     if (print_debug)
-    //     {
-    //         printMessage("DEBUG: State: " + std::to_string(state));
-    //     }
-
-    //     // Skip state 3 (normal state)
-    //     if (state != 3)
-    //     {
-    //         state_counts[state - 1]++;
-    //         non_normal_count++;
-    //     }
-    // }
-
-    // Determine the maximum state and count
-    // int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end()));
-    // max_state = max_state_index + 1;
-    // max_count = state_counts[max_state_index];
-
-    // Update SV type and genotype based on the majority state
-    // SVType predicted_cnv_type = getSVTypeFromCNState(max_state);
-    // Genotype genotype = getGenotypeFromCNState(max_state);
-    // SVType predicted_cnv_type = SVType::UNKNOWN;
-    // Genotype genotype = Genotype::UNKNOWN;
-    // if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5)
-    // {
-    //     predicted_cnv_type = getSVTypeFromCNState(max_state);
-    //     genotype = getGenotypeFromCNState(max_state);
-    //     snp_data.state_sequence = std::move(state_sequence);  // Move the state sequence to the SNP data
-    // }
 
     // Save the SV calls if enabled
     uint32_t min_length = 30000;
@@ -365,7 +312,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector<SVCall
 
         // Only extend the region if "save CNV data" is enabled
         SNPData snp_data;
-        // printMessage("Querying SNP region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data);
 
         // Run the Viterbi algorithm
@@ -462,7 +408,6 @@ std::vector<std::string> CNVCaller::splitRegionIntoChunks(std::string chr, uint3
 void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>& chromosomes, std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map, std::unordered_map<std::string, double>& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const
 {
     // Open the BAM file
-    // std::shared_lock<std::shared_mutex> lock(this->shared_mutex);  // Lock the BAM file
     printMessage("Opening BAM file: " + bam_filepath);
     samFile *bam_file = sam_open(bam_filepath.c_str(), "r");
     if (!bam_file)
@@ -531,7 +476,6 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
         if (pos_depth_map.size() != static_cast<size_t>(chr_length))
         {
             printError("ERROR: Chromosome length mismatch for " + chr + ": expected " + std::to_string(chr_length) + ", found " + std::to_string(pos_depth_map.size()) + ", resizing to " + std::to_string(chr_length));
-            // Resize the depth map to the length of the chromosome
             pos_depth_map.resize(chr_length, 0);
         }
         while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0)
@@ -580,35 +524,6 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector<std::string>&
         }
         hts_itr_destroy(bam_iter);
 
-        // You can parallelize the depth map calculation here but first close the
-        // BAM file and index
-        // Bam cleanup (delete guard if using this)
-        // bam_destroy1(bam_record);
-        // bam_hdr_destroy(bam_header);
-        // sam_close(bam_file);
-        // bam_index_destroy(bam_index);
-        // bam_record = nullptr;
-        // bam_header = nullptr;
-        // bam_file = nullptr;
-        // bam_index = nullptr;
-        
-        // // Parallel sum of the depth map
-        // uint64_t cum_depth = std::reduce(
-        //     std::execution::par,
-        //     pos_depth_map.begin(),
-        //     pos_depth_map.end(),
-        //     0ULL
-        // );
-
-        // // Parallel count of the non-zero depth positions
-        // uint32_t pos_count = std::count_if(
-        //     std::execution::par,
-        //     pos_depth_map.begin(),
-        //     pos_depth_map.end(),
-        //     [](uint32_t depth) { return depth > 0; }
-        // );
-
-        // Sum without parallelization
         uint64_t cum_depth = std::accumulate(pos_depth_map.begin(), pos_depth_map.end(), 0ULL);
         uint32_t pos_count = std::count_if(pos_depth_map.begin(), pos_depth_map.end(), [](uint32_t depth) { return depth > 0; });
 
@@ -701,7 +616,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
 
         // Check if the filepath uses the 'chr' prefix notations based on the
         // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz)
-        // chr_gnomad = chr;  // gnomAD data may or may not have the 'chr' prefix
         std::string chr_prefix = "chr";
         if (pfb_filepath.find(chr_prefix) == std::string::npos)
         {
@@ -745,7 +659,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui
         bcf_sr_set_threads(pfb_reader, thread_count);
     }
 
-    // Read the SNP data ----------------------------------------------
+    // Read the SNP data
+    
     // Set the region
     std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos);
     if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0)  //chr.c_str(), 0) < 0)
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index 4c8aeee0..cf39c5de 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -446,42 +446,20 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                 std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end());
                 ref_distance = ref_distance_cluster[ref_distance_cluster.size() / 2];
 
-                // Add an insertion SV call at the primary position
-
-                // bool print_debug = false;
-                // bool print_debug = true;
-                
-                // int sv_start = primary_positions[0];
                 // Use the 3'-most primary position as the start position
                 int sv_start;
                 bool split_candidate_sv = false;
                 if (primary_5p_most && primary_end) {
                     std::sort(primary_positions.begin(), primary_positions.end());
-                    // Supplementary alignment is downstream with the
-                    // insertion sequence, starting at the 3'-most
-                    // primary position
                     sv_start = primary_positions.back();
-
-                    // Print debug if SV start equals 223608935
-                    // if (sv_start == 223608936) {
-                    //     print_debug = true;
-                    // printMessage("DEBUG: SV start is" + std::to_string(sv_start) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance) + " and primary 5p_most is " + std::to_string(primary_5p_most) + " and primary positions are " + std::to_string(primary_positions[0]) + " and " + std::to_string(primary_positions.back()) + " and number of primary positions is " + std::to_string(primary_positions.size()) + " and start bool is " + std::to_string(primary_start) + " and end bool is " + std::to_string(primary_end));
-                    // }
                     split_candidate_sv = true;
                 } else if (!primary_5p_most && supp_end) {
                     
                     // Supplementary alignment is upstream with the
                     // insertion sequence, starting at the 5'-most
                     // primary position
-                    // sv_start = primary_positions.front();
                     std::sort(supp_positions.begin(), supp_positions.end());
                     sv_start = supp_positions.back();
-
-                    // Print debug if SV start equals 223608935
-                    // if (sv_start == 223608936) {
-                    //     print_debug = true;
-                    // printMessage("DEBUG: SV start is " + std::to_string(sv_start) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance) + " and primary 5p_most is " + std::to_string(primary_5p_most) + " and primary positions are " + std::to_string(primary_positions[0]) + " and " + std::to_string(primary_positions.back()) + " and number of primary positions is " + std::to_string(primary_positions.size()) + " and start bool is " + std::to_string(primary_start) + " and end bool is " + std::to_string(primary_end));
-                    // }
                     split_candidate_sv = true;
                 }
                 SVEvidenceFlags aln_type;
@@ -492,29 +470,13 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                         // Add an insertion SV call at the 5'-most primary position
                         SVType sv_type = SVType::INS;
                         SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
-                        // SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
                         // }
                     } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) {
-                        // Add a deletion SV call at the primary positions
-                        // SVType sv_type = SVType::DEL;
 
                         // Set it to unknown, SV type will be determined by the
                         // HMM prediction
                         SVType sv_type = SVType::UNKNOWN;
-
-                        // if (print_debug) {
-                        //     printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size));
-                        // }
-
-                        // Add a dummy SV call before and after the start
-                        // position for HMM predictions
-                        // SVType sv_type = SVType::UNKNOWN;
-                        // SVCall sv_candidate(sv_start, sv_start +
-                        // (ref_distance-1), sv_type, getSVTypeSymbol(sv_type),
-                        // SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0,
-                        // aln_offset, primary_cluster_size);
-                        // printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size) + " and 5p-most is " + std::to_string(primary_5p_most) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance));
                         SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size);
                         addSVCall(chr_sv_calls, sv_candidate);
                     }
@@ -531,10 +493,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
                     int sv_end = std::max(primary_pos, supp_pos) - 1;
                     int sv_length = sv_end - sv_start + 1;
                     if (sv_length >= min_length && sv_length <= max_length) {
-                        // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size));
-                        // SVCall sv_candidate(sv_start, sv_end, sv_type, alt,
-                        // SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0,
-                        // cluster_size);
                         SVEvidenceFlags aln_type;
                         aln_type.set(static_cast<size_t>(SVDataType::SPLIT));
                         SVCall sv_candidate(sv_start, sv_end, sv_type, alt, aln_type, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size);
@@ -546,7 +504,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
 
         // Combine SVs with identical start and end positions, and sum the cluster
         // sizes
-        // printMessage("Combining SVs with identical start positions");
         std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
             return a.start < b.start || (a.start == b.start && a.end < b.end);
         });
@@ -555,8 +512,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
         // cluster sizes
         mergeDuplicateSVs(chr_sv_calls);
         sv_calls[chr_name] = std::move(chr_sv_calls);
-
-        // Print the number of merged SV calls
         printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates");
     }
 
@@ -800,8 +755,6 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
 
     printMessage(chr + ": Merging CIGAR...");
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
-    // mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false);
-    // mergeSVs(chr_sv_calls, 0.3, dbscan_min_pts, false);
 
     int region_sv_count = getSVCount(chr_sv_calls);
     printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string");
@@ -963,13 +916,9 @@ void SVCaller::run(const InputData& input_data)
         }
 
         printMessage("Merging split-read SVs...");
-        // int min_pts = 2;
         for (auto& entry : whole_genome_split_sv_calls) {
             std::vector<SVCall>& sv_calls = entry.second;
-            // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts,
-            // true);
             mergeSVs(sv_calls, 0.1, 2, true);
-            // mergeSVs(sv_calls, 0.3, min_pts, true);
         }
 
         printMessage("Unifying SVs...");
@@ -986,7 +935,6 @@ void SVCaller::run(const InputData& input_data)
     for (auto& entry : whole_genome_sv_calls) {
         std::vector<SVCall>& sv_calls = entry.second;
         mergeSVs(sv_calls, 0.1, 2, true);
-        // mergeSVs(sv_calls, 0.3, 2, true);
     }
 
     if (input_data.getSaveCNVData()) {
@@ -1050,27 +998,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     std::vector<SVCall> additional_calls;
     for (auto& sv_candidate : split_sv_calls) {
 
-        // [TEST] Skip the SV start is not 62971016 or 62971017
-        // if (sv_candidate.start != 62971016 && sv_candidate.start != 62971017) {
-        //     continue;
-        // }
-
         std::tuple<double, SVType, Genotype, int> result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data);
         double supp_lh = std::get<0>(result);
         SVType supp_type = std::get<1>(result);
         Genotype genotype = std::get<2>(result);
         int cn_state = std::get<3>(result);
 
-        bool print_debug = false;
-        // if (sv_candidate.start == 15287019) {
-        // // if (true) {
-        //     print_debug = true;
-
-        //     printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
-        // }
-
-        // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type));
-        
         // Update the SV type if the predicted type is not unknown
         if (supp_type != SVType::UNKNOWN) {
             // Update all information if the current SV call is not known and
@@ -1079,15 +1012,10 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 sv_candidate.sv_type = supp_type;
                 sv_candidate.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
                 sv_candidate.aln_type.set(static_cast<size_t>(SVDataType::HMM));
-                // sv_candidate.data_type = SVDataType::HMM;
                 sv_candidate.hmm_likelihood = supp_lh;
                 sv_candidate.genotype = genotype;
                 sv_candidate.cn_state = cn_state;
 
-                if (print_debug) {
-                    printMessage("DEBUG [1]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
-                }
-
             // For predictions with the same type, or LOH, neutral predictions, update the
             // prediction information
             } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH || supp_type == SVType::NEUTRAL)) {
@@ -1096,10 +1024,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                 sv_candidate.genotype = genotype;
                 sv_candidate.cn_state = cn_state;
 
-                if (print_debug) {
-                    printMessage("DEBUG [2]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
-                }
-
             // Add an additional SV call if the type is different
             } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) {
                 // For inversions, just update the alignment type, copy number
@@ -1129,26 +1053,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
                     new_sv_call.cn_state = cn_state;
                     additional_calls.push_back(new_sv_call);
                 }
-                // SVCall new_sv_call = sv_candidate;  // Copy the original SV call
-                // // new_sv_call.sv_type = supp_type;
-
-                // // Update the SV type unless the current type is inversion
-                // if (sv_candidate.sv_type != SVType::INV) {
-                //     new_sv_call.sv_type = supp_type;
-                //     new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
-                //     new_sv_call.genotype = genotype;
-                // }
-
-                // // new_sv_call.alt_allele = getSVTypeSymbol(supp_type);  // Update the ALT allele format
-                // new_sv_call.aln_type.set(static_cast<size_t>(SVDataType::HMM));
-                // new_sv_call.hmm_likelihood = supp_lh;
-                // // new_sv_call.genotype = genotype;
-                // new_sv_call.cn_state = cn_state;
-                // additional_calls.push_back(new_sv_call);
-
-                if (print_debug) {
-                    printMessage("DEBUG [3]: Adding additional SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type));
-                }
             }
         }
     }
@@ -1203,8 +1107,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             }
             // Add the assembly gap to the map
             assembly_gaps[chr].emplace_back(start, end);
-            // Print the assembly gap information
-            // std::cout << "Assembly gap: " << chr << ":" << start << "-" << end << std::endl;
         }
         gap_stream.close();
         std::cout << "Loaded " << assembly_gaps.size() << " assembly gaps." << std::endl;
@@ -1322,7 +1224,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
             // Check if the SV is in an assembly gap (0-based)
             if (assembly_gap_file != "") {
                 bool in_assembly_gap = false;
-                // if (assembly_gaps.find(chr) != assembly_gaps.end()) {
                 auto it = assembly_gaps.find(chr);
                 if (it != assembly_gaps.end()) {
                     // Check if the deletion overlaps with any assembly gaps
@@ -1334,6 +1235,7 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                         if (overlap_start <= overlap_end) {
                             // Calculate the overlap length
                             uint32_t overlap_length = overlap_end - overlap_start + 1;
+                            
                             // Calculate the percentage of overlap
                             double overlap_pct = static_cast<double>(overlap_length) / static_cast<double>(sv_length);
                             if (overlap_pct > 0.2) {
@@ -1341,15 +1243,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                                 break;
                             }
                         }
-                        // double overlap = 0.0;
-                        // uint32_t gap_start = gap.first + 1;  // Convert to 1-based
-                        // uint32_t gap_end = gap.second + 1;  // Convert to 1-based
-                        // overlap = static_cast<double>(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast<double>(sv_length);
-                        // if (overlap > 0.2) {
-                        //     std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl;
-                        //     in_assembly_gap = true;
-                        //     break;
-                        // }
                     }
                     if (in_assembly_gap) {
                         filter = "AssemblyGap";
@@ -1366,17 +1259,6 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
 
                 // Use the preceding base as the alternate allele 
                 if (ref_allele != "") {
-                    // If the sequence is >90% N, skip the SV call (assembly
-                    // gap)
-                    // int allele_length_90pct = static_cast<int>(ref_allele.size() * 0.9);
-                    // if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) {
-                    //     assembly_gaps += 1;
-                    //     // continue;
-
-                    //     // Don't skip but set the filter to assembly gap
-                    //     filter = "AssemblyGap";
-                    // }
-
                     // The alt allele is the preceding base, and the reference
                     // allele is the deleted sequence including the preceding base
                     alt_allele = ref_allele.at(0);
@@ -1434,21 +1316,10 @@ void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCal
                     base = 'N';
                 }
             }
-            
-            // Get read depth
             int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr), start);
 
-            // If read depth equals zero, then set the filter to LowQual
-            // if (read_depth == 0) {
-            //     printError("Warning: Read depth is zero for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end));
-            //     filter = "LowQual";
-            //     filtered_svs += 1;
-            // }
-
             // Create the VCF parameter strings
             std::string sv_type_str = getSVTypeString(sv_type);
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate);
-            // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state);
             std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state) + loh;
             std::string format_str = "GT:DP";
             std::string sample_str = genotype + ":" + std::to_string(read_depth);
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 5a80e39d..9b4172b7 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -136,17 +136,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 
                 SVCall merged_sv_call = cluster_sv_calls[0];
                 if (has_nonzero_likelihood) {
-                    // // These are detected from split reads, choose the one with
-                    // // the highest non-zero likelihood normalized by the length of the SV
-                    // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                    //     return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1));
-                    // });
-
-                    // // Obtain the highest non-zero likelihood
-                    // auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
-                    //     return sv_call.hmm_likelihood != 0.0;
-                    // });
-
                     // Choose the SV with the highest cluster size of all SVs
                     // with non-zero likelihood (if equal, choose the larger SV)
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
@@ -216,29 +205,6 @@ void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)
         } else {
             combined_sv_calls.push_back(sv_call);
         }
-        // SVCall& sv_call = sv_calls[i];
-        // // For SVs at the same start position with the same SV type, keep the one
-        // // with the highest likelihood
-        // if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) {
-        //     // Keep the SV call with a non-zero likelihood
-        //     // The HMM prediction is more reliable than the split read prediction
-        //     if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) {
-        //         // Combine the cluster sizes
-        //         sv_call.cluster_size += sv_calls[i - 1].cluster_size;
-        //         combined_sv_calls.back() = sv_call;
-        //     }
-
-        //     // If the likelihoods are equal, keep the one with the larger cluster size
-        //     // This is to ensure that the SV call with more supporting reads is
-        //     // kept
-        //     else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) {
-        //         // Combine the cluster sizes
-        //         sv_call.cluster_size += sv_calls[i - 1].cluster_size;
-        //         combined_sv_calls.back() = sv_call;
-        //     }
-        // } else {
-        //     combined_sv_calls.push_back(sv_call);
-        // }
     }
     int merge_count = initial_size - combined_sv_calls.size();
     sv_calls = std::move(combined_sv_calls); // Replace with filtered list

From b62c9d8762245f1f3dbec1fa34bb6195e9b8b7bc Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 21 May 2025 17:47:19 -0400
Subject: [PATCH 116/134] save cluster plot data and merge duplicates

---
 include/sv_object.h |  10 ++-
 src/sv_caller.cpp   |  11 +++-
 src/sv_object.cpp   | 147 +++++++++++++++++++++++++++++++++++++++++---
 3 files changed, 152 insertions(+), 16 deletions(-)

diff --git a/include/sv_object.h b/include/sv_object.h
index 4fd34c56..8b9b2347 100644
--- a/include/sv_object.h
+++ b/include/sv_object.h
@@ -32,9 +32,6 @@ struct SVCall {
 
     SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, SVEvidenceFlags aln_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) :
         start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), aln_type(aln_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {}
-
-    // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) :
-    //     start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {}
 };
 
 void addSVCall(std::vector<SVCall>& sv_calls, SVCall& sv_call);
@@ -44,9 +41,10 @@ void mergeDuplicateSVs(std::vector<SVCall>& sv_calls);
 
 uint32_t getSVCount(const std::vector<SVCall>& sv_calls);
 
-void concatenateSVCalls(std::vector<SVCall>& sv_calls, const std::vector<SVCall>& sv_calls_update);
-
 // Merge SVs using DBSCAN clustering
-void mergeSVs(std::vector<SVCall> &sv_calls, double epsilon, int min_pts, bool keep_noise);
+void mergeSVs(std::vector<SVCall> &sv_calls, double epsilon, int min_pts, bool keep_noise, const std::string& json_filepath = "");
+
+// Save clusters of SV calls to a JSON file
+void saveClustersToJSON(const std::string& filename, const std::map<int, std::vector<SVCall>>& clusters);
 
 #endif // SV_OBJECT_H
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index cf39c5de..a1a52e53 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -754,6 +754,13 @@ void SVCaller::processChromosome(const std::string& chr, std::vector<SVCall>& ch
     bam_hdr_destroy(bamHdr);
 
     printMessage(chr + ": Merging CIGAR...");
+    // Save JSON if chr21
+    // if (chr == "chr21") {
+    //     std::string json_fp = input_data.getOutputDir() + "/" + chr + ".json";
+    //     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, true, json_fp);
+    // } else {
+    //     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
+    // }
     mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false);
 
     int region_sv_count = getSVCount(chr_sv_calls);
@@ -953,8 +960,6 @@ void SVCaller::run(const InputData& input_data)
 
     // Save to VCF
     std::cout << "Saving SVs to VCF..." << std::endl;
-    // const std::string output_dir = input_data.getOutputDir();
-    // this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map);
     this->saveToVCF(whole_genome_sv_calls, input_data, ref_genome, chr_pos_depth_map);
 }
 
@@ -1076,7 +1081,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve
     }
 }
 
-// void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const
+
 void SVCaller::saveToVCF(const std::unordered_map<std::string, std::vector<SVCall>>& sv_calls, const InputData &input_data, const ReferenceGenome& ref_genome, const std::unordered_map<std::string, std::vector<uint32_t>>& chr_pos_depth_map) const
 {
     // Check if an assembly gap file was provided
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index 9b4172b7..b763efd2 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -7,6 +7,8 @@
 #include <stdexcept>
 #include <iostream>
 #include <numeric>
+#include <fstream>
+#include <map>
 
 #include "dbscan.h"
 #include "utils.h"
@@ -39,7 +41,7 @@ void concatenateSVCalls(std::vector<SVCall> &target, const std::vector<SVCall>&
     target.insert(target.end(), source.begin(), source.end());
 }
 
-void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool keep_noise)
+void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool keep_noise, const std::string& json_filepath)
 {
     printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts));
     
@@ -59,6 +61,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         SVType::BND,
     })
     {
+        std::vector<SVCall> merged_sv_type_calls;
+
         // Create a vector of SV calls for the current SV type and size interval
         std::vector<SVCall> sv_type_calls;
         std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) {
@@ -69,7 +73,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             // Add all unclustered points to the merged list
             for (const auto& sv_call : sv_type_calls) {
                 SVCall noise_sv_call = sv_call;
-                merged_sv_calls.push_back(noise_sv_call);
+                // merged_sv_calls.push_back(noise_sv_call);
+                merged_sv_type_calls.push_back(noise_sv_call);
             }
             continue;
         }
@@ -82,7 +87,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         if (sv_type == SVType::INS) {
             // Add only non-CIGARCLIP SVs to the cluster map
             for (size_t i = 0; i < clusters.size(); ++i) {
-                // if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) {
                 // Use the SVEvidenceFlags to check for CIGARCLIP
                 if (!sv_type_calls[i].aln_type.test(static_cast<size_t>(SVDataType::CIGARCLIP))) {
                     cluster_map[clusters[i]].push_back(sv_type_calls[i]);
@@ -94,6 +98,23 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             }
         }
 
+        // Save clusters to JSON if requested
+        if (!json_filepath.empty()) {
+            // Create the directory if it doesn't exist
+            std::string dir = json_filepath.substr(0, json_filepath.find_last_of('/'));
+            if (!fileExists(dir)) {
+                std::string command = "mkdir -p " + dir;
+                system(command.c_str());
+            }
+            // Save the clusters to a JSON file
+            // Prepend the SV type before the extension
+            // Remove the file extension from the JSON filename
+            std::string json_filename_no_ext = json_filepath.substr(0, json_filepath.find_last_of('.'));
+            std::string json_filename = json_filename_no_ext + "_" + getSVTypeString(sv_type) + ".json";
+            // std::string json_filename = json_filepath + "/clusters_" + getSVTypeString(sv_type) + ".json";
+            saveClustersToJSON(json_filename, cluster_map);
+        }
+
         // Merge SVs in each cluster
         int cluster_count = 0;
         for (auto& cluster : cluster_map) {
@@ -111,7 +132,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 // Add all unclustered points to the merged list
                 for (const auto& sv_call : cluster_sv_calls) {
                     SVCall noise_sv_call = sv_call;
-                    merged_sv_calls.push_back(noise_sv_call);
+                    // merged_sv_calls.push_back(noise_sv_call);
+                    merged_sv_type_calls.push_back(noise_sv_call);
                 }
 
             // Merge clustered SV calls
@@ -150,7 +172,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
 
                     // Add SV call
                     merged_sv_call = *it;
-                    merged_sv_calls.push_back(merged_sv_call);
+                    // merged_sv_calls.push_back(merged_sv_call);
+                    merged_sv_type_calls.push_back(merged_sv_call);
 
                 // ----------------------------
                 // CIGAR-BASED MERGING
@@ -173,16 +196,126 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
 
                     // Add SV call
                     merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
-                    merged_sv_calls.push_back(merged_sv_call);
+                    // merged_sv_calls.push_back(merged_sv_call);
+                    merged_sv_type_calls.push_back(merged_sv_call);
                 }
                 cluster_count++;
             }
         }
-        printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs");
+        printMessage("Merged " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + ", found " + std::to_string(merged_sv_type_calls.size()) + " merged SV calls");
+
+        // Merge overlapping SVs by cluster size
+        std::sort(merged_sv_type_calls.begin(), merged_sv_type_calls.end(), [](const SVCall& a, const SVCall& b) {
+            return a.start < b.start || (a.start == b.start && a.end < b.end);
+        });
+        std::vector<SVCall> merged_sv_calls_final;
+        for (size_t i = 0; i < merged_sv_type_calls.size(); i++) {
+            SVCall& sv_call = merged_sv_type_calls[i];
+
+            // Merge cluster sizes if they overlap
+            if (i > 0 && sv_call.start <= merged_sv_type_calls[i - 1].end) {
+                // Keep the larger cluster size
+                if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) {
+                    merged_sv_calls_final.push_back(sv_call);
+                }
+            } else {
+                merged_sv_calls_final.push_back(sv_call);
+            }
+        }
+        printMessage("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls");
+        
+        // Insert merged SV calls into the final list
+        merged_sv_calls.insert(merged_sv_calls.end(), merged_sv_calls_final.begin(), merged_sv_calls_final.end());
+
+        // printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs");
     }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
     int updated_size = sv_calls.size();
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
+
+    // // Merge overlapping SVs by cluster size
+    // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
+    //     return a.start < b.start || (a.start == b.start && a.end < b.end);
+    // });
+    // std::vector<SVCall> merged_sv_calls_final;
+    // for (size_t i = 0; i < sv_calls.size(); i++) {
+    //     SVCall& sv_call = sv_calls[i];
+
+    //     // Merge cluster sizes if they overlap
+    //     if (i > 0 && sv_call.start <= sv_calls[i - 1].end) {
+    //         // Keep the larger cluster size
+    //         if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
+    //             sv_calls[i - 1] = sv_call;
+    //         }
+    //     } else {
+    //         merged_sv_calls_final.push_back(sv_call);
+    //     }
+    // }
+    // sv_calls = std::move(merged_sv_calls_final); // Replace with filtered list
+    // int final_size = sv_calls.size();
+    // printMessage("Merged " + std::to_string(updated_size) + " overlapping SV calls into " + std::to_string(final_size) + " SV calls");
+}
+
+// Write every non-noise cluster (ID >= 0) in `clusters` to `filename` as JSON:
+// {"clusters": [{"cluster_id", "cluster_size", "sv_calls": [{"start", "end"}]}]}.
+// Failures are reported through printError and the function returns early.
+void saveClustersToJSON(const std::string &filename, const std::map<int, std::vector<SVCall>> &clusters)
+{
+    if (filename.empty()) {
+        printError("ERROR: Filename is empty");
+        return;
+    }
+
+    // Remove the file if it already exists
+    if (fileExists(filename)) {
+        std::remove(filename.c_str());
+    }
+
+    // Open the JSON file for writing
+    std::ofstream json_file(filename);
+    if (!json_file.is_open()) {
+        printError("ERROR: Unable to open JSON file for writing: " + filename);
+        return;
+    }
+    json_file << "{\n";
+    json_file << "  \"clusters\": [\n";
+    // The separator is written *before* each cluster after the first emitted
+    // one. Comparing a counter against clusters.size() is unreliable because
+    // the map may or may not hold a skipped noise entry; without one, the
+    // second-to-last cluster lost its comma and the file was invalid JSON.
+    bool wrote_cluster = false;
+    int last_cluster_id = -1;
+    for (const auto& [cluster_id, cluster] : clusters) {
+        if (cluster_id < 0) {
+            continue; // Skip noise points
+        }
+        if (wrote_cluster) {
+            json_file << ",\n";  // Terminate the previous cluster object
+        }
+        json_file << "    {\n";
+        json_file << "      \"cluster_id\": " << cluster_id << ",\n";
+        json_file << "      \"cluster_size\": " << cluster.size() << ",\n";
+        json_file << "      \"sv_calls\": [\n";
+        for (size_t j = 0; j < cluster.size(); ++j) {
+            const auto& sv_call = cluster[j];
+            json_file << "        {\n";
+            json_file << "          \"start\": " << sv_call.start << ",\n";
+            json_file << "          \"end\": " << sv_call.end << "\n";
+            json_file << "        }" << (j + 1 < cluster.size() ? "," : "") << "\n";
+        }
+        json_file << "      ]\n";
+        json_file << "    }";  // Comma or newline follows once we know what comes next
+        wrote_cluster = true;
+        last_cluster_id = cluster_id;
+    }
+    if (wrote_cluster) {
+        json_file << "\n";
+        printMessage("JSON found last cluster: " + std::to_string(last_cluster_id));
+    }
+    json_file << "  ]\n";
+    json_file << "}\n";
+    json_file.close();
+    printMessage("Saved clusters to JSON file: " + filename);
 }
 
 void mergeDuplicateSVs(std::vector<SVCall> &sv_calls)

From 17ebf23ee93790684a3787d31893bb6287720a97 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Thu, 26 Jun 2025 14:54:12 -0400
Subject: [PATCH 117/134] add debug mode

---
 Makefile           |   3 +
 Makefile-cpp       |   5 ++
 include/debug.h    |  23 +++++
 src/cnv_caller.cpp |  18 +++-
 src/debug.cpp      |   4 +
 src/input_data.cpp |  23 ++---
 src/sv_caller.cpp  |  66 +++++++-------
 src/sv_object.cpp  | 208 +++++++++++++++++++++++++++++++--------------
 8 files changed, 242 insertions(+), 108 deletions(-)
 create mode 100644 include/debug.h
 create mode 100644 src/debug.cpp

diff --git a/Makefile b/Makefile
index 6b0170ae..b8186167 100644
--- a/Makefile
+++ b/Makefile
@@ -9,6 +9,9 @@ python:
 cpp:
 	$(MAKE) -f Makefile-cpp
 
+debug:
+	$(MAKE) -f Makefile-cpp DEBUG=1
+
 clean:
 	$(MAKE) -f Makefile-python clean
 	$(MAKE) -f Makefile-cpp clean
diff --git a/Makefile-cpp b/Makefile-cpp
index e77cf0a8..ceda4018 100644
--- a/Makefile-cpp
+++ b/Makefile-cpp
@@ -19,6 +19,11 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 # Compiler and Flags
 CXX := g++
 CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
+
+ifdef DEBUG
+	CXXFLAGS += -DDEBUG
+endif
+
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 LDLIBS := -lhts  # Link with libhts.a or libhts.so
 
diff --git a/include/debug.h b/include/debug.h
new file mode 100644
index 00000000..08038b3c
--- /dev/null
+++ b/include/debug.h
@@ -0,0 +1,23 @@
+// debug.h - Timestamped debug logging to std::cerr, compiled in only when DEBUG is defined (make debug adds -DDEBUG).
+#pragma once
+
+#include <iostream>
+#include <mutex>
+#include <chrono>
+#include <iomanip>
+#include <sstream>
+// Mutex locked for the whole body of each DEBUG_PRINT call; the definition lives in debug.cpp.
+extern std::mutex debug_mutex;
+// DEBUG_PRINT(x): writes "YYYY-MM-DD HH:MM:SS - <x>" (local time) to std::cerr; without DEBUG it expands to nothing, so x is never evaluated.
+#ifdef DEBUG
+    #define DEBUG_PRINT(x) do { \
+        std::lock_guard<std::mutex> lock(debug_mutex); \
+        auto now = std::chrono::system_clock::now(); \
+        std::time_t now_time = std::chrono::system_clock::to_time_t(now); \
+        std::ostringstream oss; \
+        oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d %H:%M:%S"); \
+        std::cerr << oss.str() << " - " << x << std::endl; \
+    } while (0)
+#else
+    #define DEBUG_PRINT(x)
+#endif
diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp
index c7045955..66f1f146 100644
--- a/src/cnv_caller.cpp
+++ b/src/cnv_caller.cpp
@@ -171,7 +171,23 @@ std::tuple<double, SVType, Genotype, int> CNVCaller::runCopyNumberPrediction(std
         printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
         return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
     }
-
+    /*
+    // Check that there is no large number of zero-depth positions in the region
+    int zero_depth_count = 0;
+    for (uint32_t pos = start_pos; pos <= end_pos; pos++)
+    {
+        if (pos < pos_depth_map.size() && pos_depth_map[pos] == 0)
+        {
+            zero_depth_count++;
+        }
+    }
+    if (zero_depth_count > 0.1 * (end_pos - start_pos + 1))
+    {
+        printError("WARNING: Too many zero-depth positions in the SV region for copy number prediction, skipping: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos));
+        return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0);
+    }
+    */
+   
     // Run the Viterbi algorithm on SNPs in the SV region
     // Only extend the region if "save CNV data" is enabled
     SNPData before_sv;
diff --git a/src/debug.cpp b/src/debug.cpp
new file mode 100644
index 00000000..2028e5f6
--- /dev/null
+++ b/src/debug.cpp
@@ -0,0 +1,4 @@
+// debug.cpp - Single definition of the mutex declared extern in debug.h and locked by DEBUG_PRINT.
+#include "debug.h"
+
+std::mutex debug_mutex;
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 3e7ad69d..4e0211df 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -8,6 +8,7 @@
 #include <thread>
 
 #include "utils.h"
+#include "debug.h"  // For DEBUG_PRINT
 /// @endcond
 
 #define MIN_PFB 0.01  // Minimum SNP population allele frequency
@@ -39,22 +40,22 @@ InputData::InputData()
 
 void InputData::printParameters() const
 {
-    std::cout << "Input parameters:" << std::endl;
-    std::cout << "Long read BAM: " << this->long_read_bam << std::endl;
-    std::cout << "Reference genome: " << this->ref_filepath << std::endl;
-    std::cout << "SNP VCF: " << this->snp_vcf_filepath << std::endl;
-    std::cout << "Output directory: " << this->output_dir << std::endl;
-    std::cout << "Sample size: " << this->sample_size << std::endl;
-    std::cout << "Minimum CNV length: " << this->min_cnv_length << std::endl;
-    std::cout << "DBSCAN epsilon: " << this->dbscan_epsilon << std::endl;
-    std::cout << "DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f  << "%"  << std::endl;
+    DEBUG_PRINT("Input parameters:");
+    DEBUG_PRINT("Long read BAM: " << this->long_read_bam);
+    DEBUG_PRINT("Reference genome: " << this->ref_filepath);
+    DEBUG_PRINT("SNP VCF: " << this->snp_vcf_filepath);
+    DEBUG_PRINT("Output directory: " << this->output_dir);
+    DEBUG_PRINT("Sample size: " << this->sample_size);
+    DEBUG_PRINT("Minimum CNV length: " << this->min_cnv_length);
+    DEBUG_PRINT("DBSCAN epsilon: " << this->dbscan_epsilon);
+    DEBUG_PRINT("DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f << "%");
     if (this->region_set)
     {
-        std::cout << "Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second) + "\n";
+        DEBUG_PRINT("Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second));
     }
     else
     {
-        std::cout << "Running on whole genome" << std::endl;
+        DEBUG_PRINT("Running on whole genome");
     }
 }
 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp
index a1a52e53..29dab604 100644
--- a/src/sv_caller.cpp
+++ b/src/sv_caller.cpp
@@ -30,6 +30,7 @@
 #include "fasta_query.h"
 #include "dbscan.h"
 #include "dbscan1d.h"
+#include "debug.h"
 /// @endcond
 
 # define DUP_SEQSIM_THRESHOLD 0.9  // Sequence similarity threshold for duplication detection
@@ -415,21 +416,21 @@ void SVCaller::findSplitSVSignatures(std::unordered_map<std::string, std::vector
             }
 
             // Store the inversion as the supplementary start and end positions
-            if (inversion && supp_positions.size() > 1) {
-                std::sort(supp_positions.begin(), supp_positions.end());
-                int supp_start = supp_positions.front();
-                int supp_end = supp_positions.back();
-                int sv_length = std::abs(supp_start - supp_end);
-
-                // Use 50bp as the minimum length for an inversion
-                if (sv_length >= 50 && sv_length <= max_length) {
-                    SVEvidenceFlags aln_type;
-                    aln_type.set(static_cast<size_t>(SVDataType::SUPPINV));
-                    SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
-                    // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
-                    addSVCall(chr_sv_calls, sv_candidate);
-                }
-            }
+            // if (inversion && supp_positions.size() > 1) {
+            //     std::sort(supp_positions.begin(), supp_positions.end());
+            //     int supp_start = supp_positions.front();
+            //     int supp_end = supp_positions.back();
+            //     int sv_length = std::abs(supp_start - supp_end);
+
+            //     // Use 50bp as the minimum length for an inversion
+            //     if (sv_length >= 50 && sv_length <= max_length) {
+            //         SVEvidenceFlags aln_type;
+            //         aln_type.set(static_cast<size_t>(SVDataType::SUPPINV));
+            //         SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
+            //         // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size);
+            //         addSVCall(chr_sv_calls, sv_candidate);
+            //     }
+            // }
 
             // -------------------------------
             // SPLIT INSERTION CALLS
@@ -772,6 +773,8 @@ void SVCaller::run(const InputData& input_data)
     bool cigar_svs = true;
     bool cigar_cn = true;
     bool split_svs = true;
+    bool merge_split_svs = true;
+    bool merge_final_svs = true;
 
     // Print the input data
     input_data.printParameters();
@@ -904,12 +907,11 @@ void SVCaller::run(const InputData& input_data)
     }
     
     if (split_svs) {
-        // Identify split-SV signatures
-        printMessage("Identifying split-SV signatures...");
+        DEBUG_PRINT("Identifying split-SV signatures...");
         std::unordered_map<std::string, std::vector<SVCall>> whole_genome_split_sv_calls;
         this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data);
 
-        printMessage("Running copy number predictions on split-read SVs...");
+        DEBUG_PRINT("Running copy number predictions on split-read SVs...");
         current_chr = 0;
         for (auto& entry : whole_genome_split_sv_calls) {
             const std::string& chr = entry.first;
@@ -917,18 +919,20 @@ void SVCaller::run(const InputData& input_data)
 
             if (sv_calls.size() > 0) {
                 current_chr++;
-                printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
+                DEBUG_PRINT("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates...");
                 this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data);
             }
         }
 
-        printMessage("Merging split-read SVs...");
-        for (auto& entry : whole_genome_split_sv_calls) {
-            std::vector<SVCall>& sv_calls = entry.second;
-            mergeSVs(sv_calls, 0.1, 2, true);
+        if (merge_split_svs) {
+            DEBUG_PRINT("Merging split-read SVs...");
+            for (auto& entry : whole_genome_split_sv_calls) {
+                std::vector<SVCall>& sv_calls = entry.second;
+                mergeSVs(sv_calls, 0.1, 2, true);
+            }
         }
 
-        printMessage("Unifying SVs...");
+        DEBUG_PRINT("Unifying SVs...");
         for (auto& entry : whole_genome_split_sv_calls) {
             const std::string& chr = entry.first;
             std::vector<SVCall>& sv_calls = entry.second;
@@ -936,12 +940,14 @@ void SVCaller::run(const InputData& input_data)
         }
     }
 
-    // Merge any duplicate SV calls from the CIGAR and split-read
-    // detections (same start positions)
-    printMessage("Merging CIGAR and split read SV calls...");
-    for (auto& entry : whole_genome_sv_calls) {
-        std::vector<SVCall>& sv_calls = entry.second;
-        mergeSVs(sv_calls, 0.1, 2, true);
+    if (merge_final_svs) {
+        // Merge any duplicate SV calls from the CIGAR and split-read
+        // detections (same start positions)
+        DEBUG_PRINT("Merging CIGAR and split read SV calls...");
+        for (auto& entry : whole_genome_sv_calls) {
+            std::vector<SVCall>& sv_calls = entry.second;
+            mergeSVs(sv_calls, 0.1, 2, true);
+        }
     }
 
     if (input_data.getSaveCNVData()) {
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index b763efd2..d09bd6fe 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -12,6 +12,7 @@
 
 #include "dbscan.h"
 #include "utils.h"
+#include "debug.h"
 
 bool SVCall::operator<(const SVCall & other) const
 {
@@ -49,6 +50,14 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         return;
     }
 
+    // Set this to print cluster information for a specific SV call for debugging
+    // This is useful for debugging purposes to see how the SVs are merged
+    bool debug_mode = false;
+    int debug_start = 10414914;  // Set to -1 to disable
+    int debug_svlen_min = 15000;
+    int debug_svlen_max = 16000;
+    SVType debug_sv_type = SVType::INV;
+
     // Cluster SVs using DBSCAN for each SV type
     int initial_size = sv_calls.size();
     std::vector<SVCall> merged_sv_calls;
@@ -61,6 +70,13 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         SVType::BND,
     })
     {
+        // Skip if not the debug SV type
+        if (debug_mode && (sv_type != debug_sv_type)) {
+            DEBUG_PRINT("DEBUG: Skipping SV type " + getSVTypeString(sv_type) + " for debug mode");
+            continue;
+        }
+
+        DEBUG_PRINT("Merging SV type: " + getSVTypeString(sv_type) + " (epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts) + ", num SVs=" + std::to_string(sv_calls.size()) + ")");
         std::vector<SVCall> merged_sv_type_calls;
 
         // Create a vector of SV calls for the current SV type and size interval
@@ -73,7 +89,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             // Add all unclustered points to the merged list
             for (const auto& sv_call : sv_type_calls) {
                 SVCall noise_sv_call = sv_call;
-                // merged_sv_calls.push_back(noise_sv_call);
                 merged_sv_type_calls.push_back(noise_sv_call);
             }
             continue;
@@ -84,18 +99,8 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         // Create a map of cluster IDs to SV calls
         const std::vector<int>& clusters = dbscan.getClusters();
         std::map<int, std::vector<SVCall>> cluster_map;  // Cluster ID to SV calls
-        if (sv_type == SVType::INS) {
-            // Add only non-CIGARCLIP SVs to the cluster map
-            for (size_t i = 0; i < clusters.size(); ++i) {
-                // Use the SVEvidenceFlags to check for CIGARCLIP
-                if (!sv_type_calls[i].aln_type.test(static_cast<size_t>(SVDataType::CIGARCLIP))) {
-                    cluster_map[clusters[i]].push_back(sv_type_calls[i]);
-                }
-            }
-        } else {
-            for (size_t i = 0; i < clusters.size(); ++i) {
-                cluster_map[clusters[i]].push_back(sv_type_calls[i]);
-            }
+        for (size_t i = 0; i < clusters.size(); ++i) {
+            cluster_map[clusters[i]].push_back(sv_type_calls[i]);
         }
 
         // Save clusters to JSON if requested
@@ -111,7 +116,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             // Remove the file extension from the JSON filename
             std::string json_filename_no_ext = json_filepath.substr(0, json_filepath.find_last_of('.'));
             std::string json_filename = json_filename_no_ext + "_" + getSVTypeString(sv_type) + ".json";
-            // std::string json_filename = json_filepath + "/clusters_" + getSVTypeString(sv_type) + ".json";
             saveClustersToJSON(json_filename, cluster_map);
         }
 
@@ -121,6 +125,30 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
 
+            // Continue unless the debug SV call is in the cluster
+            // if (debug_mode && cluster_id >= 0) {
+            //     if (!cluster_sv_calls.empty() &&
+            //         std::any_of(cluster_sv_calls.begin(), cluster_sv_calls.end(),
+            //             [debug_start, debug_sv_type, debug_svlen_min, debug_svlen_max](const SVCall& sv_call) {
+            //                 const int len = std::abs(static_cast<int>(sv_call.end - sv_call.start));
+
+            //                 const bool start_ok = (debug_start < 0 || static_cast<int>(sv_call.start) == debug_start);
+
+            //                 const bool len_ok = (debug_svlen_min == -1 || len >= debug_svlen_min) &&
+            //                                     (debug_svlen_max == -1 || len <= debug_svlen_max);
+
+            //                 const bool type_ok = (debug_sv_type == SVType::UNKNOWN || sv_call.sv_type == debug_sv_type);
+
+            //                 return start_ok && len_ok && type_ok;
+            //             }
+            //         )) {
+            //         DEBUG_PRINT("DEBUG: Found SV call in noise cluster " + std::to_string(cluster_id) + " with type " + getSVTypeString(debug_sv_type));
+
+            //     } else {
+            //         continue;
+            //     }
+            // }
+
             // Continue if fewer than 2 SV calls in the cluster (due to CIGARCLIP filter)
             if (cluster_sv_calls.size() < 2) {
                 continue;
@@ -132,8 +160,16 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 // Add all unclustered points to the merged list
                 for (const auto& sv_call : cluster_sv_calls) {
                     SVCall noise_sv_call = sv_call;
-                    // merged_sv_calls.push_back(noise_sv_call);
                     merged_sv_type_calls.push_back(noise_sv_call);
+
+                    // Print the added SV calls if >10 kb and the debug SV type
+                    if (debug_mode && noise_sv_call.sv_type == debug_sv_type && (noise_sv_call.end - noise_sv_call.start) > 10000) {
+                        DEBUG_PRINT("DEBUG: Adding noise SV call at " + std::to_string(noise_sv_call.start) + "-" + std::to_string(noise_sv_call.end) +
+                                    ", type: " + getSVTypeString(noise_sv_call.sv_type) +
+                                    ", length: " + std::to_string(noise_sv_call.end - noise_sv_call.start) +
+                                    ", cluster size: " + std::to_string(noise_sv_call.cluster_size) +
+                                    ", likelihood: " + std::to_string(noise_sv_call.hmm_likelihood));
+                    }
                 }
 
             // Merge clustered SV calls
@@ -163,16 +199,12 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                     std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
                         return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.end - a.start > b.end - b.start);
                     });
-                    // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-                    //     return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood);
-                    // });
                     auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) {
                         return sv_call.hmm_likelihood != 0.0;
                     });
 
                     // Add SV call
                     merged_sv_call = *it;
-                    // merged_sv_calls.push_back(merged_sv_call);
                     merged_sv_type_calls.push_back(merged_sv_call);
 
                 // ----------------------------
@@ -186,25 +218,79 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                         return (a.end - a.start) > (b.end - b.start);
                     });
 
-                    // Get the top 10% of the cluster
-                    size_t top_10_percent = std::max(1, (int) (cluster_sv_calls.size() * 0.1));
-                    std::vector<SVCall> top_10(cluster_sv_calls.begin(), cluster_sv_calls.begin() + top_10_percent);
+                    // Print the added SV calls if >10 kb and the debug SV type
+                    if (debug_mode && sv_type == debug_sv_type) {
+                        DEBUG_PRINT("DEBUG: Cluster " + std::to_string(cluster_id) + " with " + std::to_string(cluster_sv_calls.size()) + " SV calls (length sorted):");
+                        for (const auto& sv_call : cluster_sv_calls) {
+                            if ((sv_call.end - sv_call.start) > 10000) {
+                                DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) +
+                                            ", type: " + getSVTypeString(sv_call.sv_type) +
+                                            ", length: " + std::to_string(sv_call.end - sv_call.start) +
+                                            ", cluster size: " + std::to_string(sv_call.cluster_size) +
+                                            ", likelihood: " + std::to_string(sv_call.hmm_likelihood));
+                            }
+                        }
+                    }
+
+                    // Get the top % of the cluster
+                    double top_pct = 0.2;
+                    size_t top_pct_size = std::max(1, (int) (cluster_sv_calls.size() *  top_pct));
+                    std::vector<SVCall> top_pct_calls(cluster_sv_calls.begin(), cluster_sv_calls.begin() + top_pct_size);
+
+                    // Print the added SV calls if >10 kb and the debug SV type
+                    if (debug_mode && sv_type == debug_sv_type) {
+                        DEBUG_PRINT("DEBUG: Top  " + std::to_string((int)(top_pct * 100)) + "% of cluster " + std::to_string(cluster_id) + " with " +
+                                    std::to_string(top_pct_calls.size()) + " SV calls (length sorted):");
+                        for (const auto& sv_call : top_pct_calls) {
+                            if ((sv_call.end - sv_call.start) > 10000) {
+                                DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) +
+                                            ", type: " + getSVTypeString(sv_call.sv_type) +
+                                            ", length: " + std::to_string(sv_call.end - sv_call.start) +
+                                            ", cluster size: " + std::to_string(sv_call.cluster_size) +
+                                            ", likelihood: " + std::to_string(sv_call.hmm_likelihood));
+                            }
+                        }
+                    }
 
-                    // Get the median SV for the top 10% of the cluster
-                    size_t median_index = top_10.size() / 2;
-                    merged_sv_call = top_10[median_index];
+                    // Get the median SV for the top % of the cluster
+                    size_t median_index = top_pct_calls.size() / 2;
+                    merged_sv_call = top_pct_calls[median_index];
+
+                    // Print the merged SV call
+                    if (debug_mode && sv_type == debug_sv_type) {
+                        DEBUG_PRINT("DEBUG: Merged SV call at " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) +
+                                    ", type: " + getSVTypeString(merged_sv_call.sv_type) +
+                                    ", length: " + std::to_string(merged_sv_call.end - merged_sv_call.start) +
+                                    ", cluster size: " + std::to_string(merged_sv_call.cluster_size) +
+                                    ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood));
+                    }
 
                     // Add SV call
                     merged_sv_call.cluster_size = (int) cluster_sv_calls.size();
-                    // merged_sv_calls.push_back(merged_sv_call);
                     merged_sv_type_calls.push_back(merged_sv_call);
                 }
                 cluster_count++;
             }
         }
-        printMessage("Merged " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + ", found " + std::to_string(merged_sv_type_calls.size()) + " merged SV calls");
+        DEBUG_PRINT("Merged " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + ", found " + std::to_string(merged_sv_type_calls.size()) + " merged SV calls");
+
+        // Print SV call start, end, type, and length for debugging if > 10 kb
+        if (debug_mode && sv_type == debug_sv_type) {
+            DEBUG_PRINT("DEBUG: Merged SV calls for " + getSVTypeString(sv_type) + ":");
+            for (const auto& sv_call : merged_sv_type_calls) {
+                // if ((int)sv_call.start == debug_start) {
+                if ((sv_call.end - sv_call.start) > 10000) {
+                    DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) +
+                                ", type: " + getSVTypeString(sv_call.sv_type) +
+                                ", length: " + std::to_string(sv_call.end - sv_call.start) +
+                                ", cluster size: " + std::to_string(sv_call.cluster_size) +
+                                ", likelihood: " + std::to_string(sv_call.hmm_likelihood));
+                }
+            }
+        }
 
-        // Merge overlapping SVs by cluster size
+        /*
+        // Merge overlapping SVs by SV length
         std::sort(merged_sv_type_calls.begin(), merged_sv_type_calls.end(), [](const SVCall& a, const SVCall& b) {
             return a.start < b.start || (a.start == b.start && a.end < b.end);
         });
@@ -214,46 +300,45 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
 
             // Merge cluster sizes if they overlap
             if (i > 0 && sv_call.start <= merged_sv_type_calls[i - 1].end) {
-                // Keep the larger cluster size
-                if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) {
-                    merged_sv_calls_final.push_back(sv_call);
+                // Keep the larger SV call (end - start) if they overlap
+                if ((sv_call.end - sv_call.start) > (merged_sv_type_calls[i - 1].end - merged_sv_type_calls[i - 1].start)) {
+                    merged_sv_type_calls[i - 1] = sv_call; // Replace the previous SV call with the current one
                 }
+                // Keep the larger cluster size
+                // if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) {
+                //     merged_sv_calls_final.push_back(sv_call);
+                // }
             } else {
                 merged_sv_calls_final.push_back(sv_call);
             }
         }
-        printMessage("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls");
+        DEBUG_PRINT("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls");
         
-        // Insert merged SV calls into the final list
-        merged_sv_calls.insert(merged_sv_calls.end(), merged_sv_calls_final.begin(), merged_sv_calls_final.end());
+        // Print merged SV calls for debugging
+        if (debug_mode) {
+            DEBUG_PRINT("DEBUG: Final merged SV calls for " + getSVTypeString(sv_type) + ":");
+            for (const auto& sv_call : merged_sv_calls_final) {
+                // if ((int)sv_call.start == debug_start) {
+                if (sv_call.sv_type == SVType::DUP) {
+                    DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) +
+                                ", type: " + getSVTypeString(sv_call.sv_type) +
+                                ", length: " + std::to_string(sv_call.end - sv_call.start) +
+                                ", cluster size: " + std::to_string(sv_call.cluster_size) +
+                                ", likelihood: " + std::to_string(sv_call.hmm_likelihood));
+                }
+            }
+        }
 
-        // printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs");
+        // Insert merged SV calls into the final list
+        merged_sv_calls.insert(merged_sv_calls.end(),
+        merged_sv_calls_final.begin(), merged_sv_calls_final.end());
+        */
+        merged_sv_calls.insert(merged_sv_calls.end(),
+                               merged_sv_type_calls.begin(), merged_sv_type_calls.end());
     }
     sv_calls = std::move(merged_sv_calls); // Replace with filtered list
     int updated_size = sv_calls.size();
     printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls");
-
-    // // Merge overlapping SVs by cluster size
-    // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) {
-    //     return a.start < b.start || (a.start == b.start && a.end < b.end);
-    // });
-    // std::vector<SVCall> merged_sv_calls_final;
-    // for (size_t i = 0; i < sv_calls.size(); i++) {
-    //     SVCall& sv_call = sv_calls[i];
-
-    //     // Merge cluster sizes if they overlap
-    //     if (i > 0 && sv_call.start <= sv_calls[i - 1].end) {
-    //         // Keep the larger cluster size
-    //         if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) {
-    //             sv_calls[i - 1] = sv_call;
-    //         }
-    //     } else {
-    //         merged_sv_calls_final.push_back(sv_call);
-    //     }
-    // }
-    // sv_calls = std::move(merged_sv_calls_final); // Replace with filtered list
-    // int final_size = sv_calls.size();
-    // printMessage("Merged " + std::to_string(updated_size) + " overlapping SV calls into " + std::to_string(final_size) + " SV calls");
 }
 
 void saveClustersToJSON(const std::string &filename, const std::map<int, std::vector<SVCall>> &clusters)
@@ -271,7 +356,6 @@ void saveClustersToJSON(const std::string &filename, const std::map<int, std::ve
 
     // Open the JSON file for writing
     std::ofstream json_file(filename);
-
     if (!json_file.is_open()) {
         printError("ERROR: Unable to open JSON file for writing: " + filename);
         return;
@@ -279,14 +363,11 @@ void saveClustersToJSON(const std::string &filename, const std::map<int, std::ve
     json_file << "{\n";
     json_file << "  \"clusters\": [\n";
     size_t count = 0;
-    // for (size_t i = 0; i < clusters.size(); ++i) {
     for (const auto& [cluster_id, cluster] : clusters) {
         if (cluster_id < 0) {
             continue; // Skip noise points
         }
 
-        // const auto& cluster = clusters.at(i);
-        // const auto& cluster = sv_list;
         json_file << "    {\n";
         json_file << "      \"cluster_id\": " << cluster_id << ",\n";
         json_file << "      \"cluster_size\": " << cluster.size() << ",\n";
@@ -296,14 +377,9 @@ void saveClustersToJSON(const std::string &filename, const std::map<int, std::ve
             json_file << "        {\n";
             json_file << "          \"start\": " << sv_call.start << ",\n";
             json_file << "          \"end\": " << sv_call.end << "\n";
-            // json_file << "          \"sv_type\": \"" << getSVTypeString(sv_call.sv_type) << "\",\n";
-            // json_file << "          \"alt_allele\": \"" << sv_call.alt_allele << "\",\n";
-            // json_file << "          \"genotype\": \"" << getGenotypeString(sv_call.genotype) << "\",\n";
-            // json_file << "          \"hmm_likelihood\": " << sv_call.hmm_likelihood << "\n";
             json_file << "        }" << (j < cluster.size() - 1 ? "," : "") << "\n";
         }
         json_file << "      ]\n";
-        // json_file << "    }" << (i < clusters.size() - 1 ? "," : "") << "\n";
         count++;
         if (count < clusters.size() - 1) {
             json_file << "    }," << "\n";

From 4ba3680f7088d70d5c543bbd97bee877300213f4 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 27 Jun 2025 12:04:52 -0400
Subject: [PATCH 118/134] Update .gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 9f6f43d4..b7c96918 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,3 +97,6 @@ valgrind.log
 *.log
 *.err
 *.out
+
+# Snakemake files
+.snakemake

From 81b09ac17cea7c11f7e62a0ee615b2e28648cded Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Wed, 9 Jul 2025 17:05:11 -0400
Subject: [PATCH 119/134] update makefile

---
 .gitignore      |  1 +
 Makefile        | 68 +++++++++++++++++++++++++++++++++++++++++--------
 Makefile-cpp    | 59 ------------------------------------------
 Makefile-python | 15 -----------
 environment.yml |  9 -------
 5 files changed, 58 insertions(+), 94 deletions(-)
 delete mode 100644 Makefile-cpp
 delete mode 100644 Makefile-python

diff --git a/.gitignore b/.gitignore
index b7c96918..73f38beb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -100,3 +100,4 @@ valgrind.log
 
 # Snakemake files
 .snakemake
+snakemake_bench/results/
diff --git a/Makefile b/Makefile
index b8186167..cfab069c 100644
--- a/Makefile
+++ b/Makefile
@@ -1,17 +1,63 @@
-# Top-Level Makefile
+# Directories
+INCL_DIR := $(CURDIR)/include
+SRC_DIR := $(CURDIR)/src
+BUILD_DIR := $(CURDIR)/build
+LIB_DIR := $(CURDIR)/lib
 
-.PHONY: python cpp clean
+# Version header
+VERSION := $(shell git describe --tags --always)
+VERSION_HEADER := $(INCL_DIR)/version.h
+.PHONY: $(VERSION_HEADER)
+	@echo "#pragma once" > $@
+	@echo "#define VERSION \"$(VERSION)\"" >> $@
 
-# Targets for the sub-makefiles
-python:
-	$(MAKE) -f Makefile-python
+# Conda environment directories
+CONDA_PREFIX := $(shell echo $$CONDA_PREFIX)
+CONDA_INCL_DIR := $(CONDA_PREFIX)/include
+CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 
-cpp:
-	$(MAKE) -f Makefile-cpp
+# Compiler and Flags
+CXX := g++
+CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
 
-debug:
-	$(MAKE) -f Makefile-cpp DEBUG=1
+# ifdef DEBUG
+# 	CXXFLAGS += -DDEBUG
+# endif
 
+LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
+LDLIBS := -lhts  # Link with libhts.a or libhts.so
+
+# Enable thread sanitizer (TSan)
+# ifeq ($(TSAN),1)
+# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g
+# CXXFLAGS += $(TSAN_FLAGS)
+# LDFLAGS += $(TSAN_FLAGS)
+# endif
+
+# Sources and Output
+# SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
+SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp))  # Filter out the SWIG wrapper from the sources
+OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES))
+TARGET := $(BUILD_DIR)/cpp_module
+
+# Default target
+all: $(TARGET)
+
+# Debug target
+debug: CXXFLAGS += -DDEBUG
+debug: all
+
+# Link the executable
+$(TARGET): $(OBJECTS)
+	@mkdir -p $(BUILD_DIR)
+	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS)
+
+# Compile source files
+$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp
+	@mkdir -p $(BUILD_DIR)
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+# Clean the build directory
 clean:
-	$(MAKE) -f Makefile-python clean
-	$(MAKE) -f Makefile-cpp clean
+	rm -rf $(BUILD_DIR)
+	
\ No newline at end of file
diff --git a/Makefile-cpp b/Makefile-cpp
deleted file mode 100644
index ceda4018..00000000
--- a/Makefile-cpp
+++ /dev/null
@@ -1,59 +0,0 @@
-# Directories
-INCL_DIR := $(CURDIR)/include
-SRC_DIR := $(CURDIR)/src
-BUILD_DIR := $(CURDIR)/build
-LIB_DIR := $(CURDIR)/lib
-
-# Version header
-VERSION := $(shell git describe --tags --always)
-VERSION_HEADER := $(INCL_DIR)/version.h
-.PHONY: $(VERSION_HEADER)
-	@echo "#pragma once" > $@
-	@echo "#define VERSION \"$(VERSION)\"" >> $@
-
-# Conda environment directories
-CONDA_PREFIX := $(shell echo $$CONDA_PREFIX)
-CONDA_INCL_DIR := $(CONDA_PREFIX)/include
-CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
-
-# Compiler and Flags
-CXX := g++
-CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
-
-ifdef DEBUG
-	CXXFLAGS += -DDEBUG
-endif
-
-LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
-LDLIBS := -lhts  # Link with libhts.a or libhts.so
-
-# Enable thread sanitizer (TSan)
-# ifeq ($(TSAN),1)
-# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g
-# CXXFLAGS += $(TSAN_FLAGS)
-# LDFLAGS += $(TSAN_FLAGS)
-# endif
-
-# Sources and Output
-# SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
-SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp))  # Filter out the SWIG wrapper from the sources
-OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES))
-TARGET := $(BUILD_DIR)/cpp_module
-
-# Default target
-all: $(TARGET)
-
-# Link the executable
-$(TARGET): $(OBJECTS)
-	@mkdir -p $(BUILD_DIR)
-	$(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS)
-
-# Compile source files
-$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp
-	@mkdir -p $(BUILD_DIR)
-	$(CXX) $(CXXFLAGS) -c $< -o $@
-
-# Clean the build directory
-clean:
-	rm -rf $(BUILD_DIR)
-	
\ No newline at end of file
diff --git a/Makefile-python b/Makefile-python
deleted file mode 100644
index 361ba11b..00000000
--- a/Makefile-python
+++ /dev/null
@@ -1,15 +0,0 @@
-INCL_DIR := $(CURDIR)/include
-SRC_DIR := $(CURDIR)/src
-LIB_DIR := $(CURDIR)/lib
-
-
-all:
-	# Generate the SWIG wrapper (C++ -> Python)
-	swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i
-
-	# Compile the SWIG wrapper using setuptools
-	python3 setup.py build_ext --build-lib $(LIB_DIR)
-
-clean:
-	rm -rf $(LIB_DIR)/*.so $(LIB_DIR)/contextsv.py
-	
\ No newline at end of file
diff --git a/environment.yml b/environment.yml
index 26f46822..538f5bc3 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,14 +8,5 @@ dependencies:
   - python
   - numpy
   - htslib
-  - swig
   - pytest
   - plotly
-
-# [A] Generate directly from the file:
-#  conda env create -f environment.yml -n contextsv
-# [B] Generate after creating a new environment:
-# conda create -n contextsv
-# conda activate contextsv
-# conda env update -f environment.yml --prune  # Prune removes unused packages
-

From 9f2aea2e4bd874a3ff90b42f8796a3eaf9afe99e Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 12:53:00 -0400
Subject: [PATCH 120/134] update cnv plots

---
 .gitignore               |  1 +
 python/cnv_plots_json.py | 40 +++++++++++++++++++++++++++++++---------
 src/main.cpp             |  5 -----
 3 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/.gitignore b/.gitignore
index 73f38beb..b7478d26 100644
--- a/.gitignore
+++ b/.gitignore
@@ -55,6 +55,7 @@ CMakeSettings.json
 
 # Output folder
 output/
+python/
 
 # Doxygen
 docs/html/
diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py
index 31a59110..768058e9 100644
--- a/python/cnv_plots_json.py
+++ b/python/cnv_plots_json.py
@@ -1,11 +1,16 @@
+import os
+import argparse
+import json
+import numpy as np
 import plotly
 from plotly.subplots import make_subplots
-import json
-import argparse
+
+min_sv_length = 200000 # Minimum SV length in base pairs
 
 # Set up argument parser
 parser = argparse.ArgumentParser(description='Generate CNV plots from JSON data.')
 parser.add_argument('json_file', type=str, help='Path to the JSON file containing SV data')
+parser.add_argument('chromosome', type=str, help='Chromosome to filter the SVs by (e.g., "chr3")', nargs='?', default=None)
 args = parser.parse_args()
 
 # Load your JSON data
@@ -32,6 +37,15 @@
 # Loop through each SV (assuming your JSON contains multiple SVs)
 for sv in sv_data:
 
+    # If a chromosome is specified, filter the SVs by that chromosome
+    if args.chromosome and sv['chromosome'] != args.chromosome:
+        continue
+
+    # Filter out SVs that are smaller than the minimum length
+    if np.abs(sv['size']) < min_sv_length:
+        print(f"Skipping SV {sv['chromosome']}:{sv['start']}-{sv['end']} of type {sv['sv_type']} with size {sv['size']} bp (smaller than {min_sv_length} bp)")
+        continue
+
     # Extract data for plotting
     positions_before = sv['before_sv']['positions']
     b_allele_freq_before = sv['before_sv']['b_allele_freq']
@@ -61,9 +75,13 @@
         b_allele_freq = sv[section]['b_allele_freq']
         population_freq = sv[section]['population_freq']
         log2_ratio = sv[section]['log2_ratio']
+        is_snp = sv[section]['is_snp']
+
+        # Set all b-allele frequencies to NaN if not SNPs
+        b_allele_freq = [freq if is_snp_val else float('nan') for freq, is_snp_val in zip(b_allele_freq, is_snp)]
 
         if section == "sv":
-            is_snp = sv[section]['is_snp']
+            # is_snp = sv[section]['is_snp']
             states = sv[section]['states']
             state_colors = [state_colors_dict[str(state)] for state in states]
             marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp]
@@ -81,7 +99,7 @@
                     f"Population Frequency: {population_freq[i]}<br>"
                 )
         else:
-            is_snp = sv[section]['is_snp']
+            # is_snp = sv[section]['is_snp']
             state_colors = ['black'] * len(positions)
             # marker_symbols = ['circle-open'] * len(positions)
             marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp]
@@ -105,7 +123,7 @@
             hoverinfo='text',
             marker=dict(
                 color=state_colors,
-                size=10,
+                size=5,
                 symbol=marker_symbols,
             ),
             line=dict(
@@ -125,7 +143,7 @@
             hoverinfo='text',
             marker=dict(
                 color=state_colors,
-                size=10,
+                size=5,
                 symbol=marker_symbols,
             ),
             line=dict(
@@ -214,6 +232,10 @@
     #     width = 800
     # )
     # Save the plot to an HTML file (use a unique filename per SV)
-    file_name = f"output/SV_{chromosome}_{start}_{end}.html"
-    fig.write_html(file_name)
-    print(f"Plot saved as {file_name}")
+    # Use the input filepath directory as the output directory
+    output_dir = os.path.dirname(args.json_file)
+    svlen_kb = sv_length // 1000
+    file_name = f"SV_{chromosome}_{start}_{end}_{sv_type}_{svlen_kb}kb.html"
+    file_path = os.path.join(output_dir, file_name)
+    fig.write_html(file_path)
+    print(f"Plot saved as {file_path}")
diff --git a/src/main.cpp b/src/main.cpp
index 4755a0e4..5425a7c1 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -116,11 +116,6 @@ void runContextSV(const std::unordered_map<std::string, std::string>& args)
         if (fileExists(json_filepath)) {
             remove(json_filepath.c_str());
         }
-        // int json_file_count = 1;
-        // while (fileExists(json_filepath)) {
-        //     json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json";
-        //     json_file_count++;
-        // }
         input_data.setCNVOutputFile(json_filepath);
         std::cout << "Saving CNV data to: " << json_filepath << std::endl;
     }

From 83570521d77b27724cd8227b8e748b051d2d3312 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 13:39:49 -0400
Subject: [PATCH 121/134] simplify environment and update installation

---
 Makefile           |  16 ++-----
 README.md          | 115 ++++++++++++++-------------------------------
 environment.yml    |   9 ++--
 src/input_data.cpp |   8 ++--
 src/sv_object.cpp  |  73 ----------------------------
 5 files changed, 44 insertions(+), 177 deletions(-)

diff --git a/Makefile b/Makefile
index cfab069c..207f7e09 100644
--- a/Makefile
+++ b/Makefile
@@ -20,25 +20,15 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib
 CXX := g++
 CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic
 
-# ifdef DEBUG
-# 	CXXFLAGS += -DDEBUG
-# endif
-
+# Linker Flags
+# Ensure that the library paths are set correctly for linking
 LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR)  # Add rpath for shared libraries
 LDLIBS := -lhts  # Link with libhts.a or libhts.so
 
-# Enable thread sanitizer (TSan)
-# ifeq ($(TSAN),1)
-# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g
-# CXXFLAGS += $(TSAN_FLAGS)
-# LDFLAGS += $(TSAN_FLAGS)
-# endif
-
 # Sources and Output
-# SOURCES := $(wildcard $(SRC_DIR)/*.cpp)
 SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp))  # Filter out the SWIG wrapper from the sources
 OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES))
-TARGET := $(BUILD_DIR)/cpp_module
+TARGET := $(BUILD_DIR)/contextsv
 
 # Default target
 all: $(TARGET)
diff --git a/README.md b/README.md
index e84f006d..707f6f64 100644
--- a/README.md
+++ b/README.md
@@ -12,33 +12,51 @@ corresponding reference genome (FASTA), a VCF with high-quality SNPs
 Class documentation is available at <a href="https://wglab.openbioinformatics.org/ContextSV">https://wglab.openbioinformatics.org/ContextSV</a>
 </p>
 
-## Installation (Linux)
-### Using Anaconda (recommended)
-First, install [Anaconda](https://www.anaconda.com/).
+## Installation
 
-Next, create a new environment. This installation has been tested with Python 3.11:
-
-```
-conda create -n contextsv python=3.11
-conda activate contextsv
-```
-
-ContextSV can then be installed using the following command:
+### Building from source (for testing/development)
+ContextSV requires HTSLib as a dependency that can be installed using  [Anaconda](https://www.anaconda.com/). Create an environment
+containing HTSLib: 
 
 ```
-conda install -c bioconda -c wglab contextsv=1.0.0
+conda create -n htsenv -c bioconda -c conda-forge htslib
+conda activate htsenv
 ```
 
-### Building from source (for testing/development)
-First install [Anaconda](https://www.anaconda.com/). Then follow the instructions below to install LongReadSum and its dependencies:
+Then follow the instructions below to build ContextSV:
 
 ```
 git clone https://github.com/WGLab/ContextSV
 cd ContextSV
-conda env create -f environment.yml
 make
 ```
 
+ContextSV can then be run:
+```
+./build/contextsv --help
+
+Usage: ./build/contextsv [options]
+Options:
+  -b, --bam <bam_file>          Long-read BAM file (required)
+  -r, --ref <ref_file>          Reference genome FASTA file (required)
+  -s, --snp <vcf_file>          SNPs VCF file (required)
+  -o, --outdir <output_dir>     Output directory (required)
+  -c, --chr <chromosome>        Chromosome
+  -r, --region <region>         Region (start-end)
+  -t, --threads <thread_count>  Number of threads
+  -h, --hmm <hmm_file>          HMM file
+  -n, --sample-size <size>      Sample size for HMM predictions
+     --min-cnv <min_length>     Minimum CNV length
+     --eps <epsilon>             DBSCAN epsilon
+     --min-pts-pct <min_pts_pct> Percentage of mean chr. coverage to use for DBSCAN minimum points
+  -e, --eth <eth_file>          ETH file
+  -p, --pfb <pfb_file>          PFB file
+     --save-cnv                 Save CNV data
+     --debug                    Debug mode with verbose logging
+     --version                  Print version and exit
+  -h, --help                    Print usage and exit
+```
+
 ## Downloading gnomAD SNP population frequencies
 SNP population allele frequency
 information is used for copy number predictions in this tool (see
@@ -53,7 +71,7 @@ Download links for genome VCF files are located here (last updated April 3,
  - **gnomAD v2.1.1 (GRCh37)**: https://gnomad.broadinstitute.org/downloads#2
 
 
-### Example download
+### Script for downloading gnomAD VCFs
 ```
 download_dir="~/data/gnomad/v4.0.0/"
 
@@ -78,71 +96,6 @@ X=~/data/gnomad/v4.0.0/gnomad.genomes.v4.0.sites.chrX.vcf.bgz
 Y=~/data/gnomad/v4.0.0/gnomad.genomes.v4.0.sites.chrY.vcf.bgz
 ```
 
-## Calling structural variants
-### Example full script generating a merged VCF of structural variants
-```
-# Activate the environment
-conda activate contextsv
-
-# Set the input reference genome
-ref_file="~/data/GRCh38.fa"
-
-# Set the input alignment file (e.g. from minimap2)
-long_read_bam="~/data/HG002.GRCh38.bam"
-
-# Set the input SNPs file (e.g. from NanoCaller)
-snps_file="~/data/variant_calls.snps.vcf.gz"
-
-# Set the SNP population frequencies filepath
-pfb_file="~/data/gnomadv4_filepaths.txt"
-
-# Set the output directory
-output_dir=~/data/contextSV_output
-
-# Specify the number of threads (system-specific)
-thread_count=40
-
-# Run SV calling (~3-4 hours for whole-genome, 40 cores)
-python contextsv --threads $thread_count -o $output_dir -lr $long_read_bam --snps $snps_file --reference $ref_file --pfb $pfb_file
-
-# The output VCF filepath is located here:
-output_vcf=$output_dir/sv_calls.vcf
-
-# Merge SVs (~3-4 hours for whole-genome, 40 cores)
-python contextsv --merge $output_vcf
-
-# The final merged VCF filepath is located here:
-merged_vcf=$output_dir/sv_calls.merged.vcf
-```
-
-## Input arguments
-
-```
-python contextsv --help
-
-ContextSV: A tool for integrative structural variant detection.
-
-options:
-  -h, --help            show this help message and exit
-  -lr LONG_READ, --long-read LONG_READ
-                        path to the long read alignment BAM file
-  -g REFERENCE, --reference REFERENCE
-                        path to the reference genome FASTA file
-  -s SNPS, --snps SNPS  path to the SNPs VCF file
-  --pfb PFB             path to the file with SNP population frequency VCF filepaths (see docs for format)
-  -o OUTPUT, --output OUTPUT
-                        path to the output directory
-  -r REGION, --region REGION
-                        region to analyze (e.g. chr1, chr1:1000-2000). If not provided, the entire genome will be analyzed
-  -t THREADS, --threads THREADS
-                        number of threads to use
-  --hmm HMM             path to the PennCNV HMM file
-  --window-size WINDOW_SIZE
-                        window size for calculating log2 ratios for CNV predictions (default: 10 kb)
-  -d, --debug           debug mode (verbose logging)
-  -v, --version         print the version number and exit
-```
-
 ## Revision history
 For release history, please visit [here](https://github.com/WGLab/ContextSV/releases). 
 
diff --git a/environment.yml b/environment.yml
index 538f5bc3..1dd41ce2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,12 +1,9 @@
 name: contextsv
 channels:
-  - defaults
-  - anaconda
-  - conda-forge
   - bioconda
+  - conda-forge
 dependencies:
-  - python
-  - numpy
+  - python=3.11
+  - numpy=1.26
   - htslib
   - pytest
-  - plotly
diff --git a/src/input_data.cpp b/src/input_data.cpp
index 4e0211df..3b53a7d7 100644
--- a/src/input_data.cpp
+++ b/src/input_data.cpp
@@ -24,11 +24,11 @@ InputData::InputData()
     this->start_end = std::make_pair(0, 0);
     this->region_set = false;
     this->output_dir = "";
-    this->sample_size = 100;
-    this->min_cnv_length = 1000;
+    this->sample_size = 20;
+    this->min_cnv_length = 2000;  // Default minimum CNV length
     this->min_reads = 5;
-    this->dbscan_epsilon = 0.99;
-    this->dbscan_min_pts_pct = 0.0;
+    this->dbscan_epsilon = 0.1;
+    this->dbscan_min_pts_pct = 0.1;
     this->thread_count = 1;
     this->hmm_filepath = "data/wgs.hmm";
     this->verbose = false;
diff --git a/src/sv_object.cpp b/src/sv_object.cpp
index d09bd6fe..d6f46b82 100644
--- a/src/sv_object.cpp
+++ b/src/sv_object.cpp
@@ -53,9 +53,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
     // Set this to print cluster information for a specific SV call for debugging
     // This is useful for debugging purposes to see how the SVs are merged
     bool debug_mode = false;
-    int debug_start = 10414914;  // Set to -1 to disable
-    int debug_svlen_min = 15000;
-    int debug_svlen_max = 16000;
     SVType debug_sv_type = SVType::INV;
 
     // Cluster SVs using DBSCAN for each SV type
@@ -125,30 +122,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
             int cluster_id = cluster.first;
             std::vector<SVCall>& cluster_sv_calls = cluster.second;
 
-            // Continue unless the debug SV call is in the cluster
-            // if (debug_mode && cluster_id >= 0) {
-            //     if (!cluster_sv_calls.empty() &&
-            //         std::any_of(cluster_sv_calls.begin(), cluster_sv_calls.end(),
-            //             [debug_start, debug_sv_type, debug_svlen_min, debug_svlen_max](const SVCall& sv_call) {
-            //                 const int len = std::abs(static_cast<int>(sv_call.end - sv_call.start));
-
-            //                 const bool start_ok = (debug_start < 0 || static_cast<int>(sv_call.start) == debug_start);
-
-            //                 const bool len_ok = (debug_svlen_min == -1 || len >= debug_svlen_min) &&
-            //                                     (debug_svlen_max == -1 || len <= debug_svlen_max);
-
-            //                 const bool type_ok = (debug_sv_type == SVType::UNKNOWN || sv_call.sv_type == debug_sv_type);
-
-            //                 return start_ok && len_ok && type_ok;
-            //             }
-            //         )) {
-            //         DEBUG_PRINT("DEBUG: Found SV call in noise cluster " + std::to_string(cluster_id) + " with type " + getSVTypeString(debug_sv_type));
-
-            //     } else {
-            //         continue;
-            //     }
-            // }
-
             // Continue if fewer than 2 SV calls in the cluster (due to CIGARCLIP filter)
             if (cluster_sv_calls.size() < 2) {
                 continue;
@@ -278,7 +251,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
         if (debug_mode && sv_type == debug_sv_type) {
             DEBUG_PRINT("DEBUG: Merged SV calls for " + getSVTypeString(sv_type) + ":");
             for (const auto& sv_call : merged_sv_type_calls) {
-                // if ((int)sv_call.start == debug_start) {
                 if ((sv_call.end - sv_call.start) > 10000) {
                     DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) +
                                 ", type: " + getSVTypeString(sv_call.sv_type) +
@@ -288,51 +260,6 @@ void mergeSVs(std::vector<SVCall>& sv_calls, double epsilon, int min_pts, bool k
                 }
             }
         }
-
-        /*
-        // Merge overlapping SVs by SV length
-        std::sort(merged_sv_type_calls.begin(), merged_sv_type_calls.end(), [](const SVCall& a, const SVCall& b) {
-            return a.start < b.start || (a.start == b.start && a.end < b.end);
-        });
-        std::vector<SVCall> merged_sv_calls_final;
-        for (size_t i = 0; i < merged_sv_type_calls.size(); i++) {
-            SVCall& sv_call = merged_sv_type_calls[i];
-
-            // Merge cluster sizes if they overlap
-            if (i > 0 && sv_call.start <= merged_sv_type_calls[i - 1].end) {
-                // Keep the larger SV call (end - start) if they overlap
-                if ((sv_call.end - sv_call.start) > (merged_sv_type_calls[i - 1].end - merged_sv_type_calls[i - 1].start)) {
-                    merged_sv_type_calls[i - 1] = sv_call; // Replace the previous SV call with the current one
-                }
-                // Keep the larger cluster size
-                // if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) {
-                //     merged_sv_calls_final.push_back(sv_call);
-                // }
-            } else {
-                merged_sv_calls_final.push_back(sv_call);
-            }
-        }
-        DEBUG_PRINT("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls");
-        
-        // Print merged SV calls for debugging
-        if (debug_mode) {
-            DEBUG_PRINT("DEBUG: Final merged SV calls for " + getSVTypeString(sv_type) + ":");
-            for (const auto& sv_call : merged_sv_calls_final) {
-                // if ((int)sv_call.start == debug_start) {
-                if (sv_call.sv_type == SVType::DUP) {
-                    DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) +
-                                ", type: " + getSVTypeString(sv_call.sv_type) +
-                                ", length: " + std::to_string(sv_call.end - sv_call.start) +
-                                ", cluster size: " + std::to_string(sv_call.cluster_size) +
-                                ", likelihood: " + std::to_string(sv_call.hmm_likelihood));
-                }
-            }
-        }
-
-        // Insert merged SV calls into the final list
-        merged_sv_calls.insert(merged_sv_calls.end(),
-        merged_sv_calls_final.begin(), merged_sv_calls_final.end());
-        */
         merged_sv_calls.insert(merged_sv_calls.end(),
                                merged_sv_type_calls.begin(), merged_sv_type_calls.end());
     }

From f4f964b3eb687c562bb80cd9de85c335ef6a6ad9 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 13:44:52 -0400
Subject: [PATCH 122/134] update environment

---
 environment.yml | 1 +
 src/main.cpp    | 1 +
 2 files changed, 2 insertions(+)

diff --git a/environment.yml b/environment.yml
index 1dd41ce2..f99a29cd 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,5 +1,6 @@
 name: contextsv
 channels:
+  - defaults
   - bioconda
   - conda-forge
 dependencies:
diff --git a/src/main.cpp b/src/main.cpp
index 5425a7c1..874f444f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -141,6 +141,7 @@ void printUsage(const std::string& programName) {
                 << "     --min-pts-pct <min_pts_pct> Percentage of mean chr. coverage to use for DBSCAN minimum points\n"
                 << "  -e, --eth <eth_file>          ETH file\n"
                 << "  -p, --pfb <pfb_file>          PFB file\n"
+                << "     --assembly-gaps <gaps_file> Assembly gaps file\n"
                 << "     --save-cnv                 Save CNV data\n"
                 << "     --debug                    Debug mode with verbose logging\n"
                 << "     --version                  Print version and exit\n"

From 94ae67dc6b136eaff1860395cee1eb1bb7da2632 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:17:00 -0400
Subject: [PATCH 123/134] update python version

---
 environment.yml | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/environment.yml b/environment.yml
index f99a29cd..867f41a4 100644
--- a/environment.yml
+++ b/environment.yml
@@ -1,10 +1,9 @@
 name: contextsv
 channels:
-  - defaults
   - bioconda
   - conda-forge
 dependencies:
-  - python=3.11
-  - numpy=1.26
+  - python=3.10
+  - numpy
   - htslib
   - pytest

From 8ff85bf48122be52c7946f5aa7157580fd6aca49 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:20:47 -0400
Subject: [PATCH 124/134] update build yml

---
 .github/workflows/build-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 11dc3093..3ba12930 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -28,7 +28,7 @@ jobs:
       with:
         activate-environment: contextsv
         environment-file: environment.yml
-        python-version: 3.9
+        python-version: 3.10
         auto-activate-base: false
 
     - name: Install samtools and bcftools using sudo apt-get

From d13f68caff5e2f6806538732f33882e10f5ca73a Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:29:07 -0400
Subject: [PATCH 125/134] update build yml

---
 .github/workflows/build-tests.yml | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 3ba12930..ca2fac4f 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -23,18 +23,23 @@ jobs:
       shell: bash --login {0}
       run: unzip TestData.zip
 
-    - name: Set up conda environment
+    - name: Set up conda (miniconda)
       uses: conda-incubator/setup-miniconda@v2
       with:
-        activate-environment: contextsv
-        environment-file: environment.yml
-        python-version: 3.10
-        auto-activate-base: false
+          auto-activate-base: true
+          miniforge-variant: Mambaforge
+          miniforge-version: latest
+          use-mamba: true  # Use mamba natively
 
-    - name: Install samtools and bcftools using sudo apt-get
+    - name: Configure conda channels and create environment with mamba
+      shell: bash --login {0}
       run: |
-        sudo apt-get update
-        sudo apt-get install -y samtools bcftools
+        conda config --remove channels defaults || true
+        conda config --add channels conda-forge
+        conda config --add channels bioconda
+        conda config --set channel_priority strict
+        mamba env create -f environment.yml
+        conda activate contextsv
 
     - name: Build C++ code
       shell: bash --login {0}  # --login enables PATH variable access

From 05ff3f8a78eb8690f107da135e6b28741102487c Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:33:22 -0400
Subject: [PATCH 126/134] update build yml

---
 .github/workflows/build-tests.yml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index ca2fac4f..25eebe21 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -26,10 +26,10 @@ jobs:
     - name: Set up conda (miniconda)
       uses: conda-incubator/setup-miniconda@v2
       with:
-          auto-activate-base: true
-          miniforge-variant: Mambaforge
+          use-mamba: true
+          miniforge-variant: Miniforge
           miniforge-version: latest
-          use-mamba: true  # Use mamba natively
+          auto-activate-base: true
 
     - name: Configure conda channels and create environment with mamba
       shell: bash --login {0}

From 562a040070ef22b53477456718ef2b284363c0fb Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:38:40 -0400
Subject: [PATCH 127/134] update build yml

---
 .github/workflows/build-tests.yml | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 25eebe21..e269e900 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -23,23 +23,22 @@ jobs:
       shell: bash --login {0}
       run: unzip TestData.zip
 
-    - name: Set up conda (miniconda)
+    - name: Set up conda environment
       uses: conda-incubator/setup-miniconda@v2
       with:
-          use-mamba: true
-          miniforge-variant: Miniforge
-          miniforge-version: latest
-          auto-activate-base: true
+        activate-environment: contextsv
+        environment-file: environment.yml
+        python-version: 3.10
+        auto-activate-base: false
 
-    - name: Configure conda channels and create environment with mamba
-      shell: bash --login {0}
+    - name: Configure conda channels (remove defaults)
+      shell: bash -l {0}
       run: |
-        conda config --remove channels defaults || true
+        conda config --remove channels defaults
         conda config --add channels conda-forge
         conda config --add channels bioconda
         conda config --set channel_priority strict
-        mamba env create -f environment.yml
-        conda activate contextsv
+        conda info
 
     - name: Build C++ code
       shell: bash --login {0}  # --login enables PATH variable access

From 3b1f02a760b3797606cce4a37fe6f6c6a483330e Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:45:54 -0400
Subject: [PATCH 128/134] build yml update

---
 .github/workflows/build-tests.yml | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index e269e900..52b9cca1 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -23,22 +23,21 @@ jobs:
       shell: bash --login {0}
       run: unzip TestData.zip
 
-    - name: Set up conda environment
+    - name: Set up conda (Miniconda only)
       uses: conda-incubator/setup-miniconda@v2
       with:
-        activate-environment: contextsv
-        environment-file: environment.yml
+        auto-activate-base: true
         python-version: 3.10
-        auto-activate-base: false
 
-    - name: Configure conda channels (remove defaults)
+    - name: Configure conda channels and create environment
       shell: bash -l {0}
       run: |
-        conda config --remove channels defaults
+        conda config --remove channels defaults || true
         conda config --add channels conda-forge
         conda config --add channels bioconda
         conda config --set channel_priority strict
-        conda info
+        conda info  # confirm the change
+        conda env create -f environment.yml
 
     - name: Build C++ code
       shell: bash --login {0}  # --login enables PATH variable access

From b58db5116bcf8d2b7283ed89ab62b7c40db317e5 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:50:02 -0400
Subject: [PATCH 129/134] update build yml

---
 .github/workflows/build-tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 52b9cca1..f3358d34 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -27,7 +27,6 @@ jobs:
       uses: conda-incubator/setup-miniconda@v2
       with:
         auto-activate-base: true
-        python-version: 3.10
 
     - name: Configure conda channels and create environment
       shell: bash -l {0}

From d4b6c0c63723db4482667b5b25d72062e1dc738e Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 14:53:55 -0400
Subject: [PATCH 130/134] set build env

---
 .github/workflows/build-tests.yml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index f3358d34..274ce4ae 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -41,10 +41,14 @@ jobs:
     - name: Build C++ code
       shell: bash --login {0}  # --login enables PATH variable access
       run: |
+        source $(conda info --base)/etc/profile.d/conda.sh
+        conda activate contextsv
         make
 
     - name: Run unit tests
       shell: bash --login {0}
       run: |
+        source $(conda info --base)/etc/profile.d/conda.sh
+        conda activate contextsv
         mkdir -p tests/output
         python -m pytest -s -v tests/test_general.py

From a93076af17c15f35c39cd512e1b57331083d8427 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 15:04:24 -0400
Subject: [PATCH 131/134] update build yml

---
 .github/workflows/build-tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 274ce4ae..5100a6ad 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -43,7 +43,7 @@ jobs:
       run: |
         source $(conda info --base)/etc/profile.d/conda.sh
         conda activate contextsv
-        make
+        make CONDA_PREFIX=$CONDA_PREFIX
 
     - name: Run unit tests
       shell: bash --login {0}

From 82ad9b231c930b82538c098ae88f3c89bf9adee8 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 15:36:00 -0400
Subject: [PATCH 132/134] htslib debug output

---
 .github/workflows/build-tests.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 5100a6ad..79d9bdd2 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -43,6 +43,8 @@ jobs:
       run: |
         source $(conda info --base)/etc/profile.d/conda.sh
         conda activate contextsv
+        echo "CONDA_PREFIX=$CONDA_PREFIX"
+        ls -l $CONDA_PREFIX/include/htslib
         make CONDA_PREFIX=$CONDA_PREFIX
 
     - name: Run unit tests

From 7e2b09f062784823e7e4cd87fac7b2a6b1902510 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 15:39:14 -0400
Subject: [PATCH 133/134] update integer type

---
 include/sv_caller.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/include/sv_caller.h b/include/sv_caller.h
index 997603ef..a0883e6c 100644
--- a/include/sv_caller.h
+++ b/include/sv_caller.h
@@ -22,8 +22,8 @@ class SVCaller {
     private:
         struct GenomicRegion {
             int tid;
-            hts_pos_t start;
-            hts_pos_t end;
+            int start;
+            int end;
             int query_start;
             int query_end;
             bool strand;
@@ -31,8 +31,8 @@ class SVCaller {
         };
 
         struct PrimaryAlignment {
-            hts_pos_t start;
-            hts_pos_t end;
+            int start;
+            int end;
             int query_start;
             int query_end;
             bool strand;
@@ -41,8 +41,8 @@ class SVCaller {
 
         struct SuppAlignment {
             int tid;
-            hts_pos_t start;
-            hts_pos_t end;
+            int start;
+            int end;
             int query_start;
             int query_end;
             bool strand;
@@ -50,18 +50,18 @@ class SVCaller {
 
         struct SplitSignature {
             int tid;
-            hts_pos_t start;
-            hts_pos_t end;
+            int start;
+            int end;
             bool strand;
-            hts_pos_t query_start;
-            hts_pos_t query_end;
+            int query_start;
+            int query_end;
         };
 
         // Interval Tree Node
         struct IntervalNode {
             PrimaryAlignment region;
             std::string qname;
-            hts_pos_t max_end;  // To optimize queries
+            int max_end;  // To optimize queries
             std::unique_ptr<IntervalNode> left;
             std::unique_ptr<IntervalNode> right;
 

From be814572c53703d6a036e56d5c733c93d7fbbc29 Mon Sep 17 00:00:00 2001
From: jonperdomo <jonperdomodb@gmail.com>
Date: Fri, 1 Aug 2025 15:49:47 -0400
Subject: [PATCH 134/134] update unit test

---
 .github/workflows/build-tests.yml | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml
index 79d9bdd2..5c4bbb13 100644
--- a/.github/workflows/build-tests.yml
+++ b/.github/workflows/build-tests.yml
@@ -52,5 +52,11 @@ jobs:
       run: |
         source $(conda info --base)/etc/profile.d/conda.sh
         conda activate contextsv
-        mkdir -p tests/output
-        python -m pytest -s -v tests/test_general.py
+        ./build/contextsv --version
+        ./build/contextsv --help
+
+      # run: |
+      #   source $(conda info --base)/etc/profile.d/conda.sh
+      #   conda activate contextsv
+      #   mkdir -p tests/output
+      #   python -m pytest -s -v tests/test_general.py