From 253e9719863b67a8f7a3265c54029f181ec76901 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 30 Oct 2024 13:20:26 -0400 Subject: [PATCH 001/134] Merging and batch processing improvements --- python/sv_merger.py | 69 ++++++++++++++++++++++++++++++++------------- src/contextsv.cpp | 9 +++--- src/sv_caller.cpp | 42 ++++++++++++++++----------- src/sv_data.cpp | 28 ++++++++++++++---- 4 files changed, 101 insertions(+), 47 deletions(-) diff --git a/python/sv_merger.py b/python/sv_merger.py index 2d0027b1..56c0ae26 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -89,6 +89,12 @@ def update_support(record, cluster_size): return record +def weighted_score(read_support, hmm_score, sv_len, weight_hmm, weight_sv_len): + """ + Calculate a weighted score based on read support and HMM score. + """ + return (1 - weight_hmm) * read_support + weight_hmm * hmm_score + def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): """ Cluster SV breakpoints using HDBSCAN. @@ -131,22 +137,24 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): cluster_labels = [] # dbscan = DBSCAN(eps=30000, min_samples=3) - dbscan = HDBSCAN(min_cluster_size=cluster_size_min, min_samples=3) + logging.info("Clustering %d SV breakpoints with parameters: min_cluster_size=%d", len(breakpoints), cluster_size_min) + dbscan = HDBSCAN(min_cluster_size=cluster_size_min, min_samples=2) if len(breakpoints) > 0: logging.info("Clustering %d SV breakpoints...", len(breakpoints)) cluster_labels = dbscan.fit_predict(breakpoints) logging.info("Label counts: %d", len(np.unique(cluster_labels))) - # Set all 0 values to NaN - hmm_scores[hmm_scores == 0] = np.nan - # Merge SVs with the same label unique_labels = np.unique(cluster_labels) + logging.info("Unique labels: %s", unique_labels) + for label in unique_labels: # Skip label -1 (outliers) - if label == -1: + # if label == -1: + # Skip label -1 (outliers) only if there are no other clusters + if label == -1 and len(unique_labels) > 1: # # 
Print the positions if any are within a certain range # pos_min = 180915940 # pos_max = 180950356 @@ -171,9 +179,10 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): idx = cluster_labels == label # Get HMM and read support values for the cluster - max_score_idx = 0 # Default to the first SV in the cluster + # max_score_idx = 0 # Default to the first SV in the cluster cluster_hmm_scores = np.array(hmm_scores[idx]) cluster_depth_scores = np.array(sv_support[idx]) + cluster_sv_lengths = np.array(breakpoints[idx][:, 1] - breakpoints[idx][:, 0] + 1) max_hmm = None max_support = None max_hmm_idx = None @@ -189,20 +198,40 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): max_support_idx = np.argmax(cluster_depth_scores) max_support = cluster_depth_scores[max_support_idx] - # For deletions, choose the SV with the highest HMM score if available - if sv_type == 'DEL': - if max_hmm is not None: - max_score_idx = max_hmm_idx - elif max_support is not None: - max_score_idx = max_support_idx - - # For insertions and duplications, choose the SV with the highest read - # support if available - elif sv_type == 'INS/DUP': - if max_support is not None: - max_score_idx = max_support_idx - elif max_hmm is not None: - max_score_idx = max_hmm_idx + # Use a weighted approach to choose the best SV based on HMM and + # support. Deletions have higher priority for HMM scores, while + # insertions and duplications have higher priority for read alignment + # support. 
+ # hmm_weight = 0.7 if sv_type == 'DEL' else 0.3 + hmm_weight = 0.4 + sv_len_weight = 0.4 + max_score_idx = 0 # Default to the first SV in the cluster + max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], cluster_sv_lengths[max_score_idx], hmm_weight, sv_len_weight) + for k, hmm_loglh in enumerate(cluster_hmm_scores): + sv_len = cluster_sv_lengths[k] / 1000 # Normalize SV length to kilobases + read_support = cluster_depth_scores[k] + score = weighted_score(read_support, hmm_loglh, sv_len, hmm_weight, sv_len_weight) + if score > max_score: + max_score = score + max_score_idx = k + + # Get the VCF record with the highest depth score + max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :] + + # # For deletions, choose the SV with the highest HMM score if available + # if sv_type == 'DEL': + # if max_hmm is not None: + # max_score_idx = max_hmm_idx + # elif max_support is not None: + # max_score_idx = max_support_idx + + # # For insertions and duplications, choose the SV with the highest read + # # support if available + # elif sv_type == 'INS/DUP': + # if max_support is not None: + # max_score_idx = max_support_idx + # elif max_hmm is not None: + # max_score_idx = max_hmm_idx # Get the VCF record with the highest depth score max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :] diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 47d68054..0a502881 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -32,7 +32,7 @@ int ContextSV::run() SVData sv_calls = sv_caller.run(); // Print the total number of SVs called - std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl; + // std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl; // Write SV calls to file std::string output_dir = this->input_data->getOutputDir(); @@ -40,9 +40,10 @@ int ContextSV::run() sv_calls.saveToVCF(ref_genome, output_dir); // Format and print the time taken to call SVs - auto end_sv = 
std::chrono::high_resolution_clock::now(); - std::string elapsed_time = getElapsedTime(start_sv, end_sv); - std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. Time taken (h:m:s) = " << elapsed_time << std::endl; + // auto end_sv = std::chrono::high_resolution_clock::now(); + // std::string elapsed_time = getElapsedTime(start_sv, end_sv); + std::cout << "SV calling complete." << std::endl; + // std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. Time taken (h:m:s) = " << elapsed_time << std::endl; return 0; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 179a5360..76011de9 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -411,12 +411,13 @@ SVData SVCaller::run() } int chr_count = chromosomes.size(); - // Loop through each region and detect SVs + // Loop through each region and detect SVs (Note: The main loop is + // single-threaded) std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl; + int chunk_count = 100; // Number of chunks to split the chromosome into int region_count = 0; auto start1 = std::chrono::high_resolution_clock::now(); SVData sv_calls; - int chunk_count = 10000; // Number of chunks to split the chromosome into int min_cnv_length = this->input_data->getMinCNVLength(); for (const auto& chr : chromosomes) { std::cout << "Running SV detection for chromosome " << chr << "..." << std::endl; @@ -433,10 +434,14 @@ SVData SVCaller::run() // Use one chunk for the region std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); region_chunks.push_back(chunk); + std::cout << "Using specified region " << chunk << "..." 
<< std::endl; } else { int chr_len = this->input_data->getRefGenomeChromosomeLength(chr); - int chunk_size = chr_len / chunk_count; + std::cout << "Chromosome length: " << chr_len << std::endl; + std::cout << "Chunk count: " << chunk_count << std::endl; + int chunk_size = std::ceil((double)chr_len / chunk_count); + std::cout << "Chunk size: " << chunk_size << std::endl; for (int i = 0; i < chunk_count; i++) { int start = i * chunk_size + 1; // 1-based int end = start + chunk_size; @@ -446,6 +451,7 @@ SVData SVCaller::run() std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end); region_chunks.push_back(chunk); } + std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks." << std::endl; } // Load chromosome data for copy number predictions @@ -494,8 +500,10 @@ SVData SVCaller::run() // std::cout << "Extracted aligments for " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl; } - auto end1 = std::chrono::high_resolution_clock::now(); - std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl; + // auto end1 = std::chrono::high_resolution_clock::now(); + std::cout << "SV calling completed." << std::endl; + // int total_sv_calls = sv_calls.totalCalls(); + // std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). 
Elapsed time: " << getElapsedTime(start1, end1) << std::endl; return sv_calls; } @@ -582,12 +590,12 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::vector> sv_list; // SV candidate and alignment type // Use the gap ends as the SV endpoints - if (primary_start - supp_end >= min_cnv_length) { - SVCandidate sv_candidate(supp_end+1, primary_start+1, "."); - std::pair sv_pair(sv_candidate, "GAPINNER_A"); - sv_list.push_back(sv_pair); - sv_count++; - } + // if (primary_start - supp_end >= min_cnv_length) { + // SVCandidate sv_candidate(supp_end+1, primary_start+1, "."); + // std::pair sv_pair(sv_candidate, "GAPINNER_A"); + // sv_list.push_back(sv_pair); + // sv_count++; + // } // Also use the alignment ends as the SV endpoints if (primary_end - supp_start >= min_cnv_length) { @@ -608,12 +616,12 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::vector> sv_list; // SV candidate and alignment type // Use the gap ends as the SV endpoints - if (supp_start - primary_end >= min_cnv_length) { - SVCandidate sv_candidate(primary_end+1, supp_start+1, "."); - std::pair sv_pair(sv_candidate, "GAPINNER_B"); - sv_list.push_back(sv_pair); - sv_count++; - } + // if (supp_start - primary_end >= min_cnv_length) { + // SVCandidate sv_candidate(primary_end+1, supp_start+1, "."); + // std::pair sv_pair(sv_candidate, "GAPINNER_B"); + // sv_list.push_back(sv_pair); + // sv_count++; + // } // Also use the alignment ends as the SV endpoints if (supp_end - primary_start >= min_cnv_length) { diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 96e5a2fd..37ff32c3 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -133,12 +133,31 @@ int SVData::getClippedBaseSupport(std::string chr, int64_t pos, int64_t end) void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) { // Create a VCF writer + std::cout << "Creating VCF writer..." 
<< std::endl; std::string output_vcf = output_dir + "/output.vcf"; VcfWriter vcf_writer(output_vcf); + std::cout << "Writing VCF file to " << output_vcf << std::endl; // Set the sample name std::string sample_name = "SAMPLE"; + std::cout << "Getting reference genome filepath..." << std::endl; + try { + std::string ref_fp = ref_genome.getFilepath(); + std::cout << "Reference genome filepath: " << ref_fp << std::endl; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return; + } + + std::cout << "Getting reference genome header..." << std::endl; + try { + ref_genome.getContigHeader(); + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return; + } + // Set the header lines std::vector header_lines = { std::string("##reference=") + ref_genome.getFilepath(), @@ -159,6 +178,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) }; // Write the header lines + std::cout << "Writing VCF header..." << std::endl; vcf_writer.writeHeader(header_lines); // Save the SV calls @@ -251,12 +271,6 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) alt_allele = ""; // Set the repeat type as an interspersed duplication - repeat_type = "INTERSPERSED"; - } else if (sv_type == TANDUP) { - // Use a symbolic allele for tandem duplications - alt_allele = ""; - - // Set the repeat type repeat_type = "TANDEM"; } } @@ -301,10 +315,12 @@ std::set SVData::getChromosomes() int SVData::totalCalls() { + std::cout << "Calculating total SV calls..." 
<< std::endl; int sv_calls = 0; for (auto const& sv_call : this->sv_calls) { sv_calls += sv_call.second.size(); } + std::cout << "Total SV calls: " << sv_calls << std::endl; return sv_calls; } From e7211ed6ad84f2c0d31aa6666958a7638d489bf1 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 31 Oct 2024 14:48:57 -0400 Subject: [PATCH 002/134] Add inversion detection --- include/cnv_caller.h | 6 ++--- include/sv_caller.h | 4 +-- include/sv_types.h | 4 +-- src/cnv_caller.cpp | 32 ++++++++++++++++------- src/sv_caller.cpp | 59 ++++++++++++++++++++++++++++++++++--------- src/sv_data.cpp | 6 ++--- tests/test_general.py | 2 +- 7 files changed, 81 insertions(+), 32 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index c913c24b..7b52e05e 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -83,8 +83,8 @@ class CNVCaller { {0, sv_types::UNKNOWN}, {1, sv_types::DEL}, {2, sv_types::DEL}, - {3, sv_types::UNKNOWN}, - {4, sv_types::UNKNOWN}, + {3, sv_types::NEUTRAL}, + {4, sv_types::NEUTRAL}, {5, sv_types::DUP}, {6, sv_types::DUP} }; @@ -125,7 +125,7 @@ class CNVCaller { // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); - void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector>& sv_list, std::string chr); + void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector>& sv_list, std::string chr, bool inversion); // Calculate the mean chromosome coverage double calculateMeanChromosomeCoverage(std::string chr); diff --git a/include/sv_caller.h b/include/sv_caller.h index c461d101..ed11d08f 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -17,8 +17,8 @@ /// @endcond // SV candidate alignment data (chr, start, end, sequence, query start, query -// end, mismatch map) -using AlignmentData = std::tuple>; +// end, mismatch map, strand) +using AlignmentData = std::tuple, bool>; 
using AlignmentVector = std::vector; // Query map (query name, alignment vector) diff --git a/include/sv_types.h b/include/sv_types.h index 7e002777..af82ffec 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -16,11 +16,11 @@ namespace sv_types { static const int INV = 2; static const int INS = 3; static const int BND = 4; - static const int TANDUP = 5; // Tandem duplication + static const int NEUTRAL = 5; // Neutral copy number with unknown type static const int UNKNOWN = -1; // Define SVTypeString for SV types - static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "DUP"}; + static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT"}; // Create a struct for storing SV information struct SVInfo { diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index ca0588a2..255b1ba6 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -35,6 +35,10 @@ using namespace sv_types; std::pair, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp_data) { int data_count = (int) snp_data.pos.size(); + if (data_count == 0) + { + throw std::runtime_error("Error: No SNP data found for Viterbi algorithm."); + } std::lock_guard lock(this->hmm_mtx); // Lock the mutex for the HMM std::pair, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb); return state_sequence; @@ -47,12 +51,6 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star bool snps_found = false; int window_size = this->input_data->getWindowSize(); - // std::cout << "Querying SNPs for region " << chr << ":" << start_pos << - // "-" << end_pos << "..." 
<< std::endl; - // TEST - if (start_pos == 43593639 && end_pos == 43608172) { - printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); - } // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); for (int64_t i = start_pos; i <= end_pos; i += window_size) { @@ -117,7 +115,7 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star return std::make_pair(snp_data, snps_found); } -void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector> &sv_list, std::string chr) +void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector> &sv_list, std::string chr, bool inversion) { // Throw an error if there are more than two SV candidates if (sv_list.size() > 2) { @@ -155,6 +153,21 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector< aln_type += "_NOSNPS"; } + // Update the SV type if inversion is detected and the best CNV type is + // copy neutral + if (inversion && (best_cnv_type == sv_types::NEUTRAL)) + { + best_cnv_type = sv_types::INV; + printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); + } + + // If the dummy call was used, then throw an error if the best SV type + // is unknown + if (std::get<0>(best_sv_candidate) == 0 && std::get<1>(best_sv_candidate) == 0) + { + throw std::runtime_error("Error: No valid SV type found for copy number prediction."); + } + // Add the SV call to the main SV data sv_calls.add(chr, start_pos, end_pos, best_cnv_type, ".", aln_type, best_genotype, best_likelihood); } @@ -204,7 +217,7 @@ std::tuple CNVCaller::runCopyNumberPredicti // Query the SNP region for the SV candidate std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov); - SNPData sv_snps = snp_call.first; + SNPData& sv_snps = 
snp_call.first; bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm @@ -264,7 +277,8 @@ std::tuple CNVCaller::runCopyNumberPredicti // Save the SV calls as a TSV file if enabled int64_t sv_start_pos = std::get<0>(best_pos); int64_t sv_end_pos = std::get<1>(best_pos); - if (this->input_data->getSaveCNVData() && predicted_cnv_type != sv_types::UNKNOWN && (sv_end_pos - sv_start_pos) > 10000) + bool copy_number_change = (predicted_cnv_type != sv_types::UNKNOWN && predicted_cnv_type != sv_types::NEUTRAL); + if (this->input_data->getSaveCNVData() && copy_number_change && (sv_end_pos - sv_start_pos) > 10000) { std::string cnv_type_str = SVTypeString[predicted_cnv_type]; std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) sv_start_pos) + "-" + std::to_string((int) sv_end_pos) + "_SPLITALN.tsv"; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 76011de9..f800d1c3 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -89,6 +89,9 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) int64_t start = bam1->core.pos; int64_t end = bam_endpos(bam1); // This is the first position after the alignment + // Get the strand + bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + // Call SVs directly from the CIGAR string std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); std::unordered_map match_map = std::get<0>(query_info); @@ -96,7 +99,7 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) int32_t query_end = std::get<2>(query_info); // Add the primary alignment to the map - AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map); + AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand); primary_alignments[qname] = std::move(alignment); // Process supplementary alignments @@ -107,6 +110,9 @@ RegionData SVCaller::detectSVsFromRegion(std::string 
region) int32_t start = bam1->core.pos; int32_t end = bam_endpos(bam1); + // Get the strand + bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + // Get CIGAR string information, but don't call SVs std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); const std::unordered_map& match_map = std::get<0>(query_info); @@ -114,7 +120,7 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) int32_t query_end = std::get<2>(query_info); // Add the supplementary alignment to the map - AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map)); + AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand); supplementary_alignments[qname].emplace_back(alignment); // If Read ID == 8873acc1-eb84-415d-8557-a32a8f52ccee, print the @@ -438,10 +444,10 @@ SVData SVCaller::run() } else { int chr_len = this->input_data->getRefGenomeChromosomeLength(chr); - std::cout << "Chromosome length: " << chr_len << std::endl; - std::cout << "Chunk count: " << chunk_count << std::endl; + // std::cout << "Chromosome length: " << chr_len << std::endl; + // std::cout << "Chunk count: " << chunk_count << std::endl; int chunk_size = std::ceil((double)chr_len / chunk_count); - std::cout << "Chunk size: " << chunk_size << std::endl; + // std::cout << "Chunk size: " << chunk_size << std::endl; for (int i = 0; i < chunk_count; i++) { int start = i * chunk_size + 1; // 1-based int end = start + chunk_size; @@ -451,7 +457,7 @@ SVData SVCaller::run() std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end); region_chunks.push_back(chunk); } - std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks." << std::endl; + std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks of size " << chunk_size << "..." 
<< std::endl; } // Load chromosome data for copy number predictions @@ -524,6 +530,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map int32_t primary_query_start = std::get<4>(primary_alignment); int32_t primary_query_end = std::get<5>(primary_alignment); std::unordered_map primary_match_map = std::get<6>(primary_alignment); + bool primary_strand = std::get<7>(primary_alignment); // Loop through the supplementary alignments and find gaps and overlaps AlignmentVector supp_alignments = supp_map[qname]; @@ -543,9 +550,9 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map int32_t supp_query_start = std::get<4>(supp_alignment); int32_t supp_query_end = std::get<5>(supp_alignment); std::unordered_map supp_match_map = std::get<6>(supp_alignment); + bool supp_strand = std::get<7>(supp_alignment); - // Determine if there is overlap between the primary and - // supplementary query sequences + // Resolve overlaps between the primary and supplementary query sequences int32_t overlap_start = std::max(primary_query_start, supp_query_start); int32_t overlap_end = std::min(primary_query_end, supp_query_end); int32_t overlap_length = overlap_end - overlap_start; @@ -582,8 +589,36 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map } } - // Gap analysis (deletion or duplication) - if (supp_start < primary_start && supp_end < primary_start) { + // [1] Inversion detection from primary and supplementary alignments + // on opposite strands + if (primary_strand != supp_strand) { + // std::cout << "Inversion detected for read " << qname << std::endl; + // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl; + // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl; + + std::vector> sv_list; // SV candidate and alignment type + + // Use the supplementary alignment coordinates as the SV + // endpoints + if (supp_end - 
supp_start >= min_cnv_length) { + SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); + std::pair sv_pair(sv_candidate, "INVERSION"); + sv_list.push_back(sv_pair); + sv_count++; + // SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); + // std::pair sv_pair(sv_candidate, "INVERSION"); + // sv_list.push_back(sv_pair); + // sv_count++; + } + + // Determine which SV to keep based on HMM prediction likelihood + if (sv_list.size() > 0) { + cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, true); + } + } + + // [2] CNV detection based on primary and supplementary alignment boundaries + else if (supp_start < primary_start && supp_end < primary_start) { // Gap with supplementary before primary: // [supp_start] [supp_end] -- [primary_start] [primary_end] @@ -607,7 +642,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Determine which SV to keep based on HMM prediction likelihood if (sv_list.size() > 0) { - cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr); + cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false); } } else if (supp_start > primary_end && supp_end > primary_end) { @@ -633,7 +668,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Determine which SV to keep based on HMM prediction likelihood if (sv_list.size() > 0) { - cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr); + cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false); } } } diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 37ff32c3..2ade74c6 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -50,7 +50,7 @@ int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::s // For insertions and duplications, the SV length is the length of the // inserted sequence, not including the insertion position int sv_length = 0; - if (sv_type == INS || sv_type == DUP || sv_type == 
TANDUP) { + if (sv_type == INS || sv_type == DUP) { sv_length = end - start; } else { // For deletions, the SV length is the length of the deletion @@ -216,7 +216,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) int64_t end = std::get<1>(candidate); // If the SV type is unknown, skip it - if (sv_type == UNKNOWN) { + if (sv_type == UNKNOWN || sv_type == NEUTRAL) { skip_count += 1; continue; } @@ -247,7 +247,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) pos = preceding_pos; // Duplications and insertions - } else if (sv_type == INS || sv_type == DUP || sv_type == TANDUP) { + } else if (sv_type == INS || sv_type == DUP) { // Use the preceding base as the reference allele int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); diff --git a/tests/test_general.py b/tests/test_general.py index 499805c8..9083eb6a 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 21 + assert len(f.readlines()) == 23 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. 
From 71e375e36baefd2672a20743b9da3ac511528e0f Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 09:13:32 -0400 Subject: [PATCH 003/134] Fix vcf writer error and add invdup --- .gitignore | 3 +++ include/sv_data.h | 16 ++++++++-------- include/sv_types.h | 43 +++++++++++++++++++++++++++++++++---------- include/vcf_writer.h | 4 +--- src/cnv_caller.cpp | 16 ++++++++++++---- src/contextsv.cpp | 2 ++ src/sv_caller.cpp | 18 ++++++++++++++++++ src/sv_data.cpp | 19 +++++++++++++------ src/vcf_writer.cpp | 20 +++++++++----------- 9 files changed, 99 insertions(+), 42 deletions(-) diff --git a/.gitignore b/.gitignore index 50575520..c627421d 100644 --- a/.gitignore +++ b/.gitignore @@ -84,3 +84,6 @@ data/hg19ToHg38.over.chain.gz # Test images python/dbscan_clustering*.png python/dist_plots + +# Temporary files +lib/.nfs* diff --git a/include/sv_data.h b/include/sv_data.h index 414d2eda..44a45590 100644 --- a/include/sv_data.h +++ b/include/sv_data.h @@ -24,14 +24,14 @@ class SVData { std::map, int> clipped_base_support; // SV type to string map for VCF output - std::map sv_type_map = { - {0, "DEL"}, - {1, "DUP"}, - {2, "INV"}, - {3, "INS"}, - {4, "BND"}, - {5, "DUP"} - }; + // std::map sv_type_map = { + // {0, "DEL"}, + // {1, "DUP"}, + // {2, "INV"}, + // {3, "INS"}, + // {4, "BND"}, + // {5, "DUP"} + // }; public: SVData() {}; diff --git a/include/sv_types.h b/include/sv_types.h index af82ffec..2d4573bf 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -10,17 +10,40 @@ /// @endcond namespace sv_types { + // Define constants for SV types - static const int DEL = 0; - static const int DUP = 1; - static const int INV = 2; - static const int INS = 3; - static const int BND = 4; - static const int NEUTRAL = 5; // Neutral copy number with unknown type - static const int UNKNOWN = -1; - - // Define SVTypeString for SV types - static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT"}; + enum SVType { + UNKNOWN = -1, + DEL = 0, + 
DUP = 1, + INV = 2, + INS = 3, + BND = 4, + NEUTRAL = 5, // Neutral copy number with unknown type + INV_DUP = 6 // Inversion duplication + }; + + // Mapping of SV types to strings + const std::unordered_map SVTypeString = { + {DEL, "DEL"}, + {DUP, "DUP"}, + {INV, "INV"}, + {INS, "INS"}, + {BND, "BND"}, + {NEUTRAL, "NEUT"}, + {INV_DUP, "INVDUP"} + }; + // static const int UNKNOWN = -1; + // static const int DEL = 0; + // static const int DUP = 1; + // static const int INV = 2; + // static const int INS = 3; + // static const int BND = 4; + // static const int NEUTRAL = 5; // Neutral copy number with unknown type + // static const int INV_DUP = 6; // Inversion duplication + + // // Define SVTypeString for SV types (for VCF output) + // static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT", "INVDUP"}; // Create a struct for storing SV information struct SVInfo { diff --git a/include/vcf_writer.h b/include/vcf_writer.h index 800df144..e395ea37 100644 --- a/include/vcf_writer.h +++ b/include/vcf_writer.h @@ -8,6 +8,7 @@ class VcfWriter { public: // Constructor VcfWriter(const std::string& filename); + ~VcfWriter(); void writeHeader(const std::vector& headerLines); void writeRecord(const std::string& chrom, int pos, const std::string& id, const std::string& ref, const std::string& alt, @@ -15,9 +16,6 @@ class VcfWriter { const std::string& info, const std::string& format, const std::vector& samples); - // Close the VCF file - void close(); - private: std::ofstream file_stream; }; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 255b1ba6..c47bda5e 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -154,11 +154,19 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector< } // Update the SV type if inversion is detected and the best CNV type is - // copy neutral - if (inversion && (best_cnv_type == sv_types::NEUTRAL)) + // copy neutral or duplication + if (inversion) // && (best_cnv_type == 
sv_types::NEUTRAL)) { - best_cnv_type = sv_types::INV; - printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); + if (best_cnv_type == sv_types::NEUTRAL) + { + best_cnv_type = sv_types::INV; + } else if (best_cnv_type == sv_types::DUP) + { + best_cnv_type = sv_types::INV_DUP; + printMessage("INVDUP detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); + } + // best_cnv_type = sv_types::INV; + // printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); } // If the dummy call was used, then throw an error if the best SV type diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 0a502881..5553dacc 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -43,6 +43,8 @@ int ContextSV::run() // auto end_sv = std::chrono::high_resolution_clock::now(); // std::string elapsed_time = getElapsedTime(start_sv, end_sv); std::cout << "SV calling complete." << std::endl; + // int sv_count = sv_calls.totalCalls(); + // std::cout << "Found " << sv_count << " total SVs." << std::endl; // std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. 
Time taken (h:m:s) = " << elapsed_time << std::endl; return 0; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index f800d1c3..c79f8ddb 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -589,6 +589,24 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map } } + // TODO: + // if (find_complex_events) + // # Calculate likelihood for entire coordinate + // likelihood_entire = hmm_model.predict_likelihood(entire_coordinate) + + // # Split coordinates into smaller sections and calculate likelihoods + // subsections = split_coordinates(entire_coordinate) + // likelihoods_subsections = [hmm_model.predict_likelihood(sub) for sub in subsections] + + // # Determine best likelihood from subsections + // best_likelihood_split = max(likelihoods_subsections) + + // # Compare and decide + // if likelihood_entire > best_likelihood_split: + // best_choice = "entire coordinate" + // else: + // best_choice = "split coordinates" + // [1] Inversion detection from primary and supplementary alignments // on opposite strands if (primary_strand != supp_strand) { diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 2ade74c6..3ea742f3 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -135,6 +135,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) // Create a VCF writer std::cout << "Creating VCF writer..." 
<< std::endl; std::string output_vcf = output_dir + "/output.vcf"; + std::cout << "Writing VCF file to " << output_vcf << std::endl; VcfWriter vcf_writer(output_vcf); std::cout << "Writing VCF file to " << output_vcf << std::endl; @@ -184,8 +185,8 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) // Save the SV calls std::cout << "Saving SV calls to " << output_vcf << std::endl; std::string sv_method = "CONTEXTSVv0.1"; - int num_sv_calls = this->totalCalls(); int skip_count = 0; + int total_count = 0; std::set chrs = this->getChromosomes(); for (auto const& chr : chrs) { if (this->sv_calls.find(chr) == this->sv_calls.end()) { @@ -219,6 +220,8 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) if (sv_type == UNKNOWN || sv_type == NEUTRAL) { skip_count += 1; continue; + } else { + total_count += 1; } // Process by SV type @@ -277,7 +280,8 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) // Create the VCF parameter strings int clipped_base_support = this->getClippedBaseSupport(chr, pos, end); - std::string sv_type_str = this->sv_type_map[sv_type]; + // std::string sv_type_str = this->sv_type_map[sv_type]; + std::string sv_type_str = sv_types::SVTypeString[sv_type]; std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \ ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \ @@ -293,10 +297,13 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) } // Print the number of SV calls skipped - std::cout << "Skipped " << skip_count << " of " << num_sv_calls << " SV calls because the SV type is unknown" << std::endl; + std::cout << "Finished writing VCF file." 
<< std::endl; + // int num_sv_calls = this->totalCalls(); + // std::cout << "Skipped " << skip_count << " of " << num_sv_calls << " SV calls because the SV type is unknown" << std::endl; + // std::cout << "Finished writing VCF file with " << num_sv_calls - skip_count << " SV calls" << std::endl; // Close the output stream - vcf_writer.close(); + // vcf_writer.close(); } std::map& SVData::getChromosomeSVs(std::string chr) @@ -315,12 +322,12 @@ std::set SVData::getChromosomes() int SVData::totalCalls() { - std::cout << "Calculating total SV calls..." << std::endl; + // std::cout << "Calculating total SV calls..." << std::endl; int sv_calls = 0; for (auto const& sv_call : this->sv_calls) { sv_calls += sv_call.second.size(); } - std::cout << "Total SV calls: " << sv_calls << std::endl; + // std::cout << "Total SV calls: " << sv_calls << std::endl; return sv_calls; } diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp index eaf41ea5..8c93a36f 100644 --- a/src/vcf_writer.cpp +++ b/src/vcf_writer.cpp @@ -7,17 +7,21 @@ VcfWriter::VcfWriter(const std::string &filename) { - // Remove the file if it already exists - std::remove(filename.c_str()); - - // Open the VCF file - this->file_stream.open(filename); + // Open the VCF file, overwrite if it already exists + this->file_stream.open(filename, std::ios::out); if (!this->file_stream.is_open()) { std::cerr << "Error: Unable to open " << filename << std::endl; exit(1); } } +VcfWriter::~VcfWriter() +{ + if (this->file_stream.is_open()) { + this->file_stream.close(); + } +} + void VcfWriter::writeHeader(const std::vector &headerLines) { // Add the file format @@ -55,9 +59,3 @@ void VcfWriter::writeRecord(const std::string &chrom, int pos, const std::string // Write a record to the VCF file this->file_stream << chrom << "\t" << pos << "\t" << id << "\t" << ref << "\t" << alt << "\t" << qual << "\t" << filter << "\t" << info << "\t" << format << "\t" << samples[0] << std::endl; } - -void VcfWriter::close() -{ - // Close the VCF 
file - this->file_stream.close(); -} From e4df9d16fa854080d54981f289db4171a2f918c8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 13:02:13 -0400 Subject: [PATCH 004/134] Update sv types --- include/cnv_caller.h | 15 ++-------- include/sv_data.h | 4 +-- include/sv_types.h | 65 +++++++++++++++++++++++++++----------------- src/cnv_caller.cpp | 47 ++++++++++++++------------------ src/contextsv.cpp | 29 ++++---------------- src/sv_caller.cpp | 20 ++------------ src/sv_data.cpp | 51 ++++++++++++---------------------- 7 files changed, 90 insertions(+), 141 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 7b52e05e..23a70640 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -78,17 +78,6 @@ class CNVCaller { {6, "1/1"} }; - // Define a map of CNV types by HMM predicted state (0=No predicted state) - std ::map cnv_type_map = { - {0, sv_types::UNKNOWN}, - {1, sv_types::DEL}, - {2, sv_types::DEL}, - {3, sv_types::NEUTRAL}, - {4, sv_types::NEUTRAL}, - {5, sv_types::DUP}, - {6, sv_types::DUP} - }; - void updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp); std::pair, double> runViterbi(CHMM hmm, SNPData &snp_data); @@ -99,7 +88,7 @@ class CNVCaller { // Run copy number prediction for a chunk of SV candidates from CIGAR strings void runCIGARCopyNumberPredictionChunk(std::string chr, std::map& sv_candidates, std::vector sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map); - void updateSVCopyNumber(std::map& sv_candidates, SVCandidate key, int sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood); + void updateSVCopyNumber(std::map& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood); void updateDPValue(std::map& sv_candidates, SVCandidate key, int dp_value); @@ -120,7 +109,7 @@ class CNVCaller { // Run copy number 
prediction for a pair of SV candidates, and add only // the SV candidate with the highest likelihood - std::tuple runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two); + std::tuple runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); diff --git a/include/sv_data.h b/include/sv_data.h index 44a45590..548d6513 100644 --- a/include/sv_data.h +++ b/include/sv_data.h @@ -1,7 +1,6 @@ #ifndef SV_DATA_H #define SV_DATA_H -#include "fasta_query.h" // For querying the reference genome /// @cond #include @@ -10,6 +9,7 @@ #include #include "sv_types.h" +#include "fasta_query.h" /// @endcond // Include the SV types namespace @@ -36,7 +36,7 @@ class SVData { public: SVData() {}; - int add(std::string chr, int64_t start, int64_t end, int sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); + int add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); void concatenate(const SVData& sv_data); diff --git a/include/sv_types.h b/include/sv_types.h index 2d4573bf..97b49185 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -12,7 +12,7 @@ namespace sv_types { // Define constants for SV types - enum SVType { + enum class SVType { UNKNOWN = -1, DEL = 0, DUP = 1, @@ -24,15 +24,38 @@ namespace sv_types { }; // Mapping of SV types to strings - const std::unordered_map SVTypeString = { - {DEL, "DEL"}, - {DUP, "DUP"}, - {INV, "INV"}, - {INS, "INS"}, - {BND, "BND"}, - {NEUTRAL, "NEUT"}, - {INV_DUP, "INVDUP"} + const std::unordered_map SVTypeString = { + {SVType::UNKNOWN, "UNKNOWN"}, + {SVType::DEL, "DEL"}, + {SVType::DUP, "DUP"}, + {SVType::INV, "INV"}, + {SVType::INS, "INS"}, + 
{SVType::BND, "BND"}, + {SVType::NEUTRAL, "NEUTRAL"}, + {SVType::INV_DUP, "INV_DUP"} }; + + // Mapping of 6 copy number states to SV types + const std::unordered_map CNVTypeMap = { + {0, SVType::UNKNOWN}, + {1, SVType::DEL}, + {2, SVType::DEL}, + {3, SVType::NEUTRAL}, + {4, SVType::NEUTRAL}, + {5, SVType::DUP}, + {6, SVType::DUP} + }; + + // Function to get the SV type string + inline std::string getSVTypeString(SVType sv_type) { + return SVTypeString.at(sv_type); + } + + // Function to get the SV type from the CNV state + inline SVType getSVTypeFromCNState(int cn_state) { + return CNVTypeMap.at(cn_state); + } + // static const int UNKNOWN = -1; // static const int DEL = 0; // static const int DUP = 1; @@ -47,7 +70,7 @@ namespace sv_types { // Create a struct for storing SV information struct SVInfo { - int sv_type; + SVType sv_type; int read_support; // Number of reads supporting the SV breakpoints int read_depth; // Read depth at the SV start position std::set data_type; // Alignment type used to call the SV @@ -55,25 +78,17 @@ namespace sv_types { std::string genotype = "./."; // Default genotype (no call) double hmm_likelihood = 0.0; // HMM likelihood score for the state sequence - SVInfo() : - sv_type(-1), read_support(0), read_depth(0), data_type({}), sv_length(0), genotype("./."), hmm_likelihood(0.0){} + SVInfo() = default; + // SVInfo() : + // sv_type(-1), read_support(0), read_depth(0), data_type({}), sv_length(0), genotype("./."), hmm_likelihood(0.0){} - SVInfo(int sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) : + SVInfo(SVType sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) : sv_type(sv_type), read_support(read_support), read_depth(read_depth), data_type({data_type}), sv_length(sv_length), genotype(genotype), hmm_likelihood(hmm_likelihood) {} }; - // SV (start, end, alt_allele) - using SVCandidate = 
std::tuple; - - // Chromosome to SV candidate to read depth map - using SVDepthMap = std::unordered_map>; - - // Define a map for storing copy number calls by SV candidate - using SVCopyNumberMap = std::map>; - - // Create a type for storing SV update information from copy number caller - // (SVCandidate, SV type, genotype, data type) - using SVUpdate = std::tuple; + // Type definition for SV-related structures + using SVCandidate = std::tuple; // SV (start, end, alt_allele) + using SVDepthMap = std::unordered_map>; // Chromosome -> SV candidate -> SV info } #endif // SV_TYPES_H diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c47bda5e..35793d47 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -132,7 +132,7 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector< // candidate with the highest likelihood SVCandidate& sv_one = sv_list[0].first; SVCandidate& sv_two = sv_list[1].first; - std::tuple cnv_prediction = this->runCopyNumberPredictionPair(chr, sv_one, sv_two); + std::tuple cnv_prediction = this->runCopyNumberPredictionPair(chr, sv_one, sv_two); // Get the SV info int best_index = std::get<0>(cnv_prediction); @@ -143,7 +143,7 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector< // Get the prediction data double best_likelihood = std::get<1>(cnv_prediction); - int best_cnv_type = std::get<2>(cnv_prediction); + SVType best_cnv_type = std::get<2>(cnv_prediction); std::string best_genotype = std::get<3>(cnv_prediction); bool snps_found = std::get<4>(cnv_prediction); if (snps_found) @@ -157,12 +157,12 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector< // copy neutral or duplication if (inversion) // && (best_cnv_type == sv_types::NEUTRAL)) { - if (best_cnv_type == sv_types::NEUTRAL) + if (best_cnv_type == SVType::NEUTRAL) { - best_cnv_type = sv_types::INV; - } else if (best_cnv_type == sv_types::DUP) + best_cnv_type = SVType::INV; + } else if 
(best_cnv_type == SVType::DUP) { - best_cnv_type = sv_types::INV_DUP; + best_cnv_type = SVType::INV_DUP; printMessage("INVDUP detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); } // best_cnv_type = sv_types::INV; @@ -180,7 +180,7 @@ void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector< sv_calls.add(chr, start_pos, end_pos, best_cnv_type, ".", aln_type, best_genotype, best_likelihood); } -std::tuple CNVCaller::runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two) +std::tuple CNVCaller::runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two) { // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl; double best_likelihood = 0.0; @@ -197,7 +197,7 @@ std::tuple CNVCaller::runCopyNumberPredicti // calculateDepthsForSNPRegion(chr, region_start_pos, region_end_pos, pos_depth_map); int current_index = 0; - int predicted_cnv_type = sv_types::UNKNOWN; + SVType predicted_cnv_type = SVType::UNKNOWN; std::string genotype = "./."; for (const auto& sv_call : {sv_one, sv_two}) { @@ -262,7 +262,7 @@ std::tuple CNVCaller::runCopyNumberPredicti int state_count = (int) sv_states.size(); if ((double) max_count / (double) state_count > pct_threshold) { - predicted_cnv_type = cnv_type_map[max_state]; + predicted_cnv_type = getSVTypeFromCNState(max_state); genotype = cnv_genotype_map[max_state]; } @@ -285,10 +285,10 @@ std::tuple CNVCaller::runCopyNumberPredicti // Save the SV calls as a TSV file if enabled int64_t sv_start_pos = std::get<0>(best_pos); int64_t sv_end_pos = std::get<1>(best_pos); - bool copy_number_change = (predicted_cnv_type != sv_types::UNKNOWN && predicted_cnv_type != sv_types::NEUTRAL); + bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != 
SVType::NEUTRAL); if (this->input_data->getSaveCNVData() && copy_number_change && (sv_end_pos - sv_start_pos) > 10000) { - std::string cnv_type_str = SVTypeString[predicted_cnv_type]; + std::string cnv_type_str = getSVTypeString(predicted_cnv_type); std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) sv_start_pos) + "-" + std::to_string((int) sv_end_pos) + "_SPLITALN.tsv"; std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl; this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood); @@ -323,14 +323,7 @@ SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map(sv_candidates.begin()->first); - // int64_t last_pos = std::get<1>(sv_candidates.rbegin()->first); - // std::unordered_map pos_depth_map; - // calculateDepthsForSNPRegion(chr, first_pos, last_pos, pos_depth_map); - - // Run copy number prediction for the SV candidates - // Loop through each SV candidate and predict the copy number state + printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); // Create a map with counts for each CNV type @@ -457,7 +450,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::mapinput_data->getSaveCNVData() && updated_sv_type != sv_types::UNKNOWN && (end_pos - start_pos) > 10000) + SVType updated_sv_type = sv_candidates[sv_call].sv_type; + if (this->input_data->getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000) { // Add the state sequence to the SNP data (avoid copying the data) sv_snps.state_sequence = std::move(state_sequence); // Save the SV calls as a TSV file - std::string cnv_type_str = SVTypeString[updated_sv_type]; + std::string cnv_type_str = getSVTypeString(updated_sv_type); std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + 
std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv"; // std::cout << "Saving SV CIGAR copy number predictions to " << // sv_filename << std::endl; @@ -492,16 +485,16 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map &sv_candidates, SVCandidate key, int sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood) +void CNVCaller::updateSVCopyNumber(std::map &sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood) { // Update SV data from the HMM copy number prediction // Lock the SV candidate map std::lock_guard lock(this->sv_candidates_mtx); // Update the SV type if the update is not unknown, and if the types don't - // conflict (To avoid overwriting CIGAR-based SV calls with SNP-based calls) - int current_sv_type = sv_candidates[key].sv_type; - if ((sv_type_update != sv_types::UNKNOWN) && ((current_sv_type == sv_type_update) || (current_sv_type == sv_types::UNKNOWN))) + // conflict (To avoid overwriting previous calls) + SVType current_sv_type = sv_candidates[key].sv_type; + if ((sv_type_update != SVType::UNKNOWN) && ((current_sv_type == sv_type_update) || (current_sv_type == SVType::UNKNOWN))) { sv_candidates[key].sv_type = sv_type_update; // Update the SV type sv_candidates[key].data_type.insert(data_type); // Update the data type diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 5553dacc..c0d4acd7 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -20,32 +20,13 @@ ContextSV::ContextSV(InputData& input_data) // Entry point int ContextSV::run() { - // Start the program's timer - auto start_sv = std::chrono::high_resolution_clock::now(); - - // Get the reference genome - FASTAQuery ref_genome = this->input_data->getRefGenome(); - - // Call SVs from long read alignments: - std::cout << "Running alignment-based SV calling..." 
<< std::endl; - SVCaller sv_caller(*this->input_data); - SVData sv_calls = sv_caller.run(); - - // Print the total number of SVs called - // std::cout << "Total SVs called: " << sv_calls.totalCalls() << std::endl; - - // Write SV calls to file - std::string output_dir = this->input_data->getOutputDir(); + FASTAQuery ref_genome = this->input_data->getRefGenome(); // Load the reference genome + SVCaller sv_caller(*this->input_data); // Create an SV caller object + SVData sv_calls = sv_caller.run(); // Run the SV caller + std::string output_dir = this->input_data->getOutputDir(); // Get the output directory std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl; - sv_calls.saveToVCF(ref_genome, output_dir); - - // Format and print the time taken to call SVs - // auto end_sv = std::chrono::high_resolution_clock::now(); - // std::string elapsed_time = getElapsedTime(start_sv, end_sv); + sv_calls.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file std::cout << "SV calling complete." << std::endl; - // int sv_count = sv_calls.totalCalls(); - // std::cout << "Found " << sv_count << " total SVs." << std::endl; - // std::cout << "SV calling complete. Found " << sv_calls.totalCalls() << " total SVs. 
Time taken (h:m:s) = " << elapsed_time << std::endl; return 0; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index c79f8ddb..926ae663 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -297,9 +297,9 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Lock the SV calls object and add the insertion std::lock_guard lock(this->sv_mtx); if (is_duplication) { - sv_calls.add(chr, ref_pos, ref_end, DUP, ins_seq_str, "CIGARDUP", "./.", 0.0); + sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", 0.0); } else { - sv_calls.add(chr, ref_pos, ref_end, INS, ins_seq_str, "CIGARINS", "./.", 0.0); + sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", 0.0); } } @@ -315,7 +315,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Lock the SV calls object and add the deletion // std::lock_guard lock(this->sv_mtx); - sv_calls.add(chr, ref_pos, ref_end, DEL, ".", "CIGARDEL", "./.", 0.0); + sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0); } // Check if the CIGAR operation is a clipped base @@ -422,7 +422,6 @@ SVData SVCaller::run() std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." 
<< std::endl; int chunk_count = 100; // Number of chunks to split the chromosome into int region_count = 0; - auto start1 = std::chrono::high_resolution_clock::now(); SVData sv_calls; int min_cnv_length = this->input_data->getMinCNVLength(); for (const auto& chr : chromosomes) { @@ -430,8 +429,6 @@ SVData SVCaller::run() // Split the chromosome into chunks std::vector region_chunks; - - // Get the region start and end positions if (this->input_data->isRegionSet()) { std::pair region = this->input_data->getRegion(); int region_start = region.first; @@ -444,10 +441,7 @@ SVData SVCaller::run() } else { int chr_len = this->input_data->getRefGenomeChromosomeLength(chr); - // std::cout << "Chromosome length: " << chr_len << std::endl; - // std::cout << "Chunk count: " << chunk_count << std::endl; int chunk_size = std::ceil((double)chr_len / chunk_count); - // std::cout << "Chunk size: " << chunk_size << std::endl; for (int i = 0; i < chunk_count; i++) { int start = i * chunk_size + 1; // 1-based int end = start + chunk_size; @@ -464,12 +458,10 @@ SVData SVCaller::run() std::cout << "Loading chromosome data for copy number predictions..." << std::endl; CNVCaller cnv_caller(*this->input_data); cnv_caller.loadChromosomeData(chr); - // std::cout << "Loaded chromosome data for copy number predictions." << std::endl; // Process each chunk one at a time std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; for (const auto& sub_region : region_chunks) { - // Detect SVs from the sub-region // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl; RegionData region_data = this->detectSVsFromRegion(sub_region); SVData& sv_calls_region = std::get<0>(region_data); @@ -488,8 +480,6 @@ SVData SVCaller::run() std::cout << "Running copy number variant detection from CIGAR string SVs..." 
<< std::endl; cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, min_cnv_length); } - // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; - // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, min_cnv_length); // Run split-read SV detection in a single thread, combined with // copy number variant predictions @@ -503,13 +493,9 @@ SVData SVCaller::run() // Increment the region count region_count++; std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl; - // std::cout << "Extracted aligments for " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl; } - // auto end1 = std::chrono::high_resolution_clock::now(); std::cout << "SV calling completed." << std::endl; - // int total_sv_calls = sv_calls.totalCalls(); - // std::cout << "Finished detecting " << sv_calls.totalCalls() << " SVs from " << chr_count << " chromosome(s). Elapsed time: " << getElapsedTime(start1, end1) << std::endl; return sv_calls; } diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 3ea742f3..055f2ebe 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -8,7 +8,7 @@ /// @endcond -int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) +int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) { // Check if the alternate allele contains ambiguous bases const std::unordered_set ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'}; @@ -26,7 +26,7 @@ int SVData::add(std::string chr, int64_t start, int64_t end, int sv_type, std::s sv_info.read_support += 1; // Update the SV type if it is unknown - if (sv_info.sv_type == UNKNOWN) { + if (sv_info.sv_type == SVType::UNKNOWN) { sv_info.sv_type = sv_type; } @@ -47,14 +47,11 @@ int SVData::add(std::string 
chr, int64_t start, int64_t end, int sv_type, std::s // Otherwise, add the SV candidate to the map } else { - // For insertions and duplications, the SV length is the length of the - // inserted sequence, not including the insertion position - int sv_length = 0; - if (sv_type == INS || sv_type == DUP) { - sv_length = end - start; - } else { - // For deletions, the SV length is the length of the deletion - sv_length = end - start + 1; + int sv_length = end - start; + + // For deletions, the SV length is the length of the deletion, including the start position + if (sv_type == SVType::DEL) { + sv_length++; } // Create a new SVInfo object (SV type, alignment support, read depth, data type, SV length, genotype) @@ -198,7 +195,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) // Get the SV candidate and SV info SVCandidate candidate = sv_call.first; SVInfo info = sv_call.second; - int sv_type = info.sv_type; + SVType sv_type = info.sv_type; int read_support = info.read_support; int read_depth = info.read_depth; int sv_length = info.sv_length; @@ -217,7 +214,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) int64_t end = std::get<1>(candidate); // If the SV type is unknown, skip it - if (sv_type == UNKNOWN || sv_type == NEUTRAL) { + if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { skip_count += 1; continue; } else { @@ -230,7 +227,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) std::string repeat_type = "NA"; // Deletion - if (sv_type == DEL) { + if (sv_type == SVType::DEL) { // Get the deleted sequence from the reference genome, also including the preceding base int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, end); @@ -239,7 +236,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) if (ref_allele != "") { alt_allele = ref_allele.at(0); } else { - 
alt_allele = ""; // Use symbolic allele for imprecise deletions + alt_allele = ""; // Symbolic allele std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << pos << "-" << end << std::endl; } @@ -249,18 +246,16 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) // Update the position pos = preceding_pos; - // Duplications and insertions - } else if (sv_type == INS || sv_type == DUP) { + // Other types (duplications, insertions, inversions) + } else { // Use the preceding base as the reference allele int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); // Format novel insertions - if (sv_type == INS) { + if (sv_type == SVType::INS) { // Use the insertion sequence as the alternate allele alt_allele = std::get<2>(candidate); - - // Insert the reference base into the alternate allele alt_allele.insert(0, ref_allele); // Update the position @@ -269,19 +264,15 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) // Update the end position to the start position to change from // query to reference coordinates for insertions end = pos; - } else if (sv_type == DUP) { - // Use a symbolic allele for duplications - alt_allele = ""; - - // Set the repeat type as an interspersed duplication + } else if (sv_type == SVType::DUP) { + alt_allele = ""; // Symbolic allele repeat_type = "TANDEM"; } } // Create the VCF parameter strings int clipped_base_support = this->getClippedBaseSupport(chr, pos, end); - // std::string sv_type_str = this->sv_type_map[sv_type]; - std::string sv_type_str = sv_types::SVTypeString[sv_type]; + std::string sv_type_str = getSVTypeString(sv_type); std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \ ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + 
";CLIPSUP=" + std::to_string(clipped_base_support) + \ @@ -297,13 +288,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) } // Print the number of SV calls skipped - std::cout << "Finished writing VCF file." << std::endl; - // int num_sv_calls = this->totalCalls(); - // std::cout << "Skipped " << skip_count << " of " << num_sv_calls << " SV calls because the SV type is unknown" << std::endl; - // std::cout << "Finished writing VCF file with " << num_sv_calls - skip_count << " SV calls" << std::endl; - - // Close the output stream - // vcf_writer.close(); + std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl; } std::map& SVData::getChromosomeSVs(std::string chr) From 6fb4b72c6f10b10d3b1e58ecaa059bf5bc1289b2 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 13:29:17 -0400 Subject: [PATCH 005/134] Update docs --- src/sv_caller.cpp | 82 ++++++++++------------------------------------- src/sv_data.cpp | 27 ++++------------ 2 files changed, 24 insertions(+), 85 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 926ae663..4ce5e940 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -189,31 +189,12 @@ SVCaller::SVCaller(InputData &input_data) std::tuple, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary) { - // Get the chromosome - std::string chr = header->target_name[alignment->core.tid]; - - // Get the position of the alignment in the reference genome - int32_t pos = alignment->core.pos; - - // Get the CIGAR string - uint32_t* cigar = bam_get_cigar(alignment); - - // Get the CIGAR length + std::string chr = header->target_name[alignment->core.tid]; // Chromosome name + int32_t pos = alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) + uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = 
alignment->core.n_cigar; - - // Track the query position int query_pos = 0; - - // Loop through the CIGAR string (0-based) and detect insertions and deletions in - // reference coordinates (1-based) - // POS is the leftmost position of where the alignment maps to the reference: - // https://genome.sph.umich.edu/wiki/SAM - // std::vector threads; - // std::vector sv_calls_vec; - - // Create a map of query position to match/mismatch (1/0) for calculating - // the mismatch rate at alignment overlaps - std::unordered_map query_match_map; + std::unordered_map query_match_map; // Query position to match/mismatch (1/0) map // Loop through the CIGAR string, process operations, detect SVs (primary // only), update clipped base support, calculate sequence identity for @@ -226,24 +207,14 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr bool first_op = false; // First alignment operation for the query for (int i = 0; i < cigar_len; i++) { - // Get the CIGAR operation - int op = bam_cigar_op(cigar[i]); - - // Get the CIGAR operation length - int op_len = bam_cigar_oplen(cigar[i]); + int op = bam_cigar_op(cigar[i]); // CIGAR operation + int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length - // Check if the CIGAR operation is an insertion + // Process the CIGAR operation if (op == BAM_CINS && is_primary) { - - // Add the SV if greater than the minimum SV size if (op_len >= this->min_sv_size) { // Get the sequence of the insertion from the query - // std::string ins_seq_str = ""; - // uint8_t* seq_ptr = bam_get_seq(alignment); - // for (int j = 0; j < op_len; j++) { - // ins_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; - // } std::string ins_seq_str(op_len, ' '); for (int j = 0; j < op_len; j++) { ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; @@ -293,9 +264,6 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Add to SV calls (1-based) with the appropriate SV type ref_pos = pos+1; ref_end = ref_pos + op_len 
-1; - - // Lock the SV calls object and add the insertion - std::lock_guard lock(this->sv_mtx); if (is_duplication) { sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", 0.0); } else { @@ -307,23 +275,17 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr } else if (op == BAM_CDEL && is_primary) { // Add the SV if greater than the minimum SV size - if (op_len >= this->min_sv_size) { - - // Add the deletion to the SV calls (1-based) + if (op_len >= this->min_sv_size) + { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - - // Lock the SV calls object and add the deletion - // std::lock_guard lock(this->sv_mtx); - sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0); + sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0); // Add the deletion } // Check if the CIGAR operation is a clipped base } else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) { - // Update the clipped base support - // std::lock_guard lock(this->sv_mtx); - sv_calls.updateClippedBaseSupport(chr, pos); + sv_calls.updateClippedBaseSupport(chr, pos); // Update clipped base support // Update the query alignment start position if (!first_op) { @@ -394,21 +356,14 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr } } - // Update the query end position - query_end = query_pos; + query_end = query_pos; // Last alignment position in the query - // Return the mismatch map and the query start and end positions return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); } -// Detect SVs from split read alignments (primary and supplementary) and -// directly from the CIGAR string SVData SVCaller::run() { - // Open the BAM file - std::string bam_filepath = this->input_data->getLongReadBam(); - - // Get the region data + // Get the chromosomes to process std::vector chromosomes; if (this->input_data->getChromosome() != "") { chromosomes.push_back(this->input_data->getChromosome()); @@ -417,8 +372,7 @@ SVData 
SVCaller::run() } int chr_count = chromosomes.size(); - // Loop through each region and detect SVs (Note: The main loop is - // single-threaded) + // Loop through each region and detect SVs in chunks std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl; int chunk_count = 100; // Number of chunks to split the chromosome into int region_count = 0; @@ -430,11 +384,11 @@ SVData SVCaller::run() // Split the chromosome into chunks std::vector region_chunks; if (this->input_data->isRegionSet()) { + + // Use one chunk for the specified region std::pair region = this->input_data->getRegion(); int region_start = region.first; int region_end = region.second; - - // Use one chunk for the region std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); region_chunks.push_back(chunk); std::cout << "Using specified region " << chunk << "..." << std::endl; @@ -485,9 +439,7 @@ SVData SVCaller::run() // copy number variant predictions std::cout << "Detecting copy number variants from split reads..." 
<< std::endl; this->detectSVsFromSplitReads(sv_calls_region, primary_map, supp_map, cnv_caller); - - // Add the SV calls to the main SV calls object - sv_calls.concatenate(sv_calls_region); + sv_calls.concatenate(sv_calls_region); // Add the calls to the main set } // Increment the region count diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 055f2ebe..797ef704 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -21,7 +21,8 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std // Check if the SV candidate already exists in the map SVCandidate candidate(start, end, alt_allele); if (this->sv_calls[chr].find(candidate) != this->sv_calls[chr].end()) { - // Update the alignment-based support count (+1) + + // Update the alignment-based support count SVInfo& sv_info = this->sv_calls[chr][candidate]; sv_info.read_support += 1; @@ -39,9 +40,7 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std if ((sv_info.hmm_likelihood == 0.0) || (hmm_likelihood > sv_info.hmm_likelihood)) { sv_info.hmm_likelihood = hmm_likelihood; } - - // Add the alignment type used to call the SV - sv_info.data_type.insert(data_type); + sv_info.data_type.insert(data_type); // Add the alignment type to the set return 0; // SV call already exists @@ -54,11 +53,8 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std sv_length++; } - // Create a new SVInfo object (SV type, alignment support, read depth, data type, SV length, genotype) SVInfo sv_info(sv_type, 1, 0, data_type, sv_length, genotype, hmm_likelihood); - - // Add the SV candidate to the map - this->sv_calls[chr][candidate] = sv_info; + this->sv_calls[chr][candidate] = sv_info; // Add the SV candidate to the map return 1; // SV call added } @@ -88,10 +84,8 @@ void SVData::updateClippedBaseSupport(std::string chr, int64_t pos) // Update clipped base support std::pair key(chr, pos); if (this->clipped_base_support.find(key) != 
this->clipped_base_support.end()) { - // Update the depth this->clipped_base_support[key] += 1; } else { - // Add the depth this->clipped_base_support[key] = 1; } } @@ -175,11 +169,9 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) "##FORMAT=" }; - // Write the header lines std::cout << "Writing VCF header..." << std::endl; vcf_writer.writeHeader(header_lines); - // Save the SV calls std::cout << "Saving SV calls to " << output_vcf << std::endl; std::string sv_method = "CONTEXTSVv0.1"; int skip_count = 0; @@ -240,11 +232,9 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << pos << "-" << end << std::endl; } - // Make the SV length negative - sv_length = -1 * sv_length; + sv_length = -1 * sv_length; // Negative length for deletions - // Update the position - pos = preceding_pos; + pos = preceding_pos; // Update the position to the preceding base // Other types (duplications, insertions, inversions) } else { @@ -258,8 +248,7 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) alt_allele = std::get<2>(candidate); alt_allele.insert(0, ref_allele); - // Update the position - pos = preceding_pos; + pos = preceding_pos; // Update the position to the preceding base // Update the end position to the start position to change from // query to reference coordinates for insertions @@ -307,12 +296,10 @@ std::set SVData::getChromosomes() int SVData::totalCalls() { - // std::cout << "Calculating total SV calls..." 
<< std::endl; int sv_calls = 0; for (auto const& sv_call : this->sv_calls) { sv_calls += sv_call.second.size(); } - // std::cout << "Total SV calls: " << sv_calls << std::endl; return sv_calls; } From a38ada7a5e08c2f69199a60d9a85b2feddb1c572 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 13:33:05 -0400 Subject: [PATCH 006/134] Update docs --- src/sv_caller.cpp | 51 +++++++++-------------------------------------- 1 file changed, 9 insertions(+), 42 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 4ce5e940..0a511965 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -34,36 +34,34 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) RegionData SVCaller::detectSVsFromRegion(std::string region) { - SVData sv_calls; + // Open the BAM file std::string bam_filepath = this->input_data->getLongReadBam(); - - // Open the BAM file in a thread-safe manner samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (fp_in == NULL) { std::cerr << "ERROR: failed to open " << bam_filepath << std::endl; exit(1); } - // Get the header in a thread-safe manner + // Load the header for the BAM file bam_hdr_t *bamHdr = sam_hdr_read(fp_in); if (bamHdr == NULL) { std::cerr << "ERROR: failed to read header for " << bam_filepath << std::endl; exit(1); } - // Get the index in a thread-safe manner + // Load the index for the BAM file hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); if (idx == NULL) { std::cerr << "ERROR: failed to load index for " << bam_filepath << std::endl; exit(1); } - // Create a read and iterator for the region in a thread-safe manner + // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); - // Loop through the alignments - // Create a map of primary and supplementary alignments by QNAME (query template name) + // Main loop to process the alignments + SVData sv_calls; int num_alignments = 0; PrimaryMap 
primary_alignments; SuppMap supplementary_alignments; @@ -78,18 +76,15 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) // Do nothing } else { - // Get the QNAME (query template name) for associating split reads - std::string qname = bam_get_qname(bam1); + std::string qname = bam_get_qname(bam1); // Query template name // Process primary alignments if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { - // Get the primary alignment chromosome, start, end, and depth + // Get the primary alignment information std::string chr = bamHdr->target_name[bam1->core.tid]; int64_t start = bam1->core.pos; int64_t end = bam_endpos(bam1); // This is the first position after the alignment - - // Get the strand bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Call SVs directly from the CIGAR string @@ -105,12 +100,10 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) // Process supplementary alignments } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { - // Add the supplementary alignment to the map + // Get the supplementary alignment information std::string chr = bamHdr->target_name[bam1->core.tid]; int32_t start = bam1->core.pos; int32_t end = bam_endpos(bam1); - - // Get the strand bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Get CIGAR string information, but don't call SVs @@ -122,49 +115,24 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) // Add the supplementary alignment to the map AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand); supplementary_alignments[qname].emplace_back(alignment); - - // If Read ID == 8873acc1-eb84-415d-8557-a32a8f52ccee, print the - // alignment - // if (qname == "8873acc1-eb84-415d-8557-a32a8f52ccee") { - // std::cout << "Supplementary alignment: " << chr << ":" << start << "-" << end << std::endl; - // std::cout << "Query start: " << query_start << ", Query end: " << query_end << std::endl; - // std::cout << "Match map: "; - // for (const 
auto& entry : match_map) { - // std::cout << entry.first << ":" << entry.second << " "; - // } - // std::cout << std::endl; - // } } } - // Increment the number of alignment records processed num_alignments++; } - // Destroy the iterator hts_itr_destroy(itr); - - // Destroy the read bam_destroy1(bam1); - - // Close the BAM file sam_close(fp_in); - - // Destroy the header bam_hdr_destroy(bamHdr); - - // Destroy the index hts_idx_destroy(idx); // Return the SV calls and the primary and supplementary alignments - // return std::make_tuple(sv_calls, primary_alignments, - // supplementary_alignments); return std::make_tuple(std::move(sv_calls), std::move(primary_alignments), std::move(supplementary_alignments)); } double SVCaller::calculateMismatchRate(std::unordered_map &match_map, int32_t start, int32_t end) { - // Calculate the mismatch rate int match_count = 0; int mismatch_count = 0; for (int i = start; i <= end; i++) { @@ -178,7 +146,6 @@ double SVCaller::calculateMismatchRate(std::unordered_map &match_map, } double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); - // Return the mismatch rate return mismatch_rate; } From 1cf3a127902ee19d3b5352c9ed6d434dbab79822 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 13:41:47 -0400 Subject: [PATCH 007/134] Remove unused code and update docs --- include/cnv_caller.h | 5 +- src/cnv_caller.cpp | 161 ++----------------------------------------- src/sv_caller.cpp | 32 +-------- 3 files changed, 11 insertions(+), 187 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 23a70640..a281219f 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -119,9 +119,6 @@ class CNVCaller { // Calculate the mean chromosome coverage double calculateMeanChromosomeCoverage(std::string chr); - // Calculate read depths for a region - void calculateDepthsForSNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, std::unordered_map& pos_depth_map); - // Calculate 
the log2 ratio for a region given the read depths and mean // chromosome coverage double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map& pos_depth_map, double mean_chr_cov); @@ -133,7 +130,7 @@ class CNVCaller { // of population frequencies for each SNP location void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info); - // Save a TSV with B-allele frequencies, log 2 ratios, and copy number predictions + // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood); }; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 35793d47..ed71237a 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -189,13 +189,6 @@ std::tuple CNVCaller::runCopyNumberPredi int best_index = 0; std::pair best_pos; SNPData best_snp_data; - - // Get read depths for the SV candidate region - // int64_t region_start_pos = std::min(std::get<0>(sv_one), std::get<0>(sv_two)); - // int64_t region_end_pos = std::max(std::get<1>(sv_one), std::get<1>(sv_two)); - // std::unordered_map pos_depth_map; - // calculateDepthsForSNPRegion(chr, region_start_pos, region_end_pos, pos_depth_map); - int current_index = 0; SVType predicted_cnv_type = SVType::UNKNOWN; std::string genotype = "./."; @@ -377,31 +370,17 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map(candidate); int64_t end_pos = std::get<1>(candidate); - // // [TEST] Skip if not in the following list of SVs - // std::vector sv_list = {"chr19:53013528-53051102", "chr1:43593639-43617165", "chr6:35786784-35799012", "chr1:152787870-152798352", "chr17:41265461-41275765", "chr5:180950357-181003515"}; - // std::string sv_key = chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos); - // if (std::find(sv_list.begin(), sv_list.end(), sv_key) == sv_list.end()) - // { - // continue; - 
// } - // Get the depth at the start position. This is used as the FORMAT/DP // value in the VCF file int dp_value = pos_depth_map[start_pos]; this->updateDPValue(sv_candidates, sv_call, dp_value); - // Loop through the SV region, calculate the log2 ratios, and run the - // Viterbi algorithm to predict the copy number states - - // We will run the Viterbi algorithm on SNPs in the SV region +/- 1/2 - // the SV length + // Loop through the SV region +/- 1/2 SV length and run copy number + // predictions int64_t sv_half_length = (end_pos - start_pos) / 2.0; - // std::cout << "SV half length: " << sv_half_length << std::endl; int64_t query_start = std::max((int64_t) 1, start_pos - sv_half_length); int64_t query_end = end_pos + sv_half_length; @@ -514,10 +493,7 @@ void CNVCaller::updateSVCopyNumber(std::map &sv_candidates, void CNVCaller::updateDPValue(std::map& sv_candidates, SVCandidate key, int dp_value) { - // Lock the SV candidate map std::lock_guard lock(this->sv_candidates_mtx); - - // Update the DP value sv_candidates[key].read_depth = dp_value; } @@ -540,8 +516,6 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, int64 // Add the region chunk to the vector region_chunks.push_back(chr + ":" + std::to_string(chunk_start) + "-" + std::to_string(chunk_end)); - - // Update the chunk start chunk_start = chunk_end + 1; } @@ -558,7 +532,6 @@ std::vector> CNVCaller::splitSVCandidatesIntoChunks(std std::vector current_sv_chunk; for (auto const& sv_call : sv_candidates) { - // Add the SV candidate to the current chunk current_sv_chunk.push_back(sv_call.first); // If the current chunk size is reached, then add the chunk to the @@ -587,23 +560,19 @@ CNVCaller::CNVCaller(InputData &input_data) void CNVCaller::loadChromosomeData(std::string chr) { - // Read the HMM from file std::string hmm_filepath = this->input_data->getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; this->hmm = ReadCHMM(hmm_filepath.c_str()); - // Calculate 
the mean chromosome coverage and generate the position-depth map printMessage("Calculating mean chromosome coverage for " + chr + "..."); mean_chr_cov = calculateMeanChromosomeCoverage(chr); printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); this->mean_chr_cov = mean_chr_cov; - // Read the SNP positions and B-allele frequency values from the VCF file std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl; std::string snp_filepath = this->input_data->getSNPFilepath(); readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info); - // Get the population frequencies for each SNP std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl; getSNPPopulationFrequencies(chr, this->snp_info); std::cout << "Finished loading chromosome data for " << chr << std::endl; @@ -685,107 +654,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) return mean_chr_cov; } -void CNVCaller::calculateDepthsForSNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, std::unordered_map& pos_depth_map) -{ - std::cout << "Calculating read depths for SV region " << chr << ":" << start_pos << "-" << end_pos << "..." << std::endl; - - // // If extending the CNV regions, then extend the SV region by window size * - // // N. 
Otherwise, log2 ratios will be zero due to missing read depth data - // // before/after the first/last SV positions - // if (this->input_data->getSaveCNVData()) - // { - // int extend_factor = 100; - // int window_size = this->input_data->getWindowSize(); - // start_pos = std::max((int64_t) 1, start_pos - (window_size * extend_factor)); - // end_pos = end_pos + (window_size * extend_factor); - // } - - // // Split the region into equal parts for each thread if the region is larger - // // than 100 kb - // int num_threads = this->input_data->getThreadCount(); - // std::vector region_chunks; - // int64_t region_size = end_pos - start_pos; - // if (region_size < 100000) - // { - // region_chunks.push_back(chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos)); - // } else { - // region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, num_threads); - // } - - // // Loop through each region chunk and get the mean chromosome coverage in - // // parallel - // std::string input_filepath = this->input_data->getShortReadBam(); - // std::vector>> futures; - // for (const auto& region_chunk : region_chunks) - // { - // // Create a lambda function to get the mean chromosome coverage for the - // // region chunk - // auto get_pos_depth_map = [region_chunk, input_filepath]() -> std::unordered_map - // { - // // Run samtools depth on the entire region, and print positions and - // // depths (not chromosome) - // const int cmd_size = 256; - // char cmd[cmd_size]; - // snprintf(cmd, cmd_size, - // "samtools depth -r %s %s | awk '{print $2, $3}'", - // region_chunk.c_str(), input_filepath.c_str()); - - // // Open a pipe to read the output of the command - // FILE *fp = popen(cmd, "r"); - // if (fp == NULL) - // { - // std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl; - // exit(EXIT_FAILURE); - // } - - // // Create a map of positions and depths - // std::unordered_map pos_depth_map; - // const int line_size = 1024; - // char 
line[line_size]; - // while (fgets(line, line_size, fp) != NULL) - // { - // // Parse the line - // uint64_t pos; - // int depth; - // if (sscanf(line, "%ld%d", &pos, &depth) == 2) - // { - // // Add the position and depth to the map - // pos_depth_map[pos] = depth; - // } else { - // // No reads - // } - // } - - // // Close the pipe - // pclose(fp); - - // return pos_depth_map; - // }; - - // // Create a future for the thread - // std::future> future = std::async(std::launch::async, get_pos_depth_map); - - // // Add the future to the vector - // futures.push_back(std::move(future)); - // } - - // // Loop through the futures and get the results - // int current_chunk = 0; - // for (auto& future : futures) - // { - // current_chunk++; - // future.wait(); - // std::unordered_map result = std::move(future.get()); - - // // Merge the position depth maps - // this->mergePosDepthMaps(pos_depth_map, result); - // if (this->input_data->getVerbose()) - // { - // printMessage("Completed region chunk " + std::to_string(current_chunk) + " of " + std::to_string(region_chunks.size()) + "..."); - // } - // } -} - void CNVCaller::mergePosDepthMaps(std::unordered_map& main_map, std::unordered_map& map_update) { // Merge the second depth map into the first @@ -857,16 +725,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, std::cerr << "ERROR: " << index_error << std::endl; exit(1); } - - // Close the pipe - pclose(index_fp); + pclose(index_fp); // Close the process // Filter variants by depth, quality, and region if (this->input_data->getVerbose()) { std::cout << "Filtering SNPs by depth, quality, and region..." 
<< std::endl; } - // // Check if a region was specified by the user + // Check if a region was specified by the user std::string region_str = chr; if (this->input_data->isRegionSet()) { @@ -947,8 +813,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, snp_info.insertSNPAlleleFrequency(chr, pos, baf); } - // Close the pipe - pclose(fp); + pclose(fp); // Close the process if (this->input_data->getVerbose()) { std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl; @@ -1033,12 +898,6 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) // Run bcftools query to get the population frequencies for the // chromosome within the SNP region, filtering for SNPS only, // and within the MIN-MAX range of frequencies. - // TODO: Update to use ethnicity-specific population frequencies - // Example from gnomAD: - // ##INFO= - // std::string ethnicity_suffix = "_asj"; // Ashkenazi Jewish - // (leave empty for all populations) std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB); std::string cmd = \ "bcftools query -r " + region_chunk + " -f '%POS\t%" + AF_key + "\n' -i '" + filter_criteria + "' " + pfb_filepath + " 2>/dev/null"; @@ -1065,8 +924,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) double pfb; if (sscanf(line, "%d%lf", &pos, &pfb) == 2) { - // Add the position and population frequency to the map - pos_pfb_map[pos] = pfb; + pos_pfb_map[pos] = pfb; // Add the position and population frequency to the map } } pclose(fp); @@ -1083,10 +941,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) int pfb_count = 0; for (auto& future : futures) { - // Wait for the future to finish future.wait(); - - // Get the result from the future std::unordered_map result = std::move(future.get()); // Loop through the result and 
add to SNPInfo @@ -1096,12 +951,10 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) int pos = pair.first; double pfb = pair.second; - // Lock the SNPInfo mutex + // Add the population frequency to the SNPInfo this->snp_data_mtx.lock(); snp_info.insertSNPPopulationFrequency(chr_snp, pos, pfb); this->snp_data_mtx.unlock(); - - // Increment the population frequency count pfb_count++; // [TEST] Print 15 values diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 0a511965..f20ddf69 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -25,10 +25,7 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) { - // Read the next alignment int ret = sam_itr_next(fp_in, itr, bam1); - - // Return the result of reading the next alignment return ret; } @@ -409,7 +406,6 @@ SVData SVCaller::run() sv_calls.concatenate(sv_calls_region); // Add the calls to the main set } - // Increment the region count region_count++; std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)..." 
<< std::endl; } @@ -441,12 +437,10 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map AlignmentVector supp_alignments = supp_map[qname]; for (const auto& supp_alignment : supp_alignments) { - // Get the supplementary alignment chromosome - std::string supp_chr = std::get<0>(supp_alignment); - // Skip supplementary alignments that are on a different chromosome // for now (TODO: Use for identifying trans-chromosomal SVs such as // translocations) + std::string supp_chr = std::get<0>(supp_alignment); if (primary_chr != supp_chr) { continue; } @@ -528,10 +522,6 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::pair sv_pair(sv_candidate, "INVERSION"); sv_list.push_back(sv_pair); sv_count++; - // SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); - // std::pair sv_pair(sv_candidate, "INVERSION"); - // sv_list.push_back(sv_pair); - // sv_count++; } // Determine which SV to keep based on HMM prediction likelihood @@ -547,15 +537,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // [supp_start] [supp_end] -- [primary_start] [primary_end] std::vector> sv_list; // SV candidate and alignment type - // Use the gap ends as the SV endpoints - // if (primary_start - supp_end >= min_cnv_length) { - // SVCandidate sv_candidate(supp_end+1, primary_start+1, "."); - // std::pair sv_pair(sv_candidate, "GAPINNER_A"); - // sv_list.push_back(sv_pair); - // sv_count++; - // } - - // Also use the alignment ends as the SV endpoints + // Use the alignment ends as the SV endpoints if (primary_end - supp_start >= min_cnv_length) { SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); std::pair sv_pair(sv_candidate, "GAPOUTER_A"); @@ -573,15 +555,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // [primary_start] [primary_end] -- [supp_start] [supp_end] std::vector> sv_list; // SV candidate and alignment type - // Use the gap ends as the 
SV endpoints - // if (supp_start - primary_end >= min_cnv_length) { - // SVCandidate sv_candidate(primary_end+1, supp_start+1, "."); - // std::pair sv_pair(sv_candidate, "GAPINNER_B"); - // sv_list.push_back(sv_pair); - // sv_count++; - // } - - // Also use the alignment ends as the SV endpoints + // Use the alignment ends as the SV endpoints if (supp_end - primary_start >= min_cnv_length) { SVCandidate sv_candidate(primary_start+1, supp_end+1, "."); std::pair sv_pair(sv_candidate, "GAPOUTER_B"); From 181f6d1b5e490ca96acf4d9e22d5ee5a9aca7fd2 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 14:40:34 -0400 Subject: [PATCH 008/134] simplify predictions --- include/cnv_caller.h | 8 +- src/cnv_caller.cpp | 207 +++++++++++------------------------------- src/sv_caller.cpp | 104 ++++++++++++++------- tests/test_general.py | 2 +- 4 files changed, 129 insertions(+), 192 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index a281219f..86f0abf9 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -107,15 +107,13 @@ class CNVCaller { // Load file data for a chromosome (SNP positions, BAF values, and PFB values) void loadChromosomeData(std::string chr); - // Run copy number prediction for a pair of SV candidates, and add only - // the SV candidate with the highest likelihood - std::tuple runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two); + // Run copy number prediction for a single SV candidate, returning the + // likelihood, predicted CNV type, genotype, and whether SNPs were found + std::tuple runCopyNumberPrediction(std::string chr, SVCandidate& sv_candidate); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); - void updateSVsFromCopyNumberPrediction(SVData& sv_calls, std::vector>& sv_list, std::string chr, bool inversion); - // Calculate the mean 
chromosome coverage double calculateMeanChromosomeCoverage(std::string chr); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index ed71237a..9e575771 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -115,181 +115,80 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star return std::make_pair(snp_data, snps_found); } -void CNVCaller::updateSVsFromCopyNumberPrediction(SVData &sv_calls, std::vector> &sv_list, std::string chr, bool inversion) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, SVCandidate& candidate) { - // Throw an error if there are more than two SV candidates - if (sv_list.size() > 2) { - throw std::runtime_error("Error: More than two SV candidates found for copy number prediction comparisons."); - } - - // Add a dummy call to the SV list if there is only one SV candidate - if (sv_list.size() == 1) { - SVCandidate dummy(0, 0, "."); - sv_list.push_back(std::make_pair(dummy, ".")); - } - - // Run copy number prediction for the SV pair and add only the SV - // candidate with the highest likelihood - SVCandidate& sv_one = sv_list[0].first; - SVCandidate& sv_two = sv_list[1].first; - std::tuple cnv_prediction = this->runCopyNumberPredictionPair(chr, sv_one, sv_two); - - // Get the SV info - int best_index = std::get<0>(cnv_prediction); - SVCandidate& best_sv_candidate = sv_list[best_index].first; - int64_t start_pos = std::get<0>(best_sv_candidate); - int64_t end_pos = std::get<1>(best_sv_candidate); - std::string aln_type = sv_list[best_index].second; - - // Get the prediction data - double best_likelihood = std::get<1>(cnv_prediction); - SVType best_cnv_type = std::get<2>(cnv_prediction); - std::string best_genotype = std::get<3>(cnv_prediction); - bool snps_found = std::get<4>(cnv_prediction); - if (snps_found) - { - aln_type += "_SNPS"; - } else { - aln_type += "_NOSNPS"; - } - - // Update the SV type if inversion is detected and the best CNV type is - // copy neutral or duplication - if (inversion) // && 
(best_cnv_type == sv_types::NEUTRAL)) - { - if (best_cnv_type == SVType::NEUTRAL) - { - best_cnv_type = SVType::INV; - } else if (best_cnv_type == SVType::DUP) + // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl; + // Get the start and end positions of the SV call + int64_t start_pos = std::get<0>(candidate); + int64_t end_pos = std::get<1>(candidate); + + // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 + // the SV length + int64_t sv_length = (end_pos - start_pos) / 2.0; + int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length); + int64_t snp_end_pos = end_pos + sv_length; + + // Query the SNP region for the SV candidate + std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov); + SNPData& sv_snps = snp_call.first; + bool sv_snps_found = snp_call.second; + + // Run the Viterbi algorithm + std::pair, double> prediction = runViterbi(this->hmm, sv_snps); + std::vector& state_sequence = prediction.first; + double likelihood = prediction.second; + + // Get all the states in the SV region + std::vector sv_states; + for (size_t i = 0; i < state_sequence.size(); i++) + { + if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos) { - best_cnv_type = SVType::INV_DUP; - printMessage("INVDUP detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); + sv_states.push_back(state_sequence[i]); } - // best_cnv_type = sv_types::INV; - // printMessage("Inversion detected for SV candidate " + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); } - // If the dummy call was used, then throw an error if the best SV type - // is unknown - if (std::get<0>(best_sv_candidate) == 0 && std::get<1>(best_sv_candidate) == 0) + // Determine if there is a majority state within the 
SV region and if it + // is greater than 75% + double pct_threshold = 0.75; + int max_state = 0; + int max_count = 0; + for (int i = 0; i < 6; i++) { - throw std::runtime_error("Error: No valid SV type found for copy number prediction."); + int state_count = std::count(sv_states.begin(), sv_states.end(), i+1); + if (state_count > max_count) + { + max_state = i+1; + max_count = state_count; + } } - - // Add the SV call to the main SV data - sv_calls.add(chr, start_pos, end_pos, best_cnv_type, ".", aln_type, best_genotype, best_likelihood); -} - -std::tuple CNVCaller::runCopyNumberPredictionPair(std::string chr, SVCandidate sv_one, SVCandidate sv_two) -{ - // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl; - double best_likelihood = 0.0; - bool best_likelihood_set = false; - bool snps_found = false; - int best_index = 0; - std::pair best_pos; - SNPData best_snp_data; - int current_index = 0; + + // Update SV type and genotype based on the majority state SVType predicted_cnv_type = SVType::UNKNOWN; std::string genotype = "./."; - for (const auto& sv_call : {sv_one, sv_two}) + int state_count = (int) sv_states.size(); + if ((double) max_count / (double) state_count > pct_threshold) { - // Get the SV candidate - const SVCandidate& candidate = sv_call; - - // Get the start and end positions of the SV call - int64_t start_pos = std::get<0>(candidate); - int64_t end_pos = std::get<1>(candidate); - - // Skip if the start position equals zero (dummy call) - if (start_pos == 0) { - continue; - } - - // Get the depth at the start position, which is used as the FORMAT/DP - // value - // int dp_value = pos_depth_map[start_pos]; - - // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 - // the SV length - int64_t sv_length = (end_pos - start_pos) / 2.0; - int64_t snp_start_pos = std::max((int64_t) 1, 
start_pos - sv_length); - int64_t snp_end_pos = end_pos + sv_length; - - // Query the SNP region for the SV candidate - std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov); - SNPData& sv_snps = snp_call.first; - bool sv_snps_found = snp_call.second; - - // Run the Viterbi algorithm - std::pair, double> prediction = runViterbi(this->hmm, sv_snps); - std::vector& state_sequence = prediction.first; - double likelihood = prediction.second; - - // Get all the states in the SV region - std::vector sv_states; - for (size_t i = 0; i < state_sequence.size(); i++) - { - if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos) - { - sv_states.push_back(state_sequence[i]); - } - } - - // Determine if there is a majority state within the SV region and if it - // is greater than 75% - double pct_threshold = 0.75; - int max_state = 0; - int max_count = 0; - for (int i = 0; i < 6; i++) - { - int state_count = std::count(sv_states.begin(), sv_states.end(), i+1); - if (state_count > max_count) - { - max_state = i+1; - max_count = state_count; - } - } - - // Update SV type and genotype based on the majority state - int state_count = (int) sv_states.size(); - if ((double) max_count / (double) state_count > pct_threshold) - { - predicted_cnv_type = getSVTypeFromCNState(max_state); - genotype = cnv_genotype_map[max_state]; - } - - // Update the best SV call based on the likelihood - if (!best_likelihood_set || (likelihood > best_likelihood)) - { - best_likelihood = likelihood; - best_likelihood_set = true; - snps_found = sv_snps_found; - best_index = current_index; - - // Add the state sequence to the SNP data (avoid copying the data) - sv_snps.state_sequence = std::move(state_sequence); - best_snp_data = std::move(sv_snps); - best_pos = std::make_pair(start_pos, end_pos); - } - current_index++; + predicted_cnv_type = getSVTypeFromCNState(max_state); + genotype = cnv_genotype_map[max_state]; } + 
sv_snps.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data // Save the SV calls as a TSV file if enabled - int64_t sv_start_pos = std::get<0>(best_pos); - int64_t sv_end_pos = std::get<1>(best_pos); bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); - if (this->input_data->getSaveCNVData() && copy_number_change && (sv_end_pos - sv_start_pos) > 10000) + if (this->input_data->getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000) { std::string cnv_type_str = getSVTypeString(predicted_cnv_type); - std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) sv_start_pos) + "-" + std::to_string((int) sv_end_pos) + "_SPLITALN.tsv"; + std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl; - this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood); + this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); + // this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood); } - return std::make_tuple(best_index, best_likelihood, predicted_cnv_type, genotype, snps_found); + return std::make_tuple(likelihood, predicted_cnv_type, genotype, sv_snps_found); } + SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map &sv_candidates, int min_length) { SNPInfo& snp_info = this->snp_info; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index f20ddf69..6521fd77 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "utils.h" 
#include "sv_types.h" @@ -433,6 +434,11 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::unordered_map primary_match_map = std::get<6>(primary_alignment); bool primary_strand = std::get<7>(primary_alignment); + // Sort the supplementary alignments by chr, start, and end + std::sort(supp_map[qname].begin(), supp_map[qname].end(), [](const AlignmentData& a, const AlignmentData& b) { + return std::get<0>(a) < std::get<0>(b) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) < std::get<1>(b)) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) == std::get<1>(b) && std::get<2>(a) < std::get<2>(b)); + }); + // Loop through the supplementary alignments and find gaps and overlaps AlignmentVector supp_alignments = supp_map[qname]; for (const auto& supp_alignment : supp_alignments) { @@ -497,7 +503,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // subsections = split_coordinates(entire_coordinate) // likelihoods_subsections = [hmm_model.predict_likelihood(sub) for sub in subsections] - // # Determine best likelihood from subsections + // # Determine best (or worst?) 
likelihood from subsections (also print all likelihoods for each component) // best_likelihood_split = max(likelihoods_subsections) // # Compare and decide @@ -505,6 +511,10 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // best_choice = "entire coordinate" // else: // best_choice = "split coordinates" + bool find_complex_events = true; + if (find_complex_events) { + // std::cout << "Complex event detection not implemented yet" << std::endl; + } // [1] Inversion detection from primary and supplementary alignments // on opposite strands @@ -512,21 +522,34 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // std::cout << "Inversion detected for read " << qname << std::endl; // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl; // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl; - - std::vector> sv_list; // SV candidate and alignment type - - // Use the supplementary alignment coordinates as the SV - // endpoints if (supp_end - supp_start >= min_cnv_length) { SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); - std::pair sv_pair(sv_candidate, "INVERSION"); - sv_list.push_back(sv_pair); - sv_count++; - } + std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + double likelihood = std::get<0>(result); + SVType cnv_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + bool snps_found = std::get<3>(result); + std::string aln_type = "LOG2"; + if (snps_found) { + aln_type += "_SNPS"; + } else { + aln_type += "_NOSNPS"; + } - // Determine which SV to keep based on HMM prediction likelihood - if (sv_list.size() > 0) { - cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, true); + // Update the SV type for inversions + if (cnv_type == SVType::NEUTRAL) { + cnv_type = SVType::INV; + } else if (cnv_type == SVType::DUP) { + cnv_type = SVType::INV_DUP; + 
} else { + cnv_type = SVType::UNKNOWN; + } + + // Add the SV call to the main SV data if not unknown + if (cnv_type != SVType::UNKNOWN) { + sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); + } + sv_count++; } } @@ -535,37 +558,54 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Gap with supplementary before primary: // [supp_start] [supp_end] -- [primary_start] [primary_end] - std::vector> sv_list; // SV candidate and alignment type - - // Use the alignment ends as the SV endpoints if (primary_end - supp_start >= min_cnv_length) { SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); - std::pair sv_pair(sv_candidate, "GAPOUTER_A"); - sv_list.push_back(sv_pair); - sv_count++; - } - // Determine which SV to keep based on HMM prediction likelihood - if (sv_list.size() > 0) { - cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false); + // Run copy number prediction for the SV candidate + std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + double likelihood = std::get<0>(result); + SVType cnv_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + bool snps_found = std::get<3>(result); + std::string aln_type = "GAPOUTER_A"; + if (snps_found) { + aln_type += "_SNPS"; + } else { + aln_type += "_NOSNPS"; + } + + // Add the SV call to the main SV data if not unknown + if (cnv_type != SVType::UNKNOWN) { + sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood); + } + sv_count++; } } else if (supp_start > primary_end && supp_end > primary_end) { // Gap with supplementary after primary: // [primary_start] [primary_end] -- [supp_start] [supp_end] - std::vector> sv_list; // SV candidate and alignment type - // Use the alignment ends as the SV endpoints if (supp_end - primary_start >= min_cnv_length) { SVCandidate sv_candidate(primary_start+1, supp_end+1, "."); - std::pair 
sv_pair(sv_candidate, "GAPOUTER_B"); - sv_list.push_back(sv_pair); - sv_count++; - } - // Determine which SV to keep based on HMM prediction likelihood - if (sv_list.size() > 0) { - cnv_caller.updateSVsFromCopyNumberPrediction(sv_calls, sv_list, supp_chr, false); + // Run copy number prediction for the SV candidate + std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + double likelihood = std::get<0>(result); + SVType cnv_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + bool snps_found = std::get<3>(result); + std::string aln_type = "GAPOUTER_B"; + if (snps_found) { + aln_type += "_SNPS"; + } else { + aln_type += "_NOSNPS"; + } + + // Add the SV call to the main SV data if not unknown + if (cnv_type != SVType::UNKNOWN) { + sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); + } + sv_count++; } } } diff --git a/tests/test_general.py b/tests/test_general.py index 9083eb6a..8f5d4006 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 23 + assert len(f.readlines()) == 22 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. 
From fa4f744bef6f8a9fb0ae4e9803e8f29fcad962b9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Nov 2024 16:02:32 -0400 Subject: [PATCH 009/134] Simplify split sv calling --- src/sv_caller.cpp | 313 ++++++++++++++++++++++------------------------ 1 file changed, 149 insertions(+), 164 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 6521fd77..3a3d3723 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -261,18 +261,15 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Update match/mismatch query map if (op == BAM_CEQUAL) { - // match_count += op_len; for (int j = 0; j < op_len; j++) { query_match_map[query_pos + j] = 1; } } else if (op == BAM_CDIFF) { - // mismatch_count += op_len; for (int j = 0; j < op_len; j++) { query_match_map[query_pos + j] = 0; } } else if (op == BAM_CMATCH) { - // Compare read and reference sequences - // Get the sequence from the query + // Get the read sequence uint8_t* seq_ptr = bam_get_seq(alignment); std::string cmatch_seq_str = ""; for (int j = 0; j < op_len; j++) { @@ -433,180 +430,168 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map int32_t primary_query_end = std::get<5>(primary_alignment); std::unordered_map primary_match_map = std::get<6>(primary_alignment); bool primary_strand = std::get<7>(primary_alignment); + if (supp_map.find(qname) == supp_map.end()) { + continue; + } - // Sort the supplementary alignments by chr, start, and end - std::sort(supp_map[qname].begin(), supp_map[qname].end(), [](const AlignmentData& a, const AlignmentData& b) { - return std::get<0>(a) < std::get<0>(b) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) < std::get<1>(b)) || (std::get<0>(a) == std::get<0>(b) && std::get<1>(a) == std::get<1>(b) && std::get<2>(a) < std::get<2>(b)); - }); - - // Loop through the supplementary alignments and find gaps and overlaps - AlignmentVector supp_alignments = supp_map[qname]; - for (const auto& supp_alignment : supp_alignments) { - - // 
Skip supplementary alignments that are on a different chromosome - // for now (TODO: Use for identifying trans-chromosomal SVs such as - // translocations) - std::string supp_chr = std::get<0>(supp_alignment); - if (primary_chr != supp_chr) { - continue; - } - int32_t supp_start = std::get<1>(supp_alignment); - int32_t supp_end = std::get<2>(supp_alignment); - int32_t supp_query_start = std::get<4>(supp_alignment); - int32_t supp_query_end = std::get<5>(supp_alignment); - std::unordered_map supp_match_map = std::get<6>(supp_alignment); - bool supp_strand = std::get<7>(supp_alignment); - - // Resolve overlaps between the primary and supplementary query sequences - int32_t overlap_start = std::max(primary_query_start, supp_query_start); - int32_t overlap_end = std::min(primary_query_end, supp_query_end); - int32_t overlap_length = overlap_end - overlap_start; - if (overlap_length > 0) { - // std::cout << "Overlap detected for read " << qname << std::endl; - // std::cout << "Primary read position: " << primary_query_start << "-" << primary_query_end << std::endl; - // std::cout << "Supplementary read position: " << supp_query_start << "-" << supp_query_end << std::endl; - // std::cout << "Overlap range: " << overlap_start << "-" << overlap_end << std::endl; - // std::cout << "Overlap length: " << overlap_length << std::endl; - // std::cout << "Primary reference position: " << primary_start << "-" << primary_end << std::endl; - // std::cout << "Supplementary reference position: " << supp_start << "-" << supp_end << std::endl; - - // Calculate the mismatch rate for each alignment at the overlap - double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); - double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); - // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; - // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << 
std::endl; - - // Trim the overlap from the alignment with the higher mismatch - // rate - if (primary_mismatch_rate > supp_mismatch_rate) { - if (overlap_start == primary_query_start) { - primary_start += overlap_length; - } else if (overlap_end == primary_query_end) { - primary_end -= overlap_length; - } - - } else { - if (overlap_start == supp_query_start) { - supp_start += overlap_length; - } else if (overlap_end == supp_query_end) { - supp_end -= overlap_length; - } + // Find the largest alignment on the primary chromosome + AlignmentData supp_alignment = supp_map[qname][0]; + int32_t largest_supp_length = 0; + auto largest_supp_it = supp_map[qname].end(); + for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();) { + const auto& supp_chr = std::get<0>(*it); + if (supp_chr != primary_chr) { + it = supp_map[qname].erase(it); + } else { + int32_t supp_length = std::get<2>(*it) - std::get<1>(*it); + if (supp_length > largest_supp_length) { + largest_supp_length = supp_length; + largest_supp_it = it; } + ++it; } + } + if (largest_supp_it == supp_map[qname].end()) { + continue; // No primary chromosome alignments + } + supp_alignment = *largest_supp_it; + + // Run SV detection from the primary and supplementary alignment + std::string supp_chr = std::get<0>(supp_alignment); + int32_t supp_start = std::get<1>(supp_alignment); + int32_t supp_end = std::get<2>(supp_alignment); + int32_t supp_query_start = std::get<4>(supp_alignment); + int32_t supp_query_end = std::get<5>(supp_alignment); + std::unordered_map supp_match_map = std::get<6>(supp_alignment); + bool supp_strand = std::get<7>(supp_alignment); + + // Resolve overlaps between the primary and supplementary query sequences + int32_t overlap_start = std::max(primary_query_start, supp_query_start); + int32_t overlap_end = std::min(primary_query_end, supp_query_end); + int32_t overlap_length = overlap_end - overlap_start; + if (overlap_length > 0) { + // std::cout << "Overlap detected for read " << 
qname << std::endl; + // std::cout << "Primary read position: " << primary_query_start << "-" << primary_query_end << std::endl; + // std::cout << "Supplementary read position: " << supp_query_start << "-" << supp_query_end << std::endl; + // std::cout << "Overlap range: " << overlap_start << "-" << overlap_end << std::endl; + // std::cout << "Overlap length: " << overlap_length << std::endl; + // std::cout << "Primary reference position: " << primary_start << "-" << primary_end << std::endl; + // std::cout << "Supplementary reference position: " << supp_start << "-" << supp_end << std::endl; + + // Calculate the mismatch rate for each alignment at the overlap + double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); + double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); + // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; + // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; + + // Trim the overlap from the alignment with the higher mismatch + // rate + if (primary_mismatch_rate > supp_mismatch_rate) { + if (overlap_start == primary_query_start) { + primary_start += overlap_length; + } else if (overlap_end == primary_query_end) { + primary_end -= overlap_length; + } - // TODO: - // if (find_complex_events) - // # Calculate likelihood for entire coordinate - // likelihood_entire = hmm_model.predict_likelihood(entire_coordinate) - - // # Split coordinates into smaller sections and calculate likelihoods - // subsections = split_coordinates(entire_coordinate) - // likelihoods_subsections = [hmm_model.predict_likelihood(sub) for sub in subsections] - - // # Determine best (or worst?) 
likelihood from subsections (also print all likelihoods for each component) - // best_likelihood_split = max(likelihoods_subsections) - - // # Compare and decide - // if likelihood_entire > best_likelihood_split: - // best_choice = "entire coordinate" - // else: - // best_choice = "split coordinates" - bool find_complex_events = true; - if (find_complex_events) { - // std::cout << "Complex event detection not implemented yet" << std::endl; + } else { + if (overlap_start == supp_query_start) { + supp_start += overlap_length; + } else if (overlap_end == supp_query_end) { + supp_end -= overlap_length; + } } + } - // [1] Inversion detection from primary and supplementary alignments - // on opposite strands - if (primary_strand != supp_strand) { - // std::cout << "Inversion detected for read " << qname << std::endl; - // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl; - // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl; - if (supp_end - supp_start >= min_cnv_length) { - SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); - double likelihood = std::get<0>(result); - SVType cnv_type = std::get<1>(result); - std::string genotype = std::get<2>(result); - bool snps_found = std::get<3>(result); - std::string aln_type = "LOG2"; - if (snps_found) { - aln_type += "_SNPS"; - } else { - aln_type += "_NOSNPS"; - } + // [1] Inversion detection from primary and supplementary alignments + // on opposite strands + if (primary_strand != supp_strand) { + // std::cout << "Inversion detected for read " << qname << std::endl; + // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl; + // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl; + if (supp_end - supp_start >= min_cnv_length) { + SVCandidate sv_candidate(supp_start+1, supp_end+1, 
"."); + std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + double likelihood = std::get<0>(result); + SVType cnv_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + bool snps_found = std::get<3>(result); + std::string aln_type = "LOG2"; + if (snps_found) { + aln_type += "_SNPS"; + } else { + aln_type += "_NOSNPS"; + } - // Update the SV type for inversions - if (cnv_type == SVType::NEUTRAL) { - cnv_type = SVType::INV; - } else if (cnv_type == SVType::DUP) { - cnv_type = SVType::INV_DUP; - } else { - cnv_type = SVType::UNKNOWN; - } - - // Add the SV call to the main SV data if not unknown - if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); - } - sv_count++; + // Update the SV type for inversions + if (cnv_type == SVType::NEUTRAL) { + cnv_type = SVType::INV; + } else if (cnv_type == SVType::DUP) { + cnv_type = SVType::INV_DUP; + } else { + cnv_type = SVType::UNKNOWN; + } + + // Add the SV call to the main SV data if not unknown + if (cnv_type != SVType::UNKNOWN) { + sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); } + sv_count++; } + } - // [2] CNV detection based on primary and supplementary alignment boundaries - else if (supp_start < primary_start && supp_end < primary_start) { - - // Gap with supplementary before primary: - // [supp_start] [supp_end] -- [primary_start] [primary_end] - if (primary_end - supp_start >= min_cnv_length) { - SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); - - // Run copy number prediction for the SV candidate - std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); - double likelihood = std::get<0>(result); - SVType cnv_type = std::get<1>(result); - std::string genotype = std::get<2>(result); - bool snps_found = std::get<3>(result); - std::string aln_type = "GAPOUTER_A"; - if (snps_found) { - aln_type += "_SNPS"; - } else { 
- aln_type += "_NOSNPS"; - } + // [2] CNV detection based on primary and supplementary alignment boundaries + else if (supp_start < primary_start && supp_end < primary_start) { + + // Gap with supplementary before primary: + // [supp_start] [supp_end] -- [primary_start] [primary_end] + if (primary_end - supp_start >= min_cnv_length) { + SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); + + // Run copy number prediction for the SV candidate + std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + double likelihood = std::get<0>(result); + SVType cnv_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + bool snps_found = std::get<3>(result); + std::string aln_type = "GAPOUTER_A"; + if (snps_found) { + aln_type += "_SNPS"; + } else { + aln_type += "_NOSNPS"; + } - // Add the SV call to the main SV data if not unknown - if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood); - } - sv_count++; + // Add the SV call to the main SV data if not unknown + if (cnv_type != SVType::UNKNOWN) { + sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood); + } + sv_count++; + } + + } else if (supp_start > primary_end && supp_end > primary_end) { + // Gap with supplementary after primary: + // [primary_start] [primary_end] -- [supp_start] [supp_end] + + if (supp_end - primary_start >= min_cnv_length) { + SVCandidate sv_candidate(primary_start+1, supp_end+1, "."); + + // Run copy number prediction for the SV candidate + std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + double likelihood = std::get<0>(result); + SVType cnv_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + bool snps_found = std::get<3>(result); + std::string aln_type = "GAPOUTER_B"; + if (snps_found) { + aln_type += "_SNPS"; + } else { + aln_type += "_NOSNPS"; } - - } else if (supp_start > 
primary_end && supp_end > primary_end) { - // Gap with supplementary after primary: - // [primary_start] [primary_end] -- [supp_start] [supp_end] - - if (supp_end - primary_start >= min_cnv_length) { - SVCandidate sv_candidate(primary_start+1, supp_end+1, "."); - - // Run copy number prediction for the SV candidate - std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); - double likelihood = std::get<0>(result); - SVType cnv_type = std::get<1>(result); - std::string genotype = std::get<2>(result); - bool snps_found = std::get<3>(result); - std::string aln_type = "GAPOUTER_B"; - if (snps_found) { - aln_type += "_SNPS"; - } else { - aln_type += "_NOSNPS"; - } - // Add the SV call to the main SV data if not unknown - if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); - } - sv_count++; + // Add the SV call to the main SV data if not unknown + if (cnv_type != SVType::UNKNOWN) { + sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); } + sv_count++; } } } From 956e7e51fcc28967a612b468dbe7dcf606256e1c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 4 Nov 2024 12:59:45 -0500 Subject: [PATCH 010/134] Fix GT error and reduce fps --- include/cnv_caller.h | 4 +- include/sv_types.h | 21 +-- python/sv_merger.py | 7 +- src/cnv_caller.cpp | 3 +- src/sv_caller.cpp | 367 ++++++++++++++++++++++++++++++------------ src/sv_data.cpp | 6 + tests/test_general.py | 2 +- 7 files changed, 286 insertions(+), 124 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 86f0abf9..af211fc8 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -61,7 +61,7 @@ class CNVCaller { // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. 
// Each of the 6 state predictions corresponds to a copy number state // (0=No predicted state) - // 0: 1/1 (Normal diploid: no copy number change, GT: 1/1) + // 0: Unknown (No predicted state) // 1: 0/0 (Two copy loss: homozygous deletion, GT: 0/0) // 2: 1/0 (One copy loss: heterozygous deletion, GT: 0/1) // 3: 1/1 (Normal diploid: no copy number change, GT: 1/1) @@ -69,7 +69,7 @@ class CNVCaller { // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1) // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1) std ::map cnv_genotype_map = { - {0, "1/1"}, + {0, "./."}, {1, "0/0"}, {2, "0/1"}, {3, "1/1"}, diff --git a/include/sv_types.h b/include/sv_types.h index 97b49185..335a4033 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -20,7 +20,8 @@ namespace sv_types { INS = 3, BND = 4, NEUTRAL = 5, // Neutral copy number with unknown type - INV_DUP = 6 // Inversion duplication + INV_DUP = 6, // Inversion duplication + COMPLEX = 7 // Complex SV }; // Mapping of SV types to strings @@ -32,7 +33,8 @@ namespace sv_types { {SVType::INS, "INS"}, {SVType::BND, "BND"}, {SVType::NEUTRAL, "NEUTRAL"}, - {SVType::INV_DUP, "INV_DUP"} + {SVType::INV_DUP, "INVDUP"}, + {SVType::COMPLEX, "COMPLEX"} }; // Mapping of 6 copy number states to SV types @@ -56,18 +58,6 @@ namespace sv_types { return CNVTypeMap.at(cn_state); } - // static const int UNKNOWN = -1; - // static const int DEL = 0; - // static const int DUP = 1; - // static const int INV = 2; - // static const int INS = 3; - // static const int BND = 4; - // static const int NEUTRAL = 5; // Neutral copy number with unknown type - // static const int INV_DUP = 6; // Inversion duplication - - // // Define SVTypeString for SV types (for VCF output) - // static const std::string SVTypeString[] = {"DEL", "DUP", "INV", "INS", "BND", "NEUT", "INVDUP"}; - // Create a struct for storing SV information struct SVInfo { SVType sv_type; @@ -79,9 +69,6 @@ namespace sv_types { double hmm_likelihood = 0.0; // HMM 
likelihood score for the state sequence SVInfo() = default; - // SVInfo() : - // sv_type(-1), read_support(0), read_depth(0), data_type({}), sv_length(0), genotype("./."), hmm_likelihood(0.0){} - SVInfo(SVType sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) : sv_type(sv_type), read_support(read_support), read_depth(read_depth), data_type({data_type}), sv_length(sv_length), genotype(genotype), hmm_likelihood(hmm_likelihood) {} }; diff --git a/python/sv_merger.py b/python/sv_merger.py index 56c0ae26..d2c59977 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -137,6 +137,10 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): cluster_labels = [] # dbscan = DBSCAN(eps=30000, min_samples=3) + + if len(breakpoints) == 1: + return merged_records + logging.info("Clustering %d SV breakpoints with parameters: min_cluster_size=%d", len(breakpoints), cluster_size_min) dbscan = HDBSCAN(min_cluster_size=cluster_size_min, min_samples=2) if len(breakpoints) > 0: @@ -144,6 +148,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): cluster_labels = dbscan.fit_predict(breakpoints) logging.info("Label counts: %d", len(np.unique(cluster_labels))) + # Merge SVs with the same label unique_labels = np.unique(cluster_labels) @@ -421,4 +426,4 @@ def sv_merger(vcf_file_path, cluster_size_min=3, suffix='.merged'): # DBSCAN sv_merger(vcf_file_path, cluster_size_min=cluster_size_min, suffix=suffix) - \ No newline at end of file + diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 9e575771..e052f45f 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -182,9 +182,8 @@ std::tuple CNVCaller::runCopyNumberPrediction std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; std::cout << "Saving SV split-alignment copy number predictions to " << 
sv_filename << std::endl; this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); - // this->saveSVCopyNumberToTSV(best_snp_data, sv_filename, chr, best_pos.first, best_pos.second, cnv_type_str, best_likelihood); } - + return std::make_tuple(likelihood, predicted_cnv_type, genotype, sv_snps_found); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3a3d3723..53f20ffb 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -434,121 +434,286 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map continue; } - // Find the largest alignment on the primary chromosome - AlignmentData supp_alignment = supp_map[qname][0]; - int32_t largest_supp_length = 0; - auto largest_supp_it = supp_map[qname].end(); + // Resolve overlaps between the primary and supplementary query + // sequences + for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { + std::string supp_chr = std::get<0>(*it); + // int32_t supp_start = std::get<1>(*it); + // int32_t supp_end = std::get<2>(*it); + int32_t supp_query_start = std::get<4>(*it); + int32_t supp_query_end = std::get<5>(*it); + std::unordered_map supp_match_map = std::get<6>(*it); + bool supp_strand = std::get<7>(*it); + + // Resolve overlaps between the primary and supplementary query + // sequences + int32_t overlap_start = std::max(primary_query_start, supp_query_start); + int32_t overlap_end = std::min(primary_query_end, supp_query_end); + int32_t overlap_length = overlap_end - overlap_start; + if (overlap_length > 0) { + + // Calculate the mismatch rate for each alignment at the overlap + double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); + double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); + // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; + // std::cout << "Supplementary mismatch rate: " << 
supp_mismatch_rate << std::endl; + + // Trim the overlap from the alignment with the higher mismatch + // rate + if (primary_mismatch_rate > supp_mismatch_rate) { + if (overlap_start == primary_query_start) { + primary_start += overlap_length; + } else if (overlap_end == primary_query_end) { + primary_end -= overlap_length; + } + + } else { + if (overlap_start == supp_query_start) { + // supp_start += overlap_length; + // Update the value in the supp map + std::get<1>(*it) += overlap_length; + } else if (overlap_end == supp_query_end) { + // supp_end -= overlap_length; + // Update the value in the supp map + std::get<2>(*it) -= overlap_length; + } + } + } + } + + // Remove supplementary alignments that are not on the same chromosome + // as the primary alignment for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();) { - const auto& supp_chr = std::get<0>(*it); - if (supp_chr != primary_chr) { + if (std::get<0>(*it) != primary_chr) { it = supp_map[qname].erase(it); } else { - int32_t supp_length = std::get<2>(*it) - std::get<1>(*it); - if (supp_length > largest_supp_length) { - largest_supp_length = supp_length; - largest_supp_it = it; - } ++it; } } - if (largest_supp_it == supp_map[qname].end()) { - continue; // No primary chromosome alignments - } - supp_alignment = *largest_supp_it; - - // Run SV detection from the primary and supplementary alignment - std::string supp_chr = std::get<0>(supp_alignment); - int32_t supp_start = std::get<1>(supp_alignment); - int32_t supp_end = std::get<2>(supp_alignment); - int32_t supp_query_start = std::get<4>(supp_alignment); - int32_t supp_query_end = std::get<5>(supp_alignment); - std::unordered_map supp_match_map = std::get<6>(supp_alignment); - bool supp_strand = std::get<7>(supp_alignment); - - // Resolve overlaps between the primary and supplementary query sequences - int32_t overlap_start = std::max(primary_query_start, supp_query_start); - int32_t overlap_end = std::min(primary_query_end, supp_query_end); 
- int32_t overlap_length = overlap_end - overlap_start; - if (overlap_length > 0) { - // std::cout << "Overlap detected for read " << qname << std::endl; - // std::cout << "Primary read position: " << primary_query_start << "-" << primary_query_end << std::endl; - // std::cout << "Supplementary read position: " << supp_query_start << "-" << supp_query_end << std::endl; - // std::cout << "Overlap range: " << overlap_start << "-" << overlap_end << std::endl; - // std::cout << "Overlap length: " << overlap_length << std::endl; - // std::cout << "Primary reference position: " << primary_start << "-" << primary_end << std::endl; - // std::cout << "Supplementary reference position: " << supp_start << "-" << supp_end << std::endl; - - // Calculate the mismatch rate for each alignment at the overlap - double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); - double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); - // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; - // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; - - // Trim the overlap from the alignment with the higher mismatch - // rate - if (primary_mismatch_rate > supp_mismatch_rate) { - if (overlap_start == primary_query_start) { - primary_start += overlap_length; - } else if (overlap_end == primary_query_end) { - primary_end -= overlap_length; - } - } else { - if (overlap_start == supp_query_start) { - supp_start += overlap_length; - } else if (overlap_end == supp_query_end) { - supp_end -= overlap_length; - } + // Loop through the supplementary alignments, find the largest + // supplementary alignment, and the closest non-overlapping + // supplementary alignment to the primary alignment + AlignmentData largest_supp_alignment = supp_map[qname][0]; + AlignmentData closest_supp_alignment = supp_map[qname][0]; + int32_t largest_supp_length = 0; + int32_t 
closest_supp_distance = std::numeric_limits::max(); + for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { + const auto& supp_chr = std::get<0>(*it); + int32_t supp_start = std::get<1>(*it); + int32_t supp_end = std::get<2>(*it); + int32_t supp_length = supp_end - supp_start; + int32_t supp_distance = std::numeric_limits::max(); + if (supp_start > primary_end) { + supp_distance = supp_start - primary_end; + } else if (supp_end < primary_start) { + supp_distance = primary_start - supp_end; + } + if (supp_length > largest_supp_length) { + largest_supp_length = supp_length; + largest_supp_alignment = *it; + } + if (supp_distance < closest_supp_distance) { + closest_supp_distance = supp_distance; + closest_supp_alignment = *it; } } - // [1] Inversion detection from primary and supplementary alignments - // on opposite strands - if (primary_strand != supp_strand) { - // std::cout << "Inversion detected for read " << qname << std::endl; - // std::cout << "Primary read position: " << primary_start << "-" << primary_end << std::endl; - // std::cout << "Supplementary read position: " << supp_start << "-" << supp_end << std::endl; - if (supp_end - supp_start >= min_cnv_length) { - SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); - double likelihood = std::get<0>(result); - SVType cnv_type = std::get<1>(result); - std::string genotype = std::get<2>(result); - bool snps_found = std::get<3>(result); - std::string aln_type = "LOG2"; - if (snps_found) { - aln_type += "_SNPS"; - } else { - aln_type += "_NOSNPS"; - } + // Find if there are any reverse strand alignments between the primary + // and supplementary alignment + bool complex_sv_found = false; + int32_t largest_supp_start = std::get<1>(largest_supp_alignment); + int32_t largest_supp_end = std::get<2>(largest_supp_alignment); + for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { + if 
(std::get<7>(*it) != std::get<7>(primary_alignment)) { // Check if the strands are different + // Check if it is between the primary and supplementary + // alignment + int32_t rev_supp_start = std::get<1>(*it); + int32_t rev_supp_end = std::get<2>(*it); + if ((rev_supp_start > primary_end && rev_supp_end < largest_supp_start) || (rev_supp_start > largest_supp_end && rev_supp_end < primary_start)) { + // [primary_end] -- [supp_reverse] -- [supp_start] + // Or: [supp_end] -- [supp_reverse] -- [primary_start] + + // Detect CNVs at the primary alignment + SVType primary_type = SVType::UNKNOWN; + if (primary_end - primary_start >= this->input_data->getMinCNVLength()) { + SVCandidate sv_candidate(primary_start+1, primary_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double primary_likelihood = std::get<0>(result); + primary_type = std::get<1>(result); + + // Break if prediction is unknown + if (primary_type == SVType::UNKNOWN) { + continue; + } + } - // Update the SV type for inversions - if (cnv_type == SVType::NEUTRAL) { - cnv_type = SVType::INV; - } else if (cnv_type == SVType::DUP) { - cnv_type = SVType::INV_DUP; - } else { - cnv_type = SVType::UNKNOWN; - } - - // Add the SV call to the main SV data if not unknown - if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(supp_chr, supp_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); + // Detect CNVs at the largest supplementary alignment + SVType largest_supp_type = SVType::UNKNOWN; + if (largest_supp_end - largest_supp_start >= this->input_data->getMinCNVLength()) { + SVCandidate sv_candidate(largest_supp_start+1, largest_supp_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double largest_supp_likelihood = std::get<0>(result); + largest_supp_type = std::get<1>(result); + + // Break if prediction is unknown + if (largest_supp_type == SVType::UNKNOWN) { + continue; + } + } + + // Predict between the 
primary and largest supplementary + // alignment + int32_t left_start = std::min(primary_end, largest_supp_end); + int32_t right_end = std::max(primary_start, largest_supp_start); + + // Detect CNVs between the left and reverse supplementary + // alignment + SVType left_type = SVType::UNKNOWN; + // if (rev_supp_start - primary_end >= + // this->input_data->getMinCNVLength()) { + if (rev_supp_start - left_start >= this->input_data->getMinCNVLength()) { + // SVCandidate sv_candidate(primary_end+1, + // rev_supp_start+1, "."); + SVCandidate sv_candidate(left_start+1, rev_supp_start+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double left_likelihood = std::get<0>(result); + left_type = std::get<1>(result); + + // Break if prediction is unknown + if (left_type == SVType::UNKNOWN) { + continue; + } + } + + // Detect CNVs at the reverse alignment + SVType rev_type = SVType::UNKNOWN; + if (rev_supp_end - rev_supp_start >= this->input_data->getMinCNVLength()) { + SVCandidate sv_candidate(rev_supp_start+1, rev_supp_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double inv_likelihood = std::get<0>(result); + rev_type = std::get<1>(result); + if (rev_type == SVType::NEUTRAL) { + rev_type = SVType::INV; + } else if (rev_type == SVType::DUP) { + rev_type = SVType::INV_DUP; + } + + // Break if prediction is unknown + if (rev_type == SVType::UNKNOWN) { + continue; + } + } + + // Detect CNVs between the reverse supplementary and the + // supplementary alignment (right side) + SVType right_type = SVType::UNKNOWN; + // if (supp_start - rev_supp_end >= + // this->input_data->getMinCNVLength()) { + if (right_end - rev_supp_end >= this->input_data->getMinCNVLength()) { + SVCandidate sv_candidate(rev_supp_end+1, right_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double right_likelihood = std::get<0>(result); + right_type = 
std::get<1>(result); + + // Break if prediction is unknown + if (right_type == SVType::UNKNOWN) { + continue; + } + } + + // Resolve the SV type and coordinates + std::string sv_type_str = ""; + int32_t sv_start_pos = left_start; + int32_t sv_end_pos = left_start; + + // Alignment predictions + if (primary_start < left_start) { + if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) { + sv_type_str += getSVTypeString(primary_type) + "+"; + sv_end_pos = primary_end; + } else { + sv_start_pos = primary_start; + } + } else { + if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) { + sv_type_str += getSVTypeString(largest_supp_type) + "+"; + sv_end_pos = largest_supp_end; + } else { + sv_start_pos = largest_supp_start; + } + } + + // Between-alignments predictions + if (left_type != SVType::NEUTRAL && left_type != SVType::UNKNOWN) { + sv_type_str += getSVTypeString(left_type) + "+"; + sv_end_pos = rev_supp_start; + } else { + sv_start_pos = rev_supp_start; + } + + if (rev_type != SVType::NEUTRAL && rev_type != SVType::UNKNOWN) { + sv_type_str += getSVTypeString(rev_type) + "+"; + sv_end_pos = rev_supp_end; + } else { + sv_start_pos = rev_supp_end; + } + + if (right_type != SVType::NEUTRAL && right_type != SVType::UNKNOWN) { + sv_end_pos = right_end; + sv_type_str += getSVTypeString(right_type) + "+"; + } + + // Alignments predictions + if (primary_end > right_end) { + if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) { + sv_type_str += getSVTypeString(primary_type) + "+"; + sv_end_pos = primary_end; + } else { + sv_start_pos = primary_start; + } + } else { + if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) { + sv_type_str += getSVTypeString(largest_supp_type) + "+"; + sv_end_pos = largest_supp_end; + } else { + sv_start_pos = largest_supp_start; + } + } + + if (sv_type_str != "") { + sv_type_str.pop_back(); // Remove the last '+' + + // Add the complex SV + 
complex_sv_found = true; + std::cout << "Complex SV detected of type " << sv_type_str << " at positions " << primary_chr << ":" << left_start << "-" << right_end << std::endl; + sv_count++; + + // Add the complex SV + sv_calls.add(primary_chr, sv_start_pos+2, sv_end_pos+1, SVType::COMPLEX, ".", "COMPLEX", "./.", 0.0); + } } - sv_count++; } } - // [2] CNV detection based on primary and supplementary alignment boundaries - else if (supp_start < primary_start && supp_end < primary_start) { + if (complex_sv_found) { + continue; // Continue to the next alignment + } + + // [2] CNV detection based on primary and largest supplementary + // alignment boundaries + // else if (largest_supp_start < primary_start && largest_supp_end < + // primary_start) { + std::string largest_supp_chr = std::get<0>(largest_supp_alignment); + if (largest_supp_start < primary_start && largest_supp_end < primary_start) { // Gap with supplementary before primary: // [supp_start] [supp_end] -- [primary_start] [primary_end] - if (primary_end - supp_start >= min_cnv_length) { - SVCandidate sv_candidate(supp_start+1, primary_end+1, "."); + if (primary_end - largest_supp_start >= min_cnv_length) { + SVCandidate sv_candidate(largest_supp_start+1, primary_end+1, "."); // Run copy number prediction for the SV candidate - std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + std::tuple result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate); double likelihood = std::get<0>(result); SVType cnv_type = std::get<1>(result); std::string genotype = std::get<2>(result); @@ -562,20 +727,20 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Add the SV call to the main SV data if not unknown if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(supp_chr, supp_start, primary_end, cnv_type, ".", aln_type, genotype, likelihood); + sv_calls.add(largest_supp_chr, largest_supp_start+1, primary_end+1, cnv_type, ".", aln_type, genotype, 
likelihood); } sv_count++; } - } else if (supp_start > primary_end && supp_end > primary_end) { + } else if (largest_supp_start > primary_end && largest_supp_end > primary_end) { // Gap with supplementary after primary: // [primary_start] [primary_end] -- [supp_start] [supp_end] - if (supp_end - primary_start >= min_cnv_length) { - SVCandidate sv_candidate(primary_start+1, supp_end+1, "."); + if (largest_supp_end - primary_start >= min_cnv_length) { + SVCandidate sv_candidate(primary_start+1, largest_supp_end+1, "."); // Run copy number prediction for the SV candidate - std::tuple result = cnv_caller.runCopyNumberPrediction(supp_chr, sv_candidate); + std::tuple result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate); double likelihood = std::get<0>(result); SVType cnv_type = std::get<1>(result); std::string genotype = std::get<2>(result); @@ -589,7 +754,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Add the SV call to the main SV data if not unknown if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(supp_chr, primary_start, supp_end, cnv_type, ".", aln_type, genotype, likelihood); + sv_calls.add(largest_supp_chr, primary_start+1, largest_supp_end+1, cnv_type, ".", aln_type, genotype, likelihood); } sv_count++; } diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 797ef704..18ea8056 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -10,6 +10,12 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) { + // Throw an error if the genotype is not valid + if (genotype != "./." 
&& genotype != "0/0" && genotype != "0/1" && genotype != "1/1") { + std::cerr << "Error: Invalid genotype " << genotype << std::endl; + return -1; + } + // Check if the alternate allele contains ambiguous bases const std::unordered_set ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'}; for (char c : alt_allele) { diff --git a/tests/test_general.py b/tests/test_general.py index 8f5d4006..499805c8 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 22 + assert len(f.readlines()) == 21 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. From e88f5629b838511fe429331c656683194aecdb3b Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 11 Nov 2024 16:22:13 -0500 Subject: [PATCH 011/134] Fix alt allele and likelihood comparisons --- include/sv_data.h | 2 +- include/sv_types.h | 2 +- src/khmm.cpp | 6 +- src/sv_caller.cpp | 339 ++++++++++++++---------------------------- src/sv_data.cpp | 9 +- tests/test_general.py | 2 +- 6 files changed, 128 insertions(+), 232 deletions(-) diff --git a/include/sv_data.h b/include/sv_data.h index 548d6513..c321bf64 100644 --- a/include/sv_data.h +++ b/include/sv_data.h @@ -36,7 +36,7 @@ class SVData { public: SVData() {}; - int add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); + int add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); void concatenate(const SVData& sv_data); diff --git a/include/sv_types.h b/include/sv_types.h index 335a4033..f58e6f7b 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -74,7 +74,7 @@ namespace sv_types { }; // Type 
definition for SV-related structures - using SVCandidate = std::tuple; // SV (start, end, alt_allele) + using SVCandidate = std::tuple; // SV (start, end, alt_allele) using SVDepthMap = std::unordered_map>; // Chromosome -> SV candidate -> SV info } diff --git a/src/khmm.cpp b/src/khmm.cpp index a5d553be..375325a1 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -378,8 +378,12 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect free_dmatrix(biot, 1, hmm.N, 1, T); free_dmatrix(A1, 1, hmm.N, 1, hmm.N); + // Normalize the log likelihood by the sample size + double min_prob_normalized = min_prob / (double)T; + // Return the state sequence and its likelihood - return std::make_pair(q, min_prob); + // return std::make_pair(q, min_prob); + return std::make_pair(q, min_prob_normalized); } CHMM ReadCHMM(const char *filename) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 53f20ffb..12c6d5f4 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -491,6 +491,17 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map } } + // Run copy number variant predictions on the primary alignment + SVType primary_type = SVType::UNKNOWN; + double primary_log_likelihood = std::numeric_limits::lowest(); + if (primary_end - primary_start >= min_cnv_length) { + SVCandidate sv_candidate(primary_start+1, primary_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + primary_log_likelihood = std::get<0>(result); + // primary_log_likelihood /= (double)(primary_end - primary_start); // Normalize the log likelihood by the length + primary_type = std::get<1>(result); + } + // Loop through the supplementary alignments, find the largest // supplementary alignment, and the closest non-overlapping // supplementary alignment to the primary alignment @@ -498,6 +509,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map AlignmentData closest_supp_alignment = supp_map[qname][0]; 
int32_t largest_supp_length = 0; int32_t closest_supp_distance = std::numeric_limits::max(); + int32_t closest_supp_length = 0; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { const auto& supp_chr = std::get<0>(*it); int32_t supp_start = std::get<1>(*it); @@ -514,249 +526,124 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map largest_supp_alignment = *it; } if (supp_distance < closest_supp_distance) { - closest_supp_distance = supp_distance; + closest_supp_length = supp_length; closest_supp_alignment = *it; + closest_supp_distance = supp_distance; } } - // Find if there are any reverse strand alignments between the primary - // and supplementary alignment - bool complex_sv_found = false; - int32_t largest_supp_start = std::get<1>(largest_supp_alignment); - int32_t largest_supp_end = std::get<2>(largest_supp_alignment); - for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - if (std::get<7>(*it) != std::get<7>(primary_alignment)) { // Check if the strands are different - // Check if it is between the primary and supplementary - // alignment - int32_t rev_supp_start = std::get<1>(*it); - int32_t rev_supp_end = std::get<2>(*it); - if ((rev_supp_start > primary_end && rev_supp_end < largest_supp_start) || (rev_supp_start > largest_supp_end && rev_supp_end < primary_start)) { - // [primary_end] -- [supp_reverse] -- [supp_start] - // Or: [supp_end] -- [supp_reverse] -- [primary_start] - - // Detect CNVs at the primary alignment - SVType primary_type = SVType::UNKNOWN; - if (primary_end - primary_start >= this->input_data->getMinCNVLength()) { - SVCandidate sv_candidate(primary_start+1, primary_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double primary_likelihood = std::get<0>(result); - primary_type = std::get<1>(result); - - // Break if prediction is unknown - if (primary_type == SVType::UNKNOWN) { - continue; - } - } - - // 
Detect CNVs at the largest supplementary alignment - SVType largest_supp_type = SVType::UNKNOWN; - if (largest_supp_end - largest_supp_start >= this->input_data->getMinCNVLength()) { - SVCandidate sv_candidate(largest_supp_start+1, largest_supp_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double largest_supp_likelihood = std::get<0>(result); - largest_supp_type = std::get<1>(result); - - // Break if prediction is unknown - if (largest_supp_type == SVType::UNKNOWN) { - continue; - } - } - - // Predict between the primary and largest supplementary - // alignment - int32_t left_start = std::min(primary_end, largest_supp_end); - int32_t right_end = std::max(primary_start, largest_supp_start); - - // Detect CNVs between the left and reverse supplementary - // alignment - SVType left_type = SVType::UNKNOWN; - // if (rev_supp_start - primary_end >= - // this->input_data->getMinCNVLength()) { - if (rev_supp_start - left_start >= this->input_data->getMinCNVLength()) { - // SVCandidate sv_candidate(primary_end+1, - // rev_supp_start+1, "."); - SVCandidate sv_candidate(left_start+1, rev_supp_start+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double left_likelihood = std::get<0>(result); - left_type = std::get<1>(result); - - // Break if prediction is unknown - if (left_type == SVType::UNKNOWN) { - continue; - } - } - - // Detect CNVs at the reverse alignment - SVType rev_type = SVType::UNKNOWN; - if (rev_supp_end - rev_supp_start >= this->input_data->getMinCNVLength()) { - SVCandidate sv_candidate(rev_supp_start+1, rev_supp_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double inv_likelihood = std::get<0>(result); - rev_type = std::get<1>(result); - if (rev_type == SVType::NEUTRAL) { - rev_type = SVType::INV; - } else if (rev_type == SVType::DUP) { - rev_type = SVType::INV_DUP; - } - - // Break if prediction is unknown 
- if (rev_type == SVType::UNKNOWN) { - continue; - } - } - - // Detect CNVs between the reverse supplementary and the - // supplementary alignment (right side) - SVType right_type = SVType::UNKNOWN; - // if (supp_start - rev_supp_end >= - // this->input_data->getMinCNVLength()) { - if (right_end - rev_supp_end >= this->input_data->getMinCNVLength()) { - SVCandidate sv_candidate(rev_supp_end+1, right_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double right_likelihood = std::get<0>(result); - right_type = std::get<1>(result); - - // Break if prediction is unknown - if (right_type == SVType::UNKNOWN) { - continue; - } - } - - // Resolve the SV type and coordinates - std::string sv_type_str = ""; - int32_t sv_start_pos = left_start; - int32_t sv_end_pos = left_start; - - // Alignment predictions - if (primary_start < left_start) { - if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) { - sv_type_str += getSVTypeString(primary_type) + "+"; - sv_end_pos = primary_end; - } else { - sv_start_pos = primary_start; - } - } else { - if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) { - sv_type_str += getSVTypeString(largest_supp_type) + "+"; - sv_end_pos = largest_supp_end; - } else { - sv_start_pos = largest_supp_start; - } - } - - // Between-alignments predictions - if (left_type != SVType::NEUTRAL && left_type != SVType::UNKNOWN) { - sv_type_str += getSVTypeString(left_type) + "+"; - sv_end_pos = rev_supp_start; - } else { - sv_start_pos = rev_supp_start; - } - - if (rev_type != SVType::NEUTRAL && rev_type != SVType::UNKNOWN) { - sv_type_str += getSVTypeString(rev_type) + "+"; - sv_end_pos = rev_supp_end; - } else { - sv_start_pos = rev_supp_end; - } - - if (right_type != SVType::NEUTRAL && right_type != SVType::UNKNOWN) { - sv_end_pos = right_end; - sv_type_str += getSVTypeString(right_type) + "+"; - } - - // Alignments predictions - if (primary_end > right_end) { - 
if (primary_type != SVType::NEUTRAL && primary_type != SVType::UNKNOWN) { - sv_type_str += getSVTypeString(primary_type) + "+"; - sv_end_pos = primary_end; - } else { - sv_start_pos = primary_start; - } - } else { - if (largest_supp_type != SVType::NEUTRAL && largest_supp_type != SVType::UNKNOWN) { - sv_type_str += getSVTypeString(largest_supp_type) + "+"; - sv_end_pos = largest_supp_end; - } else { - sv_start_pos = largest_supp_start; - } - } + // Run copy number variant predictions on the largest supplementary + // alignment + double largest_supp_log_likelihood = std::numeric_limits::lowest(); + SVType largest_supp_type = SVType::UNKNOWN; + if (largest_supp_length >= min_cnv_length) { + SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + largest_supp_log_likelihood = std::get<0>(result); + // largest_supp_log_likelihood /= (double)largest_supp_length; // Normalize the log likelihood by the length + largest_supp_type = std::get<1>(result); + } - if (sv_type_str != "") { - sv_type_str.pop_back(); // Remove the last '+' + // Run copy number variant predictions on the closest non-overlapping + // supplementary alignment (if not the same as the largest) + double closest_supp_log_likelihood = std::numeric_limits::lowest(); + SVType closest_supp_type = SVType::UNKNOWN; + if (largest_supp_alignment != closest_supp_alignment) { + if (closest_supp_length >= min_cnv_length) { + SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + closest_supp_log_likelihood = std::get<0>(result); + // closest_supp_log_likelihood /= (double)closest_supp_length; // Normalize the log likelihood by the length + closest_supp_type = std::get<1>(result); + } + } - // Add the complex SV - complex_sv_found = true; - 
std::cout << "Complex SV detected of type " << sv_type_str << " at positions " << primary_chr << ":" << left_start << "-" << right_end << std::endl; - sv_count++; + // Loop through all the supplementary alignments and find the highest + // likelihood prediction + double best_supp_log_likelihood = std::numeric_limits::lowest(); + SVType best_supp_type = SVType::UNKNOWN; + std::pair best_supp_candidate; + for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { + int32_t supp_start = std::get<1>(*it); + int32_t supp_end = std::get<2>(*it); - // Add the complex SV - sv_calls.add(primary_chr, sv_start_pos+2, sv_end_pos+1, SVType::COMPLEX, ".", "COMPLEX", "./.", 0.0); - } + // Create the SV candidate as the boundary of the supplementary + // and primary alignment + // int32_t sv_start = std::min(primary_start, std::get<1>(*it)); + // int32_t sv_end = std::max(primary_end, std::get<2>(*it)); + int32_t sv_start = std::min(primary_start, supp_start); + int32_t sv_end = std::max(primary_end, supp_end); + SVCandidate sv_candidate(sv_start+1, sv_end+1, "."); + + // Determine if the strand is the same as the primary alignment + bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment); + + // SVCandidate sv_candidate(std::get<1>(*it)+1, std::get<2>(*it)+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double supp_likelihood = std::get<0>(result); + SVType supp_type = std::get<1>(result); + + // If opposite strand, set the type to INV or INV_DUP + if (!same_strand) { + if (supp_type == SVType::NEUTRAL) { + supp_type = SVType::INV; + } else if (supp_type == SVType::DUP) { + supp_type = SVType::INV_DUP; } } - } - if (complex_sv_found) { - continue; // Continue to the next alignment + if (supp_type != SVType::UNKNOWN && supp_likelihood > best_supp_log_likelihood) { + best_supp_log_likelihood = supp_likelihood; + // best_supp_log_likelihood /= (double)(sv_end - sv_start); // Normalize the log likelihood 
by the length + best_supp_type = supp_type; + best_supp_candidate = std::make_pair(supp_start, supp_end); + } } - // [2] CNV detection based on primary and largest supplementary - // alignment boundaries - // else if (largest_supp_start < primary_start && largest_supp_end < - // primary_start) { - std::string largest_supp_chr = std::get<0>(largest_supp_alignment); - if (largest_supp_start < primary_start && largest_supp_end < primary_start) { - - // Gap with supplementary before primary: - // [supp_start] [supp_end] -- [primary_start] [primary_end] - if (primary_end - largest_supp_start >= min_cnv_length) { - SVCandidate sv_candidate(largest_supp_start+1, primary_end+1, "."); - - // Run copy number prediction for the SV candidate - std::tuple result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate); - double likelihood = std::get<0>(result); - SVType cnv_type = std::get<1>(result); - std::string genotype = std::get<2>(result); - bool snps_found = std::get<3>(result); - std::string aln_type = "GAPOUTER_A"; - if (snps_found) { - aln_type += "_SNPS"; - } else { - aln_type += "_NOSNPS"; + // Add the SV call with the highest likelihood prediction + if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) { + int32_t sv_start = best_supp_candidate.first; + int32_t sv_end = best_supp_candidate.second; + sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_supp_log_likelihood); + sv_count++; + } else { + // Resolve complex SVs + // Simplest case: Largest supplementary is also the closest + if (largest_supp_alignment == closest_supp_alignment) { + // [primary] -- [supp_start] -- [supp_end] + // Determine if opposite strands + bool opposite_strands = std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment); + + // Determine if the supplementary alignment is an inversion + if (opposite_strands) { 
+ if (largest_supp_type == SVType::NEUTRAL) { + largest_supp_type = SVType::INV; + } else if (largest_supp_type == SVType::DUP) { + largest_supp_type = SVType::INV_DUP; + } } - // Add the SV call to the main SV data if not unknown - if (cnv_type != SVType::UNKNOWN) { - sv_calls.add(largest_supp_chr, largest_supp_start+1, primary_end+1, cnv_type, ".", aln_type, genotype, likelihood); - } - sv_count++; - } - - } else if (largest_supp_start > primary_end && largest_supp_end > primary_end) { - // Gap with supplementary after primary: - // [primary_start] [primary_end] -- [supp_start] [supp_end] - - if (largest_supp_end - primary_start >= min_cnv_length) { - SVCandidate sv_candidate(primary_start+1, largest_supp_end+1, "."); - - // Run copy number prediction for the SV candidate - std::tuple result = cnv_caller.runCopyNumberPrediction(largest_supp_chr, sv_candidate); - double likelihood = std::get<0>(result); - SVType cnv_type = std::get<1>(result); - std::string genotype = std::get<2>(result); - bool snps_found = std::get<3>(result); - std::string aln_type = "GAPOUTER_B"; - if (snps_found) { - aln_type += "_SNPS"; + // Get the SV type strings + std::string primary_type_str = getSVTypeString(primary_type); + std::string supp_type_str = getSVTypeString(largest_supp_type); + + // Determine the order of the primary and supplementary + // alignment to resolve the SV + if (std::get<1>(largest_supp_alignment) < primary_start) { + // [supp_start] -- [supp_end] -- [primary] + std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str; + + // Add the complex SV call + sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); + sv_count++; } else { - aln_type += "_NOSNPS"; - } + // [primary] -- [supp_start] -- [supp_end] + std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str; - // Add the SV call to the main SV data if not unknown - if (cnv_type != SVType::UNKNOWN) { - 
sv_calls.add(largest_supp_chr, primary_start+1, largest_supp_end+1, cnv_type, ".", aln_type, genotype, likelihood); + // Add the complex SV call + sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); + sv_count++; } - sv_count++; } } } diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 18ea8056..e9b925dc 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -8,7 +8,7 @@ /// @endcond -int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) +int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) { // Throw an error if the genotype is not valid if (genotype != "./." && genotype != "0/0" && genotype != "0/1" && genotype != "1/1") { @@ -16,9 +16,14 @@ int SVData::add(std::string chr, int64_t start, int64_t end, SVType sv_type, std return -1; } + // Trim the alternate allele if it is too long + if (alt_allele.length() > 100) { + alt_allele = alt_allele.substr(0, 100); + } + // Check if the alternate allele contains ambiguous bases const std::unordered_set ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'}; - for (char c : alt_allele) { + for (char &c : alt_allele) { if (ambiguous_bases.count(c) > 0) { c = 'N'; } diff --git a/tests/test_general.py b/tests/test_general.py index 499805c8..52d3fc84 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 21 + assert len(f.readlines()) == 25 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. 
From 41f83826e7bfabb0089ce4422235be43b92347e3 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 12 Nov 2024 17:59:47 -0500 Subject: [PATCH 012/134] Fix CN length filter error --- include/sv_data.h | 10 ----- include/vcf_writer.h | 14 ++++++- python/plot_distributions.py | 6 ++- src/cnv_caller.cpp | 64 ++++++++++++++++++++---------- src/input_data.cpp | 3 +- src/sv_caller.cpp | 76 ++++++++++++++++++++++++++++++++++-- src/sv_data.cpp | 3 -- src/vcf_writer.cpp | 30 +++++++++++--- tests/test_general.py | 2 +- 9 files changed, 161 insertions(+), 47 deletions(-) diff --git a/include/sv_data.h b/include/sv_data.h index c321bf64..f4ed6e25 100644 --- a/include/sv_data.h +++ b/include/sv_data.h @@ -22,16 +22,6 @@ class SVData { // Map of clipped base support by position (chr, pos) : depth std::map, int> clipped_base_support; - - // SV type to string map for VCF output - // std::map sv_type_map = { - // {0, "DEL"}, - // {1, "DUP"}, - // {2, "INV"}, - // {3, "INS"}, - // {4, "BND"}, - // {5, "DUP"} - // }; public: SVData() {}; diff --git a/include/vcf_writer.h b/include/vcf_writer.h index e395ea37..a6f7707c 100644 --- a/include/vcf_writer.h +++ b/include/vcf_writer.h @@ -1,3 +1,6 @@ +#ifndef VCF_WRITER_H +#define VCF_WRITER_H + /// @cond #include #include @@ -6,9 +9,14 @@ class VcfWriter { public: - // Constructor - VcfWriter(const std::string& filename); + explicit VcfWriter(const std::string& filename); + // VcfWriter(const std::string& filename); ~VcfWriter(); + + // Delete copy constructor and assignment operator + VcfWriter(const VcfWriter&) = delete; + VcfWriter& operator=(const VcfWriter&) = delete; + void writeHeader(const std::vector& headerLines); void writeRecord(const std::string& chrom, int pos, const std::string& id, const std::string& ref, const std::string& alt, @@ -19,3 +27,5 @@ class VcfWriter { private: std::ofstream file_stream; }; + +#endif // VCF_WRITER_H diff --git a/python/plot_distributions.py b/python/plot_distributions.py index 1f684ed8..7db7cb2a 
100644 --- a/python/plot_distributions.py +++ b/python/plot_distributions.py @@ -90,10 +90,12 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Create a dictionary of SV types and their corresponding colors. # From: https://davidmathlogic.com/colorblind/ - sv_colors = {'DEL': '#D81B60', 'DUP': '#1E88E5', 'INV': '#FFC107', 'INS': '#004D40'} + # sv_colors = {'DEL': '#D81B60', 'DUP': '#1E88E5', 'INV': '#FFC107', 'INS': '#004D40'} + # WONG colors + sv_colors = {'DEL': '#E69F00', 'DUP': '#56B4E9', 'INV': '#009E73', 'INS': '#F0E442', 'INVDUP': '#D55E00', 'COMPLEX': '#CC79A7'} # Create a dictionary of SV types and their corresponding labels - sv_labels = {'DEL': 'Deletion', 'DUP': 'Duplication', 'INV': 'Inversion', 'INS': 'Insertion'} + sv_labels = {'DEL': 'Deletion', 'DUP': 'Duplication', 'INV': 'Inversion', 'INS': 'Insertion', 'INVDUP': 'Inverted Duplication', 'COMPLEX': 'Complex'} # Get the list of SV types and sort them in the order of the labels sv_types = sorted(sv_sizes.keys(), key=lambda x: sv_labels[x]) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index e052f45f..7d07f0ea 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -197,22 +197,22 @@ SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map filtered_sv_candidates; - for (const auto& sv_call : sv_candidates) - { - int64_t start_pos = std::get<0>(sv_call.first); - int64_t end_pos = std::get<1>(sv_call.first); - if ((end_pos - start_pos) >= min_length) - { - filtered_sv_candidates[sv_call.first] = sv_call.second; - } - } - sv_candidates = std::move(filtered_sv_candidates); - int sv_count = (int) sv_candidates.size(); - if (sv_count == 0) - { - return snp_data; - } + // std::map filtered_sv_candidates; + // for (const auto& sv_call : sv_candidates) + // { + // int64_t start_pos = std::get<0>(sv_call.first); + // int64_t end_pos = std::get<1>(sv_call.first); + // if ((end_pos - start_pos) >= min_length) + // { + // 
filtered_sv_candidates[sv_call.first] = sv_call.second; + // } + // } + // sv_candidates = std::move(filtered_sv_candidates); + // int sv_count = (int) sv_candidates.size(); + // if (sv_count == 0) + // { + // return snp_data; + // } printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); @@ -271,6 +271,12 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map(candidate); int64_t end_pos = std::get<1>(candidate); + // Skip if not the minimum length for CNV predictions + if ((end_pos - start_pos) < this->input_data->getMinCNVLength()) + { + continue; + } + // Get the depth at the start position. This is used as the FORMAT/DP // value in the VCF file int dp_value = pos_depth_map[start_pos]; @@ -479,8 +485,16 @@ void CNVCaller::loadChromosomeData(std::string chr) // Calculate the mean chromosome coverage double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) { - // Split the chromosome into equal parts for each thread + + // Use a maximum of 8 threads to avoid overloading the system with too many + // parallel processes int num_threads = this->input_data->getThreadCount(); + if (num_threads > 8) + { + num_threads = 8; + } + + // Split the chromosome into equal parts for each thread uint32_t chr_len = this->input_data->getRefGenomeChromosomeLength(chr); std::vector region_chunks = splitRegionIntoChunks(chr, 1, chr_len, num_threads); @@ -780,12 +794,19 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) } } + // Use a maximum of 8 threads to avoid overloading the system with too many + // processes + int num_threads = this->input_data->getThreadCount(); + if (num_threads > 8) + { + num_threads = 8; + } + // Split region into chunks and get the population frequencies in parallel std::cout << "SNP range for chromosome " << chr << ": " << snp_start << "-" << snp_end << std::endl; - int num_threads = this->input_data->getThreadCount(); std::vector region_chunks = 
splitRegionIntoChunks(chr_gnomad, snp_start, snp_end, num_threads); std::unordered_map pos_pfb_map; - std::vector threads; + // std::vector threads; std::vector>> futures; for (const auto& region_chunk : region_chunks) { @@ -800,7 +821,8 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) std::string cmd = \ "bcftools query -r " + region_chunk + " -f '%POS\t%" + AF_key + "\n' -i '" + filter_criteria + "' " + pfb_filepath + " 2>/dev/null"; - std::cout << "Command: " << cmd << std::endl; + // std::cout << "Command: " << cmd << std::endl; + printMessage("Running command: " + cmd); // Open a pipe to read the output of the command FILE *fp = popen(cmd.c_str(), "r"); @@ -812,6 +834,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) // Loop through the BCFTOOLS output and populate the map of population // frequencies + // printMessage("Parsing population frequencies for chromosome " + chr + "..."); std::unordered_map pos_pfb_map; const int line_size = 256; char line[line_size]; @@ -826,6 +849,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) } } pclose(fp); + // printMessage("Finished parsing population frequencies for chromosome " + chr + "..."); return pos_pfb_map; }; diff --git a/src/input_data.cpp b/src/input_data.cpp index 85e4f8d1..8867d0ff 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -197,9 +197,10 @@ void InputData::setRegion(std::string region) // Set the region this->start_end = std::make_pair(start, end); this->region_set = true; + + std::cout << "Region set to " << this->chr << ":" << start << "-" << end << std::endl; } } - std::cout << "Region set to " << this->start_end.first << "-" << this->start_end.second << std::endl; } std::pair InputData::getRegion() diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 12c6d5f4..590c8db4 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -429,7 +429,7 @@ void 
SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map int32_t primary_query_start = std::get<4>(primary_alignment); int32_t primary_query_end = std::get<5>(primary_alignment); std::unordered_map primary_match_map = std::get<6>(primary_alignment); - bool primary_strand = std::get<7>(primary_alignment); + // bool primary_strand = std::get<7>(primary_alignment); if (supp_map.find(qname) == supp_map.end()) { continue; } @@ -443,7 +443,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map int32_t supp_query_start = std::get<4>(*it); int32_t supp_query_end = std::get<5>(*it); std::unordered_map supp_match_map = std::get<6>(*it); - bool supp_strand = std::get<7>(*it); + // bool supp_strand = std::get<7>(*it); // Resolve overlaps between the primary and supplementary query // sequences @@ -511,7 +511,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map int32_t closest_supp_distance = std::numeric_limits::max(); int32_t closest_supp_length = 0; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - const auto& supp_chr = std::get<0>(*it); + // const auto& supp_chr = std::get<0>(*it); int32_t supp_start = std::get<1>(*it); int32_t supp_end = std::get<2>(*it); int32_t supp_length = supp_end - supp_start; @@ -644,6 +644,76 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); sv_count++; } + } else { + // Resolve complex SVs with multiple supplementary alignments + // Determine the order of the primary and supplementary + // alignments + // [primary] -- [closest_supp] -- [largest_supp] + // [closest_supp] -- [primary] -- [largest_supp] + // [largest_supp] -- [closest_supp] -- [primary] + // [largest_supp] -- [primary] -- [closest_supp] + // Only consider case 1 for efficiency: + if (primary_end < 
std::get<1>(closest_supp_alignment) && std::get<2>(closest_supp_alignment) < std::get<1>(largest_supp_alignment)) { + // [primary] -- [closest_supp] -- [largest_supp] + // Determine if the closest supplementary alignment is an + // inversion + if (std::get<7>(closest_supp_alignment) != std::get<7>(primary_alignment)) { + if (closest_supp_type == SVType::NEUTRAL) { + closest_supp_type = SVType::INV; + } else if (closest_supp_type == SVType::DUP) { + closest_supp_type = SVType::INV_DUP; + } + } + + // Run copy number variant predictions on the region between + // the closest supplementary alignment and the largest + // supplementary alignment + SVCandidate sv_candidate(std::get<2>(closest_supp_alignment)+1, std::get<1>(largest_supp_alignment)+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + // double complex_log_likelihood = std::get<0>(result); + SVType complex_type = std::get<1>(result); + + // if (std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment)) { + // if (largest_supp_type == SVType::NEUTRAL) { + // largest_supp_type = SVType::INV; + // } else if (largest_supp_type == SVType::DUP) { + // largest_supp_type = SVType::INV_DUP; + // } + // } + + std::string primary_type_str = getSVTypeString(primary_type); + std::string closest_supp_type_str = getSVTypeString(closest_supp_type); + // std::string largest_supp_type_str = getSVTypeString(largest_supp_type); + // std::string complex_sv_type_str = primary_type_str + "+" + closest_supp_type_str; + + + // Combine the types if equal and not unknown/neutral + std::string complex_sv_type_str = ""; + if (primary_type != SVType::UNKNOWN && primary_type != SVType::NEUTRAL) { + complex_sv_type_str += primary_type_str; + } + if (closest_supp_type != primary_type && closest_supp_type != SVType::UNKNOWN && closest_supp_type != SVType::NEUTRAL) { + if (complex_sv_type_str != "") { + complex_sv_type_str += "+"; + } + complex_sv_type_str += closest_supp_type_str; + } + 
if (complex_type != closest_supp_type && complex_type != primary_type && complex_type != SVType::UNKNOWN && complex_type != SVType::NEUTRAL) { + if (complex_sv_type_str != "") { + complex_sv_type_str += "+"; + } + complex_sv_type_str += getSVTypeString(complex_type); + } + + // Add the complex SV call if not empty + if (complex_sv_type_str != "") { + std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl; + sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); + sv_count++; + } + } + + } } } diff --git a/src/sv_data.cpp b/src/sv_data.cpp index e9b925dc..600ce14c 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -139,9 +139,6 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) std::string output_vcf = output_dir + "/output.vcf"; std::cout << "Writing VCF file to " << output_vcf << std::endl; VcfWriter vcf_writer(output_vcf); - std::cout << "Writing VCF file to " << output_vcf << std::endl; - - // Set the sample name std::string sample_name = "SAMPLE"; std::cout << "Getting reference genome filepath..." 
<< std::endl; diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp index 8c93a36f..7cf2108d 100644 --- a/src/vcf_writer.cpp +++ b/src/vcf_writer.cpp @@ -3,16 +3,36 @@ /// @cond #include #include +#include /// @endcond VcfWriter::VcfWriter(const std::string &filename) { - // Open the VCF file, overwrite if it already exists - this->file_stream.open(filename, std::ios::out); - if (!this->file_stream.is_open()) { - std::cerr << "Error: Unable to open " << filename << std::endl; - exit(1); + try { + this->file_stream.exceptions(std::ofstream::failbit | std::ofstream::badbit); // Enable exceptions + this->file_stream.open(filename, std::ios::out | std::ios::trunc); // Open the file for writing + } catch (const std::ofstream::failure &e) { + std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl; + exit(EXIT_FAILURE); } + // // Open the VCF file, overwrite if it already exists + // try { + // this->file_stream.open(filename, std::ios::out); + // if (!this->file_stream.is_open()) { + // std::cerr << "Error: Unable to open " << filename << std::endl; + // exit(1); + // } + // } catch (std::exception &e) { + // std::cerr << "Error: " << e.what() << std::endl; + // exit(1); + // } + + // // this->file_stream.open(filename, std::ios::out); + // if (!this->file_stream.is_open()) { + // std::cerr << "Error: Unable to open " << filename << std::endl; + // exit(1); + // } + // std::cout << "Opened " << filename << " for writing" << std::endl; } VcfWriter::~VcfWriter() diff --git a/tests/test_general.py b/tests/test_general.py index 52d3fc84..92776b11 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 25 + assert len(f.readlines()) == 41 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. 
From a19c21ae989789173be2b14a3438f1854713f5ce Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 13 Nov 2024 16:10:10 -0500 Subject: [PATCH 013/134] Venn diagram plots and vcf writer error --- include/vcf_writer.h | 2 +- python/plot_venn.py | 47 ++++++++++++++++++++++++++++++++++++++++++++ python/sv_merger.py | 2 +- src/sv_caller.cpp | 19 +++++++++++++----- src/vcf_writer.cpp | 16 ++++++++++----- 5 files changed, 74 insertions(+), 12 deletions(-) create mode 100644 python/plot_venn.py diff --git a/include/vcf_writer.h b/include/vcf_writer.h index a6f7707c..a9bd5931 100644 --- a/include/vcf_writer.h +++ b/include/vcf_writer.h @@ -9,7 +9,7 @@ class VcfWriter { public: - explicit VcfWriter(const std::string& filename); + explicit VcfWriter(std::string filename); // VcfWriter(const std::string& filename); ~VcfWriter(); diff --git a/python/plot_venn.py b/python/plot_venn.py new file mode 100644 index 00000000..8e5e73bd --- /dev/null +++ b/python/plot_venn.py @@ -0,0 +1,47 @@ +# from matplotlib_venn import venn3 +from matplotlib_venn import venn2 +import argparse + +import matplotlib.pyplot as plt + +def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB): + plt.figure(figsize=(8, 8)) + + print('AB:', AB) + print('Ab:', Ab) + print('aB:', aB) + + # Create scaled subsets for the venn diagram + scaling_factor = 1000 + scaled_AB = AB / scaling_factor + scaled_Ab = Ab / scaling_factor + scaled_aB = aB / scaling_factor + + # Create a venn diagram scaled to the number of elements in each set + # venn = venn2(subsets=(AB, Ab, aB), set_labels=(title_Ab, title_aB)) + venn = venn2(subsets=(scaled_Ab, scaled_aB, scaled_AB), set_labels=(title_Ab, title_aB)) + + # Update the labels to reflect the actual counts + venn.get_label_by_id('10').set_text(str(Ab)) + venn.get_label_by_id('01').set_text(str(aB)) + venn.get_label_by_id('11').set_text(str(AB)) + + # Update the title + plt.title("ContextSV and " + title_aB.capitalize() + " Venn Diagram (All SV types)") + 
plt.savefig(output) + plt.close() + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Generate a Venn diagram.') + parser.add_argument('-a', type=int, required=True, help='Shared count') + parser.add_argument('-b', type=int, required=True, help='False positive count') + parser.add_argument('-c', type=int, required=True, help='False negative count') + parser.add_argument('-o', '--output', type=str, required=True, help='Output file path') + parser.add_argument('-a_title', type=str, required=True, help='Title for set A') + parser.add_argument('-b_title', type=str, required=True, help='Title for set B') + parser.add_argument('-c_title', type=str, required=True, help='Title for set C') + + args = parser.parse_args() + + plot_venn(args.a, args.b, args.c, args.output, args.a_title, args.b_title, args.c_title) + print(f'Venn diagram saved to {args.output}') diff --git a/python/sv_merger.py b/python/sv_merger.py index d2c59977..fa2a3a30 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -152,7 +152,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # Merge SVs with the same label unique_labels = np.unique(cluster_labels) - logging.info("Unique labels: %s", unique_labels) + # logging.info("Unique labels: %s", unique_labels) for label in unique_labels: diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 590c8db4..d591792b 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -332,12 +332,17 @@ SVData SVCaller::run() } else { chromosomes = this->input_data->getRefGenomeChromosomes(); } - int chr_count = chromosomes.size(); + + // [TEST] Only process the last N chromosomes + // last_n = 10; + // chromosomes = std::vector(chromosomes.end()-last_n, chromosomes.end()); + // chromosomes = std::vector(chromosomes.end()-3, chromosomes.end()); // Loop through each region and detect SVs in chunks + int chr_count = chromosomes.size(); + int current_chr = 0; std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." 
<< std::endl; int chunk_count = 100; // Number of chunks to split the chromosome into - int region_count = 0; SVData sv_calls; int min_cnv_length = this->input_data->getMinCNVLength(); for (const auto& chr : chromosomes) { @@ -377,6 +382,8 @@ SVData SVCaller::run() // Process each chunk one at a time std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; + int region_count = region_chunks.size(); + int current_region = 0; for (const auto& sub_region : region_chunks) { // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl; RegionData region_data = this->detectSVsFromRegion(sub_region); @@ -385,7 +392,7 @@ SVData SVCaller::run() SuppMap& supp_map = std::get<2>(region_data); int region_sv_count = sv_calls_region.totalCalls(); if (region_sv_count > 0) { - std::cout << "Detected " << region_sv_count << " SVs from " << sub_region << "..." << std::endl; + std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl; } // Run copy number variant predictions on the SVs detected from the @@ -402,11 +409,13 @@ SVData SVCaller::run() std::cout << "Detecting copy number variants from split reads..." << std::endl; this->detectSVsFromSplitReads(sv_calls_region, primary_map, supp_map, cnv_caller); sv_calls.concatenate(sv_calls_region); // Add the calls to the main set + std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl; } - region_count++; - std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)..." << std::endl; + std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." << std::endl; + // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl; } + std::cout << "SV calling completed." 
<< std::endl; diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp index 7cf2108d..0d5f60e7 100644 --- a/src/vcf_writer.cpp +++ b/src/vcf_writer.cpp @@ -6,15 +6,21 @@ #include /// @endcond -VcfWriter::VcfWriter(const std::string &filename) +VcfWriter::VcfWriter(std::string filename) { try { + std::cout << "Opening file..." << std::endl; this->file_stream.exceptions(std::ofstream::failbit | std::ofstream::badbit); // Enable exceptions this->file_stream.open(filename, std::ios::out | std::ios::trunc); // Open the file for writing - } catch (const std::ofstream::failure &e) { - std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl; - exit(EXIT_FAILURE); - } + std::cout << "File opened." << std::endl; + } catch (std::exception &e) { + std::cerr << "Error: " << e.what() << std::endl; + exit(EXIT_FAILURE); + } + //} catch (const std::ofstream::failure &e) { + // std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl; + // exit(EXIT_FAILURE); + //} // // Open the VCF file, overwrite if it already exists // try { // this->file_stream.open(filename, std::ios::out); From 5dc01cb569ed0ba8844783891157fb876c811c06 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 14 Nov 2024 13:19:30 -0500 Subject: [PATCH 014/134] Fix vcf writer error --- include/vcf_writer.h | 31 ---------------- python/plot_venn.py | 2 +- src/sv_data.cpp | 42 +++++++++++++++++++-- src/vcf_writer.cpp | 87 -------------------------------------------- 4 files changed, 39 insertions(+), 123 deletions(-) delete mode 100644 include/vcf_writer.h delete mode 100644 src/vcf_writer.cpp diff --git a/include/vcf_writer.h b/include/vcf_writer.h deleted file mode 100644 index a9bd5931..00000000 --- a/include/vcf_writer.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef VCF_WRITER_H -#define VCF_WRITER_H - -/// @cond -#include -#include -#include -/// @endcond - -class VcfWriter { -public: - explicit VcfWriter(std::string filename); - // VcfWriter(const std::string& filename); - 
~VcfWriter(); - - // Delete copy constructor and assignment operator - VcfWriter(const VcfWriter&) = delete; - VcfWriter& operator=(const VcfWriter&) = delete; - - void writeHeader(const std::vector& headerLines); - void writeRecord(const std::string& chrom, int pos, const std::string& id, - const std::string& ref, const std::string& alt, - const std::string& qual, const std::string& filter, - const std::string& info, const std::string& format, - const std::vector& samples); - -private: - std::ofstream file_stream; -}; - -#endif // VCF_WRITER_H diff --git a/python/plot_venn.py b/python/plot_venn.py index 8e5e73bd..eb7e8e78 100644 --- a/python/plot_venn.py +++ b/python/plot_venn.py @@ -27,7 +27,7 @@ def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB): venn.get_label_by_id('11').set_text(str(AB)) # Update the title - plt.title("ContextSV and " + title_aB.capitalize() + " Venn Diagram (All SV types)") + plt.title("contextsv and " + title_aB + " venn diagram (all SV types)") plt.savefig(output) plt.close() diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 600ce14c..085e737e 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -1,5 +1,4 @@ #include "sv_data.h" -#include "vcf_writer.h" /// @cond #include @@ -138,7 +137,10 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) std::cout << "Creating VCF writer..." << std::endl; std::string output_vcf = output_dir + "/output.vcf"; std::cout << "Writing VCF file to " << output_vcf << std::endl; - VcfWriter vcf_writer(output_vcf); + std::ofstream vcf_stream(output_vcf); + if (!vcf_stream.is_open()) { + throw std::runtime_error("Failed to open VCF file for writing."); + } std::string sample_name = "SAMPLE"; std::cout << "Getting reference genome filepath..." << std::endl; @@ -178,7 +180,35 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) }; std::cout << "Writing VCF header..." 
<< std::endl; - vcf_writer.writeHeader(header_lines); + + // Add the file format + std::string file_format = "##fileformat=VCFv4.2"; + vcf_stream << file_format << std::endl; + + // Add date and time + time_t rawtime; + struct tm * timeinfo; + char buffer[80]; + time (&rawtime); + timeinfo = localtime(&rawtime); + strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo); + vcf_stream << "##fileDate=" << buffer << std::endl; + + // Add source + std::string source = "##source=ContexSV"; + vcf_stream << source << std::endl; + + // Loop over the header metadata lines + for (const auto &line : header_lines) { + vcf_stream << line << std::endl; + } + + // Add the header line + std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE"; + vcf_stream << header_line << std::endl; + + // Flush the stream to ensure that the header is written + //this->file_stream.flush(); std::cout << "Saving SV calls to " << output_vcf << std::endl; std::string sv_method = "CONTEXTSVv0.1"; @@ -280,7 +310,11 @@ void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) std::vector samples = {sample_str}; // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES) - vcf_writer.writeRecord(chr, pos, ".", ref_allele, alt_allele, ".", "PASS", info_str, format_str, samples); + vcf_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." 
<< "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl; + if (total_count % 1000 == 0) + { + std::cout << "Wrote SV at " << chr << ": " << pos << ", total=" << total_count << std::endl; + } } } diff --git a/src/vcf_writer.cpp b/src/vcf_writer.cpp deleted file mode 100644 index 0d5f60e7..00000000 --- a/src/vcf_writer.cpp +++ /dev/null @@ -1,87 +0,0 @@ -#include "vcf_writer.h" - -/// @cond -#include -#include -#include -/// @endcond - -VcfWriter::VcfWriter(std::string filename) -{ - try { - std::cout << "Opening file..." << std::endl; - this->file_stream.exceptions(std::ofstream::failbit | std::ofstream::badbit); // Enable exceptions - this->file_stream.open(filename, std::ios::out | std::ios::trunc); // Open the file for writing - std::cout << "File opened." << std::endl; - } catch (std::exception &e) { - std::cerr << "Error: " << e.what() << std::endl; - exit(EXIT_FAILURE); - } - //} catch (const std::ofstream::failure &e) { - // std::cerr << "Error opening file " << filename << ": " << e.what() << std::endl; - // exit(EXIT_FAILURE); - //} - // // Open the VCF file, overwrite if it already exists - // try { - // this->file_stream.open(filename, std::ios::out); - // if (!this->file_stream.is_open()) { - // std::cerr << "Error: Unable to open " << filename << std::endl; - // exit(1); - // } - // } catch (std::exception &e) { - // std::cerr << "Error: " << e.what() << std::endl; - // exit(1); - // } - - // // this->file_stream.open(filename, std::ios::out); - // if (!this->file_stream.is_open()) { - // std::cerr << "Error: Unable to open " << filename << std::endl; - // exit(1); - // } - // std::cout << "Opened " << filename << " for writing" << std::endl; -} - -VcfWriter::~VcfWriter() -{ - if (this->file_stream.is_open()) { - this->file_stream.close(); - } -} - -void VcfWriter::writeHeader(const std::vector &headerLines) -{ - // Add the file format - std::string file_format = "##fileformat=VCFv4.2"; - this->file_stream << 
file_format << std::endl; - - // Add date and time - time_t rawtime; - struct tm * timeinfo; - char buffer[80]; - time (&rawtime); - timeinfo = localtime(&rawtime); - strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo); - file_stream << "##fileDate=" << buffer << std::endl; - - // Add source - std::string source = "##source=ContexSV"; - this->file_stream << source << std::endl; - - // Loop over the header metadata lines - for (auto &line : headerLines) { - this->file_stream << line << std::endl; - } - - // Add the header line - std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE"; - this->file_stream << header_line << std::endl; - - // Flush the stream to ensure that the header is written - this->file_stream.flush(); -} - -void VcfWriter::writeRecord(const std::string &chrom, int pos, const std::string &id, const std::string &ref, const std::string &alt, const std::string &qual, const std::string &filter, const std::string &info, const std::string &format, const std::vector &samples) -{ - // Write a record to the VCF file - this->file_stream << chrom << "\t" << pos << "\t" << id << "\t" << ref << "\t" << alt << "\t" << qual << "\t" << filter << "\t" << info << "\t" << format << "\t" << samples[0] << std::endl; -} From b9b489044e033b4fb12ac390bfcb98e5b2044c14 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 15 Nov 2024 11:22:12 -0500 Subject: [PATCH 015/134] Fix overlap errors --- include/cnv_caller.h | 2 +- python/sv_merger.py | 8 +- src/cnv_caller.cpp | 31 +--- src/khmm.cpp | 13 +- src/sv_caller.cpp | 384 ++++++++++++++++++++++++++++++++---------- tests/test_general.py | 4 +- 6 files changed, 315 insertions(+), 127 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index af211fc8..80c69f89 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -109,7 +109,7 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and 
whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, SVCandidate& sv_candidate); + std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); diff --git a/python/sv_merger.py b/python/sv_merger.py index fa2a3a30..172cba6e 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -89,7 +89,7 @@ def update_support(record, cluster_size): return record -def weighted_score(read_support, hmm_score, sv_len, weight_hmm, weight_sv_len): +def weighted_score(read_support, hmm_score, weight_hmm): """ Calculate a weighted score based on read support and HMM score. """ @@ -209,13 +209,11 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # support. # hmm_weight = 0.7 if sv_type == 'DEL' else 0.3 hmm_weight = 0.4 - sv_len_weight = 0.4 max_score_idx = 0 # Default to the first SV in the cluster - max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], cluster_sv_lengths[max_score_idx], hmm_weight, sv_len_weight) + max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], hmm_weight) for k, hmm_loglh in enumerate(cluster_hmm_scores): - sv_len = cluster_sv_lengths[k] / 1000 # Normalize SV length to kilobases read_support = cluster_depth_scores[k] - score = weighted_score(read_support, hmm_loglh, sv_len, hmm_weight, sv_len_weight) + score = weighted_score(read_support, hmm_loglh, hmm_weight) if score > max_score: max_score = score max_score_idx = k diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 7d07f0ea..ceaeb5ec 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -115,9 +115,8 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star return std::make_pair(snp_data, snps_found); } -std::tuple 
CNVCaller::runCopyNumberPrediction(std::string chr, SVCandidate& candidate) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate) { - // std::cout << "Running copy number prediction for SV pair " << chr << ":" << std::get<0>(sv_one) << "-" << std::get<1>(sv_one) << " and " << std::get<0>(sv_two) << "-" << std::get<1>(sv_two) << "..." << std::endl; // Get the start and end positions of the SV call int64_t start_pos = std::get<0>(candidate); int64_t end_pos = std::get<1>(candidate); @@ -127,6 +126,7 @@ std::tuple CNVCaller::runCopyNumberPrediction int64_t sv_length = (end_pos - start_pos) / 2.0; int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length); int64_t snp_end_pos = end_pos + sv_length; + // printMessage("Running copy number prediction for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " with SNP region " + chr + ":" + std::to_string(snp_start_pos) + "-" + std::to_string(snp_end_pos) + "..."); // Query the SNP region for the SV candidate std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov); @@ -153,12 +153,15 @@ std::tuple CNVCaller::runCopyNumberPrediction double pct_threshold = 0.75; int max_state = 0; int max_count = 0; - for (int i = 0; i < 6; i++) + + // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6 + for (int i = 0; i < 6; i += 2) { - int state_count = std::count(sv_states.begin(), sv_states.end(), i+1); + // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6 + int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2); if (state_count > max_count) { - max_state = i+1; + max_state = i+1; // Set the state to the first state in the pair (sequence remains intact) max_count = state_count; } } @@ -196,24 +199,6 @@ SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, 
std::mapmean_chr_cov; SNPData snp_data; - // Filter the SV candidates by length - // std::map filtered_sv_candidates; - // for (const auto& sv_call : sv_candidates) - // { - // int64_t start_pos = std::get<0>(sv_call.first); - // int64_t end_pos = std::get<1>(sv_call.first); - // if ((end_pos - start_pos) >= min_length) - // { - // filtered_sv_candidates[sv_call.first] = sv_call.second; - // } - // } - // sv_candidates = std::move(filtered_sv_candidates); - // int sv_count = (int) sv_candidates.size(); - // if (sv_count == 0) - // { - // return snp_data; - // } - printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); diff --git a/src/khmm.cpp b/src/khmm.cpp index 375325a1..3b3bffa6 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -339,12 +339,12 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect // of the state sequence ending in state i at time T, along with observing // the sequence O1, O2. q[T] = 1; - double min_prob = -VITHUGE; + double final_lh = -VITHUGE; for (i = 1; i <= hmm.N; i++) { - if (delta[T][i] > min_prob) + if (delta[T][i] > final_lh) { - min_prob = delta[T][i]; + final_lh = delta[T][i]; q[T] = i; } } @@ -378,12 +378,7 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect free_dmatrix(biot, 1, hmm.N, 1, T); free_dmatrix(A1, 1, hmm.N, 1, hmm.N); - // Normalize the log likelihood by the sample size - double min_prob_normalized = min_prob / (double)T; - - // Return the state sequence and its likelihood - // return std::make_pair(q, min_prob); - return std::make_pair(q, min_prob_normalized); + return std::make_pair(q, final_lh); } CHMM ReadCHMM(const char *filename) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index d591792b..3cc41aed 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -170,6 +170,8 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr int32_t query_start = 0; // First alignment position in the query int32_t query_end = 0; // Last alignment position in the query 
bool first_op = false; // First alignment operation for the query + double default_lh = std::numeric_limits::lowest(); // Default likelihood + // double default_lh = std::numeric_limits::quiet_NaN(); // Default likelihood for (int i = 0; i < cigar_len; i++) { int op = bam_cigar_op(cigar[i]); // CIGAR operation @@ -230,9 +232,9 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr ref_pos = pos+1; ref_end = ref_pos + op_len -1; if (is_duplication) { - sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", 0.0); + sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", default_lh); } else { - sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", 0.0); + sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh); } } @@ -244,7 +246,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", 0.0); // Add the deletion + sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", default_lh); // Add to SV calls (1-based) } // Check if the CIGAR operation is a clipped base @@ -443,52 +445,49 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map continue; } - // Resolve overlaps between the primary and supplementary query - // sequences - for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - std::string supp_chr = std::get<0>(*it); - // int32_t supp_start = std::get<1>(*it); - // int32_t supp_end = std::get<2>(*it); - int32_t supp_query_start = std::get<4>(*it); - int32_t supp_query_end = std::get<5>(*it); - std::unordered_map supp_match_map = std::get<6>(*it); - // bool supp_strand = std::get<7>(*it); - - // Resolve overlaps between the primary and supplementary query - // sequences - int32_t overlap_start = std::max(primary_query_start, supp_query_start); - int32_t 
overlap_end = std::min(primary_query_end, supp_query_end); - int32_t overlap_length = overlap_end - overlap_start; - if (overlap_length > 0) { - - // Calculate the mismatch rate for each alignment at the overlap - double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); - double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); - // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; - // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; - - // Trim the overlap from the alignment with the higher mismatch - // rate - if (primary_mismatch_rate > supp_mismatch_rate) { - if (overlap_start == primary_query_start) { - primary_start += overlap_length; - } else if (overlap_end == primary_query_end) { - primary_end -= overlap_length; - } - - } else { - if (overlap_start == supp_query_start) { - // supp_start += overlap_length; - // Update the value in the supp map - std::get<1>(*it) += overlap_length; - } else if (overlap_end == supp_query_end) { - // supp_end -= overlap_length; - // Update the value in the supp map - std::get<2>(*it) -= overlap_length; - } - } - } - } + // // Resolve overlaps between the primary and supplementary query + // // sequences + // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { + // std::string supp_chr = std::get<0>(*it); + // // int32_t supp_start = std::get<1>(*it); + // // int32_t supp_end = std::get<2>(*it); + // int32_t supp_query_start = std::get<4>(*it); + // int32_t supp_query_end = std::get<5>(*it); + // std::unordered_map supp_match_map = std::get<6>(*it); + // // bool supp_strand = std::get<7>(*it); + + // // Resolve overlaps between the primary and supplementary query + // // sequences + // if (primary_query_start < supp_query_end && primary_query_end > supp_query_start || supp_query_start < primary_query_end && supp_query_end > primary_query_start) { 
+ + // // Calculate the mismatch rate for each alignment at the overlap + // double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); + // double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); + // // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; + // // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; + + // // Trim the overlap from the alignment with the higher mismatch + // // rate + // if (primary_mismatch_rate > supp_mismatch_rate) { + // if (overlap_start == primary_query_start) { + // primary_start += overlap_length; + // } else if (overlap_end == primary_query_end) { + // primary_end -= overlap_length; + // } + + // } else { + // if (overlap_start == supp_query_start) { + // // supp_start += overlap_length; + // // Update the value in the supp map + // std::get<1>(*it) += overlap_length; + // } else if (overlap_end == supp_query_end) { + // // supp_end -= overlap_length; + // // Update the value in the supp map + // std::get<2>(*it) -= overlap_length; + // } + // } + // } + // } // Remove supplementary alignments that are not on the same chromosome // as the primary alignment @@ -502,11 +501,12 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Run copy number variant predictions on the primary alignment SVType primary_type = SVType::UNKNOWN; - double primary_log_likelihood = std::numeric_limits::lowest(); + double primary_lh = std::numeric_limits::lowest(); + int32_t primary_lh_t = 0; if (primary_end - primary_start >= min_cnv_length) { SVCandidate sv_candidate(primary_start+1, primary_end+1, "."); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - primary_log_likelihood = std::get<0>(result); + primary_lh = std::get<0>(result); // primary_log_likelihood /= (double)(primary_end - primary_start); // Normalize the log 
likelihood by the length primary_type = std::get<1>(result); } @@ -523,7 +523,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // const auto& supp_chr = std::get<0>(*it); int32_t supp_start = std::get<1>(*it); int32_t supp_end = std::get<2>(*it); - int32_t supp_length = supp_end - supp_start; + int32_t supp_length = supp_end - supp_start + 1; int32_t supp_distance = std::numeric_limits::max(); if (supp_start > primary_end) { supp_distance = supp_start - primary_end; @@ -543,80 +543,288 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Run copy number variant predictions on the largest supplementary // alignment - double largest_supp_log_likelihood = std::numeric_limits::lowest(); + double largest_supp_lh = std::numeric_limits::lowest(); SVType largest_supp_type = SVType::UNKNOWN; + int largest_supp_lh_t = 0; if (largest_supp_length >= min_cnv_length) { SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, "."); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - largest_supp_log_likelihood = std::get<0>(result); + largest_supp_lh = std::get<0>(result); // largest_supp_log_likelihood /= (double)largest_supp_length; // Normalize the log likelihood by the length largest_supp_type = std::get<1>(result); } // Run copy number variant predictions on the closest non-overlapping // supplementary alignment (if not the same as the largest) - double closest_supp_log_likelihood = std::numeric_limits::lowest(); + double closest_supp_lh = std::numeric_limits::lowest(); SVType closest_supp_type = SVType::UNKNOWN; + int closest_supp_lh_t = 0; if (largest_supp_alignment != closest_supp_alignment) { if (closest_supp_length >= min_cnv_length) { SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, "."); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, 
sv_candidate); - closest_supp_log_likelihood = std::get<0>(result); + closest_supp_lh = std::get<0>(result); // closest_supp_log_likelihood /= (double)closest_supp_length; // Normalize the log likelihood by the length closest_supp_type = std::get<1>(result); + int32_t closest_supp_start = std::get<1>(closest_supp_alignment); + int32_t closest_supp_end = std::get<2>(closest_supp_alignment); } } + // Define constants representing read scenarios used for SV detection + const int NOCALL = -1; // Default + const int PRIM_SUPP_BD = 0; // Primary and supplementary boundary + const int PRIM_SUPP_GAP = 1; // Primary and supplementary gap + const int SUPP_PRIM_BD = 2; // Supplementary and primary boundary + const int SUPP_PRIM_GAP = 3; // Supplementary and primary gap + // Loop through all the supplementary alignments and find the highest // likelihood prediction - double best_supp_log_likelihood = std::numeric_limits::lowest(); + double best_split_aln_lh = std::numeric_limits::lowest(); + double best_split_aln_lh_norm = std::numeric_limits::lowest(); + // int best_split_aln_length = 0; SVType best_supp_type = SVType::UNKNOWN; std::pair best_supp_candidate; + AlignmentData& best_split_alignment = supp_map[qname][0]; + int best_scenario = NOCALL; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { int32_t supp_start = std::get<1>(*it); int32_t supp_end = std::get<2>(*it); + bool primary_before_supp = primary_start < supp_start; + + // Create the SV candidate as the boundary of the primary and + // supplementary alignments + SVCandidate split_boundary; + SVCandidate split_gap; + bool invalid_gap = false; + if (primary_before_supp) { + split_boundary = SVCandidate(primary_start+1, supp_end+1, "."); + + // Check for an invalid gap (overlap) + if (primary_end >= supp_start) { + invalid_gap = true; + } else { + split_gap = SVCandidate(primary_end+1, supp_start+1, "."); + } + // split_gap = SVCandidate(primary_end+1, supp_start+1, "."); - // Create the SV 
candidate as the boundary of the supplementary - // and primary alignment - // int32_t sv_start = std::min(primary_start, std::get<1>(*it)); - // int32_t sv_end = std::max(primary_end, std::get<2>(*it)); - int32_t sv_start = std::min(primary_start, supp_start); - int32_t sv_end = std::max(primary_end, supp_end); - SVCandidate sv_candidate(sv_start+1, sv_end+1, "."); + } else { + split_boundary = SVCandidate(supp_start+1, primary_end+1, "."); - // Determine if the strand is the same as the primary alignment - bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment); + // Check for an invalid gap (overlap) + if (supp_end >= primary_start) { + invalid_gap = true; + } else { + split_gap = SVCandidate(supp_end+1, primary_start+1, "."); + } + } - // SVCandidate sv_candidate(std::get<1>(*it)+1, std::get<2>(*it)+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double supp_likelihood = std::get<0>(result); - SVType supp_type = std::get<1>(result); + // Create a vector of the two SV candidates, don't add the gap if + // it is an overlap, or if either SV is less than the minimum CNV + // length + std::vector sv_candidates; + if (!invalid_gap && std::get<1>(split_gap) - std::get<0>(split_gap) >= min_cnv_length) { + sv_candidates.push_back(split_gap); + } + if (std::get<1>(split_boundary) - std::get<0>(split_boundary) >= min_cnv_length) { + sv_candidates.push_back(split_boundary); + } + + // Continue if no SV candidates + if (sv_candidates.size() == 0) { + continue; + } + + // Run copy number variant predictions on both, and keep the + // prediction with the highest normalized log likelihood + double chosen_lh_norm = std::numeric_limits::lowest(); + SVType chosen_type = SVType::UNKNOWN; + std::pair chosen_candidate; + std::string chosen_candidate_str = "BOUNDARY"; + int split_scenario = NOCALL; + for (const auto& sv_candidate : sv_candidates) { + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, 
sv_candidate); + double current_lh = std::get<0>(result); + SVType current_type = std::get<1>(result); + + // Normalize the log likelihood by the state sequence length + double current_lh_norm = current_lh;// / (double)T; + // if (sv_candidate == split_boundary) { + // std::cout << "Boundary candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl; + // } else if (sv_candidate == split_gap) { + // std::cout << "Gap candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl; + // } + + // Update the current SV candidate if the likelihood is higher + if (current_type != SVType::UNKNOWN && current_lh_norm > chosen_lh_norm) { + chosen_lh_norm = current_lh_norm; + chosen_type = current_type; + chosen_candidate = std::make_pair(std::get<0>(sv_candidate), std::get<1>(sv_candidate)); + + // Update the candidate string + if (sv_candidate == split_boundary) { + chosen_candidate_str = "BOUNDARY"; + if (primary_before_supp) { + split_scenario = PRIM_SUPP_BD; + } else { + split_scenario = SUPP_PRIM_BD; + } + } else if (sv_candidate == split_gap) { + chosen_candidate_str = "GAP"; + if (primary_before_supp) { + split_scenario = PRIM_SUPP_GAP; + } else { + split_scenario = SUPP_PRIM_GAP; + } + } + // std::cout << "Updated candidate: " << chosen_candidate_str << " with likelihood: " << current_lh_norm << std::endl; + } else if (current_type == SVType::UNKNOWN) { + // std::cerr << "ERROR: Unknown SV type" << std::endl; + // exit(1); + } + } + + // std::cout << "Chosen candidate: " << chosen_candidate_str << std::endl; + + // Continue if unknown SV type + if (chosen_type == SVType::UNKNOWN) { + std::cerr << "ERROR: Unknown SV type" << std::endl; + continue; + } // If opposite strand, set the type to INV or INV_DUP + bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment); if (!same_strand) { - if (supp_type == SVType::NEUTRAL) { - supp_type = SVType::INV; - } else if (supp_type == SVType::DUP) { - supp_type = 
SVType::INV_DUP; + if (chosen_type == SVType::NEUTRAL) { + chosen_type = SVType::INV; + } else if (chosen_type == SVType::DUP) { + chosen_type = SVType::INV_DUP; } } - if (supp_type != SVType::UNKNOWN && supp_likelihood > best_supp_log_likelihood) { - best_supp_log_likelihood = supp_likelihood; - // best_supp_log_likelihood /= (double)(sv_end - sv_start); // Normalize the log likelihood by the length - best_supp_type = supp_type; - best_supp_candidate = std::make_pair(supp_start, supp_end); + if (chosen_lh_norm > best_split_aln_lh_norm) { + // best_supp_log_likelihood = supp_likelihood; + // best_supp_log_likelihood /= (double)(sv_end - sv_start); // + // Normalize the log likelihood by the length + // best_split_aln_lh = split_aln_lh; + best_split_aln_lh_norm = chosen_lh_norm; + // best_split_aln_length = split_aln_length; + best_supp_type = chosen_type; + best_supp_candidate = chosen_candidate; + best_split_alignment = *it; + best_scenario = split_scenario; + } else if (chosen_lh_norm <= best_split_aln_lh_norm) { + // std::cerr << "ERROR: split_aln_lh_norm is less than or equal to best_split_aln_lh_norm" << std::endl; + // exit(1); } } + // If the likelihood is equal to the lowest value, print an error + if (best_split_aln_lh_norm == std::numeric_limits::lowest()) { + // std::cerr << "ERROR: best_supp_log_likelihood is the lowest value" << std::endl; + // exit(1); + } + + // Print the likelihoods + // std::cout << "Primary log likelihood: " << primary_lh << std::endl; + // std::cout << "Largest supplementary log likelihood: " << largest_supp_lh << std::endl; + // std::cout << "Closest supplementary log likelihood: " << closest_supp_lh << std::endl; + // // std::cout << "Best split alignment log likelihood: " << best_split_aln_lh << std::endl; + // std::cout << "Best split alignment log likelihood (normalized): " << best_split_aln_lh_norm << std::endl; + // std::cout << "Best scenario: " << best_scenario << std::endl; + // Add the SV call with the highest 
likelihood prediction - if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) { + // + // Determine the normalized log likelihood for the combined alignments + // by summing and normalizing the log likelihoods by the length + double complex_lh = 0.0; + double complex_lh_norm = 0.0; + if (largest_supp_alignment == closest_supp_alignment) { + int32_t complex_t = primary_lh_t + largest_supp_lh_t; + complex_lh = primary_lh + largest_supp_lh; + complex_lh_norm = complex_lh;// / complex_t; + } else { + int32_t complex_t = primary_lh_t + largest_supp_lh_t + closest_supp_lh_t; + complex_lh = primary_lh + largest_supp_lh + closest_supp_lh; + complex_lh_norm = complex_lh;// / complex_t; + } + // std::cout << "Complex log likelihood (normalized): " << complex_lh_norm << std::endl; + + // Compare the best split alignment likelihood to the complex likelihood + // if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) { + if (best_split_aln_lh_norm > complex_lh_norm) { int32_t sv_start = best_supp_candidate.first; int32_t sv_end = best_supp_candidate.second; - sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_supp_log_likelihood); + + // Print an error and continue if the end is less than the start + if (sv_end < sv_start) { + std::cerr << "ERROR: SV end is less than the start: " << sv_start << " - " << sv_end << ", SV type: " << getSVTypeString(best_supp_type) << std::endl; + continue; + } + + // Resolve overlaps between the primary and supplementary query + // sequences for deletions (not usually an issue for other types) + if (best_supp_type == SVType::DEL) { + AlignmentData& best_supp_alignment = best_split_alignment; + int32_t supp_start = std::get<1>(best_supp_alignment); + int32_t supp_end = 
std::get<2>(best_supp_alignment); + int32_t supp_query_start = std::get<4>(best_supp_alignment); + int32_t supp_query_end = std::get<5>(best_supp_alignment); + std::unordered_map supp_match_map = std::get<6>(best_supp_alignment); + + // Resolve overlaps between the primary and supplementary query + // sequences + // int32_t overlap_start = std::max(primary_query_start, supp_query_start); + // int32_t overlap_end = std::min(primary_query_end, supp_query_end); + // int32_t overlap_length = overlap_end - overlap_start; + bool gap_present = primary_query_end < supp_query_start || supp_query_end < primary_query_start; + if (!gap_present) { + int32_t overlap_start = std::max(primary_query_start, supp_query_start); + int32_t overlap_end = std::min(primary_query_end, supp_query_end); + int32_t overlap_length = overlap_end - overlap_start; + + // Calculate the mismatch rate for each alignment at the overlap + double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end); + double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end); + // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; + // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; + + // Trim the overlap from the alignment with the higher mismatch + // rate + if (primary_mismatch_rate > supp_mismatch_rate) { + + // Handle each scenario + if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) { + // Primary is first, incorporate the overlap into + // the beginning of the deletion + sv_start -= overlap_length; + } else if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) { + // Primary is last, incorporate the overlap into + // the end of the deletion + sv_end += overlap_length; + } + } else { + + // Handle each scenario + if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) { + // Supplementary is first, incorporate the overlap 
into + // the beginning of the deletion + sv_start -= overlap_length; + } else if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) { + // Supplementary is last, incorporate the overlap into + // the end of the deletion + sv_end += overlap_length; + } + } + } + } + + // Add the best split alignment as the SV call + sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_split_aln_lh_norm); sv_count++; } else { // Resolve complex SVs + // Simplest case: Largest supplementary is also the closest if (largest_supp_alignment == closest_supp_alignment) { // [primary] -- [supp_start] -- [supp_end] @@ -643,14 +851,14 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str; // Add the complex SV call - sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); + sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); sv_count++; } else { // [primary] -- [supp_start] -- [supp_end] std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str; // Add the complex SV call - sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); + sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); sv_count++; } } else { @@ -697,32 +905,34 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Combine the types if equal and not unknown/neutral + std::cout << "Resolving complex SVs..." 
<< std::endl; std::string complex_sv_type_str = ""; if (primary_type != SVType::UNKNOWN && primary_type != SVType::NEUTRAL) { complex_sv_type_str += primary_type_str; + std::cout << "[1] Updated to type: " << complex_sv_type_str << std::endl; } if (closest_supp_type != primary_type && closest_supp_type != SVType::UNKNOWN && closest_supp_type != SVType::NEUTRAL) { if (complex_sv_type_str != "") { complex_sv_type_str += "+"; } complex_sv_type_str += closest_supp_type_str; + std::cout << "[2] Updated to type: " << complex_sv_type_str << std::endl; } if (complex_type != closest_supp_type && complex_type != primary_type && complex_type != SVType::UNKNOWN && complex_type != SVType::NEUTRAL) { if (complex_sv_type_str != "") { complex_sv_type_str += "+"; } complex_sv_type_str += getSVTypeString(complex_type); + std::cout << "[3] Updated to type: " << complex_sv_type_str << std::endl; } // Add the complex SV call if not empty if (complex_sv_type_str != "") { std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl; - sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", 0.0); + sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); sv_count++; } - } - - + } } } } diff --git a/tests/test_general.py b/tests/test_general.py index 92776b11..dbca30b9 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -78,11 +78,11 @@ def test_run(): fields = line.strip().split('\t') assert fields[0] == "21" assert fields[1] == "14458394" - assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1341;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=0.000000" + assert fields[7] == 
"END=14458394;SVTYPE=INS;SVLEN=1341;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000" elif i == header_line + 2: fields = line.strip().split('\t') assert fields[0] == "21" assert fields[1] == "14458394" - assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=0.000000" + assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000" break \ No newline at end of file From 9665e8ef2a2293b62e10454cd01864c43e73cfbe Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 16 Nov 2024 14:33:12 -0500 Subject: [PATCH 016/134] Create a cpp module for memory tests --- .gitignore | 1 + Makefile | 19 +++++++++++-------- Makefile-cpp | 43 +++++++++++++++++++++++++++++++++++++++++++ Makefile-python | 15 +++++++++++++++ setup.py | 3 +++ 5 files changed, 73 insertions(+), 8 deletions(-) create mode 100644 Makefile-cpp create mode 100644 Makefile-python diff --git a/.gitignore b/.gitignore index c627421d..939d32b9 100644 --- a/.gitignore +++ b/.gitignore @@ -67,6 +67,7 @@ python/dbscan python/agglo linktoscripts tests/data +tests/cpp_module_out # Population allele frequency filepaths data/gnomadv2_filepaths.txt diff --git a/Makefile b/Makefile index b6f7ddab..6b0170ae 100644 --- a/Makefile +++ 
b/Makefile @@ -1,11 +1,14 @@ -INCL_DIR := $(CURDIR)/include -SRC_DIR := $(CURDIR)/src -LIB_DIR := $(CURDIR)/lib +# Top-Level Makefile +.PHONY: python cpp clean -all: - # Generate the SWIG wrapper (C++ -> Python) - swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i +# Targets for the sub-makefiles +python: + $(MAKE) -f Makefile-python - # Compile the SWIG wrapper using setuptools - python3 setup.py build_ext --build-lib $(LIB_DIR) +cpp: + $(MAKE) -f Makefile-cpp + +clean: + $(MAKE) -f Makefile-python clean + $(MAKE) -f Makefile-cpp clean diff --git a/Makefile-cpp b/Makefile-cpp new file mode 100644 index 00000000..cf76256b --- /dev/null +++ b/Makefile-cpp @@ -0,0 +1,43 @@ +# Directories +INCL_DIR := $(CURDIR)/include +SRC_DIR := $(CURDIR)/src +BUILD_DIR := $(CURDIR)/build +LIB_DIR := $(CURDIR)/lib + +# Conda environment directories +CONDA_PREFIX := $(shell echo $$CONDA_PREFIX) +CONDA_INCL_DIR := $(CONDA_PREFIX)/include +CONDA_LIB_DIR := $(CONDA_PREFIX)/lib + +# Compiler and Flags +CXX := g++ +CXXFLAGS := -std=c++11 -I$(INCL_DIR) -I$(CONDA_INCL_DIR) +LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries + +# Link htslib +LDLIBS := -lhts # Link with libhts.a or libhts.so +# LDLIBS := -lmylib # Link with libraries in LIB_DIR, e.g., libmylib.a or libmylib.so + +# Sources and Output +# SOURCES := $(wildcard $(SRC_DIR)/*.cpp) +SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp)) # Filter out the SWIG wrapper from the sources +OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES)) +TARGET := $(BUILD_DIR)/cpp_module + +# Default target +all: $(TARGET) + +# Link the executable +$(TARGET): $(OBJECTS) + @mkdir -p $(BUILD_DIR) + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS) + +# Compile source files +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @mkdir -p $(BUILD_DIR) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +# Clean the build directory 
+clean: + rm -rf $(BUILD_DIR) + \ No newline at end of file diff --git a/Makefile-python b/Makefile-python new file mode 100644 index 00000000..361ba11b --- /dev/null +++ b/Makefile-python @@ -0,0 +1,15 @@ +INCL_DIR := $(CURDIR)/include +SRC_DIR := $(CURDIR)/src +LIB_DIR := $(CURDIR)/lib + + +all: + # Generate the SWIG wrapper (C++ -> Python) + swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i + + # Compile the SWIG wrapper using setuptools + python3 setup.py build_ext --build-lib $(LIB_DIR) + +clean: + rm -rf $(LIB_DIR)/*.so $(LIB_DIR)/contextsv.py + \ No newline at end of file diff --git a/setup.py b/setup.py index c57fb30f..5b6d9ef2 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,10 @@ # Set the project dependencies SRC_DIR = "src" +# SRC_FILES = glob.glob(os.path.join(SRC_DIR, "*.cpp")) SRC_FILES = glob.glob(os.path.join(SRC_DIR, "*.cpp")) +SRC_FILES = [f for f in SRC_FILES if "main.cpp" not in f] # Ignore the main.cpp file + INCLUDE_DIR = "include" INCLUDE_FILES = glob.glob(os.path.join(INCLUDE_DIR, "*.h")) From a50530886f4916a4e7435d337bd7998e93297a35 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 16 Nov 2024 20:13:15 -0500 Subject: [PATCH 017/134] Fix all memory leaks --- .gitignore | 1 + include/khmm.h | 51 ++--- include/sv_caller.h | 1 - src/cnv_caller.cpp | 74 ++++++-- src/contextsv.cpp | 1 + src/input_data.cpp | 14 +- src/khmm.cpp | 409 +++++++++++++++++++++++++---------------- src/main.cpp | 55 ++++++ src/sv_data.cpp | 24 ++- src/swig_interface.cpp | 4 - 10 files changed, 414 insertions(+), 220 deletions(-) create mode 100644 src/main.cpp diff --git a/.gitignore b/.gitignore index 939d32b9..5b1177ee 100644 --- a/.gitignore +++ b/.gitignore @@ -88,3 +88,4 @@ python/dist_plots # Temporary files lib/.nfs* +valgrind.log diff --git a/include/khmm.h b/include/khmm.h index 2f7ebe14..8f86e7a4 100644 --- a/include/khmm.h +++ b/include/khmm.h @@ -10,28 +10,26 @@ #include /// @endcond -typedef 
struct { - int N; /* number of states; Q={1,2,...,N} */ - int M; /* number of observation symbols; V={1,2,...,M}*/ - double **A; /* A[1..N][1..N]. a[i][j] is the transition prob - of going from state i at time t to state j - at time t+1 */ - double **B; /* B[1..N][1..M]. b[j][k] is the probability of - of observing symbol k in state j */ - double *pi; /* pi[1..N] pi[i] is the initial state distribution. */ - double *B1_mean; /* B1_mean[1..N] mean of a continuous Gaussian distribution for state 1 through N*/ - double *B1_sd; /*B1_sd standard deviation of B1 values, which is the same for all states*/ - double B1_uf; /*B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model */ - double *B2_mean; /* B2_mean[1..4] is the average of B_allele_freq*/ - double *B2_sd; /* B2_sd[1..4] is the standard deviation of four B_allele_freq, B2_sd[5] is specially for state1, where B is modelled as a wide normal distribution */ - double B2_uf; /* B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model */ - - int NP_flag; /*flag of 1 and 0 to indicate whether Non-Polymorhpic marker information is contained with HMM file*/ - double *B3_mean; /* B3_mean[1..N] mean of non-polymorphic probe for state 1 through N*/ - double *B3_sd; /* B3_sd[1..4] is the standard deviation of B3 values*/ - double B3_uf; /* B3_uniform_fraction: */ - int dist; /* new parameter to facilitate CNV calling from resequencing data (2009 April) */ -} CHMM; +// Struct for HMM (C++ RAII style) +struct CHMM +{ + int N; // Number of states + int M; // Number of observation symbols + std::vector> A; // Transition probability matrix + std::vector> B; // Emission probability matrix + std::vector pi; // Initial state distribution + std::vector B1_mean; // Mean of a continuous Gaussian distribution for state 1 through N + std::vector B1_sd; // Standard deviation of B1 values, which is the same for all states + double B1_uf; // B1_uniform_fraction: the contribution 
of uniform distribution to the finite mixture model + std::vector B2_mean; // B2_mean[1..4] is the average of B_allele_freq + std::vector B2_sd; // B2_sd[1..4] is the standard deviation of four B_allele_freq, B2_sd[5] is specially for state1, where B is modelled as a wide normal distribution + double B2_uf; // B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model + int NP_flag; + std::vector B3_mean; + std::vector B3_sd; + double B3_uf; + int dist; +}; /************************************ @@ -39,10 +37,13 @@ typedef struct { ************************************/ /// Read an HMM from a file -CHMM ReadCHMM (const char *filename); +CHMM ReadCHMM (const std::string filename); -// /// Free the memory allocated for an HMM -// void FreeCHMM(CHMM *phmm); +// Read a matrix +std::vector> readMatrix(std::ifstream& file, int rows, int cols); + +// Read a vector +std::vector readVector(std::ifstream& file, int size); /// Run the main HMM algorithm std::pair, double> testVit_CHMM(CHMM hmm, int T, std::vector& O1, std::vector& O2, std::vector& pfb); diff --git a/include/sv_caller.h b/include/sv_caller.h index ed11d08f..0a94b254 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -31,7 +31,6 @@ class SVCaller { int min_sv_size = 50; // Minimum SV size to be considered int min_mapq = 20; // Minimum mapping quality to be considered InputData* input_data; - std::mutex sv_mtx; // Mutex for locking the SV data // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index ceaeb5ec..951775c6 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -481,7 +481,17 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) // Split the chromosome into equal parts for each thread uint32_t chr_len = this->input_data->getRefGenomeChromosomeLength(chr); + if (chr_len == 0) + { + printError("ERROR: 
Chromosome length is zero for: " + chr); + return 0.0; + } std::vector region_chunks = splitRegionIntoChunks(chr, 1, chr_len, num_threads); + if (region_chunks.empty()) + { + printError("ERROR: Failed to split chromosome into regions."); + return 0.0; + } // Calculate the mean chromosome coverage in parallel uint32_t pos_count = 0; @@ -496,23 +506,22 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) { // Run samtools depth on the entire region, and print positions and // depths (not chromosome) - const int cmd_size = 256; - char cmd[cmd_size]; - snprintf(cmd, cmd_size,\ + size_t cmd_size = input_filepath.size() + 256; + std::vector cmd(cmd_size); + snprintf(cmd.data(), cmd_size,\ "samtools depth -r %s %s | awk '{print $2, $3}'",\ region_chunk.c_str(), input_filepath.c_str()); // Open a pipe to read the output of the command - FILE *fp = popen(cmd, "r"); + FILE *fp = popen(cmd.data(), "r"); if (fp == NULL) { - printError("ERROR: Could not open pipe for command: " + std::string(cmd)); - exit(EXIT_FAILURE); + throw std::runtime_error("ERROR: Could not open pipe for command: " + std::string(cmd.data())); } // Parse the outputs (position and depth) std::unordered_map pos_depth_map; - const int line_size = 256; + const int line_size = 1024; char line[line_size]; uint32_t pos; int depth; @@ -527,26 +536,53 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) cum_depth += depth; } } - pclose(fp); // Close the process + + // Check if pclose fails + if (pclose(fp) == -1) + { + throw std::runtime_error("ERROR: Failed to close pipe for command: " + std::string(cmd.data())); + } + //pclose(fp); // Close the process return std::make_tuple(pos_count, cum_depth, pos_depth_map); }; - std::future>> future = std::async(std::launch::async, get_mean_chr_cov); - futures.push_back(std::move(future)); + + futures.emplace_back(std::async(std::launch::async, get_mean_chr_cov)); + //std::future>> future = std::async(std::launch::async, get_mean_chr_cov); 
+ //futures.push_back(std::move(future)); } - // Loop through the futures and get the results + // Thread-safe map merging (using mutex) + std::mutex merge_mutex; for (auto& future : futures) { - future.wait(); - std::tuple> result = std::move(future.get()); - - // Update the position count, cumulative depth, and merge the position-depth maps - pos_count += std::get<0>(result); - cum_depth += std::get<1>(result); - this->mergePosDepthMaps(this->pos_depth_map, std::get<2>(result)); + try + { + future.wait(); + auto result = std::move(future.get()); + + // Safely merge results + std::lock_guard lock(merge_mutex); + pos_count += std::get<0>(result); + cum_depth += std::get<1>(result); + this->mergePosDepthMaps(this->pos_depth_map, std::get<2>(result)); + } + catch (const std::exception& ex) + { + printError("ERROR: Exception in thread execution - " + std::string(ex.what())); + return 0.0; + } } - double mean_chr_cov = (double) cum_depth / (double) pos_count; + + // Validate and calculate mean chromosome coverage + if (pos_count == 0) + { + printError("ERROR: No positions found in chromosome coverage calculation."); + return 0.0; + } + + double mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); + return mean_chr_cov; } diff --git a/src/contextsv.cpp b/src/contextsv.cpp index c0d4acd7..4f35c052 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -24,6 +24,7 @@ int ContextSV::run() SVCaller sv_caller(*this->input_data); // Create an SV caller object SVData sv_calls = sv_caller.run(); // Run the SV caller std::string output_dir = this->input_data->getOutputDir(); // Get the output directory + std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl; sv_calls.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file std::cout << "SV calling complete." 
<< std::endl; diff --git a/src/input_data.cpp b/src/input_data.cpp index 8867d0ff..99a4cade 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -42,7 +42,7 @@ void InputData::setShortReadBam(std::string filepath) this->short_read_bam = filepath; // Check if empty string - if (filepath == "") + if (filepath.empty()) { return; @@ -51,8 +51,9 @@ void InputData::setShortReadBam(std::string filepath) FILE *fp = fopen(filepath.c_str(), "r"); if (fp == NULL) { - std::cerr << "Short read BAM file does not exist: " << filepath << std::endl; - exit(1); + throw std::runtime_error("Short read BAM file does not exist: " + filepath); + } else { + fclose(fp); } } } @@ -67,7 +68,7 @@ void InputData::setLongReadBam(std::string filepath) this->long_read_bam = filepath; // Check if empty string - if (filepath == "") + if (filepath.empty()) { return; @@ -76,8 +77,9 @@ void InputData::setLongReadBam(std::string filepath) FILE *fp = fopen(filepath.c_str(), "r"); if (fp == NULL) { - std::cerr << "Long read BAM file does not exist: " << filepath << std::endl; - exit(1); + throw std::runtime_error("Long read BAM file does not exist: " + filepath); + } else { + fclose(fp); } } } diff --git a/src/khmm.cpp b/src/khmm.cpp index 3b3bffa6..bdb6eb8b 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -3,10 +3,13 @@ /// @cond #include +#include #include #include #include #include +#include +#include /// @endcond #define STATE_CHANGE 100000.0 /*this is the expected changes (D value) in the transition matrix*/ @@ -50,30 +53,53 @@ std::pair, double> testVit_CHMM(CHMM hmm, int T, std::vector mean, std::vector sd, double uf, double o) { - if (o < mean[1]) + // if (o < mean[1]) + // { + // o = mean[1]; + // } + // double p = uf + ((1 - uf) * pdf_normal(o, mean[state], sd[state])); + + // Get the values (0-based indexing) + if (o < mean[0]) { - o = mean[1]; + o = mean[0]; } - double p = uf + ((1 - uf) * pdf_normal(o, mean[state], sd[state])); + double p = uf + ((1 - uf) * pdf_normal(o, mean[state-1], 
sd[state-1])); return log(p); } -double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b) +// double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b) +double b2iot(int state, const std::vector mean, const std::vector sd, double uf, double pfb, double b) { + // double p = 0; + // double mean0 = mean[1]; // mean[1] = 0 + // double mean25 = mean[2]; // mean[2] = 0.25 + // double mean33 = mean[3]; // mean[3] = 0.33 + // double mean50 = mean[4]; // mean[4] = 0.5 + // double mean50_state1 = mean[5]; // mean[5] = 0.5 + // double sd0 = sd[1]; // sd[1] = 0 + // double sd25 = sd[2]; // sd[2] = 0.25 + // double sd33 = sd[3]; // sd[3] = 0.33 + // double sd50 = sd[4]; // sd[4] = 0.5 + // double sd50_state1 = sd[5]; // sd[5] = 0.5 + // p = uf; // UF = previous alpha (transition probability) + + // Get the values (0-based indexing) double p = 0; - double mean0 = mean[1]; // mean[1] = 0 - double mean25 = mean[2]; // mean[2] = 0.25 - double mean33 = mean[3]; // mean[3] = 0.33 - double mean50 = mean[4]; // mean[4] = 0.5 - double mean50_state1 = mean[5]; // mean[5] = 0.5 - double sd0 = sd[1]; // sd[1] = 0 - double sd25 = sd[2]; // sd[2] = 0.25 - double sd33 = sd[3]; // sd[3] = 0.33 - double sd50 = sd[4]; // sd[4] = 0.5 - double sd50_state1 = sd[5]; // sd[5] = 0.5 + double mean0 = mean[0]; // mean[0] = 0 + double mean25 = mean[1]; // mean[1] = 0.25 + double mean33 = mean[2]; // mean[2] = 0.33 + double mean50 = mean[3]; // mean[3] = 0.5 + double mean50_state1 = mean[4]; // mean[4] = 0.5 + double sd0 = sd[0]; // sd[0] = 0 + double sd25 = sd[1]; // sd[1] = 0.25 + double sd33 = sd[2]; // sd[2] = 0.33 + double sd50 = sd[3]; // sd[3] = 0.5 + double sd50_state1 = sd[4]; // sd[4] = 0.5 p = uf; // UF = previous alpha (transition probability) // PDF normal is the transition probability distrubution a_ij (initialized @@ -247,7 +273,9 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect { for (j = 1; j <= hmm.N; j++) { - A1[i][j] 
= hmm.A[i][j]; + // A1[i][j] = hmm.A[i][j]; + // Update for 0-based indexing + A1[i][j] = hmm.A[i-1][j-1]; } } @@ -257,9 +285,15 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect // Threshold any zero values to avoid calculation issues. for (i = 1; i <= hmm.N; i++) { - if (hmm.pi[i] == 0) - hmm.pi[i] = 1e-9; /*eliminate problems with zero probability*/ - hmm.pi[i] = log(hmm.pi[i]); // Convert to log probability due to underflow + // if (hmm.pi[i] == 0) + // hmm.pi[i] = 1e-9; /*eliminate problems with zero probability*/ + // hmm.pi[i] = log(hmm.pi[i]); // Convert to log probability due to underflow + + // Update to 0-based indexing + if (hmm.pi[i-1] == 0) { + hmm.pi[i-1] = 1e-9; /*eliminate problems with zero probability*/ + } + hmm.pi[i-1] = log(hmm.pi[i-1]); // Convert to log probability due to underflow } // Biot is the NxT matrix of state observation likelihoods. @@ -302,7 +336,12 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect /* 1. Initialization */ for (i = 1; i <= hmm.N; i++) { - delta[1][i] = hmm.pi[i] + biot[i][1]; // Initialize the delta matrix (log probability) to the initial state distribution + the emission probability + // delta[1][i] = hmm.pi[i] + biot[i][1]; // Initialize the delta matrix + // (log probability) to the initial state distribution + the emission + // probability + + // Update to 0-based indexing + delta[1][i] = hmm.pi[i-1] + biot[i][1]; // Initialize the delta matrix psi[1][i] = 0; // Initialize the psi matrix (state sequence) to 0 (no state) } @@ -372,7 +411,9 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect for (i = 1; i <= hmm.N; i++) { /*recover the HMM model as original*/ - hmm.pi[i] = exp(hmm.pi[i]); + // hmm.pi[i] = exp(hmm.pi[i]); + // Update to 0-based indexing + hmm.pi[i-1] = exp(hmm.pi[i-1]); } free_dmatrix(biot, 1, hmm.N, 1, T); @@ -381,155 +422,209 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect return std::make_pair(q, final_lh); } -CHMM 
ReadCHMM(const char *filename) +CHMM ReadCHMM(const std::string filename) { - FILE *fp; + std::ifstream file(filename); + if (!file.is_open()) + { + throw std::runtime_error("Error opening file"); + } CHMM hmm; - int i, j, k; - fp = fopen(filename, "r"); - if (!fp) - fprintf(stderr, "Error: cannot read from HMM file %s\n", filename); - if (fscanf(fp, "M=%d\n", &(hmm.M)) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read M annotation from HMM file"); - if (fscanf(fp, "N=%d\n", &(hmm.N)) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read N annotation from HMM file"); + // Read M + std::string line; + std::getline(file, line); + if (sscanf(line.c_str(), "M=%d", &hmm.M) != 1) + { + throw std::runtime_error("Error reading M"); + } - if (fscanf(fp, "A:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read A annotation from HMM file"); - hmm.A = (double **)dmatrix(1, hmm.N, 1, hmm.N); - for (i = 1; i <= hmm.N; i++) + // Read N + std::getline(file, line); + if (sscanf(line.c_str(), "N=%d", &hmm.N) != 1) { - for (j = 1; j <= hmm.N; j++) - { - if (fscanf(fp, "%lf", &(hmm.A[i][j])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read A matrix from HMM file"); - } - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); + throw std::runtime_error("Error reading N"); } - if (fscanf(fp, "B:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B annotation from HMM file"); - hmm.B = (double **)dmatrix(1, hmm.N, 1, hmm.M); - for (j = 1; j <= hmm.N; j++) + // Read A + std::getline(file, line); + if (line != "A:") { - for (k = 1; k <= hmm.M; k++) - { - if (fscanf(fp, "%lf", &(hmm.B[j][k])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B matrix from HMM file"); - } - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); + throw std::runtime_error("Error reading A"); + } + hmm.A = readMatrix(file, hmm.N, hmm.N); + if (hmm.A.size() != 
(size_t)hmm.N || hmm.A[0].size() != (size_t)hmm.N) + { + throw std::runtime_error("Error reading A"); } - if (fscanf(fp, "pi:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read PI annotation from HMM file"); - hmm.pi = (double *)dvector(1, hmm.N); - for (i = 1; i <= hmm.N; i++) + // Print A + // std::cout << "A: " << std::endl; + // for (int i = 0; i < hmm.N; i++) + // { + // for (int j = 0; j < hmm.N; j++) + // { + // std::cout << std::setprecision(10) << hmm.A[i][j] << " "; + // } + // std::cout << std::endl; + // } + + // Read B + std::getline(file, line); + if (line != "B:") + { + throw std::runtime_error("Error reading B"); + } + hmm.B = readMatrix(file, hmm.N, hmm.M); + if (hmm.B.size() != (size_t)hmm.N || hmm.B[0].size() != (size_t)hmm.M) { - if (fscanf(fp, "%lf", &(hmm.pi[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read PI vector from HMM file"); - if (hmm.pi[i] < 1e-6) - hmm.pi[i] = 1e-6; + throw std::runtime_error("Error reading B"); } - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - if (fscanf(fp, "B1_mean:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B1_mean annotation from HMM file"); - hmm.B1_mean = (double *)dvector(1, hmm.N); - for (i = 1; i <= hmm.N; i++) - if (fscanf(fp, "%lf", &(hmm.B1_mean[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B1_mean vector from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - - if (fscanf(fp, "B1_sd:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B1_sd annotation from HMM file"); - hmm.B1_sd = (double *)dvector(1, hmm.N); - for (i = 1; i <= hmm.N; i++) - if (fscanf(fp, "%lf", &(hmm.B1_sd[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B1_sd from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - - if (fscanf(fp, "B1_uf:\n") == EOF) - 
fprintf(stderr, "khmm::ReadCHMM: cannot read B1_uf annotation from HMM file"); - if (fscanf(fp, "%lf", &(hmm.B1_uf)) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B1_uf from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - - if (fscanf(fp, "B2_mean:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B2_mean annotation from HMM file"); - hmm.B2_mean = (double *)dvector(1, 5); - for (i = 1; i <= 5; i++) - if (fscanf(fp, "%lf", &(hmm.B2_mean[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B2_mean from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - - if (fscanf(fp, "B2_sd:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B2_sd annotation from HMM file"); - hmm.B2_sd = (double *)dvector(1, 5); - for (i = 1; i <= 5; i++) - if (fscanf(fp, "%lf", &(hmm.B2_sd[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B2_sd from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - - if (fscanf(fp, "B2_uf:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B2_uf annotation from HMM file"); - if (fscanf(fp, "%lf", &(hmm.B2_uf)) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B2_uf from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - - if (fscanf(fp, "B3_mean:\n") != EOF) - { - hmm.NP_flag = 1; - hmm.B3_mean = (double *)dvector(1, hmm.N); - for (i = 1; i <= hmm.N; i++) - if (fscanf(fp, "%lf", &(hmm.B3_mean[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B3_mean from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - if (fscanf(fp, "B3_sd:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B3_sd annotation from HMM file"); - 
hmm.B3_sd = (double *)dvector(1, hmm.N); - for (i = 1; i <= hmm.N; i++) - if (fscanf(fp, "%lf", &(hmm.B3_sd[i])) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B3_sd from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - if (fscanf(fp, "B3_uf:\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B3_uf annotation from HMM file"); - if (fscanf(fp, "%lf", &(hmm.B3_uf)) == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read B3_uf from HMM file"); - if (fscanf(fp, "\n") == EOF) - fprintf(stderr, "khmm::ReadCHMM: cannot read return character from HMM file"); - } - else - { - hmm.NP_flag = 0; - } - - if (fscanf(fp, "DIST:\n") != EOF) - { - if (fscanf(fp, "%d", &(hmm.dist)) == EOF) - fprintf(stderr, "khmm:ReadCHMM: cannot read DIST from HMM file"); - } - else - { - // hmm.dist = STATE_CHANGE; - // snp_dist is the default distance between two SNPs in the same state - // (not used in this implementation) - // Set it to 1 to disable the distance model - hmm.dist = 1; - } - - fclose(fp); + // Read pi + std::getline(file, line); + if (line != "pi:") + { + throw std::runtime_error("Error reading pi"); + } + hmm.pi = readVector(file, hmm.N); + if (hmm.pi.size() != (size_t)hmm.N) + { + throw std::runtime_error("Error reading pi"); + } + + // Print pi + // std::cout << "pi: "; + // for (int i = 0; i < hmm.N; i++) + // { + // std::cout << std::setprecision(10) << hmm.pi[i] << " "; + // } + + // Read B1_mean + std::getline(file, line); + if (line != "B1_mean:") + { + throw std::runtime_error("Error reading B1_mean"); + } + hmm.B1_mean = readVector(file, hmm.N); + if (hmm.B1_mean.size() != (size_t)hmm.N) + { + throw std::runtime_error("Error reading B1_mean"); + } + + // Print B1_mean + // std::cout << "B1_mean: "; + // for (int i = 0; i < hmm.N; i++) + // { + // std::cout << std::setprecision(10) << hmm.B1_mean[i] << " "; + // } + + // Read B1_sd + std::getline(file, line); + if (line != 
"B1_sd:") + { + throw std::runtime_error("Error reading B1_sd"); + } + hmm.B1_sd = readVector(file, hmm.N); + if (hmm.B1_sd.size() != (size_t)hmm.N) + { + throw std::runtime_error("Error reading B1_sd"); + } + + // Print B1_sd + // std::cout << "B1_sd: "; + // for (int i = 0; i < hmm.N; i++) + // { + // std::cout << std::setprecision(10) << hmm.B1_sd[i] << " "; + // } + + // Read B1_uf + std::getline(file, line); + if (line != "B1_uf:") + { + throw std::runtime_error("Error reading B1_uf"); + } + std::getline(file, line); + try { + hmm.B1_uf = std::stod(line); + } catch (const std::invalid_argument& e) { + throw std::runtime_error("Error reading B1_uf"); + } + + // Print B1_uf + // std::cout << "B1_uf: " << std::setprecision(10) << hmm.B1_uf << std::endl; + + // Read B2_mean + std::getline(file, line); + if (line != "B2_mean:") + { + throw std::runtime_error("Error reading B2_mean"); + } + hmm.B2_mean = readVector(file, 5); + if (hmm.B2_mean.size() != (size_t)5) + { + throw std::runtime_error("Error reading B2_mean"); + } + + // Read B2_sd + std::getline(file, line); + if (line != "B2_sd:") + { + throw std::runtime_error("Error reading B2_sd"); + } + hmm.B2_sd = readVector(file, 5); + if (hmm.B2_sd.size() != (size_t)5) + { + throw std::runtime_error("Error reading B2_sd"); + } + + // Read B2_uf + std::getline(file, line); + if (line != "B2_uf:") + { + throw std::runtime_error("Error reading B2_uf"); + } + std::getline(file, line); + try { + hmm.B2_uf = std::stod(line); + } catch (const std::invalid_argument& e) { + throw std::runtime_error("Error reading B2_uf"); + } + + // Print B2_uf + // std::cout << "B2_uf: " << std::setprecision(10) << hmm.B2_uf << std::endl; + return hmm; } + +std::vector> readMatrix(std::ifstream &file, int rows, int cols) +{ + std::vector> matrix(rows, std::vector(cols)); + for (int i = 0; i < rows; i++) + { + for (int j = 0; j < cols; j++) + { + if (!(file >> matrix[i][j])) + { + throw std::runtime_error("Error reading matrix"); + } + } + 
} + file.ignore(std::numeric_limits::max(), '\n'); + return matrix; +} + +std::vector readVector(std::ifstream &file, int size) +{ + std::vector vector(size); + for (int i = 0; i < size; i++) + { + if (!(file >> vector[i])) + { + throw std::runtime_error("Error reading vector"); + } + } + file.ignore(std::numeric_limits::max(), '\n'); + return vector; +} diff --git a/src/main.cpp b/src/main.cpp new file mode 100644 index 00000000..e1ac4c12 --- /dev/null +++ b/src/main.cpp @@ -0,0 +1,55 @@ + +#include "swig_interface.h" +#include "input_data.h" + +/// @cond DOXYGEN_IGNORE +#include +#include +/// @endcond + +// Placeholder for ContextSV library includes +// #include "ContextSV.h" + +void runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1) { + // Placeholder for setting up input data and running ContextSV + std::cout << "Running ContextSV with the following files:" << std::endl; + std::cout << "BAM file: " << bamFile << std::endl; + std::cout << "Reference file: " << refFile << std::endl; + std::cout << "VCF file: " << vcfFile << std::endl; + std::cout << "Output directory: " << outputDir << std::endl; + + // Set up input data + InputData input_data; + input_data.setShortReadBam(bamFile); + input_data.setLongReadBam(bamFile); + input_data.setRefGenome(refFile); + input_data.setSNPFilepath(vcfFile); + input_data.setChromosome("21"); + input_data.setRegion("14486099-14515105"); + input_data.setThreadCount(1); + input_data.setAlleleFreqFilepaths(""); + input_data.setHMMFilepath(""); + input_data.setOutputDir(outputDir); + input_data.saveCNVData(true); + input_data.setThreadCount(threadCount); + + // Run ContextSV + run(input_data); +} + +int main(int argc, char* argv[]) { + if (argc != 6) { + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + std::string bamFile = argv[1]; + std::string refFile = argv[2]; + std::string vcfFile = argv[3]; + std::string 
outputDir = argv[4]; + int threadCount = std::stoi(argv[5]); + + runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount); + + return 0; +} diff --git a/src/sv_data.cpp b/src/sv_data.cpp index 085e737e..a0611c8d 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -6,7 +6,6 @@ #include /// @endcond - int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) { // Throw an error if the genotype is not valid @@ -72,19 +71,28 @@ int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std void SVData::concatenate(const SVData &sv_data) { + if (sv_data.sv_calls.empty()) { + std::cerr << "Error: SVData object is empty." << std::endl; + return; + } + // Iterate over the chromosomes in the other SVData object for (auto const& chr_sv_calls : sv_data.sv_calls) { - std::string chr = chr_sv_calls.first; + const auto &chr = chr_sv_calls.first; + // std::string chr = chr_sv_calls.first; + auto &current_chr_calls = this->sv_calls[chr]; // Iterate over the SV calls in the other SVData object for (auto const& sv_call : chr_sv_calls.second) { - // Add the SV call to the map of candidate locations. Since the region - // is unique (per chromosome), there is no need to check if the SV - // candidate already exists in the map. - SVCandidate candidate = sv_call.first; // (start, end, alt_allele) - SVInfo info = sv_call.second; // (sv_type, read_support, data_type, sv_length) - this->sv_calls[chr][candidate] = info; + // Add the SV call to the map of candidate locations + std::pair::iterator, bool> result = current_chr_calls.emplace(sv_call); + bool inserted = result.second; + + // Throw a warning if the SV candidate already exists + if (!inserted) { + std::cerr << "Warning: SV candidate already exists in the map."
<< std::endl; + } } } } diff --git a/src/swig_interface.cpp b/src/swig_interface.cpp index 87334fec..8d2e7a42 100644 --- a/src/swig_interface.cpp +++ b/src/swig_interface.cpp @@ -1,7 +1,3 @@ -// -// Created by jperdomo on 1/8/2023. -// - #include "swig_interface.h" #include "contextsv.h" From 7cac258e88c11252f80227852202e9f33ffde4d7 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 20 Nov 2024 11:47:58 -0500 Subject: [PATCH 018/134] Improve memory management --- Makefile-cpp | 2 +- include/cnv_caller.h | 10 +- include/contextsv.h | 4 +- include/fasta_query.h | 12 +- include/input_data.h | 6 +- include/sv_caller.h | 19 +- include/sv_data.h | 2 +- include/sv_object.h | 37 ++++ python/sv_merger.py | 5 +- src/cnv_caller.cpp | 206 +++++++++-------- src/contextsv.cpp | 18 +- src/fasta_query.cpp | 23 +- src/input_data.cpp | 2 +- src/main.cpp | 43 +++- src/snp_info.cpp | 2 +- src/sv_caller.cpp | 505 +++++++++++++++++++++++++++++++++--------- src/sv_data.cpp | 2 +- src/sv_object.cpp | 126 +++++++++++ 18 files changed, 769 insertions(+), 255 deletions(-) create mode 100644 include/sv_object.h create mode 100644 src/sv_object.cpp diff --git a/Makefile-cpp b/Makefile-cpp index cf76256b..e6ba7d30 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -11,7 +11,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib # Compiler and Flags CXX := g++ -CXXFLAGS := -std=c++11 -I$(INCL_DIR) -I$(CONDA_INCL_DIR) +CXXFLAGS := -std=c++14 -I$(INCL_DIR) -I$(CONDA_INCL_DIR) LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries # Link htslib diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 80c69f89..22a14cc9 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -9,6 +9,7 @@ #include "cnv_data.h" #include "sv_data.h" #include "sv_types.h" +#include "sv_object.h" /// @cond #include @@ -47,7 +48,7 @@ struct SNPData { // CNVCaller: Detect CNVs and return the state sequence by SNP position class CNVCaller { private: - InputData* input_data; + 
InputData& input_data; mutable std::mutex sv_candidates_mtx; // SV candidate map mutex mutable std::mutex snp_data_mtx; // SNP data mutex mutable std::mutex hmm_mtx; // HMM mutex @@ -86,7 +87,7 @@ class CNVCaller { std::pair querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo &snp_info, std::unordered_map &pos_depth_map, double mean_chr_cov); // Run copy number prediction for a chunk of SV candidates from CIGAR strings - void runCIGARCopyNumberPredictionChunk(std::string chr, std::map& sv_candidates, std::vector sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map); + void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map); void updateSVCopyNumber(std::map& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood); @@ -102,7 +103,7 @@ class CNVCaller { void mergePosDepthMaps(std::unordered_map& main_map, std::unordered_map& map_update); public: - CNVCaller(InputData& input_data); + explicit CNVCaller(InputData& input_data); // Load file data for a chromosome (SNP positions, BAF values, and PFB values) void loadChromosomeData(std::string chr); @@ -112,7 +113,8 @@ class CNVCaller { std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); + // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); + void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length); // Calculate the mean chromosome coverage double calculateMeanChromosomeCoverage(std::string chr); diff --git a/include/contextsv.h 
b/include/contextsv.h index b2a5d6e3..56a82a54 100644 --- a/include/contextsv.h +++ b/include/contextsv.h @@ -13,10 +13,10 @@ class ContextSV { private: - InputData* input_data; + InputData& input_data; public: - ContextSV(InputData& input_data); + explicit ContextSV(InputData& input_data); // Entry point int run(); diff --git a/include/fasta_query.h b/include/fasta_query.h index 558728bf..ffa88d8a 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -1,4 +1,4 @@ -// FASTAQuery: A class for querying a FASTA file. +// ReferenceGenome: A class for querying a reference genome FASTA file. #ifndef FASTA_QUERY_H #define FASTA_QUERY_H @@ -10,7 +10,7 @@ #include /// @endcond -class FASTAQuery { +class ReferenceGenome { private: std::string fasta_filepath; std::vector chromosomes; @@ -18,17 +18,17 @@ class FASTAQuery { public: int setFilepath(std::string fasta_filepath); - std::string getFilepath(); - std::string query(std::string chr, int64_t pos_start, int64_t pos_end); + std::string getFilepath() const; + std::string query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; // Get the chromosome contig lengths in VCF header format - std::string getContigHeader(); + std::string getContigHeader() const; // Get the list of chromosomes, used for whole genome analysis std::vector getChromosomes(); // Get the length of a chromosome - int64_t getChromosomeLength(std::string chr); + uint32_t getChromosomeLength(std::string chr); }; #endif // FASTA_QUERY_H diff --git a/include/input_data.h b/include/input_data.h index 1042d664..0a74125f 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -38,8 +38,8 @@ class InputData { // Set the filepath to the reference genome FASTA file. void setRefGenome(std::string fasta_filepath); - // Return a reference to the FASTAQuery object. - const FASTAQuery& getRefGenome() const; + // Return a reference to the ReferenceGenome object. 
+ const ReferenceGenome& getRefGenome() const; // FASTAQuery getRefGenome(); // Query the reference genome for a sequence. @@ -111,7 +111,7 @@ class InputData { std::string snp_vcf_filepath; std::string ethnicity; std::unordered_map pfb_filepaths; // Map of population frequency VCF filepaths by chromosome - FASTAQuery fasta_query; + ReferenceGenome fasta_query; std::string output_dir; int window_size; int min_cnv_length; diff --git a/include/sv_caller.h b/include/sv_caller.h index 0a94b254..371cc53d 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -7,6 +7,8 @@ #include "input_data.h" #include "cnv_data.h" #include "sv_data.h" +#include "sv_object.h" +#include "fasta_query.h" #include @@ -24,38 +26,41 @@ using AlignmentVector = std::vector; // Query map (query name, alignment vector) using PrimaryMap = std::unordered_map; using SuppMap = std::unordered_map; -using RegionData = std::tuple; +// using RegionData = std::tuple; class SVCaller { private: int min_sv_size = 50; // Minimum SV size to be considered int min_mapq = 20; // Minimum mapping quality to be considered - InputData* input_data; + InputData& input_data; // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - std::tuple, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary); + std::tuple, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, bool is_primary); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. 
- RegionData detectSVsFromRegion(std::string region); + // RegionData detectSVsFromRegion(std::string region); + std::tuple, PrimaryMap, SuppMap> detectCIGARSVs(std::string region); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller); + void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence double calculateMismatchRate(std::unordered_map& mismatch_map, int32_t start, int32_t end); + void saveToVCF(const std::unordered_map>& sv_calls, const ReferenceGenome& ref_genome); + public: - SVCaller(InputData& input_data); + explicit SVCaller(InputData& input_data); // Detect SVs and predict SV type from long read alignments and CNV calls - SVData run(); + std::unordered_map> run(); }; #endif // SV_CALLER_H diff --git a/include/sv_data.h b/include/sv_data.h index f4ed6e25..fef815ed 100644 --- a/include/sv_data.h +++ b/include/sv_data.h @@ -35,7 +35,7 @@ class SVData { int getClippedBaseSupport(std::string chr, int64_t pos, int64_t end); - void saveToVCF(FASTAQuery& ref_genome, std::string output_dir); + void saveToVCF(ReferenceGenome& ref_genome, std::string output_dir); std::map& getChromosomeSVs(std::string chr); diff --git a/include/sv_object.h b/include/sv_object.h new file mode 100644 index 00000000..3128dfe1 --- /dev/null +++ b/include/sv_object.h @@ -0,0 +1,37 @@ +#ifndef SV_OBJECT_H +#define SV_OBJECT_H + +#include +#include +#include +#include +#include + +// Struct to represent a structural variant call +struct SVCall { + uint32_t start; + uint32_t end; + std::string sv_type = "NA"; + std::string alt_allele = "."; + std::string 
data_type = "NA"; + std::string genotype = "./."; + double hmm_likelihood = 0.0; + int support = 0; + + // Comparison operator for std::set + bool operator<(const SVCall& other) const; + + // Constructor with parameters for all fields + SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support) {} +}; + +void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); + +std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count); + +uint32_t getSVCount(const std::set& sv_calls); + +void concatenateSVCalls(std::set& sv_calls, const std::set& sv_calls_update); + +#endif // SV_OBJECT_H diff --git a/python/sv_merger.py b/python/sv_merger.py index 172cba6e..4254e0ad 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -130,8 +130,9 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # Get the HMM likelihood scores hmm_scores = vcf_df['INFO'].str.extract(r'HMM=(-?\d+\.?\d*)', expand=False).astype(float) - # Set all 0 values to NaN - hmm_scores[hmm_scores == 0] = np.nan + # Set all 0 values to a low negative value + hmm_scores[hmm_scores == 0] = -1e-100 + # hmm_scores[hmm_scores == 0] = np.nan # Cluster SV breakpoints using HDBSCAN cluster_labels = [] diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 951775c6..55e363ae 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -31,6 +31,11 @@ using namespace sv_types; +CNVCaller::CNVCaller(InputData &input_data) + : input_data(input_data) // Initialize the input data +{ +} + // Function to call the Viterbi algorithm for the CHMM std::pair, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp_data) { @@ -39,7 
+44,7 @@ std::pair, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp { throw std::runtime_error("Error: No SNP data found for Viterbi algorithm."); } - std::lock_guard lock(this->hmm_mtx); // Lock the mutex for the HMM + // std::lock_guard lock(this->hmm_mtx); // Lock the mutex for the HMM std::pair, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb); return state_sequence; } @@ -49,7 +54,7 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star { SNPData snp_data; bool snps_found = false; - int window_size = this->input_data->getWindowSize(); + int window_size = this->input_data.getWindowSize(); // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); for (int64_t i = start_pos; i <= end_pos; i += window_size) @@ -61,9 +66,9 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star // Get the SNP info for the window // std::cout << "Querying SNPs for window " << chr << ":" << window_start << "-" << window_end << "..." 
<< std::endl; - this->snp_data_mtx.lock(); + // this->snp_data_mtx.lock(); std::tuple, std::vector, std::vector> window_snps = snp_info.querySNPs(chr, window_start, window_end); - this->snp_data_mtx.unlock(); + // this->snp_data_mtx.unlock(); std::vector& snp_window_pos = std::get<0>(window_snps); // SNP positions std::vector& snp_window_bafs = std::get<1>(window_snps); // B-allele frequencies std::vector& snp_window_pfbs = std::get<2>(window_snps); // Population frequencies of the B allele @@ -134,6 +139,7 @@ std::tuple CNVCaller::runCopyNumberPrediction bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm + // printMessage("[TEST] Running Viterbi algorithm for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); std::pair, double> prediction = runViterbi(this->hmm, sv_snps); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -179,10 +185,10 @@ std::tuple CNVCaller::runCopyNumberPrediction // Save the SV calls as a TSV file if enabled bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); - if (this->input_data->getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000) + if (this->input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000) { std::string cnv_type_str = getSVTypeString(predicted_cnv_type); - std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; + std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl; this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); } @@ 
-191,54 +197,58 @@ std::tuple CNVCaller::runCopyNumberPrediction } -SNPData CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::map &sv_candidates, int min_length) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length) { - SNPInfo& snp_info = this->snp_info; CHMM& hmm = this->hmm; - int window_size = this->input_data->getWindowSize(); - double mean_chr_cov = this->mean_chr_cov; - SNPData snp_data; - - + int window_size = this->input_data.getWindowSize(); + double mean_chr_cov = this->mean_chr_cov; printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); // Create a map with counts for each CNV type - std::map cnv_type_counts; - for (int i = 0; i < 6; i++) - { - cnv_type_counts[i] = 0; - } - - // Split the SV candidates into chunks for each thread - int chunk_count = this->input_data->getThreadCount(); - std::vector> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count); - - // Loop through each SV chunk and run the copy number prediction in parallel - std::vector> futures; - for (const auto& sv_chunk : sv_chunks) - { - // Run the copy number prediction for the SV chunk - std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_candidates), sv_chunk, std::ref(snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)); - } - - // Get the SNP data for each SV chunk - int current_chunk = 0; - for (auto& future : futures) - { - current_chunk++; - SNPData chunk_snp_data = std::move(future.get()); - if (this->input_data->getVerbose()) - { - printMessage("Finished processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + "..."); - } - } + // std::map cnv_type_counts; + // for (int i = 0; i < 6; i++) + // { + // cnv_type_counts[i] = 0; + // } + + runCIGARCopyNumberPredictionChunk(chr, sv_candidates, this->snp_info, hmm, window_size, mean_chr_cov, this->pos_depth_map); + // // 
Split the SV candidates into chunks for each thread + // int chunk_count = this->input_data.getThreadCount(); + // // std::vector> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count); + // std::vector> sv_chunks = splitSVsIntoChunks(sv_candidates, chunk_count); + + // // Loop through each SV chunk and run the copy number prediction in parallel + // // std::vector> futures; + // std::vector> futures; + // for (auto& sv_chunk : sv_chunks) + // { + // // Run the copy number prediction for the SV chunk + // futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map))); + // // futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map))); + // // std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, sv_chunk, std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)); + // } + + // // Wait for all the futures to finish + // int current_chunk = 0; + // for (auto& future : futures) + // { + // current_chunk++; + // try { + // future.wait(); + // // SNPData chunk_snp_data = std::move(future.get()); + // if (this->input_data.getVerbose()) + // { + // printMessage("Finished processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + "..."); + // } + // } catch (const std::exception& e) { + // printError("Error processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + ": " + e.what()); + // } + // } printMessage("Finished predicting copy number states for chromosome " + chr + "..."); - - return snp_data; } -void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map& sv_candidates, std::vector 
sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map) +void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map) { // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "..."); // Map with counts for each CNV type @@ -249,23 +259,25 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map(candidate); - int64_t end_pos = std::get<1>(candidate); + // const SVCandidate& candidate = sv_call; + // int64_t start_pos = std::get<0>(candidate); + // int64_t end_pos = std::get<1>(candidate); + uint32_t start_pos = sv_call.start; + uint32_t end_pos = sv_call.end; // Skip if not the minimum length for CNV predictions - if ((end_pos - start_pos) < this->input_data->getMinCNVLength()) + if ((int)(end_pos - start_pos) < this->input_data.getMinCNVLength()) { continue; } // Get the depth at the start position. 
This is used as the FORMAT/DP // value in the VCF file - int dp_value = pos_depth_map[start_pos]; - this->updateDPValue(sv_candidates, sv_call, dp_value); + // int dp_value = pos_depth_map[start_pos]; + // this->updateDPValue(sv_candidates, sv_call, dp_value); // Loop through the SV region +/- 1/2 SV length and run copy number // predictions @@ -273,19 +285,21 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::map, double> prediction = runViterbi(hmm, sv_snps); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; + // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); // Get all the states in the SV region + // printMessage("Getting states for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); std::vector sv_states; for (size_t i = 0; i < state_sequence.size(); i++) { @@ -318,7 +332,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::mapupdateSVCopyNumber(sv_candidates, sv_call, cnv_type, data_type, genotype, likelihood); + // Update the SV copy number data if not unknown + // printMessage("Updating SV copy number data for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); + if (updated_sv_type != SVType::UNKNOWN) + { + std::string sv_type_str = getSVTypeString(updated_sv_type); + addSVCall(sv_chunk, sv_call.start, sv_call.end, sv_type_str, ".", data_type, genotype, likelihood); + // std::string sv_type_str = getSVTypeString(updated_sv_type); + // sv_call.sv_type = sv_type_str; + // sv_call.data_type += "," + data_type; + // sv_call.genotype = genotype; + // sv_call.hmm_likelihood = likelihood; + } + // this->updateSVCopyNumber(sv_candidates, sv_call, cnv_type, data_type, genotype, likelihood); // Save the SV calls as a TSV file if enabled, if the SV type is // known, and the length is greater 
than 10 kb - SVType updated_sv_type = sv_candidates[sv_call].sv_type; - if (this->input_data->getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000) + // SVType updated_sv_type = sv_candidates[sv_call].sv_type; + if (this->input_data.getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000) { // Add the state sequence to the SNP data (avoid copying the data) sv_snps.state_sequence = std::move(state_sequence); // Save the SV calls as a TSV file std::string cnv_type_str = getSVTypeString(updated_sv_type); - std::string sv_filename = this->input_data->getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv"; + std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv"; // std::cout << "Saving SV CIGAR copy number predictions to " << // sv_filename << std::endl; printMessage("Saving SV CIGAR copy number predictions to " + sv_filename); @@ -442,14 +467,9 @@ std::vector> CNVCaller::splitSVCandidatesIntoChunks(std return sv_chunks; } -CNVCaller::CNVCaller(InputData &input_data) -{ - this->input_data = &input_data; -} - void CNVCaller::loadChromosomeData(std::string chr) { - std::string hmm_filepath = this->input_data->getHMMFilepath(); + std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; this->hmm = ReadCHMM(hmm_filepath.c_str()); @@ -459,7 +479,7 @@ void CNVCaller::loadChromosomeData(std::string chr) this->mean_chr_cov = mean_chr_cov; std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." 
<< std::endl; - std::string snp_filepath = this->input_data->getSNPFilepath(); + std::string snp_filepath = this->input_data.getSNPFilepath(); readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info); std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl; @@ -471,16 +491,21 @@ void CNVCaller::loadChromosomeData(std::string chr) double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) { + bool test=true; + if (test) { + return 30.0; + } + // Use a maximum of 8 threads to avoid overloading the system with too many // parallel processes - int num_threads = this->input_data->getThreadCount(); + int num_threads = this->input_data.getThreadCount(); if (num_threads > 8) { num_threads = 8; } // Split the chromosome into equal parts for each thread - uint32_t chr_len = this->input_data->getRefGenomeChromosomeLength(chr); + uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr); if (chr_len == 0) { printError("ERROR: Chromosome length is zero for: " + chr); @@ -497,7 +522,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) uint32_t pos_count = 0; uint64_t cum_depth = 0; std::vector>>> futures; - std::string input_filepath = this->input_data->getShortReadBam(); + std::string input_filepath = this->input_data.getShortReadBam(); for (const auto& region_chunk : region_chunks) { // Create a lambda function to get the mean chromosome coverage for the @@ -638,7 +663,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, // Check that the SNP file is sorted by running bcftools index and reading // the error output std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error"; - if (this->input_data->getVerbose()) { + if (this->input_data.getVerbose()) { std::cout << "Command: " << index_cmd << std::endl; } @@ -661,33 +686,33 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, pclose(index_fp); // Close the process // 
Filter variants by depth, quality, and region - if (this->input_data->getVerbose()) { + if (this->input_data.getVerbose()) { std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl; } // Check if a region was specified by the user std::string region_str = chr; - if (this->input_data->isRegionSet()) + if (this->input_data.isRegionSet()) { - std::pair region = this->input_data->getRegion(); + std::pair region = this->input_data.getRegion(); region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second); } - std::string filtered_snp_vcf_filepath = this->input_data->getOutputDir() + "/filtered_snps.vcf"; + std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf"; std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; - if (this->input_data->getVerbose()) { + if (this->input_data.getVerbose()) { std::cout << "Filtering SNPs by depth and quality..." << std::endl; std::cout << "Command: " << cmd << std::endl; } system(cmd.c_str()); - if (this->input_data->getVerbose()) { + if (this->input_data.getVerbose()) { std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl; } // Extract B-allele frequency data from the VCF file and sort by chromosome // and position - if (this->input_data->getVerbose()) { + if (this->input_data.getVerbose()) { std::cout << "Extracting B-allele frequency data from filtered SNPs..." 
<< std::endl; } cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null"; @@ -748,7 +773,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, pclose(fp); // Close the process - if (this->input_data->getVerbose()) { + if (this->input_data.getVerbose()) { std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl; } } @@ -756,7 +781,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) { // Get the population frequency file for the chromosome - std::string pfb_filepath = this->input_data->getAlleleFreqFilepath(chr); + std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); if (pfb_filepath == "") { std::cout << "No population frequency file provided for chromosome " << chr << std::endl; @@ -765,9 +790,9 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) // Determine the ethnicity-specific allele frequency key std::string AF_key = "AF"; - if (this->input_data->getEthnicity() != "") + if (this->input_data.getEthnicity() != "") { - AF_key += "_" + this->input_data->getEthnicity(); + AF_key += "_" + this->input_data.getEthnicity(); } // Check if the filepath uses the 'chr' prefix notations based on the @@ -804,10 +829,10 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) std::pair snp_range = snp_info.getSNPRange(chr); int64_t snp_start = snp_range.first; int64_t snp_end = snp_range.second; - if (this->input_data->isRegionSet()) + if (this->input_data.isRegionSet()) { // Get the user-defined region - std::pair region = this->input_data->getRegion(); + std::pair region = this->input_data.getRegion(); if (snp_start < region.first) { snp_start = region.first; } else if (snp_end > region.second) { @@ -817,7 +842,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) 
// Use a maximum of 8 threads to avoid overloading the system with too many // processes - int num_threads = this->input_data->getThreadCount(); + int num_threads = this->input_data.getThreadCount(); if (num_threads > 8) { num_threads = 8; @@ -876,8 +901,9 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) }; // Create a future for the thread - std::future> future = std::async(std::launch::async, get_pfb); - futures.push_back(std::move(future)); + futures.emplace_back(std::async(std::launch::async, get_pfb)); + // std::future> future = std::async(std::launch::async, get_pfb); + // futures.push_back(std::move(future)); } // Loop through the futures and get the results @@ -895,9 +921,9 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) double pfb = pair.second; // Add the population frequency to the SNPInfo - this->snp_data_mtx.lock(); + // this->snp_data_mtx.lock(); snp_info.insertSNPPopulationFrequency(chr_snp, pos, pfb); - this->snp_data_mtx.unlock(); + // this->snp_data_mtx.unlock(); pfb_count++; // [TEST] Print 15 values diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 4f35c052..1e22b650 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -13,20 +13,22 @@ /// @endcond ContextSV::ContextSV(InputData& input_data) + : input_data(input_data) // Initialize the input data { - this->input_data = &input_data; } -// Entry point int ContextSV::run() { - FASTAQuery ref_genome = this->input_data->getRefGenome(); // Load the reference genome - SVCaller sv_caller(*this->input_data); // Create an SV caller object - SVData sv_calls = sv_caller.run(); // Run the SV caller - std::string output_dir = this->input_data->getOutputDir(); // Get the output directory + ReferenceGenome ref_genome = this->input_data.getRefGenome(); // Load the reference genome + SVCaller sv_caller(this->input_data); // Create an SV caller object + // SVCaller sv_caller(*this->input_data); // Create an SV caller object + // 
SVData sv_calls = sv_caller.run(); // Run the SV caller + std::unordered_map> sv_calls = sv_caller.run(); // Run the SV caller + // std::string output_dir = this->input_data->getOutputDir(); // Get the output directory - std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl; - sv_calls.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file + // std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl; + // sv_caller.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file + // sv_calls.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file std::cout << "SV calling complete." << std::endl; return 0; diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index e1bc9bea..ee220d1c 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -12,7 +12,7 @@ /// @endcond -int FASTAQuery::setFilepath(std::string fasta_filepath) +int ReferenceGenome::setFilepath(std::string fasta_filepath) { if (fasta_filepath == "") { @@ -92,13 +92,13 @@ int FASTAQuery::setFilepath(std::string fasta_filepath) return 0; } -std::string FASTAQuery::getFilepath() +std::string ReferenceGenome::getFilepath() const { return this->fasta_filepath; } // Function to get the reference sequence at a given position range -std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_end) +std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const { // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) pos_start--; @@ -110,15 +110,17 @@ std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_en { return ""; } - if (pos_end >= (int64_t)this->chr_to_seq[chr].length()) + // if (pos_end >= (uint32_t)this->chr_to_seq[chr].length()) + if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) { return ""; } - int64_t length = pos_end - pos_start + 1; + uint32_t length = pos_end - pos_start + 1; // 
Get the sequence - const std::string& sequence = this->chr_to_seq[chr]; + const std::string& sequence = this->chr_to_seq.at(chr); + // const std::string& sequence = this->chr_to_seq[chr]; // Get the substring // std::string subsequence = sequence.substr(pos_start, length); @@ -133,7 +135,7 @@ std::string FASTAQuery::query(std::string chr, int64_t pos_start, int64_t pos_en } // Function to get the chromosome contig lengths in VCF header format -std::string FASTAQuery::getContigHeader() +std::string ReferenceGenome::getContigHeader() const { std::string contig_header = ""; @@ -149,7 +151,8 @@ std::string FASTAQuery::getContigHeader() for (auto const& chr : chromosomes) { // Add the contig header line - contig_header += "##contig=\n"; + contig_header += "##contig=\n"; + // contig_header += "##contig=\n"; } // Remove the last newline character @@ -158,12 +161,12 @@ std::string FASTAQuery::getContigHeader() return contig_header; } -std::vector FASTAQuery::getChromosomes() +std::vector ReferenceGenome::getChromosomes() { return this->chromosomes; } -int64_t FASTAQuery::getChromosomeLength(std::string chr) +uint32_t ReferenceGenome::getChromosomeLength(std::string chr) { return this->chr_to_seq[chr].length(); } diff --git a/src/input_data.cpp b/src/input_data.cpp index 99a4cade..186e4617 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -90,7 +90,7 @@ void InputData::setRefGenome(std::string fasta_filepath) this->fasta_query.setFilepath(fasta_filepath); } -const FASTAQuery &InputData::getRefGenome() const +const ReferenceGenome& InputData::getRefGenome() const { return this->fasta_query; } diff --git a/src/main.cpp b/src/main.cpp index e1ac4c12..558e2493 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -10,12 +10,14 @@ // Placeholder for ContextSV library includes // #include "ContextSV.h" -void runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1) { +void 
runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1, const std::string& hmmFile = "", int windowSize = 2500, int minCNV = 2500, const std::string& eth = "", const std::string& pfbFile = "") +{ // Placeholder for setting up input data and running ContextSV std::cout << "Running ContextSV with the following files:" << std::endl; std::cout << "BAM file: " << bamFile << std::endl; std::cout << "Reference file: " << refFile << std::endl; std::cout << "VCF file: " << vcfFile << std::endl; + std::cout << "Thread count: " << threadCount << std::endl; std::cout << "Output directory: " << outputDir << std::endl; // Set up input data @@ -24,21 +26,23 @@ void runContextSV(const std::string& bamFile, const std::string& refFile, const input_data.setLongReadBam(bamFile); input_data.setRefGenome(refFile); input_data.setSNPFilepath(vcfFile); - input_data.setChromosome("21"); - input_data.setRegion("14486099-14515105"); - input_data.setThreadCount(1); - input_data.setAlleleFreqFilepaths(""); - input_data.setHMMFilepath(""); + //input_data.setChromosome("21"); + //input_data.setRegion("14486099-14515105"); + input_data.setThreadCount(threadCount); + input_data.setAlleleFreqFilepaths(pfbFile); + input_data.setHMMFilepath(hmmFile); input_data.setOutputDir(outputDir); - input_data.saveCNVData(true); + input_data.saveCNVData(false); input_data.setThreadCount(threadCount); + input_data.setWindowSize(windowSize); + input_data.setMinCNVLength(minCNV); // Run ContextSV run(input_data); } int main(int argc, char* argv[]) { - if (argc != 6) { + if (argc < 6) { std::cerr << "Usage: " << argv[0] << " " << std::endl; return 1; } @@ -48,8 +52,29 @@ int main(int argc, char* argv[]) { std::string vcfFile = argv[3]; std::string outputDir = argv[4]; int threadCount = std::stoi(argv[5]); + + std::string hmmFile = ""; + int windowSize = 2500; + int minCNV = 2500; + std::string eth = ""; + std::string pfbFile = 
""; + if (argc == 11) { + hmmFile = argv[6]; + windowSize = std::stoi(argv[7]); + minCNV = std::stoi(argv[8]); + eth = argv[9]; + pfbFile = argv[10]; + } + + runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, hmmFile, windowSize, minCNV, eth, pfbFile); + + //std::string hmmFile = argv[6]; + //int windowSize = std::stoi(argv[7]); + //int minCNV = std::stoi(argv[8]); + //std::string eth = argv[9]; + //std::string pfbFile = argv[10]; - runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount); + //runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, "", 2500, 2500, "", ""); return 0; } diff --git a/src/snp_info.cpp b/src/snp_info.cpp index 90045402..36efeb4b 100644 --- a/src/snp_info.cpp +++ b/src/snp_info.cpp @@ -52,7 +52,7 @@ void SNPInfo::insertSNPPopulationFrequency(std::string chr, int64_t pos, double std::tuple, std::vector, std::vector> SNPInfo::querySNPs(std::string chr, int64_t start, int64_t end) { // Lock the mutex for reading SNP information - std::lock_guard lock(this->snp_info_mtx); + // std::lock_guard lock(this->snp_info_mtx); chr = removeChrPrefix(chr); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3cc41aed..73b6cfea 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include "utils.h" #include "sv_types.h" @@ -24,16 +25,22 @@ # define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection +SVCaller::SVCaller(InputData &input_data) + : input_data(input_data) // Initialize the input data +{ +} + int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) { int ret = sam_itr_next(fp_in, itr, bam1); return ret; } -RegionData SVCaller::detectSVsFromRegion(std::string region) +// RegionData SVCaller::detectSVsFromRegion(std::string region) +std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(std::string region) { // Open the BAM file - std::string bam_filepath = this->input_data->getLongReadBam(); + 
std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (fp_in == NULL) { std::cerr << "ERROR: failed to open " << bam_filepath << std::endl; @@ -42,78 +49,87 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) // Load the header for the BAM file bam_hdr_t *bamHdr = sam_hdr_read(fp_in); - if (bamHdr == NULL) { - std::cerr << "ERROR: failed to read header for " << bam_filepath << std::endl; - exit(1); + if (!bamHdr) { + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to read header for " + bam_filepath); } // Load the index for the BAM file hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); - if (idx == NULL) { - std::cerr << "ERROR: failed to load index for " << bam_filepath << std::endl; - exit(1); + if (!idx) { + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); } // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); + if (!bam1) { + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to initialize BAM record"); + } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); + if (!itr) { + bam_destroy1(bam1); + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to query region " + region); + } // Main loop to process the alignments - SVData sv_calls; + // SVData sv_calls; + std::set sv_calls; int num_alignments = 0; PrimaryMap primary_alignments; SuppMap supplementary_alignments; while (readNextAlignment(fp_in, itr, bam1) >= 0) { - // Skip secondary and unmapped alignments, duplicates, and QC failures - if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL) { - // Do nothing - - // Skip alignments with low mapping quality - } else if (bam1->core.qual < this->min_mapq) { - // Do 
nothing - - } else { - std::string qname = bam_get_qname(bam1); // Query template name - - // Process primary alignments - if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { - - // Get the primary alignment information - std::string chr = bamHdr->target_name[bam1->core.tid]; - int64_t start = bam1->core.pos; - int64_t end = bam_endpos(bam1); // This is the first position after the alignment - bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - - // Call SVs directly from the CIGAR string - std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); - std::unordered_map match_map = std::get<0>(query_info); - int32_t query_start = std::get<1>(query_info); - int32_t query_end = std::get<2>(query_info); - - // Add the primary alignment to the map - AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand); - primary_alignments[qname] = std::move(alignment); - - // Process supplementary alignments - } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { - - // Get the supplementary alignment information - std::string chr = bamHdr->target_name[bam1->core.tid]; - int32_t start = bam1->core.pos; - int32_t end = bam_endpos(bam1); - bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - - // Get CIGAR string information, but don't call SVs - std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); - const std::unordered_map& match_map = std::get<0>(query_info); - int32_t query_start = std::get<1>(query_info); - int32_t query_end = std::get<2>(query_info); - - // Add the supplementary alignment to the map - AlignmentData alignment(chr, start, end, ".", query_start, query_end, std::move(match_map), fwd_strand); - supplementary_alignments[qname].emplace_back(alignment); - } + // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality + if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & 
BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { + continue; + } + const std::string qname = bam_get_qname(bam1); // Query template name + + // Process primary alignments + if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { + + // Get the primary alignment information + std::string chr = bamHdr->target_name[bam1->core.tid]; + int64_t start = bam1->core.pos; + int64_t end = bam_endpos(bam1); // This is the first position after the alignment + bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + + // Call SVs directly from the CIGAR string + std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); + std::unordered_map match_map = std::get<0>(query_info); + int32_t query_start = std::get<1>(query_info); + int32_t query_end = std::get<2>(query_info); + + // Add the primary alignment to the map + AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand); + primary_alignments[qname] = alignment; + + // Process supplementary alignments + } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { + + // Get the supplementary alignment information + std::string chr = bamHdr->target_name[bam1->core.tid]; + int32_t start = bam1->core.pos; + int32_t end = bam_endpos(bam1); + bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + + // Get CIGAR string information, but don't call SVs + std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); + const std::unordered_map& match_map = std::get<0>(query_info); + int32_t query_start = std::get<1>(query_info); + int32_t query_end = std::get<2>(query_info); + + // Add the supplementary alignment to the map + AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand); + supplementary_alignments[qname].emplace_back(alignment); } num_alignments++; @@ -121,12 +137,11 @@ RegionData SVCaller::detectSVsFromRegion(std::string region) hts_itr_destroy(itr); 
bam_destroy1(bam1); - sam_close(fp_in); - bam_hdr_destroy(bamHdr); hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); - // Return the SV calls and the primary and supplementary alignments - return std::make_tuple(std::move(sv_calls), std::move(primary_alignments), std::move(supplementary_alignments)); + return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); } double SVCaller::calculateMismatchRate(std::unordered_map &match_map, int32_t start, int32_t end) @@ -147,12 +162,7 @@ double SVCaller::calculateMismatchRate(std::unordered_map &match_map, return mismatch_rate; } -SVCaller::SVCaller(InputData &input_data) -{ - this->input_data = &input_data; -} - -std::tuple, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, SVData& sv_calls, bool is_primary) +std::tuple, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, bool is_primary) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name int32_t pos = alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) @@ -170,7 +180,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr int32_t query_start = 0; // First alignment position in the query int32_t query_end = 0; // Last alignment position in the query bool first_op = false; // First alignment operation for the query - double default_lh = std::numeric_limits::lowest(); // Default likelihood + double default_lh = 0.0; // double default_lh = std::numeric_limits::quiet_NaN(); // Default likelihood for (int i = 0; i < cigar_len; i++) { @@ -205,7 +215,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Get the string for the window (1-based coordinates) ins_ref_pos = j + 1; - std::string window_str = this->input_data->queryRefGenome(chr, ins_ref_pos, ins_ref_pos + op_len - 1); + std::string window_str = this->input_data.queryRefGenome(chr, ins_ref_pos, ins_ref_pos + op_len - 
1); // Continue if the window string is empty (out-of-range) if (window_str == "") { @@ -228,13 +238,31 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr } } + // Determine whether to use a symbolic allele (>50bp) or the + // actual sequence + if (op_len > 50) { + ins_seq_str = ""; + } else { + ins_seq_str = ins_seq_str; + } + // Add to SV calls (1-based) with the appropriate SV type ref_pos = pos+1; ref_end = ref_pos + op_len -1; if (is_duplication) { - sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, ins_seq_str, "CIGARDUP", "./.", default_lh); + // sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, + // ins_seq_str, "CIGARDUP", "./.", default_lh); + //printMessage("[TEST] FOUND CIGAR DUP"); + // sv_calls.insert(SVCall{(uint32_t)ref_pos, + // (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", + // default_lh}); + addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh); } else { - sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh); + // sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh); + // sv_calls.insert(SVCall{(uint32_t)ref_pos, + // (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", + // default_lh}); + addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh); } } @@ -246,13 +274,17 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", "CIGARDEL", "./.", default_lh); // Add to SV calls (1-based) + // sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", + // "CIGARDEL", "./.", default_lh); // Add to SV calls (1-based) + // sv_calls.insert(SVCall{(uint32_t)ref_pos, (uint32_t)ref_end, + // "DEL", ".", "CIGARDEL", "./.", default_lh}); + addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh); } // Check if the 
CIGAR operation is a clipped base } else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) { - sv_calls.updateClippedBaseSupport(chr, pos); // Update clipped base support + // sv_calls.updateClippedBaseSupport(chr, pos); // Update clipped base support // Update the query alignment start position if (!first_op) { @@ -280,7 +312,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Get the corresponding reference sequence int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - std::string cmatch_ref_str = this->input_data->queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); + std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); // Check that the two sequence lengths are equal if (cmatch_seq_str.length() != cmatch_ref_str.length()) { @@ -325,37 +357,42 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); } -SVData SVCaller::run() +std::unordered_map> SVCaller::run() { // Get the chromosomes to process std::vector chromosomes; - if (this->input_data->getChromosome() != "") { - chromosomes.push_back(this->input_data->getChromosome()); + if (this->input_data.getChromosome() != "") { + chromosomes.push_back(this->input_data.getChromosome()); } else { - chromosomes = this->input_data->getRefGenomeChromosomes(); + chromosomes = this->input_data.getRefGenomeChromosomes(); } // [TEST] Only process the last N chromosomes - // last_n = 10; + // int last_n = 3; // chromosomes = std::vector(chromosomes.end()-last_n, chromosomes.end()); - // chromosomes = std::vector(chromosomes.end()-3, chromosomes.end()); + // std::cout << "[DEBUG] Running last " << last_n << " chromosomes" << std::endl; + // //chromosomes = std::vector(chromosomes.end()-3, chromosomes.end()); // Loop through each region and detect SVs in chunks int chr_count = chromosomes.size(); int current_chr = 0; std::cout << "Detecting SVs from " << chr_count << " 
chromosome(s)..." << std::endl; int chunk_count = 100; // Number of chunks to split the chromosome into - SVData sv_calls; - int min_cnv_length = this->input_data->getMinCNVLength(); + // SVData sv_calls; + // std::vector> sv_calls; + // std::unordered_map> sv_calls; + uint32_t total_sv_count = 0; + std::unordered_map> whole_genome_sv_calls; + int min_cnv_length = this->input_data.getMinCNVLength(); for (const auto& chr : chromosomes) { std::cout << "Running SV detection for chromosome " << chr << "..." << std::endl; // Split the chromosome into chunks std::vector region_chunks; - if (this->input_data->isRegionSet()) { + if (this->input_data.isRegionSet()) { // Use one chunk for the specified region - std::pair region = this->input_data->getRegion(); + std::pair region = this->input_data.getRegion(); int region_start = region.first; int region_end = region.second; std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); @@ -363,7 +400,7 @@ SVData SVCaller::run() std::cout << "Using specified region " << chunk << "..." << std::endl; } else { - int chr_len = this->input_data->getRefGenomeChromosomeLength(chr); + int chr_len = this->input_data.getRefGenomeChromosomeLength(chr); int chunk_size = std::ceil((double)chr_len / chunk_count); for (int i = 0; i < chunk_count; i++) { int start = i * chunk_size + 1; // 1-based @@ -379,58 +416,86 @@ SVData SVCaller::run() // Load chromosome data for copy number predictions std::cout << "Loading chromosome data for copy number predictions..." << std::endl; - CNVCaller cnv_caller(*this->input_data); + CNVCaller cnv_caller(this->input_data); cnv_caller.loadChromosomeData(chr); // Process each chunk one at a time std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." 
<< std::endl; int region_count = region_chunks.size(); int current_region = 0; + std::set combined_sv_calls; for (const auto& sub_region : region_chunks) { // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl; - RegionData region_data = this->detectSVsFromRegion(sub_region); - SVData& sv_calls_region = std::get<0>(region_data); + std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region); + std::set& subregion_sv_calls = std::get<0>(region_data); PrimaryMap& primary_map = std::get<1>(region_data); SuppMap& supp_map = std::get<2>(region_data); - int region_sv_count = sv_calls_region.totalCalls(); - if (region_sv_count > 0) { - std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl; - } + // SVData& subregion_sv_calls = std::get<0>(region_data); + // PrimaryMap& primary_map = std::get<1>(region_data); + // SuppMap& supp_map = std::get<2>(region_data); + // int region_sv_count = subregion_sv_calls.totalCalls(); + // if (region_sv_count > 0) { + // std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." << std::endl; + // } + // int region_sv_count = subregion_sv_calls.count(); + int region_sv_count = getSVCount(subregion_sv_calls); + printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold // std::cout << "Detecting copy number variants from CIGAR string SVs..." << std::endl; - std::map& cigar_svs = sv_calls_region.getChromosomeSVs(chr); - if (cigar_svs.size() > 0) { + // std::map& cigar_svs = subregion_sv_calls.getChromosomeSVs(chr); + // if (cigar_svs.size() > 0) { + if (region_sv_count > 0) { std::cout << "Running copy number variant detection from CIGAR string SVs..." 
<< std::endl; - cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, min_cnv_length); + // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, + // min_cnv_length); + cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length); } - // Run split-read SV detection in a single thread, combined with - // copy number variant predictions + // Run split-read SV and copy number variant predictions std::cout << "Detecting copy number variants from split reads..." << std::endl; - this->detectSVsFromSplitReads(sv_calls_region, primary_map, supp_map, cnv_caller); - sv_calls.concatenate(sv_calls_region); // Add the calls to the main set + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller); + // sv_calls.concatenate(subregion_sv_calls); // Add the calls to the + // main set + // sv_calls.emplace_back(subregion_sv_calls); + + // Combine the SV calls from the current region + std::cout << "Combining SV calls from " << sub_region << "..." << std::endl; + concatenateSVCalls(combined_sv_calls, subregion_sv_calls); std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl; + + // [TEST] Break after the first region + // std::cout << "[DEBUG] Breaking after the first region" << std::endl; + // break; } std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." 
<< std::endl; + int chr_sv_count = getSVCount(combined_sv_calls); + whole_genome_sv_calls[chr] = combined_sv_calls; + std::cout << "Total SVs detected for chromosome " << chr << ": " << chr_sv_count << std::endl; + total_sv_count += chr_sv_count; + std::cout << "Cumulative total SVs: " << total_sv_count << std::endl; // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl; } - + + // SVData sv_calls_combined; + // for (const auto& subregion_sv_calls : sv_calls) { + // sv_calls_combined.concatenate(subregion_sv_calls); + // } std::cout << "SV calling completed." << std::endl; - return sv_calls; + return whole_genome_sv_calls; } // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller) +void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller) { // Find split-read SV evidence int sv_count = 0; - int min_cnv_length = this->input_data->getMinCNVLength(); + int min_cnv_length = this->input_data.getMinCNVLength(); for (const auto& entry : primary_map) { std::string qname = entry.first; AlignmentData primary_alignment = entry.second; @@ -688,7 +753,7 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Continue if unknown SV type if (chosen_type == SVType::UNKNOWN) { - std::cerr << "ERROR: Unknown SV type" << std::endl; + // std::cerr << "ERROR: Unknown SV type" << std::endl; continue; } @@ -820,7 +885,9 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map } // Add the best split alignment as the SV call - sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", "SPLITREAD", "./.", best_split_aln_lh_norm); + // sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", + // "SPLITREAD", "./.", best_split_aln_lh_norm); + std::string sv_type_str = getSVTypeString(best_supp_type); 
sv_count++; } else { // Resolve complex SVs @@ -851,14 +918,18 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str; // Add the complex SV call - sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); + addSVCall(sv_calls, (uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm); + // sv_calls.insert(SVCall{(uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm}); + // sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); sv_count++; } else { // [primary] -- [supp_start] -- [supp_end] std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str; // Add the complex SV call - sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); + addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm); + // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm}); + // sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); sv_count++; } } else { @@ -929,7 +1000,11 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map // Add the complex SV call if not empty if (complex_sv_type_str != "") { std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl; - sv_calls.add(primary_chr, primary_start, std::get<2>(largest_supp_alignment), 
SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); + // sv_calls.add(primary_chr, primary_start, + // std::get<2>(largest_supp_alignment), SVType::COMPLEX, + // ".", complex_sv_type_str, "./.", complex_lh_norm); + // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm}); + addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm); sv_count++; } } @@ -942,3 +1017,215 @@ void SVCaller::detectSVsFromSplitReads(SVData& sv_calls, PrimaryMap& primary_map std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl; } } + +void SVCaller::saveToVCF(const std::unordered_map >& sv_calls, const ReferenceGenome& ref_genome) +{ + std::cout << "Creating VCF writer..." << std::endl; + // std::string output_vcf = output_dir + "/output.vcf"; + std::string output_vcf = this->input_data.getOutputDir() + "/output.vcf"; + std::cout << "Writing VCF file to " << output_vcf << std::endl; + std::ofstream vcf_stream(output_vcf); + if (!vcf_stream.is_open()) { + throw std::runtime_error("Failed to open VCF file for writing."); + } + std::string sample_name = "SAMPLE"; + + std::cout << "Getting reference genome filepath..." << std::endl; + try { + std::string ref_fp = ref_genome.getFilepath(); + std::cout << "Reference genome filepath: " << ref_fp << std::endl; + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return; + } + + std::cout << "Getting reference genome header..." 
<< std::endl; + try { + ref_genome.getContigHeader(); + } catch (const std::exception& e) { + std::cerr << "Error: " << e.what() << std::endl; + return; + } + + // Set the header lines + std::vector header_lines = { + std::string("##reference=") + ref_genome.getFilepath(), + ref_genome.getContigHeader(), + "##INFO=", + "##INFO=", + "##INFO=", + "##INFO=", + "##INFO=", + "##INFO=", + "##INFO=", + "##INFO=", + "##INFO=", + "##FILTER=", + "##FILTER=", + "##FORMAT=", + "##FORMAT=" + }; + + std::cout << "Writing VCF header..." << std::endl; + + // Add the file format + std::string file_format = "##fileformat=VCFv4.2"; + vcf_stream << file_format << std::endl; + + // Add date and time + time_t rawtime; + struct tm * timeinfo; + char buffer[80]; + time (&rawtime); + timeinfo = localtime(&rawtime); + strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo); + vcf_stream << "##fileDate=" << buffer << std::endl; + + // Add source + std::string source = "##source=ContexSV"; + vcf_stream << source << std::endl; + + // Loop over the header metadata lines + for (const auto &line : header_lines) { + vcf_stream << line << std::endl; + } + + // Add the header line + std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE"; + vcf_stream << header_line << std::endl; + + // Flush the stream to ensure that the header is written + //this->file_stream.flush(); + + std::cout << "Saving SV calls to " << output_vcf << std::endl; + std::string sv_method = "CONTEXTSVv0.1"; + int skip_count = 0; + int total_count = 0; + // std::set chrs = this->getChromosomes(); + //for (auto const& chr : chrs) { + for (const auto& pair : sv_calls) { + // if (this->sv_calls.find(chr) == this->sv_calls.end()) { + // continue; + // } + std::string chr = pair.first; + const std::set& sv_calls = pair.second; + std::cout << "Saving SV calls for " << chr << "..." 
<< std::endl; + // for (auto const& sv_call : this->sv_calls[chr]) { + for (const auto& sv_call : sv_calls) { + // Get the SV candidate and SV info + uint32_t start = sv_call.start; + uint32_t end = sv_call.end; + std::string sv_type_str = sv_call.sv_type; + std::string genotype = sv_call.genotype; + std::string data_type_str = sv_call.data_type; + std::string alt_allele = sv_call.alt_allele; + double hmm_likelihood = sv_call.hmm_likelihood; + int sv_length = end - start; + if (sv_type_str == "DEL") { + sv_length++; + } + int read_support = sv_call.support; + int read_depth = 0; + // SVType sv_type = sv_call.sv_type; + // SVCandidate candidate = sv_call.first; + // SVInfo info = sv_call.second; + // SVType sv_type = info.sv_type; + // int read_support = info.read_support; + // int read_depth = info.read_depth; + // int read_depth = 0; + // int read_support = 0; + // int sv_length = info.sv_length; + // std::set data_type = info.data_type; + // std::string genotype = info.genotype; + // double hmm_likelihood = info.hmm_likelihood; + + // Convert the data type set to a string + // std::string data_type_str = ""; + // for (auto const& type : data_type) { + // data_type_str += type + ","; + // } + + // Get the CHROM, POS, END, and ALT + // uint32_t pos = std::get<0>(candidate); + // uint32_t end = std::get<1>(candidate); + + // If the SV type is unknown, skip it + if (sv_type_str == "UNKNOWN" || sv_type_str == "NEUTRAL") { + skip_count += 1; + continue; + } else { + total_count += 1; + } + + // Process by SV type + std::string ref_allele = "."; + // std::string alt_allele = "."; + std::string repeat_type = "NA"; + + // Deletion + if (sv_type_str == "DEL") { + // Get the deleted sequence from the reference genome, also including the preceding base + int64_t preceding_pos = (int64_t) std::max(1, (int) start-1); // Make sure the position is not negative + ref_allele = ref_genome.query(chr, preceding_pos, end); + + // Use the preceding base as the alternate allele + if 
(ref_allele != "") { + alt_allele = ref_allele.at(0); + } else { + alt_allele = ""; // Symbolic allele + std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << start << "-" << end << std::endl; + } + + sv_length = -1 * sv_length; // Negative length for deletions + + start = preceding_pos; // Update the position to the preceding base + + // Other types (duplications, insertions, inversions) + } else { + // Use the preceding base as the reference allele + int64_t preceding_pos = (int64_t) std::max(1, (int) start-1); // Make sure the position is not negative + ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); + + // Format novel insertions + if (sv_type_str == "INS") { + // Check if in symbolic form + if (alt_allele != "") { + // Use the insertion sequence as the alternate allele + // alt_allele = std::get<2>(candidate); + alt_allele.insert(0, ref_allele); + } + start = preceding_pos; // Update the position to the preceding base + + // Update the end position to the start position to change from + // query to reference coordinates for insertions + end = start; + } + } + + // Create the VCF parameter strings + // int clipped_base_support = this->getClippedBaseSupport(chr, pos, + // end); + int clipped_base_support = 0; + // std::string sv_type_str = getSVTypeString(sv_type); + std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ + ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \ + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \ + ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood); + + std::string format_str = "GT:DP"; + std::string sample_str = genotype + ":" + std::to_string(read_depth); + std::vector samples = {sample_str}; + + // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES) + vcf_stream << chr << "\t" << start << "\t" << "." 
<< "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl; + if (total_count % 1000 == 0) + { + std::cout << "Wrote SV at " << chr << ": " << start << ", total=" << total_count << std::endl; + } + } + } + + // Print the number of SV calls skipped + std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl; +} + diff --git a/src/sv_data.cpp b/src/sv_data.cpp index a0611c8d..d2dfd605 100644 --- a/src/sv_data.cpp +++ b/src/sv_data.cpp @@ -139,7 +139,7 @@ int SVData::getClippedBaseSupport(std::string chr, int64_t pos, int64_t end) return clipped_base_support; } -void SVData::saveToVCF(FASTAQuery& ref_genome, std::string output_dir) +void SVData::saveToVCF(ReferenceGenome& ref_genome, std::string output_dir) { // Create a VCF writer std::cout << "Creating VCF writer..." << std::endl; diff --git a/src/sv_object.cpp b/src/sv_object.cpp new file mode 100644 index 00000000..15aefdfd --- /dev/null +++ b/src/sv_object.cpp @@ -0,0 +1,126 @@ +#include "sv_object.h" +#include "sv_object.h" +#include +#include +#include +#include +#include +#include + +bool SVCall::operator<(const SVCall & other) const +{ + return std::tie(start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood) < + std::tie(other.start, other.end, other.sv_type, other.alt_allele, other.data_type, other.genotype, other.hmm_likelihood); +} + +void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) +{ + // Throw an error if unknown SV type + if (sv_type == "UNKNOWN") { + throw std::runtime_error("ERROR: Cannot add unknown SV type"); + } + + if (start >= end) { + throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); + } + + // If the SV call 
already exists (start and end position), then update all information if the + // likelihood is higher + // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; + std::vector updates; + bool print_out = false; + for (auto it = sv_calls.begin(); it != sv_calls.end();) + { + if (it->start == start && it->end == end) + { + if (hmm_likelihood > it->hmm_likelihood) + { + std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; + print_out = true; + // Update the data type and support + std::string new_data_type = it->data_type + "," + data_type; + int new_support = it->support + 1; + + updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); + + // Erase and re-insert the SV call + // Erase the current iterator and safely insert the new SV call + // sv_calls.erase(it); + it = sv_calls.erase(it); // Erase and get the next iterator + // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); + } else { + // Return if no update is needed + return; + } + } else { + // Increment the iterator if the SV call does not match + ++it; + } + } + + if (print_out) + { + std::cout << "[DEBUG] Adding updates" << std::endl; + } + + // Insert the updates + for (const auto& update : updates) + { + sv_calls.insert(update); + } + + if (print_out) + { + std::cout << "[DEBUG] Added updates" << std::endl; + } + + + // Add the SV call if it does not exist + // std::cout << "[TEST2] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; + // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, 
hmm_likelihood, 1}); + // std::cout << "[TEST3] Added SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; +} + +std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count) +{ + // Split the SV calls into chunks + std::vector> sv_chunks; + int sv_count = (int) sv_calls.size(); + int chunk_size = std::ceil((double) sv_count / (double) chunk_count); + int current_chunk = 0; + std::set current_sv_chunk; + for (const auto& sv_call : sv_calls) + { + current_sv_chunk.insert(sv_call); + + // If the current chunk size is reached, then add the chunk to the + // vector and reset the current chunk + if ((int) current_sv_chunk.size() == chunk_size) + { + // sv_chunks.insert(current_sv_chunk); + sv_chunks.push_back(current_sv_chunk); + current_sv_chunk.clear(); + current_chunk++; + } + } + + // Add the last chunk if it is not empty + if (!current_sv_chunk.empty()) + { + sv_chunks.push_back(current_sv_chunk); + // sv_chunks.insert(current_sv_chunk); + } + + return sv_chunks; +} + +uint32_t getSVCount(const std::set& sv_calls) +{ + return (uint32_t) sv_calls.size(); +} + +void concatenateSVCalls(std::set &target, const std::set &source) +{ + // Efficiently concatenate two sets of SV calls + target.insert(source.begin(), source.end()); +} From d5d7dcdc0e0c9116679005a0c1092deabb884b01 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 23 Nov 2024 12:52:30 -0500 Subject: [PATCH 019/134] Add merging and efficiency updates --- include/cnv_caller.h | 17 +- include/input_data.h | 2 +- include/snp_info.h | 14 +- include/sv_caller.h | 4 +- include/sv_object.h | 2 + include/utils.h | 2 + python/sv_merger.py | 2 +- src/cnv_caller.cpp | 511 ++++++++++++++++++++++-------------------- src/input_data.cpp | 2 +- src/snp_info.cpp | 44 ++-- src/sv_caller.cpp | 122 ++++++---- src/sv_object.cpp | 124 ++++++++-- src/utils.cpp | 11 +- tests/test_general.py | 4 +- 14 files changed, 513 
insertions(+), 348 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 22a14cc9..858c7454 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -27,7 +27,7 @@ using namespace sv_types; // SNP data is a struct containing vectors used in predicting copy number // states. It is sorted by SNP position. struct SNPData { - std::vector pos; + std::vector pos; std::vector pfb; std::vector baf; std::vector log2_cov; @@ -57,6 +57,9 @@ class CNVCaller { SNPInfo snp_info; double mean_chr_cov = 0.0; std::unordered_map pos_depth_map; + std::unordered_map snp_baf_map; + std::set snp_baf_keys; + std::unordered_map snp_pfb_map; // Define a map of CNV genotypes by HMM predicted state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. @@ -79,22 +82,24 @@ class CNVCaller { {6, "1/1"} }; - void updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp); + void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); std::pair, double> runViterbi(CHMM hmm, SNPData &snp_data); // Query a region for SNPs and return the SNP data - std::pair querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo &snp_info, std::unordered_map &pos_depth_map, double mean_chr_cov); + std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map &pos_depth_map, double mean_chr_cov); + + std::tuple, std::vector, std::vector> querySNPs(std::string chr, uint32_t start, uint32_t end); // Run copy number prediction for a chunk of SV candidates from CIGAR strings - void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, SNPInfo& snp_info, CHMM hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map); + void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov); void updateSVCopyNumber(std::map& 
sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood); void updateDPValue(std::map& sv_candidates, SVCandidate key, int dp_value); // Split a region into chunks for parallel processing - std::vector splitRegionIntoChunks(std::string chr, int64_t start_pos, int64_t end_pos, int chunk_count); + std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count); // Split SV candidates into chunks for parallel processing std::vector> splitSVCandidatesIntoChunks(std::map& sv_candidates, int chunk_count); @@ -131,7 +136,7 @@ class CNVCaller { void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info); // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions - void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood); + void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood); }; #endif // CNV_CALLER_H diff --git a/include/input_data.h b/include/input_data.h index 0a74125f..718b5264 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -43,7 +43,7 @@ class InputData { // FASTAQuery getRefGenome(); // Query the reference genome for a sequence. - std::string queryRefGenome(std::string chr, int64_t pos_start, int64_t pos_end); + std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; // Get the chromosomes in the reference genome. 
std::vector getRefGenomeChromosomes(); diff --git a/include/snp_info.h b/include/snp_info.h index 0b57a629..51278951 100644 --- a/include/snp_info.h +++ b/include/snp_info.h @@ -11,30 +11,30 @@ // Define the comparator for the binary search tree by SNP position (first // element of tuple) struct SNPCompare { - bool operator()(const std::tuple& a, const std::tuple& b) const { + bool operator()(const std::tuple& a, const std::tuple& b) const { return std::get<0>(a) < std::get<0>(b); } }; // Define the data structure for SNP frequencies sorted by position -using BST = std::set, SNPCompare>; +using BST = std::set, SNPCompare>; class SNPInfo { public: SNPInfo() {} // Insert a SNP into the map with its position and B-allele frequency - void insertSNPAlleleFrequency(std::string chr, int64_t pos, double baf); + void insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf); // Insert a SNP into the map with its position and population frequency of // the B allele - void insertSNPPopulationFrequency(std::string chr, int64_t pos, double pfb); + void insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb); // Query SNPs within a range (start, end) and return their BAF and PFB values - std::tuple, std::vector, std::vector> querySNPs(std::string chr, int64_t start, int64_t end); + std::tuple, std::vector, std::vector> querySNPs(std::string chr, uint32_t start, uint32_t end); // Get the range of SNP positions for a given chromosome - std::pair getSNPRange(std::string chr); + std::pair getSNPRange(std::string chr); private: @@ -45,7 +45,7 @@ class SNPInfo { std::unordered_map snp_baf_map; // Define the map of chromosome to SNP population frequency - std::unordered_map> snp_pfb_map; + std::unordered_map> snp_pfb_map; }; #endif // SNP_INFO_H diff --git a/include/sv_caller.h b/include/sv_caller.h index 371cc53d..184248d7 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -41,7 +41,7 @@ class SVCaller { // Detect SVs at a region from 
long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); - std::tuple, PrimaryMap, SuppMap> detectCIGARSVs(std::string region); + std::tuple, PrimaryMap, SuppMap> detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); @@ -54,7 +54,7 @@ class SVCaller { // sequence double calculateMismatchRate(std::unordered_map& mismatch_map, int32_t start, int32_t end); - void saveToVCF(const std::unordered_map>& sv_calls, const ReferenceGenome& ref_genome); + void saveToVCF(const std::unordered_map>& sv_calls); public: explicit SVCaller(InputData& input_data); diff --git a/include/sv_object.h b/include/sv_object.h index 3128dfe1..fb52691e 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -28,6 +28,8 @@ struct SVCall { void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); +void mergeSVs(std::set& sv_calls); + std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count); uint32_t getSVCount(const std::set& sv_calls); diff --git a/include/utils.h b/include/utils.h index 41efb411..4ec19138 100644 --- a/include/utils.h +++ b/include/utils.h @@ -23,4 +23,6 @@ void printError(std::string message); std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start, std::chrono::high_resolution_clock::time_point end); +std::string removeChrPrefix(std::string chr); + #endif // UTILS_H diff --git a/python/sv_merger.py b/python/sv_merger.py index 4254e0ad..78733f6d 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -153,7 +153,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # Merge SVs with the same label unique_labels = np.unique(cluster_labels) - # 
logging.info("Unique labels: %s", unique_labels) + logging.info("Unique labels: %s", unique_labels) for label in unique_labels: diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 55e363ae..e8428ec5 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -50,26 +50,29 @@ std::pair, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp } // Function to obtain SNP information for a region -std::pair CNVCaller::querySNPRegion(std::string chr, int64_t start_pos, int64_t end_pos, SNPInfo& snp_info, std::unordered_map& pos_depth_map, double mean_chr_cov) +std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo& snp_info, std::unordered_map& pos_depth_map, double mean_chr_cov) { SNPData snp_data; bool snps_found = false; - int window_size = this->input_data.getWindowSize(); + uint32_t window_size = (uint32_t)this->input_data.getWindowSize(); // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); - for (int64_t i = start_pos; i <= end_pos; i += window_size) + for (uint32_t i = start_pos; i <= end_pos; i += window_size) { // Run a sliding non-overlapping window of size window_size across // the SV region and calculate the log2 ratio for each window - int64_t window_start = i; - int64_t window_end = std::min(i + window_size - 1, end_pos); + uint32_t window_start = i; + uint32_t window_end = std::min(i + window_size - 1, end_pos); // Get the SNP info for the window // std::cout << "Querying SNPs for window " << chr << ":" << window_start << "-" << window_end << "..." 
<< std::endl; // this->snp_data_mtx.lock(); - std::tuple, std::vector, std::vector> window_snps = snp_info.querySNPs(chr, window_start, window_end); + // std::tuple, std::vector, + // std::vector> window_snps = snp_info.querySNPs(chr, + // window_start, window_end); + std::tuple, std::vector, std::vector> window_snps = this->querySNPs(chr, window_start, window_end); // this->snp_data_mtx.unlock(); - std::vector& snp_window_pos = std::get<0>(window_snps); // SNP positions + std::vector& snp_window_pos = std::get<0>(window_snps); // SNP positions std::vector& snp_window_bafs = std::get<1>(window_snps); // B-allele frequencies std::vector& snp_window_pfbs = std::get<2>(window_snps); // Population frequencies of the B allele @@ -93,8 +96,8 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star snps_found = true; // Loop through the SNPs and calculate the log2 ratios - int64_t bin_start = window_start; - int64_t bin_end = 0; + uint32_t bin_start = window_start; + uint32_t bin_end = 0; for (int j = 0; j < snp_count; j++) { // SNP bin starts at 1/2 the distance between the previous SNP @@ -104,7 +107,7 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star // between the first SNP and the next SNP, and for the last // SNP, the bin starts at 1/2 the distance between the previous // SNP and the last SNP and ends at the window end. - int64_t snp_pos = snp_window_pos[j]; + uint32_t snp_pos = snp_window_pos[j]; bin_end = snp_pos + (j == snp_count-1 ? 
(window_end - snp_pos) / 2 : (snp_window_pos[j+1] - snp_pos) / 2); // Calculate the log2 ratio for the SNP bin @@ -123,14 +126,17 @@ std::pair CNVCaller::querySNPRegion(std::string chr, int64_t star std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate) { // Get the start and end positions of the SV call - int64_t start_pos = std::get<0>(candidate); - int64_t end_pos = std::get<1>(candidate); + uint32_t start_pos = std::get<0>(candidate); + uint32_t end_pos = std::get<1>(candidate); // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 // the SV length - int64_t sv_length = (end_pos - start_pos) / 2.0; - int64_t snp_start_pos = std::max((int64_t) 1, start_pos - sv_length); - int64_t snp_end_pos = end_pos + sv_length; + uint32_t sv_half_length = (end_pos - start_pos) / 2.0; + // uint32_t snp_start_pos = std::max((uint32_t)1, start_pos - sv_length); + // Prevent underflow (start_pos - sv_length) if start_pos < sv_length + uint32_t snp_start_pos = start_pos > sv_half_length ? 
start_pos - sv_half_length : 1; + uint32_t snp_end_pos = end_pos + sv_half_length; + // std::cout << "CNP for " << chr << ":" << start_pos << "-" << end_pos << "(" << snp_start_pos << ", " << snp_end_pos << ")" << std::endl; // printMessage("Running copy number prediction for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " with SNP region " + chr + ":" + std::to_string(snp_start_pos) + "-" + std::to_string(snp_end_pos) + "..."); // Query the SNP region for the SV candidate @@ -138,6 +144,13 @@ std::tuple CNVCaller::runCopyNumberPrediction SNPData& sv_snps = snp_call.first; bool sv_snps_found = snp_call.second; + /* + if (sv_snps.pos.size() == 0) { + std::cerr << "ERROR [2]: No windows for SV " << chr << ":" << std::to_string((int)start_pos) << "-" << std::to_string((int)end_pos) << " (" << snp_start_pos << "," << snp_end_pos << std::endl; + continue; + } + */ + // Run the Viterbi algorithm // printMessage("[TEST] Running Viterbi algorithm for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); std::pair, double> prediction = runViterbi(this->hmm, sv_snps); @@ -211,7 +224,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set & // cnv_type_counts[i] = 0; // } - runCIGARCopyNumberPredictionChunk(chr, sv_candidates, this->snp_info, hmm, window_size, mean_chr_cov, this->pos_depth_map); + runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov); // // Split the SV candidates into chunks for each thread // int chunk_count = this->input_data.getThreadCount(); // // std::vector> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count); @@ -248,7 +261,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set & printMessage("Finished predicting copy number states for chromosome " + chr + "..."); } -void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, SNPInfo& snp_info, CHMM 
hmm, int window_size, double mean_chr_cov, std::unordered_map& pos_depth_map) +void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov) { // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "..."); // Map with counts for each CNV type @@ -261,19 +274,29 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set(candidate); // int64_t end_pos = std::get<1>(candidate); uint32_t start_pos = sv_call.start; uint32_t end_pos = sv_call.end; + + // Error if start > end + if (start_pos >= end_pos) + { + std::cerr << "Position error for CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl; + continue; + } // Skip if not the minimum length for CNV predictions - if ((int)(end_pos - start_pos) < this->input_data.getMinCNVLength()) + if ((end_pos - start_pos) < (uint32_t)this->input_data.getMinCNVLength()) { continue; } + std::cout << "CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl; + // Get the depth at the start position. This is used as the FORMAT/DP // value in the VCF file // int dp_value = pos_depth_map[start_pos]; @@ -281,18 +304,26 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set sv_half_length ? 
start_pos - sv_half_length : 1; + uint32_t snp_end_pos = end_pos + sv_half_length; + // std::cout << "CIGAR sv_half_length:" << sv_half_length << std::endl; + // std::cout << "CIGAR SV query at " << chr << ":" << query_start << "-" << query_end << std::endl; // printMessage("Querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", qstart = " + std::to_string(query_start) + ", qend = " + std::to_string(query_end)); - std::pair snp_call = this->querySNPRegion(chr, query_start, query_end, snp_info, pos_depth_map, mean_chr_cov); + std::pair snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, snp_info, this->pos_depth_map, mean_chr_cov); // printMessage("Finished querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); SNPData& sv_snps = snp_call.first; bool snps_found = snp_call.second; // Run the Viterbi algorithm // printMessage("[TEST2] Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); + + if (sv_snps.pos.size() == 0) { + std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl; + continue; + } + std::pair, double> prediction = runViterbi(hmm, sv_snps); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -411,14 +442,14 @@ void CNVCaller::updateDPValue(std::map& sv_candidates, SVCan sv_candidates[key].read_depth = dp_value; } -std::vector CNVCaller::splitRegionIntoChunks(std::string chr, int64_t start_pos, int64_t end_pos, int chunk_count) +std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) { // Split the region into chunks std::vector region_chunks; - int64_t region_length = end_pos - start_pos + 1; - int64_t chunk_size = std::ceil((double) region_length / (double) chunk_count); - 
int64_t chunk_start = start_pos; - int64_t chunk_end = 0; + uint32_t region_length = end_pos - start_pos + 1; + uint32_t chunk_size = std::ceil((double) region_length / (double) chunk_count); + uint32_t chunk_start = start_pos; + uint32_t chunk_end = 0; for (int i = 0; i < chunk_count; i++) { chunk_end = chunk_start + chunk_size - 1; @@ -474,9 +505,9 @@ void CNVCaller::loadChromosomeData(std::string chr) this->hmm = ReadCHMM(hmm_filepath.c_str()); printMessage("Calculating mean chromosome coverage for " + chr + "..."); - mean_chr_cov = calculateMeanChromosomeCoverage(chr); + this->mean_chr_cov = calculateMeanChromosomeCoverage(chr); + //this->mean_chr_cov = 30.0; printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); - this->mean_chr_cov = mean_chr_cov; std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl; std::string snp_filepath = this->input_data.getSNPFilepath(); @@ -490,124 +521,109 @@ void CNVCaller::loadChromosomeData(std::string chr) // Calculate the mean chromosome coverage double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) { + // Open the BAM file + std::string bam_filepath = this->input_data.getShortReadBam(); + samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); + if (!bam_file) + { + throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath); + } - bool test=true; - if (test) { - return 30.0; - } + // Read the header + bam_hdr_t *bam_header = sam_hdr_read(bam_file); + if (!bam_header) + { + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath); + } - // Use a maximum of 8 threads to avoid overloading the system with too many - // parallel processes - int num_threads = this->input_data.getThreadCount(); - if (num_threads > 8) + // Load the index + hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str()); + if (!bam_index) { - num_threads = 8; + 
bam_hdr_destroy(bam_header); + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath); } - // Split the chromosome into equal parts for each thread - uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr); - if (chr_len == 0) + // Create an iterator for the chromosome + hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str()); + if (!bam_iter) { - printError("ERROR: Chromosome length is zero for: " + chr); - return 0.0; + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr); } - std::vector region_chunks = splitRegionIntoChunks(chr, 1, chr_len, num_threads); - if (region_chunks.empty()) + + // Initialize the record + bam1_t *bam_record = bam_init1(); + if (!bam_record) { - printError("ERROR: Failed to split chromosome into regions."); - return 0.0; + hts_itr_destroy(bam_iter); + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not initialize BAM record."); } - // Calculate the mean chromosome coverage in parallel - uint32_t pos_count = 0; - uint64_t cum_depth = 0; - std::vector>>> futures; - std::string input_filepath = this->input_data.getShortReadBam(); - for (const auto& region_chunk : region_chunks) + // Iterate through the chromosome and update the depth map + std::unordered_map chr_pos_depth_map; + while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) { - // Create a lambda function to get the mean chromosome coverage for the - // region chunk - auto get_mean_chr_cov = [region_chunk, input_filepath]() -> std::tuple> + + // Parse the CIGAR string to get the depth (match, sequence match, and + // mismatch) + // uint32_t depth = 0; + uint32_t pos = bam_record->core.pos + 1; // 0-based to 1-based + uint32_t ref_pos = pos; + uint32_t cigar_len = bam_record->core.n_cigar; + uint32_t *cigar = 
bam_get_cigar(bam_record); + for (uint32_t i = 0; i < cigar_len; i++) { - // Run samtools depth on the entire region, and print positions and - // depths (not chromosome) - size_t cmd_size = input_filepath.size() + 256; - std::vector cmd(cmd_size); - snprintf(cmd.data(), cmd_size,\ - "samtools depth -r %s %s | awk '{print $2, $3}'",\ - region_chunk.c_str(), input_filepath.c_str()); - - // Open a pipe to read the output of the command - FILE *fp = popen(cmd.data(), "r"); - if (fp == NULL) + uint32_t op = bam_cigar_op(cigar[i]); + uint32_t op_len = bam_cigar_oplen(cigar[i]); + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - throw std::runtime_error("ERROR: Could not open pipe for command: " + std::string(cmd.data())); - } - - // Parse the outputs (position and depth) - std::unordered_map pos_depth_map; - const int line_size = 1024; - char line[line_size]; - uint32_t pos; - int depth; - uint32_t pos_count = 0; - uint64_t cum_depth = 0; - while (fgets(line, line_size, fp) != NULL) - { - if (sscanf(line, "%u%d", &pos, &depth) == 2) + // Update the depth for each position in the alignment + for (uint32_t j = 0; j < op_len; j++) { - pos_depth_map[pos] = depth; - pos_count++; - cum_depth += depth; + chr_pos_depth_map[ref_pos + j]++; } } - // Check if pclose fails - if (pclose(fp) == -1) - { - throw std::runtime_error("ERROR: Failed to close pipe for command: " + std::string(cmd.data())); + // Update the reference coordinate based on the CIGAR operation + // https://samtools.github.io/hts-specs/SAMv1.pdf + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + ref_pos += op_len; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { + // Do nothing + } else { + throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } - //pclose(fp); // Close the process - - return std::make_tuple(pos_count, cum_depth, pos_depth_map); - }; - - 
futures.emplace_back(std::async(std::launch::async, get_mean_chr_cov)); - //std::future>> future = std::async(std::launch::async, get_mean_chr_cov); - //futures.push_back(std::move(future)); - } - - // Thread-safe map merging (using mutex) - std::mutex merge_mutex; - for (auto& future : futures) - { - try - { - future.wait(); - auto result = std::move(future.get()); - - // Safely merge results - std::lock_guard lock(merge_mutex); - pos_count += std::get<0>(result); - cum_depth += std::get<1>(result); - this->mergePosDepthMaps(this->pos_depth_map, std::get<2>(result)); - } - catch (const std::exception& ex) - { - printError("ERROR: Exception in thread execution - " + std::string(ex.what())); - return 0.0; } } - - // Validate and calculate mean chromosome coverage - if (pos_count == 0) + + // Clean up + bam_destroy1(bam_record); + hts_itr_destroy(bam_iter); + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + + // Calculate the mean chromosome coverage + uint64_t cum_depth = 0; + uint32_t pos_count = 0; + for (auto& pos_depth : chr_pos_depth_map) { - printError("ERROR: No positions found in chromosome coverage calculation."); - return 0.0; + cum_depth += pos_depth.second; + pos_count++; } - - double mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); + double mean_chr_cov = (double) cum_depth / (double) pos_count; + + // Update the position depth map + this->pos_depth_map = std::move(chr_pos_depth_map); return mean_chr_cov; } @@ -615,10 +631,9 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) void CNVCaller::mergePosDepthMaps(std::unordered_map& main_map, std::unordered_map& map_update) { // Merge the second depth map into the first - main_map.reserve(main_map.size() + map_update.size()); for (auto& pos_depth : map_update) { - main_map[pos_depth.first] = std::move(pos_depth.second); + main_map[pos_depth.first] = pos_depth.second; } } @@ -660,6 +675,7 @@ double CNVCaller::calculateLog2Ratio(uint32_t 
start_pos, uint32_t end_pos, std:: void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info) { + // Check that the SNP file is sorted by running bcftools index and reading // the error output std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error"; @@ -699,7 +715,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, } std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf"; - std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; + int thread_count = this->input_data.getThreadCount(); + // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; + std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; if (this->input_data.getVerbose()) { std::cout << "Filtering SNPs by depth and quality..." 
<< std::endl; std::cout << "Command: " << cmd << std::endl; @@ -725,13 +743,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, // Read the reference and alternate allele depths from the VCF file std::string alt_allele = ""; // Alternate allele - uint64_t pos = 0; + uint32_t pos = 0; int ref_ad = 0; int alt_ad = 0; - const int line_size = 256; + const int line_size = 1024; char line[line_size]; // Line buffer std::vector locations; std::vector bafs; + std::string chr_no_prefix = removeChrPrefix(chr); while (fgets(line, line_size, fp) != NULL) { // Parse the line @@ -742,7 +761,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, // Get the position from column 2 if (col == 0) { - pos = atoi(tok); + pos = (uint32_t)atoi(tok); } // Get the AD for the reference allele from column 3 @@ -768,7 +787,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, // Add a new location and BAF value to the chromosome's SNP data // (population frequency and log2 ratio will be added later) - snp_info.insertSNPAlleleFrequency(chr, pos, baf); + // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf); + this->snp_baf_map[pos] = baf; + this->snp_baf_keys.insert(pos); } pclose(fp); // Close the process @@ -787,7 +808,7 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) std::cout << "No population frequency file provided for chromosome " << chr << std::endl; return; } - + // Determine the ethnicity-specific allele frequency key std::string AF_key = "AF"; if (this->input_data.getEthnicity() != "") @@ -796,12 +817,11 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) } // Check if the filepath uses the 'chr' prefix notations based on the - // chromosome name (e.g., *.chr1.vcf.gz vs *.1.vcf.gz) + // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz) std::string chr_gnomad = chr; // gnomAD data may or may not have the 'chr' prefix 
std::string chr_prefix = "chr"; if (pfb_filepath.find(chr_prefix) == std::string::npos) { - // gnomaAD does not use the 'chr' prefix // Remove the 'chr' prefix from the chromosome name if (chr_gnomad.find(chr_prefix) != std::string::npos) { @@ -817,125 +837,68 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) // Remove the 'chr' prefix from the chromosome name for SNP data. All // SNP data in this program does not use the 'chr' prefix - std::string chr_snp = chr; - if (chr_snp.find(chr_prefix) != std::string::npos) - { - chr_snp = chr_snp.substr(chr_prefix.length()); - } - std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; - - // Get the start and end SNP positions for the chromosome (1-based - // index) - std::pair snp_range = snp_info.getSNPRange(chr); - int64_t snp_start = snp_range.first; - int64_t snp_end = snp_range.second; - if (this->input_data.isRegionSet()) - { - // Get the user-defined region - std::pair region = this->input_data.getRegion(); - if (snp_start < region.first) { - snp_start = region.first; - } else if (snp_end > region.second) { - snp_end = region.second; - } - } + std::string chr_no_prefix = removeChrPrefix(chr); - // Use a maximum of 8 threads to avoid overloading the system with too many - // processes - int num_threads = this->input_data.getThreadCount(); - if (num_threads > 8) - { - num_threads = 8; - } + std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; + int thread_count = this->input_data.getThreadCount(); + + // Run bcftools query to get the population frequencies for the + // chromosome within the SNP region, filtering for SNPS only, + // and within the MIN-MAX range of frequencies. 
+ std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf"; + std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB); + std::string cmd = \ + "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null"; + + // printMessage("Running command: " + cmd); + std::cout << "Running command: " << cmd << std::endl; - // Split region into chunks and get the population frequencies in parallel - std::cout << "SNP range for chromosome " << chr << ": " << snp_start << "-" << snp_end << std::endl; - std::vector region_chunks = splitRegionIntoChunks(chr_gnomad, snp_start, snp_end, num_threads); - std::unordered_map pos_pfb_map; - // std::vector threads; - std::vector>> futures; - for (const auto& region_chunk : region_chunks) + // Open a pipe to read the output of the command + FILE *fp = popen(cmd.c_str(), "r"); + if (fp == NULL) { - // Create a lambda function to get the population frequencies for the - // region chunk - auto get_pfb = [region_chunk, pfb_filepath, AF_key]() -> std::unordered_map - { - // Run bcftools query to get the population frequencies for the - // chromosome within the SNP region, filtering for SNPS only, - // and within the MIN-MAX range of frequencies. 
- std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB); - std::string cmd = \ - "bcftools query -r " + region_chunk + " -f '%POS\t%" + AF_key + "\n' -i '" + filter_criteria + "' " + pfb_filepath + " 2>/dev/null"; - - // std::cout << "Command: " << cmd << std::endl; - printMessage("Running command: " + cmd); - - // Open a pipe to read the output of the command - FILE *fp = popen(cmd.c_str(), "r"); - if (fp == NULL) - { - std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl; - exit(1); - } - - // Loop through the BCFTOOLS output and populate the map of population - // frequencies - // printMessage("Parsing population frequencies for chromosome " + chr + "..."); - std::unordered_map pos_pfb_map; - const int line_size = 256; - char line[line_size]; - while (fgets(line, line_size, fp) != NULL) - { - // Parse the line - int pos; - double pfb; - if (sscanf(line, "%d%lf", &pos, &pfb) == 2) - { - pos_pfb_map[pos] = pfb; // Add the position and population frequency to the map - } - } - pclose(fp); - // printMessage("Finished parsing population frequencies for chromosome " + chr + "..."); - - return pos_pfb_map; - }; - - // Create a future for the thread - futures.emplace_back(std::async(std::launch::async, get_pfb)); - // std::future> future = std::async(std::launch::async, get_pfb); - // futures.push_back(std::move(future)); + throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd); } - // Loop through the futures and get the results - int pfb_count = 0; - for (auto& future : futures) + // Loop through the BCFTOOLS output and populate the map of population + // frequencies + // printMessage("Parsing population frequencies for chromosome " + chr + + // "..."); + std::cout << "Parsing population frequencies for chromosome " << chr << "..." 
<< std::endl; + // const int line_size = 256; + // char line[line_size]; + int print_count = 0; + // while (fgets(line, line_size, fp) != NULL) + char line[2048]; + while (fgets(line, sizeof(line), fp) != NULL) { - future.wait(); - std::unordered_map result = std::move(future.get()); - - // Loop through the result and add to SNPInfo - // printMessage("Adding population frequencies to SNPInfo..."); - for (auto& pair : result) - { - int pos = pair.first; - double pfb = pair.second; - - // Add the population frequency to the SNPInfo - // this->snp_data_mtx.lock(); - snp_info.insertSNPPopulationFrequency(chr_snp, pos, pfb); - // this->snp_data_mtx.unlock(); - pfb_count++; - - // [TEST] Print 15 values - if (pfb_count < 15) + std::istringstream iss(line); + // Parse the line + int pos; + double pfb; + // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2) + if (iss >> pos >> pfb){ + // pos_pfb_map[pos] = pfb; // Add the position and population + // frequency to the map + // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); + + // Print the first 10 population frequencies + if (print_count < 10) { - printMessage("Population frequency for " + chr + ":" + std::to_string(pos) + " = " + std::to_string(pfb)); + std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + // printMessage("Population frequency for " + chr + ":" + + // std::to_string(pos) + " = " + std::to_string(pfb)); + this->snp_pfb_map[pos] = pfb; + print_count++; } } } + pclose(fp); + std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl; + // printMessage("Finished parsing population frequencies for chromosome " + chr + "..."); } -void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, int64_t start, int64_t end, std::string sv_type, double likelihood) +void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string 
sv_type, double likelihood) { // Open the TSV file for writing std::ofstream tsv_file(filepath); @@ -988,7 +951,7 @@ void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, s for (int i = 0; i < snp_count; i++) { // Get the SNP data - int64_t pos = snp_data.pos[i]; + uint32_t pos = snp_data.pos[i]; bool is_snp = snp_data.is_snp[i]; double pfb = snp_data.pfb[i]; double baf = snp_data.baf[i]; @@ -1017,7 +980,7 @@ void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, s tsv_file.close(); } -void CNVCaller::updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double baf, double log2_cov, bool is_snp) +void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp) { // Update the SNP data snp_data.pos.emplace_back(pos); @@ -1026,3 +989,65 @@ void CNVCaller::updateSNPData(SNPData& snp_data, int64_t pos, double pfb, double snp_data.log2_cov.emplace_back(log2_cov); snp_data.is_snp.emplace_back(is_snp); } + +std::tuple, std::vector, std::vector> CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end) +{ + // Lock the mutex for reading SNP information + // std::lock_guard lock(this->snp_info_mtx); + + chr = removeChrPrefix(chr); + + // Create an ordered map of SNP positions to BAF and PFB values + std::map> snp_map; + + // Query SNPs within a range (start, end) and return their BAF and PFB + // values as separate vectors + std::vector bafs; + std::vector pfbs; + std::vector pos; + double pfb_default = 0.5; + + // Query the SNPs within the range and return their BAFs and corresponding + // positions + auto snp_start = this->snp_baf_keys.lower_bound(start); + auto snp_end = this->snp_baf_keys.upper_bound(end); + + if (snp_start == this->snp_baf_keys.end()) + { + return std::make_tuple(pos, bafs, pfbs); + } + + for (auto it = snp_start; it != snp_end; it++) + { + uint32_t snp_pos = *it; + pos.push_back(snp_pos); + bafs.push_back(this->snp_baf_map[snp_pos]); + + 
// Get the PFB value for the SNP + if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end()) + { + pfbs.push_back(this->snp_pfb_map[snp_pos]); + } else { + pfbs.push_back(pfb_default); + } + } + // auto& baf_bst = this->snp_baf_map[chr]; + // auto baf_start = baf_bst.lower_bound({start, 0.0}); + // auto baf_end = baf_bst.upper_bound({end, 0.0}); + // for (auto it = baf_start; it != baf_end; it++) { + // bafs.push_back(std::get<1>(*it)); + // pos.push_back(std::get<0>(*it)); + // } + + + + // auto& pfb_map = this->snp_pfb_map[chr]; + // for (size_t i = 0; i < pos.size(); i++) { + // uint32_t snp_pos = pos[i]; + // if (pfb_map.find(snp_pos) != pfb_map.end()) { + // pfbs[i] = pfb_map[snp_pos]; + // } + // } + + return std::make_tuple(pos, bafs, pfbs); +} diff --git a/src/input_data.cpp b/src/input_data.cpp index 186e4617..572ed92a 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -95,7 +95,7 @@ const ReferenceGenome& InputData::getRefGenome() const return this->fasta_query; } -std::string InputData::queryRefGenome(std::string chr, int64_t pos_start, int64_t pos_end) +std::string InputData::queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const { return this->fasta_query.query(chr, pos_start, pos_end); } diff --git a/src/snp_info.cpp b/src/snp_info.cpp index 36efeb4b..1dc7b4a7 100644 --- a/src/snp_info.cpp +++ b/src/snp_info.cpp @@ -1,4 +1,5 @@ #include "snp_info.h" +#include "utils.h" /// @cond #include @@ -11,45 +12,36 @@ #define MIN_PFB 0.01 -// Function to remove the 'chr' prefix from chromosome names -std::string removeChrPrefix(std::string chr) +void SNPInfo::insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf) { - if (chr.find("chr") != std::string::npos) { - return chr.substr(3); - } - return chr; -} - -void SNPInfo::insertSNPAlleleFrequency(std::string chr, int64_t pos, double baf) -{ - chr = removeChrPrefix(chr); + // chr = removeChrPrefix(chr); // Add the chromosome to the SNP B-allele 
frequency map if it does not exist - if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) { - this->snp_baf_map[chr] = BST(); - } + // if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) { + // this->snp_baf_map[chr] = BST(); + // } // Insert the SNP into the map with its position and B-allele frequency // using a binary search tree to keep the SNP positions sorted this->snp_baf_map[chr].insert({pos, baf}); } -void SNPInfo::insertSNPPopulationFrequency(std::string chr, int64_t pos, double pfb) +void SNPInfo::insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb) { - chr = removeChrPrefix(chr); + // chr = removeChrPrefix(chr); // Add the chromosome to the SNP population frequency map if it does not // exist - if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) { - this->snp_pfb_map[chr] = std::unordered_map(); - } + // if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) { + // this->snp_pfb_map[chr] = std::unordered_map(); + // } // Insert the SNP into the map with its position and population frequency of // the B allele this->snp_pfb_map[chr][pos] = pfb; } -std::tuple, std::vector, std::vector> SNPInfo::querySNPs(std::string chr, int64_t start, int64_t end) +std::tuple, std::vector, std::vector> SNPInfo::querySNPs(std::string chr, uint32_t start, uint32_t end) { // Lock the mutex for reading SNP information // std::lock_guard lock(this->snp_info_mtx); @@ -57,13 +49,13 @@ std::tuple, std::vector, std::vector> SNPIn chr = removeChrPrefix(chr); // Create an ordered map of SNP positions to BAF and PFB values - std::map> snp_map; + std::map> snp_map; // Query SNPs within a range (start, end) and return their BAF and PFB // values as separate vectors std::vector bafs; std::vector pfbs; - std::vector pos; + std::vector pos; // Check if the chromosome exists in the B-allele frequency map if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) { @@ -91,7 +83,7 @@ std::tuple, std::vector, std::vector> SNPIn // Query 
the PFBs for all SNP positions with PFB data auto& pfb_map = this->snp_pfb_map[chr]; for (size_t i = 0; i < pos.size(); i++) { - int64_t snp_pos = pos[i]; + uint32_t snp_pos = pos[i]; if (pfb_map.find(snp_pos) != pfb_map.end()) { pfbs[i] = pfb_map[snp_pos]; } @@ -100,13 +92,13 @@ std::tuple, std::vector, std::vector> SNPIn return std::make_tuple(pos, bafs, pfbs); } -std::pair SNPInfo::getSNPRange(std::string chr) +std::pair SNPInfo::getSNPRange(std::string chr) { chr = removeChrPrefix(chr); // Get the range of SNP positions for a given chromosome - int64_t start = 0; - int64_t end = 0; + uint32_t start = 0; + uint32_t end = 0; if (this->snp_baf_map.find(chr) != this->snp_baf_map.end()) { auto& baf_bst = this->snp_baf_map[chr]; start = std::get<0>(*baf_bst.begin()); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 73b6cfea..cd44585e 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -37,30 +37,30 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) } // RegionData SVCaller::detectSVsFromRegion(std::string region) -std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(std::string region) +std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region) { - // Open the BAM file - std::string bam_filepath = this->input_data.getLongReadBam(); - samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); - if (fp_in == NULL) { - std::cerr << "ERROR: failed to open " << bam_filepath << std::endl; - exit(1); - } + // // Open the BAM file + // std::string bam_filepath = this->input_data.getLongReadBam(); + // samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); + // if (fp_in == NULL) { + // std::cerr << "ERROR: failed to open " << bam_filepath << std::endl; + // exit(1); + // } - // Load the header for the BAM file - bam_hdr_t *bamHdr = sam_hdr_read(fp_in); - if (!bamHdr) { - sam_close(fp_in); - throw std::runtime_error("ERROR: failed to read header for " + 
bam_filepath); - } + // // Load the header for the BAM file + // bam_hdr_t *bamHdr = sam_hdr_read(fp_in); + // if (!bamHdr) { + // sam_close(fp_in); + // throw std::runtime_error("ERROR: failed to read header for " + bam_filepath); + // } - // Load the index for the BAM file - hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); - if (!idx) { - bam_hdr_destroy(bamHdr); - sam_close(fp_in); - throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); - } + // // Load the index for the BAM file + // hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); + // if (!idx) { + // bam_hdr_destroy(bamHdr); + // sam_close(fp_in); + // throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); + // } // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -135,11 +135,15 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(std:: num_alignments++; } + // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); + + // hts_itr_destroy(itr); + // bam_destroy1(bam1); + // hts_idx_destroy(idx); + // bam_hdr_destroy(bamHdr); + // sam_close(fp_in); return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); } @@ -181,7 +185,6 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr int32_t query_end = 0; // Last alignment position in the query bool first_op = false; // First alignment operation for the query double default_lh = 0.0; - // double default_lh = std::numeric_limits::quiet_NaN(); // Default likelihood for (int i = 0; i < cigar_len; i++) { int op = bam_cigar_op(cigar[i]); // CIGAR operation @@ -373,6 +376,28 @@ std::unordered_map> SVCaller::run() // std::cout << "[DEBUG] Running last " << last_n << " chromosomes" << std::endl; // //chromosomes = std::vector(chromosomes.end()-3, chromosomes.end()); + // Open the BAM file + std::string bam_filepath = this->input_data.getLongReadBam(); + 
samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); + if (!fp_in) { + throw std::runtime_error("ERROR: failed to open " + bam_filepath); + } + + // Load the header for the BAM file + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); + if (!bamHdr) { + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to read header for " + bam_filepath); + } + + // Load the index for the BAM file + hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); + if (!idx) { + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); + } + // Loop through each region and detect SVs in chunks int chr_count = chromosomes.size(); int current_chr = 0; @@ -426,7 +451,8 @@ std::unordered_map> SVCaller::run() std::set combined_sv_calls; for (const auto& sub_region : region_chunks) { // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl; - std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region); + // std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region); + std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region); std::set& subregion_sv_calls = std::get<0>(region_data); PrimaryMap& primary_map = std::get<1>(region_data); SuppMap& supp_map = std::get<2>(region_data); @@ -460,6 +486,10 @@ std::unordered_map> SVCaller::run() // main set // sv_calls.emplace_back(subregion_sv_calls); + // Merge the SV calls from the current region + std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; + mergeSVs(subregion_sv_calls); + // Combine the SV calls from the current region std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; concatenateSVCalls(combined_sv_calls, subregion_sv_calls); @@ -479,12 +509,19 @@ std::unordered_map> SVCaller::run() // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl; } + // Clean up the BAM file, header, and index + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + // SVData sv_calls_combined; // for (const auto& subregion_sv_calls : sv_calls) { // sv_calls_combined.concatenate(subregion_sv_calls); // } - std::cout << "SV calling completed." << std::endl; + // Save to VCF + std::cout << "Saving SVs to VCF..." << std::endl; + this->saveToVCF(whole_genome_sv_calls); return whole_genome_sv_calls; } @@ -570,6 +607,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p int32_t primary_lh_t = 0; if (primary_end - primary_start >= min_cnv_length) { SVCandidate sv_candidate(primary_start+1, primary_end+1, "."); + // std::cout << "TEST5" << std::endl; std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); primary_lh = std::get<0>(result); // primary_log_likelihood /= (double)(primary_end - primary_start); // Normalize the log likelihood by the length @@ -613,6 +651,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p int largest_supp_lh_t = 0; if (largest_supp_length >= min_cnv_length) { SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, "."); + // std::cout << "TEST1" << std::endl; std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); largest_supp_lh = std::get<0>(result); // largest_supp_log_likelihood /= (double)largest_supp_length; // Normalize the log likelihood by the length @@ -627,6 +666,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p if (largest_supp_alignment != closest_supp_alignment) { if (closest_supp_length >= min_cnv_length) { SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, 
std::get<2>(closest_supp_alignment)+1, "."); + // std::cout << "TEST2" << std::endl; std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); closest_supp_lh = std::get<0>(result); // closest_supp_log_likelihood /= (double)closest_supp_length; // Normalize the log likelihood by the length @@ -708,6 +748,8 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p std::string chosen_candidate_str = "BOUNDARY"; int split_scenario = NOCALL; for (const auto& sv_candidate : sv_candidates) { + // std::cout << "TEST3: primary = " << primary_start << ", " << primary_end << " supp = " << supp_start << ", " << supp_end << std::endl; + // std::cout << "Position: " << std::get<0>(sv_candidate) << ", " << std::get<1>(sv_candidate) << std::endl; std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); double current_lh = std::get<0>(result); SVType current_type = std::get<1>(result); @@ -957,6 +999,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p // the closest supplementary alignment and the largest // supplementary alignment SVCandidate sv_candidate(std::get<2>(closest_supp_alignment)+1, std::get<1>(largest_supp_alignment)+1, "."); + // std::cout << "TEST4" << std::endl; std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); // double complex_log_likelihood = std::get<0>(result); SVType complex_type = std::get<1>(result); @@ -1018,7 +1061,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p } } -void SVCaller::saveToVCF(const std::unordered_map >& sv_calls, const ReferenceGenome& ref_genome) +void SVCaller::saveToVCF(const std::unordered_map >& sv_calls) { std::cout << "Creating VCF writer..." << std::endl; // std::string output_vcf = output_dir + "/output.vcf"; @@ -1032,25 +1075,19 @@ void SVCaller::saveToVCF(const std::unordered_map std::cout << "Getting reference genome filepath..." 
<< std::endl; try { - std::string ref_fp = ref_genome.getFilepath(); + std::string ref_fp = this->input_data.getRefGenome().getFilepath(); std::cout << "Reference genome filepath: " << ref_fp << std::endl; } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; return; } - std::cout << "Getting reference genome header..." << std::endl; - try { - ref_genome.getContigHeader(); - } catch (const std::exception& e) { - std::cerr << "Error: " << e.what() << std::endl; - return; - } - // Set the header lines + std::cout << "Getting reference genome header..." << std::endl; + const std::string contig_header = this->input_data.getRefGenome().getContigHeader(); std::vector header_lines = { - std::string("##reference=") + ref_genome.getFilepath(), - ref_genome.getContigHeader(), + std::string("##reference=") + + contig_header, "##INFO=", "##INFO=", "##INFO=", @@ -1166,7 +1203,8 @@ void SVCaller::saveToVCF(const std::unordered_map if (sv_type_str == "DEL") { // Get the deleted sequence from the reference genome, also including the preceding base int64_t preceding_pos = (int64_t) std::max(1, (int) start-1); // Make sure the position is not negative - ref_allele = ref_genome.query(chr, preceding_pos, end); + // ref_allele = ref_genome.query(chr, preceding_pos, end); + ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, end); // Use the preceding base as the alternate allele if (ref_allele != "") { @@ -1184,7 +1222,9 @@ void SVCaller::saveToVCF(const std::unordered_map } else { // Use the preceding base as the reference allele int64_t preceding_pos = (int64_t) std::max(1, (int) start-1); // Make sure the position is not negative - ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); + // ref_allele = ref_genome.query(chr, preceding_pos, + // preceding_pos); + ref_allele = this->input_data.queryRefGenome(chr, preceding_pos, preceding_pos); // Format novel insertions if (sv_type_str == "INS") { diff --git a/src/sv_object.cpp 
b/src/sv_object.cpp index 15aefdfd..b2c5827f 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -9,8 +9,8 @@ bool SVCall::operator<(const SVCall & other) const { - return std::tie(start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood) < - std::tie(other.start, other.end, other.sv_type, other.alt_allele, other.data_type, other.genotype, other.hmm_likelihood); + return start < other.start || (start == other.start && end < other.end); + //return std::tie(start, end) < std::tie(other.start, other.end); } void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) @@ -27,29 +27,41 @@ void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::st // If the SV call already exists (start and end position), then update all information if the // likelihood is higher // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; - std::vector updates; + SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}; + + sv_calls.insert(new_sv_call); + + /* + bool exists = false; bool print_out = false; for (auto it = sv_calls.begin(); it != sv_calls.end();) { if (it->start == start && it->end == end) { + exists = true; if (hmm_likelihood > it->hmm_likelihood) { - std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; + //std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; print_out = true; // Update the data type and support - std::string new_data_type = it->data_type + "," + data_type; - int 
new_support = it->support + 1; + // std::string new_data_type = it->data_type + "," + data_type; + // int new_support = it->support + 1; + new_sv_call.data_type = it->data_type + "," + data_type; + new_sv_call.support = it->support + 1; + //higher_lh = true; - updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); + // updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); // Erase and re-insert the SV call - // Erase the current iterator and safely insert the new SV call - // sv_calls.erase(it); - it = sv_calls.erase(it); // Erase and get the next iterator + // Erase the current iterator and safely insert the new SV calls + std::cout << "Erasing iterator." << std::endl; + sv_calls.erase(it); + std::cout << "Iterator erased." << std::endl; + break; + //it = sv_calls.erase(it); // Erase and get the next iterator // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); } else { - // Return if no update is needed + // End if the SV exists but is lower lh return; } } else { @@ -62,23 +74,35 @@ void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::st { std::cout << "[DEBUG] Adding updates" << std::endl; } - - // Insert the updates - for (const auto& update : updates) + + // Update the SV call if it does not exist, or if the likelihood is higher + // than the existing call + if (print_out) { - sv_calls.insert(update); + std::cout << "[DEBUG] Inserting call" << std::endl; } - + sv_calls.insert(new_sv_call); if (print_out) { - std::cout << "[DEBUG] Added updates" << std::endl; + std::cout << "[DEBUG] Call inserted" << std::endl; } + // Insert the updates + // for (const auto& update : updates) + // { + // sv_calls.insert(update); + // } + + // if (print_out) + // { + // std::cout << "[DEBUG] Added updates" << std::endl; + // } // Add the SV call if it does not exist // std::cout << 
"[TEST2] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); // std::cout << "[TEST3] Added SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; + */ } std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count) @@ -124,3 +148,69 @@ void concatenateSVCalls(std::set &target, const std::set &source // Efficiently concatenate two sets of SV calls target.insert(source.begin(), source.end()); } + +void mergeSVs(std::set& sv_calls) { + if (sv_calls.size() < 2) { + return; + } + + // Merge SV calls if they overlap by at least 50% + int initial_size = sv_calls.size(); + std::vector merged_sv_calls; + auto it = sv_calls.begin(); + SVCall current_merge = *it++; + + for (; it != sv_calls.end(); ++it) { + const SVCall& next = *it; + + // Check if the SV calls overlap by at least 50% + uint32_t overlap_start = std::max(current_merge.start, next.start); + uint32_t overlap_end = std::min(current_merge.end, next.end); + uint32_t overlap_length = (overlap_start < overlap_end) ? 
overlap_end - overlap_start : 0; + + uint32_t current_length = current_merge.end - current_merge.start; + uint32_t next_length = next.end - next.start; + + // Merge the SV calls if the overlap is at least 50% of the current or + // next SV call + double overlap_pct_current = static_cast(overlap_length) / current_length; + double overlap_pct_next = static_cast(overlap_length) / next_length; + + if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) { + // Merge the SV calls based on the likelihood + if (next.hmm_likelihood != 0.0) { + // Update the likelihood if the next SV call has a likelihood + // and it is higher than the current merged SV call + if (next.hmm_likelihood > current_merge.hmm_likelihood) { + current_merge = next; + } + } else { + // If both have no likelihood (CIGAR only), then merge the SV calls + // based on largest SV length + if (next.hmm_likelihood == current_merge.hmm_likelihood) { + if (next_length > current_length) { + current_merge = next; + } + } + // if (next_length > current_length) { + // current_merge = next; + // } + } + } else { + // No overlap: Save the SV and continue + merged_sv_calls.push_back(current_merge); + current_merge = next; + } + } + + // Add the last merged SV call + merged_sv_calls.push_back(current_merge); + + // Update the SV calls + sv_calls.clear(); + for (const auto& sv_call : merged_sv_calls) { + sv_calls.insert(sv_call); + } + int updated_size = sv_calls.size(); + std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; +} diff --git a/src/utils.cpp b/src/utils.cpp index 62088fe2..db083f97 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -99,4 +99,13 @@ std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start, int seconds = elapsed.count() - (hours * 3600) - (minutes * 60); std::string elapsed_time = std::to_string(hours) + ":" + std::to_string(minutes) + ":" + std::to_string(seconds); return elapsed_time; -} \ No newline at end of file 
+} + +// Function to remove the 'chr' prefix from chromosome names +std::string removeChrPrefix(std::string chr) +{ + if (chr.find("chr") != std::string::npos) { + return chr.substr(3); + } + return chr; +} diff --git a/tests/test_general.py b/tests/test_general.py index dbca30b9..1d81fe96 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,11 +64,11 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 41 + assert len(f.readlines()) == 32 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. - header_line = 18 + header_line = 17 with open(output_file, 'r', encoding='utf-8') as f: for i, line in enumerate(f): if i == header_line: From 7d36424f278d0537ffc85189df8ba0947bcd9ec5 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 23 Nov 2024 14:59:04 -0500 Subject: [PATCH 020/134] Update pfb to htslib --- .gitignore | 1 + include/cnv_caller.h | 9 ++- src/cnv_caller.cpp | 182 +++++++++++++++++++++++++++++++++---------- src/sv_caller.cpp | 7 ++ 4 files changed, 153 insertions(+), 46 deletions(-) diff --git a/.gitignore b/.gitignore index 5b1177ee..8284049d 100644 --- a/.gitignore +++ b/.gitignore @@ -73,6 +73,7 @@ tests/cpp_module_out data/gnomadv2_filepaths.txt data/gnomadv3_filepaths.txt data/gnomadv4_filepaths.txt +data/gnomadv4_filepaths_ssd.txt data/gnomadv4_hg19_filepaths.txt # Training data diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 858c7454..f518f055 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -56,10 +56,11 @@ class CNVCaller { SNPData snp_data; SNPInfo snp_info; double mean_chr_cov = 0.0; - std::unordered_map pos_depth_map; - std::unordered_map snp_baf_map; - std::set snp_baf_keys; - std::unordered_map snp_pfb_map; + std::unordered_map pos_depth_map; // Read depth map + std::unordered_map snp_baf_map; // SNP B-allele frequency map + 
// std::set snp_alt_map; // SNP B-allele map + std::set snp_baf_keys; // SNP positions for BAF values + std::unordered_map snp_pfb_map; // SNP population frequency map // Define a map of CNV genotypes by HMM predicted state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index e8428ec5..de19d28f 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -3,6 +3,9 @@ #include +#include +#include + /// @cond #include #include @@ -295,7 +298,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::setinput_data.getThreadCount()); + // Read the header bam_hdr_t *bam_header = sam_hdr_read(bam_file); if (!bam_header) @@ -842,58 +848,150 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; int thread_count = this->input_data.getThreadCount(); - // Run bcftools query to get the population frequencies for the - // chromosome within the SNP region, filtering for SNPS only, - // and within the MIN-MAX range of frequencies. 
- std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf"; - std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB); - std::string cmd = \ - "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null"; - - // printMessage("Running command: " + cmd); - std::cout << "Running command: " << cmd << std::endl; + // Open the population frequency file + std::cout << "Opening population frequency file: " << pfb_filepath << std::endl; + htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r"); + if (!pfb_file) + { + throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath); + } - // Open a pipe to read the output of the command - FILE *fp = popen(cmd.c_str(), "r"); - if (fp == NULL) + // Enable multi-threading + std::cout << "Setting number of threads to " << thread_count << std::endl; + hts_set_threads(pfb_file, thread_count); + + // Read the header + std::cout << "Reading header from population frequency file..." << std::endl; + bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file); + if (!pfb_header) { - throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd); + bcf_close(pfb_file); + throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath); } - // Loop through the BCFTOOLS output and populate the map of population - // frequencies - // printMessage("Parsing population frequencies for chromosome " + chr + - // "..."); - std::cout << "Parsing population frequencies for chromosome " << chr << "..." << std::endl; - // const int line_size = 256; - // char line[line_size]; + // Set up the record + std::cout << "Initializing BCF record..." 
<< std::endl; + bcf1_t *pfb_record = bcf_init(); + if (!pfb_record) + { + bcf_hdr_destroy(pfb_header); + bcf_close(pfb_file); + throw std::runtime_error("ERROR: Could not initialize BCF record."); + } + + // Read the population frequencies for the chromosome + std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl; int print_count = 0; - // while (fgets(line, line_size, fp) != NULL) - char line[2048]; - while (fgets(line, sizeof(line), fp) != NULL) + while (bcf_read(pfb_file, pfb_header, pfb_record) == 0) { - std::istringstream iss(line); - // Parse the line - int pos; - double pfb; - // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2) - if (iss >> pos >> pfb){ - // pos_pfb_map[pos] = pfb; // Add the position and population - // frequency to the map - // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); - - // Print the first 10 population frequencies - if (print_count < 10) + // Get the chromosome and position + // std::cout << "Reading record..." << std::endl; + // std::string record_chr = bcf_hdr_id2name(pfb_header, pfb_record->rid); + uint32_t pos = pfb_record->pos + 1; // 0-based to 1-based + + // Skip if not a SNP, or if the position is not in the BAF map + // if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.find(pos) == this->snp_baf_keys.end()) + if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0) + { + continue; + } + + // Get the population frequency for the SNP + // std::cout << "Getting population frequency..." 
<< std::endl; + // double pfb = 0.0; + // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb, NULL); + // if (pfb_status < 0) + // { + // continue; + // } + float *pfb_f = NULL; + int count = 0; + int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); + if (pfb_status < 0 || count == 0) + { + std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; + continue; + } + double pfb = (double) pfb_f[0]; + free(pfb_f); + + // Continue if the population frequency is outside the threshold + if (pfb <= MIN_PFB || pfb >= MAX_PFB) + { + continue; + } + + // Add the population frequency to the SNP data + // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); + if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end()) + { + this->snp_pfb_map[pos] = pfb; + } else { + // Keep the larger population frequency + if (pfb > this->snp_pfb_map[pos]) { - std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - // printMessage("Population frequency for " + chr + ":" + - // std::to_string(pos) + " = " + std::to_string(pfb)); this->snp_pfb_map[pos] = pfb; - print_count++; } } + if (print_count < 10) + { + std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + print_count++; + } } - pclose(fp); + + // // Run bcftools query to get the population frequencies for the + // // chromosome within the SNP region, filtering for SNPS only, + // // and within the MIN-MAX range of frequencies. 
+ // std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf"; + // std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB); + // std::string cmd = \ + // "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null"; + + // // printMessage("Running command: " + cmd); + // std::cout << "Running command: " << cmd << std::endl; + + // // Open a pipe to read the output of the command + // FILE *fp = popen(cmd.c_str(), "r"); + // if (fp == NULL) + // { + // throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd); + // } + + // // Loop through the BCFTOOLS output and populate the map of population + // // frequencies + // // printMessage("Parsing population frequencies for chromosome " + chr + + // // "..."); + // std::cout << "Parsing population frequencies for chromosome " << chr << "..." 
<< std::endl; + // // const int line_size = 256; + // // char line[line_size]; + // int print_count = 0; + // // while (fgets(line, line_size, fp) != NULL) + // char line[2048]; + // while (fgets(line, sizeof(line), fp) != NULL) + // { + // std::istringstream iss(line); + // // Parse the line + // int pos; + // double pfb; + // // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2) + // if (iss >> pos >> pfb){ + // // pos_pfb_map[pos] = pfb; // Add the position and population + // // frequency to the map + // // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); + + // // Print the first 10 population frequencies + // if (print_count < 10) + // { + // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + // // printMessage("Population frequency for " + chr + ":" + + // // std::to_string(pos) + " = " + std::to_string(pfb)); + // this->snp_pfb_map[pos] = pfb; + // print_count++; + // } + // } + // } + // pclose(fp); std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl; // printMessage("Finished parsing population frequencies for chromosome " + chr + "..."); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index cd44585e..228a93a2 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -383,6 +383,13 @@ std::unordered_map> SVCaller::run() throw std::runtime_error("ERROR: failed to open " + bam_filepath); } + // Enable multi-threading + int num_threads = this->input_data.getThreadCount(); + if (num_threads > 1) { + std::cout << "Running SV detection with " << num_threads << " thread(s)..." 
<< std::endl; + } + hts_set_threads(fp_in, num_threads); + // Load the header for the BAM file bam_hdr_t *bamHdr = sam_hdr_read(fp_in); if (!bamHdr) { From ba19aeb24ef8214f34047a39aa3f39ce9615ac44 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 01:58:15 -0500 Subject: [PATCH 021/134] update reading snp vcf to htslib --- include/cnv_caller.h | 15 +- python/sv_merger.py | 2 +- src/cnv_caller.cpp | 1124 ++++++++++++++++++++++++++++++++---------- src/sv_object.cpp | 12 +- 4 files changed, 876 insertions(+), 277 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index f518f055..e8d3d3b8 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -57,10 +57,10 @@ class CNVCaller { SNPInfo snp_info; double mean_chr_cov = 0.0; std::unordered_map pos_depth_map; // Read depth map - std::unordered_map snp_baf_map; // SNP B-allele frequency map + // std::unordered_map snp_baf_map; // SNP B-allele frequency map // std::set snp_alt_map; // SNP B-allele map - std::set snp_baf_keys; // SNP positions for BAF values - std::unordered_map snp_pfb_map; // SNP population frequency map + // std::set snp_baf_keys; // SNP positions for BAF values + // std::unordered_map snp_pfb_map; // SNP population frequency map // Define a map of CNV genotypes by HMM predicted state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. 
@@ -90,7 +90,7 @@ class CNVCaller { // Query a region for SNPs and return the SNP data std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map &pos_depth_map, double mean_chr_cov); - std::tuple, std::vector, std::vector> querySNPs(std::string chr, uint32_t start, uint32_t end); + void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb); // Run copy number prediction for a chunk of SV candidates from CIGAR strings void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov); @@ -130,11 +130,14 @@ class CNVCaller { double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map& pos_depth_map, double mean_chr_cov); // Read SNP positions and BAF values from the VCF file of SNP calls - void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info); + // void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info); // Read SNP population frequencies from the PFB file and return a vector // of population frequencies for each SNP location - void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info); + // void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info); + + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set& snp_pos, std::unordered_map& snp_baf); + void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map); // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood); diff --git a/python/sv_merger.py b/python/sv_merger.py index 78733f6d..b2d1491a 100644 --- a/python/sv_merger.py +++ 
b/python/sv_merger.py @@ -153,7 +153,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # Merge SVs with the same label unique_labels = np.unique(cluster_labels) - logging.info("Unique labels: %s", unique_labels) + #logging.info("Unique labels: %s", unique_labels) for label in unique_labels: diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index de19d28f..4fcc1604 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -5,6 +5,7 @@ #include #include +#include /// @cond #include @@ -59,7 +60,18 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta bool snps_found = false; uint32_t window_size = (uint32_t)this->input_data.getWindowSize(); - // printMessage("Querying SNPs for region " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); + // Query the SNPs for the entire region + std::set snp_pos; + std::unordered_map snp_baf; + std::unordered_map snp_pfb; + this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb); + // std::pair, std::vector, std::vector> snp_query = this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb); + // std::vector& snp_pos = std::get<0>(snp_query); + // std::vector& snp_pfb = std::get<1>(snp_query); + // std::vector& snp_baf = std::get<2>(snp_query); + + // Loop through the range of the SV region and query the SNPs in a sliding + // window, then calculate the log2 ratio for each window for (uint32_t i = start_pos; i <= end_pos; i += window_size) { // Run a sliding non-overlapping window of size window_size across @@ -68,27 +80,25 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta uint32_t window_end = std::min(i + window_size - 1, end_pos); // Get the SNP info for the window - // std::cout << "Querying SNPs for window " << chr << ":" << window_start << "-" << window_end << "..." 
<< std::endl; - // this->snp_data_mtx.lock(); - // std::tuple, std::vector, - // std::vector> window_snps = snp_info.querySNPs(chr, - // window_start, window_end); - std::tuple, std::vector, std::vector> window_snps = this->querySNPs(chr, window_start, window_end); - // this->snp_data_mtx.unlock(); - std::vector& snp_window_pos = std::get<0>(window_snps); // SNP positions - std::vector& snp_window_bafs = std::get<1>(window_snps); // B-allele frequencies - std::vector& snp_window_pfbs = std::get<2>(window_snps); // Population frequencies of the B allele + std::vector snp_window_pos; + std::vector snp_window_bafs; + std::vector snp_window_pfbs; + auto it_start = snp_pos.lower_bound(window_start); + auto it_end = snp_pos.upper_bound(window_end); + for (auto it = it_start; it != it_end; it++) + { + snp_window_pos.push_back(*it); + snp_window_bafs.push_back(snp_baf[*it]); + snp_window_pfbs.push_back(snp_pfb[*it]); + } // Loop though the SNP positions and calculate the log2 ratio for // the window up to the SNP, then calculate the log2 ratio centered // at the SNP, and finally calculate the log2 ratio for the window // after the SNP, and continue until the end of the window - std::vector window_log2_ratios; - int snp_count = (int) snp_window_pos.size(); - - // If there are no SNPs in the window, then use the default BAF and - // PFB values, and the coverage log2 ratio - if (snp_count == 0) + // (If there are no SNPs in the window, then use the default BAF and + // PFB values, and the coverage log2 ratio) + if (snp_window_pos.size() == 0) { double window_log2_ratio = calculateLog2Ratio(window_start, window_end, pos_depth_map, mean_chr_cov); double pfb_default = 0.5; @@ -99,23 +109,18 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta snps_found = true; // Loop through the SNPs and calculate the log2 ratios - uint32_t bin_start = window_start; - uint32_t bin_end = 0; - for (int j = 0; j < snp_count; j++) + // uint32_t bin_start = window_start; + // 
uint32_t bin_end = 0; + for (int j = 0; j < (int) snp_window_pos.size(); j++) { - // SNP bin starts at 1/2 the distance between the previous SNP - // and the current SNP, and ends at 1/2 the distance between - // the current SNP and the next SNP. For the first SNP, the - // bin starts at the window start and ends at 1/2 the distance - // between the first SNP and the next SNP, and for the last - // SNP, the bin starts at 1/2 the distance between the previous - // SNP and the last SNP and ends at the window end. - uint32_t snp_pos = snp_window_pos[j]; - bin_end = snp_pos + (j == snp_count-1 ? (window_end - snp_pos) / 2 : (snp_window_pos[j+1] - snp_pos) / 2); + // Just use a window centered at the SNP position + uint32_t bin_start = snp_window_pos[j] - window_size / 2; + uint32_t bin_end = snp_window_pos[j] + window_size / 2; // Calculate the log2 ratio for the SNP bin double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov); - this->updateSNPData(snp_data, snp_pos, snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); + this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); + // this->updateSNPData(snp_data, snp_pos, snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); // Update the previous bin start bin_start = bin_end + 1; @@ -512,13 +517,13 @@ void CNVCaller::loadChromosomeData(std::string chr) //this->mean_chr_cov = 30.0; printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); - std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl; - std::string snp_filepath = this->input_data.getSNPFilepath(); - readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info); + // std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." 
<< std::endl; + // std::string snp_filepath = this->input_data.getSNPFilepath(); + // readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info); - std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl; - getSNPPopulationFrequencies(chr, this->snp_info); - std::cout << "Finished loading chromosome data for " << chr << std::endl; + // std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl; + // getSNPPopulationFrequencies(chr, this->snp_info); + // std::cout << "Finished loading chromosome data for " << chr << std::endl; } // Calculate the mean chromosome coverage @@ -679,133 +684,563 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std:: return window_log2_ratio; } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info) +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set& snp_pos, std::unordered_map& snp_baf) { - - // Check that the SNP file is sorted by running bcftools index and reading - // the error output - std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error"; - if (this->input_data.getVerbose()) { - std::cout << "Command: " << index_cmd << std::endl; - } - - // Open a pipe to read the output of the command - FILE *index_fp = popen(index_cmd.c_str(), "r"); - if (index_fp == NULL) + // Get the SNP file path + std::string snp_filepath = this->input_data.getSNPFilepath(); + if (snp_filepath.empty()) { - std::cerr << "ERROR: Could not open pipe for command: " << index_cmd << std::endl; - exit(1); + throw std::runtime_error("ERROR: SNP file path is empty."); } - // Read the output of the command - const int error_size = 256; - char index_error[error_size]; - while (fgets(index_error, error_size, index_fp) != NULL) + // Initialize the synced reader + bcf_srs_t *snp_reader = bcf_sr_init(); + if (!snp_reader) { - std::cerr << "ERROR: 
" << index_error << std::endl; - exit(1); - } - pclose(index_fp); // Close the process - - // Filter variants by depth, quality, and region - if (this->input_data.getVerbose()) { - std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl; + throw std::runtime_error("ERROR: Could not initialize SNP reader."); } - // Check if a region was specified by the user - std::string region_str = chr; - if (this->input_data.isRegionSet()) + // Read the SNP header + htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r"); + bcf_hdr_t *snp_header = bcf_hdr_read(snp_file); + if (!snp_header) { - std::pair region = this->input_data.getRegion(); - region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second); + bcf_sr_destroy(snp_reader); + bcf_close(snp_file); + throw std::runtime_error("ERROR: Could not initialize SNP header."); } - std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf"; + // Set multi-threading int thread_count = this->input_data.getThreadCount(); - // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; - std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; - if (this->input_data.getVerbose()) { - std::cout << "Filtering SNPs by depth and quality..." << std::endl; - std::cout << "Command: " << cmd << std::endl; - } - system(cmd.c_str()); - - if (this->input_data.getVerbose()) { - std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl; - } + bcf_sr_set_threads(snp_reader, thread_count); - // Extract B-allele frequency data from the VCF file and sort by chromosome - // and position - if (this->input_data.getVerbose()) { - std::cout << "Extracting B-allele frequency data from filtered SNPs..." 
<< std::endl; + // Enable index usage + snp_reader->require_index = 1; + + // Add the SNP file to the reader + if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) + { + bcf_sr_destroy(snp_reader); + bcf_hdr_destroy(snp_header); + bcf_close(snp_file); + throw std::runtime_error("ERROR: Could not add SNP file to reader: " + snp_filepath); } - cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null"; - FILE *fp = popen(cmd.c_str(), "r"); - if (fp == NULL) + + // Set the region + std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { - std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl; - exit(1); + bcf_sr_destroy(snp_reader); + bcf_hdr_destroy(snp_header); + bcf_close(snp_file); + throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str); } - // Read the reference and alternate allele depths from the VCF file - std::string alt_allele = ""; // Alternate allele - uint32_t pos = 0; - int ref_ad = 0; - int alt_ad = 0; - const int line_size = 1024; - char line[line_size]; // Line buffer - std::vector locations; - std::vector bafs; - std::string chr_no_prefix = removeChrPrefix(chr); - while (fgets(line, line_size, fp) != NULL) + std::cout << "Iterating through SNPs in region " << region_str << "..." 
<< std::endl; + int print_count = 0; + while (bcf_sr_next_line(snp_reader) >= 0) { - // Parse the line - char *tok = strtok(line, ","); // Tokenize the line - int col = 0; // Column index - while (tok != NULL) + bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0); + if (snp_record) { - // Get the position from column 2 - if (col == 0) + uint32_t pos = (uint32_t)snp_record->pos + 1; + + // Skip if not a SNP + if (!bcf_is_snp(snp_record)) { - pos = (uint32_t)atoi(tok); + continue; } - // Get the AD for the reference allele from column 3 - else if (col == 1) + // Get the QUAL, DP, and AD values + float qual = snp_record->qual; + if (bcf_float_is_missing(qual)) + { + std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl; + } + // Skip if quality is less than 30 + if (qual <= 30) { - ref_ad = atoi(tok); + continue; } - // Get the AD for the non-reference allele from column 4 - else if (col == 2) + // Extract DP from FORMAT field + int32_t *dp = 0; + int dp_count = 0; + // int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP", + // &dp, &dp_count); + int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count); + if (dp_ret < 0) { - alt_ad = atoi(tok); + std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl; + } else { + // Skip if depth is not greater than 10 + for (int i = 0; i < dp_count; i++) + { + if (dp[i] <= 10) + { + continue; + } + } + // if (dp <= 10) + // { + // continue; + // } + } + free(dp); + // // Skip if depth is not greater than 10 + // if (dp <= 10) + // { + // continue; + // } + + // Skip if the SNP does not pass the filter + if (bcf_has_filter(snp_header, snp_record, const_cast("PASS")) != 1) + { + continue; } - // Move to the next token - tok = strtok(NULL, ","); - col++; - } + // Extract AD from FORMAT field + // int32_t ad[2] = {0, 0}; + int32_t *ad = 0; + int ad_count = 0; + // int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", + // 
&ad, &ad_count); + int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); + + // Skip if AD value is missing + if (ad_ret < 0) + { + std::cerr << "ERROR: AD value is missing for SNP at " << chr << ":" << pos << std::endl; + continue; + } - // Calculate the B-allele frequency (BAF) as the ratio of the alternate - // allele depth to the total depth (reference + alternate) - double baf = (double) alt_ad / (double) (ref_ad + alt_ad); + // Calculate the B-allele frequency (BAF) + double baf = 0.0; + // double ad0 = (double) ad[0]; + // double ad1 = (double) ad[1]; + double ad0 = 0.0; + double ad1 = 0.0; + for (int i = 0; i < ad_count; i++) + { + if (i == 0) + { + ad0 = (double) ad[i]; + } else if (i == 1) { + ad1 = (double) ad[i]; + } + } + free(ad); + try { + // std::cout << "AD[0]: " << ad0 << ", AD[1]: " << ad1 << std::endl; + baf = ad1 / (ad0 + ad1); + // std::cout << "AD[0]: " << ad[0] << ", AD[1]: " << ad[1] << std::endl; + // baf = (double) ad[1] / (double) (ad[0] + ad[1]); + } catch (const std::exception& e) { + std::cerr << "ERROR: Could not calculate BAF for SNP at " << chr << ":" << pos << std::endl; + continue; + } + + // Insert the SNP position and BAF into the maps + snp_pos.insert(pos); + snp_baf[pos] = baf; - // Add a new location and BAF value to the chromosome's SNP data - // (population frequency and log2 ratio will be added later) - // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf); - this->snp_baf_map[pos] = baf; - this->snp_baf_keys.insert(pos); + // Print the SNP position and BAF + if (print_count < 10) + { + std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl; + print_count++; + } + } } - pclose(fp); // Close the process + // Clean up + std::cout << "Cleaning up SNP reader..." 
<< std::endl; + bcf_sr_destroy(snp_reader); + bcf_hdr_destroy(snp_header); + bcf_close(snp_file); + + // std::cout << "Opening SNP file: " << snp_filepath << std::endl; + // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r"); + // if (!snp_file) + // { + // throw std::runtime_error("ERROR: Could not open SNP file: " + snp_filepath); + // } + + // // Enable multi-threading + // hts_set_threads(snp_file, thread_count); + + // // Read the header + // bcf_hdr_t *snp_header = bcf_hdr_read(snp_file); + // if (!snp_header) + // { + // bcf_close(snp_file); + // throw std::runtime_error("ERROR: Could not read header from SNP file: " + snp_filepath); + // } + + // // Load the index + // hts_idx_t *snp_index = bcf_index_load(snp_filepath.c_str()); + // if (!snp_index) + // { + // bcf_hdr_destroy(snp_header); + // bcf_close(snp_file); + // throw std::runtime_error("ERROR: Could not load index for SNP file: " + snp_filepath); + // } + + // // Construct the region string + // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + // hts_itr_t *snp_iter = bcf_itr_querys(snp_index, snp_header, region_str.c_str()); + // if (!snp_iter) + // { + // hts_idx_destroy(snp_index); + // bcf_hdr_destroy(snp_header); + // bcf_close(snp_file); + // throw std::runtime_error("ERROR: Could not create iterator for SNP region: " + region_str); + // } + + // // Set up the record + // bcf1_t *snp_record = bcf_init(); + // if (!snp_record) + // { + // bcf_hdr_destroy(snp_header); + // bcf_close(snp_file); + // throw std::runtime_error("ERROR: Could not initialize SNP record."); + // } + + // // Read the SNPs in the chromosome region + // int print_count = 0; + // while (bcf_itr_next(snp_file, snp_iter, snp_record) >= 0) + // { + // // Get the position and B-allele frequency (BAF) from the SNP record + // uint32_t pos = snp_record->pos + 1; // 0-based to 1-based + + // // Get QUAL, DP, and AD values + // float qual = snp_record->qual; + // if 
(bcf_float_is_missing(qual)) + // { + // std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl; + // } + // // Skip if quality is less than 30 + // if (qual <= 30) + // { + // continue; + // } + + // // Get FILTER status + // int pass_id = bcf_hdr_id2int(snp_header, BCF_DT_ID, "PASS"); + // if (pass_id == -1) + // { + // std::cerr << "ERROR: Could not get PASS ID for SNP at " << chr << ":" << pos << std::endl; + // } + // std::string pass_filter = "PASS"; + // if (bcf_has_filter(snp_header, snp_record, const_cast(pass_filter.c_str())) != 1) + // { + // // Skip if the SNP does not pass the filter + // continue; + // } + + // // Extract DP from INFO field + // int32_t dp = 0; + // int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP", &dp, &dp_count); + // if (dp_count != 1) + // { + // std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl; + // } + // // Skip if depth is not greater than 10 + // if (dp <= 10) + // { + // continue; + // } + + // // Skip if not a SNP + // if (!bcf_is_snp(snp_record)) + // { + // continue; + // } + + // // Extract AD from FORMAT field + // int32_t ad[2] = {0, 0}; + // int ad_count = 0; + // int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); + // // if (ad_count != 2) + // // { + // // std::cerr << "ERROR: Could not get AD value for SNP at " << chr << ":" << pos << std::endl; + // // } + + // // Calculate the BAF + // if (ad_ret > 0 && ad_count > 0) + // { + // double baf = (double) ad[1] / (double) (ad[0] + ad[1]); + // snp_pos.insert(pos); + // snp_baf[pos] = baf; + + // // Print the SNP position and BAF + // if (print_count < 10) + // { + // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << std::endl; + // print_count++; + // } + // } + // } + + // // Clean up + // bcf_destroy(snp_record); + // hts_itr_destroy(snp_iter); + // hts_idx_destroy(snp_index); + // bcf_hdr_destroy(snp_header); + // 
bcf_close(snp_file); + + // // Check that the SNP file is sorted by running bcftools index and reading + // // the error output + // std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error"; + // if (this->input_data.getVerbose()) { + // std::cout << "Command: " << index_cmd << std::endl; + // } + + // // Open a pipe to read the output of the command + // FILE *index_fp = popen(index_cmd.c_str(), "r"); + // if (index_fp == NULL) + // { + // std::cerr << "ERROR: Could not open pipe for command: " << index_cmd << std::endl; + // exit(1); + // } + + // // Read the output of the command + // const int error_size = 256; + // char index_error[error_size]; + // while (fgets(index_error, error_size, index_fp) != NULL) + // { + // std::cerr << "ERROR: " << index_error << std::endl; + // exit(1); + // } + // pclose(index_fp); // Close the process + + // // Filter variants by depth, quality, and region + // if (this->input_data.getVerbose()) { + // std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl; + // } + + // // Check if a region was specified by the user + // std::string region_str = chr; + // if (this->input_data.isRegionSet()) + // { + // std::pair region = this->input_data.getRegion(); + // region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second); + // } + + // std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf"; + // int thread_count = this->input_data.getThreadCount(); + // // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; + // std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; + // if (this->input_data.getVerbose()) { + // std::cout << "Filtering SNPs by depth and quality..." 
<< std::endl; + // std::cout << "Command: " << cmd << std::endl; + // } + // system(cmd.c_str()); + + // if (this->input_data.getVerbose()) { + // std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl; + // } + + // // Extract B-allele frequency data from the VCF file and sort by chromosome + // // and position + // if (this->input_data.getVerbose()) { + // std::cout << "Extracting B-allele frequency data from filtered SNPs..." << std::endl; + // } + // cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null"; + // FILE *fp = popen(cmd.c_str(), "r"); + // if (fp == NULL) + // { + // std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl; + // exit(1); + // } + + // // Read the reference and alternate allele depths from the VCF file + // std::string alt_allele = ""; // Alternate allele + // uint32_t pos = 0; + // int ref_ad = 0; + // int alt_ad = 0; + // const int line_size = 1024; + // char line[line_size]; // Line buffer + // std::vector locations; + // std::vector bafs; + // std::string chr_no_prefix = removeChrPrefix(chr); + // while (fgets(line, line_size, fp) != NULL) + // { + // // Parse the line + // char *tok = strtok(line, ","); // Tokenize the line + // int col = 0; // Column index + // while (tok != NULL) + // { + // // Get the position from column 2 + // if (col == 0) + // { + // pos = (uint32_t)atoi(tok); + // } + + // // Get the AD for the reference allele from column 3 + // else if (col == 1) + // { + // ref_ad = atoi(tok); + // } + + // // Get the AD for the non-reference allele from column 4 + // else if (col == 2) + // { + // alt_ad = atoi(tok); + // } + + // // Move to the next token + // tok = strtok(NULL, ","); + // col++; + // } + + // // Calculate the B-allele frequency (BAF) as the ratio of the alternate + // // allele depth to the total depth (reference + alternate) + // double baf = (double) alt_ad / (double) (ref_ad + alt_ad); + + // // Add a new location and 
BAF value to the chromosome's SNP data + // // (population frequency and log2 ratio will be added later) + // // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf); + // this->snp_baf_map[pos] = baf; + // this->snp_baf_keys.insert(pos); + // } + + // pclose(fp); // Close the process if (this->input_data.getVerbose()) { std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl; } } -void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) +// void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) +// { +// // Get the population frequency file for the chromosome +// std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); +// if (pfb_filepath.empty()) +// { +// std::cout << "No population frequency file provided for chromosome " << chr << std::endl; +// return; +// } + +// // Determine the ethnicity-specific allele frequency key +// std::string AF_key = "AF"; +// if (this->input_data.getEthnicity() != "") +// { +// AF_key += "_" + this->input_data.getEthnicity(); +// } + +// // Check if the filepath uses the 'chr' prefix notations based on the +// // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz) +// std::string chr_gnomad = chr; // gnomAD data may or may not have the 'chr' prefix +// std::string chr_prefix = "chr"; +// if (pfb_filepath.find(chr_prefix) == std::string::npos) +// { +// // Remove the 'chr' prefix from the chromosome name +// if (chr_gnomad.find(chr_prefix) != std::string::npos) +// { +// chr_gnomad = chr_gnomad.substr(chr_prefix.length()); +// } +// } else { +// // Add the 'chr' prefix to the chromosome name +// if (chr_gnomad.find(chr_prefix) == std::string::npos) +// { +// chr_gnomad = chr_prefix + chr; +// } +// } + +// // Remove the 'chr' prefix from the chromosome name for SNP data. 
All +// // SNP data in this program does not use the 'chr' prefix +// std::string chr_no_prefix = removeChrPrefix(chr); + +// std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; +// int thread_count = this->input_data.getThreadCount(); + +// // Open the population frequency file +// std::cout << "Opening population frequency file: " << pfb_filepath << std::endl; +// htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r"); +// if (!pfb_file) +// { +// throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath); +// } + +// // Enable multi-threading +// std::cout << "Setting number of threads to " << thread_count << std::endl; +// hts_set_threads(pfb_file, thread_count); + +// // Read the header +// std::cout << "Reading header from population frequency file..." << std::endl; +// bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file); +// if (!pfb_header) +// { +// bcf_close(pfb_file); +// throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath); +// } + +// // Set up the record +// std::cout << "Initializing BCF record..." << std::endl; +// bcf1_t *pfb_record = bcf_init(); +// if (!pfb_record) +// { +// bcf_hdr_destroy(pfb_header); +// bcf_close(pfb_file); +// throw std::runtime_error("ERROR: Could not initialize BCF record."); +// } + +// // Read the population frequencies for the chromosome +// std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl; +// int print_count = 0; +// while (bcf_read(pfb_file, pfb_header, pfb_record) == 0) +// { +// // Get the chromosome and position +// // std::cout << "Reading record..." 
<< std::endl; +// uint32_t pos = pfb_record->pos + 1; // 0-based to 1-based + +// // Skip if not a SNP, or if the position is not in the BAF map +// if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0) +// { +// continue; +// } + +// // Get the population frequency for the SNP +// float *pfb_f = NULL; +// int count = 0; +// int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); +// if (pfb_status < 0 || count == 0) +// { +// std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; +// continue; +// } +// double pfb = (double) pfb_f[0]; +// free(pfb_f); + +// // Continue if the population frequency is outside the threshold +// if (pfb <= MIN_PFB || pfb >= MAX_PFB) +// { +// continue; +// } + +// // Add the population frequency to the SNP data +// // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); +// if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end()) +// { +// this->snp_pfb_map[pos] = pfb; +// } else { +// // Keep the larger population frequency +// if (pfb > this->snp_pfb_map[pos]) +// { +// this->snp_pfb_map[pos] = pfb; +// } +// } +// if (print_count < 10) +// { +// std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; +// print_count++; +// } +// } +// std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl; +// } + +void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map) { // Get the population frequency file for the chromosome std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); @@ -848,152 +1283,278 @@ void CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; int thread_count = this->input_data.getThreadCount(); - // Open the population frequency file - std::cout << 
"Opening population frequency file: " << pfb_filepath << std::endl; - htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r"); - if (!pfb_file) + // Initialize the synced reader + bcf_srs_t *pfb_reader = bcf_sr_init(); + if (!pfb_reader) { - throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath); + throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath); } - // Enable multi-threading - std::cout << "Setting number of threads to " << thread_count << std::endl; - hts_set_threads(pfb_file, thread_count); + // Set multi-threading + bcf_sr_set_threads(pfb_reader, thread_count); - // Read the header - std::cout << "Reading header from population frequency file..." << std::endl; - bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file); - if (!pfb_header) + // Enable index usage + pfb_reader->require_index = 1; + + // Add the population frequency file to the synced reader + if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { - bcf_close(pfb_file); - throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath); + bcf_sr_destroy(pfb_reader); + throw std::runtime_error("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath); } - // Set up the record - std::cout << "Initializing BCF record..." 
<< std::endl; - bcf1_t *pfb_record = bcf_init(); - if (!pfb_record) + // Set the region for the synced reader + std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0) { - bcf_hdr_destroy(pfb_header); - bcf_close(pfb_file); - throw std::runtime_error("ERROR: Could not initialize BCF record."); + bcf_sr_destroy(pfb_reader); + throw std::runtime_error("ERROR: Could not set region for synced reader: " + region_str); + } else { + std::cout << "Successfully set region for synced reader: " << region_str << std::endl; } - // Read the population frequencies for the chromosome - std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl; - int print_count = 0; - while (bcf_read(pfb_file, pfb_header, pfb_record) == 0) + // Iterate through the records in the population frequency file + // bcf1_t *pfb_record = bcf_init(); + // if (!pfb_record) + // { + // bcf_sr_destroy(pfb_reader); + // throw std::runtime_error("ERROR: Could not initialize BCF record for population frequency file: " + pfb_filepath); + // } + + int test_count = 0; + std::cout << "Iterating through records..." << std::endl; + while (bcf_sr_next_line(pfb_reader) >= 0) { - // Get the chromosome and position // std::cout << "Reading record..." 
<< std::endl; - // std::string record_chr = bcf_hdr_id2name(pfb_header, pfb_record->rid); - uint32_t pos = pfb_record->pos + 1; // 0-based to 1-based - - // Skip if not a SNP, or if the position is not in the BAF map - // if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.find(pos) == this->snp_baf_keys.end()) - if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0) + // pfb_record = bcf_sr_get_line(pfb_reader, 0); + bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); + // Do something with the record + if (pfb_record) { - continue; - } + // Skip if not a SNP + if (!bcf_is_snp(pfb_record)) + { + // std::cout << "Skipping non-SNP at " << chr << ":" << pfb_record->pos << std::endl; + continue; + } - // Get the population frequency for the SNP - // std::cout << "Getting population frequency..." << std::endl; - // double pfb = 0.0; - // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb, NULL); - // if (pfb_status < 0) - // { - // continue; - // } - float *pfb_f = NULL; - int count = 0; - int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); - if (pfb_status < 0 || count == 0) - { - std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; - continue; - } - double pfb = (double) pfb_f[0]; - free(pfb_f); + uint32_t pos = (uint32_t) pfb_record->pos + 1; // 0-based to 1-based - // Continue if the population frequency is outside the threshold - if (pfb <= MIN_PFB || pfb >= MAX_PFB) - { - continue; - } + // Get the population frequency for the SNP + float *pfb_f = NULL; + int count = 0; + int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); + if (pfb_status < 0 || count == 0) + { + // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; + continue; + } + double pfb = (double) pfb_f[0]; + free(pfb_f); - // Add the population frequency to the SNP data - // 
snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); - if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end()) - { - this->snp_pfb_map[pos] = pfb; - } else { - // Keep the larger population frequency - if (pfb > this->snp_pfb_map[pos]) + // Continue if the population frequency is outside the threshold + if (pfb <= MIN_PFB || pfb >= MAX_PFB) { - this->snp_pfb_map[pos] = pfb; + continue; + } + + // Add the population frequency to the SNP data + if (snp_pfb_map.find(pos) == snp_pfb_map.end()) + { + snp_pfb_map[pos] = pfb; + } else { + // Keep the larger population frequency + if (pfb > snp_pfb_map[pos]) + { + snp_pfb_map[pos] = pfb; + } + } + + if (test_count < 10) + { + std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + test_count++; } } - if (print_count < 10) - { - std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - print_count++; - } + // std::cout << "Record: " << pfb_record->pos << std::endl; + // std::cout << "QUAL: " << pfb_record->qual << std::endl; + + // // Skip if not a SNP + // if (!bcf_is_snp(pfb_record)) + // { + // std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl; + // continue; + // } + + // // Get the population frequency for the SNP + // float *pfb_f = NULL; + // int count = 0; + // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); + // if (pfb_status < 0 || count == 0) + // { + // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; + // continue; + // } + // double pfb = (double) pfb_f[0]; + // free(pfb_f); + + // // Continue if the population frequency is outside the threshold + // if (pfb <= MIN_PFB || pfb >= MAX_PFB) + // { + // continue; + // } + + // // Add the population frequency to the SNP data + // // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); + // if (snp_pfb_map.find(pos) == snp_pfb_map.end()) + // { + // snp_pfb_map[pos] = pfb; + 
// } else { + // // Keep the larger population frequency + // if (pfb > snp_pfb_map[pos]) + // { + // snp_pfb_map[pos] = pfb; + // } + // } + // if (print_count < 10) + // { + // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + // print_count++; + // } } + } + if (pfb_reader->errnum) + { + std::cerr << "ERROR: " <errnum) << std::endl; } - // // Run bcftools query to get the population frequencies for the - // // chromosome within the SNP region, filtering for SNPS only, - // // and within the MIN-MAX range of frequencies. - // std::string snps_fp = this->input_data.getOutputDir() + "/filtered_snps.vcf"; - // std::string filter_criteria = "INFO/variant_type=\"snv\" && " + AF_key + " >= " + std::to_string(MIN_PFB) + " && " + AF_key + " <= " + std::to_string(MAX_PFB); - // std::string cmd = \ - // "bcftools view --threads " + std::to_string(thread_count) + " -T " + snps_fp + " -i '" + filter_criteria + "' " + pfb_filepath + " | bcftools query -f '%POS\t%" + AF_key + "\n' 2>/dev/null"; - - // // printMessage("Running command: " + cmd); - // std::cout << "Running command: " << cmd << std::endl; + // std::cout << "Test count: " << test_count << std::endl; - // // Open a pipe to read the output of the command - // FILE *fp = popen(cmd.c_str(), "r"); - // if (fp == NULL) + // Clean up + // bcf_destroy(pfb_record); + bcf_sr_destroy(pfb_reader); + std::cout << "Finished reading population frequencies for SV region" << std::endl; + + + // // Open the population frequency file + // std::cout << "Opening population frequency file: " << pfb_filepath << std::endl; + // htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r"); + // if (!pfb_file) + // { + // throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath); + // } + + // // Enable multi-threading + // std::cout << "Setting number of threads to " << thread_count << std::endl; + // hts_set_threads(pfb_file, thread_count); + + // // Read the 
header + // std::cout << "Reading header from population frequency file..." << std::endl; + // bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file); + // if (!pfb_header) + // { + // bcf_close(pfb_file); + // throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath); + // } + + // // Load the index + // hts_idx_t *pfb_index = bcf_index_load(pfb_filepath.c_str()); + // if (!pfb_index) + // { + // bcf_hdr_destroy(pfb_header); + // bcf_close(pfb_file); + // throw std::runtime_error("ERROR: Could not load index for population frequency file: " + pfb_filepath); + // } + + // // Construct the region string + // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + // hts_itr_t *pfb_iter = bcf_itr_querys(pfb_index, pfb_header, region_str.c_str()); + // if (!pfb_iter) + // { + // // Try using the other chromosome notation + // std::string alt_region_str = "chr" + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + // pfb_iter = bcf_itr_querys(pfb_index, pfb_header, alt_region_str.c_str()); + // if (!pfb_iter) + // { + // hts_idx_destroy(pfb_index); + // bcf_hdr_destroy(pfb_header); + // bcf_close(pfb_file); + // throw std::runtime_error("ERROR: Could not create iterator for region: " + alt_region_str); + // } else { + // region_str = alt_region_str; + // std::cout << "Successfully created iterator for region: " << region_str << std::endl; + // } + // // hts_idx_destroy(pfb_index); + // // bcf_hdr_destroy(pfb_header); + // // bcf_close(pfb_file); + // // throw std::runtime_error("ERROR: Could not create iterator for region: " + region_str); + // } + + // // Set up the record + // std::cout << "Initializing BCF record..." 
<< std::endl; + // bcf1_t *pfb_record = bcf_init(); + // if (!pfb_record) // { - // throw std::runtime_error("ERROR: Could not open pipe for command: " + cmd); + // bcf_hdr_destroy(pfb_header); + // bcf_close(pfb_file); + // throw std::runtime_error("ERROR: Could not initialize BCF record."); // } - // // Loop through the BCFTOOLS output and populate the map of population - // // frequencies - // // printMessage("Parsing population frequencies for chromosome " + chr + - // // "..."); - // std::cout << "Parsing population frequencies for chromosome " << chr << "..." << std::endl; - // // const int line_size = 256; - // // char line[line_size]; + // // Read the population frequencies for the region + // std::cout << "[TEST] Reading population frequencies for region " << region_str << " (AF_key = " << AF_key << ")..." << std::endl; // int print_count = 0; - // // while (fgets(line, line_size, fp) != NULL) - // char line[2048]; - // while (fgets(line, sizeof(line), fp) != NULL) + // int test_count = 0; + // while (bcf_itr_next(pfb_file, pfb_iter, pfb_record) >= 0) // { - // std::istringstream iss(line); - // // Parse the line - // int pos; - // double pfb; - // // if (sscanf(line, "%d\t%lf", &pos, &pfb) == 2) - // if (iss >> pos >> pfb){ - // // pos_pfb_map[pos] = pfb; // Add the position and population - // // frequency to the map - // // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); - - // // Print the first 10 population frequencies - // if (print_count < 10) + // test_count++; + // // Get the chromosome and position + // // std::cout << "Reading record..." 
<< std::endl; + // uint32_t pos = pfb_record->pos + 1; // 0-based to 1-based + + // // Skip if not a SNP + // if (!bcf_is_snp(pfb_record)) + // { + // std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl; + // continue; + // } + + // // Get the population frequency for the SNP + // float *pfb_f = NULL; + // int count = 0; + // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); + // if (pfb_status < 0 || count == 0) + // { + // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; + // continue; + // } + // double pfb = (double) pfb_f[0]; + // free(pfb_f); + + // // Continue if the population frequency is outside the threshold + // if (pfb <= MIN_PFB || pfb >= MAX_PFB) + // { + // continue; + // } + + // // Add the population frequency to the SNP data + // // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); + // if (snp_pfb_map.find(pos) == snp_pfb_map.end()) + // { + // snp_pfb_map[pos] = pfb; + // } else { + // // Keep the larger population frequency + // if (pfb > snp_pfb_map[pos]) // { - // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - // // printMessage("Population frequency for " + chr + ":" + - // // std::to_string(pos) + " = " + std::to_string(pfb)); - // this->snp_pfb_map[pos] = pfb; - // print_count++; + // snp_pfb_map[pos] = pfb; // } // } + // if (print_count < 10) + // { + // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + // print_count++; + // } // } - // pclose(fp); - std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl; - // printMessage("Finished parsing population frequencies for chromosome " + chr + "..."); + // std::cout << "Finished reading population frequencies for region " << region_str << std::endl; + // std::cout << "Test count: " << test_count << std::endl; } void CNVCaller::saveSVCopyNumberToTSV(SNPData& 
snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) @@ -1088,11 +1649,9 @@ void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, doubl snp_data.is_snp.emplace_back(is_snp); } -std::tuple, std::vector, std::vector> CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end) +void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb) { - // Lock the mutex for reading SNP information - // std::lock_guard lock(this->snp_info_mtx); - + std::string snp_chr = chr; chr = removeChrPrefix(chr); // Create an ordered map of SNP positions to BAF and PFB values @@ -1100,35 +1659,72 @@ std::tuple, std::vector, std::vector> CNVC // Query SNPs within a range (start, end) and return their BAF and PFB // values as separate vectors - std::vector bafs; - std::vector pfbs; - std::vector pos; + // std::vector bafs; + // std::vector pfbs; + // std::vector pos; double pfb_default = 0.5; + // Read the SNP data from the VCF file + this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf); + // Query the SNPs within the range and return their BAFs and corresponding // positions - auto snp_start = this->snp_baf_keys.lower_bound(start); - auto snp_end = this->snp_baf_keys.upper_bound(end); + // auto snp_start = this->snp_baf_keys.lower_bound(start); + // auto snp_end = this->snp_baf_keys.upper_bound(end); + // if (snp_start == this->snp_baf_keys.end()) + // { + // // return std::make_tuple(pos, bafs, pfbs); + // return; + // } - if (snp_start == this->snp_baf_keys.end()) - { - return std::make_tuple(pos, bafs, pfbs); - } + // Query the population frequencies for the SNPs + std::unordered_map pfb_map; + this->readSNPPopulationFrequencies(chr, start, end, pfb_map); - for (auto it = snp_start; it != snp_end; it++) + // Filter out the SNP population frequencies that are not in the SNP + // position set 
+ // std::unordered_map snp_pfb; + for (auto& pos : snp_pos) { - uint32_t snp_pos = *it; - pos.push_back(snp_pos); - bafs.push_back(this->snp_baf_map[snp_pos]); - - // Get the PFB value for the SNP - if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end()) + if (pfb_map.find(pos) != pfb_map.end()) { - pfbs.push_back(this->snp_pfb_map[snp_pos]); + snp_pfb[pos] = pfb_map[pos]; } else { - pfbs.push_back(pfb_default); + snp_pfb[pos] = pfb_default; } } + + // // Get the PFB values for the SNPs from the keys + // // Create the PFB vector using the SNP positions (loop through snp_pos, + // // query the pfb_map, and push the value to the vector) + // for (size_t i = 0; i < snp_pos.size(); i++) + // { + // uint32_t snp_pos = snp_pos[i]; + // double pfb = pfb_default; + // if (pfb_map.find(snp_pos) != pfb_map.end()) + // { + // pfb = pfb_map[snp_pos]; + // } else { + // pfb = pfb_default; + // } + // snp_pfb.push_back(pfb); + // } + + // // Get the PFB values for the SNPs from the keys + // for (auto it = snp_start; it != snp_end; it++) + // { + // uint32_t snp_pos = *it; + // pos.push_back(snp_pos); + // bafs.push_back(this->snp_baf_map[snp_pos]); + + // // Get the PFB value for the SNP + // if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end()) + // { + // pfbs.push_back(this->snp_pfb_map[snp_pos]); + // } else { + // pfbs.push_back(pfb_default); + // } + // } // auto& baf_bst = this->snp_baf_map[chr]; // auto baf_start = baf_bst.lower_bound({start, 0.0}); // auto baf_end = baf_bst.upper_bound({end, 0.0}); @@ -1147,5 +1743,5 @@ std::tuple, std::vector, std::vector> CNVC // } // } - return std::make_tuple(pos, bafs, pfbs); + // return std::make_tuple(pos, bafs, pfbs); } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index b2c5827f..327f3fd2 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -171,12 +171,12 @@ void mergeSVs(std::set& sv_calls) { uint32_t current_length = current_merge.end - current_merge.start; uint32_t next_length = next.end - 
next.start; - // Merge the SV calls if the overlap is at least 50% of the current or - // next SV call - double overlap_pct_current = static_cast(overlap_length) / current_length; - double overlap_pct_next = static_cast(overlap_length) / next_length; + // Merge the SV calls if the overlap is > 0 + //double overlap_pct_current = static_cast(overlap_length) / current_length; + //double overlap_pct_next = static_cast(overlap_length) / next_length; - if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) { + //if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) { + if (overlap_length > 0) { // Merge the SV calls based on the likelihood if (next.hmm_likelihood != 0.0) { // Update the likelihood if the next SV call has a likelihood @@ -197,7 +197,7 @@ void mergeSVs(std::set& sv_calls) { // } } } else { - // No overlap: Save the SV and continue + // No overlap: Save the previous SV and continue merged_sv_calls.push_back(current_merge); current_merge = next; } From 53473af46d40f052b31ad263682b810223c183e4 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 14:11:42 -0500 Subject: [PATCH 022/134] Fix reading AFs --- src/cnv_caller.cpp | 155 +++++++++++++++++++++++---------------------- 1 file changed, 81 insertions(+), 74 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 4fcc1604..f2602d88 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -700,14 +700,12 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui throw std::runtime_error("ERROR: Could not initialize SNP reader."); } - // Read the SNP header - htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r"); - bcf_hdr_t *snp_header = bcf_hdr_read(snp_file); - if (!snp_header) + // Set the region + std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { bcf_sr_destroy(snp_reader); - bcf_close(snp_file); - throw 
std::runtime_error("ERROR: Could not initialize SNP header."); + throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str); } // Set multi-threading @@ -721,30 +719,48 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) { bcf_sr_destroy(snp_reader); - bcf_hdr_destroy(snp_header); - bcf_close(snp_file); throw std::runtime_error("ERROR: Could not add SNP file to reader: " + snp_filepath); } - // Set the region - std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) + // Get the header + bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0); + if (!snp_header) { bcf_sr_destroy(snp_reader); - bcf_hdr_destroy(snp_header); - bcf_close(snp_file); - throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str); + throw std::runtime_error("ERROR: Could not get header for SNP reader."); } std::cout << "Iterating through SNPs in region " << region_str << "..." 
<< std::endl; int print_count = 0; - while (bcf_sr_next_line(snp_reader) >= 0) + int record_count = 0; + int duplicate_count = 0; + uint32_t last_pos = 0; + while (bcf_sr_next_line(snp_reader) > 0) { + if (!bcf_sr_has_line(snp_reader, 0)) + { + continue; + } bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0); if (snp_record) { + record_count++; uint32_t pos = (uint32_t)snp_record->pos + 1; + // Skip if 3 or more duplicate positions found + // if (pos == last_pos) + // { + // duplicate_count++; + // if (duplicate_count >= 10) + // { + // std::cerr << "ERROR: 3 or more duplicate positions found in SNP file at " << chr << ":" << pos << std::endl; + // break; + // } + // } else { + // duplicate_count = 0; + // } + // last_pos = pos; + // Skip if not a SNP if (!bcf_is_snp(snp_record)) { @@ -755,7 +771,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui float qual = snp_record->qual; if (bcf_float_is_missing(qual)) { - std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl; + // std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl; } // Skip if quality is less than 30 if (qual <= 30) @@ -766,33 +782,28 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Extract DP from FORMAT field int32_t *dp = 0; int dp_count = 0; - // int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP", - // &dp, &dp_count); int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count); + bool dp_skip = false; if (dp_ret < 0) { - std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl; + // std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl; } else { // Skip if depth is not greater than 10 for (int i = 0; i < dp_count; i++) { if (dp[i] <= 10) { - continue; + dp_skip = true; + break; } } - // if (dp <= 10) - // { - // continue; - // } } free(dp); - // // Skip 
if depth is not greater than 10 - // if (dp <= 10) - // { - // continue; - // } - + if (dp_skip) + { + continue; + } + // Skip if the SNP does not pass the filter if (bcf_has_filter(snp_header, snp_record, const_cast("PASS")) != 1) { @@ -800,24 +811,20 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } // Extract AD from FORMAT field - // int32_t ad[2] = {0, 0}; int32_t *ad = 0; int ad_count = 0; - // int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", - // &ad, &ad_count); int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); // Skip if AD value is missing if (ad_ret < 0) { - std::cerr << "ERROR: AD value is missing for SNP at " << chr << ":" << pos << std::endl; - continue; + // std::cerr << "ERROR: AD value is missing for SNP at " << chr + // << ":" << pos << std::endl; + throw std::runtime_error("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos)); } // Calculate the B-allele frequency (BAF) double baf = 0.0; - // double ad0 = (double) ad[0]; - // double ad1 = (double) ad[1]; double ad0 = 0.0; double ad1 = 0.0; for (int i = 0; i < ad_count; i++) @@ -830,34 +837,29 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } } free(ad); - try { - // std::cout << "AD[0]: " << ad0 << ", AD[1]: " << ad1 << std::endl; - baf = ad1 / (ad0 + ad1); - // std::cout << "AD[0]: " << ad[0] << ", AD[1]: " << ad[1] << std::endl; - // baf = (double) ad[1] / (double) (ad[0] + ad[1]); - } catch (const std::exception& e) { - std::cerr << "ERROR: Could not calculate BAF for SNP at " << chr << ":" << pos << std::endl; - continue; - } + baf = ad1 / (ad0 + ad1); // Insert the SNP position and BAF into the maps snp_pos.insert(pos); snp_baf[pos] = baf; // Print the SNP position and BAF - if (print_count < 10) - { - std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl; - print_count++; - } + // std::cout 
<< "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl; + // print_count++; + // if (print_count < 10) + // { + // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl; + // print_count++; + // } } } + std::cout << "[TEST] SNP record count: " << record_count << std::endl; + // Clean up std::cout << "Cleaning up SNP reader..." << std::endl; bcf_sr_destroy(snp_reader); - bcf_hdr_destroy(snp_header); - bcf_close(snp_file); + std::cout << "Finished reading SNP allele frequencies for chromosome " << chr << std::endl; // std::cout << "Opening SNP file: " << snp_filepath << std::endl; // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r"); @@ -1290,6 +1292,14 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath); } + // Set the region for the synced reader + std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0) + { + bcf_sr_destroy(pfb_reader); + throw std::runtime_error("ERROR: Could not set region for synced reader: " + region_str); + } + // Set multi-threading bcf_sr_set_threads(pfb_reader, thread_count); @@ -1303,34 +1313,30 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos throw std::runtime_error("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath); } - // Set the region for the synced reader - std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0) + // Get the header + bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); + if (!pfb_header) { bcf_sr_destroy(pfb_reader); - throw std::runtime_error("ERROR: Could 
not set region for synced reader: " + region_str); - } else { - std::cout << "Successfully set region for synced reader: " << region_str << std::endl; + throw std::runtime_error("ERROR: Could not get header for population frequency file: " + pfb_filepath); } - // Iterate through the records in the population frequency file - // bcf1_t *pfb_record = bcf_init(); - // if (!pfb_record) - // { - // bcf_sr_destroy(pfb_reader); - // throw std::runtime_error("ERROR: Could not initialize BCF record for population frequency file: " + pfb_filepath); - // } - int test_count = 0; - std::cout << "Iterating through records..." << std::endl; - while (bcf_sr_next_line(pfb_reader) >= 0) + int record_count = 0; + std::cout << "Iterating through records for region " << region_str << "..." << std::endl; + while (bcf_sr_next_line(pfb_reader) > 0) { + if (!bcf_sr_has_line(pfb_reader, 0)) + { + continue; + } // std::cout << "Reading record..." << std::endl; // pfb_record = bcf_sr_get_line(pfb_reader, 0); bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); // Do something with the record if (pfb_record) { + record_count++; // Skip if not a SNP if (!bcf_is_snp(pfb_record)) { @@ -1370,11 +1376,11 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos } } - if (test_count < 10) - { - std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - test_count++; - } + // if (test_count < 10) + // { + // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; + // test_count++; + // } } // std::cout << "Record: " << pfb_record->pos << std::endl; // std::cout << "QUAL: " << pfb_record->qual << std::endl; @@ -1428,6 +1434,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos } // std::cout << "Test count: " << test_count << std::endl; + std::cout << "Record count: " << record_count << std::endl; // Clean up // bcf_destroy(pfb_record); From 
4b83e62b514d6b1bb5dcfc179a571027b92105fa Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 15:50:46 -0500 Subject: [PATCH 023/134] Fix mean chr cov --- python/cnv_plots.py | 2 +- src/cnv_caller.cpp | 45 +++++++++++++++++++++++++++++++-------------- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/python/cnv_plots.py b/python/cnv_plots.py index ec9ba842..67c831c6 100644 --- a/python/cnv_plots.py +++ b/python/cnv_plots.py @@ -76,7 +76,7 @@ def run(cnv_data_file, output_html): line = f.readline().strip() if '=' in line: key, value = line.split("=") - log.info("Metadata: %s=%s", key, value) + # log.info("Metadata: %s=%s", key, value) value = value.strip() metadata[key] = value diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index f2602d88..0c6c432a 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -74,6 +74,7 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta // window, then calculate the log2 ratio for each window for (uint32_t i = start_pos; i <= end_pos; i += window_size) { + // std::cout << "Querying SNP region for " << chr << ":" << i << "-" << std::min(i + window_size - 1, end_pos) << std::endl; // Run a sliding non-overlapping window of size window_size across // the SV region and calculate the log2 ratio for each window uint32_t window_start = i; @@ -98,6 +99,8 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta // after the SNP, and continue until the end of the window // (If there are no SNPs in the window, then use the default BAF and // PFB values, and the coverage log2 ratio) + + // If no SNPs, then calculate the log2 ratio for the window if (snp_window_pos.size() == 0) { double window_log2_ratio = calculateLog2Ratio(window_start, window_end, pos_depth_map, mean_chr_cov); @@ -117,6 +120,20 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta uint32_t bin_start = snp_window_pos[j] - window_size / 2; uint32_t bin_end = snp_window_pos[j] + window_size / 2; + // 
Trim the bin start and end to 1/2 the distance from the + // neighboring SNPs (or the start/end of the window) + if (j > 0) + { + bin_start = std::max(bin_start, (snp_window_pos[j-1] + snp_window_pos[j]) / 2); + } + + if (j < (int) snp_window_pos.size() - 1) + { + bin_end = std::min(bin_end, (snp_window_pos[j] + snp_window_pos[j+1]) / 2); + } + // std::cout << "bin_start: " << bin_start << std::endl; + // std::cout << "bin_end: " << bin_end << std::endl; + // Calculate the log2 ratio for the SNP bin double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov); this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); @@ -582,6 +599,11 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) std::unordered_map chr_pos_depth_map; while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) { + // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads + if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP) + { + continue; + } // Parse the CIGAR string to get the depth (match, sequence match, and // mismatch) @@ -622,11 +644,18 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) bam_hdr_destroy(bam_header); sam_close(bam_file); - // Calculate the mean chromosome coverage + // Calculate the mean chromosome coverage for positions with non-zero depth uint64_t cum_depth = 0; uint32_t pos_count = 0; for (auto& pos_depth : chr_pos_depth_map) { + // if (pos_depth.second > 0) + // { + // cum_depth += pos_depth.second; + // pos_count++; + // } else { + // std::cout << "Zero depth at position " << pos_depth.first << std::endl; + // } cum_depth += pos_depth.second; pos_count++; } @@ -730,7 +759,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui throw std::runtime_error("ERROR: Could not get header for SNP reader."); } - std::cout << "Iterating through 
SNPs in region " << region_str << "..." << std::endl; + // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl; int print_count = 0; int record_count = 0; int duplicate_count = 0; @@ -854,12 +883,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } } - std::cout << "[TEST] SNP record count: " << record_count << std::endl; - // Clean up - std::cout << "Cleaning up SNP reader..." << std::endl; bcf_sr_destroy(snp_reader); - std::cout << "Finished reading SNP allele frequencies for chromosome " << chr << std::endl; // std::cout << "Opening SNP file: " << snp_filepath << std::endl; // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r"); @@ -1281,8 +1306,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos // Remove the 'chr' prefix from the chromosome name for SNP data. All // SNP data in this program does not use the 'chr' prefix std::string chr_no_prefix = removeChrPrefix(chr); - - std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; int thread_count = this->input_data.getThreadCount(); // Initialize the synced reader @@ -1323,7 +1346,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos int test_count = 0; int record_count = 0; - std::cout << "Iterating through records for region " << region_str << "..." 
<< std::endl; while (bcf_sr_next_line(pfb_reader) > 0) { if (!bcf_sr_has_line(pfb_reader, 0)) @@ -1433,14 +1455,9 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos std::cerr << "ERROR: " <errnum) << std::endl; } - // std::cout << "Test count: " << test_count << std::endl; - std::cout << "Record count: " << record_count << std::endl; - // Clean up // bcf_destroy(pfb_record); bcf_sr_destroy(pfb_reader); - std::cout << "Finished reading population frequencies for SV region" << std::endl; - // // Open the population frequency file // std::cout << "Opening population frequency file: " << pfb_filepath << std::endl; From 302598e806fcb6001b2448348555a18b6e8159e0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 15:57:52 -0500 Subject: [PATCH 024/134] Update gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 8284049d..f2253893 100644 --- a/.gitignore +++ b/.gitignore @@ -75,6 +75,7 @@ data/gnomadv3_filepaths.txt data/gnomadv4_filepaths.txt data/gnomadv4_filepaths_ssd.txt data/gnomadv4_hg19_filepaths.txt +data/gnomadv4_hg19_filepaths_ssd.txt # Training data data/sv_scoring_dataset/ From 9d3164c03aa758221f1fcb78adffe7ee17360cf8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 16:03:29 -0500 Subject: [PATCH 025/134] Update test --- tests/test_general.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_general.py b/tests/test_general.py index 1d81fe96..689dda55 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 32 + assert len(f.readlines()) == 30 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. 
@@ -78,11 +78,11 @@ def test_run(): fields = line.strip().split('\t') assert fields[0] == "21" assert fields[1] == "14458394" - assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1341;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000" + assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS;CLIPSUP=0;REPTYPE=NA;HMM=0.000000" elif i == header_line + 2: fields = line.strip().split('\t') assert fields[0] == "21" - assert fields[1] == "14458394" - assert fields[7] == "END=14458394;SVTYPE=INS;SVLEN=1344;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARINS,;CLIPSUP=0;REPTYPE=NA;HMM=-179769313486231570814527423731704356798070567525844996598917476803157260780028538760589558632766878171540458953514382464234321326889464182768467546703537516986049910576551282076245490090389328944075868508455133942304583236903222948165808559332123348274797826204144723168738177180919299881250404026184124858368.000000" + assert fields[1] == "14469910" + assert fields[7] == "END=14470078;SVTYPE=DEL;SVLEN=-168;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARDEL;CLIPSUP=0;REPTYPE=NA;HMM=0.000000" break \ No newline at end of file From 2077f2636cc4d463bed44b1b1d15b566da073286 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 22:57:14 -0500 Subject: [PATCH 026/134] Fix trimming --- include/sv_caller.h | 2 + src/sv_caller.cpp | 621 ++++++++++---------------------------------- 2 files changed, 138 insertions(+), 485 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 184248d7..cc9f2630 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -56,6 +56,8 @@ class SVCaller { void saveToVCF(const 
std::unordered_map>& sv_calls); + void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment); + public: explicit SVCaller(InputData& input_data); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 228a93a2..72d61b14 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -463,6 +463,8 @@ std::unordered_map> SVCaller::run() std::set& subregion_sv_calls = std::get<0>(region_data); PrimaryMap& primary_map = std::get<1>(region_data); SuppMap& supp_map = std::get<2>(region_data); + std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; + mergeSVs(subregion_sv_calls); // SVData& subregion_sv_calls = std::get<0>(region_data); // PrimaryMap& primary_map = std::get<1>(region_data); // SuppMap& supp_map = std::get<2>(region_data); @@ -550,515 +552,107 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p int32_t primary_query_end = std::get<5>(primary_alignment); std::unordered_map primary_match_map = std::get<6>(primary_alignment); // bool primary_strand = std::get<7>(primary_alignment); + + // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { continue; } - // // Resolve overlaps between the primary and supplementary query - // // sequences - // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - // std::string supp_chr = std::get<0>(*it); - // // int32_t supp_start = std::get<1>(*it); - // // int32_t supp_end = std::get<2>(*it); - // int32_t supp_query_start = std::get<4>(*it); - // int32_t supp_query_end = std::get<5>(*it); - // std::unordered_map supp_match_map = std::get<6>(*it); - // // bool supp_strand = std::get<7>(*it); - - // // Resolve overlaps between the primary and supplementary query - // // sequences - // if (primary_query_start < supp_query_end && primary_query_end > supp_query_start || supp_query_start < primary_query_end && supp_query_end > primary_query_start) { - - // // 
Calculate the mismatch rate for each alignment at the overlap - // double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end-1); - // double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end-1); - // // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; - // // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; - - // // Trim the overlap from the alignment with the higher mismatch - // // rate - // if (primary_mismatch_rate > supp_mismatch_rate) { - // if (overlap_start == primary_query_start) { - // primary_start += overlap_length; - // } else if (overlap_end == primary_query_end) { - // primary_end -= overlap_length; - // } - - // } else { - // if (overlap_start == supp_query_start) { - // // supp_start += overlap_length; - // // Update the value in the supp map - // std::get<1>(*it) += overlap_length; - // } else if (overlap_end == supp_query_end) { - // // supp_end -= overlap_length; - // // Update the value in the supp map - // std::get<2>(*it) -= overlap_length; - // } - // } - // } - // } - - // Remove supplementary alignments that are not on the same chromosome - // as the primary alignment - for (auto it = supp_map[qname].begin(); it != supp_map[qname].end();) { - if (std::get<0>(*it) != primary_chr) { - it = supp_map[qname].erase(it); - } else { - ++it; - } - } - - // Run copy number variant predictions on the primary alignment - SVType primary_type = SVType::UNKNOWN; - double primary_lh = std::numeric_limits::lowest(); - int32_t primary_lh_t = 0; - if (primary_end - primary_start >= min_cnv_length) { - SVCandidate sv_candidate(primary_start+1, primary_end+1, "."); - // std::cout << "TEST5" << std::endl; - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - primary_lh = std::get<0>(result); - // primary_log_likelihood /= (double)(primary_end - primary_start); // Normalize the log 
likelihood by the length - primary_type = std::get<1>(result); - } - - // Loop through the supplementary alignments, find the largest - // supplementary alignment, and the closest non-overlapping - // supplementary alignment to the primary alignment + // Find the largest supplementary alignment, and also identify inversions AlignmentData largest_supp_alignment = supp_map[qname][0]; - AlignmentData closest_supp_alignment = supp_map[qname][0]; int32_t largest_supp_length = 0; - int32_t closest_supp_distance = std::numeric_limits::max(); - int32_t closest_supp_length = 0; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - // const auto& supp_chr = std::get<0>(*it); + const auto& supp_chr = std::get<0>(*it); + if (primary_chr != supp_chr) { + continue; // Skip supplementary alignments on different chromosomes + } int32_t supp_start = std::get<1>(*it); int32_t supp_end = std::get<2>(*it); int32_t supp_length = supp_end - supp_start + 1; - int32_t supp_distance = std::numeric_limits::max(); - if (supp_start > primary_end) { - supp_distance = supp_start - primary_end; - } else if (supp_end < primary_start) { - supp_distance = primary_start - supp_end; - } if (supp_length > largest_supp_length) { largest_supp_length = supp_length; largest_supp_alignment = *it; } - if (supp_distance < closest_supp_distance) { - closest_supp_length = supp_length; - closest_supp_alignment = *it; - closest_supp_distance = supp_distance; - } - } - - // Run copy number variant predictions on the largest supplementary - // alignment - double largest_supp_lh = std::numeric_limits::lowest(); - SVType largest_supp_type = SVType::UNKNOWN; - int largest_supp_lh_t = 0; - if (largest_supp_length >= min_cnv_length) { - SVCandidate sv_candidate(std::get<1>(largest_supp_alignment)+1, std::get<2>(largest_supp_alignment)+1, "."); - // std::cout << "TEST1" << std::endl; - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - largest_supp_lh = 
std::get<0>(result); - // largest_supp_log_likelihood /= (double)largest_supp_length; // Normalize the log likelihood by the length - largest_supp_type = std::get<1>(result); - } - - // Run copy number variant predictions on the closest non-overlapping - // supplementary alignment (if not the same as the largest) - double closest_supp_lh = std::numeric_limits::lowest(); - SVType closest_supp_type = SVType::UNKNOWN; - int closest_supp_lh_t = 0; - if (largest_supp_alignment != closest_supp_alignment) { - if (closest_supp_length >= min_cnv_length) { - SVCandidate sv_candidate(std::get<1>(closest_supp_alignment)+1, std::get<2>(closest_supp_alignment)+1, "."); - // std::cout << "TEST2" << std::endl; - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - closest_supp_lh = std::get<0>(result); - // closest_supp_log_likelihood /= (double)closest_supp_length; // Normalize the log likelihood by the length - closest_supp_type = std::get<1>(result); - int32_t closest_supp_start = std::get<1>(closest_supp_alignment); - int32_t closest_supp_end = std::get<2>(closest_supp_alignment); - } - } - - // Define constants representing read scenarios used for SV detection - const int NOCALL = -1; // Default - const int PRIM_SUPP_BD = 0; // Primary and supplementary boundary - const int PRIM_SUPP_GAP = 1; // Primary and supplementary gap - const int SUPP_PRIM_BD = 2; // Supplementary and primary boundary - const int SUPP_PRIM_GAP = 3; // Supplementary and primary gap - - // Loop through all the supplementary alignments and find the highest - // likelihood prediction - double best_split_aln_lh = std::numeric_limits::lowest(); - double best_split_aln_lh_norm = std::numeric_limits::lowest(); - // int best_split_aln_length = 0; - SVType best_supp_type = SVType::UNKNOWN; - std::pair best_supp_candidate; - AlignmentData& best_split_alignment = supp_map[qname][0]; - int best_scenario = NOCALL; - for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); 
++it) { - int32_t supp_start = std::get<1>(*it); - int32_t supp_end = std::get<2>(*it); - bool primary_before_supp = primary_start < supp_start; - - // Create the SV candidate as the boundary of the primary and - // supplementary alignments - SVCandidate split_boundary; - SVCandidate split_gap; - bool invalid_gap = false; - if (primary_before_supp) { - split_boundary = SVCandidate(primary_start+1, supp_end+1, "."); - - // Check for an invalid gap (overlap) - if (primary_end >= supp_start) { - invalid_gap = true; - } else { - split_gap = SVCandidate(primary_end+1, supp_start+1, "."); - } - // split_gap = SVCandidate(primary_end+1, supp_start+1, "."); - - } else { - split_boundary = SVCandidate(supp_start+1, primary_end+1, "."); - - // Check for an invalid gap (overlap) - if (supp_end >= primary_start) { - invalid_gap = true; - } else { - split_gap = SVCandidate(supp_end+1, primary_start+1, "."); - } - } - // Create a vector of the two SV candidates, don't add the gap if - // it is an overlap, or if either SV is less than the minimum CNV - // length - std::vector sv_candidates; - if (!invalid_gap && std::get<1>(split_gap) - std::get<0>(split_gap) >= min_cnv_length) { - sv_candidates.push_back(split_gap); - } - if (std::get<1>(split_boundary) - std::get<0>(split_boundary) >= min_cnv_length) { - sv_candidates.push_back(split_boundary); - } - - // Continue if no SV candidates - if (sv_candidates.size() == 0) { - continue; - } - - // Run copy number variant predictions on both, and keep the - // prediction with the highest normalized log likelihood - double chosen_lh_norm = std::numeric_limits::lowest(); - SVType chosen_type = SVType::UNKNOWN; - std::pair chosen_candidate; - std::string chosen_candidate_str = "BOUNDARY"; - int split_scenario = NOCALL; - for (const auto& sv_candidate : sv_candidates) { - // std::cout << "TEST3: primary = " << primary_start << ", " << primary_end << " supp = " << supp_start << ", " << supp_end << std::endl; - // std::cout << "Position: " 
<< std::get<0>(sv_candidate) << ", " << std::get<1>(sv_candidate) << std::endl; - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - double current_lh = std::get<0>(result); - SVType current_type = std::get<1>(result); - - // Normalize the log likelihood by the state sequence length - double current_lh_norm = current_lh;// / (double)T; - // if (sv_candidate == split_boundary) { - // std::cout << "Boundary candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl; - // } else if (sv_candidate == split_gap) { - // std::cout << "Gap candidate: " << current_lh_norm << ", before normalization: " << current_lh << std::endl; - // } - - // Update the current SV candidate if the likelihood is higher - if (current_type != SVType::UNKNOWN && current_lh_norm > chosen_lh_norm) { - chosen_lh_norm = current_lh_norm; - chosen_type = current_type; - chosen_candidate = std::make_pair(std::get<0>(sv_candidate), std::get<1>(sv_candidate)); - - // Update the candidate string - if (sv_candidate == split_boundary) { - chosen_candidate_str = "BOUNDARY"; - if (primary_before_supp) { - split_scenario = PRIM_SUPP_BD; - } else { - split_scenario = SUPP_PRIM_BD; - } - } else if (sv_candidate == split_gap) { - chosen_candidate_str = "GAP"; - if (primary_before_supp) { - split_scenario = PRIM_SUPP_GAP; - } else { - split_scenario = SUPP_PRIM_GAP; - } + // Inversion detection + bool is_opposite_strand = std::get<7>(primary_alignment) != std::get<7>(*it); + if (is_opposite_strand) { + if (supp_length >= min_cnv_length) { + SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + double supp_lh = std::get<0>(result); + SVType supp_type = std::get<1>(result); + if (supp_type == SVType::NEUTRAL) { + addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "HMM", "./.", supp_lh); + sv_count++; + } else if (supp_type == 
SVType::DUP) { + addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INVDUP", ".", "HMM", "./.", supp_lh); + sv_count++; } - // std::cout << "Updated candidate: " << chosen_candidate_str << " with likelihood: " << current_lh_norm << std::endl; - } else if (current_type == SVType::UNKNOWN) { - // std::cerr << "ERROR: Unknown SV type" << std::endl; - // exit(1); - } - } - - // std::cout << "Chosen candidate: " << chosen_candidate_str << std::endl; - - // Continue if unknown SV type - if (chosen_type == SVType::UNKNOWN) { - // std::cerr << "ERROR: Unknown SV type" << std::endl; - continue; - } - - // If opposite strand, set the type to INV or INV_DUP - bool same_strand = std::get<7>(*it) == std::get<7>(primary_alignment); - if (!same_strand) { - if (chosen_type == SVType::NEUTRAL) { - chosen_type = SVType::INV; - } else if (chosen_type == SVType::DUP) { - chosen_type = SVType::INV_DUP; + } else { + // Add the inversion without running copy number predictions + // (too small for predictions) + addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "REV", "./.", 0.0); + sv_count++; } } - - if (chosen_lh_norm > best_split_aln_lh_norm) { - // best_supp_log_likelihood = supp_likelihood; - // best_supp_log_likelihood /= (double)(sv_end - sv_start); // - // Normalize the log likelihood by the length - // best_split_aln_lh = split_aln_lh; - best_split_aln_lh_norm = chosen_lh_norm; - // best_split_aln_length = split_aln_length; - best_supp_type = chosen_type; - best_supp_candidate = chosen_candidate; - best_split_alignment = *it; - best_scenario = split_scenario; - } else if (chosen_lh_norm <= best_split_aln_lh_norm) { - // std::cerr << "ERROR: split_aln_lh_norm is less than or equal to best_split_aln_lh_norm" << std::endl; - // exit(1); - } - } - - // If the likelihood is equal to the lowest value, print an error - if (best_split_aln_lh_norm == std::numeric_limits::lowest()) { - // std::cerr << "ERROR: best_supp_log_likelihood is 
the lowest value" << std::endl; - // exit(1); } - // Print the likelihoods - // std::cout << "Primary log likelihood: " << primary_lh << std::endl; - // std::cout << "Largest supplementary log likelihood: " << largest_supp_lh << std::endl; - // std::cout << "Closest supplementary log likelihood: " << closest_supp_lh << std::endl; - // // std::cout << "Best split alignment log likelihood: " << best_split_aln_lh << std::endl; - // std::cout << "Best split alignment log likelihood (normalized): " << best_split_aln_lh_norm << std::endl; - // std::cout << "Best scenario: " << best_scenario << std::endl; - - // Add the SV call with the highest likelihood prediction - // - // Determine the normalized log likelihood for the combined alignments - // by summing and normalizing the log likelihoods by the length - double complex_lh = 0.0; - double complex_lh_norm = 0.0; - if (largest_supp_alignment == closest_supp_alignment) { - int32_t complex_t = primary_lh_t + largest_supp_lh_t; - complex_lh = primary_lh + largest_supp_lh; - complex_lh_norm = complex_lh;// / complex_t; + // Trim overlapping alignments + int32_t supp_start = std::get<1>(largest_supp_alignment); + int32_t supp_end = std::get<2>(largest_supp_alignment); + bool primary_before_supp = primary_start < supp_start; + trimOverlappingAlignments(primary_alignment, largest_supp_alignment); + + // Create the SV candidate using both alignments + supp_start = std::get<1>(largest_supp_alignment); + supp_end = std::get<2>(largest_supp_alignment); + primary_start = std::get<1>(primary_alignment); + primary_end = std::get<2>(primary_alignment); + SVCandidate split_boundary; + SVCandidate split_gap; + bool gap_exists = false; + int32_t boundary_left, boundary_right, gap_left, gap_right; + if (primary_before_supp) { + boundary_left = primary_start+1; + boundary_right = supp_end+1; + gap_left = primary_end+1; + gap_right = supp_start+1; + gap_exists = primary_end < supp_start; } else { - int32_t complex_t = primary_lh_t + 
largest_supp_lh_t + closest_supp_lh_t; - complex_lh = primary_lh + largest_supp_lh + closest_supp_lh; - complex_lh_norm = complex_lh;// / complex_t; + boundary_left = supp_start+1; + boundary_right = primary_end+1; + gap_left = supp_end+1; + gap_right = primary_start+1; + gap_exists = supp_end < primary_start; } - // std::cout << "Complex log likelihood (normalized): " << complex_lh_norm << std::endl; - - // Compare the best split alignment likelihood to the complex likelihood - // if (best_supp_log_likelihood > primary_log_likelihood || best_supp_log_likelihood > largest_supp_log_likelihood || best_supp_log_likelihood > closest_supp_log_likelihood) { - if (best_split_aln_lh_norm > complex_lh_norm) { - int32_t sv_start = best_supp_candidate.first; - int32_t sv_end = best_supp_candidate.second; - - // Print an error and continue if the end is less than the start - if (sv_end < sv_start) { - std::cerr << "ERROR: SV end is less than the start: " << sv_start << " - " << sv_end << ", SV type: " << getSVTypeString(best_supp_type) << std::endl; - continue; - } - - // Resolve overlaps between the primary and supplementary query - // sequences for deletions (not usually an issue for other types) - if (best_supp_type == SVType::DEL) { - AlignmentData& best_supp_alignment = best_split_alignment; - int32_t supp_start = std::get<1>(best_supp_alignment); - int32_t supp_end = std::get<2>(best_supp_alignment); - int32_t supp_query_start = std::get<4>(best_supp_alignment); - int32_t supp_query_end = std::get<5>(best_supp_alignment); - std::unordered_map supp_match_map = std::get<6>(best_supp_alignment); - - // Resolve overlaps between the primary and supplementary query - // sequences - // int32_t overlap_start = std::max(primary_query_start, supp_query_start); - // int32_t overlap_end = std::min(primary_query_end, supp_query_end); - // int32_t overlap_length = overlap_end - overlap_start; - bool gap_present = primary_query_end < supp_query_start || supp_query_end < 
primary_query_start; - if (!gap_present) { - int32_t overlap_start = std::max(primary_query_start, supp_query_start); - int32_t overlap_end = std::min(primary_query_end, supp_query_end); - int32_t overlap_length = overlap_end - overlap_start; - - // Calculate the mismatch rate for each alignment at the overlap - double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, overlap_start, overlap_end); - double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, overlap_start, overlap_end); - // std::cout << "Primary mismatch rate: " << primary_mismatch_rate << std::endl; - // std::cout << "Supplementary mismatch rate: " << supp_mismatch_rate << std::endl; - - // Trim the overlap from the alignment with the higher mismatch - // rate - if (primary_mismatch_rate > supp_mismatch_rate) { - - // Handle each scenario - if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) { - // Primary is first, incorporate the overlap into - // the beginning of the deletion - sv_start -= overlap_length; - } else if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) { - // Primary is last, incorporate the overlap into - // the end of the deletion - sv_end += overlap_length; - } - } else { - - // Handle each scenario - if (best_scenario == SUPP_PRIM_BD || best_scenario == SUPP_PRIM_GAP) { - // Supplementary is first, incorporate the overlap into - // the beginning of the deletion - sv_start -= overlap_length; - } else if (best_scenario == PRIM_SUPP_BD || best_scenario == PRIM_SUPP_GAP) { - // Supplementary is last, incorporate the overlap into - // the end of the deletion - sv_end += overlap_length; - } - } - } - } - - // Add the best split alignment as the SV call - // sv_calls.add(primary_chr, sv_start, sv_end, best_supp_type, ".", - // "SPLITREAD", "./.", best_split_aln_lh_norm); - std::string sv_type_str = getSVTypeString(best_supp_type); - sv_count++; - } else { - // Resolve complex SVs - - // Simplest case: Largest 
supplementary is also the closest - if (largest_supp_alignment == closest_supp_alignment) { - // [primary] -- [supp_start] -- [supp_end] - // Determine if opposite strands - bool opposite_strands = std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment); - - // Determine if the supplementary alignment is an inversion - if (opposite_strands) { - if (largest_supp_type == SVType::NEUTRAL) { - largest_supp_type = SVType::INV; - } else if (largest_supp_type == SVType::DUP) { - largest_supp_type = SVType::INV_DUP; - } - } - - // Get the SV type strings - std::string primary_type_str = getSVTypeString(primary_type); - std::string supp_type_str = getSVTypeString(largest_supp_type); - - // Determine the order of the primary and supplementary - // alignment to resolve the SV - if (std::get<1>(largest_supp_alignment) < primary_start) { - // [supp_start] -- [supp_end] -- [primary] - std::string complex_sv_type_str = supp_type_str + "+" + primary_type_str; - - // Add the complex SV call - addSVCall(sv_calls, (uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm); - // sv_calls.insert(SVCall{(uint32_t)std::get<1>(largest_supp_alignment), (uint32_t)primary_end, "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm}); - // sv_calls.add(primary_chr, std::get<1>(largest_supp_alignment), primary_end, SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); - sv_count++; - } else { - // [primary] -- [supp_start] -- [supp_end] - std::string complex_sv_type_str = primary_type_str + "+" + supp_type_str; - - // Add the complex SV call - addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm); - // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm}); - // sv_calls.add(primary_chr, 
primary_start, std::get<2>(largest_supp_alignment), SVType::COMPLEX, ".", complex_sv_type_str, "./.", complex_lh_norm); - sv_count++; - } + + // Run copy number variant predictions on the boundary + split_boundary = SVCandidate(boundary_left, boundary_right, "."); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary); + double bd_lh = std::get<0>(bd_result); + SVType bd_type = std::get<1>(bd_result); + + // Run copy number variant predictions on the gap if it exists + if (gap_exists) { + split_gap = SVCandidate(gap_left, gap_right, "."); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap); + double gap_lh = std::get<0>(gap_result); + SVType gap_type = std::get<1>(gap_result); + + // If higher likelihood than the boundary, add the gap as the SV call + if (gap_lh > bd_lh) { + addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), "GAP", ".", "GAP", "./.", gap_lh); + sv_count++; } else { - // Resolve complex SVs with multiple supplementary alignments - // Determine the order of the primary and supplementary - // alignments - // [primary] -- [closest_supp] -- [largest_supp] - // [closest_supp] -- [primary] -- [largest_supp] - // [largest_supp] -- [closest_supp] -- [primary] - // [largest_supp] -- [primary] -- [closest_supp] - // Only consider case 1 for efficiency: - if (primary_end < std::get<1>(closest_supp_alignment) && std::get<2>(closest_supp_alignment) < std::get<1>(largest_supp_alignment)) { - // [primary] -- [closest_supp] -- [largest_supp] - // Determine if the closest supplementary alignment is an - // inversion - if (std::get<7>(closest_supp_alignment) != std::get<7>(primary_alignment)) { - if (closest_supp_type == SVType::NEUTRAL) { - closest_supp_type = SVType::INV; - } else if (closest_supp_type == SVType::DUP) { - closest_supp_type = SVType::INV_DUP; - } - } - - // Run copy number variant predictions on the region between - // the closest supplementary alignment and the largest 
- // supplementary alignment - SVCandidate sv_candidate(std::get<2>(closest_supp_alignment)+1, std::get<1>(largest_supp_alignment)+1, "."); - // std::cout << "TEST4" << std::endl; - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); - // double complex_log_likelihood = std::get<0>(result); - SVType complex_type = std::get<1>(result); - - // if (std::get<7>(largest_supp_alignment) != std::get<7>(primary_alignment)) { - // if (largest_supp_type == SVType::NEUTRAL) { - // largest_supp_type = SVType::INV; - // } else if (largest_supp_type == SVType::DUP) { - // largest_supp_type = SVType::INV_DUP; - // } - // } - - std::string primary_type_str = getSVTypeString(primary_type); - std::string closest_supp_type_str = getSVTypeString(closest_supp_type); - // std::string largest_supp_type_str = getSVTypeString(largest_supp_type); - // std::string complex_sv_type_str = primary_type_str + "+" + closest_supp_type_str; - - - // Combine the types if equal and not unknown/neutral - std::cout << "Resolving complex SVs..." 
<< std::endl; - std::string complex_sv_type_str = ""; - if (primary_type != SVType::UNKNOWN && primary_type != SVType::NEUTRAL) { - complex_sv_type_str += primary_type_str; - std::cout << "[1] Updated to type: " << complex_sv_type_str << std::endl; - } - if (closest_supp_type != primary_type && closest_supp_type != SVType::UNKNOWN && closest_supp_type != SVType::NEUTRAL) { - if (complex_sv_type_str != "") { - complex_sv_type_str += "+"; - } - complex_sv_type_str += closest_supp_type_str; - std::cout << "[2] Updated to type: " << complex_sv_type_str << std::endl; - } - if (complex_type != closest_supp_type && complex_type != primary_type && complex_type != SVType::UNKNOWN && complex_type != SVType::NEUTRAL) { - if (complex_sv_type_str != "") { - complex_sv_type_str += "+"; - } - complex_sv_type_str += getSVTypeString(complex_type); - std::cout << "[3] Updated to type: " << complex_sv_type_str << std::endl; - } - - // Add the complex SV call if not empty - if (complex_sv_type_str != "") { - std::cout << "Found complex SV type: " << complex_sv_type_str << std::endl; - // sv_calls.add(primary_chr, primary_start, - // std::get<2>(largest_supp_alignment), SVType::COMPLEX, - // ".", complex_sv_type_str, "./.", complex_lh_norm); - // sv_calls.insert(SVCall{(uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm}); - addSVCall(sv_calls, (uint32_t)primary_start, (uint32_t)std::get<2>(largest_supp_alignment), "COMPLEX", ".", complex_sv_type_str, "./.", complex_lh_norm); - sv_count++; - } - } + // Add the boundary as the SV call + addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh); + sv_count++; } + } else { + // Add the boundary as the SV call + addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh); + sv_count++; } } @@ -1276,3 +870,60 @@ void SVCaller::saveToVCF(const 
std::unordered_map std::cout << "Finished writing VCF file. Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl; } +void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment) +{ + // Get the start and end read positions for the primary and supplementary + // alignments + int32_t primary_query_start = std::get<4>(primary_alignment); + int32_t primary_query_end = std::get<5>(primary_alignment); + int32_t supp_query_start = std::get<4>(supp_alignment); + int32_t supp_query_end = std::get<5>(supp_alignment); + std::unordered_map& primary_match_map = std::get<6>(primary_alignment); + std::unordered_map& supp_match_map = std::get<6>(supp_alignment); + int32_t primary_alignment_start = std::get<1>(primary_alignment); + int32_t primary_alignment_end = std::get<2>(primary_alignment); + int32_t supp_alignment_start = std::get<1>(supp_alignment); + int32_t supp_alignment_end = std::get<2>(supp_alignment); + + // Check if the alignments overlap + bool primary_before_supp = primary_query_start < supp_query_start; + if (primary_before_supp) { + // Primary before supplementary in the query + if (primary_query_end >= supp_query_start) { + // Calculate the mismatch rates at the overlapping region + double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, supp_query_start, primary_query_end); + double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, supp_query_start, primary_query_end); + int32_t overlap_length = primary_query_end - supp_query_start + 1; + + // Trim the ailgnment with the higher mismatch rate + if (primary_mismatch_rate > supp_mismatch_rate) { + // Trim the end of the primary alignment + std::get<2>(primary_alignment) = primary_alignment_end - overlap_length; + std::cout << "Trimming primary alignment" << std::endl; + } else { + // Trim the beginning of the supplementary alignment + std::get<1>(supp_alignment) = supp_alignment_start 
+ overlap_length; + std::cout << "Trimming supplementary alignment" << std::endl; + } + } + } else { + // Supplementary before primary in the query + if (supp_query_end >= primary_query_start) { + // Calculate the mismatch rates at the overlapping region + double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end); + double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, primary_query_start, supp_query_end); + int32_t overlap_length = supp_query_end - primary_query_start + 1; + + // Trim the ailgnment with the higher mismatch rate + if (supp_mismatch_rate > primary_mismatch_rate) { + // Trim the end of the supplementary alignment + std::get<2>(supp_alignment) = supp_alignment_end - overlap_length; + std::cout << "Trimming supplementary alignment" << std::endl; + } else { + // Trim the beginning of the primary alignment + std::get<1>(primary_alignment) = primary_alignment_start + overlap_length; + std::cout << "Trimming primary alignment" << std::endl; + } + } + } +} From 942ce746af3a040f6ca984ed6a0d68b237f5b0ba Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 22:58:36 -0500 Subject: [PATCH 027/134] Reduce debug outputs --- src/sv_caller.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 72d61b14..e1ae71d6 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -899,11 +899,9 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align if (primary_mismatch_rate > supp_mismatch_rate) { // Trim the end of the primary alignment std::get<2>(primary_alignment) = primary_alignment_end - overlap_length; - std::cout << "Trimming primary alignment" << std::endl; } else { // Trim the beginning of the supplementary alignment std::get<1>(supp_alignment) = supp_alignment_start + overlap_length; - std::cout << "Trimming supplementary alignment" << std::endl; } } } else { @@ -918,11 +916,9 @@ void 
SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align if (supp_mismatch_rate > primary_mismatch_rate) { // Trim the end of the supplementary alignment std::get<2>(supp_alignment) = supp_alignment_end - overlap_length; - std::cout << "Trimming supplementary alignment" << std::endl; } else { // Trim the beginning of the primary alignment std::get<1>(primary_alignment) = primary_alignment_start + overlap_length; - std::cout << "Trimming primary alignment" << std::endl; } } } From f7a17e263b18550827c42ec7f769b8a8a4b333a9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 25 Nov 2024 23:21:39 -0500 Subject: [PATCH 028/134] Update test --- tests/test_general.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_general.py b/tests/test_general.py index 689dda55..ac7d5d8d 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -64,7 +64,7 @@ def test_run(): # Check that the VCF file has the correct number of lines. with open(output_file, 'r', encoding='utf-8') as f: - assert len(f.readlines()) == 30 + assert len(f.readlines()) == 22 # Check that the VCF file has the correct header, and the correct # VCF CHROM, POS, and INFO fields in the next 2 lines. 
@@ -82,7 +82,7 @@ def test_run(): elif i == header_line + 2: fields = line.strip().split('\t') assert fields[0] == "21" - assert fields[1] == "14469910" - assert fields[7] == "END=14470078;SVTYPE=DEL;SVLEN=-168;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=CIGARDEL;CLIPSUP=0;REPTYPE=NA;HMM=0.000000" + assert fields[1] == "14502888" + assert fields[7] == "END=14502953;SVTYPE=BOUNDARY;SVLEN=65;SUPPORT=1;SVMETHOD=CONTEXTSVv0.1;ALN=BOUNDARY;CLIPSUP=0;REPTYPE=NA;HMM=-4.606171" break \ No newline at end of file From bb4d2c4c905e1a201ef8bda077adb317493bfa7b Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 26 Nov 2024 13:24:08 -0500 Subject: [PATCH 029/134] fix warnings --- src/cnv_caller.cpp | 4 ---- src/sv_caller.cpp | 50 ++++++++++++++++++++++++---------------------- src/sv_object.cpp | 13 +++++++----- 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 0c6c432a..3bc3f39e 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -760,10 +760,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } // std::cout << "Iterating through SNPs in region " << region_str << "..." 
<< std::endl; - int print_count = 0; int record_count = 0; - int duplicate_count = 0; - uint32_t last_pos = 0; while (bcf_sr_next_line(snp_reader) > 0) { if (!bcf_sr_has_line(snp_reader, 0)) @@ -1344,7 +1341,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos throw std::runtime_error("ERROR: Could not get header for population frequency file: " + pfb_filepath); } - int test_count = 0; int record_count = 0; while (bcf_sr_next_line(pfb_reader) > 0) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index e1ae71d6..3761325d 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -548,8 +548,8 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p std::string primary_chr = std::get<0>(primary_alignment); int32_t primary_start = std::get<1>(primary_alignment); int32_t primary_end = std::get<2>(primary_alignment); - int32_t primary_query_start = std::get<4>(primary_alignment); - int32_t primary_query_end = std::get<5>(primary_alignment); + // int32_t primary_query_start = std::get<4>(primary_alignment); + // int32_t primary_query_end = std::get<5>(primary_alignment); std::unordered_map primary_match_map = std::get<6>(primary_alignment); // bool primary_strand = std::get<7>(primary_alignment); @@ -627,32 +627,34 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p gap_exists = supp_end < primary_start; } - // Run copy number variant predictions on the boundary - split_boundary = SVCandidate(boundary_left, boundary_right, "."); - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary); - double bd_lh = std::get<0>(bd_result); - SVType bd_type = std::get<1>(bd_result); - - // Run copy number variant predictions on the gap if it exists - if (gap_exists) { - split_gap = SVCandidate(gap_left, gap_right, "."); - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap); - double gap_lh = std::get<0>(gap_result); - SVType gap_type = 
std::get<1>(gap_result); - - // If higher likelihood than the boundary, add the gap as the SV call - if (gap_lh > bd_lh) { - addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), "GAP", ".", "GAP", "./.", gap_lh); - sv_count++; + // Run copy number variant predictions on the boundary if large enough + if (boundary_right - boundary_left >= min_cnv_length) { + split_boundary = SVCandidate(boundary_left, boundary_right, "."); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary); + double bd_lh = std::get<0>(bd_result); + SVType bd_type = std::get<1>(bd_result); + + // Run copy number variant predictions on the gap if it exists + if (gap_exists && gap_right - gap_left >= min_cnv_length) { + split_gap = SVCandidate(gap_left, gap_right, "."); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap); + double gap_lh = std::get<0>(gap_result); + SVType gap_type = std::get<1>(gap_result); + + // If higher likelihood than the boundary, add the gap as the SV call + if (gap_lh > bd_lh) { + addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh); + sv_count++; + } else { + // Add the boundary as the SV call + addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); + sv_count++; + } } else { // Add the boundary as the SV call - addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh); + addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); sv_count++; } - } else { - // Add the boundary as the SV call - addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), "BOUNDARY", ".", "BOUNDARY", "./.", bd_lh); - sv_count++; } } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 327f3fd2..c28d6b21 100644 --- 
a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -15,9 +15,9 @@ bool SVCall::operator<(const SVCall & other) const void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) { - // Throw an error if unknown SV type + // Ignore unknown SV types if (sv_type == "UNKNOWN") { - throw std::runtime_error("ERROR: Cannot add unknown SV type"); + return; } if (start >= end) { @@ -26,10 +26,13 @@ void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::st // If the SV call already exists (start and end position), then update all information if the // likelihood is higher - // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; - SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}; + // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << + // sv_type << " " << alt_allele << " " << data_type << " " << genotype << " + // " << hmm_likelihood << std::endl; + sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); + // SVCall new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}; - sv_calls.insert(new_sv_call); + // sv_calls.insert(new_sv_call); /* bool exists = false; From f241af5c3f7368b9648f542735ba816ddc37525c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 26 Nov 2024 13:38:20 -0500 Subject: [PATCH 030/134] Read hmm once only --- include/cnv_caller.h | 6 ++-- include/sv_caller.h | 2 +- src/cnv_caller.cpp | 66 ++++---------------------------------------- src/sv_caller.cpp | 18 ++++++++---- 4 files changed, 22 insertions(+), 70 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index e8d3d3b8..9f16c5f6 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -52,7 
+52,7 @@ class CNVCaller { mutable std::mutex sv_candidates_mtx; // SV candidate map mutex mutable std::mutex snp_data_mtx; // SNP data mutex mutable std::mutex hmm_mtx; // HMM mutex - CHMM hmm; + // CHMM hmm; SNPData snp_data; SNPInfo snp_info; double mean_chr_cov = 0.0; @@ -116,11 +116,11 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate); + std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, CHMM hmm); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); - void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length); + void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length, CHMM hmm); // Calculate the mean chromosome coverage double calculateMeanChromosomeCoverage(std::string chr); diff --git a/include/sv_caller.h b/include/sv_caller.h index cc9f2630..dee58dea 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -47,7 +47,7 @@ class SVCaller { int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller); + void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 3bc3f39e..c6525c1d 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -148,7 +148,7 @@ std::pair 
CNVCaller::querySNPRegion(std::string chr, uint32_t sta return std::make_pair(snp_data, snps_found); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, CHMM hmm) { // Get the start and end positions of the SV call uint32_t start_pos = std::get<0>(candidate); @@ -157,28 +157,16 @@ std::tuple CNVCaller::runCopyNumberPrediction // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 // the SV length uint32_t sv_half_length = (end_pos - start_pos) / 2.0; - // uint32_t snp_start_pos = std::max((uint32_t)1, start_pos - sv_length); - // Prevent underflow (start_pos - sv_length) if start_pos < sv_length uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; uint32_t snp_end_pos = end_pos + sv_half_length; - // std::cout << "CNP for " << chr << ":" << start_pos << "-" << end_pos << "(" << snp_start_pos << ", " << snp_end_pos << ")" << std::endl; - // printMessage("Running copy number prediction for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " with SNP region " + chr + ":" + std::to_string(snp_start_pos) + "-" + std::to_string(snp_end_pos) + "..."); // Query the SNP region for the SV candidate std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov); SNPData& sv_snps = snp_call.first; bool sv_snps_found = snp_call.second; - /* - if (sv_snps.pos.size() == 0) { - std::cerr << "ERROR [2]: No windows for SV " << chr << ":" << std::to_string((int)start_pos) << "-" << std::to_string((int)end_pos) << " (" << snp_start_pos << "," << snp_end_pos << std::endl; - continue; - } - */ - // Run the Viterbi algorithm - // printMessage("[TEST] Running Viterbi algorithm for SV candidate " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + "..."); - std::pair, double> prediction = 
runViterbi(this->hmm, sv_snps); + std::pair, double> prediction = runViterbi(hmm, sv_snps); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -235,54 +223,12 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length, CHMM hmm) { - CHMM& hmm = this->hmm; int window_size = this->input_data.getWindowSize(); double mean_chr_cov = this->mean_chr_cov; printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); - - // Create a map with counts for each CNV type - // std::map cnv_type_counts; - // for (int i = 0; i < 6; i++) - // { - // cnv_type_counts[i] = 0; - // } - runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov); - // // Split the SV candidates into chunks for each thread - // int chunk_count = this->input_data.getThreadCount(); - // // std::vector> sv_chunks = splitSVCandidatesIntoChunks(sv_candidates, chunk_count); - // std::vector> sv_chunks = splitSVsIntoChunks(sv_candidates, chunk_count); - - // // Loop through each SV chunk and run the copy number prediction in parallel - // // std::vector> futures; - // std::vector> futures; - // for (auto& sv_chunk : sv_chunks) - // { - // // Run the copy number prediction for the SV chunk - // futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map))); - // // futures.emplace_back(std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, chr, std::ref(sv_chunk), std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map))); - // // std::async(std::launch::async, &CNVCaller::runCIGARCopyNumberPredictionChunk, this, 
chr, sv_chunk, std::ref(this->snp_info), hmm, window_size, mean_chr_cov, std::ref(this->pos_depth_map)); - // } - - // // Wait for all the futures to finish - // int current_chunk = 0; - // for (auto& future : futures) - // { - // current_chunk++; - // try { - // future.wait(); - // // SNPData chunk_snp_data = std::move(future.get()); - // if (this->input_data.getVerbose()) - // { - // printMessage("Finished processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + "..."); - // } - // } catch (const std::exception& e) { - // printError("Error processing SV chunk " + std::to_string(current_chunk) + " of " + std::to_string(chunk_count) + ": " + e.what()); - // } - // } - printMessage("Finished predicting copy number states for chromosome " + chr + "..."); } @@ -525,9 +471,9 @@ std::vector> CNVCaller::splitSVCandidatesIntoChunks(std void CNVCaller::loadChromosomeData(std::string chr) { - std::string hmm_filepath = this->input_data.getHMMFilepath(); - std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; - this->hmm = ReadCHMM(hmm_filepath.c_str()); + // std::string hmm_filepath = this->input_data.getHMMFilepath(); + // std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; + // this->hmm = ReadCHMM(hmm_filepath.c_str()); printMessage("Calculating mean chromosome coverage for " + chr + "..."); this->mean_chr_cov = calculateMeanChromosomeCoverage(chr); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3761325d..b40ecaed 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -383,6 +383,12 @@ std::unordered_map> SVCaller::run() throw std::runtime_error("ERROR: failed to open " + bam_filepath); } + // Read the HMM from the file + std::string hmm_filepath = this->input_data.getHMMFilepath(); + std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; + CHMM hmm = ReadCHMM(hmm_filepath.c_str()); + // this->hmm = ReadCHMM(hmm_filepath.c_str()); + // Enable multi-threading int num_threads = 
this->input_data.getThreadCount(); if (num_threads > 1) { @@ -485,12 +491,12 @@ std::unordered_map> SVCaller::run() std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, // min_cnv_length); - cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length); + cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm); } // Run split-read SV and copy number variant predictions std::cout << "Detecting copy number variants from split reads..." << std::endl; - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller); + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm); // sv_calls.concatenate(subregion_sv_calls); // Add the calls to the // main set // sv_calls.emplace_back(subregion_sv_calls); @@ -537,7 +543,7 @@ std::unordered_map> SVCaller::run() // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller) +void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm) { // Find split-read SV evidence int sv_count = 0; @@ -579,7 +585,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p if (is_opposite_strand) { if (supp_length >= min_cnv_length) { SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate, hmm); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); if (supp_type == SVType::NEUTRAL) { @@ -630,14 +636,14 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p // Run copy number variant predictions on the boundary if large enough if 
(boundary_right - boundary_left >= min_cnv_length) { split_boundary = SVCandidate(boundary_left, boundary_right, "."); - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary, hmm); double bd_lh = std::get<0>(bd_result); SVType bd_type = std::get<1>(bd_result); // Run copy number variant predictions on the gap if it exists if (gap_exists && gap_right - gap_left >= min_cnv_length) { split_gap = SVCandidate(gap_left, gap_right, "."); - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap, hmm); double gap_lh = std::get<0>(gap_result); SVType gap_type = std::get<1>(gap_result); From 39602fe09d3ab10cad02c2667aafe616f7e93087 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 26 Nov 2024 13:41:14 -0500 Subject: [PATCH 031/134] remove some comments --- src/sv_caller.cpp | 41 ----------------------------------------- 1 file changed, 41 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b40ecaed..7335148e 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -370,12 +370,6 @@ std::unordered_map> SVCaller::run() chromosomes = this->input_data.getRefGenomeChromosomes(); } - // [TEST] Only process the last N chromosomes - // int last_n = 3; - // chromosomes = std::vector(chromosomes.end()-last_n, chromosomes.end()); - // std::cout << "[DEBUG] Running last " << last_n << " chromosomes" << std::endl; - // //chromosomes = std::vector(chromosomes.end()-3, chromosomes.end()); - // Open the BAM file std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); @@ -387,7 +381,6 @@ std::unordered_map> SVCaller::run() std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; CHMM hmm = 
ReadCHMM(hmm_filepath.c_str()); - // this->hmm = ReadCHMM(hmm_filepath.c_str()); // Enable multi-threading int num_threads = this->input_data.getThreadCount(); @@ -416,9 +409,6 @@ std::unordered_map> SVCaller::run() int current_chr = 0; std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl; int chunk_count = 100; // Number of chunks to split the chromosome into - // SVData sv_calls; - // std::vector> sv_calls; - // std::unordered_map> sv_calls; uint32_t total_sv_count = 0; std::unordered_map> whole_genome_sv_calls; int min_cnv_length = this->input_data.getMinCNVLength(); @@ -463,43 +453,25 @@ std::unordered_map> SVCaller::run() int current_region = 0; std::set combined_sv_calls; for (const auto& sub_region : region_chunks) { - // std::cout << "Detecting CIGAR string SVs from " << sub_region << "..." << std::endl; - // std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(sub_region); std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region); std::set& subregion_sv_calls = std::get<0>(region_data); PrimaryMap& primary_map = std::get<1>(region_data); SuppMap& supp_map = std::get<2>(region_data); std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; mergeSVs(subregion_sv_calls); - // SVData& subregion_sv_calls = std::get<0>(region_data); - // PrimaryMap& primary_map = std::get<1>(region_data); - // SuppMap& supp_map = std::get<2>(region_data); - // int region_sv_count = subregion_sv_calls.totalCalls(); - // if (region_sv_count > 0) { - // std::cout << "Detected " << region_sv_count << " CIGAR SVs from " << sub_region << "..." 
<< std::endl; - // } - // int region_sv_count = subregion_sv_calls.count(); int region_sv_count = getSVCount(subregion_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - // std::cout << "Detecting copy number variants from CIGAR string SVs..." << std::endl; - // std::map& cigar_svs = subregion_sv_calls.getChromosomeSVs(chr); - // if (cigar_svs.size() > 0) { if (region_sv_count > 0) { std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; - // cnv_caller.runCIGARCopyNumberPrediction(chr, cigar_svs, - // min_cnv_length); cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm); } // Run split-read SV and copy number variant predictions std::cout << "Detecting copy number variants from split reads..." << std::endl; this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm); - // sv_calls.concatenate(subregion_sv_calls); // Add the calls to the - // main set - // sv_calls.emplace_back(subregion_sv_calls); // Merge the SV calls from the current region std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; @@ -509,10 +481,6 @@ std::unordered_map> SVCaller::run() std::cout << "Combining SV calls from " << sub_region << "..." << std::endl; concatenateSVCalls(combined_sv_calls, subregion_sv_calls); std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl; - - // [TEST] Break after the first region - // std::cout << "[DEBUG] Breaking after the first region" << std::endl; - // break; } std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." 
<< std::endl; @@ -521,7 +489,6 @@ std::unordered_map> SVCaller::run() std::cout << "Total SVs detected for chromosome " << chr << ": " << chr_sv_count << std::endl; total_sv_count += chr_sv_count; std::cout << "Cumulative total SVs: " << total_sv_count << std::endl; - // std::cout << "Completed " << region_count << " of " << chr_count << " chromosome(s)" << std::endl; } // Clean up the BAM file, header, and index @@ -529,11 +496,6 @@ std::unordered_map> SVCaller::run() bam_hdr_destroy(bamHdr); sam_close(fp_in); - // SVData sv_calls_combined; - // for (const auto& subregion_sv_calls : sv_calls) { - // sv_calls_combined.concatenate(subregion_sv_calls); - // } - // Save to VCF std::cout << "Saving SVs to VCF..." << std::endl; this->saveToVCF(whole_genome_sv_calls); @@ -554,10 +516,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p std::string primary_chr = std::get<0>(primary_alignment); int32_t primary_start = std::get<1>(primary_alignment); int32_t primary_end = std::get<2>(primary_alignment); - // int32_t primary_query_start = std::get<4>(primary_alignment); - // int32_t primary_query_end = std::get<5>(primary_alignment); std::unordered_map primary_match_map = std::get<6>(primary_alignment); - // bool primary_strand = std::get<7>(primary_alignment); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { From 1884f675357e119d30b9eb3d21e1be1baa026505 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 26 Nov 2024 17:32:54 -0500 Subject: [PATCH 032/134] Fix multithreading --- include/sv_caller.h | 2 + setup.py | 2 +- src/cnv_caller.cpp | 713 +------------------------------------------- src/sv_caller.cpp | 352 +++++++++++----------- 4 files changed, 199 insertions(+), 870 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index dee58dea..ec89aa8a 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -38,6 +38,8 @@ class SVCaller { // mismatch rate, and the 
start and end positions of the query sequence std::tuple, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, bool is_primary); + void processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set& combined_sv_calls, int min_cnv_length); + // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); diff --git a/setup.py b/setup.py index 5b6d9ef2..ce0c428f 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ name="_" + NAME, sources=SRC_FILES, include_dirs=[INCLUDE_DIR, conda_include_dir], - extra_compile_args=["-std=c++11"], + extra_compile_args=["-std=c++14"], language="c++", libraries=["hts"], library_dirs=[conda_lib_dir] diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c6525c1d..fd7a6c7c 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -74,7 +74,6 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta // window, then calculate the log2 ratio for each window for (uint32_t i = start_pos; i <= end_pos; i += window_size) { - // std::cout << "Querying SNP region for " << chr << ":" << i << "-" << std::min(i + window_size - 1, end_pos) << std::endl; // Run a sliding non-overlapping window of size window_size across // the SV region and calculate the log2 ratio for each window uint32_t window_start = i; @@ -131,13 +130,10 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta { bin_end = std::min(bin_end, (snp_window_pos[j] + snp_window_pos[j+1]) / 2); } - // std::cout << "bin_start: " << bin_start << std::endl; - // std::cout << "bin_end: " << bin_end << std::endl; // Calculate the log2 ratio for the SNP bin double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov); this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); - // this->updateSNPData(snp_data, 
snp_pos, snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); // Update the previous bin start bin_start = bin_end + 1; @@ -215,7 +211,7 @@ std::tuple CNVCaller::runCopyNumberPrediction { std::string cnv_type_str = getSVTypeString(predicted_cnv_type); std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; - std::cout << "Saving SV split-alignment copy number predictions to " << sv_filename << std::endl; + printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "..."); this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); } @@ -247,9 +243,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set(candidate); - // int64_t end_pos = std::get<1>(candidate); uint32_t start_pos = sv_call.start; uint32_t end_pos = sv_call.end; @@ -266,8 +259,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set sv_half_length ? 
start_pos - sv_half_length : 1; uint32_t snp_end_pos = end_pos + sv_half_length; - // std::cout << "CIGAR sv_half_length:" << sv_half_length << std::endl; - // std::cout << "CIGAR SV query at " << chr << ":" << query_start << "-" << query_end << std::endl; - - // printMessage("Querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", qstart = " + std::to_string(query_start) + ", qend = " + std::to_string(query_end)); std::pair snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, snp_info, this->pos_depth_map, mean_chr_cov); - // printMessage("Finished querying SNPs for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); SNPData& sv_snps = snp_call.first; bool snps_found = snp_call.second; - // Run the Viterbi algorithm - // printMessage("[TEST2] Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); - + // Run the Viterbi algorithm if (sv_snps.pos.size() == 0) { std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl; continue; @@ -353,13 +337,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::setupdateSVCopyNumber(sv_candidates, sv_call, cnv_type, data_type, genotype, likelihood); // Save the SV calls as a TSV file if enabled, if the SV type is // known, and the length is greater than 10 kb @@ -372,8 +350,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::setinput_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv"; - // std::cout << "Saving SV CIGAR copy number predictions to " << - // sv_filename << std::endl; printMessage("Saving SV CIGAR copy number predictions to " + sv_filename); this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, 
cnv_type_str, likelihood); } @@ -471,22 +447,9 @@ std::vector> CNVCaller::splitSVCandidatesIntoChunks(std void CNVCaller::loadChromosomeData(std::string chr) { - // std::string hmm_filepath = this->input_data.getHMMFilepath(); - // std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; - // this->hmm = ReadCHMM(hmm_filepath.c_str()); - printMessage("Calculating mean chromosome coverage for " + chr + "..."); this->mean_chr_cov = calculateMeanChromosomeCoverage(chr); - //this->mean_chr_cov = 30.0; printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); - - // std::cout << "Reading SNP allele frequencies for chromosome " << chr << " from VCF file..." << std::endl; - // std::string snp_filepath = this->input_data.getSNPFilepath(); - // readSNPAlleleFrequencies(chr, snp_filepath, this->snp_info); - - // std::cout << "Obtaining SNP population frequencies for chromosome " << chr << "..." << std::endl; - // getSNPPopulationFrequencies(chr, this->snp_info); - // std::cout << "Finished loading chromosome data for " << chr << std::endl; } // Calculate the mean chromosome coverage @@ -501,7 +464,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) } // Enable multi-threading - hts_set_threads(bam_file, this->input_data.getThreadCount()); + // hts_set_threads(bam_file, this->input_data.getThreadCount()); // Read the header bam_hdr_t *bam_header = sam_hdr_read(bam_file); @@ -595,13 +558,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) uint32_t pos_count = 0; for (auto& pos_depth : chr_pos_depth_map) { - // if (pos_depth.second > 0) - // { - // cum_depth += pos_depth.second; - // pos_count++; - // } else { - // std::cout << "Zero depth at position " << pos_depth.first << std::endl; - // } cum_depth += pos_depth.second; pos_count++; } @@ -684,8 +640,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } // Set multi-threading - int thread_count = 
this->input_data.getThreadCount(); - bcf_sr_set_threads(snp_reader, thread_count); + // int thread_count = this->input_data.getThreadCount(); + // bcf_sr_set_threads(snp_reader, thread_count); // Enable index usage snp_reader->require_index = 1; @@ -719,20 +675,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui record_count++; uint32_t pos = (uint32_t)snp_record->pos + 1; - // Skip if 3 or more duplicate positions found - // if (pos == last_pos) - // { - // duplicate_count++; - // if (duplicate_count >= 10) - // { - // std::cerr << "ERROR: 3 or more duplicate positions found in SNP file at " << chr << ":" << pos << std::endl; - // break; - // } - // } else { - // duplicate_count = 0; - // } - // last_pos = pos; - // Skip if not a SNP if (!bcf_is_snp(snp_record)) { @@ -814,409 +756,20 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Insert the SNP position and BAF into the maps snp_pos.insert(pos); snp_baf[pos] = baf; - - // Print the SNP position and BAF - // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl; - // print_count++; - // if (print_count < 10) - // { - // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << "(REF=" << ad0 << ",ALT=" << ad1 << ")" << std::endl; - // print_count++; - // } } } // Clean up bcf_sr_destroy(snp_reader); - - // std::cout << "Opening SNP file: " << snp_filepath << std::endl; - // htsFile *snp_file = bcf_open(snp_filepath.c_str(), "r"); - // if (!snp_file) - // { - // throw std::runtime_error("ERROR: Could not open SNP file: " + snp_filepath); - // } - - // // Enable multi-threading - // hts_set_threads(snp_file, thread_count); - - // // Read the header - // bcf_hdr_t *snp_header = bcf_hdr_read(snp_file); - // if (!snp_header) - // { - // bcf_close(snp_file); - // throw std::runtime_error("ERROR: Could not read header from SNP file: " + snp_filepath); - // } - - // // Load the 
index - // hts_idx_t *snp_index = bcf_index_load(snp_filepath.c_str()); - // if (!snp_index) - // { - // bcf_hdr_destroy(snp_header); - // bcf_close(snp_file); - // throw std::runtime_error("ERROR: Could not load index for SNP file: " + snp_filepath); - // } - - // // Construct the region string - // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - // hts_itr_t *snp_iter = bcf_itr_querys(snp_index, snp_header, region_str.c_str()); - // if (!snp_iter) - // { - // hts_idx_destroy(snp_index); - // bcf_hdr_destroy(snp_header); - // bcf_close(snp_file); - // throw std::runtime_error("ERROR: Could not create iterator for SNP region: " + region_str); - // } - - // // Set up the record - // bcf1_t *snp_record = bcf_init(); - // if (!snp_record) - // { - // bcf_hdr_destroy(snp_header); - // bcf_close(snp_file); - // throw std::runtime_error("ERROR: Could not initialize SNP record."); - // } - - // // Read the SNPs in the chromosome region - // int print_count = 0; - // while (bcf_itr_next(snp_file, snp_iter, snp_record) >= 0) - // { - // // Get the position and B-allele frequency (BAF) from the SNP record - // uint32_t pos = snp_record->pos + 1; // 0-based to 1-based - - // // Get QUAL, DP, and AD values - // float qual = snp_record->qual; - // if (bcf_float_is_missing(qual)) - // { - // std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl; - // } - // // Skip if quality is less than 30 - // if (qual <= 30) - // { - // continue; - // } - - // // Get FILTER status - // int pass_id = bcf_hdr_id2int(snp_header, BCF_DT_ID, "PASS"); - // if (pass_id == -1) - // { - // std::cerr << "ERROR: Could not get PASS ID for SNP at " << chr << ":" << pos << std::endl; - // } - // std::string pass_filter = "PASS"; - // if (bcf_has_filter(snp_header, snp_record, const_cast(pass_filter.c_str())) != 1) - // { - // // Skip if the SNP does not pass the filter - // continue; - // } - - // // Extract DP from INFO 
field - // int32_t dp = 0; - // int dp_count = bcf_get_info_int32(snp_header, snp_record, "DP", &dp, &dp_count); - // if (dp_count != 1) - // { - // std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl; - // } - // // Skip if depth is not greater than 10 - // if (dp <= 10) - // { - // continue; - // } - - // // Skip if not a SNP - // if (!bcf_is_snp(snp_record)) - // { - // continue; - // } - - // // Extract AD from FORMAT field - // int32_t ad[2] = {0, 0}; - // int ad_count = 0; - // int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); - // // if (ad_count != 2) - // // { - // // std::cerr << "ERROR: Could not get AD value for SNP at " << chr << ":" << pos << std::endl; - // // } - - // // Calculate the BAF - // if (ad_ret > 0 && ad_count > 0) - // { - // double baf = (double) ad[1] / (double) (ad[0] + ad[1]); - // snp_pos.insert(pos); - // snp_baf[pos] = baf; - - // // Print the SNP position and BAF - // if (print_count < 10) - // { - // std::cout << "SNP: " << chr << ":" << pos << ", BAF: " << baf << std::endl; - // print_count++; - // } - // } - // } - - // // Clean up - // bcf_destroy(snp_record); - // hts_itr_destroy(snp_iter); - // hts_idx_destroy(snp_index); - // bcf_hdr_destroy(snp_header); - // bcf_close(snp_file); - - // // Check that the SNP file is sorted by running bcftools index and reading - // // the error output - // std::string index_cmd = "bcftools index " + filepath + " 2>&1 | grep -i error"; - // if (this->input_data.getVerbose()) { - // std::cout << "Command: " << index_cmd << std::endl; - // } - - // // Open a pipe to read the output of the command - // FILE *index_fp = popen(index_cmd.c_str(), "r"); - // if (index_fp == NULL) - // { - // std::cerr << "ERROR: Could not open pipe for command: " << index_cmd << std::endl; - // exit(1); - // } - - // // Read the output of the command - // const int error_size = 256; - // char index_error[error_size]; - // while 
(fgets(index_error, error_size, index_fp) != NULL) - // { - // std::cerr << "ERROR: " << index_error << std::endl; - // exit(1); - // } - // pclose(index_fp); // Close the process - - // // Filter variants by depth, quality, and region - // if (this->input_data.getVerbose()) { - // std::cout << "Filtering SNPs by depth, quality, and region..." << std::endl; - // } - - // // Check if a region was specified by the user - // std::string region_str = chr; - // if (this->input_data.isRegionSet()) - // { - // std::pair region = this->input_data.getRegion(); - // region_str = chr + ":" + std::to_string(region.first) + "-" + std::to_string(region.second); - // } - - // std::string filtered_snp_vcf_filepath = this->input_data.getOutputDir() + "/filtered_snps.vcf"; - // int thread_count = this->input_data.getThreadCount(); - // // std::string cmd = "bcftools view -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; - // std::string cmd = "bcftools view --threads " + std::to_string(thread_count) + " -r " + region_str + " -v snps -i 'QUAL > 30 && DP > 10 && FILTER = \"PASS\"' " + filepath + " > " + filtered_snp_vcf_filepath; - // if (this->input_data.getVerbose()) { - // std::cout << "Filtering SNPs by depth and quality..." << std::endl; - // std::cout << "Command: " << cmd << std::endl; - // } - // system(cmd.c_str()); - - // if (this->input_data.getVerbose()) { - // std::cout << "Filtered SNPs written to " << filtered_snp_vcf_filepath << std::endl; - // } - - // // Extract B-allele frequency data from the VCF file and sort by chromosome - // // and position - // if (this->input_data.getVerbose()) { - // std::cout << "Extracting B-allele frequency data from filtered SNPs..." 
<< std::endl; - // } - // cmd = "bcftools query -f '%POS,[%AD]\n' " + filtered_snp_vcf_filepath + " 2>/dev/null"; - // FILE *fp = popen(cmd.c_str(), "r"); - // if (fp == NULL) - // { - // std::cerr << "ERROR: Could not open pipe for command: " << cmd << std::endl; - // exit(1); - // } - - // // Read the reference and alternate allele depths from the VCF file - // std::string alt_allele = ""; // Alternate allele - // uint32_t pos = 0; - // int ref_ad = 0; - // int alt_ad = 0; - // const int line_size = 1024; - // char line[line_size]; // Line buffer - // std::vector locations; - // std::vector bafs; - // std::string chr_no_prefix = removeChrPrefix(chr); - // while (fgets(line, line_size, fp) != NULL) - // { - // // Parse the line - // char *tok = strtok(line, ","); // Tokenize the line - // int col = 0; // Column index - // while (tok != NULL) - // { - // // Get the position from column 2 - // if (col == 0) - // { - // pos = (uint32_t)atoi(tok); - // } - - // // Get the AD for the reference allele from column 3 - // else if (col == 1) - // { - // ref_ad = atoi(tok); - // } - - // // Get the AD for the non-reference allele from column 4 - // else if (col == 2) - // { - // alt_ad = atoi(tok); - // } - - // // Move to the next token - // tok = strtok(NULL, ","); - // col++; - // } - - // // Calculate the B-allele frequency (BAF) as the ratio of the alternate - // // allele depth to the total depth (reference + alternate) - // double baf = (double) alt_ad / (double) (ref_ad + alt_ad); - - // // Add a new location and BAF value to the chromosome's SNP data - // // (population frequency and log2 ratio will be added later) - // // snp_info.insertSNPAlleleFrequency(chr_no_prefix, pos, baf); - // this->snp_baf_map[pos] = baf; - // this->snp_baf_keys.insert(pos); - // } - - // pclose(fp); // Close the process - - if (this->input_data.getVerbose()) { - std::cout << "Finished extracting B-allele frequency data from filtered SNPs" << std::endl; - } } -// void 
CNVCaller::getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info) -// { -// // Get the population frequency file for the chromosome -// std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); -// if (pfb_filepath.empty()) -// { -// std::cout << "No population frequency file provided for chromosome " << chr << std::endl; -// return; -// } - -// // Determine the ethnicity-specific allele frequency key -// std::string AF_key = "AF"; -// if (this->input_data.getEthnicity() != "") -// { -// AF_key += "_" + this->input_data.getEthnicity(); -// } - -// // Check if the filepath uses the 'chr' prefix notations based on the -// // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz) -// std::string chr_gnomad = chr; // gnomAD data may or may not have the 'chr' prefix -// std::string chr_prefix = "chr"; -// if (pfb_filepath.find(chr_prefix) == std::string::npos) -// { -// // Remove the 'chr' prefix from the chromosome name -// if (chr_gnomad.find(chr_prefix) != std::string::npos) -// { -// chr_gnomad = chr_gnomad.substr(chr_prefix.length()); -// } -// } else { -// // Add the 'chr' prefix to the chromosome name -// if (chr_gnomad.find(chr_prefix) == std::string::npos) -// { -// chr_gnomad = chr_prefix + chr; -// } -// } - -// // Remove the 'chr' prefix from the chromosome name for SNP data. 
All -// // SNP data in this program does not use the 'chr' prefix -// std::string chr_no_prefix = removeChrPrefix(chr); - -// std::cout << "Reading population frequencies for chromosome " << chr << " from " << pfb_filepath << std::endl; -// int thread_count = this->input_data.getThreadCount(); - -// // Open the population frequency file -// std::cout << "Opening population frequency file: " << pfb_filepath << std::endl; -// htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r"); -// if (!pfb_file) -// { -// throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath); -// } - -// // Enable multi-threading -// std::cout << "Setting number of threads to " << thread_count << std::endl; -// hts_set_threads(pfb_file, thread_count); - -// // Read the header -// std::cout << "Reading header from population frequency file..." << std::endl; -// bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file); -// if (!pfb_header) -// { -// bcf_close(pfb_file); -// throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath); -// } - -// // Set up the record -// std::cout << "Initializing BCF record..." << std::endl; -// bcf1_t *pfb_record = bcf_init(); -// if (!pfb_record) -// { -// bcf_hdr_destroy(pfb_header); -// bcf_close(pfb_file); -// throw std::runtime_error("ERROR: Could not initialize BCF record."); -// } - -// // Read the population frequencies for the chromosome -// std::cout << "[TEST] Reading population frequencies for chromosome " << chr << " (AF_key = " << AF_key << ")..." << std::endl; -// int print_count = 0; -// while (bcf_read(pfb_file, pfb_header, pfb_record) == 0) -// { -// // Get the chromosome and position -// // std::cout << "Reading record..." 
<< std::endl; -// uint32_t pos = pfb_record->pos + 1; // 0-based to 1-based - -// // Skip if not a SNP, or if the position is not in the BAF map -// if (!bcf_is_snp(pfb_record) || this->snp_baf_keys.count(pos) == 0) -// { -// continue; -// } - -// // Get the population frequency for the SNP -// float *pfb_f = NULL; -// int count = 0; -// int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); -// if (pfb_status < 0 || count == 0) -// { -// std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; -// continue; -// } -// double pfb = (double) pfb_f[0]; -// free(pfb_f); - -// // Continue if the population frequency is outside the threshold -// if (pfb <= MIN_PFB || pfb >= MAX_PFB) -// { -// continue; -// } - -// // Add the population frequency to the SNP data -// // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); -// if (this->snp_pfb_map.find(pos) == this->snp_pfb_map.end()) -// { -// this->snp_pfb_map[pos] = pfb; -// } else { -// // Keep the larger population frequency -// if (pfb > this->snp_pfb_map[pos]) -// { -// this->snp_pfb_map[pos] = pfb; -// } -// } -// if (print_count < 10) -// { -// std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; -// print_count++; -// } -// } -// std::cout << "Finished reading population frequencies for chromosome " << chr << std::endl; -// } - void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map) { // Get the population frequency file for the chromosome std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); if (pfb_filepath == "") { - std::cout << "No population frequency file provided for chromosome " << chr << std::endl; + printError("No population frequency file provided for chromosome " + chr); return; } @@ -1249,7 +802,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos // Remove the 'chr' 
prefix from the chromosome name for SNP data. All // SNP data in this program does not use the 'chr' prefix std::string chr_no_prefix = removeChrPrefix(chr); - int thread_count = this->input_data.getThreadCount(); + // int thread_count = this->input_data.getThreadCount(); // Initialize the synced reader bcf_srs_t *pfb_reader = bcf_sr_init(); @@ -1267,7 +820,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos } // Set multi-threading - bcf_sr_set_threads(pfb_reader, thread_count); + // bcf_sr_set_threads(pfb_reader, thread_count); // Enable index usage pfb_reader->require_index = 1; @@ -1294,7 +847,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos { continue; } - // std::cout << "Reading record..." << std::endl; // pfb_record = bcf_sr_get_line(pfb_reader, 0); bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); // Do something with the record @@ -1304,7 +856,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos // Skip if not a SNP if (!bcf_is_snp(pfb_record)) { - // std::cout << "Skipping non-SNP at " << chr << ":" << pfb_record->pos << std::endl; continue; } @@ -1316,7 +867,6 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); if (pfb_status < 0 || count == 0) { - // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; continue; } double pfb = (double) pfb_f[0]; @@ -1339,58 +889,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos snp_pfb_map[pos] = pfb; } } - - // if (test_count < 10) - // { - // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - // test_count++; - // } } - // std::cout << "Record: " << pfb_record->pos << std::endl; - // std::cout << "QUAL: " << pfb_record->qual << std::endl; - - // // Skip if not a SNP - 
// if (!bcf_is_snp(pfb_record)) - // { - // std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl; - // continue; - // } - - // // Get the population frequency for the SNP - // float *pfb_f = NULL; - // int count = 0; - // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); - // if (pfb_status < 0 || count == 0) - // { - // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; - // continue; - // } - // double pfb = (double) pfb_f[0]; - // free(pfb_f); - - // // Continue if the population frequency is outside the threshold - // if (pfb <= MIN_PFB || pfb >= MAX_PFB) - // { - // continue; - // } - - // // Add the population frequency to the SNP data - // // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); - // if (snp_pfb_map.find(pos) == snp_pfb_map.end()) - // { - // snp_pfb_map[pos] = pfb; - // } else { - // // Keep the larger population frequency - // if (pfb > snp_pfb_map[pos]) - // { - // snp_pfb_map[pos] = pfb; - // } - // } - // if (print_count < 10) - // { - // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - // print_count++; - // } } } if (pfb_reader->errnum) { @@ -1398,129 +897,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos } // Clean up - // bcf_destroy(pfb_record); bcf_sr_destroy(pfb_reader); - - // // Open the population frequency file - // std::cout << "Opening population frequency file: " << pfb_filepath << std::endl; - // htsFile *pfb_file = hts_open(pfb_filepath.c_str(), "r"); - // if (!pfb_file) - // { - // throw std::runtime_error("ERROR: Could not open population frequency file: " + pfb_filepath); - // } - - // // Enable multi-threading - // std::cout << "Setting number of threads to " << thread_count << std::endl; - // hts_set_threads(pfb_file, thread_count); - - // // Read the header - // std::cout << "Reading header from population frequency file..." 
<< std::endl; - // bcf_hdr_t *pfb_header = bcf_hdr_read(pfb_file); - // if (!pfb_header) - // { - // bcf_close(pfb_file); - // throw std::runtime_error("ERROR: Could not read header from population frequency file: " + pfb_filepath); - // } - - // // Load the index - // hts_idx_t *pfb_index = bcf_index_load(pfb_filepath.c_str()); - // if (!pfb_index) - // { - // bcf_hdr_destroy(pfb_header); - // bcf_close(pfb_file); - // throw std::runtime_error("ERROR: Could not load index for population frequency file: " + pfb_filepath); - // } - - // // Construct the region string - // std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - // hts_itr_t *pfb_iter = bcf_itr_querys(pfb_index, pfb_header, region_str.c_str()); - // if (!pfb_iter) - // { - // // Try using the other chromosome notation - // std::string alt_region_str = "chr" + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - // pfb_iter = bcf_itr_querys(pfb_index, pfb_header, alt_region_str.c_str()); - // if (!pfb_iter) - // { - // hts_idx_destroy(pfb_index); - // bcf_hdr_destroy(pfb_header); - // bcf_close(pfb_file); - // throw std::runtime_error("ERROR: Could not create iterator for region: " + alt_region_str); - // } else { - // region_str = alt_region_str; - // std::cout << "Successfully created iterator for region: " << region_str << std::endl; - // } - // // hts_idx_destroy(pfb_index); - // // bcf_hdr_destroy(pfb_header); - // // bcf_close(pfb_file); - // // throw std::runtime_error("ERROR: Could not create iterator for region: " + region_str); - // } - - // // Set up the record - // std::cout << "Initializing BCF record..." 
<< std::endl; - // bcf1_t *pfb_record = bcf_init(); - // if (!pfb_record) - // { - // bcf_hdr_destroy(pfb_header); - // bcf_close(pfb_file); - // throw std::runtime_error("ERROR: Could not initialize BCF record."); - // } - - // // Read the population frequencies for the region - // std::cout << "[TEST] Reading population frequencies for region " << region_str << " (AF_key = " << AF_key << ")..." << std::endl; - // int print_count = 0; - // int test_count = 0; - // while (bcf_itr_next(pfb_file, pfb_iter, pfb_record) >= 0) - // { - // test_count++; - // // Get the chromosome and position - // // std::cout << "Reading record..." << std::endl; - // uint32_t pos = pfb_record->pos + 1; // 0-based to 1-based - - // // Skip if not a SNP - // if (!bcf_is_snp(pfb_record)) - // { - // std::cout << "Skipping non-SNP at " << chr << ":" << pos << std::endl; - // continue; - // } - - // // Get the population frequency for the SNP - // float *pfb_f = NULL; - // int count = 0; - // int pfb_status = bcf_get_info_float(pfb_header, pfb_record, AF_key.c_str(), &pfb_f, &count); - // if (pfb_status < 0 || count == 0) - // { - // std::cout << "Field " << AF_key << " not found, or count is 0" << std::endl; - // continue; - // } - // double pfb = (double) pfb_f[0]; - // free(pfb_f); - - // // Continue if the population frequency is outside the threshold - // if (pfb <= MIN_PFB || pfb >= MAX_PFB) - // { - // continue; - // } - - // // Add the population frequency to the SNP data - // // snp_info.insertSNPPopulationFrequency(chr_no_prefix, pos, pfb); - // if (snp_pfb_map.find(pos) == snp_pfb_map.end()) - // { - // snp_pfb_map[pos] = pfb; - // } else { - // // Keep the larger population frequency - // if (pfb > snp_pfb_map[pos]) - // { - // snp_pfb_map[pos] = pfb; - // } - // } - // if (print_count < 10) - // { - // std::cout << "Population frequency for " << chr << ":" << pos << " = " << pfb << std::endl; - // print_count++; - // } - // } - // std::cout << "Finished reading population 
frequencies for region " << region_str << std::endl; - // std::cout << "Test count: " << test_count << std::endl; } void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) @@ -1620,36 +997,17 @@ void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::se std::string snp_chr = chr; chr = removeChrPrefix(chr); - // Create an ordered map of SNP positions to BAF and PFB values + // Query the SNP allele frequencies for the SNPs std::map> snp_map; - - // Query SNPs within a range (start, end) and return their BAF and PFB - // values as separate vectors - // std::vector bafs; - // std::vector pfbs; - // std::vector pos; - double pfb_default = 0.5; - - // Read the SNP data from the VCF file this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf); - // Query the SNPs within the range and return their BAFs and corresponding - // positions - // auto snp_start = this->snp_baf_keys.lower_bound(start); - // auto snp_end = this->snp_baf_keys.upper_bound(end); - // if (snp_start == this->snp_baf_keys.end()) - // { - // // return std::make_tuple(pos, bafs, pfbs); - // return; - // } - // Query the population frequencies for the SNPs std::unordered_map pfb_map; this->readSNPPopulationFrequencies(chr, start, end, pfb_map); // Filter out the SNP population frequencies that are not in the SNP // position set - // std::unordered_map snp_pfb; + double pfb_default = 0.5; for (auto& pos : snp_pos) { if (pfb_map.find(pos) != pfb_map.end()) @@ -1659,55 +1017,4 @@ void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::se snp_pfb[pos] = pfb_default; } } - - // // Get the PFB values for the SNPs from the keys - // // Create the PFB vector using the SNP positions (loop through snp_pos, - // // query the pfb_map, and push the value to the vector) - // for (size_t i = 0; i < snp_pos.size(); i++) - // { - // uint32_t snp_pos = 
snp_pos[i]; - // double pfb = pfb_default; - // if (pfb_map.find(snp_pos) != pfb_map.end()) - // { - // pfb = pfb_map[snp_pos]; - // } else { - // pfb = pfb_default; - // } - // snp_pfb.push_back(pfb); - // } - - // // Get the PFB values for the SNPs from the keys - // for (auto it = snp_start; it != snp_end; it++) - // { - // uint32_t snp_pos = *it; - // pos.push_back(snp_pos); - // bafs.push_back(this->snp_baf_map[snp_pos]); - - // // Get the PFB value for the SNP - // if (this->snp_pfb_map.find(snp_pos) != this->snp_pfb_map.end()) - // { - // pfbs.push_back(this->snp_pfb_map[snp_pos]); - // } else { - // pfbs.push_back(pfb_default); - // } - // } - // auto& baf_bst = this->snp_baf_map[chr]; - // auto baf_start = baf_bst.lower_bound({start, 0.0}); - // auto baf_end = baf_bst.upper_bound({end, 0.0}); - // for (auto it = baf_start; it != baf_end; it++) { - // bafs.push_back(std::get<1>(*it)); - // pos.push_back(std::get<0>(*it)); - // } - - - - // auto& pfb_map = this->snp_pfb_map[chr]; - // for (size_t i = 0; i < pos.size(); i++) { - // uint32_t snp_pos = pos[i]; - // if (pfb_map.find(snp_pos) != pfb_map.end()) { - // pfbs[i] = pfb_map[snp_pos]; - // } - // } - - // return std::make_tuple(pos, bafs, pfbs); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 7335148e..aa32dda2 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -39,29 +39,6 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) // RegionData SVCaller::detectSVsFromRegion(std::string region) std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region) { - // // Open the BAM file - // std::string bam_filepath = this->input_data.getLongReadBam(); - // samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); - // if (fp_in == NULL) { - // std::cerr << "ERROR: failed to open " << bam_filepath << std::endl; - // exit(1); - // } - - // // Load the header for the BAM file - // bam_hdr_t *bamHdr = 
sam_hdr_read(fp_in); - // if (!bamHdr) { - // sam_close(fp_in); - // throw std::runtime_error("ERROR: failed to read header for " + bam_filepath); - // } - - // // Load the index for the BAM file - // hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); - // if (!idx) { - // bam_hdr_destroy(bamHdr); - // sam_close(fp_in); - // throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); - // } - // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { @@ -80,7 +57,6 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi } // Main loop to process the alignments - // SVData sv_calls; std::set sv_calls; int num_alignments = 0; PrimaryMap primary_alignments; @@ -138,12 +114,6 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); - - // hts_itr_destroy(itr); - // bam_destroy1(bam1); - // hts_idx_destroy(idx); - // bam_hdr_destroy(bamHdr); - // sam_close(fp_in); return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); } @@ -204,11 +174,6 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // for sequence identity between the insertion and the // reference genome (duplications are typically >= 90%) - // Loop from the leftmost position of the insertion (pos-op_len) - // to the rightmost position of the insertion (pos+op_len-1) and - // calculate the sequence identity at each window of the - // insertion length to identify potential duplications. - // Loop through the reference sequence and calculate the // sequence identity +/- insertion length from the insertion // position. 
@@ -253,18 +218,8 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr ref_pos = pos+1; ref_end = ref_pos + op_len -1; if (is_duplication) { - // sv_calls.add(chr, ref_pos, ref_end, SVType::DUP, - // ins_seq_str, "CIGARDUP", "./.", default_lh); - //printMessage("[TEST] FOUND CIGAR DUP"); - // sv_calls.insert(SVCall{(uint32_t)ref_pos, - // (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", - // default_lh}); addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh); } else { - // sv_calls.add(chr, ref_pos, ref_end, SVType::INS, ins_seq_str, "CIGARINS", "./.", default_lh); - // sv_calls.insert(SVCall{(uint32_t)ref_pos, - // (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", - // default_lh}); addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh); } } @@ -277,10 +232,6 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - // sv_calls.add(chr, ref_pos, ref_end, SVType::DEL, ".", - // "CIGARDEL", "./.", default_lh); // Add to SV calls (1-based) - // sv_calls.insert(SVCall{(uint32_t)ref_pos, (uint32_t)ref_end, - // "DEL", ".", "CIGARDEL", "./.", default_lh}); addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh); } @@ -319,8 +270,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Check that the two sequence lengths are equal if (cmatch_seq_str.length() != cmatch_ref_str.length()) { - std::cerr << "ERROR: Sequence lengths do not match" << std::endl; - exit(1); + throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); } // Compare the two sequences and update the mismatch map @@ -340,8 +290,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { // Do nothing } else { - std::cerr << "ERROR: Unknown 
CIGAR operation " << op << std::endl; - exit(1); + throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } // Update the query position based on the CIGAR operation (M, I, S, H) @@ -350,8 +299,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr } else if (op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { // Do nothing } else { - std::cerr << "ERROR: Unknown CIGAR operation " << op << std::endl; - exit(1); + throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } } @@ -360,43 +308,22 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); } -std::unordered_map> SVCaller::run() +void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set& combined_sv_calls, int min_cnv_length) { - // Get the chromosomes to process - std::vector chromosomes; - if (this->input_data.getChromosome() != "") { - chromosomes.push_back(this->input_data.getChromosome()); - } else { - chromosomes = this->input_data.getRefGenomeChromosomes(); - } - // Open the BAM file - std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (!fp_in) { throw std::runtime_error("ERROR: failed to open " + bam_filepath); } - // Read the HMM from the file - std::string hmm_filepath = this->input_data.getHMMFilepath(); - std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; - CHMM hmm = ReadCHMM(hmm_filepath.c_str()); - - // Enable multi-threading - int num_threads = this->input_data.getThreadCount(); - if (num_threads > 1) { - std::cout << "Running SV detection with " << num_threads << " thread(s)..." 
<< std::endl; - } - hts_set_threads(fp_in, num_threads); - - // Load the header for the BAM file + // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); if (!bamHdr) { sam_close(fp_in); - throw std::runtime_error("ERROR: failed to read header for " + bam_filepath); + throw std::runtime_error("ERROR: failed to read header from " + bam_filepath); } - // Load the index for the BAM file + // Load the index hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); if (!idx) { bam_hdr_destroy(bamHdr); @@ -404,97 +331,196 @@ std::unordered_map> SVCaller::run() throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); } - // Loop through each region and detect SVs in chunks - int chr_count = chromosomes.size(); - int current_chr = 0; - std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl; - int chunk_count = 100; // Number of chunks to split the chromosome into - uint32_t total_sv_count = 0; - std::unordered_map> whole_genome_sv_calls; - int min_cnv_length = this->input_data.getMinCNVLength(); - for (const auto& chr : chromosomes) { - std::cout << "Running SV detection for chromosome " << chr << "..." << std::endl; - - // Split the chromosome into chunks - std::vector region_chunks; - if (this->input_data.isRegionSet()) { - - // Use one chunk for the specified region - std::pair region = this->input_data.getRegion(); - int region_start = region.first; - int region_end = region.second; - std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); - region_chunks.push_back(chunk); - std::cout << "Using specified region " << chunk << "..." 
<< std::endl; - - } else { - int chr_len = this->input_data.getRefGenomeChromosomeLength(chr); - int chunk_size = std::ceil((double)chr_len / chunk_count); - for (int i = 0; i < chunk_count; i++) { - int start = i * chunk_size + 1; // 1-based - int end = start + chunk_size; - if (i == chunk_count - 1) { - end = chr_len; - } - std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end); - region_chunks.push_back(chunk); + // Split the chromosome into chunks for memory efficiency + std::vector region_chunks; + int chunk_count = 100; + if (this->input_data.isRegionSet()) { + + // Use one chunk for the specified region + std::pair region = this->input_data.getRegion(); + int region_start = region.first; + int region_end = region.second; + std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); + region_chunks.push_back(chunk); + // std::cout << "Using specified region " << chunk << "..." << std::endl; + + } else { + int chr_len = this->input_data.getRefGenomeChromosomeLength(chr); + int chunk_size = std::ceil((double)chr_len / chunk_count); + for (int i = 0; i < chunk_count; i++) { + int start = i * chunk_size + 1; // 1-based + int end = start + chunk_size; + if (i == chunk_count - 1) { + end = chr_len; } - std::cout << "Split chromosome " << chr << " into " << region_chunks.size() << " chunks of size " << chunk_size << "..." << std::endl; + std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end); + region_chunks.push_back(chunk); } + printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "..."); + } - // Load chromosome data for copy number predictions - std::cout << "Loading chromosome data for copy number predictions..." 
<< std::endl; - CNVCaller cnv_caller(this->input_data); - cnv_caller.loadChromosomeData(chr); - - // Process each chunk one at a time - std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; - int region_count = region_chunks.size(); - int current_region = 0; - std::set combined_sv_calls; - for (const auto& sub_region : region_chunks) { - std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region); - std::set& subregion_sv_calls = std::get<0>(region_data); - PrimaryMap& primary_map = std::get<1>(region_data); - SuppMap& supp_map = std::get<2>(region_data); - std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; - mergeSVs(subregion_sv_calls); - int region_sv_count = getSVCount(subregion_sv_calls); - printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); - - // Run copy number variant predictions on the SVs detected from the - // CIGAR string, using a minimum CNV length threshold - if (region_sv_count > 0) { - std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; - cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm); - } + // Load chromosome data for copy number predictions + // std::cout << "Loading chromosome data for copy number predictions..." << std::endl; + CNVCaller cnv_caller(this->input_data); + cnv_caller.loadChromosomeData(chr); + + // Process each chunk one at a time + // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." 
<< std::endl; + int region_count = region_chunks.size(); + int current_region = 0; + // std::set combined_sv_calls; + for (const auto& sub_region : region_chunks) { + std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region); + std::set& subregion_sv_calls = std::get<0>(region_data); + PrimaryMap& primary_map = std::get<1>(region_data); + SuppMap& supp_map = std::get<2>(region_data); + // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; + mergeSVs(subregion_sv_calls); + int region_sv_count = getSVCount(subregion_sv_calls); + // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + + // Run copy number variant predictions on the SVs detected from the + // CIGAR string, using a minimum CNV length threshold + if (region_sv_count > 0) { + // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; + cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm); + } - // Run split-read SV and copy number variant predictions - std::cout << "Detecting copy number variants from split reads..." << std::endl; - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm); + // Run split-read SV and copy number variant predictions + // std::cout << "Detecting copy number variants from split reads..." << std::endl; + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm); - // Merge the SV calls from the current region - std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; - mergeSVs(subregion_sv_calls); + // Merge the SV calls from the current region + // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; + mergeSVs(subregion_sv_calls); - // Combine the SV calls from the current region - std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; - concatenateSVCalls(combined_sv_calls, subregion_sv_calls); - std::cout << "Completed " << ++current_region << " of " << region_count << " region(s)..." << std::endl; - } - - std::cout << "Completed " << ++current_chr << " of " << chr_count << " chromosome(s)..." << std::endl; - int chr_sv_count = getSVCount(combined_sv_calls); - whole_genome_sv_calls[chr] = combined_sv_calls; - std::cout << "Total SVs detected for chromosome " << chr << ": " << chr_sv_count << std::endl; - total_sv_count += chr_sv_count; - std::cout << "Cumulative total SVs: " << total_sv_count << std::endl; + // Combine the SV calls from the current region + // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl; + concatenateSVCalls(combined_sv_calls, subregion_sv_calls); + current_region++; + printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "..."); } // Clean up the BAM file, header, and index hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); +} + +std::unordered_map> SVCaller::run() +{ + // Get the chromosomes to process + std::vector chromosomes; + if (this->input_data.getChromosome() != "") { + chromosomes.push_back(this->input_data.getChromosome()); + } else { + chromosomes = this->input_data.getRefGenomeChromosomes(); + } + + // Ignore all alternate contigs (contains 'alt', 'GL', 'NC', 'hs', etc.) 
+ chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) { + return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos; + }), chromosomes.end()); + + // Read the HMM from the file + std::string hmm_filepath = this->input_data.getHMMFilepath(); + std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; + CHMM hmm = ReadCHMM(hmm_filepath.c_str()); + + // Set up threads for processing each chromosome + std::vector> futures; + std::unordered_map> whole_genome_sv_calls; + std::mutex sv_mutex; + + // Set a thread count for processing each chromosome. Keep it low to avoid + // memory issues. + int max_threads = 6; // Number of chromosomes to process in parallel + int batch_count = 0; + int completed_threads = 0; + int chr_count = chromosomes.size(); + for (const auto& chr : chromosomes) { + printMessage("Launching thread for chromosome " + chr + "..."); + futures.push_back(std::async(std::launch::async, [&]() { + std::set sv_calls; + this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); + { + std::lock_guard lock(sv_mutex); + whole_genome_sv_calls[chr] = std::move(sv_calls); + } + } + )); + batch_count++; + if (batch_count >= max_threads || batch_count >= chr_count) { + // Wait for all threads to finish + // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)..."); + for (auto& future : futures) { + future.get(); + completed_threads++; + printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + } + // completed_threads += batch_count; + // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + batch_count = 0; + futures.clear(); + } + } + + // Wait for remaining threads to 
finish + if (futures.size() > 0) { + // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)..."); + for (auto& future : futures) { + future.get(); + completed_threads++; + printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + } + // completed_threads += futures.size(); + // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + } + + // // Loop through each region and detect SVs in chunks + // std::string bam_filepath = this->input_data.getLongReadBam(); + // int chr_count = chromosomes.size(); + // std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." << std::endl; + // // int thread_count = this->input_data.getThreadCount(); + // int thread_count = chr_count; + // int min_cnv_length = this->input_data.getMinCNVLength(); + // for (const auto& chr : chromosomes) { + // printMessage("Launching thread for chromosome " + chr + "..."); + // futures.push_back(std::async(std::launch::async, [&]() { + // std::set sv_calls; + // this->processChromosome(chr, bam_filepath, hmm, sv_calls, min_cnv_length); + // { + // std::lock_guard lock(sv_mutex); + // whole_genome_sv_calls[chr] = std::move(sv_calls); + // } + // } + // )); + // } + + // // Wait for all threads to finish + // printMessage("Waiting for all threads to finish for " + std::to_string(chr_count) + " chromosome(s)..."); + // int threads_finished = 0; + // for (auto& future : futures) { + // try{ + // // future.wait(); + // future.get(); // Wait and handle exceptions + // threads_finished++; + // printMessage("Completed " + std::to_string(threads_finished) + " of " + std::to_string(thread_count) + " threads..."); + // } catch (const std::exception& e) { + // std::cerr << "Error in thread: " << e.what() << std::endl; + // } + // } + printMessage("All threads have finished."); + + // Print the total 
number of SVs detected for each chromosome + uint32_t total_sv_count = 0; + for (const auto& entry : whole_genome_sv_calls) { + std::string chr = entry.first; + int sv_count = getSVCount(entry.second); + total_sv_count += sv_count; + printMessage("Total SVs detected for chromosome " + chr + ": " + std::to_string(sv_count)); + } + printMessage("Total SVs detected for all chromosomes: " + std::to_string(total_sv_count)); // Save to VCF std::cout << "Saving SVs to VCF..." << std::endl; @@ -706,16 +732,10 @@ void SVCaller::saveToVCF(const std::unordered_map std::string sv_method = "CONTEXTSVv0.1"; int skip_count = 0; int total_count = 0; - // std::set chrs = this->getChromosomes(); - //for (auto const& chr : chrs) { - for (const auto& pair : sv_calls) { - // if (this->sv_calls.find(chr) == this->sv_calls.end()) { - // continue; - // } + for (const auto& pair : sv_calls) { std::string chr = pair.first; const std::set& sv_calls = pair.second; std::cout << "Saving SV calls for " << chr << "..." 
<< std::endl; - // for (auto const& sv_call : this->sv_calls[chr]) { for (const auto& sv_call : sv_calls) { // Get the SV candidate and SV info uint32_t start = sv_call.start; From 128575ab4d5dbf1de07bdb907435953fffaddd4c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 26 Nov 2024 22:04:11 -0500 Subject: [PATCH 033/134] implement thread pooling --- src/cnv_caller.cpp | 4 +- src/sv_caller.cpp | 129 ++++++++++++++++++++++++--------------------- src/sv_object.cpp | 6 +-- 3 files changed, 75 insertions(+), 64 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index fd7a6c7c..a38c37a4 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -223,9 +223,9 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set & { int window_size = this->input_data.getWindowSize(); double mean_chr_cov = this->mean_chr_cov; - printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); + // printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov); - printMessage("Finished predicting copy number states for chromosome " + chr + "..."); + // printMessage("Finished predicting copy number states for chromosome " + chr + "..."); } void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index aa32dda2..bdcd081c 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include "utils.h" #include "sv_types.h" @@ -428,89 +429,99 @@ std::unordered_map> SVCaller::run() CHMM hmm = ReadCHMM(hmm_filepath.c_str()); // Set up threads for processing each chromosome + // const int max_threads = 6; + const int max_threads = 10; std::vector> futures; std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; + std::condition_variable cv; + 
int active_threads = 0; - // Set a thread count for processing each chromosome. Keep it low to avoid - // memory issues. - int max_threads = 6; // Number of chromosomes to process in parallel - int batch_count = 0; - int completed_threads = 0; - int chr_count = chromosomes.size(); - for (const auto& chr : chromosomes) { + // Lambda to process a chromosome + auto process_chr = [&](const std::string& chr) { printMessage("Launching thread for chromosome " + chr + "..."); - futures.push_back(std::async(std::launch::async, [&]() { - std::set sv_calls; - this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); - { - std::lock_guard lock(sv_mutex); - whole_genome_sv_calls[chr] = std::move(sv_calls); - } + std::set sv_calls; + this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); + { + std::lock_guard lock(sv_mutex); + whole_genome_sv_calls[chr] = std::move(sv_calls); } - )); - batch_count++; - if (batch_count >= max_threads || batch_count >= chr_count) { - // Wait for all threads to finish - // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)..."); - for (auto& future : futures) { - future.get(); - completed_threads++; - printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); - } - // completed_threads += batch_count; - // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); - batch_count = 0; - futures.clear(); + printMessage("Completed chromosome " + chr); + + // Notify thread completion + { + std::lock_guard lock(sv_mutex); + active_threads--; + } + cv.notify_one(); + }; + + // Thread management + std::vector threads; + for (const auto& chr : chromosomes) { + { + std::unique_lock lock(sv_mutex); + cv.wait(lock, [&] { return active_threads < max_threads; }); + 
active_threads++; } + + // Launch a new thread + threads.emplace_back(process_chr, chr); } - // Wait for remaining threads to finish - if (futures.size() > 0) { - // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)..."); - for (auto& future : futures) { - future.get(); - completed_threads++; - printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + // Wait for all threads to complete + for (auto& thread : threads) { + if (thread.joinable()) { + thread.join(); } - // completed_threads += futures.size(); - // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); } - // // Loop through each region and detect SVs in chunks - // std::string bam_filepath = this->input_data.getLongReadBam(); + printMessage("All threads have finished."); + + ///////////////////////////////////////////////// + + // // Set a thread count for processing each chromosome. Keep it low to avoid + // // memory issues. + // int batch_count = 0; + // int completed_threads = 0; // int chr_count = chromosomes.size(); - // std::cout << "Detecting SVs from " << chr_count << " chromosome(s)..." 
<< std::endl; - // // int thread_count = this->input_data.getThreadCount(); - // int thread_count = chr_count; - // int min_cnv_length = this->input_data.getMinCNVLength(); // for (const auto& chr : chromosomes) { // printMessage("Launching thread for chromosome " + chr + "..."); // futures.push_back(std::async(std::launch::async, [&]() { // std::set sv_calls; - // this->processChromosome(chr, bam_filepath, hmm, sv_calls, min_cnv_length); + // this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); // { // std::lock_guard lock(sv_mutex); // whole_genome_sv_calls[chr] = std::move(sv_calls); // } // } // )); + // batch_count++; + // if (batch_count >= max_threads || batch_count >= chr_count) { + // // Wait for all threads to finish + // // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)..."); + // for (auto& future : futures) { + // future.get(); + // completed_threads++; + // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + // } + // // completed_threads += batch_count; + // // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); + // batch_count = 0; + // futures.clear(); + // } // } - // // Wait for all threads to finish - // printMessage("Waiting for all threads to finish for " + std::to_string(chr_count) + " chromosome(s)..."); - // int threads_finished = 0; - // for (auto& future : futures) { - // try{ - // // future.wait(); - // future.get(); // Wait and handle exceptions - // threads_finished++; - // printMessage("Completed " + std::to_string(threads_finished) + " of " + std::to_string(thread_count) + " threads..."); - // } catch (const std::exception& e) { - // std::cerr << "Error in thread: " << e.what() << std::endl; + // // Wait for remaining threads to finish + // if (futures.size() > 0) { + // // 
printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)..."); + // for (auto& future : futures) { + // future.get(); + // completed_threads++; + // printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); // } // } - printMessage("All threads have finished."); + // printMessage("All threads have finished."); // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; @@ -650,9 +661,9 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p } // Print the number of SVs detected from split-read alignments - if (sv_count > 0) { - std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl; - } + // if (sv_count > 0) { + // std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl; + // } } void SVCaller::saveToVCF(const std::unordered_map >& sv_calls) diff --git a/src/sv_object.cpp b/src/sv_object.cpp index c28d6b21..09203b59 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -158,7 +158,7 @@ void mergeSVs(std::set& sv_calls) { } // Merge SV calls if they overlap by at least 50% - int initial_size = sv_calls.size(); + // int initial_size = sv_calls.size(); std::vector merged_sv_calls; auto it = sv_calls.begin(); SVCall current_merge = *it++; @@ -214,6 +214,6 @@ void mergeSVs(std::set& sv_calls) { for (const auto& sv_call : merged_sv_calls) { sv_calls.insert(sv_call); } - int updated_size = sv_calls.size(); - std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; + // int updated_size = sv_calls.size(); + // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; } From d62fe120fd2dd919eb417511cd6f021732e0ecba Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 27 Nov 2024 14:46:55 -0500 Subject: [PATCH 034/134] fix duplication error --- 
Makefile-cpp | 2 +- src/cnv_caller.cpp | 2 +- src/sv_caller.cpp | 33 ++++++++++++++++++++++++++------- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/Makefile-cpp b/Makefile-cpp index e6ba7d30..2b117f0e 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -11,7 +11,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib # Compiler and Flags CXX := g++ -CXXFLAGS := -std=c++14 -I$(INCL_DIR) -I$(CONDA_INCL_DIR) +CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries # Link htslib diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index a38c37a4..feace2e8 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -769,7 +769,7 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); if (pfb_filepath == "") { - printError("No population frequency file provided for chromosome " + chr); + // printError("No population frequency file provided for chromosome " + chr); return; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index bdcd081c..b38e6fe8 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -173,14 +173,15 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // To determine whether the insertion is a duplication, check // for sequence identity between the insertion and the - // reference genome (duplications are typically >= 90%) - + // reference genome (duplications are typically >= 90%): // Loop through the reference sequence and calculate the // sequence identity +/- insertion length from the insertion // position. 
bool is_duplication = false; int ins_ref_pos; - for (int j = pos - op_len; j <= pos; j++) { + int dup_start = std::max(0, pos - op_len); + // for (int j = pos - op_len; j <= pos; j++) { + for (int j = dup_start; j <= pos; j++) { // Get the string for the window (1-based coordinates) ins_ref_pos = j + 1; @@ -267,6 +268,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Get the corresponding reference sequence int cmatch_pos = pos + 1; // Querying the reference genome is 1-based + // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); // Check that the two sequence lengths are equal @@ -362,6 +364,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Load chromosome data for copy number predictions // std::cout << "Loading chromosome data for copy number predictions..." << std::endl; + printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); cnv_caller.loadChromosomeData(chr); @@ -371,11 +374,14 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ int current_region = 0; // std::set combined_sv_calls; for (const auto& sub_region : region_chunks) { + current_region++; + printMessage(chr + ": CIGAR SVs..."); std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region); std::set& subregion_sv_calls = std::get<0>(region_data); PrimaryMap& primary_map = std::get<1>(region_data); SuppMap& supp_map = std::get<2>(region_data); // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." 
<< std::endl; + printMessage(chr + ": Merging CIGAR..."); mergeSVs(subregion_sv_calls); int region_sv_count = getSVCount(subregion_sv_calls); // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -384,21 +390,25 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // CIGAR string, using a minimum CNV length threshold if (region_sv_count > 0) { // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; + printMessage(chr + ": CIGAR predictions..."); cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm); } // Run split-read SV and copy number variant predictions // std::cout << "Detecting copy number variants from split reads..." << std::endl; + printMessage(chr + ": Split read SVs..."); this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; + printMessage(chr + ": Merging split reads..."); mergeSVs(subregion_sv_calls); // Combine the SV calls from the current region // std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; + printMessage(chr + ": Concatenating calls..."); concatenateSVCalls(combined_sv_calls, subregion_sv_calls); - current_region++; + printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "..."); } @@ -422,15 +432,24 @@ std::unordered_map> SVCaller::run() chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) { return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos; }), chromosomes.end()); - + + /* + // Test only on a subset 241125_ALL/output.merged.vcf + chromosomes = {"chr2", "chr3", "chr5", "chr6", "chr7", "chr4"}; + */ + + + // Test only on a subset 241125_ALL/output.merged.vcf + // chromosomes = {"chrM", "chr8", "chr9", "chr10", "chr11", "chr1"}; + // Read the HMM from the file std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; CHMM hmm = ReadCHMM(hmm_filepath.c_str()); // Set up threads for processing each chromosome - // const int max_threads = 6; - const int max_threads = 10; + const int max_threads = 8; + // const int max_threads = 10; std::vector> futures; std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; From da4d72f142ee7727ab8f6c277075202344297cd9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 30 Nov 2024 12:27:08 -0500 Subject: [PATCH 035/134] add arguments and version and reduce copies --- Makefile-cpp | 10 ++- include/cnv_caller.h | 8 +- include/khmm.h | 14 ++-- include/sv_caller.h | 4 +- include/version.h | 2 + src/cnv_caller.cpp | 9 ++- src/input_data.cpp | 2 - src/khmm.cpp | 38 --------- src/main.cpp | 189 +++++++++++++++++++++++++++++++------------ src/sv_caller.cpp | 69 +++------------- 10 files changed, 173 insertions(+), 172 deletions(-) create mode 100644 include/version.h diff --git a/Makefile-cpp 
b/Makefile-cpp index 2b117f0e..58139e9c 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -4,6 +4,13 @@ SRC_DIR := $(CURDIR)/src BUILD_DIR := $(CURDIR)/build LIB_DIR := $(CURDIR)/lib +# Version header +VERSION := $(shell git describe --tags --always) +VERSION_HEADER := $(INCL_DIR)/version.h +.PHONY: $(VERSION_HEADER) + @echo "#pragma once" > $@ + @echo "#define VERSION \"$(VERSION)\"" >> $@ + # Conda environment directories CONDA_PREFIX := $(shell echo $$CONDA_PREFIX) CONDA_INCL_DIR := $(CONDA_PREFIX)/include @@ -13,10 +20,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib CXX := g++ CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries - -# Link htslib LDLIBS := -lhts # Link with libhts.a or libhts.so -# LDLIBS := -lmylib # Link with libraries in LIB_DIR, e.g., libmylib.a or libmylib.so # Sources and Output # SOURCES := $(wildcard $(SRC_DIR)/*.cpp) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 9f16c5f6..be3c1479 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -85,7 +85,7 @@ class CNVCaller { void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); - std::pair, double> runViterbi(CHMM hmm, SNPData &snp_data); + std::pair, double> runViterbi(const CHMM& hmm, SNPData &snp_data); // Query a region for SNPs and return the SNP data std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map &pos_depth_map, double mean_chr_cov); @@ -93,7 +93,7 @@ class CNVCaller { void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb); // Run copy number prediction for a chunk of SV candidates from CIGAR strings - void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov); + void 
runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov); void updateSVCopyNumber(std::map& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood); @@ -116,11 +116,11 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, CHMM hmm); + std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, const CHMM& hmm); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); - void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length, CHMM hmm); + void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length, const CHMM& hmm); // Calculate the mean chromosome coverage double calculateMeanChromosomeCoverage(std::string chr); diff --git a/include/khmm.h b/include/khmm.h index 8f86e7a4..9585635f 100644 --- a/include/khmm.h +++ b/include/khmm.h @@ -13,22 +13,22 @@ // Struct for HMM (C++ RAII style) struct CHMM { - int N; // Number of states - int M; // Number of observation symbols + int N = 0; // Number of states + int M = 0; // Number of observation symbols std::vector> A; // Transition probability matrix std::vector> B; // Emission probability matrix std::vector pi; // Initial state distribution std::vector B1_mean; // Mean of a continuous Gaussian distribution for state 1 through N std::vector B1_sd; // Standard deviation of B1 values, which is the same for all states - double B1_uf; // B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model + double B1_uf = 0.0; // 
B1_uniform_fraction: the contribution of uniform distribution to the finite mixture model std::vector B2_mean; // B2_mean[1..4] is the average of B_allele_freq std::vector B2_sd; // B2_sd[1..4] is the standard deviation of four B_allele_freq, B2_sd[5] is specially for state1, where B is modelled as a wide normal distribution - double B2_uf; // B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model - int NP_flag; + double B2_uf = 0.0; // B2_uniform_fraction: the fraction of uniform distribution in the finite mixture model + int NP_flag = 0; std::vector B3_mean; std::vector B3_sd; - double B3_uf; - int dist; + double B3_uf = 0.0; + int dist = 0; }; diff --git a/include/sv_caller.h b/include/sv_caller.h index ec89aa8a..cdafab0b 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -38,7 +38,7 @@ class SVCaller { // mismatch rate, and the start and end positions of the query sequence std::tuple, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, bool is_primary); - void processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set& combined_sv_calls, int min_cnv_length); + void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set& combined_sv_calls, int min_cnv_length); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. 
@@ -49,7 +49,7 @@ class SVCaller { int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm); + void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query diff --git a/include/version.h b/include/version.h new file mode 100644 index 00000000..d38178a8 --- /dev/null +++ b/include/version.h @@ -0,0 +1,2 @@ +#pragma once +#define VERSION "v0,1,0-41-gd62fe12" diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index feace2e8..dac33183 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -41,7 +41,7 @@ CNVCaller::CNVCaller(InputData &input_data) } // Function to call the Viterbi algorithm for the CHMM -std::pair, double> CNVCaller::runViterbi(CHMM hmm, SNPData& snp_data) +std::pair, double> CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data) { int data_count = (int) snp_data.pos.size(); if (data_count == 0) @@ -144,7 +144,7 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta return std::make_pair(snp_data, snps_found); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, CHMM hmm) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, const CHMM& hmm) { // Get the start and end positions of the SV call uint32_t start_pos = std::get<0>(candidate); @@ -162,6 +162,7 @@ std::tuple CNVCaller::runCopyNumberPrediction bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm + printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + 
std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")..."); std::pair, double> prediction = runViterbi(hmm, sv_snps); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -219,7 +220,7 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length, CHMM hmm) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length, const CHMM& hmm) { int window_size = this->input_data.getWindowSize(); double mean_chr_cov = this->mean_chr_cov; @@ -228,7 +229,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set & // printMessage("Finished predicting copy number states for chromosome " + chr + "..."); } -void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, CHMM hmm, int window_size, double mean_chr_cov) +void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov) { // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "..."); // Map with counts for each CNV type diff --git a/src/input_data.cpp b/src/input_data.cpp index 572ed92a..e152a6cf 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -217,8 +217,6 @@ bool InputData::isRegionSet() void InputData::setAlleleFreqFilepaths(std::string filepath) { - // this->pfb_filepath = filepath; - // Check if empty string if (filepath == "") { diff --git a/src/khmm.cpp b/src/khmm.cpp index bdb6eb8b..d375a1a0 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -459,17 +459,6 @@ CHMM ReadCHMM(const std::string filename) throw std::runtime_error("Error reading A"); } - // Print A - // std::cout << "A: " << std::endl; - // for (int i = 0; i < hmm.N; i++) - // { - // for (int j = 0; j < hmm.N; j++) - // { - // std::cout << std::setprecision(10) 
<< hmm.A[i][j] << " "; - // } - // std::cout << std::endl; - // } - // Read B std::getline(file, line); if (line != "B:") @@ -494,13 +483,6 @@ CHMM ReadCHMM(const std::string filename) throw std::runtime_error("Error reading pi"); } - // Print pi - // std::cout << "pi: "; - // for (int i = 0; i < hmm.N; i++) - // { - // std::cout << std::setprecision(10) << hmm.pi[i] << " "; - // } - // Read B1_mean std::getline(file, line); if (line != "B1_mean:") @@ -513,13 +495,6 @@ CHMM ReadCHMM(const std::string filename) throw std::runtime_error("Error reading B1_mean"); } - // Print B1_mean - // std::cout << "B1_mean: "; - // for (int i = 0; i < hmm.N; i++) - // { - // std::cout << std::setprecision(10) << hmm.B1_mean[i] << " "; - // } - // Read B1_sd std::getline(file, line); if (line != "B1_sd:") @@ -532,13 +507,6 @@ CHMM ReadCHMM(const std::string filename) throw std::runtime_error("Error reading B1_sd"); } - // Print B1_sd - // std::cout << "B1_sd: "; - // for (int i = 0; i < hmm.N; i++) - // { - // std::cout << std::setprecision(10) << hmm.B1_sd[i] << " "; - // } - // Read B1_uf std::getline(file, line); if (line != "B1_uf:") @@ -552,9 +520,6 @@ CHMM ReadCHMM(const std::string filename) throw std::runtime_error("Error reading B1_uf"); } - // Print B1_uf - // std::cout << "B1_uf: " << std::setprecision(10) << hmm.B1_uf << std::endl; - // Read B2_mean std::getline(file, line); if (line != "B2_mean:") @@ -592,9 +557,6 @@ CHMM ReadCHMM(const std::string filename) throw std::runtime_error("Error reading B2_uf"); } - // Print B2_uf - // std::cout << "B2_uf: " << std::setprecision(10) << hmm.B2_uf << std::endl; - return hmm; } diff --git a/src/main.cpp b/src/main.cpp index 558e2493..1d78ad5f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,80 +1,165 @@ #include "swig_interface.h" #include "input_data.h" +#include "version.h" /// @cond DOXYGEN_IGNORE #include #include +// #include /// @endcond // Placeholder for ContextSV library includes // #include "ContextSV.h" -void 
runContextSV(const std::string& bamFile, const std::string& refFile, const std::string& vcfFile, const std::string& outputDir, int threadCount = 1, const std::string& hmmFile = "", int windowSize = 2500, int minCNV = 2500, const std::string& eth = "", const std::string& pfbFile = "") +void runContextSV(const std::unordered_map& args) { // Placeholder for setting up input data and running ContextSV - std::cout << "Running ContextSV with the following files:" << std::endl; - std::cout << "BAM file: " << bamFile << std::endl; - std::cout << "Reference file: " << refFile << std::endl; - std::cout << "VCF file: " << vcfFile << std::endl; - std::cout << "Thread count: " << threadCount << std::endl; - std::cout << "Output directory: " << outputDir << std::endl; + std::cout << "ContextSV version " << VERSION << std::endl; + std::cout << "Input parameters:" << std::endl; + for (const auto& arg : args) { + std::cout << arg.first << ": " << arg.second << std::endl; + } // Set up input data InputData input_data; - input_data.setShortReadBam(bamFile); - input_data.setLongReadBam(bamFile); - input_data.setRefGenome(refFile); - input_data.setSNPFilepath(vcfFile); - //input_data.setChromosome("21"); - //input_data.setRegion("14486099-14515105"); - input_data.setThreadCount(threadCount); - input_data.setAlleleFreqFilepaths(pfbFile); - input_data.setHMMFilepath(hmmFile); - input_data.setOutputDir(outputDir); - input_data.saveCNVData(false); - input_data.setThreadCount(threadCount); - input_data.setWindowSize(windowSize); - input_data.setMinCNVLength(minCNV); + input_data.setLongReadBam(args.at("bam-file")); + input_data.setShortReadBam(args.at("bam-file")); + input_data.setRefGenome(args.at("ref-file")); + input_data.setSNPFilepath(args.at("snps-file")); + input_data.setOutputDir(args.at("output-dir")); + if (args.find("chr") != args.end()) { + input_data.setChromosome(args.at("chr")); + } + if (args.find("region") != args.end()) { + input_data.setRegion(args.at("region")); + } + if 
(args.find("thread-count") != args.end()) { + input_data.setThreadCount(std::stoi(args.at("thread-count"))); + } + if (args.find("hmm-file") != args.end()) { + input_data.setHMMFilepath(args.at("hmm-file")); + } + if (args.find("window-size") != args.end()) { + input_data.setWindowSize(std::stoi(args.at("window-size"))); + } + if (args.find("min-cnv") != args.end()) { + input_data.setMinCNVLength(std::stoi(args.at("min-cnv"))); + } + if (args.find("eth") != args.end()) { + input_data.setEthnicity(args.at("eth")); + } + if (args.find("pfb-file") != args.end()) { + input_data.setAlleleFreqFilepaths(args.at("pfb-file")); + } + if (args.find("save-cnv") != args.end()) { + input_data.saveCNVData(true); + } + if (args.find("debug") != args.end()) { + input_data.setVerbose(true); + } + // input_data.setShortReadBam(bamFile); + // input_data.setLongReadBam(bamFile); + // input_data.setRefGenome(refFile); + // input_data.setSNPFilepath(vcfFile); + // //input_data.setChromosome("21"); + // //input_data.setRegion("14486099-14515105"); + // input_data.setThreadCount(threadCount); + // input_data.setAlleleFreqFilepaths(pfbFile); + // input_data.setHMMFilepath(hmmFile); + // input_data.setOutputDir(outputDir); + // input_data.saveCNVData(false); + // input_data.setThreadCount(threadCount); + // input_data.setWindowSize(windowSize); + // input_data.setMinCNVLength(minCNV); // Run ContextSV run(input_data); } -int main(int argc, char* argv[]) { - if (argc < 6) { - std::cerr << "Usage: " << argv[0] << " " << std::endl; - return 1; - } +void printUsage(const std::string& programName) { + std::cerr << "Usage: " << programName << " [options]\n" + << "Options:\n" + << " -b, --bam Long-read BAM file (required)\n" + << " -r, --ref Reference genome FASTA file (required)\n" + << " -s, --snp SNPs VCF file (required)\n" + << " -o, --outdir Output directory (required)\n" + << " -c, --chr Chromosome\n" + << " -r, --region Region (e.g., 14486099-14515105)\n" + << " -t, --threads Number of 
threads\n" + << " -h, --hmm HMM file\n" + << " -w, --window Window size\n" + << " --min-cnv Minimum CNV length\n" + << " -e, --eth ETH file\n" + << " -p, --pfb PFB file\n" + << " --save-cnv Save CNV data\n" + << " --debug Debug mode\n" + << " --version Print version and exit\n" + << " -h, --help Print usage and exit\n"; +} + +std::unordered_map parseArguments(int argc, char* argv[]) { + std::unordered_map args; + for (int i = 1; i < argc; i++) { + std::string arg = argv[i]; - std::string bamFile = argv[1]; - std::string refFile = argv[2]; - std::string vcfFile = argv[3]; - std::string outputDir = argv[4]; - int threadCount = std::stoi(argv[5]); + // Handle short and long options + if ((arg == "-b" || arg == "--bam") && i + 1 < argc) { + args["bam-file"] = argv[++i]; + } else if ((arg == "-r" || arg == "--ref") && i + 1 < argc) { + args["ref-file"] = argv[++i]; + } else if ((arg == "-s" || arg == "--snp") && i + 1 < argc) { + args["snps-file"] = argv[++i]; + } else if ((arg == "-o" || arg == "--outdir") && i + 1 < argc) { + args["output-dir"] = argv[++i]; + } else if ((arg == "-c" || arg == "--chr") && i + 1 < argc) { + args["chr"] = argv[++i]; + } else if ((arg == "-r" || arg == "--region") && i + 1 < argc) { + args["region"] = argv[++i]; + } else if ((arg == "-t" || arg == "--threads") && i + 1 < argc) { + args["thread-count"] = argv[++i]; + } else if ((arg == "-h" || arg == "--hmm") && i + 1 < argc) { + args["hmm-file"] = argv[++i]; + } else if ((arg == "-w" || arg == "--window") && i + 1 < argc) { + args["window-size"] = argv[++i]; + } else if (arg == "--min-cnv" && i + 1 < argc) { + args["min-cnv"] = argv[++i]; + } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) { + args["eth"] = argv[++i]; + } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) { + args["pfb-file"] = argv[++i]; + } else if (arg == "--save-cnv") { + args["save-cnv"] = "true"; + } else if (arg == "--debug") { + args["debug"] = "true"; + } else if ((arg == "-v" || arg == 
"--version")) { + std::cout << "ContextSV version " << VERSION << std::endl; + exit(0); + } else if (arg == "-h" || arg == "--help") { + printUsage(argv[0]); + exit(0); + } else { + std::cerr << "Unknown option: " << arg << std::endl; + } + } - std::string hmmFile = ""; - int windowSize = 2500; - int minCNV = 2500; - std::string eth = ""; - std::string pfbFile = ""; - if (argc == 11) { - hmmFile = argv[6]; - windowSize = std::stoi(argv[7]); - minCNV = std::stoi(argv[8]); - eth = argv[9]; - pfbFile = argv[10]; + // Check for required arguments + bool hasLR = args.find("bam-file") != args.end(); + bool hasOutput = args.find("output-dir") != args.end(); + bool hasRef = args.find("ref-file") != args.end(); + bool hasSNPs = args.find("snps-file") != args.end(); + bool requiredArgs = hasLR && hasOutput && hasRef && hasSNPs; + if (!requiredArgs) { + std::cerr << "Missing required argument(s): -b/--bam, -r/--ref, -s/--snp, -o/--outdir" << std::endl; + exit(1); } - - runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, hmmFile, windowSize, minCNV, eth, pfbFile); - - //std::string hmmFile = argv[6]; - //int windowSize = std::stoi(argv[7]); - //int minCNV = std::stoi(argv[8]); - //std::string eth = argv[9]; - //std::string pfbFile = argv[10]; - - //runContextSV(bamFile, refFile, vcfFile, outputDir, threadCount, "", 2500, 2500, "", ""); + + return args; +} + +int main(int argc, char* argv[]) { + auto args = parseArguments(argc, argv); + runContextSV(args); return 0; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b38e6fe8..b9038e5c 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -311,7 +311,7 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); } -void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, CHMM hmm, std::set& combined_sv_calls, int min_cnv_length) +void SVCaller::processChromosome(const std::string& chr, const 
std::string& bam_filepath, const CHMM& hmm, std::set& combined_sv_calls, int min_cnv_length) { // Open the BAM file samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); @@ -432,24 +432,15 @@ std::unordered_map> SVCaller::run() chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) { return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos; }), chromosomes.end()); - - /* - // Test only on a subset 241125_ALL/output.merged.vcf - chromosomes = {"chr2", "chr3", "chr5", "chr6", "chr7", "chr4"}; - */ - - - // Test only on a subset 241125_ALL/output.merged.vcf - // chromosomes = {"chrM", "chr8", "chr9", "chr10", "chr11", "chr1"}; // Read the HMM from the file std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; - CHMM hmm = ReadCHMM(hmm_filepath.c_str()); + const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); // Set up threads for processing each chromosome - const int max_threads = 8; - // const int max_threads = 10; + const int max_threads = this->input_data.getThreadCount(); + std::cout << "Using " << max_threads << " threads for processing..." 
<< std::endl; std::vector> futures; std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; @@ -458,7 +449,7 @@ std::unordered_map> SVCaller::run() // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { - printMessage("Launching thread for chromosome " + chr + "..."); + // printMessage("Launching thread for chromosome " + chr + "..."); std::set sv_calls; this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); { @@ -471,6 +462,7 @@ std::unordered_map> SVCaller::run() { std::lock_guard lock(sv_mutex); active_threads--; + printMessage("Active threads: " + std::to_string(active_threads)); } cv.notify_one(); }; @@ -480,8 +472,10 @@ std::unordered_map> SVCaller::run() for (const auto& chr : chromosomes) { { std::unique_lock lock(sv_mutex); + printMessage("Waiting for thread slot. Active threads: " + std::to_string(active_threads)); cv.wait(lock, [&] { return active_threads < max_threads; }); active_threads++; + printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads)); } // Launch a new thread @@ -497,51 +491,6 @@ std::unordered_map> SVCaller::run() printMessage("All threads have finished."); - ///////////////////////////////////////////////// - - // // Set a thread count for processing each chromosome. Keep it low to avoid - // // memory issues. 
- // int batch_count = 0; - // int completed_threads = 0; - // int chr_count = chromosomes.size(); - // for (const auto& chr : chromosomes) { - // printMessage("Launching thread for chromosome " + chr + "..."); - // futures.push_back(std::async(std::launch::async, [&]() { - // std::set sv_calls; - // this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); - // { - // std::lock_guard lock(sv_mutex); - // whole_genome_sv_calls[chr] = std::move(sv_calls); - // } - // } - // )); - // batch_count++; - // if (batch_count >= max_threads || batch_count >= chr_count) { - // // Wait for all threads to finish - // // printMessage("Waiting for all threads to finish for " + std::to_string(batch_count) + " chromosome(s)..."); - // for (auto& future : futures) { - // future.get(); - // completed_threads++; - // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); - // } - // // completed_threads += batch_count; - // // printMessage("Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); - // batch_count = 0; - // futures.clear(); - // } - // } - - // // Wait for remaining threads to finish - // if (futures.size() > 0) { - // // printMessage("Waiting for remaining threads to finish for " + std::to_string(futures.size()) + " chromosome(s)..."); - // for (auto& future : futures) { - // future.get(); - // completed_threads++; - // printMessage("[TEST] Completed " + std::to_string(completed_threads) + " of " + std::to_string(chr_count) + " chromosome(s)"); - // } - // } - // printMessage("All threads have finished."); - // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; for (const auto& entry : whole_genome_sv_calls) { @@ -561,7 +510,7 @@ std::unordered_map> SVCaller::run() // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(std::set& 
sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, CHMM hmm) +void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm) { // Find split-read SV evidence int sv_count = 0; From ea06c1cd08c5a0d8e0a1b7d1cab8006efde24b8f Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 30 Nov 2024 19:31:58 -0500 Subject: [PATCH 036/134] Fix errors and reduce copying --- include/cnv_caller.h | 54 ++----- include/cnv_data.h | 32 ---- include/contextsv.h | 2 - include/input_data.h | 3 +- include/snp_info.h | 51 ------- include/sv_caller.h | 15 +- include/sv_data.h | 52 ------- include/sv_types.h | 19 --- src/cnv_caller.cpp | 338 +++++++++++++++++----------------------- src/cnv_data.cpp | 73 --------- src/contextsv.cpp | 4 +- src/input_data.cpp | 2 +- src/snp_info.cpp | 108 ------------- src/sv_caller.cpp | 151 +++++++++++------- src/sv_data.cpp | 355 ------------------------------------------- src/sv_object.cpp | 139 +++-------------- 16 files changed, 283 insertions(+), 1115 deletions(-) delete mode 100644 include/cnv_data.h delete mode 100644 include/snp_info.h delete mode 100644 include/sv_data.h delete mode 100644 src/cnv_data.cpp delete mode 100644 src/snp_info.cpp delete mode 100644 src/sv_data.cpp diff --git a/include/cnv_caller.h b/include/cnv_caller.h index be3c1479..ad22b449 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -6,8 +6,6 @@ #include "khmm.h" #include "input_data.h" -#include "cnv_data.h" -#include "sv_data.h" #include "sv_types.h" #include "sv_object.h" @@ -19,7 +17,6 @@ #include #include -#include "snp_info.h" /// @endcond using namespace sv_types; @@ -49,18 +46,14 @@ struct SNPData { class CNVCaller { private: InputData& input_data; - mutable std::mutex sv_candidates_mtx; // SV candidate map mutex - mutable std::mutex snp_data_mtx; // SNP data mutex - mutable std::mutex hmm_mtx; // HMM mutex + mutable std::mutex snp_file_mtx; // SNP file 
mutex + mutable std::mutex pfb_file_mtx; // Population frequency file mutex + mutable std::mutex bam_file_mtx; // BAM file mutex + // CHMM hmm; SNPData snp_data; - SNPInfo snp_info; - double mean_chr_cov = 0.0; - std::unordered_map pos_depth_map; // Read depth map - // std::unordered_map snp_baf_map; // SNP B-allele frequency map - // std::set snp_alt_map; // SNP B-allele map - // std::set snp_baf_keys; // SNP positions for BAF values - // std::unordered_map snp_pfb_map; // SNP population frequency map + // double mean_chr_cov = 0.0; + // std::unordered_map pos_depth_map; // Read depth map // Define a map of CNV genotypes by HMM predicted state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. @@ -85,56 +78,35 @@ class CNVCaller { void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); - std::pair, double> runViterbi(const CHMM& hmm, SNPData &snp_data); + std::pair, double> runViterbi(const CHMM& hmm, SNPData& snp_data); // Query a region for SNPs and return the SNP data - std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo &snp_info, std::unordered_map &pos_depth_map, double mean_chr_cov); + std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov); void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb); // Run copy number prediction for a chunk of SV candidates from CIGAR strings - void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov); - - void updateSVCopyNumber(std::map& sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood); - - void updateDPValue(std::map& sv_candidates, SVCandidate key, int dp_value); + void runCIGARCopyNumberPredictionChunk(std::string 
chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector& pos_depth_map); // Split a region into chunks for parallel processing std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count); - // Split SV candidates into chunks for parallel processing - std::vector> splitSVCandidatesIntoChunks(std::map& sv_candidates, int chunk_count); - - // Merge the read depths from a chunk into the main read depth map - void mergePosDepthMaps(std::unordered_map& main_map, std::unordered_map& map_update); - public: explicit CNVCaller(InputData& input_data); - // Load file data for a chromosome (SNP positions, BAF values, and PFB values) - void loadChromosomeData(std::string chr); - // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const SVCandidate& sv_candidate, const CHMM& hmm); + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector& pos_depth_map); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - // SNPData runCIGARCopyNumberPrediction(std::string chr, std::map& sv_candidates, int min_length); - void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length, const CHMM& hmm); + void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map); // Calculate the mean chromosome coverage - double calculateMeanChromosomeCoverage(std::string chr); + std::pair> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len); // Calculate the log2 ratio for a region given the read depths and mean // chromosome coverage - double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, 
std::unordered_map& pos_depth_map, double mean_chr_cov); - - // Read SNP positions and BAF values from the VCF file of SNP calls - // void readSNPAlleleFrequencies(std::string chr, std::string filepath, SNPInfo& snp_info); - - // Read SNP population frequencies from the PFB file and return a vector - // of population frequencies for each SNP location - // void getSNPPopulationFrequencies(std::string chr, SNPInfo& snp_info); + double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov); void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set& snp_pos, std::unordered_map& snp_baf); void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map); diff --git a/include/cnv_data.h b/include/cnv_data.h deleted file mode 100644 index a2ebd403..00000000 --- a/include/cnv_data.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef CNV_DATA_H -#define CNV_DATA_H - -/// @cond -#include -#include -#include -/// @endcond - -// CNV candidate location map -// (chr, snp_pos) : cnv_type - -using SNPLocation = std::pair; -using SNPToCNVMap = std::map; - - -class CNVData { - private: - SNPToCNVMap cnv_calls; // Map of SNP positions to CNV types - - public: - // Add a CNV call to the map - void addCNVCall(std::string chr, int snp_pos, int cnv_type); - - // Get the most common CNV type within the SV region start and end positions - std::tuple getMostCommonCNV(std::string chr, int start, int end); - - // Load CNV calls from file - void loadFromFile(std::string filepath); -}; - -#endif // CNV_DATA_H diff --git a/include/contextsv.h b/include/contextsv.h index 56a82a54..97d7bce9 100644 --- a/include/contextsv.h +++ b/include/contextsv.h @@ -7,8 +7,6 @@ #define CONTEXTSV_H #include "input_data.h" -#include "cnv_data.h" -#include "sv_data.h" class ContextSV { diff --git a/include/input_data.h b/include/input_data.h index 718b5264..7d577784 100644 --- 
a/include/input_data.h +++ b/include/input_data.h @@ -40,7 +40,6 @@ class InputData { // Return a reference to the ReferenceGenome object. const ReferenceGenome& getRefGenome() const; - // FASTAQuery getRefGenome(); // Query the reference genome for a sequence. std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; @@ -49,7 +48,7 @@ class InputData { std::vector getRefGenomeChromosomes(); // Get a chromosome's length in the reference genome. - int64_t getRefGenomeChromosomeLength(std::string chr); + uint32_t getRefGenomeChromosomeLength(std::string chr); // Set the filepath to the text file containing the locations of the // VCF files with population frequencies for each chromosome. diff --git a/include/snp_info.h b/include/snp_info.h deleted file mode 100644 index 51278951..00000000 --- a/include/snp_info.h +++ /dev/null @@ -1,51 +0,0 @@ -#ifndef SNP_INFO_H -#define SNP_INFO_H - -#include -#include -#include -#include -#include -#include - -// Define the comparator for the binary search tree by SNP position (first -// element of tuple) -struct SNPCompare { - bool operator()(const std::tuple& a, const std::tuple& b) const { - return std::get<0>(a) < std::get<0>(b); - } -}; - -// Define the data structure for SNP frequencies sorted by position -using BST = std::set, SNPCompare>; - -class SNPInfo { -public: - SNPInfo() {} - - // Insert a SNP into the map with its position and B-allele frequency - void insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf); - - // Insert a SNP into the map with its position and population frequency of - // the B allele - void insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb); - - // Query SNPs within a range (start, end) and return their BAF and PFB values - std::tuple, std::vector, std::vector> querySNPs(std::string chr, uint32_t start, uint32_t end); - - // Get the range of SNP positions for a given chromosome - std::pair 
getSNPRange(std::string chr); - - -private: - // Mutex for reading SNP information - std::mutex snp_info_mtx; - - // Define the map of chromosome to SNP B-allele frequency - std::unordered_map snp_baf_map; - - // Define the map of chromosome to SNP population frequency - std::unordered_map> snp_pfb_map; -}; - -#endif // SNP_INFO_H diff --git a/include/sv_caller.h b/include/sv_caller.h index cdafab0b..f3f78af9 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -5,8 +5,6 @@ #include "cnv_caller.h" #include "input_data.h" -#include "cnv_data.h" -#include "sv_data.h" #include "sv_object.h" #include "fasta_query.h" @@ -20,7 +18,7 @@ // SV candidate alignment data (chr, start, end, sequence, query start, query // end, mismatch map, strand) -using AlignmentData = std::tuple, bool>; +using AlignmentData = std::tuple, bool>; using AlignmentVector = std::vector; // Query map (query name, alignment vector) @@ -36,25 +34,26 @@ class SVCaller { // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - std::tuple, int32_t, int32_t> detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, bool is_primary); + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, std::tuple, int32_t, int32_t>& query_info, bool is_primary); void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set& combined_sv_calls, int min_cnv_length); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. 
// RegionData detectSVsFromRegion(std::string region); - std::tuple, PrimaryMap, SuppMap> detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region); + void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm); + void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence - double calculateMismatchRate(std::unordered_map& mismatch_map, int32_t start, int32_t end); + // double calculateMismatchRate(std::unordered_map& mismatch_map, int32_t start, int32_t end); + double calculateMismatchRate(const std::vector& mismatch_map, int32_t start, int32_t end); void saveToVCF(const std::unordered_map>& sv_calls); @@ -64,7 +63,7 @@ class SVCaller { explicit SVCaller(InputData& input_data); // Detect SVs and predict SV type from long read alignments and CNV calls - std::unordered_map> run(); + void run(); }; #endif // SV_CALLER_H diff --git a/include/sv_data.h b/include/sv_data.h deleted file mode 100644 index fef815ed..00000000 --- a/include/sv_data.h +++ /dev/null @@ -1,52 +0,0 @@ -#ifndef SV_DATA_H -#define SV_DATA_H - - -/// @cond -#include -#include -#include -#include - -#include "sv_types.h" -#include "fasta_query.h" -/// @endcond - -// Include the SV types namespace -using namespace sv_types; - -// SV data class -class SVData { - private: - 
SVDepthMap sv_calls; - - // Map of clipped base support by position (chr, pos) : depth - std::map, int> clipped_base_support; - - public: - SVData() {}; - - int add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); - - void concatenate(const SVData& sv_data); - - // Update clipped base support for a given breakpoint location - void updateClippedBaseSupport(std::string chr, int64_t pos); - - int getClippedBaseSupport(std::string chr, int64_t pos, int64_t end); - - void saveToVCF(ReferenceGenome& ref_genome, std::string output_dir); - - std::map& getChromosomeSVs(std::string chr); - - std::set getChromosomes(); - - // Begin and end iterators for the SV candidate map - SVDepthMap::iterator begin() { return this->sv_calls.begin(); } - SVDepthMap::iterator end() { return this->sv_calls.end(); } - - // Get the total number of calls (For summary purposes) - int totalCalls(); -}; - -#endif // SV_DATA_H diff --git a/include/sv_types.h b/include/sv_types.h index f58e6f7b..60471a01 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -57,25 +57,6 @@ namespace sv_types { inline SVType getSVTypeFromCNState(int cn_state) { return CNVTypeMap.at(cn_state); } - - // Create a struct for storing SV information - struct SVInfo { - SVType sv_type; - int read_support; // Number of reads supporting the SV breakpoints - int read_depth; // Read depth at the SV start position - std::set data_type; // Alignment type used to call the SV - int sv_length; - std::string genotype = "./."; // Default genotype (no call) - double hmm_likelihood = 0.0; // HMM likelihood score for the state sequence - - SVInfo() = default; - SVInfo(SVType sv_type, int read_support, int read_depth, std::string data_type, int sv_length, std::string genotype, double hmm_likelihood) : - sv_type(sv_type), read_support(read_support), read_depth(read_depth), data_type({data_type}), sv_length(sv_length), 
genotype(genotype), hmm_likelihood(hmm_likelihood) {} - }; - - // Type definition for SV-related structures - using SVCandidate = std::tuple; // SV (start, end, alt_allele) - using SVDepthMap = std::unordered_map>; // Chromosome -> SV candidate -> SV info } #endif // SV_TYPES_H diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index dac33183..5f1c2e1d 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -24,9 +24,9 @@ #include #include #include // std::max +#include // std::pair #include "utils.h" -#include "sv_data.h" #include "sv_types.h" #define MIN_PFB 0.01 @@ -48,13 +48,12 @@ std::pair, double> CNVCaller::runViterbi(const CHMM& hmm, SNPDa { throw std::runtime_error("Error: No SNP data found for Viterbi algorithm."); } - // std::lock_guard lock(this->hmm_mtx); // Lock the mutex for the HMM std::pair, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb); return state_sequence; } // Function to obtain SNP information for a region -std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, SNPInfo& snp_info, std::unordered_map& pos_depth_map, double mean_chr_cov) +std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov) { SNPData snp_data; bool snps_found = false; @@ -65,10 +64,6 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta std::unordered_map snp_baf; std::unordered_map snp_pfb; this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb); - // std::pair, std::vector, std::vector> snp_query = this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb); - // std::vector& snp_pos = std::get<0>(snp_query); - // std::vector& snp_pfb = std::get<1>(snp_query); - // std::vector& snp_baf = std::get<2>(snp_query); // Loop through the range of the SV region and query the SNPs in a sliding // window, then calculate the log2 ratio for each window @@ -98,7 +93,6 @@ 
std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta // after the SNP, and continue until the end of the window // (If there are no SNPs in the window, then use the default BAF and // PFB values, and the coverage log2 ratio) - // If no SNPs, then calculate the log2 ratio for the window if (snp_window_pos.size() == 0) { @@ -111,8 +105,6 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta snps_found = true; // Loop through the SNPs and calculate the log2 ratios - // uint32_t bin_start = window_start; - // uint32_t bin_end = 0; for (int j = 0; j < (int) snp_window_pos.size(); j++) { // Just use a window centered at the SNP position @@ -144,12 +136,8 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta return std::make_pair(snp_data, snps_found); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const SVCandidate& candidate, const CHMM& hmm) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector& pos_depth_map) { - // Get the start and end positions of the SV call - uint32_t start_pos = std::get<0>(candidate); - uint32_t end_pos = std::get<1>(candidate); - // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 // the SV length uint32_t sv_half_length = (end_pos - start_pos) / 2.0; @@ -157,12 +145,12 @@ std::tuple CNVCaller::runCopyNumberPrediction uint32_t snp_end_pos = end_pos + sv_half_length; // Query the SNP region for the SV candidate - std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, this->snp_info, this->pos_depth_map, this->mean_chr_cov); + std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); SNPData& sv_snps = snp_call.first; bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm - printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + 
std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")..."); + // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")..."); std::pair, double> prediction = runViterbi(hmm, sv_snps); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -220,16 +208,16 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length, const CHMM& hmm) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map) { int window_size = this->input_data.getWindowSize(); - double mean_chr_cov = this->mean_chr_cov; + // double mean_chr_cov = this->mean_chr_cov; // printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); - runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov); + runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov, pos_depth_map); // printMessage("Finished predicting copy number states for chromosome " + chr + "..."); } -void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov) +void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector& pos_depth_map) { // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "..."); // Map with counts for each CNV type @@ -270,7 +258,7 @@ void 
CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set sv_half_length ? start_pos - sv_half_length : 1; uint32_t snp_end_pos = end_pos + sv_half_length; - std::pair snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, snp_info, this->pos_depth_map, mean_chr_cov); + std::pair snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); SNPData& sv_snps = snp_call.first; bool snps_found = snp_call.second; @@ -357,39 +345,6 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set &sv_candidates, SVCandidate key, SVType sv_type_update, std::string data_type, std::string genotype, double hmm_likelihood) -{ - // Update SV data from the HMM copy number prediction - // Lock the SV candidate map - std::lock_guard lock(this->sv_candidates_mtx); - - // Update the SV type if the update is not unknown, and if the types don't - // conflict (To avoid overwriting previous calls) - SVType current_sv_type = sv_candidates[key].sv_type; - if ((sv_type_update != SVType::UNKNOWN) && ((current_sv_type == sv_type_update) || (current_sv_type == SVType::UNKNOWN))) - { - sv_candidates[key].sv_type = sv_type_update; // Update the SV type - sv_candidates[key].data_type.insert(data_type); // Update the data type - - // Update the likelihood if it is greater than the existing likelihood, - // or if it is currently unknown (0.0) - double previous_likelihood = sv_candidates[key].hmm_likelihood; - if (previous_likelihood == 0.0 || hmm_likelihood > previous_likelihood) - { - sv_candidates[key].hmm_likelihood = hmm_likelihood; - } - - // Update the genotype - sv_candidates[key].genotype = genotype; - } -} - -void CNVCaller::updateDPValue(std::map& sv_candidates, SVCandidate key, int dp_value) -{ - std::lock_guard lock(this->sv_candidates_mtx); - sv_candidates[key].read_depth = dp_value; -} - std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) { // Split 
the region into chunks @@ -415,187 +370,168 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 return region_chunks; } -std::vector> CNVCaller::splitSVCandidatesIntoChunks(std::map& sv_candidates, int chunk_count) -{ - // Split the SV candidates into chunks - std::vector> sv_chunks; - int sv_count = (int) sv_candidates.size(); - int chunk_size = std::ceil((double) sv_count / (double) chunk_count); - int current_chunk = 0; - std::vector current_sv_chunk; - for (auto const& sv_call : sv_candidates) - { - current_sv_chunk.push_back(sv_call.first); - - // If the current chunk size is reached, then add the chunk to the - // vector and reset the current chunk - if ((int) current_sv_chunk.size() == chunk_size) - { - sv_chunks.push_back(current_sv_chunk); - current_sv_chunk.clear(); - current_chunk++; - } - } - - // Add the remaining SV candidates to the last chunk - if (current_sv_chunk.size() > 0) - { - sv_chunks.push_back(current_sv_chunk); - } - - return sv_chunks; -} - -void CNVCaller::loadChromosomeData(std::string chr) -{ - printMessage("Calculating mean chromosome coverage for " + chr + "..."); - this->mean_chr_cov = calculateMeanChromosomeCoverage(chr); - printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); -} +// std::pair> CNVCaller::loadChromosomeData(std::string chr) +// { +// printMessage("Calculating mean chromosome coverage for " + chr + "..."); +// // this->mean_chr_cov = calculateMeanChromosomeCoverage(chr); +// std::pair> depth_data = calculateMeanChromosomeCoverage(chr); +// printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); +// } // Calculate the mean chromosome coverage -double CNVCaller::calculateMeanChromosomeCoverage(std::string chr) +std::pair> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len) { - // Open the BAM file - std::string bam_filepath = this->input_data.getShortReadBam(); - samFile *bam_file = 
sam_open(bam_filepath.c_str(), "r"); - if (!bam_file) + // std::unordered_map chr_pos_depth_map; + std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index { - throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath); - } + // Lock the bam file + std::lock_guard lock(this->bam_file_mtx); - // Enable multi-threading - // hts_set_threads(bam_file, this->input_data.getThreadCount()); + // Open the BAM file + std::string bam_filepath = this->input_data.getShortReadBam(); + samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); + if (!bam_file) + { + throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath); + } - // Read the header - bam_hdr_t *bam_header = sam_hdr_read(bam_file); - if (!bam_header) - { - sam_close(bam_file); - throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath); - } + // Enable multi-threading + // hts_set_threads(bam_file, this->input_data.getThreadCount()); - // Load the index - hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str()); - if (!bam_index) - { - bam_hdr_destroy(bam_header); - sam_close(bam_file); - throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath); - } + // Read the header + bam_hdr_t *bam_header = sam_hdr_read(bam_file); + if (!bam_header) + { + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath); + } - // Create an iterator for the chromosome - hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str()); - if (!bam_iter) - { - hts_idx_destroy(bam_index); - bam_hdr_destroy(bam_header); - sam_close(bam_file); - throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr); - } + // Load the index + hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str()); + if (!bam_index) + { + bam_hdr_destroy(bam_header); + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not load index for BAM file: " + 
bam_filepath); + } - // Initialize the record - bam1_t *bam_record = bam_init1(); - if (!bam_record) - { - hts_itr_destroy(bam_iter); - hts_idx_destroy(bam_index); - bam_hdr_destroy(bam_header); - sam_close(bam_file); - throw std::runtime_error("ERROR: Could not initialize BAM record."); - } + // Create an iterator for the chromosome + hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str()); + if (!bam_iter) + { + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file."); + } - // Iterate through the chromosome and update the depth map - std::unordered_map chr_pos_depth_map; - while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) - { - // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads - if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP) + // Initialize the record + bam1_t *bam_record = bam_init1(); + if (!bam_record) { - continue; + hts_itr_destroy(bam_iter); + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + throw std::runtime_error("ERROR: Could not initialize BAM record."); } - - // Parse the CIGAR string to get the depth (match, sequence match, and - // mismatch) - // uint32_t depth = 0; - uint32_t pos = bam_record->core.pos + 1; // 0-based to 1-based - uint32_t ref_pos = pos; - uint32_t cigar_len = bam_record->core.n_cigar; - uint32_t *cigar = bam_get_cigar(bam_record); - for (uint32_t i = 0; i < cigar_len; i++) + + // Iterate through the chromosome and update the depth map + // std::unordered_map chr_pos_depth_map; + while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) { - uint32_t op = bam_cigar_op(cigar[i]); - uint32_t op_len = bam_cigar_oplen(cigar[i]); - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) + // Ignore 
UNMAP, SECONDARY, QCFAIL, and DUP reads + if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP) { - // Update the depth for each position in the alignment - for (uint32_t j = 0; j < op_len; j++) - { - chr_pos_depth_map[ref_pos + j]++; - } + continue; } - // Update the reference coordinate based on the CIGAR operation - // https://samtools.github.io/hts-specs/SAMv1.pdf - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { - ref_pos += op_len; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { - // Do nothing - } else { - throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); + // Parse the CIGAR string to get the depth (match, sequence match, and + // mismatch) + uint32_t pos = bam_record->core.pos + 1; // 0-based to 1-based + uint32_t ref_pos = pos; + uint32_t cigar_len = bam_record->core.n_cigar; + uint32_t *cigar = bam_get_cigar(bam_record); + for (uint32_t i = 0; i < cigar_len; i++) + { + uint32_t op = bam_cigar_op(cigar[i]); + uint32_t op_len = bam_cigar_oplen(cigar[i]); + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) + { + // Update the depth for each position in the alignment + for (uint32_t j = 0; j < op_len; j++) + { + try { + chr_pos_depth_map[ref_pos + j]++; + } catch (const std::out_of_range& oor) { + std::cerr << "Out of range error for " << chr << ":" << ref_pos+j << std::endl; + } + // chr_pos_depth_map[ref_pos + j]++; + } + } + + // Update the reference coordinate based on the CIGAR operation + // https://samtools.github.io/hts-specs/SAMv1.pdf + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + ref_pos += op_len; + } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { + // Do nothing + } else { + throw 
std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); + } } } - } - // Clean up - bam_destroy1(bam_record); - hts_itr_destroy(bam_iter); - hts_idx_destroy(bam_index); - bam_hdr_destroy(bam_header); - sam_close(bam_file); + // Clean up + bam_destroy1(bam_record); + hts_itr_destroy(bam_iter); + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + } // Calculate the mean chromosome coverage for positions with non-zero depth uint64_t cum_depth = 0; uint32_t pos_count = 0; - for (auto& pos_depth : chr_pos_depth_map) + for (const auto& pos_depth : chr_pos_depth_map) { - cum_depth += pos_depth.second; - pos_count++; + if (pos_depth > 0) + { + cum_depth += pos_depth; + pos_count++; + } } - double mean_chr_cov = (double) cum_depth / (double) pos_count; - - // Update the position depth map - this->pos_depth_map = std::move(chr_pos_depth_map); - - return mean_chr_cov; -} - -void CNVCaller::mergePosDepthMaps(std::unordered_map& main_map, std::unordered_map& map_update) -{ - // Merge the second depth map into the first - for (auto& pos_depth : map_update) + double mean_chr_cov = 0.0; + if (pos_count > 0) { - main_map[pos_depth.first] = pos_depth.second; + mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); } + + return std::make_pair(mean_chr_cov, chr_pos_depth_map); } -double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::unordered_map &pos_depth_map, double mean_chr_cov) +double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov) { // Use the position and depth map to calculate the log2 ratio double cum_depth = 0; int pos_count = 0; for (uint32_t i = start_pos; i <= end_pos; i++) { - // Check if the position is in the map - auto it = pos_depth_map.find(i); - if (it == pos_depth_map.end()) + if (i < pos_depth_map.size() && pos_depth_map[i] > 0) { - continue; + cum_depth += pos_depth_map[i]; + pos_count++; } - int depth = 
pos_depth_map[i]; - pos_count++; - cum_depth += depth; + // // Check if the position is in the map + // auto it = pos_depth_map.find(i); + // if (it == pos_depth_map.end()) + // { + // continue; + // } + // int depth = pos_depth_map[i]; + // pos_count++; + // cum_depth += depth; } // Calculate the window coverage log2 ratio (0 if no positions) @@ -632,6 +568,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui throw std::runtime_error("ERROR: Could not initialize SNP reader."); } + // Lock during reading + std::lock_guard lock(this->snp_file_mtx); + // Set the region std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) @@ -812,6 +751,9 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath); } + // Lock during reading + std::lock_guard lock(this->pfb_file_mtx); + // Set the region for the synced reader std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0) diff --git a/src/cnv_data.cpp b/src/cnv_data.cpp deleted file mode 100644 index 0c4593c0..00000000 --- a/src/cnv_data.cpp +++ /dev/null @@ -1,73 +0,0 @@ -#include "cnv_data.h" - -/// @cond -#include -#include -#include -#include -#include -#include - -#include "sv_types.h" -/// @endcond - -// Include the SV types namespace -using namespace sv_types; - -void CNVData::addCNVCall(std::string chr, int snp_pos, int cnv_type) -{ - // Add the CNV call to the map - SNPLocation key(chr, snp_pos); - this->cnv_calls[key] = cnv_type; -} - -void CNVData::loadFromFile(std::string filepath) -{ - // Load CNV calls from file - std::ifstream cnv_file(filepath); - std::string line; - std::string chr; - int snp_pos; - int cnv_type; - - // Check 
if the file was opened successfully - if (!cnv_file.is_open()) { - std::cerr << "Error: Could not open CNV file " << filepath << std::endl; - exit(1); - } - - // Skip the first line (header) - std::getline(cnv_file, line); - - // Read the file line by line - int line_num = 1; - while (std::getline(cnv_file, line)) { - - // Parse the line - std::istringstream iss(line); - - // Get columns 1, 2, and 5 (chr, pos, cnv_type) - std::string chr; - std::getline(iss, chr, '\t'); - - std::string pos_str; - std::getline(iss, pos_str, '\t'); - snp_pos = std::stoi(pos_str); - - std::string skip_str; - std::getline(iss, skip_str, '\t'); - std::getline(iss, skip_str, '\t'); - - std::string cnv_type_str; - std::getline(iss, cnv_type_str, '\t'); - cnv_type = std::stoi(cnv_type_str); - - // Add the CNV call to the map - this->addCNVCall(chr, snp_pos, cnv_type); - - line_num++; - } - cnv_file.close(); - - std::cout << "Loaded " << line_num << " CNV calls" << std::endl; -} diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 1e22b650..5a9e7ffd 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -23,7 +23,9 @@ int ContextSV::run() SVCaller sv_caller(this->input_data); // Create an SV caller object // SVCaller sv_caller(*this->input_data); // Create an SV caller object // SVData sv_calls = sv_caller.run(); // Run the SV caller - std::unordered_map> sv_calls = sv_caller.run(); // Run the SV caller + // std::unordered_map> sv_calls = + // sv_caller.run(); // Run the SV caller + sv_caller.run(); // Run the SV caller // std::string output_dir = this->input_data->getOutputDir(); // Get the output directory // std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." 
<< std::endl; diff --git a/src/input_data.cpp b/src/input_data.cpp index e152a6cf..74dd9788 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -105,7 +105,7 @@ std::vector InputData::getRefGenomeChromosomes() return this->fasta_query.getChromosomes(); } -int64_t InputData::getRefGenomeChromosomeLength(std::string chr) +uint32_t InputData::getRefGenomeChromosomeLength(std::string chr) { return this->fasta_query.getChromosomeLength(chr); } diff --git a/src/snp_info.cpp b/src/snp_info.cpp deleted file mode 100644 index 1dc7b4a7..00000000 --- a/src/snp_info.cpp +++ /dev/null @@ -1,108 +0,0 @@ -#include "snp_info.h" -#include "utils.h" - -/// @cond -#include -#include -#include -#include -#include -/// @endcond - -#define MIN_PFB 0.01 - - -void SNPInfo::insertSNPAlleleFrequency(const std::string& chr, uint32_t pos, double baf) -{ - // chr = removeChrPrefix(chr); - - // Add the chromosome to the SNP B-allele frequency map if it does not exist - // if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) { - // this->snp_baf_map[chr] = BST(); - // } - - // Insert the SNP into the map with its position and B-allele frequency - // using a binary search tree to keep the SNP positions sorted - this->snp_baf_map[chr].insert({pos, baf}); -} - -void SNPInfo::insertSNPPopulationFrequency(const std::string& chr, uint32_t pos, double pfb) -{ - // chr = removeChrPrefix(chr); - - // Add the chromosome to the SNP population frequency map if it does not - // exist - // if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) { - // this->snp_pfb_map[chr] = std::unordered_map(); - // } - - // Insert the SNP into the map with its position and population frequency of - // the B allele - this->snp_pfb_map[chr][pos] = pfb; -} - -std::tuple, std::vector, std::vector> SNPInfo::querySNPs(std::string chr, uint32_t start, uint32_t end) -{ - // Lock the mutex for reading SNP information - // std::lock_guard lock(this->snp_info_mtx); - - chr = removeChrPrefix(chr); - - // Create an 
ordered map of SNP positions to BAF and PFB values - std::map> snp_map; - - // Query SNPs within a range (start, end) and return their BAF and PFB - // values as separate vectors - std::vector bafs; - std::vector pfbs; - std::vector pos; - - // Check if the chromosome exists in the B-allele frequency map - if (this->snp_baf_map.find(chr) == this->snp_baf_map.end()) { - return std::make_tuple(pos, bafs, pfbs); - } - - // Query the SNPs within the range and return their BAFs and corresponding - // positions - auto& baf_bst = this->snp_baf_map[chr]; - auto baf_start = baf_bst.lower_bound({start, 0.0}); - auto baf_end = baf_bst.upper_bound({end, 0.0}); - for (auto it = baf_start; it != baf_end; it++) { - bafs.push_back(std::get<1>(*it)); - pos.push_back(std::get<0>(*it)); - } - - // Define a default PFB value (0.5) for SNPs with no population frequency data - pfbs = std::vector(bafs.size(), 0.5); - - // Check if the chromosome exists in the population frequency map - if (this->snp_pfb_map.find(chr) == this->snp_pfb_map.end()) { - return std::make_tuple(pos, bafs, pfbs); - } - - // Query the PFBs for all SNP positions with PFB data - auto& pfb_map = this->snp_pfb_map[chr]; - for (size_t i = 0; i < pos.size(); i++) { - uint32_t snp_pos = pos[i]; - if (pfb_map.find(snp_pos) != pfb_map.end()) { - pfbs[i] = pfb_map[snp_pos]; - } - } - - return std::make_tuple(pos, bafs, pfbs); -} - -std::pair SNPInfo::getSNPRange(std::string chr) -{ - chr = removeChrPrefix(chr); - - // Get the range of SNP positions for a given chromosome - uint32_t start = 0; - uint32_t end = 0; - if (this->snp_baf_map.find(chr) != this->snp_baf_map.end()) { - auto& baf_bst = this->snp_baf_map[chr]; - start = std::get<0>(*baf_bst.begin()); - end = std::get<0>(*baf_bst.rbegin()); - } - return std::make_pair(start, end); -} diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b9038e5c..8918a131 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -38,7 +38,10 @@ int 
SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) } // RegionData SVCaller::detectSVsFromRegion(std::string region) -std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region) +// std::tuple, PrimaryMap, SuppMap> +// SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, +// const std::string& region) +void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -58,10 +61,10 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi } // Main loop to process the alignments - std::set sv_calls; + // std::set sv_calls; int num_alignments = 0; - PrimaryMap primary_alignments; - SuppMap supplementary_alignments; + // PrimaryMap primary_alignments; + // SuppMap supplementary_alignments; while (readNextAlignment(fp_in, itr, bam1) >= 0) { // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality @@ -80,13 +83,15 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Call SVs directly from the CIGAR string - std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); - std::unordered_map match_map = std::get<0>(query_info); + std::tuple, int32_t, int32_t> query_info; + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true); + // std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); + const std::vector& match_map = std::get<0>(query_info); int32_t query_start = std::get<1>(query_info); int32_t query_end = std::get<2>(query_info); // Add the primary alignment to the map - AlignmentData alignment(chr, start, end, ".", query_start, query_end, 
match_map, fwd_strand); + AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); primary_alignments[qname] = alignment; // Process supplementary alignments @@ -99,13 +104,16 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Get CIGAR string information, but don't call SVs - std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); - const std::unordered_map& match_map = std::get<0>(query_info); + // std::tuple, int32_t, int32_t> query_info = + // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); + std::tuple, int32_t, int32_t> query_info; + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false); + const std::vector& match_map = std::get<0>(query_info); int32_t query_start = std::get<1>(query_info); int32_t query_end = std::get<2>(query_info); // Add the supplementary alignment to the map - AlignmentData alignment(chr, start, end, ".", query_start, query_end, match_map, fwd_strand); + AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); supplementary_alignments[qname].emplace_back(alignment); } @@ -116,35 +124,57 @@ std::tuple, PrimaryMap, SuppMap> SVCaller::detectCIGARSVs(samFi hts_itr_destroy(itr); bam_destroy1(bam1); - return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); + // return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); } -double SVCaller::calculateMismatchRate(std::unordered_map &match_map, int32_t start, int32_t end) +double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, int32_t start, int32_t end) { + start = std::max(start, 0); + end = std::min(end, (int32_t)mismatch_map.size() - 1); int match_count = 0; int mismatch_count = 0; + int MATCH = 1; + int MISMATCH = -1; for (int i = start; i <= end; i++) { - if (match_map.find(i) != match_map.end()) { - if (match_map[i] == 1) { - 
match_count++; - } else { - mismatch_count++; - } + if (mismatch_map[i] == MATCH) { + match_count++; + } else if (mismatch_map[i] == MISMATCH) { + mismatch_count++; } } - double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); + + // Avoid division by zero + if (match_count + mismatch_count == 0) { + return 0.0; + } + + double mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); + // int match_count = 0; + // int mismatch_count = 0; + // for (int i = start; i <= end; i++) { + // if (match_map.find(i) != match_map.end()) { + // if (match_map[i] == 1) { + // match_count++; + // } else { + // mismatch_count++; + // } + // } + // } + // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); return mismatch_rate; } -std::tuple, int32_t, int32_t> SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, bool is_primary) +void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, std::tuple, int32_t, int32_t>& query_info, bool is_primary) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name int32_t pos = alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; int query_pos = 0; - std::unordered_map query_match_map; // Query position to match/mismatch (1/0) map + // std::unordered_map query_match_map; // Query position to + // match/mismatch (1/0) map + std::vector query_match_map(alignment->core.l_qseq, 0); // Query position to match/mismatch (1/0) map // Loop through the CIGAR string, process operations, detect SVs (primary // only), update clipped base support, calculate sequence identity for @@ -250,13 +280,15 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr } // Update match/mismatch query map + int MATCH = 1; + int MISMATCH = -1; if (op == 
BAM_CEQUAL) { for (int j = 0; j < op_len; j++) { - query_match_map[query_pos + j] = 1; + query_match_map[query_pos + j] = MATCH; } } else if (op == BAM_CDIFF) { for (int j = 0; j < op_len; j++) { - query_match_map[query_pos + j] = 0; + query_match_map[query_pos + j] = MISMATCH; } } else if (op == BAM_CMATCH) { // Get the read sequence @@ -279,9 +311,9 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr // Compare the two sequences and update the mismatch map for (int j = 0; j < op_len; j++) { if (cmatch_seq_str[j] != cmatch_ref_str[j]) { - query_match_map[query_pos + j] = 0; + query_match_map[query_pos + j] = MISMATCH; } else { - query_match_map[query_pos + j] = 1; + query_match_map[query_pos + j] = MATCH; } } } @@ -308,7 +340,9 @@ std::tuple, int32_t, int32_t> SVCaller::detectSVsFr query_end = query_pos; // Last alignment position in the query - return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); + query_info = std::tuple, int32_t, int32_t>(std::move(query_match_map), query_start, query_end); + + // return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); } void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set& combined_sv_calls, int min_cnv_length) @@ -337,6 +371,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Split the chromosome into chunks for memory efficiency std::vector region_chunks; int chunk_count = 100; + uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr); if (this->input_data.isRegionSet()) { // Use one chunk for the specified region @@ -348,7 +383,6 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // std::cout << "Using specified region " << chunk << "..." 
<< std::endl; } else { - int chr_len = this->input_data.getRefGenomeChromosomeLength(chr); int chunk_size = std::ceil((double)chr_len / chunk_count); for (int i = 0; i < chunk_count; i++) { int start = i * chunk_size + 1; // 1-based @@ -366,7 +400,8 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // std::cout << "Loading chromosome data for copy number predictions..." << std::endl; printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); - cnv_caller.loadChromosomeData(chr); + // cnv_caller.loadChromosomeData(chr); + std::pair> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len); // Process each chunk one at a time // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; @@ -376,10 +411,13 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ for (const auto& sub_region : region_chunks) { current_region++; printMessage(chr + ": CIGAR SVs..."); - std::tuple, PrimaryMap, SuppMap> region_data = this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region); - std::set& subregion_sv_calls = std::get<0>(region_data); - PrimaryMap& primary_map = std::get<1>(region_data); - SuppMap& supp_map = std::get<2>(region_data); + PrimaryMap primary_map; + SuppMap supp_map; + std::set subregion_sv_calls; + this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map); + // std::set& subregion_sv_calls = std::get<0>(region_data); + // PrimaryMap& primary_map = std::get<1>(region_data); + // SuppMap& supp_map = std::get<2>(region_data); // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." 
<< std::endl; printMessage(chr + ": Merging CIGAR..."); mergeSVs(subregion_sv_calls); @@ -391,13 +429,13 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ if (region_sv_count > 0) { // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm); + cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm, chr_data.first, chr_data.second); } // Run split-read SV and copy number variant predictions // std::cout << "Detecting copy number variants from split reads..." << std::endl; printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm); + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; @@ -418,7 +456,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ sam_close(fp_in); } -std::unordered_map> SVCaller::run() +void SVCaller::run() { // Get the chromosomes to process std::vector chromosomes; @@ -505,12 +543,12 @@ std::unordered_map> SVCaller::run() std::cout << "Saving SVs to VCF..." 
<< std::endl; this->saveToVCF(whole_genome_sv_calls); - return whole_genome_sv_calls; + // return whole_genome_sv_calls; } // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm) +void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map) { // Find split-read SV evidence int sv_count = 0; @@ -521,7 +559,6 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p std::string primary_chr = std::get<0>(primary_alignment); int32_t primary_start = std::get<1>(primary_alignment); int32_t primary_end = std::get<2>(primary_alignment); - std::unordered_map primary_match_map = std::get<6>(primary_alignment); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { @@ -545,11 +582,11 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p } // Inversion detection - bool is_opposite_strand = std::get<7>(primary_alignment) != std::get<7>(*it); + bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it); if (is_opposite_strand) { if (supp_length >= min_cnv_length) { - SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, sv_candidate, hmm); + // SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); if (supp_type == SVType::NEUTRAL) { @@ -579,8 +616,6 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p supp_end = std::get<2>(largest_supp_alignment); primary_start = std::get<1>(primary_alignment); 
primary_end = std::get<2>(primary_alignment); - SVCandidate split_boundary; - SVCandidate split_gap; bool gap_exists = false; int32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_before_supp) { @@ -599,15 +634,15 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p // Run copy number variant predictions on the boundary if large enough if (boundary_right - boundary_left >= min_cnv_length) { - split_boundary = SVCandidate(boundary_left, boundary_right, "."); - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_boundary, hmm); + // split_boundary = SVCandidate(boundary_left, boundary_right, "."); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); double bd_lh = std::get<0>(bd_result); SVType bd_type = std::get<1>(bd_result); // Run copy number variant predictions on the gap if it exists if (gap_exists && gap_right - gap_left >= min_cnv_length) { - split_gap = SVCandidate(gap_left, gap_right, "."); - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, split_gap, hmm); + // split_gap = SVCandidate(gap_left, gap_right, "."); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); double gap_lh = std::get<0>(gap_result); SVType gap_type = std::get<1>(gap_result); @@ -840,16 +875,26 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align { // Get the start and end read positions for the primary and supplementary // alignments - int32_t primary_query_start = std::get<4>(primary_alignment); - int32_t primary_query_end = std::get<5>(primary_alignment); - int32_t supp_query_start = std::get<4>(supp_alignment); - int32_t supp_query_end = std::get<5>(supp_alignment); - std::unordered_map& primary_match_map = std::get<6>(primary_alignment); - std::unordered_map& supp_match_map = std::get<6>(supp_alignment); 
int32_t primary_alignment_start = std::get<1>(primary_alignment); int32_t primary_alignment_end = std::get<2>(primary_alignment); int32_t supp_alignment_start = std::get<1>(supp_alignment); int32_t supp_alignment_end = std::get<2>(supp_alignment); + int32_t primary_query_start = std::get<3>(primary_alignment); + int32_t primary_query_end = std::get<4>(primary_alignment); + int32_t supp_query_start = std::get<3>(supp_alignment); + int32_t supp_query_end = std::get<4>(supp_alignment); + const std::vector& primary_match_map = std::get<5>(primary_alignment); + const std::vector& supp_match_map = std::get<5>(supp_alignment); + // int32_t primary_query_start = std::get<4>(primary_alignment); + // int32_t primary_query_end = std::get<5>(primary_alignment); + // int32_t supp_query_start = std::get<4>(supp_alignment); + // int32_t supp_query_end = std::get<5>(supp_alignment); + // const std::vector& primary_match_map = std::get<6>(primary_alignment); + // const std::vector& supp_match_map = std::get<6>(supp_alignment); + // int32_t primary_alignment_start = std::get<1>(primary_alignment); + // int32_t primary_alignment_end = std::get<2>(primary_alignment); + // int32_t supp_alignment_start = std::get<1>(supp_alignment); + // int32_t supp_alignment_end = std::get<2>(supp_alignment); // Check if the alignments overlap bool primary_before_supp = primary_query_start < supp_query_start; diff --git a/src/sv_data.cpp b/src/sv_data.cpp deleted file mode 100644 index d2dfd605..00000000 --- a/src/sv_data.cpp +++ /dev/null @@ -1,355 +0,0 @@ -#include "sv_data.h" - -/// @cond -#include -#include -#include -/// @endcond - -int SVData::add(std::string chr, int32_t start, int32_t end, SVType sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) -{ - // Throw an error if the genotype is not valid - if (genotype != "./." 
&& genotype != "0/0" && genotype != "0/1" && genotype != "1/1") { - std::cerr << "Error: Invalid genotype " << genotype << std::endl; - return -1; - } - - // Trim the alternate allele if it is too long - if (alt_allele.length() > 100) { - alt_allele = alt_allele.substr(0, 100); - } - - // Check if the alternate allele contains ambiguous bases - const std::unordered_set ambiguous_bases = {'R', 'Y', 'W', 'S', 'K', 'M', 'B', 'D', 'H', 'V'}; - for (char &c : alt_allele) { - if (ambiguous_bases.count(c) > 0) { - c = 'N'; - } - } - - // Check if the SV candidate already exists in the map - SVCandidate candidate(start, end, alt_allele); - if (this->sv_calls[chr].find(candidate) != this->sv_calls[chr].end()) { - - // Update the alignment-based support count - SVInfo& sv_info = this->sv_calls[chr][candidate]; - sv_info.read_support += 1; - - // Update the SV type if it is unknown - if (sv_info.sv_type == SVType::UNKNOWN) { - sv_info.sv_type = sv_type; - } - - // Update the genotype if it is unknown - if (sv_info.genotype == "./.") { - sv_info.genotype = genotype; - } - - // Update the HMM likelihood - if ((sv_info.hmm_likelihood == 0.0) || (hmm_likelihood > sv_info.hmm_likelihood)) { - sv_info.hmm_likelihood = hmm_likelihood; - } - sv_info.data_type.insert(data_type); // Add the alignment type to the set - - return 0; // SV call already exists - - // Otherwise, add the SV candidate to the map - } else { - int sv_length = end - start; - - // For deletions, the SV length is the length of the deletion, including the start position - if (sv_type == SVType::DEL) { - sv_length++; - } - - SVInfo sv_info(sv_type, 1, 0, data_type, sv_length, genotype, hmm_likelihood); - this->sv_calls[chr][candidate] = sv_info; // Add the SV candidate to the map - - return 1; // SV call added - } -} - -void SVData::concatenate(const SVData &sv_data) -{ - if (sv_data.sv_calls.empty()) { - std::cerr << "Error: SVData object is empty." 
<< std::endl; - return; - } - - // Iterate over the chromosomes in the other SVData object - for (auto const& chr_sv_calls : sv_data.sv_calls) { - const auto &chr = chr_sv_calls.first; - // std::string chr = chr_sv_calls.first; - auto ¤t_chr_calls = this->sv_calls[chr]; - - // Iterate over the SV calls in the other SVData object - for (auto const& sv_call : chr_sv_calls.second) { - - // Add the SV call to the map of candidate locations - std::pair::iterator, bool> result = current_chr_calls.emplace(sv_call); - bool inserted = result.second; - - // Throw a warning if the SV candidate already exists - if (!inserted) { - std::cerr << "Warning: SV candidate already exists in the map." << std::endl; - } - } - } -} - -void SVData::updateClippedBaseSupport(std::string chr, int64_t pos) -{ - // Update clipped base support - std::pair key(chr, pos); - if (this->clipped_base_support.find(key) != this->clipped_base_support.end()) { - this->clipped_base_support[key] += 1; - } else { - this->clipped_base_support[key] = 1; - } -} - -int SVData::getClippedBaseSupport(std::string chr, int64_t pos, int64_t end) -{ - // Clipped base support is the maximum clipped base support at the start - // and end positions - int clipped_base_support = 0; - std::pair pos_key(chr, pos); - - if (pos == end) { - // If the start and end positions are the same, then the clipped base - // support is the same at both positions - clipped_base_support = this->clipped_base_support[pos_key]; - - } else{ - - // Otherwise, get the clipped base support at the start and end - // positions - int pos_support = 0; - int end_support = 0; - std::pair end_key(chr, end); - if (this->clipped_base_support.find(pos_key) != this->clipped_base_support.end()) { - pos_support = this->clipped_base_support[pos_key]; - } - if (this->clipped_base_support.find(end_key) != this->clipped_base_support.end()) { - end_support = this->clipped_base_support[end_key]; - } - clipped_base_support = std::max(pos_support, end_support); - } - 
- return clipped_base_support; -} - -void SVData::saveToVCF(ReferenceGenome& ref_genome, std::string output_dir) -{ - // Create a VCF writer - std::cout << "Creating VCF writer..." << std::endl; - std::string output_vcf = output_dir + "/output.vcf"; - std::cout << "Writing VCF file to " << output_vcf << std::endl; - std::ofstream vcf_stream(output_vcf); - if (!vcf_stream.is_open()) { - throw std::runtime_error("Failed to open VCF file for writing."); - } - std::string sample_name = "SAMPLE"; - - std::cout << "Getting reference genome filepath..." << std::endl; - try { - std::string ref_fp = ref_genome.getFilepath(); - std::cout << "Reference genome filepath: " << ref_fp << std::endl; - } catch (const std::exception& e) { - std::cerr << "Error: " << e.what() << std::endl; - return; - } - - std::cout << "Getting reference genome header..." << std::endl; - try { - ref_genome.getContigHeader(); - } catch (const std::exception& e) { - std::cerr << "Error: " << e.what() << std::endl; - return; - } - - // Set the header lines - std::vector header_lines = { - std::string("##reference=") + ref_genome.getFilepath(), - ref_genome.getContigHeader(), - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##FILTER=", - "##FILTER=", - "##FORMAT=", - "##FORMAT=" - }; - - std::cout << "Writing VCF header..." 
<< std::endl; - - // Add the file format - std::string file_format = "##fileformat=VCFv4.2"; - vcf_stream << file_format << std::endl; - - // Add date and time - time_t rawtime; - struct tm * timeinfo; - char buffer[80]; - time (&rawtime); - timeinfo = localtime(&rawtime); - strftime(buffer, sizeof(buffer), "%Y%m%d", timeinfo); - vcf_stream << "##fileDate=" << buffer << std::endl; - - // Add source - std::string source = "##source=ContexSV"; - vcf_stream << source << std::endl; - - // Loop over the header metadata lines - for (const auto &line : header_lines) { - vcf_stream << line << std::endl; - } - - // Add the header line - std::string header_line = "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE"; - vcf_stream << header_line << std::endl; - - // Flush the stream to ensure that the header is written - //this->file_stream.flush(); - - std::cout << "Saving SV calls to " << output_vcf << std::endl; - std::string sv_method = "CONTEXTSVv0.1"; - int skip_count = 0; - int total_count = 0; - std::set chrs = this->getChromosomes(); - for (auto const& chr : chrs) { - if (this->sv_calls.find(chr) == this->sv_calls.end()) { - continue; - } - std::cout << "Saving SV calls for " << chr << " (" << this->sv_calls[chr].size() << " SV calls)..." 
<< std::endl; - for (auto const& sv_call : this->sv_calls[chr]) { - - // Get the SV candidate and SV info - SVCandidate candidate = sv_call.first; - SVInfo info = sv_call.second; - SVType sv_type = info.sv_type; - int read_support = info.read_support; - int read_depth = info.read_depth; - int sv_length = info.sv_length; - std::set data_type = info.data_type; - std::string genotype = info.genotype; - double hmm_likelihood = info.hmm_likelihood; - - // Convert the data type set to a string - std::string data_type_str = ""; - for (auto const& type : data_type) { - data_type_str += type + ","; - } - - // Get the CHROM, POS, END, and ALT - int64_t pos = std::get<0>(candidate); - int64_t end = std::get<1>(candidate); - - // If the SV type is unknown, skip it - if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { - skip_count += 1; - continue; - } else { - total_count += 1; - } - - // Process by SV type - std::string ref_allele = "."; - std::string alt_allele = "."; - std::string repeat_type = "NA"; - - // Deletion - if (sv_type == SVType::DEL) { - // Get the deleted sequence from the reference genome, also including the preceding base - int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1); // Make sure the position is not negative - ref_allele = ref_genome.query(chr, preceding_pos, end); - - // Use the preceding base as the alternate allele - if (ref_allele != "") { - alt_allele = ref_allele.at(0); - } else { - alt_allele = ""; // Symbolic allele - std::cerr << "Warning: Reference allele is empty for deletion at " << chr << ":" << pos << "-" << end << std::endl; - } - - sv_length = -1 * sv_length; // Negative length for deletions - - pos = preceding_pos; // Update the position to the preceding base - - // Other types (duplications, insertions, inversions) - } else { - // Use the preceding base as the reference allele - int64_t preceding_pos = (int64_t) std::max(1, (int) pos-1); // Make sure the position is not negative - ref_allele = 
ref_genome.query(chr, preceding_pos, preceding_pos); - - // Format novel insertions - if (sv_type == SVType::INS) { - // Use the insertion sequence as the alternate allele - alt_allele = std::get<2>(candidate); - alt_allele.insert(0, ref_allele); - - pos = preceding_pos; // Update the position to the preceding base - - // Update the end position to the start position to change from - // query to reference coordinates for insertions - end = pos; - } else if (sv_type == SVType::DUP) { - alt_allele = ""; // Symbolic allele - repeat_type = "TANDEM"; - } - } - - // Create the VCF parameter strings - int clipped_base_support = this->getClippedBaseSupport(chr, pos, end); - std::string sv_type_str = getSVTypeString(sv_type); - std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ - ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \ - ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \ - ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood); - - std::string format_str = "GT:DP"; - std::string sample_str = genotype + ":" + std::to_string(read_depth); - std::vector samples = {sample_str}; - - // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES) - vcf_stream << chr << "\t" << pos << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl; - if (total_count % 1000 == 0) - { - std::cout << "Wrote SV at " << chr << ": " << pos << ", total=" << total_count << std::endl; - } - } - } - - // Print the number of SV calls skipped - std::cout << "Finished writing VCF file. 
Total SV calls: " << total_count << ", skipped: " << skip_count << " with unknown SV type" << std::endl; -} - -std::map& SVData::getChromosomeSVs(std::string chr) -{ - return this->sv_calls[chr]; -} - -std::set SVData::getChromosomes() -{ - std::set chromosomes; - for (auto const& sv_call : this->sv_calls) { - chromosomes.insert(sv_call.first); - } - return chromosomes; -} - -int SVData::totalCalls() -{ - int sv_calls = 0; - for (auto const& sv_call : this->sv_calls) { - sv_calls += sv_call.second.size(); - } - - return sv_calls; -} diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 09203b59..f24143fd 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -1,5 +1,5 @@ #include "sv_object.h" -#include "sv_object.h" + #include #include #include @@ -7,10 +7,11 @@ #include #include +#include "utils.h" + bool SVCall::operator<(const SVCall & other) const { return start < other.start || (start == other.start && end < other.end); - //return std::tie(start, end) < std::tie(other.start, other.end); } void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) @@ -24,88 +25,8 @@ void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::st throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); } - // If the SV call already exists (start and end position), then update all information if the - // likelihood is higher - // std::cout << "[TEST1] Adding SV call: " << start << "-" << end << " " << - // sv_type << " " << alt_allele << " " << data_type << " " << genotype << " - // " << hmm_likelihood << std::endl; + // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type); sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); - // SVCall 
new_sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}; - - // sv_calls.insert(new_sv_call); - - /* - bool exists = false; - bool print_out = false; - for (auto it = sv_calls.begin(); it != sv_calls.end();) - { - if (it->start == start && it->end == end) - { - exists = true; - if (hmm_likelihood > it->hmm_likelihood) - { - //std::cout << "[DEBUG] Found higher likelihood for SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; - print_out = true; - // Update the data type and support - // std::string new_data_type = it->data_type + "," + data_type; - // int new_support = it->support + 1; - new_sv_call.data_type = it->data_type + "," + data_type; - new_sv_call.support = it->support + 1; - //higher_lh = true; - - // updates.push_back(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); - - // Erase and re-insert the SV call - // Erase the current iterator and safely insert the new SV calls - std::cout << "Erasing iterator." << std::endl; - sv_calls.erase(it); - std::cout << "Iterator erased." 
<< std::endl; - break; - //it = sv_calls.erase(it); // Erase and get the next iterator - // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, new_data_type, genotype, hmm_likelihood, new_support}); - } else { - // End if the SV exists but is lower lh - return; - } - } else { - // Increment the iterator if the SV call does not match - ++it; - } - } - - if (print_out) - { - std::cout << "[DEBUG] Adding updates" << std::endl; - } - - // Update the SV call if it does not exist, or if the likelihood is higher - // than the existing call - if (print_out) - { - std::cout << "[DEBUG] Inserting call" << std::endl; - } - sv_calls.insert(new_sv_call); - if (print_out) - { - std::cout << "[DEBUG] Call inserted" << std::endl; - } - // Insert the updates - // for (const auto& update : updates) - // { - // sv_calls.insert(update); - // } - - // if (print_out) - // { - // std::cout << "[DEBUG] Added updates" << std::endl; - // } - - - // Add the SV call if it does not exist - // std::cout << "[TEST2] Adding SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; - // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); - // std::cout << "[TEST3] Added SV call: " << start << "-" << end << " " << sv_type << " " << alt_allele << " " << data_type << " " << genotype << " " << hmm_likelihood << std::endl; - */ } std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count) @@ -158,47 +79,26 @@ void mergeSVs(std::set& sv_calls) { } // Merge SV calls if they overlap by at least 50% - // int initial_size = sv_calls.size(); + int initial_size = sv_calls.size(); std::vector merged_sv_calls; auto it = sv_calls.begin(); SVCall current_merge = *it++; - for (; it != sv_calls.end(); ++it) { const SVCall& next = *it; - // Check if the SV calls overlap by at least 50% - uint32_t overlap_start = std::max(current_merge.start, next.start); - uint32_t 
overlap_end = std::min(current_merge.end, next.end); - uint32_t overlap_length = (overlap_start < overlap_end) ? overlap_end - overlap_start : 0; - - uint32_t current_length = current_merge.end - current_merge.start; - uint32_t next_length = next.end - next.start; - - // Merge the SV calls if the overlap is > 0 - //double overlap_pct_current = static_cast(overlap_length) / current_length; - //double overlap_pct_next = static_cast(overlap_length) / next_length; - - //if (overlap_pct_current >= 0.5 || overlap_pct_next >= 0.5) { - if (overlap_length > 0) { - // Merge the SV calls based on the likelihood - if (next.hmm_likelihood != 0.0) { - // Update the likelihood if the next SV call has a likelihood - // and it is higher than the current merged SV call - if (next.hmm_likelihood > current_merge.hmm_likelihood) { - current_merge = next; - } - } else { - // If both have no likelihood (CIGAR only), then merge the SV calls - // based on largest SV length - if (next.hmm_likelihood == current_merge.hmm_likelihood) { - if (next_length > current_length) { - current_merge = next; - } - } - // if (next_length > current_length) { - // current_merge = next; - // } + // Find overlap + if (next.start <= current_merge.end) { + // Merge the SV calls if it is a subset + if (next.end <= current_merge.end) { + continue; } + + // Merge the SV calls based on HMM log likelihood (keep the higher + // likelihood), 0.0 indicates no likelihood + if (next.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) { + current_merge = next; // Continue with the next call + } + } else { // No overlap: Save the previous SV and continue merged_sv_calls.push_back(current_merge); @@ -207,6 +107,7 @@ void mergeSVs(std::set& sv_calls) { } // Add the last merged SV call + printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood)); 
merged_sv_calls.push_back(current_merge); // Update the SV calls @@ -214,6 +115,6 @@ void mergeSVs(std::set& sv_calls) { for (const auto& sv_call : merged_sv_calls) { sv_calls.insert(sv_call); } - // int updated_size = sv_calls.size(); - // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; + int updated_size = sv_calls.size(); + std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; } From 50cc5058146cd1bf7aee3e288af4fa27cbc64eb8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 30 Nov 2024 19:45:13 -0500 Subject: [PATCH 037/134] reduce output --- src/sv_object.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/sv_object.cpp b/src/sv_object.cpp index f24143fd..9af58235 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -107,14 +107,17 @@ void mergeSVs(std::set& sv_calls) { } // Add the last merged SV call - printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood)); + // printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood)); merged_sv_calls.push_back(current_merge); - // Update the SV calls - sv_calls.clear(); - for (const auto& sv_call : merged_sv_calls) { - sv_calls.insert(sv_call); - } + // Replace contents of the SV calls + sv_calls = std::set(merged_sv_calls.begin(), merged_sv_calls.end()); + + // // Update the SV calls + // sv_calls.clear(); + // for (const auto& sv_call : merged_sv_calls) { + // sv_calls.insert(sv_call); + // } int updated_size = sv_calls.size(); std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; } From 18bd4a99ee026b5bceb54857ee7e4c5b325861ed Mon Sep 17 00:00:00 2001 From: jonperdomo Date: 
Sun, 1 Dec 2024 17:00:54 -0500 Subject: [PATCH 038/134] Improve merging --- include/cnv_caller.h | 10 +- include/sv_caller.h | 14 +-- include/sv_object.h | 18 ++-- python/sv_merger.py | 106 +++++++------------ src/cnv_caller.cpp | 24 ++--- src/khmm.cpp | 4 - src/sv_caller.cpp | 246 ++++++++++++++++++++++--------------------- src/sv_object.cpp | 138 ++++++++++++++++-------- 8 files changed, 283 insertions(+), 277 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index ad22b449..3663d184 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -49,11 +49,6 @@ class CNVCaller { mutable std::mutex snp_file_mtx; // SNP file mutex mutable std::mutex pfb_file_mtx; // Population frequency file mutex mutable std::mutex bam_file_mtx; // BAM file mutex - - // CHMM hmm; - SNPData snp_data; - // double mean_chr_cov = 0.0; - // std::unordered_map pos_depth_map; // Read depth map // Define a map of CNV genotypes by HMM predicted state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. 
@@ -85,9 +80,6 @@ class CNVCaller { void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb); - // Run copy number prediction for a chunk of SV candidates from CIGAR strings - void runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector& pos_depth_map); - // Split a region into chunks for parallel processing std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count); @@ -99,7 +91,7 @@ class CNVCaller { std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector& pos_depth_map); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - void runCIGARCopyNumberPrediction(std::string chr, std::set& sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map); + void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map); // Calculate the mean chromosome coverage std::pair> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len); diff --git a/include/sv_caller.h b/include/sv_caller.h index f3f78af9..b4f6eaac 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -18,7 +18,7 @@ // SV candidate alignment data (chr, start, end, sequence, query start, query // end, mismatch map, strand) -using AlignmentData = std::tuple, bool>; +using AlignmentData = std::tuple, bool>; using AlignmentVector = std::vector; // Query map (query name, alignment vector) @@ -34,20 +34,20 @@ class SVCaller { // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& 
sv_calls, std::tuple, int32_t, int32_t>& query_info, bool is_primary); + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map& breakpoint_depth); - void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set& combined_sv_calls, int min_cnv_length); + void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector& combined_sv_calls, int min_cnv_length); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); - void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments); + void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map& breakpoint_depth); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map); + void detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map, std::unordered_map& breakpoint_depth); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query @@ -55,10 +55,12 @@ class SVCaller { // double calculateMismatchRate(std::unordered_map& mismatch_map, int32_t start, int32_t end); 
double calculateMismatchRate(const std::vector& mismatch_map, int32_t start, int32_t end); - void saveToVCF(const std::unordered_map>& sv_calls); + void saveToVCF(const std::unordered_map>& sv_calls); void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment); + void updateBreakpointDepth(std::unordered_map& breakpoint_depth, uint32_t start, uint32_t end); + public: explicit SVCaller(InputData& input_data); diff --git a/include/sv_object.h b/include/sv_object.h index fb52691e..7f8b9d96 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -6,6 +6,7 @@ #include #include #include +#include // Struct to represent a structural variant call struct SVCall { @@ -16,24 +17,25 @@ struct SVCall { std::string data_type = "NA"; std::string genotype = "./."; double hmm_likelihood = 0.0; - int support = 0; + int support = 0; // Exact breakpoint support + int total_support = 0; // Support at either breakpoint // Comparison operator for std::set bool operator<(const SVCall& other) const; // Constructor with parameters for all fields - SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support) {} + SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support, int total_support) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support), total_support(support) {} }; -void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); +void addSVCall(std::vector& 
sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); -void mergeSVs(std::set& sv_calls); +void mergeSVs(std::vector& sv_calls, std::unordered_map& breakpoint_support); -std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count); +void filterSVsWithLowSupport(std::vector &sv_calls, std::unordered_map &breakpoint_support, int min_support); -uint32_t getSVCount(const std::set& sv_calls); +uint32_t getSVCount(const std::vector& sv_calls); -void concatenateSVCalls(std::set& sv_calls, const std::set& sv_calls_update); +void concatenateSVCalls(std::vector& sv_calls, const std::vector& sv_calls_update); #endif // SV_OBJECT_H diff --git a/python/sv_merger.py b/python/sv_merger.py index b2d1491a..2f5cb94f 100644 --- a/python/sv_merger.py +++ b/python/sv_merger.py @@ -89,11 +89,11 @@ def update_support(record, cluster_size): return record -def weighted_score(read_support, hmm_score, weight_hmm): +def weighted_score(sv_len, hmm_score, weight_hmm): """ Calculate a weighted score based on read support and HMM score. 
""" - return (1 - weight_hmm) * read_support + weight_hmm * hmm_score + return (1 - weight_hmm) * sv_len + weight_hmm * hmm_score def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): """ @@ -157,28 +157,8 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): for label in unique_labels: - # Skip label -1 (outliers) - # if label == -1: # Skip label -1 (outliers) only if there are no other clusters if label == -1 and len(unique_labels) > 1: - # # Print the positions if any are within a certain range - # pos_min = 180915940 - # pos_max = 180950356 - - # Debug if position is found - target_pos = 180949217 - - idx = cluster_labels == label - pos_values = breakpoints[idx][:, 0] - if target_pos in pos_values: - logging.info(f"Outlier deletion positions: {pos_values}") - - # if (np.any(pos_values >= pos_min) and np.any(pos_values <= pos_max)): - # Print all within range - # pos_within_range = pos_values[(pos_values >= pos_min) & (pos_values <= pos_max)] - # logging.info(f"Outlier deletion positions: {pos_within_range}") - # logging.info(f"Outlier deletion positions: {pos_values}") - continue # Get the indices of SVs with the same label @@ -187,39 +167,47 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # Get HMM and read support values for the cluster # max_score_idx = 0 # Default to the first SV in the cluster cluster_hmm_scores = np.array(hmm_scores[idx]) - cluster_depth_scores = np.array(sv_support[idx]) + # cluster_depth_scores = np.array(sv_support[idx]) cluster_sv_lengths = np.array(breakpoints[idx][:, 1] - breakpoints[idx][:, 0] + 1) - max_hmm = None - max_support = None - max_hmm_idx = None - max_support_idx = None + # max_hmm = None + # max_support = None + # max_hmm_idx = None + # max_support_idx = None # Find the maximum HMM score - if len(np.unique(cluster_hmm_scores)) > 1: - max_hmm_idx = np.nanargmax(cluster_hmm_scores) - max_hmm = cluster_hmm_scores[max_hmm_idx] + # if len(np.unique(cluster_hmm_scores)) > 1: + # max_hmm_idx = 
np.nanargmax(cluster_hmm_scores) + # max_hmm = cluster_hmm_scores[max_hmm_idx] # Find the maximum read alignment and clipped base support - if len(np.unique(cluster_depth_scores)) > 1: - max_support_idx = np.argmax(cluster_depth_scores) - max_support = cluster_depth_scores[max_support_idx] + # if len(np.unique(cluster_depth_scores)) > 1: + # max_support_idx = np.argmax(cluster_depth_scores) + # max_support = cluster_depth_scores[max_support_idx] + + # Normalize the HMM scores. Since the HMM scores are negative (log lh), we + # normalize them to the range [0, 1] by subtracting the minimum value + cluster_hmm_norm = (cluster_hmm_scores - np.min(cluster_hmm_scores)) / (np.max(cluster_hmm_scores) - np.min(cluster_hmm_scores)) + + # Normalize the SV lengths to the range [0, 1] + cluster_sv_lengths_norm = (cluster_sv_lengths - np.min(cluster_sv_lengths)) / (np.max(cluster_sv_lengths) - np.min(cluster_sv_lengths)) # Use a weighted approach to choose the best SV based on HMM and # support. Deletions have higher priority for HMM scores, while # insertions and duplications have higher priority for read alignment # support. 
# hmm_weight = 0.7 if sv_type == 'DEL' else 0.3 - hmm_weight = 0.4 + hmm_weight = 0.5 max_score_idx = 0 # Default to the first SV in the cluster - max_score = weighted_score(cluster_depth_scores[max_score_idx], cluster_hmm_scores[max_score_idx], hmm_weight) - for k, hmm_loglh in enumerate(cluster_hmm_scores): - read_support = cluster_depth_scores[k] - score = weighted_score(read_support, hmm_loglh, hmm_weight) + max_score = weighted_score(cluster_hmm_norm[max_score_idx], cluster_sv_lengths_norm[max_score_idx], hmm_weight) + # max_score = weighted_score(cluster_sv_lengths[max_score_idx], cluster_hmm_scores[max_score_idx], hmm_weight) + for k, hmm_norm in enumerate(cluster_hmm_norm): + svlen_norm = cluster_sv_lengths_norm[k] + score = weighted_score(svlen_norm, hmm_norm, hmm_weight) if score > max_score: max_score = score max_score_idx = k - # Get the VCF record with the highest depth score + # Get the VCF record with the highest score max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :] # # For deletions, choose the SV with the highest HMM score if available @@ -238,7 +226,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # max_score_idx = max_hmm_idx # Get the VCF record with the highest depth score - max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :] + # max_record = vcf_df.iloc[idx, :].iloc[max_score_idx, :] # Get the number of SVs in this cluster cluster_size = np.sum(idx) @@ -246,30 +234,7 @@ def cluster_breakpoints(vcf_df, sv_type, cluster_size_min): # Update the SUPPORT field in the INFO column max_record = update_support(max_record, cluster_size) - - # Get all position values in the cluster - pos_values = breakpoints[idx][:, 0] - - # Debug if position is found - target_pos = 180949217 - if target_pos in pos_values: - logging.info(f"Cluster size: {cluster_size}") - logging.info(f"Pos values:") - for k, pos in enumerate(pos_values): - logging.info(f"Row {k+1} - Pos: {pos}, HMM: {cluster_hmm_scores[k]}, support: 
{cluster_depth_scores[k]}") - - logging.info(f"Chosen position: {max_record['POS']} - HMM: {max_hmm}, support: {max_support}") - - # # If the POS value is a certain value, plot the support - # pos_min = 180915940 - # pos_max = 180950356 - # # if (np.any(pos_values >= pos_min) and np.any(pos_values <= pos_max)) or cluster_size > 1000: - # if (np.any(pos_values >= pos_min) and np.any(pos_values <= pos_max)): - # logging.info(f"Cluster size: {cluster_size}") - # logging.info(f"Pos values:") - # for k, pos in enumerate(pos_values): - # logging.info(f"Row {k+1} - Pos: {pos}, HMM: {cluster_hmm_scores[k]}, support: {cluster_depth_scores[k]}") - + # pos_values = breakpoints[idx][:, 0] # Append the chosen record to the dataframe of records that will # form the merged VCF file @@ -322,16 +287,19 @@ def sv_merger(vcf_file_path, cluster_size_min=3, suffix='.merged'): del chr_del_df # Cluster insertions and duplications - logging.info("Clustering insertions and duplications on chromosome %s...", chromosome) - chr_ins_dup_df = vcf_df[(vcf_df['CHROM'] == chromosome) & ((vcf_df['INFO'].str.contains('SVTYPE=INS')) | (vcf_df['INFO'].str.contains('SVTYPE=DUP')))] - ins_dup_records = cluster_breakpoints(chr_ins_dup_df, 'INS/DUP', cluster_size_min) - del chr_ins_dup_df + logging.info("Clustering all other SVs on chromosome %s...", chromosome) + # chr_ins_dup_df = vcf_df[(vcf_df['CHROM'] == chromosome) & + # ((vcf_df['INFO'].str.contains('SVTYPE=INS')) | + # (vcf_df['INFO'].str.contains('SVTYPE=DUP')))] + chr_non_del_df = vcf_df[(vcf_df['CHROM'] == chromosome) & (~vcf_df['INFO'].str.contains('SVTYPE=DEL'))] + ins_dup_records = cluster_breakpoints(chr_non_del_df, 'INS/DUP', cluster_size_min) + del chr_non_del_df # Summarize the number of deletions and insertions/duplications del_count = del_records.shape[0] ins_dup_count = ins_dup_records.shape[0] records_processed += del_count + ins_dup_count - logging.info("Chromosome %s - %d deletions, %d insertions, and duplications merged.", 
chromosome, del_count, ins_dup_count) + logging.info("Chromosome %s - %d deletions, %d other types merged.", chromosome, del_count, ins_dup_count) # Append the deletion and insertion/duplication records to the merged # records DataFrame diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 5f1c2e1d..35e36e3b 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -208,18 +208,11 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::set &sv_candidates, int min_length, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector &sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map) { + int min_length = this->input_data.getMinCNVLength(); int window_size = this->input_data.getWindowSize(); - // double mean_chr_cov = this->mean_chr_cov; - // printMessage("Predicting CIGAR string copy number states for chromosome " + chr + "..."); - runCIGARCopyNumberPredictionChunk(chr, sv_candidates, hmm, window_size, mean_chr_cov, pos_depth_map); - // printMessage("Finished predicting copy number states for chromosome " + chr + "..."); -} -void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set& sv_chunk, const CHMM& hmm, int window_size, double mean_chr_cov, std::vector& pos_depth_map) -{ - // printMessage("Running copy number prediction for " + std::to_string(sv_chunk.size()) + " SV candidates on chromosome " + chr + "..."); // Map with counts for each CNV type std::map cnv_type_counts; for (int i = 0; i < 6; i++) @@ -228,7 +221,7 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::setinput_data.getMinCNVLength()) + if ((end_pos - start_pos) < (uint32_t) min_length) { continue; } @@ -322,10 +315,15 @@ void CNVCaller::runCIGARCopyNumberPredictionChunk(std::string chr, std::set, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect // Threshold any zero values to 
avoid calculation issues. for (i = 1; i <= hmm.N; i++) { - // if (hmm.pi[i] == 0) - // hmm.pi[i] = 1e-9; /*eliminate problems with zero probability*/ - // hmm.pi[i] = log(hmm.pi[i]); // Convert to log probability due to underflow - // Update to 0-based indexing if (hmm.pi[i-1] == 0) { hmm.pi[i-1] = 1e-9; /*eliminate problems with zero probability*/ diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 8918a131..51a2a332 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -37,11 +37,7 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) return ret; } -// RegionData SVCaller::detectSVsFromRegion(std::string region) -// std::tuple, PrimaryMap, SuppMap> -// SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, -// const std::string& region) -void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::set& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments) +void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map& breakpoint_depth) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -61,10 +57,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, } // Main loop to process the alignments - // std::set sv_calls; int num_alignments = 0; - // PrimaryMap primary_alignments; - // SuppMap supplementary_alignments; while (readNextAlignment(fp_in, itr, bam1) >= 0) { // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality @@ -78,17 +71,27 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Get the primary alignment information std::string chr = bamHdr->target_name[bam1->core.tid]; - int64_t start = bam1->core.pos; - int64_t end = bam_endpos(bam1); // This is 
the first position after the alignment + uint32_t start = (uint32_t)bam1->core.pos; + uint32_t end = (uint32_t)bam_endpos(bam1); // This is the first position after the alignment bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + // Check for underflow + if (start > 4000000000 || end > 4000000000) { + throw std::runtime_error("ERROR: Integer underflow for alignment at position " + std::to_string(start) + "-" + std::to_string(end)); + } + // Call SVs directly from the CIGAR string - std::tuple, int32_t, int32_t> query_info; - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true); + std::tuple, uint32_t, uint32_t> query_info; + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, breakpoint_depth); // std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); const std::vector& match_map = std::get<0>(query_info); - int32_t query_start = std::get<1>(query_info); - int32_t query_end = std::get<2>(query_info); + uint32_t query_start = std::get<1>(query_info); + uint32_t query_end = std::get<2>(query_info); + + // Check for underflow + if (query_start > 4000000000 || query_end > 4000000000) { + throw std::runtime_error("ERROR: Integer underflow for query at position " + std::to_string(query_start) + "-" + std::to_string(query_end)); + } // Add the primary alignment to the map AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); @@ -99,18 +102,18 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Get the supplementary alignment information std::string chr = bamHdr->target_name[bam1->core.tid]; - int32_t start = bam1->core.pos; - int32_t end = bam_endpos(bam1); + uint32_t start = bam1->core.pos; + uint32_t end = bam_endpos(bam1); bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Get CIGAR string information, but don't call SVs // std::tuple, int32_t, int32_t> query_info = // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, 
false); - std::tuple, int32_t, int32_t> query_info; - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false); + std::tuple, uint32_t, uint32_t> query_info; + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, breakpoint_depth); const std::vector& match_map = std::get<0>(query_info); - int32_t query_start = std::get<1>(query_info); - int32_t query_end = std::get<2>(query_info); + uint32_t query_start = std::get<1>(query_info); + uint32_t query_end = std::get<2>(query_info); // Add the supplementary alignment to the map AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); @@ -149,29 +152,17 @@ double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, int } double mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); - // int match_count = 0; - // int mismatch_count = 0; - // for (int i = start; i <= end; i++) { - // if (match_map.find(i) != match_map.end()) { - // if (match_map[i] == 1) { - // match_count++; - // } else { - // mismatch_count++; - // } - // } - // } - // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); return mismatch_rate; } -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set& sv_calls, std::tuple, int32_t, int32_t>& query_info, bool is_primary) +void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map& breakpoint_depth) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name - int32_t pos = alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) + uint32_t pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; - int query_pos = 0; + 
uint32_t query_pos = 0; // std::unordered_map query_match_map; // Query position to // match/mismatch (1/0) map std::vector query_match_map(alignment->core.l_qseq, 0); // Query position to match/mismatch (1/0) map @@ -180,10 +171,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set // only), update clipped base support, calculate sequence identity for // potential duplications (primary only), and calculate // the clipped base support and mismatch rate - int32_t ref_pos; - int32_t ref_end; - int32_t query_start = 0; // First alignment position in the query - int32_t query_end = 0; // Last alignment position in the query + uint32_t ref_pos; + uint32_t ref_end; + uint32_t query_start = 0; // First alignment position in the query + uint32_t query_end = 0; // Last alignment position in the query bool first_op = false; // First alignment operation for the query double default_lh = 0.0; for (int i = 0; i < cigar_len; i++) { @@ -209,9 +200,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set // position. 
bool is_duplication = false; int ins_ref_pos; - int dup_start = std::max(0, pos - op_len); + uint32_t dup_start = std::max(0, (int)pos - op_len); + // int dup_start = std::max(0, pos - op_len); // for (int j = pos - op_len; j <= pos; j++) { - for (int j = dup_start; j <= pos; j++) { + for (uint32_t j = dup_start; j <= pos; j++) { // Get the string for the window (1-based coordinates) ins_ref_pos = j + 1; @@ -250,10 +242,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set ref_pos = pos+1; ref_end = ref_pos + op_len -1; if (is_duplication) { - addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh); + addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh); } else { - addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh); + addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh); } + this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end); } // Check if the CIGAR operation is a deletion @@ -264,7 +257,8 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - addSVCall(sv_calls, (uint32_t)ref_pos, (uint32_t)ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh); + addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh); + this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end); } // Check if the CIGAR operation is a clipped base @@ -340,12 +334,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::set query_end = query_pos; // Last alignment position in the query - query_info = std::tuple, int32_t, int32_t>(std::move(query_match_map), query_start, query_end); - - // return std::tuple, int32_t, int32_t>(query_match_map, query_start, query_end); + query_info = std::tuple, uint32_t, uint32_t>(std::move(query_match_map), 
query_start, query_end); } -void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::set& combined_sv_calls, int min_cnv_length) +void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector& combined_sv_calls, int min_cnv_length) { // Open the BAM file samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); @@ -408,19 +400,20 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ int region_count = region_chunks.size(); int current_region = 0; // std::set combined_sv_calls; + std::unordered_map breakpoint_depth; for (const auto& sub_region : region_chunks) { current_region++; printMessage(chr + ": CIGAR SVs..."); PrimaryMap primary_map; SuppMap supp_map; - std::set subregion_sv_calls; - this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map); + std::vector subregion_sv_calls; + this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, breakpoint_depth); // std::set& subregion_sv_calls = std::get<0>(region_data); // PrimaryMap& primary_map = std::get<1>(region_data); // SuppMap& supp_map = std::get<2>(region_data); // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging CIGAR..."); - mergeSVs(subregion_sv_calls); + mergeSVs(subregion_sv_calls, breakpoint_depth); int region_sv_count = getSVCount(subregion_sv_calls); // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -429,18 +422,18 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ if (region_sv_count > 0) { // std::cout << "Running copy number variant detection from CIGAR string SVs..." 
<< std::endl; printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, min_cnv_length, hmm, chr_data.first, chr_data.second); + cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, chr_data.first, chr_data.second); } // Run split-read SV and copy number variant predictions // std::cout << "Detecting copy number variants from split reads..." << std::endl; printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second); + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second, breakpoint_depth); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging split reads..."); - mergeSVs(subregion_sv_calls); + mergeSVs(subregion_sv_calls, breakpoint_depth); // Combine the SV calls from the current region // std::cout << "Combining SV calls from " << sub_region << "..." << std::endl; @@ -450,6 +443,13 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "..."); } + // Run a final merge on the combined SV calls + printMessage(chr + ": Merging final calls..."); + mergeSVs(combined_sv_calls, breakpoint_depth); + + // Insert breakpoint support and filter SVs with low support + filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5); + // Clean up the BAM file, header, and index hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); @@ -480,7 +480,7 @@ void SVCaller::run() const int max_threads = this->input_data.getThreadCount(); std::cout << "Using " << max_threads << " threads for processing..." 
<< std::endl; std::vector> futures; - std::unordered_map> whole_genome_sv_calls; + std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; std::condition_variable cv; int active_threads = 0; @@ -488,7 +488,7 @@ void SVCaller::run() // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { // printMessage("Launching thread for chromosome " + chr + "..."); - std::set sv_calls; + std::vector sv_calls; this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); { std::lock_guard lock(sv_mutex); @@ -548,7 +548,7 @@ void SVCaller::run() // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map) +void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map, std::unordered_map& breakpoint_depth) { // Find split-read SV evidence int sv_count = 0; @@ -557,8 +557,8 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p std::string qname = entry.first; AlignmentData primary_alignment = entry.second; std::string primary_chr = std::get<0>(primary_alignment); - int32_t primary_start = std::get<1>(primary_alignment); - int32_t primary_end = std::get<2>(primary_alignment); + uint32_t primary_start = std::get<1>(primary_alignment); + uint32_t primary_end = std::get<2>(primary_alignment); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { @@ -567,15 +567,15 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p // Find the largest supplementary alignment, and also identify inversions AlignmentData largest_supp_alignment = supp_map[qname][0]; - int32_t largest_supp_length = 0; + uint32_t 
largest_supp_length = 0; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { const auto& supp_chr = std::get<0>(*it); if (primary_chr != supp_chr) { continue; // Skip supplementary alignments on different chromosomes } - int32_t supp_start = std::get<1>(*it); - int32_t supp_end = std::get<2>(*it); - int32_t supp_length = supp_end - supp_start + 1; + uint32_t supp_start = std::get<1>(*it); + uint32_t supp_end = std::get<2>(*it); + uint32_t supp_length = supp_end - supp_start + 1; if (supp_length > largest_supp_length) { largest_supp_length = supp_length; largest_supp_alignment = *it; @@ -585,29 +585,31 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it); if (is_opposite_strand) { if (supp_length >= min_cnv_length) { - // SVCandidate sv_candidate(supp_start+1, supp_end+1, "."); + // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1)); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); if (supp_type == SVType::NEUTRAL) { - addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "HMM", "./.", supp_lh); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh); + this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1); + sv_count++; } else if (supp_type == SVType::DUP) { - addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INVDUP", ".", "HMM", "./.", supp_lh); - sv_count++; + addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh); + this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1); } } else { // Add the inversion without running copy number predictions // (too 
small for predictions) - addSVCall(sv_calls, (uint32_t)(supp_start+1), (uint32_t)(supp_end+1), "INV", ".", "REV", "./.", 0.0); - sv_count++; + addSVCall(sv_calls, supp_start+1, (supp_end+1), "INV", ".", "REV", "./.", 0.0); + this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1); } } } // Trim overlapping alignments - int32_t supp_start = std::get<1>(largest_supp_alignment); - int32_t supp_end = std::get<2>(largest_supp_alignment); + uint32_t supp_start = std::get<1>(largest_supp_alignment); + uint32_t supp_end = std::get<2>(largest_supp_alignment); bool primary_before_supp = primary_start < supp_start; trimOverlappingAlignments(primary_alignment, largest_supp_alignment); @@ -617,7 +619,7 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p primary_start = std::get<1>(primary_alignment); primary_end = std::get<2>(primary_alignment); bool gap_exists = false; - int32_t boundary_left, boundary_right, gap_left, gap_right; + uint32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_before_supp) { boundary_left = primary_start+1; boundary_right = supp_end+1; @@ -634,42 +636,37 @@ void SVCaller::detectSVsFromSplitReads(std::set& sv_calls, PrimaryMap& p // Run copy number variant predictions on the boundary if large enough if (boundary_right - boundary_left >= min_cnv_length) { - // split_boundary = SVCandidate(boundary_left, boundary_right, "."); + // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); double bd_lh = std::get<0>(bd_result); SVType bd_type = std::get<1>(bd_result); // Run copy number variant predictions on the gap if it exists if (gap_exists && gap_right - gap_left >= min_cnv_length) { - // split_gap = SVCandidate(gap_left, gap_right, "."); + // printMessage("Running 
copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); double gap_lh = std::get<0>(gap_result); SVType gap_type = std::get<1>(gap_result); // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { - addSVCall(sv_calls, (uint32_t)(gap_left), (uint32_t)(gap_right), getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh); - sv_count++; + addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh); + this->updateBreakpointDepth(breakpoint_depth, gap_left, gap_right); } else { // Add the boundary as the SV call - addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); - sv_count++; + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); + this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right); } } else { // Add the boundary as the SV call - addSVCall(sv_calls, (uint32_t)(boundary_left), (uint32_t)(boundary_right), getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); - sv_count++; + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); + this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right); } } } - - // Print the number of SVs detected from split-read alignments - // if (sv_count > 0) { - // std::cout << "Found " << sv_count << " SVs from split-read alignments" << std::endl; - // } } -void SVCaller::saveToVCF(const std::unordered_map >& sv_calls) +void SVCaller::saveToVCF(const std::unordered_map>& sv_calls) { std::cout << "Creating VCF writer..." 
<< std::endl; // std::string output_vcf = output_dir + "/output.vcf"; @@ -701,8 +698,8 @@ void SVCaller::saveToVCF(const std::unordered_map "##INFO=", "##INFO=", "##INFO=", - "##INFO=", - "##INFO=", + "##INFO=", + "##INFO=", "##INFO=", "##INFO=", "##FILTER=", @@ -748,7 +745,7 @@ void SVCaller::saveToVCF(const std::unordered_map int total_count = 0; for (const auto& pair : sv_calls) { std::string chr = pair.first; - const std::set& sv_calls = pair.second; + const std::vector& sv_calls = pair.second; std::cout << "Saving SV calls for " << chr << "..." << std::endl; for (const auto& sv_call : sv_calls) { // Get the SV candidate and SV info @@ -763,7 +760,8 @@ void SVCaller::saveToVCF(const std::unordered_map if (sv_type_str == "DEL") { sv_length++; } - int read_support = sv_call.support; + int bp_support = sv_call.support; + int total_bp_support = sv_call.total_support; int read_depth = 0; // SVType sv_type = sv_call.sv_type; // SVCandidate candidate = sv_call.first; @@ -845,13 +843,14 @@ void SVCaller::saveToVCF(const std::unordered_map } // Create the VCF parameter strings - // int clipped_base_support = this->getClippedBaseSupport(chr, pos, - // end); - int clipped_base_support = 0; - // std::string sv_type_str = getSVTypeString(sv_type); + // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ + // ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \ + // ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \ + // ";REPTYPE=" + repeat_type + ";HMM=" + + // std::to_string(hmm_likelihood); std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ - ";SVLEN=" + std::to_string(sv_length) + ";SUPPORT=" + std::to_string(read_support) + \ - ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";CLIPSUP=" + std::to_string(clipped_base_support) + \ + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + 
data_type_str + \ + ";BPSUP1=" + std::to_string(total_bp_support) + ";BPSUP2=" + std::to_string(bp_support) + \ ";REPTYPE=" + repeat_type + ";HMM=" + std::to_string(hmm_likelihood); std::string format_str = "GT:DP"; @@ -875,28 +874,18 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align { // Get the start and end read positions for the primary and supplementary // alignments - int32_t primary_alignment_start = std::get<1>(primary_alignment); - int32_t primary_alignment_end = std::get<2>(primary_alignment); - int32_t supp_alignment_start = std::get<1>(supp_alignment); - int32_t supp_alignment_end = std::get<2>(supp_alignment); - int32_t primary_query_start = std::get<3>(primary_alignment); - int32_t primary_query_end = std::get<4>(primary_alignment); - int32_t supp_query_start = std::get<3>(supp_alignment); - int32_t supp_query_end = std::get<4>(supp_alignment); + uint32_t primary_alignment_start = std::get<1>(primary_alignment); + uint32_t primary_alignment_end = std::get<2>(primary_alignment); + uint32_t supp_alignment_start = std::get<1>(supp_alignment); + uint32_t supp_alignment_end = std::get<2>(supp_alignment); + uint32_t primary_query_start = std::get<3>(primary_alignment); + uint32_t primary_query_end = std::get<4>(primary_alignment); + uint32_t supp_query_start = std::get<3>(supp_alignment); + uint32_t supp_query_end = std::get<4>(supp_alignment); const std::vector& primary_match_map = std::get<5>(primary_alignment); const std::vector& supp_match_map = std::get<5>(supp_alignment); - // int32_t primary_query_start = std::get<4>(primary_alignment); - // int32_t primary_query_end = std::get<5>(primary_alignment); - // int32_t supp_query_start = std::get<4>(supp_alignment); - // int32_t supp_query_end = std::get<5>(supp_alignment); - // const std::vector& primary_match_map = std::get<6>(primary_alignment); - // const std::vector& supp_match_map = std::get<6>(supp_alignment); - // int32_t primary_alignment_start = 
std::get<1>(primary_alignment); - // int32_t primary_alignment_end = std::get<2>(primary_alignment); - // int32_t supp_alignment_start = std::get<1>(supp_alignment); - // int32_t supp_alignment_end = std::get<2>(supp_alignment); - - // Check if the alignments overlap + + // Check for overlapping read alignments bool primary_before_supp = primary_query_start < supp_query_start; if (primary_before_supp) { // Primary before supplementary in the query @@ -904,15 +893,19 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align // Calculate the mismatch rates at the overlapping region double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, supp_query_start, primary_query_end); double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, supp_query_start, primary_query_end); - int32_t overlap_length = primary_query_end - supp_query_start + 1; + uint32_t overlap_length = primary_query_end - supp_query_start + 1; // Trim the ailgnment with the higher mismatch rate if (primary_mismatch_rate > supp_mismatch_rate) { // Trim the end of the primary alignment - std::get<2>(primary_alignment) = primary_alignment_end - overlap_length; + uint32_t new_end = primary_alignment_end > overlap_length ? 
primary_alignment_end - overlap_length : 0; + std::get<2>(primary_alignment) = new_end; + // std::get<2>(primary_alignment) = primary_alignment_end - overlap_length; } else { // Trim the beginning of the supplementary alignment - std::get<1>(supp_alignment) = supp_alignment_start + overlap_length; + uint32_t new_start = supp_alignment_start + overlap_length; + std::get<1>(supp_alignment) = new_start; + // std::get<1>(supp_alignment) = supp_alignment_start + overlap_length; } } } else { @@ -921,16 +914,25 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align // Calculate the mismatch rates at the overlapping region double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end); double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, primary_query_start, supp_query_end); - int32_t overlap_length = supp_query_end - primary_query_start + 1; + uint32_t overlap_length = supp_query_end - primary_query_start + 1; // Trim the ailgnment with the higher mismatch rate if (supp_mismatch_rate > primary_mismatch_rate) { // Trim the end of the supplementary alignment - std::get<2>(supp_alignment) = supp_alignment_end - overlap_length; + uint32_t new_end = supp_alignment_end > overlap_length ? 
supp_alignment_end - overlap_length : 0; + // std::get<2>(supp_alignment) = supp_alignment_end - overlap_length; } else { // Trim the beginning of the primary alignment - std::get<1>(primary_alignment) = primary_alignment_start + overlap_length; + uint32_t new_start = primary_alignment_start + overlap_length; + std::get<1>(primary_alignment) = new_start; + // std::get<1>(primary_alignment) = primary_alignment_start + overlap_length; } } } } + +void SVCaller::updateBreakpointDepth(std::unordered_map &breakpoint_depth, uint32_t start, uint32_t end) +{ + breakpoint_depth[start] += 1; + breakpoint_depth[end] += 1; +} diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 9af58235..9efa9ca2 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -14,10 +14,15 @@ bool SVCall::operator<(const SVCall & other) const return start < other.start || (start == other.start && end < other.end); } -void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) { + // Catch underflow errors + if (start > 4000000000 || end > 4000000000) { + throw std::runtime_error("ERROR: Integer underflow for SV call at position " + std::to_string(start) + "-" + std::to_string(end)); + } + // Ignore unknown SV types - if (sv_type == "UNKNOWN") { + if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { return; } @@ -25,66 +30,74 @@ void addSVCall(std::set& sv_calls, uint32_t start, uint32_t end, std::st throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); } - // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type); - sv_calls.insert(SVCall{start, 
end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); -} + // Insert the SV call in sorted order + SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1, 0}; + auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); + // sv_calls.insert(it, sv_call); -std::vector> splitSVsIntoChunks(std::set& sv_calls, int chunk_count) -{ - // Split the SV calls into chunks - std::vector> sv_chunks; - int sv_count = (int) sv_calls.size(); - int chunk_size = std::ceil((double) sv_count / (double) chunk_count); - int current_chunk = 0; - std::set current_sv_chunk; - for (const auto& sv_call : sv_calls) + // Update the SV type if the SV call already exists (if likelihood is + // higher) + if (it != sv_calls.end() && it->start == start && it->end == end) { - current_sv_chunk.insert(sv_call); - - // If the current chunk size is reached, then add the chunk to the - // vector and reset the current chunk - if ((int) current_sv_chunk.size() == chunk_size) + if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood) { - // sv_chunks.insert(current_sv_chunk); - sv_chunks.push_back(current_sv_chunk); - current_sv_chunk.clear(); - current_chunk++; + // Update the SV call + it->sv_type = sv_type; + it->data_type = data_type; + it->genotype = genotype; + it->hmm_likelihood = hmm_likelihood; + it->support++; // Update support + } else { + it->support++; // Update support } + } else { + sv_calls.insert(it, sv_call); // Insert the new SV call } - // Add the last chunk if it is not empty - if (!current_sv_chunk.empty()) + // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type); + // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); +} + +void updateSVType(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double 
hmm_likelihood) +{ + // Update the SV type for an existing SV call + auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall{start, end, "", "", "", "", 0.0, 0, 0}); + if (it != sv_calls.end() && it->start == start && it->end == end) { - sv_chunks.push_back(current_sv_chunk); - // sv_chunks.insert(current_sv_chunk); + it->sv_type = sv_type; + it->data_type = data_type; + it->genotype = genotype; + it->hmm_likelihood = hmm_likelihood; + } else { + throw std::runtime_error("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end)); } - - return sv_chunks; } -uint32_t getSVCount(const std::set& sv_calls) +uint32_t getSVCount(const std::vector& sv_calls) { return (uint32_t) sv_calls.size(); } -void concatenateSVCalls(std::set &target, const std::set &source) +void concatenateSVCalls(std::vector &target, const std::vector& source) { // Efficiently concatenate two sets of SV calls - target.insert(source.begin(), source.end()); + // target.insert(source.begin(), source.end()); + target.insert(target.end(), source.begin(), source.end()); } -void mergeSVs(std::set& sv_calls) { +void mergeSVs(std::vector& sv_calls, std::unordered_map& breakpoint_support) +{ if (sv_calls.size() < 2) { return; } - // Merge SV calls if they overlap by at least 50% + // Merge SV calls if they overlap int initial_size = sv_calls.size(); std::vector merged_sv_calls; auto it = sv_calls.begin(); SVCall current_merge = *it++; for (; it != sv_calls.end(); ++it) { - const SVCall& next = *it; + SVCall& next = *it; // Find overlap if (next.start <= current_merge.end) { @@ -94,30 +107,63 @@ void mergeSVs(std::set& sv_calls) { } // Merge the SV calls based on HMM log likelihood (keep the higher - // likelihood), 0.0 indicates no likelihood - if (next.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) { + // likelihood), 0.0 indicates no likelihood (Also update support) + if (next.hmm_likelihood != 0.0) { + if 
(next.hmm_likelihood > current_merge.hmm_likelihood) { + current_merge = next; // Continue with the next call + } + + // Merge based on support + } else if (next.support > current_merge.support) { current_merge = next; // Continue with the next call + + } else { + // Merge based on breakpoint depth + uint32_t next_depth = breakpoint_support[next.start] + breakpoint_support[next.end]; + uint32_t current_depth = breakpoint_support[current_merge.start] + breakpoint_support[current_merge.end]; + if (next_depth > current_depth) { + current_merge = next; // Continue with the next call + + // Merge based on SV length + } else if (next.end - next.start > current_merge.end - current_merge.start) { + current_merge = next; // Continue with the next call + } } } else { // No overlap: Save the previous SV and continue - merged_sv_calls.push_back(current_merge); + merged_sv_calls.emplace_back(current_merge); current_merge = next; } } // Add the last merged SV call // printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood)); - merged_sv_calls.push_back(current_merge); + merged_sv_calls.emplace_back(current_merge); // Replace contents of the SV calls - sv_calls = std::set(merged_sv_calls.begin(), merged_sv_calls.end()); - - // // Update the SV calls - // sv_calls.clear(); - // for (const auto& sv_call : merged_sv_calls) { - // sv_calls.insert(sv_call); - // } + sv_calls = merged_sv_calls; + int updated_size = sv_calls.size(); std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; } + +void filterSVsWithLowSupport(std::vector& sv_calls, std::unordered_map& breakpoint_support, int min_support) +{ + // Insert breakpoint support for each SV call, and remove SV calls with low + // support + int prev_size = sv_calls.size(); + for (auto& sv_call : sv_calls) + { + sv_call.total_support = 
breakpoint_support[sv_call.start] + breakpoint_support[sv_call.end]; + printMessage("SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with support " + std::to_string(sv_call.total_support) + " and likelihood " + std::to_string(sv_call.hmm_likelihood) + " and length " + std::to_string(sv_call.end - sv_call.start)); + } + + // Remove SV calls with low support, unless they are large (> 20 kb) + sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { + return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000); + }), sv_calls.end()); + + int updated_size = sv_calls.size(); + printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with support >= " + std::to_string(min_support)); +} From 3458bc8e3a3a773885da0d97f0719d27e6a0e712 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 2 Dec 2024 01:49:03 -0500 Subject: [PATCH 039/134] add support back and improve merging --- include/cnv_caller.h | 10 +-- include/sv_caller.h | 10 +-- include/sv_object.h | 14 ++-- src/cnv_caller.cpp | 78 ++++++++++--------- src/sv_caller.cpp | 176 +++++++++++++++++++++---------------------- src/sv_object.cpp | 123 +++++++++++++++++------------- 6 files changed, 212 insertions(+), 199 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 3663d184..b36d414f 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -73,10 +73,10 @@ class CNVCaller { void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); - std::pair, double> runViterbi(const CHMM& hmm, SNPData& snp_data); + void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction); // Query a region for SNPs and return the SNP data - std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov); + std::pair 
querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov); void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb); @@ -88,17 +88,17 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector& pos_depth_map); + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map); // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map); + void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); // Calculate the mean chromosome coverage std::pair> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len); // Calculate the log2 ratio for a region given the read depths and mean // chromosome coverage - double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov); + double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov); void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set& snp_pos, std::unordered_map& snp_baf); void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map); diff --git a/include/sv_caller.h b/include/sv_caller.h index b4f6eaac..82a833f3 100644 --- 
a/include/sv_caller.h +++ b/include/sv_caller.h @@ -34,32 +34,32 @@ class SVCaller { // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map& breakpoint_depth); + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector& pos_depth_map); void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector& combined_sv_calls, int min_cnv_length); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); - void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map& breakpoint_depth); + void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector& pos_depth_map); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map, std::unordered_map& breakpoint_depth); + void detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); // Calculate the mismatch 
rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence - // double calculateMismatchRate(std::unordered_map& mismatch_map, int32_t start, int32_t end); double calculateMismatchRate(const std::vector& mismatch_map, int32_t start, int32_t end); void saveToVCF(const std::unordered_map>& sv_calls); void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment); - void updateBreakpointDepth(std::unordered_map& breakpoint_depth, uint32_t start, uint32_t end); + // Calculate the read depth (INFO/DP) for a region + int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); public: explicit SVCaller(InputData& input_data); diff --git a/include/sv_object.h b/include/sv_object.h index 7f8b9d96..e36e8624 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -17,22 +17,22 @@ struct SVCall { std::string data_type = "NA"; std::string genotype = "./."; double hmm_likelihood = 0.0; - int support = 0; // Exact breakpoint support - int total_support = 0; // Support at either breakpoint + int read_depth = 0; // Breakpoint depth + int support = 0; // Number of supporting reads // Comparison operator for std::set bool operator<(const SVCall& other) const; // Constructor with parameters for all fields - SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int support, int total_support) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), support(support), total_support(support) {} + SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), 
hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {} }; -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood); +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); -void mergeSVs(std::vector& sv_calls, std::unordered_map& breakpoint_support); +void mergeSVs(std::vector& sv_calls); -void filterSVsWithLowSupport(std::vector &sv_calls, std::unordered_map &breakpoint_support, int min_support); +void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth); uint32_t getSVCount(const std::vector& sv_calls); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 35e36e3b..9002e2b6 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -41,19 +41,18 @@ CNVCaller::CNVCaller(InputData &input_data) } // Function to call the Viterbi algorithm for the CHMM -std::pair, double> CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data) +void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) { int data_count = (int) snp_data.pos.size(); if (data_count == 0) { throw std::runtime_error("Error: No SNP data found for Viterbi algorithm."); } - std::pair, double> state_sequence = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb); - return state_sequence; + prediction = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb); } // Function to obtain SNP information for a region -std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov) +std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov) { SNPData snp_data; bool snps_found = 
false; @@ -136,13 +135,28 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta return std::make_pair(snp_data, snps_found); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, std::vector& pos_depth_map) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) { + // Check that the start position is less than the end position + if (start_pos >= end_pos) + { + throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + } + // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 // the SV length - uint32_t sv_half_length = (end_pos - start_pos) / 2.0; - uint32_t snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; - uint32_t snp_end_pos = end_pos + sv_half_length; + // Only extened the region if "save CNV data" is enabled + uint32_t snp_start_pos = start_pos; + uint32_t snp_end_pos = end_pos; + if (this->input_data.getSaveCNVData()) + { + uint32_t sv_half_length = (end_pos - start_pos) / 2.0; + snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; + snp_end_pos = end_pos + sv_half_length; + } + // uint32_t sv_half_length = (end_pos - start_pos) / 2.0; + // uint32_t snp_start_pos = start_pos > sv_half_length ? 
start_pos - sv_half_length : 1; + // uint32_t snp_end_pos = end_pos + sv_half_length; // Query the SNP region for the SV candidate std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); @@ -151,7 +165,8 @@ std::tuple CNVCaller::runCopyNumberPrediction // Run the Viterbi algorithm // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")..."); - std::pair, double> prediction = runViterbi(hmm, sv_snps); + std::pair, double> prediction; + runViterbi(hmm, sv_snps, prediction); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -208,7 +223,7 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector &sv_candidates, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector &sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) { int min_length = this->input_data.getMinCNVLength(); int window_size = this->input_data.getWindowSize(); @@ -241,27 +256,29 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorupdateDPValue(sv_candidates, sv_call, dp_value); - // Loop through the SV region +/- 1/2 SV length and run copy number // predictions - uint32_t sv_half_length = (end_pos - start_pos) / 2.0; - uint32_t snp_start_pos = start_pos > sv_half_length ? 
start_pos - sv_half_length : 1; - uint32_t snp_end_pos = end_pos + sv_half_length; + // Only extend the region if "save CNV data" is enabled + uint32_t snp_start_pos = start_pos; + uint32_t snp_end_pos = end_pos; + if (this->input_data.getSaveCNVData()) + { + uint32_t sv_half_length = (end_pos - start_pos) / 2.0; + snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; + snp_end_pos = end_pos + sv_half_length; + } std::pair snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); SNPData& sv_snps = snp_call.first; bool snps_found = snp_call.second; - // Run the Viterbi algorithm + // Run the Viterbi algorithm if (sv_snps.pos.size() == 0) { std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl; continue; } - std::pair, double> prediction = runViterbi(hmm, sv_snps); + std::pair, double> prediction; + runViterbi(hmm, sv_snps, prediction); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); @@ -322,8 +339,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 return region_chunks; } -// std::pair> CNVCaller::loadChromosomeData(std::string chr) -// { -// printMessage("Calculating mean chromosome coverage for " + chr + "..."); -// // this->mean_chr_cov = calculateMeanChromosomeCoverage(chr); -// std::pair> depth_data = calculateMeanChromosomeCoverage(chr); -// printMessage("Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); -// } - // Calculate the mean chromosome coverage std::pair> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len) { @@ -509,7 +516,7 @@ std::pair> CNVCaller::calculateMeanChromosomeCover 
return std::make_pair(mean_chr_cov, chr_pos_depth_map); } -double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std::vector& pos_depth_map, double mean_chr_cov) +double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov) { // Use the position and depth map to calculate the log2 ratio double cum_depth = 0; @@ -521,15 +528,6 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, std:: cum_depth += pos_depth_map[i]; pos_count++; } - // // Check if the position is in the map - // auto it = pos_depth_map.find(i); - // if (it == pos_depth_map.end()) - // { - // continue; - // } - // int depth = pos_depth_map[i]; - // pos_count++; - // cum_depth += depth; } // Calculate the window coverage log2 ratio (0 if no positions) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 51a2a332..3635fe15 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -22,6 +22,7 @@ #include "utils.h" #include "sv_types.h" +#include "version.h" /// @endcond # define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection @@ -37,7 +38,7 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) return ret; } -void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, std::unordered_map& breakpoint_depth) +void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector& pos_depth_map) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -82,7 +83,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Call SVs directly from the CIGAR string std::tuple, uint32_t, uint32_t> query_info; - 
this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, breakpoint_depth); + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, pos_depth_map); // std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); const std::vector& match_map = std::get<0>(query_info); uint32_t query_start = std::get<1>(query_info); @@ -110,7 +111,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // std::tuple, int32_t, int32_t> query_info = // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); std::tuple, uint32_t, uint32_t> query_info; - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, breakpoint_depth); + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, pos_depth_map); const std::vector& match_map = std::get<0>(query_info); uint32_t query_start = std::get<1>(query_info); uint32_t query_end = std::get<2>(query_info); @@ -156,7 +157,7 @@ double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, int return mismatch_rate; } -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, std::unordered_map& breakpoint_depth) +void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector& pos_depth_map) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) @@ -241,12 +242,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // Add to SV calls (1-based) with the appropriate SV type ref_pos = pos+1; ref_end = ref_pos + op_len -1; + int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); if (is_duplication) { - addSVCall(sv_calls, ref_pos, 
ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh); + addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); } else { - addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh); + addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); } - this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end); } // Check if the CIGAR operation is a deletion @@ -257,8 +258,8 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh); - this->updateBreakpointDepth(breakpoint_depth, ref_pos, ref_end); + int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); + addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh, read_depth); } // Check if the CIGAR operation is a clipped base @@ -399,21 +400,19 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." 
<< std::endl; int region_count = region_chunks.size(); int current_region = 0; - // std::set combined_sv_calls; - std::unordered_map breakpoint_depth; for (const auto& sub_region : region_chunks) { current_region++; printMessage(chr + ": CIGAR SVs..."); PrimaryMap primary_map; SuppMap supp_map; std::vector subregion_sv_calls; - this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, breakpoint_depth); + this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_data.second); // std::set& subregion_sv_calls = std::get<0>(region_data); // PrimaryMap& primary_map = std::get<1>(region_data); // SuppMap& supp_map = std::get<2>(region_data); // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging CIGAR..."); - mergeSVs(subregion_sv_calls, breakpoint_depth); + mergeSVs(subregion_sv_calls); int region_sv_count = getSVCount(subregion_sv_calls); // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -428,12 +427,12 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Run split-read SV and copy number variant predictions // std::cout << "Detecting copy number variants from split reads..." << std::endl; printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second, breakpoint_depth); + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging split reads..."); - mergeSVs(subregion_sv_calls, breakpoint_depth); + mergeSVs(subregion_sv_calls); // Combine the SV calls from the current region // std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; @@ -445,10 +444,11 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Run a final merge on the combined SV calls printMessage(chr + ": Merging final calls..."); - mergeSVs(combined_sv_calls, breakpoint_depth); + mergeSVs(combined_sv_calls); // Insert breakpoint support and filter SVs with low support - filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5); + // filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5); + filterSVsWithLowSupport(combined_sv_calls, 10); // Clean up the BAM file, header, and index hts_idx_destroy(idx); @@ -548,7 +548,7 @@ void SVCaller::run() // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, std::vector& pos_depth_map, std::unordered_map& breakpoint_depth) +void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) { // Find split-read SV evidence int sv_count = 0; @@ -589,20 +589,20 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); + int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); if (supp_type == SVType::NEUTRAL) { - addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh); - this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh, read_depth); sv_count++; } else if (supp_type == SVType::DUP) { - addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh); - 
this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1); + int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh, read_depth); } } else { // Add the inversion without running copy number predictions // (too small for predictions) - addSVCall(sv_calls, supp_start+1, (supp_end+1), "INV", ".", "REV", "./.", 0.0); - this->updateBreakpointDepth(breakpoint_depth, supp_start+1, supp_end+1); + int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "REV", "./.", 0.0, read_depth); } } } @@ -650,17 +650,17 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { - addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh); - this->updateBreakpointDepth(breakpoint_depth, gap_left, gap_right); + int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); + addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); - this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right); + int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh); - this->updateBreakpointDepth(breakpoint_depth, boundary_left, boundary_right); + int read_depth = this->calculateReadDepth(pos_depth_map, 
boundary_left, boundary_right); + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth); } } } @@ -697,15 +697,12 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", - "##INFO=", + "##INFO=", "##INFO=", "##FILTER=", "##FILTER=", "##FORMAT=", - "##FORMAT=" + "##FORMAT=", }; std::cout << "Writing VCF header..." << std::endl; @@ -740,7 +737,7 @@ void SVCaller::saveToVCF(const std::unordered_mapfile_stream.flush(); std::cout << "Saving SV calls to " << output_vcf << std::endl; - std::string sv_method = "CONTEXTSVv0.1"; + std::string sv_method = "CONTEXTSV" + std::string(VERSION); int skip_count = 0; int total_count = 0; for (const auto& pair : sv_calls) { @@ -760,31 +757,8 @@ void SVCaller::saveToVCF(const std::unordered_map data_type = info.data_type; - // std::string genotype = info.genotype; - // double hmm_likelihood = info.hmm_likelihood; - - // Convert the data type set to a string - // std::string data_type_str = ""; - // for (auto const& type : data_type) { - // data_type_str += type + ","; - // } - - // Get the CHROM, POS, END, and ALT - // uint32_t pos = std::get<0>(candidate); - // uint32_t end = std::get<1>(candidate); + int read_depth = sv_call.read_depth; + std::string ref_allele = "."; // If the SV type is unknown, skip it if (sv_type_str == "UNKNOWN" || sv_type_str == "NEUTRAL") { @@ -794,11 +768,6 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.queryRefGenome(chr, preceding_pos, preceding_pos); // Format novel insertions @@ -831,7 +798,6 @@ void SVCaller::saveToVCF(const std::unordered_map(candidate); alt_allele.insert(0, ref_allele); } start = preceding_pos; // Update the position to the preceding base @@ -843,15 +809,9 @@ void SVCaller::saveToVCF(const std::unordered_map supp_mismatch_rate) { - // Trim the end of the primary alignment - uint32_t new_end = primary_alignment_end > overlap_length ? 
primary_alignment_end - overlap_length : 0; - std::get<2>(primary_alignment) = new_end; + // Trim the end of the primary alignment, ensuring that the new + // end is not less than the start + if (primary_alignment_end > overlap_length && (primary_alignment_end - overlap_length) > primary_alignment_start) { + // Trim the end of the primary alignment + uint32_t new_end = primary_alignment_end - overlap_length; + std::get<2>(primary_alignment) = new_end; + } // std::get<2>(primary_alignment) = primary_alignment_end - overlap_length; } else { - // Trim the beginning of the supplementary alignment - uint32_t new_start = supp_alignment_start + overlap_length; - std::get<1>(supp_alignment) = new_start; + // Trim the beginning of the supplementary alignment, ensuring + // that the new start is not greater than the end + if (supp_alignment_start + overlap_length < supp_alignment_end) { + // Trim the beginning of the supplementary alignment + uint32_t new_start = supp_alignment_start + overlap_length; + std::get<1>(supp_alignment) = new_start; + } + // uint32_t new_start = supp_alignment_start + overlap_length; + // std::get<1>(supp_alignment) = new_start; // std::get<1>(supp_alignment) = supp_alignment_start + overlap_length; } } @@ -918,21 +888,47 @@ void SVCaller::trimOverlappingAlignments(AlignmentData& primary_alignment, Align // Trim the ailgnment with the higher mismatch rate if (supp_mismatch_rate > primary_mismatch_rate) { - // Trim the end of the supplementary alignment - uint32_t new_end = supp_alignment_end > overlap_length ? 
supp_alignment_end - overlap_length : 0; + // Trim the end of the supplementary alignment, ensuring that + // the new end is not less than the start + if (supp_alignment_end > overlap_length && (supp_alignment_end - overlap_length) > supp_alignment_start) { + // Trim the end of the supplementary alignment + uint32_t new_end = supp_alignment_end - overlap_length; + std::get<2>(supp_alignment) = new_end; + } + // uint32_t new_end = supp_alignment_end > overlap_length ? supp_alignment_end - overlap_length : 0; + // std::get<2>(supp_alignment) = new_end; // std::get<2>(supp_alignment) = supp_alignment_end - overlap_length; } else { - // Trim the beginning of the primary alignment - uint32_t new_start = primary_alignment_start + overlap_length; - std::get<1>(primary_alignment) = new_start; + // Trim the beginning of the primary alignment, ensuring that + // the new start is not greater than the end + if (primary_alignment_start + overlap_length < primary_alignment_end) { + // Trim the beginning of the primary alignment + uint32_t new_start = primary_alignment_start + overlap_length; + std::get<1>(primary_alignment) = new_start; + } + // uint32_t new_start = primary_alignment_start + overlap_length; + // std::get<1>(primary_alignment) = new_start; // std::get<1>(primary_alignment) = primary_alignment_start + overlap_length; } } } } -void SVCaller::updateBreakpointDepth(std::unordered_map &breakpoint_depth, uint32_t start, uint32_t end) +int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) { - breakpoint_depth[start] += 1; - breakpoint_depth[end] += 1; + int read_depth = 0; + try { + // printMessage("Read depth at start: " + std::to_string(pos_depth_map.at(start)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); + read_depth += pos_depth_map.at(start); + } catch (const std::out_of_range& e) { + std::cerr << "Warning: Start position " << start << " not found in 
depth map." << std::endl; + } + try { + // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); + read_depth += pos_depth_map.at(end); + } catch (const std::out_of_range& e) { + std::cerr << "Warning: End position " << end << " not found in depth map." << std::endl; + } + // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); + return read_depth; } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 9efa9ca2..479357a2 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -14,7 +14,7 @@ bool SVCall::operator<(const SVCall & other) const return start < other.start || (start == other.start && end < other.end); } -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood) +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) { // Catch underflow errors if (start > 4000000000 || end > 4000000000) { @@ -31,14 +31,15 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: } // Insert the SV call in sorted order - SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1, 0}; + SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); - // sv_calls.insert(it, sv_call); // Update the SV type if the SV call already exists (if likelihood is // higher) if (it != sv_calls.end() && it->start == start && it->end == end) { + it->support += 1; // Update the read support + // printMessage("Updating 
SV call with length " + std::to_string(end - start) + " and type " + sv_type + " and support " + std::to_string(it->support)); if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood) { // Update the SV call @@ -46,16 +47,10 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: it->data_type = data_type; it->genotype = genotype; it->hmm_likelihood = hmm_likelihood; - it->support++; // Update support - } else { - it->support++; // Update support } } else { sv_calls.insert(it, sv_call); // Insert the new SV call } - - // printMessage("Adding SV call: " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end - start) + " and type " + sv_type); - // sv_calls.insert(SVCall{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, 1}); } void updateSVType(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood) @@ -80,12 +75,10 @@ uint32_t getSVCount(const std::vector& sv_calls) void concatenateSVCalls(std::vector &target, const std::vector& source) { - // Efficiently concatenate two sets of SV calls - // target.insert(source.begin(), source.end()); target.insert(target.end(), source.begin(), source.end()); } -void mergeSVs(std::vector& sv_calls, std::unordered_map& breakpoint_support) +void mergeSVs(std::vector& sv_calls) { if (sv_calls.size() < 2) { return; @@ -96,74 +89,100 @@ void mergeSVs(std::vector& sv_calls, std::unordered_map merged_sv_calls; auto it = sv_calls.begin(); SVCall current_merge = *it++; + double log_lh_eps = 1.0; // Log likelihood epsilon for (; it != sv_calls.end(); ++it) { SVCall& next = *it; // Find overlap + // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + 
std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); + // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); if (next.start <= current_merge.end) { - // Merge the SV calls if it is a subset - if (next.end <= current_merge.end) { - continue; - } - // Merge the SV calls based on HMM log likelihood (keep the higher - // likelihood), 0.0 indicates no likelihood (Also update support) - if (next.hmm_likelihood != 0.0) { - if (next.hmm_likelihood > current_merge.hmm_likelihood) { + // Merge based on read support + if (next.support > current_merge.support) { + // Compare only if lengths are within 20% of each other + uint32_t current_length = current_merge.end - current_merge.start; + uint32_t next_length = next.end - next.start; + double length_diff = std::abs((int)current_length - (int)next_length); + double length_threshold = 0.2 * (int)current_length; + if (length_diff <= length_threshold) { current_merge = next; // Continue with the next call + // printMessage("Keeping next SV call with support " + std::to_string(next.support)); + } else { + // Keep the larger SV + if (next_length > current_length) { + current_merge = next; + // printMessage("Keeping next SV call with length " + std::to_string(next_length)); + } } + // printMessage("Keeping next SV call with support " + std::to_string(next.support)); - // Merge based on support - } else if (next.support > current_merge.support) { - current_merge = next; // Continue with the next call - - } else { - // Merge based on breakpoint depth - uint32_t next_depth = breakpoint_support[next.start] + breakpoint_support[next.end]; - uint32_t current_depth = breakpoint_support[current_merge.start] + 
breakpoint_support[current_merge.end]; - if (next_depth > current_depth) { - current_merge = next; // Continue with the next call - - // Merge based on SV length - } else if (next.end - next.start > current_merge.end - current_merge.start) { + } else if (next.support == current_merge.support) { + // Merge based on existence of predictions + if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) { current_merge = next; // Continue with the next call + // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); + + // Merge based on prediction log likelihood + } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) { + + // Print all SV information + // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); + // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); + // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood)); + + // Keep the SV call with the higher likelihood. 
Compare only if + // lengths are within 20% of each other + uint32_t current_length = current_merge.end - current_merge.start; + uint32_t next_length = next.end - next.start; + double length_diff = std::abs((int)current_length - (int)next_length); + double length_threshold = 0.2 * (int)current_length; + if (length_diff <= length_threshold) { + // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold)); + + if (next.hmm_likelihood > current_merge.hmm_likelihood) { + current_merge = next; // Continue with the next call + // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); + } + + } else { + // Keep the larger SV + if (next_length > current_length) { + current_merge = next; + // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length)); + } + } } } } else { - // No overlap: Save the previous SV and continue + // No overlap: Save the call and continue merged_sv_calls.emplace_back(current_merge); current_merge = next; } } + merged_sv_calls.emplace_back(current_merge); // Save the last call + sv_calls = merged_sv_calls; // Update the SV calls - // Add the last merged SV call - // printMessage("Saving SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood)); - merged_sv_calls.emplace_back(current_merge); - - // Replace contents of the SV calls - sv_calls = merged_sv_calls; - int updated_size = sv_calls.size(); std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; } -void filterSVsWithLowSupport(std::vector& sv_calls, std::unordered_map& breakpoint_support, int min_support) +void filterSVsWithLowSupport(std::vector& sv_calls, int min_depth) { - // Insert breakpoint support for each SV call, and remove SV calls with low - // support int prev_size = sv_calls.size(); - for (auto& 
sv_call : sv_calls) - { - sv_call.total_support = breakpoint_support[sv_call.start] + breakpoint_support[sv_call.end]; - printMessage("SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with support " + std::to_string(sv_call.total_support) + " and likelihood " + std::to_string(sv_call.hmm_likelihood) + " and length " + std::to_string(sv_call.end - sv_call.start)); + + // Print read depth for each SV call + for (const auto& sv_call : sv_calls) { + std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl; } - // Remove SV calls with low support, unless they are large (> 20 kb) - sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { - return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000); + // Remove SV calls with low read depth + sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) { + return sv_call.read_depth < min_depth; + // return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000); }), sv_calls.end()); int updated_size = sv_calls.size(); - printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with support >= " + std::to_string(min_support)); + printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth)); } From ef4df0b1a9b4e6b73d9e0ada1fc8f79464efdcd0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 2 Dec 2024 02:09:11 -0500 Subject: [PATCH 040/134] remove filter --- src/sv_caller.cpp | 3 +-- src/sv_object.cpp | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3635fe15..b55e3834 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -447,8 +447,7 @@ 
void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ mergeSVs(combined_sv_calls); // Insert breakpoint support and filter SVs with low support - // filterSVsWithLowSupport(combined_sv_calls, breakpoint_depth, 5); - filterSVsWithLowSupport(combined_sv_calls, 10); + // filterSVsWithLowSupport(combined_sv_calls, 10); // Clean up the BAM file, header, and index hts_idx_destroy(idx); diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 479357a2..8529b181 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -180,7 +180,6 @@ void filterSVsWithLowSupport(std::vector& sv_calls, int min_depth) // Remove SV calls with low read depth sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) { return sv_call.read_depth < min_depth; - // return (sv_call.total_support < min_support && (sv_call.end - sv_call.start) < 20000); }), sv_calls.end()); int updated_size = sv_calls.size(); From c79c1da4ee2822284a788e63eca4518ddf2fe976 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 2 Dec 2024 21:21:27 -0500 Subject: [PATCH 041/134] Fix breakpoint error and improve filtering --- src/sv_caller.cpp | 123 ++++++++++++++++------- src/sv_object.cpp | 244 ++++++++++++++++++++++++++++++++-------------- 2 files changed, 255 insertions(+), 112 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b55e3834..b8f934f7 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -74,6 +74,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::string chr = bamHdr->target_name[bam1->core.tid]; uint32_t start = (uint32_t)bam1->core.pos; uint32_t end = (uint32_t)bam_endpos(bam1); // This is the first position after the alignment + end--; // Adjust to the last position of the alignment bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Check for underflow @@ -104,7 +105,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Get the 
supplementary alignment information std::string chr = bamHdr->target_name[bam1->core.tid]; uint32_t start = bam1->core.pos; - uint32_t end = bam_endpos(bam1); + uint32_t end = bam_endpos(bam1); // This is the first position after the alignment + end--; // Adjust to the last position of the alignment bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); // Get CIGAR string information, but don't call SVs @@ -400,6 +402,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; int region_count = region_chunks.size(); int current_region = 0; + int filter_threshold = 4; for (const auto& sub_region : region_chunks) { current_region++; printMessage(chr + ": CIGAR SVs..."); @@ -413,6 +416,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging CIGAR..."); mergeSVs(subregion_sv_calls); + filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); int region_sv_count = getSVCount(subregion_sv_calls); // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -433,6 +437,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging split reads..."); mergeSVs(subregion_sv_calls); + filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); // Combine the SV calls from the current region // std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; @@ -445,6 +450,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Run a final merge on the combined SV calls printMessage(chr + ": Merging final calls..."); mergeSVs(combined_sv_calls); + filterSVsWithLowSupport(combined_sv_calls, filter_threshold); // Insert breakpoint support and filter SVs with low support // filterSVsWithLowSupport(combined_sv_calls, 10); @@ -486,37 +492,53 @@ void SVCaller::run() // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { - // printMessage("Launching thread for chromosome " + chr + "..."); - std::vector sv_calls; - this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); - { - std::lock_guard lock(sv_mutex); - whole_genome_sv_calls[chr] = std::move(sv_calls); - } - printMessage("Completed chromosome " + chr); - - // Notify thread completion - { - std::lock_guard lock(sv_mutex); - active_threads--; - printMessage("Active threads: " + std::to_string(active_threads)); + try { + // printMessage("Launching thread for chromosome " + chr + "..."); + std::vector sv_calls; + this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); + { + std::lock_guard lock(sv_mutex); + whole_genome_sv_calls[chr] = std::move(sv_calls); + } + printMessage("Completed chromosome " + chr); + + // // Notify thread completion + // { + // std::lock_guard lock(sv_mutex); + // active_threads--; + // printMessage("Active threads: " + std::to_string(active_threads)); + // } + // cv.notify_one(); + } catch (const std::exception& e) { + printError("Error processing chromosome " + chr + ": " + e.what()); } - cv.notify_one(); }; // Thread management std::vector threads; for (const auto& chr : chromosomes) { - { - std::unique_lock lock(sv_mutex); - printMessage("Waiting for thread slot. 
Active threads: " + std::to_string(active_threads)); - cv.wait(lock, [&] { return active_threads < max_threads; }); - active_threads++; - printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads)); - } + // Wait for a thread slot + std::unique_lock lock(sv_mutex); + cv.wait(lock, [&] { return threads.size() < max_threads; }); // Launch a new thread - threads.emplace_back(process_chr, chr); + threads.emplace_back([&, chr] { + process_chr(chr); + + // Notify thread completion + std::lock_guard lock(sv_mutex); + cv.notify_one(); + }); + // { + // std::unique_lock lock(sv_mutex); + // printMessage("Waiting for thread slot. Active threads: " + std::to_string(active_threads)); + // cv.wait(lock, [&] { return active_threads < max_threads; }); + // active_threads++; + // printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads)); + // } + + // // Launch a new thread + // threads.emplace_back(process_chr, chr); } // Wait for all threads to complete @@ -551,7 +573,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap { // Find split-read SV evidence int sv_count = 0; - int min_cnv_length = this->input_data.getMinCNVLength(); + uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); for (const auto& entry : primary_map) { std::string qname = entry.first; AlignmentData primary_alignment = entry.second; @@ -584,6 +606,14 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it); if (is_opposite_strand) { if (supp_length >= min_cnv_length) { + + // Print error if the start position is greater than the end + // position + if (supp_start+1 > supp_end+1) { + printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1)); + continue; + } + // printMessage("Running copy 
number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1)); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); double supp_lh = std::get<0>(result); @@ -621,20 +651,30 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap uint32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_before_supp) { boundary_left = primary_start+1; - boundary_right = supp_end+1; + // boundary_right = supp_end+1; + boundary_right = std::max(primary_end, supp_end)+1; gap_left = primary_end+1; gap_right = supp_start+1; - gap_exists = primary_end < supp_start; + gap_exists = gap_left < gap_right; } else { boundary_left = supp_start+1; - boundary_right = primary_end+1; + // boundary_right = primary_end+1; + boundary_right = std::max(primary_end, supp_end)+1; gap_left = supp_end+1; gap_right = primary_start+1; - gap_exists = supp_end < primary_start; + gap_exists = gap_left < gap_right; } // Run copy number variant predictions on the boundary if large enough if (boundary_right - boundary_left >= min_cnv_length) { + + // Print error if the start position is greater than the end + // position + if (boundary_left > boundary_right) { + printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); + continue; + } + // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); double bd_lh = std::get<0>(bd_result); @@ -642,6 +682,14 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // Run copy number variant predictions on the gap if it exists if (gap_exists && gap_right - gap_left >= 
min_cnv_length) { + + // Print error if the start position is greater than the end + // position + if (gap_left > gap_right) { + printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); + continue; + } + // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); double gap_lh = std::get<0>(gap_result); @@ -690,7 +738,7 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.getRefGenome().getContigHeader(); std::vector header_lines = { - std::string("##reference=") + + std::string("##reference=") + this->input_data.getRefGenome().getFilepath(), contig_header, "##INFO=", "##INFO=", @@ -698,6 +746,7 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", "##INFO=", + "##INFO=", "##FILTER=", "##FILTER=", "##FORMAT=", @@ -720,7 +769,8 @@ void SVCaller::saveToVCF(const std::unordered_mapfile_stream.flush(); - std::cout << "Saving SV calls to " << output_vcf << std::endl; - std::string sv_method = "CONTEXTSV" + std::string(VERSION); int skip_count = 0; int total_count = 0; for (const auto& pair : sv_calls) { @@ -758,6 +803,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls) // Merge SV calls if they overlap int initial_size = sv_calls.size(); + + // Merge any SV calls that have >90% reciprocal overlap std::vector merged_sv_calls; - auto it = sv_calls.begin(); - SVCall current_merge = *it++; - double log_lh_eps = 1.0; // Log likelihood epsilon - for (; it != sv_calls.end(); ++it) { - SVCall& next = *it; - - // Find overlap - // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + 
std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); - // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); + SVCall current_merge = sv_calls[0]; + for (size_t i = 1; i < sv_calls.size(); i++) { + SVCall& next = sv_calls[i]; + // Check for overlap if (next.start <= current_merge.end) { + // printMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")"); + // if (current_merge.start <= next.end && next.start <= current_merge.end) { + // Calculate reciprocal overlap + uint32_t overlap = std::max(0, (int)std::min(current_merge.end, next.end) - (int)std::max(current_merge.start, next.start)); + uint32_t union_length = std::max(current_merge.end, next.end) - std::min(current_merge.start, next.start); + double overlap_fraction = static_cast(overlap) / union_length; + // printMessage("Overlap fraction: " + std::to_string(overlap_fraction)); - // Merge based on read support - if (next.support > current_merge.support) { - // Compare only if lengths are within 20% of each other - uint32_t current_length = current_merge.end - current_merge.start; - uint32_t next_length = next.end - next.start; - double length_diff = std::abs((int)current_length - (int)next_length); - double length_threshold = 0.2 * (int)current_length; - if (length_diff <= 
length_threshold) { - current_merge = next; // Continue with the next call - // printMessage("Keeping next SV call with support " + std::to_string(next.support)); - } else { - // Keep the larger SV - if (next_length > current_length) { + // Merge if reciprocal overlap is >90% + if (overlap_fraction > 0.90) { + // printMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction)); + // Keep the SV call with the higher read support + if (next.support > current_merge.support) { + current_merge = next; + } else if (next.support == current_merge.support) { + // Keep the SV call with the higher likelihood + if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) { current_merge = next; - // printMessage("Keeping next SV call with length " + std::to_string(next_length)); - } - } - // printMessage("Keeping next SV call with support " + std::to_string(next.support)); - - } else if (next.support == current_merge.support) { - // Merge based on existence of predictions - if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) { - current_merge = next; // Continue with the next call - // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); - - // Merge based on prediction log likelihood - } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) { - - // Print all SV information - // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); - // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and 
read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); - // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood)); - - // Keep the SV call with the higher likelihood. Compare only if - // lengths are within 20% of each other - uint32_t current_length = current_merge.end - current_merge.start; - uint32_t next_length = next.end - next.start; - double length_diff = std::abs((int)current_length - (int)next_length); - double length_threshold = 0.2 * (int)current_length; - if (length_diff <= length_threshold) { - // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold)); - - if (next.hmm_likelihood > current_merge.hmm_likelihood) { - current_merge = next; // Continue with the next call - // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); - } - - } else { - // Keep the larger SV - if (next_length > current_length) { + } else if (next.hmm_likelihood == current_merge.hmm_likelihood) { + // Keep the SV call with the higher read depth + if (next.read_depth > current_merge.read_depth) { current_merge = next; - // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length)); } } + // // Keep the SV call with the higher read depth + // if (next.read_depth > current_merge.read_depth) { + // current_merge = next; + // } else if (next.read_depth == current_merge.read_depth) { + // // Keep the SV call with the higher likelihood + // if (next.hmm_likelihood > current_merge.hmm_likelihood) { + // current_merge = next; + // } + // } } + } else { + merged_sv_calls.push_back(current_merge); + current_merge = next; } - } else { - // No overlap: Save the call and continue - merged_sv_calls.emplace_back(current_merge); + merged_sv_calls.push_back(current_merge); 
current_merge = next; } } - merged_sv_calls.emplace_back(current_merge); // Save the last call - sv_calls = merged_sv_calls; // Update the SV calls + + // Add the last SV call + merged_sv_calls.push_back(current_merge); + + // Update the SV calls + sv_calls = merged_sv_calls; + // for (size_t i = 0; i < sv_calls.size(); i++) { + // SVCall& current = sv_calls[i]; + // bool merged = false; + // for (size_t j = i + 1; j < sv_calls.size(); j++) { + // SVCall& next = sv_calls[j]; + // if (current.start <= next.end && next.start <= current.end) { + // // Calculate reciprocal overlap + // uint32_t overlap = std::max(0, (int)std::min(current.end, next.end) - (int)std::max(current.start, next.start)); + // uint32_t union_length = std::max(current.end, next.end) - std::min(current.start, next.start); + // double overlap_fraction = static_cast(overlap) / union_length; + + // // Merge if reciprocal overlap is >90% + // if (overlap_fraction > 0.9) { + // // Keep the SV call with the higher likelihood + // if (next.hmm_likelihood > current.hmm_likelihood) { + // current = next; + // } + // merged = true; + // } + + // // Remove the merged SV call + // sv_calls.erase(sv_calls.begin() + j); + // j--; + + // } + // if (!merged) { + // merged_sv_calls.push_back(current); + // } + // } + + + + // std::vector merged_sv_calls; + // auto it = sv_calls.begin(); + // SVCall current_merge = *it++; + // double log_lh_eps = 1.0; // Log likelihood epsilon + // for (; it != sv_calls.end(); ++it) { + // SVCall& next = *it; + + // // Find overlap + // // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); + // // printMessage("[0] Next SV call: " + 
std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); + // if (next.start <= current_merge.end) { + + // // Merge based on read support + // if (next.support > current_merge.support) { + // // Compare only if lengths are within 20% of each other + // uint32_t current_length = current_merge.end - current_merge.start; + // uint32_t next_length = next.end - next.start; + // double length_diff = std::abs((int)current_length - (int)next_length); + // double length_threshold = 0.2 * (int)current_length; + // if (length_diff <= length_threshold) { + // current_merge = next; // Continue with the next call + // // printMessage("Keeping next SV call with support " + std::to_string(next.support)); + // } else { + // // Keep the larger SV + // if (next_length > current_length) { + // current_merge = next; + // // printMessage("Keeping next SV call with length " + std::to_string(next_length)); + // } + // } + // // printMessage("Keeping next SV call with support " + std::to_string(next.support)); + + // } else if (next.support == current_merge.support) { + // // Merge based on existence of predictions + // if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) { + // current_merge = next; // Continue with the next call + // // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); + + // // Merge based on prediction log likelihood + // } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) { + + // // Print all SV information + // // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + 
" and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); + // // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); + // // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood)); + + // // Keep the SV call with the higher likelihood. Compare only if + // // lengths are within 20% of each other + // uint32_t current_length = current_merge.end - current_merge.start; + // uint32_t next_length = next.end - next.start; + // double length_diff = std::abs((int)current_length - (int)next_length); + // double length_threshold = 0.2 * (int)current_length; + // if (length_diff <= length_threshold) { + // // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold)); + + // if (next.hmm_likelihood > current_merge.hmm_likelihood) { + // current_merge = next; // Continue with the next call + // // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); + // } + + // } else { + // // Keep the larger SV + // if (next_length > current_length) { + // current_merge = next; + // // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length)); + // } + // } + // } + // } + + // } else { + // // No overlap: Save the call and continue + // merged_sv_calls.emplace_back(current_merge); + // current_merge = next; + // } + // } + // merged_sv_calls.emplace_back(current_merge); // Save the last call + // sv_calls = merged_sv_calls; // Update the SV calls int updated_size = sv_calls.size(); std::cout << "Merged " << initial_size << " SV calls into " << 
updated_size << " SV calls" << std::endl; } -void filterSVsWithLowSupport(std::vector& sv_calls, int min_depth) +void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) { int prev_size = sv_calls.size(); - // Print read depth for each SV call - for (const auto& sv_call : sv_calls) { - std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl; - } - - // Remove SV calls with low read depth - sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) { - return sv_call.read_depth < min_depth; + // Filter SV calls with low read support + sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { + return sv_call.support < min_support; }), sv_calls.end()); - int updated_size = sv_calls.size(); - printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth)); + // // Print read depth for each SV call + // for (const auto& sv_call : sv_calls) { + // std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl; + // } + + // // Remove SV calls with low read depth + // sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) { + // return sv_call.read_depth < min_depth; + // }), sv_calls.end()); + + // int updated_size = sv_calls.size(); + // printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth)); } From 3afeaac11841b31579d587992e0fa37c520c8e7a Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 2 Dec 2024 21:59:23 -0500 Subject: [PATCH 042/134] Update filtering --- src/sv_caller.cpp | 9 +++------ 1 file changed, 3 insertions(+), 
6 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b8f934f7..d4182cbb 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -415,8 +415,8 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // SuppMap& supp_map = std::get<2>(region_data); // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging CIGAR..."); - mergeSVs(subregion_sv_calls); filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); + mergeSVs(subregion_sv_calls); int region_sv_count = getSVCount(subregion_sv_calls); // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -436,8 +436,8 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging split reads..."); - mergeSVs(subregion_sv_calls); filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); + mergeSVs(subregion_sv_calls); // Combine the SV calls from the current region // std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; @@ -450,10 +450,7 @@ void SVCaller::processChromosome(const std::string& chr, const std::string& bam_ // Run a final merge on the combined SV calls printMessage(chr + ": Merging final calls..."); mergeSVs(combined_sv_calls); - filterSVsWithLowSupport(combined_sv_calls, filter_threshold); - - // Insert breakpoint support and filter SVs with low support - // filterSVsWithLowSupport(combined_sv_calls, 10); + // filterSVsWithLowSupport(combined_sv_calls, filter_threshold); // Clean up the BAM file, header, and index hts_idx_destroy(idx); From 6f6ecc58aee989fe0abbc05240af09fd5c61c9c0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 4 Dec 2024 14:07:30 -0500 Subject: [PATCH 043/134] Improve multithreading --- .gitignore | 5 ++++ src/sv_caller.cpp | 68 +++++++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/.gitignore b/.gitignore index f2253893..343adf0f 100644 --- a/.gitignore +++ b/.gitignore @@ -91,3 +91,8 @@ python/dist_plots # Temporary files lib/.nfs* valgrind.log + +# Log files +*.log +*.err +*.out diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index d4182cbb..f23fe002 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -481,16 +481,12 @@ void SVCaller::run() // Set up threads for processing each chromosome const int max_threads = this->input_data.getThreadCount(); std::cout << "Using " << max_threads << " threads for processing..." 
<< std::endl; - std::vector> futures; std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; - std::condition_variable cv; - int active_threads = 0; // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { try { - // printMessage("Launching thread for chromosome " + chr + "..."); std::vector sv_calls; this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); { @@ -498,52 +494,58 @@ void SVCaller::run() whole_genome_sv_calls[chr] = std::move(sv_calls); } printMessage("Completed chromosome " + chr); - - // // Notify thread completion - // { - // std::lock_guard lock(sv_mutex); - // active_threads--; - // printMessage("Active threads: " + std::to_string(active_threads)); - // } - // cv.notify_one(); } catch (const std::exception& e) { printError("Error processing chromosome " + chr + ": " + e.what()); } }; // Thread management - std::vector threads; + // std::vector threads; + std::vector> futures; + std::atomic active_threads(0); + std::mutex cv_mutex; + std::condition_variable cv; for (const auto& chr : chromosomes) { // Wait for a thread slot - std::unique_lock lock(sv_mutex); - cv.wait(lock, [&] { return threads.size() < max_threads; }); + { + std::unique_lock lock(cv_mutex); + cv.wait(lock, [&] { return active_threads.load() < max_threads; }); + active_threads.fetch_add(1); + } - // Launch a new thread - threads.emplace_back([&, chr] { + // Launch a task + futures.push_back(std::async(std::launch::async, [&, chr] { process_chr(chr); + { + std::lock_guard lock(cv_mutex); + active_threads.fetch_sub(1); - // Notify thread completion - std::lock_guard lock(sv_mutex); - cv.notify_one(); - }); - // { - // std::unique_lock lock(sv_mutex); - // printMessage("Waiting for thread slot. 
Active threads: " + std::to_string(active_threads)); - // cv.wait(lock, [&] { return active_threads < max_threads; }); - // active_threads++; - // printMessage("Launching thread for chromosome " + chr + ". Active threads: " + std::to_string(active_threads)); + // Notify threads waiting for a slot + cv.notify_all(); + } + })); + // while (active_threads.load() >= max_threads) { + // std::this_thread::yield(); // } // // Launch a new thread - // threads.emplace_back(process_chr, chr); + // threads.emplace_back([&, chr] { + // active_threads.fetch_add(1); + // process_chr(chr); + // active_threads.fetch_sub(1); + // }); } // Wait for all threads to complete - for (auto& thread : threads) { - if (thread.joinable()) { - thread.join(); - } + printMessage("Waiting for all threads to finish..."); + for (auto& future : futures) { + future.get(); } + // for (auto& thread : threads) { + // if (thread.joinable()) { + // thread.join(); + // } + // } printMessage("All threads have finished."); @@ -560,8 +562,6 @@ void SVCaller::run() // Save to VCF std::cout << "Saving SVs to VCF..." 
<< std::endl; this->saveToVCF(whole_genome_sv_calls); - - // return whole_genome_sv_calls; } From 1486176b14738e1d8f78727b89fb0f9957be3ee5 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 4 Dec 2024 19:45:59 -0500 Subject: [PATCH 044/134] Fix warnings --- Makefile-cpp | 2 +- include/sv_caller.h | 2 +- src/sv_caller.cpp | 31 ++++++++++++++++--------------- 3 files changed, 18 insertions(+), 17 deletions(-) diff --git a/Makefile-cpp b/Makefile-cpp index 58139e9c..55630e9b 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -18,7 +18,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib # Compiler and Flags CXX := g++ -CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) +CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries LDLIBS := -lhts # Link with libhts.a or libhts.so diff --git a/include/sv_caller.h b/include/sv_caller.h index 82a833f3..c0f9ce23 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -36,7 +36,7 @@ class SVCaller { // mismatch rate, and the start and end positions of the query sequence void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector& pos_depth_map); - void processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector& combined_sv_calls, int min_cnv_length); + void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index f23fe002..4d1cf0ef 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -261,7 +261,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, "DEL", ".", "CIGARDEL", "./.", default_lh, read_depth); + addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); } // Check if the CIGAR operation is a clipped base @@ -340,9 +340,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec query_info = std::tuple, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end); } -void SVCaller::processChromosome(const std::string& chr, const std::string& bam_filepath, const CHMM& hmm, std::vector& combined_sv_calls, int min_cnv_length) +void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls) { // Open the BAM file + std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (!fp_in) { throw std::runtime_error("ERROR: failed to open " + bam_filepath); @@ -469,9 +470,9 @@ void SVCaller::run() } // Ignore all alternate contigs (contains 'alt', 'GL', 'NC', 'hs', etc.) 
- chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) { - return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos; - }), chromosomes.end()); + // chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) { + // return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos; + // }), chromosomes.end()); // Read the HMM from the file std::string hmm_filepath = this->input_data.getHMMFilepath(); @@ -488,7 +489,7 @@ void SVCaller::run() auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; - this->processChromosome(chr, this->input_data.getLongReadBam(), hmm, sv_calls, this->input_data.getMinCNVLength()); + this->processChromosome(chr, hmm, sv_calls); { std::lock_guard lock(sv_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); @@ -617,18 +618,18 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap SVType supp_type = std::get<1>(result); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); if (supp_type == SVType::NEUTRAL) { - addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "HMM", "./.", supp_lh, read_depth); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "HMM", "./.", supp_lh, read_depth); sv_count++; } else if (supp_type == SVType::DUP) { int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); - addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", ".", "HMM", "./.", supp_lh, read_depth); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); } } else { // Add the inversion without running copy number predictions // (too small for predictions) int read_depth = this->calculateReadDepth(pos_depth_map, 
supp_start+1, supp_end+1); - addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", ".", "REV", "./.", 0.0, read_depth); + addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "REV", "./.", 0.0, read_depth); } } } @@ -695,16 +696,19 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); - addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), ".", "GAP", "./.", gap_lh, read_depth); + std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; + addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth); + std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), ".", "BOUNDARY", "./.", bd_lh, read_depth); + std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); } } } @@ -851,9 +855,6 @@ void SVCaller::saveToVCF(const std::unordered_map lock(queue_mutex); + stop = true; + } + condition.notify_all(); + for(std::thread &worker: workers) + worker.join(); +} + +#endif diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 9002e2b6..2b3a48b4 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -46,7 +46,10 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair(), 0.0); } prediction = testVit_CHMM(hmm, data_count, snp_data.log2_cov, snp_data.baf, snp_data.pfb); } @@ -140,7 +143,9 @@ std::tuple CNVCaller::runCopyNumberPrediction // Check that the start position is less than the end position if (start_pos >= end_pos) { - throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + // throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); } // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 @@ -167,6 +172,11 @@ std::tuple CNVCaller::runCopyNumberPrediction // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")..."); std::pair, double> prediction; runViterbi(hmm, sv_snps, prediction); + if (prediction.first.size() == 0) + { + return std::make_tuple(0.0, SVType::UNKNOWN, "./.", 
sv_snps_found); + } + std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -225,9 +235,6 @@ std::tuple CNVCaller::runCopyNumberPrediction void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector &sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) { - int min_length = this->input_data.getMinCNVLength(); - int window_size = this->input_data.getWindowSize(); - // Map with counts for each CNV type std::map cnv_type_counts; for (int i = 0; i < 6; i++) @@ -236,6 +243,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorinput_data.getMinCNVLength(); for (auto& sv_call : sv_candidates) { @@ -386,7 +394,6 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 // Calculate the mean chromosome coverage std::pair> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len) { - // std::unordered_map chr_pos_depth_map; std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index { // Lock the bam file @@ -397,7 +404,10 @@ std::pair> CNVCaller::calculateMeanChromosomeCover samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) { - throw std::runtime_error("ERROR: Could not open BAM file: " + bam_filepath); + // throw std::runtime_error("ERROR: Could not open BAM file: " + + // bam_filepath); + printError("ERROR: Could not open BAM file: " + bam_filepath); + return std::make_pair(0.0, chr_pos_depth_map); } // Enable multi-threading @@ -408,7 +418,9 @@ std::pair> CNVCaller::calculateMeanChromosomeCover if (!bam_header) { sam_close(bam_file); - throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath); + printError("ERROR: Could not read header from BAM file: " + bam_filepath); + return std::make_pair(0.0, chr_pos_depth_map); + // throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath); } // Load the index @@ -417,7 +429,10 @@ std::pair> 
CNVCaller::calculateMeanChromosomeCover { bam_hdr_destroy(bam_header); sam_close(bam_file); - throw std::runtime_error("ERROR: Could not load index for BAM file: " + bam_filepath); + // throw std::runtime_error("ERROR: Could not load index for BAM + // file: " + bam_filepath); + printError("ERROR: Could not load index for BAM file: " + bam_filepath); + return std::make_pair(0.0, chr_pos_depth_map); } // Create an iterator for the chromosome @@ -427,7 +442,11 @@ std::pair> CNVCaller::calculateMeanChromosomeCover hts_idx_destroy(bam_index); bam_hdr_destroy(bam_header); sam_close(bam_file); - throw std::runtime_error("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file."); + // throw std::runtime_error("ERROR: Could not create iterator for + // chromosome: " + chr + ", check if the chromosome exists in the + // BAM file."); + printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file."); + return std::make_pair(0.0, chr_pos_depth_map); } // Initialize the record @@ -438,7 +457,10 @@ std::pair> CNVCaller::calculateMeanChromosomeCover hts_idx_destroy(bam_index); bam_hdr_destroy(bam_header); sam_close(bam_file); - throw std::runtime_error("ERROR: Could not initialize BAM record."); + // throw std::runtime_error("ERROR: Could not initialize BAM + // record."); + printError("ERROR: Could not initialize BAM record."); + return std::make_pair(0.0, chr_pos_depth_map); } // Iterate through the chromosome and update the depth map @@ -469,7 +491,9 @@ std::pair> CNVCaller::calculateMeanChromosomeCover try { chr_pos_depth_map[ref_pos + j]++; } catch (const std::out_of_range& oor) { - std::cerr << "Out of range error for " << chr << ":" << ref_pos+j << std::endl; + // std::cerr << "Out of range error for " << chr << + // ":" << ref_pos+j << std::endl; + printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j)); } // chr_pos_depth_map[ref_pos + 
j]++; } @@ -482,7 +506,9 @@ std::pair> CNVCaller::calculateMeanChromosomeCover } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { // Do nothing } else { - throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); + // throw std::runtime_error("ERROR: Unknown CIGAR operation: + // " + std::to_string(op)); + printError("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } } } @@ -554,14 +580,18 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui std::string snp_filepath = this->input_data.getSNPFilepath(); if (snp_filepath.empty()) { - throw std::runtime_error("ERROR: SNP file path is empty."); + // throw std::runtime_error("ERROR: SNP file path is empty."); + printError("ERROR: SNP file path is empty."); + return; } // Initialize the synced reader bcf_srs_t *snp_reader = bcf_sr_init(); if (!snp_reader) { - throw std::runtime_error("ERROR: Could not initialize SNP reader."); + // throw std::runtime_error("ERROR: Could not initialize SNP reader."); + printError("ERROR: Could not initialize SNP reader."); + return; } // Lock during reading @@ -572,7 +602,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { bcf_sr_destroy(snp_reader); - throw std::runtime_error("ERROR: Could not set region for SNP reader: " + region_str); + // throw std::runtime_error("ERROR: Could not set region for SNP reader: + // " + region_str); + printError("ERROR: Could not set region for SNP reader: " + region_str); + return; } // Set multi-threading @@ -586,7 +619,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) { bcf_sr_destroy(snp_reader); - throw std::runtime_error("ERROR: Could not add SNP file to reader: " + snp_filepath); + // throw std::runtime_error("ERROR: Could not add SNP file to 
reader: " + // + snp_filepath); + printError("ERROR: Could not add SNP file to reader: " + snp_filepath); + return; } // Get the header @@ -594,7 +630,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (!snp_header) { bcf_sr_destroy(snp_reader); - throw std::runtime_error("ERROR: Could not get header for SNP reader."); + // throw std::runtime_error("ERROR: Could not get header for SNP + // reader."); + printError("ERROR: Could not get header for SNP reader."); + return; } // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl; @@ -670,7 +709,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { // std::cerr << "ERROR: AD value is missing for SNP at " << chr // << ":" << pos << std::endl; - throw std::runtime_error("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos)); + // throw std::runtime_error("ERROR: AD value is missing for SNP + // at " + chr + ":" + std::to_string(pos)); + printError("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos)); + continue; } // Calculate the B-allele frequency (BAF) @@ -744,7 +786,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos bcf_srs_t *pfb_reader = bcf_sr_init(); if (!pfb_reader) { - throw std::runtime_error("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath); + // throw std::runtime_error("ERROR: Could not initialize synced reader + // for population frequency file: " + pfb_filepath); + printError("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath); + return; } // Lock during reading @@ -755,7 +800,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0) { bcf_sr_destroy(pfb_reader); - throw std::runtime_error("ERROR: Could not set region for synced reader: " + 
region_str); + // throw std::runtime_error("ERROR: Could not set region for synced + // reader: " + region_str); + printError("ERROR: Could not set region for synced reader: " + region_str); + return; } // Set multi-threading @@ -768,7 +816,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { bcf_sr_destroy(pfb_reader); - throw std::runtime_error("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath); + // throw std::runtime_error("ERROR: Could not add population frequency + // file to synced reader: " + pfb_filepath); + printError("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath); + return; } // Get the header @@ -776,7 +827,10 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos if (!pfb_header) { bcf_sr_destroy(pfb_reader); - throw std::runtime_error("ERROR: Could not get header for population frequency file: " + pfb_filepath); + // throw std::runtime_error("ERROR: Could not get header for population + // frequency file: " + pfb_filepath); + printError("ERROR: Could not get header for population frequency file: " + pfb_filepath); + return; } int record_count = 0; @@ -832,7 +886,9 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos } if (pfb_reader->errnum) { - std::cerr << "ERROR: " <errnum) << std::endl; + // std::cerr << "ERROR: " <errnum) << + // std::endl; + printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); } // Clean up diff --git a/src/khmm.cpp b/src/khmm.cpp index 22d4a269..b5b5bf02 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -12,6 +12,8 @@ #include /// @endcond +#include "utils.h" + #define STATE_CHANGE 100000.0 /*this is the expected changes (D value) in the transition matrix*/ #define VITHUGE 100000000000.0 #define FLOAT_MINIMUM 1.175494351e-38 /*this is indeed machine dependent*/ @@ -423,7 +425,9 @@ CHMM 
ReadCHMM(const std::string filename) std::ifstream file(filename); if (!file.is_open()) { - throw std::runtime_error("Error opening file"); + // throw std::runtime_error("Error opening file"); + printError("Error opening file"); + return CHMM(); } CHMM hmm; @@ -433,87 +437,115 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (sscanf(line.c_str(), "M=%d", &hmm.M) != 1) { - throw std::runtime_error("Error reading M"); + // throw std::runtime_error("Error reading M"); + printError("Error reading M"); + return CHMM(); } // Read N std::getline(file, line); if (sscanf(line.c_str(), "N=%d", &hmm.N) != 1) { - throw std::runtime_error("Error reading N"); + // throw std::runtime_error("Error reading N"); + printError("Error reading N"); + return CHMM(); } // Read A std::getline(file, line); if (line != "A:") { - throw std::runtime_error("Error reading A"); + // throw std::runtime_error("Error reading A"); + printError("Error reading A"); + return CHMM(); } hmm.A = readMatrix(file, hmm.N, hmm.N); if (hmm.A.size() != (size_t)hmm.N || hmm.A[0].size() != (size_t)hmm.N) { - throw std::runtime_error("Error reading A"); + // throw std::runtime_error("Error reading A"); + printError("Error reading A"); + return CHMM(); } // Read B std::getline(file, line); if (line != "B:") { - throw std::runtime_error("Error reading B"); + // throw std::runtime_error("Error reading B"); + printError("Error reading B"); + return CHMM(); } hmm.B = readMatrix(file, hmm.N, hmm.M); if (hmm.B.size() != (size_t)hmm.N || hmm.B[0].size() != (size_t)hmm.M) { - throw std::runtime_error("Error reading B"); + // throw std::runtime_error("Error reading B"); + printError("Error reading B"); + return CHMM(); } // Read pi std::getline(file, line); if (line != "pi:") { - throw std::runtime_error("Error reading pi"); + // throw std::runtime_error("Error reading pi"); + printError("Error reading pi"); + return CHMM(); } hmm.pi = readVector(file, hmm.N); if (hmm.pi.size() != (size_t)hmm.N) { - 
throw std::runtime_error("Error reading pi"); + // throw std::runtime_error("Error reading pi"); + printError("Error reading pi"); + return CHMM(); } // Read B1_mean std::getline(file, line); if (line != "B1_mean:") { - throw std::runtime_error("Error reading B1_mean"); + // throw std::runtime_error("Error reading B1_mean"); + printError("Error reading B1_mean"); + return CHMM(); } hmm.B1_mean = readVector(file, hmm.N); if (hmm.B1_mean.size() != (size_t)hmm.N) { - throw std::runtime_error("Error reading B1_mean"); + // throw std::runtime_error("Error reading B1_mean"); + printError("Error reading B1_mean"); + return CHMM(); } // Read B1_sd std::getline(file, line); if (line != "B1_sd:") { - throw std::runtime_error("Error reading B1_sd"); + // throw std::runtime_error("Error reading B1_sd"); + printError("Error reading B1_sd"); + return CHMM(); } hmm.B1_sd = readVector(file, hmm.N); if (hmm.B1_sd.size() != (size_t)hmm.N) { - throw std::runtime_error("Error reading B1_sd"); + // throw std::runtime_error("Error reading B1_sd"); + printError("Error reading B1_sd"); + return CHMM(); } // Read B1_uf std::getline(file, line); if (line != "B1_uf:") { - throw std::runtime_error("Error reading B1_uf"); + // throw std::runtime_error("Error reading B1_uf"); + printError("Error reading B1_uf"); + return CHMM(); } std::getline(file, line); try { hmm.B1_uf = std::stod(line); } catch (const std::invalid_argument& e) { - throw std::runtime_error("Error reading B1_uf"); + // throw std::runtime_error("Error reading B1_uf"); + printError("Error reading B1_uf"); + return CHMM(); } // Read B2_mean diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 4d1cf0ef..c79d3149 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -20,6 +20,7 @@ #include #include +#include "ThreadPool.h" #include "utils.h" #include "sv_types.h" #include "version.h" @@ -398,6 +399,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v CNVCaller cnv_caller(this->input_data); 
// cnv_caller.loadChromosomeData(chr); std::pair> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len); + if (chr_data.first == 0.0 || chr_data.second.size() == 0) { + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + return; + } // Process each chunk one at a time // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; @@ -468,20 +475,18 @@ void SVCaller::run() } else { chromosomes = this->input_data.getRefGenomeChromosomes(); } - - // Ignore all alternate contigs (contains 'alt', 'GL', 'NC', 'hs', etc.) - // chromosomes.erase(std::remove_if(chromosomes.begin(), chromosomes.end(), [](const std::string& chr) { - // return chr.find("alt") != std::string::npos || chr.find("GL") != std::string::npos || chr.find("NC") != std::string::npos || chr.find("hs") != std::string::npos; - // }), chromosomes.end()); // Read the HMM from the file std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); - // Set up threads for processing each chromosome + // Set up thread pool const int max_threads = this->input_data.getThreadCount(); std::cout << "Using " << max_threads << " threads for processing..." << std::endl; + ThreadPool pool(max_threads); + + // Shared resources std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; @@ -497,58 +502,34 @@ void SVCaller::run() printMessage("Completed chromosome " + chr); } catch (const std::exception& e) { printError("Error processing chromosome " + chr + ": " + e.what()); + } catch (...) 
{ + printError("Unknown error processing chromosome " + chr); } }; - // Thread management - // std::vector threads; + // Futures vector std::vector> futures; - std::atomic active_threads(0); - std::mutex cv_mutex; - std::condition_variable cv; - for (const auto& chr : chromosomes) { - // Wait for a thread slot - { - std::unique_lock lock(cv_mutex); - cv.wait(lock, [&] { return active_threads.load() < max_threads; }); - active_threads.fetch_add(1); - } - // Launch a task - futures.push_back(std::async(std::launch::async, [&, chr] { + // Submit tasks to the thread pool and track futures + for (const auto& chr : chromosomes) { + futures.emplace_back(pool.enqueue([&, chr] { + printMessage("Processing chromosome " + chr); process_chr(chr); - { - std::lock_guard lock(cv_mutex); - active_threads.fetch_sub(1); - - // Notify threads waiting for a slot - cv.notify_all(); - } })); - // while (active_threads.load() >= max_threads) { - // std::this_thread::yield(); - // } - - // // Launch a new thread - // threads.emplace_back([&, chr] { - // active_threads.fetch_add(1); - // process_chr(chr); - // active_threads.fetch_sub(1); - // }); } - // Wait for all threads to complete - printMessage("Waiting for all threads to finish..."); + // Wait for all tasks to complete for (auto& future : futures) { - future.get(); + try { + future.get(); + printMessage("Chromosome task completed."); + } catch (const std::exception& e) { + printError("Error processing chromosome task: " + std::string(e.what())); + } catch (...) 
{ + printError("Unknown error processing chromosome task."); + } } - // for (auto& thread : threads) { - // if (thread.joinable()) { - // thread.join(); - // } - // } - - printMessage("All threads have finished."); + printMessage("All tasks have finished."); // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; @@ -614,6 +595,10 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1)); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); + if (std::get<1>(result) == SVType::UNKNOWN) { + continue; + } + double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); @@ -675,6 +660,9 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); + if (std::get<1>(bd_result) == SVType::UNKNOWN) { + continue; + } double bd_lh = std::get<0>(bd_result); SVType bd_type = std::get<1>(bd_result); @@ -690,6 +678,9 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); + if (std::get<1>(gap_result) == SVType::UNKNOWN) { + continue; + } double gap_lh = std::get<0>(gap_result); SVType gap_type = std::get<1>(gap_result); 
From 934350f34026db79b23140bb6806e517c25f87cb Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 5 Dec 2024 16:53:05 -0500 Subject: [PATCH 046/134] Calculate ME rates --- python/mendelian_inheritance.py | 51 +++++++++++++++++++++++++++++++++ python/plot_venn.py | 5 ++-- src/sv_caller.cpp | 1 - 3 files changed, 54 insertions(+), 3 deletions(-) create mode 100644 python/mendelian_inheritance.py diff --git a/python/mendelian_inheritance.py b/python/mendelian_inheritance.py new file mode 100644 index 00000000..911b994b --- /dev/null +++ b/python/mendelian_inheritance.py @@ -0,0 +1,51 @@ +import csv +import sys + + +def read_tsv(file_path): + with open(file_path, 'r') as file: + reader = csv.reader(file, delimiter='\t') + return [row for row in reader] + +def calculate_mendelian_error(father_genotype, mother_genotype, child_genotype): + # Generate all possible child genotypes + child_genotypes = set() + for allele1 in father_genotype.split('/'): + for allele2 in mother_genotype.split('/'): + child_genotypes.add('/'.join(sorted([allele1, allele2]))) + + # Check if the child genotype is valid + return 0 if child_genotype in child_genotypes else 1 + + +def main(father_file, mother_file, child_file): + father_records = read_tsv(father_file) + mother_records = read_tsv(mother_file) + child_records = read_tsv(child_file) + + if len(father_records) != len(mother_records) or len(father_records) != len(child_records): + raise ValueError("All files must have the same number of records") + + total_records = len(father_records) + error_count = 0 + + for i in range(total_records): + father_genotype = father_records[i][5] + mother_genotype = mother_records[i][5] + child_genotype = child_records[i][5] + + error_count += calculate_mendelian_error(father_genotype, mother_genotype, child_genotype) + + error_rate = error_count / total_records + print(f"Mendelian Inheritance Error Rate: {error_rate:.2%} for {total_records} SVs") + +if __name__ == "__main__": + if len(sys.argv) != 4: + 
print("Usage: python mendelian_inheritance.py ") + sys.exit(1) + + father_file = sys.argv[1] + mother_file = sys.argv[2] + child_file = sys.argv[3] + + main(father_file, mother_file, child_file) diff --git a/python/plot_venn.py b/python/plot_venn.py index eb7e8e78..757f4408 100644 --- a/python/plot_venn.py +++ b/python/plot_venn.py @@ -4,7 +4,7 @@ import matplotlib.pyplot as plt -def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB): +def plot_venn(AB, Ab, aB, output, plot_title, title_Ab, title_aB): plt.figure(figsize=(8, 8)) print('AB:', AB) @@ -27,7 +27,8 @@ def plot_venn(AB, Ab, aB, output, title_AB, title_Ab, title_aB): venn.get_label_by_id('11').set_text(str(AB)) # Update the title - plt.title("contextsv and " + title_aB + " venn diagram (all SV types)") + # plt.title("contextsv and " + title_aB + " venn diagram (all SV types)") + plt.title(plot_title) plt.savefig(output) plt.close() diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index c79d3149..95bf1441 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -397,7 +397,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // std::cout << "Loading chromosome data for copy number predictions..." 
<< std::endl; printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); - // cnv_caller.loadChromosomeData(chr); std::pair> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len); if (chr_data.first == 0.0 || chr_data.second.size() == 0) { hts_idx_destroy(idx); From d5923fecd65ac2a0004fe66cc86132b2dcf9b1e6 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 6 Dec 2024 15:05:02 -0500 Subject: [PATCH 047/134] Add ME debug output --- python/mendelian_inheritance.py | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/python/mendelian_inheritance.py b/python/mendelian_inheritance.py index 911b994b..a8050e53 100644 --- a/python/mendelian_inheritance.py +++ b/python/mendelian_inheritance.py @@ -13,6 +13,10 @@ def calculate_mendelian_error(father_genotype, mother_genotype, child_genotype): for allele1 in father_genotype.split('/'): for allele2 in mother_genotype.split('/'): child_genotypes.add('/'.join(sorted([allele1, allele2]))) + + # Print the parent and child genotypes if invalid + # if child_genotype not in child_genotypes: + # print(f"ME: Father: {father_genotype}, Mother: {mother_genotype}, Child: {child_genotype}") # Check if the child genotype is valid return 0 if child_genotype in child_genotypes else 1 @@ -29,15 +33,38 @@ def main(father_file, mother_file, child_file): total_records = len(father_records) error_count = 0 + sv_type_dict = {} + sv_type_error_dict = {} + for i in range(total_records): father_genotype = father_records[i][5] mother_genotype = mother_records[i][5] child_genotype = child_records[i][5] + child_sv_type = child_records[i][2] + sv_type_dict[child_sv_type] = sv_type_dict.get(child_sv_type, 0) + 1 + + # Print SV size if error occurs + error_value = calculate_mendelian_error(father_genotype, mother_genotype, child_genotype) + if error_value == 1: + # print(f"SV size: {father_records[i][2]}") + sv_type_error_dict[child_sv_type] = 
sv_type_error_dict.get(child_sv_type, 0) + 1 + + error_count += error_value + # error_count += calculate_mendelian_error(father_genotype, mother_genotype, child_genotype) + + if total_records == 0: + error_rate = 0 + print("No records found") + else: + error_rate = error_count / total_records - error_count += calculate_mendelian_error(father_genotype, mother_genotype, child_genotype) + print(f"Mendelian Inheritance Error Rate: {error_rate:.2%} for {total_records} shared trio SVs") - error_rate = error_count / total_records - print(f"Mendelian Inheritance Error Rate: {error_rate:.2%} for {total_records} SVs") + print("SV Type Distribution:") + for sv_type, count in sv_type_dict.items(): + error_count = sv_type_error_dict.get(sv_type, 0) + error_rate = error_count / count + print(f"{sv_type}: {error_rate:.2%} ({error_count}/{count})") if __name__ == "__main__": if len(sys.argv) != 4: From 7e3bac451a6204c33f49e44b38d0e793235afeed Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 11 Dec 2024 12:19:28 -0500 Subject: [PATCH 048/134] Fix alt allele error --- python/mendelian_inheritance.py | 4 +- src/cnv_caller.cpp | 16 +++- src/sv_caller.cpp | 49 +++++++--- src/sv_object.cpp | 152 +++++--------------------------- 4 files changed, 74 insertions(+), 147 deletions(-) diff --git a/python/mendelian_inheritance.py b/python/mendelian_inheritance.py index a8050e53..128b1d1a 100644 --- a/python/mendelian_inheritance.py +++ b/python/mendelian_inheritance.py @@ -15,8 +15,8 @@ def calculate_mendelian_error(father_genotype, mother_genotype, child_genotype): child_genotypes.add('/'.join(sorted([allele1, allele2]))) # Print the parent and child genotypes if invalid - # if child_genotype not in child_genotypes: - # print(f"ME: Father: {father_genotype}, Mother: {mother_genotype}, Child: {child_genotype}") + if child_genotype not in child_genotypes: + print(f"ME: Father: {father_genotype}, Mother: {mother_genotype}, Child: {child_genotype}") # Check if the child genotype is valid 
return 0 if child_genotype in child_genotypes else 1 diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 2b3a48b4..4eb0bc26 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -338,15 +338,23 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector region_chunks; - int chunk_count = 100; + //int chunk_count = 100; + int chunk_count = 1; uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr); if (this->input_data.isRegionSet()) { @@ -420,7 +421,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // std::set& subregion_sv_calls = std::get<0>(region_data); // PrimaryMap& primary_map = std::get<1>(region_data); // SuppMap& supp_map = std::get<2>(region_data); - // std::cout << "Merge CIGAR SV calls from " << sub_region << "..." << std::endl; + // std::cout << " CIGAR SV calls from " << sub_region << "..." << std::endl; printMessage(chr + ": Merging CIGAR..."); filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); mergeSVs(subregion_sv_calls); @@ -552,19 +553,25 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // Find split-read SV evidence int sv_count = 0; uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); + int primary_count = primary_map.size(); + int current_primary = 0; for (const auto& entry : primary_map) { + current_primary++; std::string qname = entry.first; AlignmentData primary_alignment = entry.second; std::string primary_chr = std::get<0>(primary_alignment); uint32_t primary_start = std::get<1>(primary_alignment); uint32_t primary_end = std::get<2>(primary_alignment); + printMessage("Processing primary alignment " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " (Location: " + primary_chr + ":" + std::to_string(primary_start+1) + "-" + std::to_string(primary_end+1) + ")..."); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) 
{ continue; } - // Find the largest supplementary alignment, and also identify inversions + // Find the largest supplementary alignment, and also identify + // inversions + printMessage("Finding largest supplementary alignment..."); AlignmentData largest_supp_alignment = supp_map[qname][0]; uint32_t largest_supp_length = 0; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { @@ -592,7 +599,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap continue; } - // printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1)); + printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1) + " of length " + std::to_string(supp_length) + " bp..."); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); if (std::get<1>(result) == SVType::UNKNOWN) { continue; @@ -600,18 +607,20 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); + printMessage("Calculating read depth for inversion (length: " + std::to_string(supp_length) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); if (supp_type == SVType::NEUTRAL) { addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "HMM", "./.", supp_lh, read_depth); sv_count++; } else if (supp_type == SVType::DUP) { - int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); + // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); } } else { // Add the inversion without running copy number predictions // (too small for predictions) + 
printMessage("Calculating read depth for small inversion (length: " + std::to_string(supp_length) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "REV", "./.", 0.0, read_depth); } @@ -619,12 +628,14 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap } // Trim overlapping alignments + printMessage("Trimming overlapping alignments..."); uint32_t supp_start = std::get<1>(largest_supp_alignment); uint32_t supp_end = std::get<2>(largest_supp_alignment); bool primary_before_supp = primary_start < supp_start; trimOverlappingAlignments(primary_alignment, largest_supp_alignment); // Create the SV candidate using both alignments + printMessage("Creating SV candidates..."); supp_start = std::get<1>(largest_supp_alignment); supp_end = std::get<2>(largest_supp_alignment); primary_start = std::get<1>(primary_alignment); @@ -658,6 +669,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap } // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); + printMessage("Running copy number prediction on boundary (Length: " + std::to_string(boundary_right - boundary_left) + " bp)..."); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -675,7 +687,10 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap continue; } - // printMessage("Running copy number prediction on gap: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); + // printMessage("Running copy number prediction on gap: " + + // primary_chr + ":" + std::to_string(gap_left) + "-" + + // std::to_string(gap_right)); + printMessage("Running copy number prediction on gap (Length: " + 
std::to_string(gap_right - gap_left) + " bp)..."); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; @@ -686,16 +701,19 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); - std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; + printMessage("Calculating read depth for gap (length: " + std::to_string(gap_right - gap_left) + " bp)..."); + std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call + printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call + printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); @@ -829,18 +847,25 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.queryRefGenome(chr, preceding_pos, preceding_pos); - // Format novel insertions + // Update the start position to the preceding base + start = preceding_pos; + + // Update the end position to the same base for duplications and insertions + if (sv_type_str == "DUP" || sv_type_str == "INS") { + end = start; + } + if (sv_type_str == "INS") { // Check if in symbolic form if (alt_allele != "") { // Use the insertion sequence as the alternate allele alt_allele.insert(0, ref_allele); } - start = preceding_pos; // Update the position to the preceding base + // start = preceding_pos; // Update the position to the preceding base - // Update the end position to the start position to change from - // query to reference coordinates for insertions - end = start; + // // Update the end position to the start position to change from + // // query to reference coordinates for insertions + // end = start; } } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index d9b8e457..04201066 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -25,6 +25,16 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { return; } + + // Set the alt allele to or if the SV type is DUP or DEL, throw + // an error otherwise + if (sv_type == "DUP" && alt_allele == ".") { + printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); + alt_allele = ""; + } else if (sv_type == "DEL" && alt_allele == ".") { + printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); + alt_allele = ""; + } if (start >= end) { throw 
std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); @@ -94,17 +104,18 @@ void mergeSVs(std::vector& sv_calls) SVCall& next = sv_calls[i]; // Check for overlap if (next.start <= current_merge.end) { - // printMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")"); + //XprintMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")"); + // if (current_merge.start <= next.end && next.start <= current_merge.end) { // Calculate reciprocal overlap uint32_t overlap = std::max(0, (int)std::min(current_merge.end, next.end) - (int)std::max(current_merge.start, next.start)); uint32_t union_length = std::max(current_merge.end, next.end) - std::min(current_merge.start, next.start); double overlap_fraction = static_cast(overlap) / union_length; - // printMessage("Overlap fraction: " + std::to_string(overlap_fraction)); + //XprintMessage("Overlap fraction: " + std::to_string(overlap_fraction)); // Merge if reciprocal overlap is >90% if (overlap_fraction > 0.90) { - // printMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction)); + //XprintMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction)); // Keep the SV call with the higher read support if (next.support > current_merge.support) { 
current_merge = next; @@ -118,19 +129,14 @@ void mergeSVs(std::vector& sv_calls) current_merge = next; } } - // // Keep the SV call with the higher read depth - // if (next.read_depth > current_merge.read_depth) { - // current_merge = next; - // } else if (next.read_depth == current_merge.read_depth) { - // // Keep the SV call with the higher likelihood - // if (next.hmm_likelihood > current_merge.hmm_likelihood) { - // current_merge = next; - // } - // } } } else { - merged_sv_calls.push_back(current_merge); - current_merge = next; + // Continue with the larger length + uint32_t current_length = current_merge.end - current_merge.start; + uint32_t next_length = next.end - next.start; + if (next_length > current_length) { // And support meets threshold + current_merge = next; + } } } else { merged_sv_calls.push_back(current_merge); @@ -138,120 +144,8 @@ void mergeSVs(std::vector& sv_calls) } } - // Add the last SV call - merged_sv_calls.push_back(current_merge); - - // Update the SV calls - sv_calls = merged_sv_calls; - // for (size_t i = 0; i < sv_calls.size(); i++) { - // SVCall& current = sv_calls[i]; - // bool merged = false; - // for (size_t j = i + 1; j < sv_calls.size(); j++) { - // SVCall& next = sv_calls[j]; - // if (current.start <= next.end && next.start <= current.end) { - // // Calculate reciprocal overlap - // uint32_t overlap = std::max(0, (int)std::min(current.end, next.end) - (int)std::max(current.start, next.start)); - // uint32_t union_length = std::max(current.end, next.end) - std::min(current.start, next.start); - // double overlap_fraction = static_cast(overlap) / union_length; - - // // Merge if reciprocal overlap is >90% - // if (overlap_fraction > 0.9) { - // // Keep the SV call with the higher likelihood - // if (next.hmm_likelihood > current.hmm_likelihood) { - // current = next; - // } - // merged = true; - // } - - // // Remove the merged SV call - // sv_calls.erase(sv_calls.begin() + j); - // j--; - - // } - // if (!merged) { - // 
merged_sv_calls.push_back(current); - // } - // } - - - - // std::vector merged_sv_calls; - // auto it = sv_calls.begin(); - // SVCall current_merge = *it++; - // double log_lh_eps = 1.0; // Log likelihood epsilon - // for (; it != sv_calls.end(); ++it) { - // SVCall& next = *it; - - // // Find overlap - // // printMessage("[0] Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); - // // printMessage("[0] Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); - // if (next.start <= current_merge.end) { - - // // Merge based on read support - // if (next.support > current_merge.support) { - // // Compare only if lengths are within 20% of each other - // uint32_t current_length = current_merge.end - current_merge.start; - // uint32_t next_length = next.end - next.start; - // double length_diff = std::abs((int)current_length - (int)next_length); - // double length_threshold = 0.2 * (int)current_length; - // if (length_diff <= length_threshold) { - // current_merge = next; // Continue with the next call - // // printMessage("Keeping next SV call with support " + std::to_string(next.support)); - // } else { - // // Keep the larger SV - // if (next_length > current_length) { - // current_merge = next; - // // printMessage("Keeping next SV call with length " + std::to_string(next_length)); - // } - // } - // // printMessage("Keeping next SV call with support " + std::to_string(next.support)); - - // } else if (next.support 
== current_merge.support) { - // // Merge based on existence of predictions - // if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood == 0.0) { - // current_merge = next; // Continue with the next call - // // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); - - // // Merge based on prediction log likelihood - // } else if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0) { - - // // Print all SV information - // // printMessage("Current SV call: " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " with likelihood " + std::to_string(current_merge.hmm_likelihood) + " and read depth " + std::to_string(current_merge.read_depth) + " and length " + std::to_string(current_merge.end - current_merge.start) + " and support " + std::to_string(current_merge.support)); - // // printMessage("Next SV call: " + std::to_string(next.start) + "-" + std::to_string(next.end) + " with likelihood " + std::to_string(next.hmm_likelihood) + " and read depth " + std::to_string(next.read_depth) + " and length " + std::to_string(next.end - next.start) + " and support " + std::to_string(next.support)); - // // printMessage("Comparing likelihoods: " + std::to_string(current_merge.hmm_likelihood) + " vs " + std::to_string(next.hmm_likelihood)); - - // // Keep the SV call with the higher likelihood. 
Compare only if - // // lengths are within 20% of each other - // uint32_t current_length = current_merge.end - current_merge.start; - // uint32_t next_length = next.end - next.start; - // double length_diff = std::abs((int)current_length - (int)next_length); - // double length_threshold = 0.2 * (int)current_length; - // if (length_diff <= length_threshold) { - // // printMessage("Length difference is within threshold: " + std::to_string(length_diff) + " <= " + std::to_string(length_threshold)); - - // if (next.hmm_likelihood > current_merge.hmm_likelihood) { - // current_merge = next; // Continue with the next call - // // printMessage("Keeping next SV call with likelihood " + std::to_string(next.hmm_likelihood)); - // } - - // } else { - // // Keep the larger SV - // if (next_length > current_length) { - // current_merge = next; - // // printMessage("[2] Keeping next SV call with length " + std::to_string(next_length)); - // } - // } - // } - // } - - // } else { - // // No overlap: Save the call and continue - // merged_sv_calls.emplace_back(current_merge); - // current_merge = next; - // } - // } - // merged_sv_calls.emplace_back(current_merge); // Save the last call - // sv_calls = merged_sv_calls; // Update the SV calls + merged_sv_calls.push_back(current_merge); // Add the last SV call + sv_calls = merged_sv_calls; // Update the SV calls int updated_size = sv_calls.size(); std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; @@ -259,7 +153,7 @@ void mergeSVs(std::vector& sv_calls) void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) { - int prev_size = sv_calls.size(); + // int prev_size = sv_calls.size(); // Filter SV calls with low read support sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { From a169b731bbc1d68a8e6d475d530c2ba01bb4563c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 11 Dec 2024 18:07:36 -0500 Subject: [PATCH 
049/134] Replace fixed window with sample size --- include/cnv_caller.h | 10 +- include/input_data.h | 8 +- src/cnv_caller.cpp | 538 ++++++++++++++++++++++--------------------- src/input_data.cpp | 10 +- src/main.cpp | 28 +-- src/sv_caller.cpp | 41 +--- src/sv_object.cpp | 5 - 7 files changed, 308 insertions(+), 332 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index b36d414f..457336e1 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -76,9 +76,9 @@ class CNVCaller { void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction); // Query a region for SNPs and return the SNP data - std::pair querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov); + void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data); - void querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb); + void querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp); // Split a region into chunks for parallel processing std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count); @@ -100,8 +100,10 @@ class CNVCaller { // chromosome coverage double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov); - void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set& snp_pos, std::unordered_map& snp_baf); - void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map); + void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2); 
+ + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp); + // void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pfb); // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood); diff --git a/include/input_data.h b/include/input_data.h index 7d577784..65051e77 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -69,9 +69,9 @@ class InputData { void setEthnicity(std::string ethnicity); std::string getEthnicity(); - // Set the window size for the log2 ratio calculation. - void setWindowSize(int window_size); - int getWindowSize(); + // Set the sample size for HMM predictions. + void setSampleSize(int sample_size); + int getSampleSize(); // Set the minimum CNV length to use for copy number predictions. 
void setMinCNVLength(int min_cnv_length); @@ -112,7 +112,7 @@ class InputData { std::unordered_map pfb_filepaths; // Map of population frequency VCF filepaths by chromosome ReferenceGenome fasta_query; std::string output_dir; - int window_size; + int sample_size; int min_cnv_length; std::string chr; // Chromosome to analyze std::pair start_end; // Region to analyze diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 4eb0bc26..eb217728 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -46,8 +46,6 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair(), 0.0); } @@ -55,18 +53,44 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov) +void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data) { - SNPData snp_data; - bool snps_found = false; - uint32_t window_size = (uint32_t)this->input_data.getWindowSize(); + // uint32_t window_size = (uint32_t)this->input_data.getWindowSize(); + + // Initialize the SNP data with default values and sample size length + int sample_size = this->input_data.getSampleSize(); + int region_length = (int) (end_pos - start_pos + 1); + if (region_length < sample_size) + { + sample_size = region_length; + } + + printMessage("Querying SNPs for region length " + std::to_string(region_length) + " bp with sample size " + std::to_string(sample_size) + "..."); + + // std::set snp_pos(sample_size); + std::vector snp_pos(sample_size, 0); + std::vector snp_baf(sample_size, -1.0); + std::vector snp_pfb(sample_size, 0.5); + std::vector snp_log2_cov(sample_size, 0.0); + std::vector is_snp(sample_size, false); + // std::unordered_map snp_baf(sample_size, -1.0); + // std::unordered_map snp_pfb(sample_size, 0.5); // Query the SNPs for the entire region - std::set 
snp_pos; - std::unordered_map snp_baf; - std::unordered_map snp_pfb; - this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb); + this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp); + + // Get the log2 ratio for evenly spaced positions in the + // region + this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov); + + // Update the SNP data with all information + snp_data.pos = std::move(snp_pos); + snp_data.baf = std::move(snp_baf); + snp_data.pfb = std::move(snp_pfb); + snp_data.log2_cov = std::move(snp_log2_cov); + snp_data.is_snp = std::move(is_snp); + /* // Loop through the range of the SV region and query the SNPs in a sliding // window, then calculate the log2 ratio for each window for (uint32_t i = start_pos; i <= end_pos; i += window_size) @@ -134,8 +158,8 @@ std::pair CNVCaller::querySNPRegion(std::string chr, uint32_t sta } } } - - return std::make_pair(snp_data, snps_found); + */ + // return std::make_pair(snp_data, snps_found); } std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) @@ -164,17 +188,19 @@ std::tuple CNVCaller::runCopyNumberPrediction // uint32_t snp_end_pos = end_pos + sv_half_length; // Query the SNP region for the SV candidate - std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); - SNPData& sv_snps = snp_call.first; - bool sv_snps_found = snp_call.second; + SNPData snp_data; + querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data); + // std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); + // SNPData& sv_snps = snp_call.first; + // bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm - // printMessage("Running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + 
std::to_string((int)end_pos) + " (" + std::to_string(sv_snps.pos.size()) + " SNPs, start=" + std::to_string(snp_start_pos) + ", end=" + std::to_string(snp_end_pos) + ")..."); + printMessage("Running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp, snp data found: " + std::to_string(snp_data.pos.size()) + "..."); std::pair, double> prediction; - runViterbi(hmm, sv_snps, prediction); + runViterbi(hmm, snp_data, prediction); if (prediction.first.size() == 0) { - return std::make_tuple(0.0, SVType::UNKNOWN, "./.", sv_snps_found); + return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); } std::vector& state_sequence = prediction.first; @@ -184,7 +210,7 @@ std::tuple CNVCaller::runCopyNumberPrediction std::vector sv_states; for (size_t i = 0; i < state_sequence.size(); i++) { - if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos) + if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos) { sv_states.push_back(state_sequence[i]); } @@ -217,7 +243,8 @@ std::tuple CNVCaller::runCopyNumberPrediction predicted_cnv_type = getSVTypeFromCNState(max_state); genotype = cnv_genotype_map[max_state]; } - sv_snps.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data + snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data + printMessage("Finished running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", returning..."); // Save the SV calls as a TSV file if enabled bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); @@ -226,10 +253,10 @@ std::tuple CNVCaller::runCopyNumberPrediction std::string cnv_type_str = getSVTypeString(predicted_cnv_type); std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) 
start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "..."); - this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); + this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); } - return std::make_tuple(likelihood, predicted_cnv_type, genotype, sv_snps_found); + return std::make_tuple(likelihood, predicted_cnv_type, genotype, true); } @@ -275,18 +302,17 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector sv_half_length ? start_pos - sv_half_length : 1; snp_end_pos = end_pos + sv_half_length; } - std::pair snp_call = this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); - SNPData& sv_snps = snp_call.first; - bool snps_found = snp_call.second; + SNPData snp_data; + this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data); // Run the Viterbi algorithm - if (sv_snps.pos.size() == 0) { + if (snp_data.pos.size() == 0) { std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl; continue; } std::pair, double> prediction; - runViterbi(hmm, sv_snps, prediction); + runViterbi(hmm, snp_data, prediction); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); @@ -296,7 +322,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector sv_states; for (size_t i = 0; i < state_sequence.size(); i++) { - if (sv_snps.pos[i] >= start_pos && sv_snps.pos[i] <= end_pos) + if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos) { sv_states.push_back(state_sequence[i]); } @@ -331,12 +357,7 @@ void 
CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorinput_data.getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000) { // Add the state sequence to the SNP data (avoid copying the data) - sv_snps.state_sequence = std::move(state_sequence); + snp_data.state_sequence = std::move(state_sequence); // Save the SV calls as a TSV file std::string cnv_type_str = getSVTypeString(updated_sv_type); std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv"; printMessage("Saving SV CIGAR copy number predictions to " + sv_filename); - this->saveSVCopyNumberToTSV(sv_snps, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); + this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); } } } @@ -582,18 +599,42 @@ double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const return window_log2_ratio; } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::set& snp_pos, std::unordered_map& snp_baf) +void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& log2_region) +{ + uint32_t region_length = end_pos - start_pos + 1; + for (int i = 0; i < sample_size; i++) + { + uint32_t pos = start_pos + ((double)region_length / sample_size) * i; + try { + uint32_t depth = pos_depth_map.at(pos); + + // Calculate the log2 ratio for the position + if (depth == 0) + { + log2_region[i] = 0.0; + } else { + log2_region[i] = log2((double) depth / mean_chr_cov); + } + + } catch (const std::out_of_range& e) { + log2_region[i] = 0.0; + } + // printMessage("Position: " + std::to_string((int)pos) + ", log2 ratio: " + std::to_string(log2_region[i])); + } +} + +void 
CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) { + // --------- SNP file --------- // Get the SNP file path std::string snp_filepath = this->input_data.getSNPFilepath(); if (snp_filepath.empty()) { - // throw std::runtime_error("ERROR: SNP file path is empty."); printError("ERROR: SNP file path is empty."); return; } - // Initialize the synced reader + // Initialize the SNP file reader bcf_srs_t *snp_reader = bcf_sr_init(); if (!snp_reader) { @@ -601,34 +642,20 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not initialize SNP reader."); return; } + snp_reader->require_index = 1; - // Lock during reading - std::lock_guard lock(this->snp_file_mtx); - - // Set the region - std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) + // Set multi-threading if running on a single chromosome + if (this->input_data.getChromosome() != "") { - bcf_sr_destroy(snp_reader); - // throw std::runtime_error("ERROR: Could not set region for SNP reader: - // " + region_str); - printError("ERROR: Could not set region for SNP reader: " + region_str); - return; + int thread_count = this->input_data.getThreadCount() - 1; // Leave one thread for the main thread + printMessage("Setting SNP reader threads to " + std::to_string(thread_count / 2)); + bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2)); } - // Set multi-threading - // int thread_count = this->input_data.getThreadCount(); - // bcf_sr_set_threads(snp_reader, thread_count); - - // Enable index usage - snp_reader->require_index = 1; - // Add the SNP file to the reader if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) { bcf_sr_destroy(snp_reader); - // throw std::runtime_error("ERROR: Could not add SNP file to reader: " 
- // + snp_filepath); printError("ERROR: Could not add SNP file to reader: " + snp_filepath); return; } @@ -638,124 +665,17 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (!snp_header) { bcf_sr_destroy(snp_reader); - // throw std::runtime_error("ERROR: Could not get header for SNP - // reader."); printError("ERROR: Could not get header for SNP reader."); return; } - // std::cout << "Iterating through SNPs in region " << region_str << "..." << std::endl; - int record_count = 0; - while (bcf_sr_next_line(snp_reader) > 0) - { - if (!bcf_sr_has_line(snp_reader, 0)) - { - continue; - } - bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0); - if (snp_record) - { - record_count++; - uint32_t pos = (uint32_t)snp_record->pos + 1; - - // Skip if not a SNP - if (!bcf_is_snp(snp_record)) - { - continue; - } - - // Get the QUAL, DP, and AD values - float qual = snp_record->qual; - if (bcf_float_is_missing(qual)) - { - // std::cerr << "ERROR: QUAL value is missing for SNP at " << chr << ":" << pos << std::endl; - } - // Skip if quality is less than 30 - if (qual <= 30) - { - continue; - } - - // Extract DP from FORMAT field - int32_t *dp = 0; - int dp_count = 0; - int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count); - bool dp_skip = false; - if (dp_ret < 0) - { - // std::cerr << "ERROR: Could not get DP value for SNP at " << chr << ":" << pos << std::endl; - } else { - // Skip if depth is not greater than 10 - for (int i = 0; i < dp_count; i++) - { - if (dp[i] <= 10) - { - dp_skip = true; - break; - } - } - } - free(dp); - if (dp_skip) - { - continue; - } - - // Skip if the SNP does not pass the filter - if (bcf_has_filter(snp_header, snp_record, const_cast("PASS")) != 1) - { - continue; - } - - // Extract AD from FORMAT field - int32_t *ad = 0; - int ad_count = 0; - int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); - - // Skip if AD value is missing - if (ad_ret < 0) - { - // 
std::cerr << "ERROR: AD value is missing for SNP at " << chr - // << ":" << pos << std::endl; - // throw std::runtime_error("ERROR: AD value is missing for SNP - // at " + chr + ":" + std::to_string(pos)); - printError("ERROR: AD value is missing for SNP at " + chr + ":" + std::to_string(pos)); - continue; - } - - // Calculate the B-allele frequency (BAF) - double baf = 0.0; - double ad0 = 0.0; - double ad1 = 0.0; - for (int i = 0; i < ad_count; i++) - { - if (i == 0) - { - ad0 = (double) ad[i]; - } else if (i == 1) { - ad1 = (double) ad[i]; - } - } - free(ad); - baf = ad1 / (ad0 + ad1); - - // Insert the SNP position and BAF into the maps - snp_pos.insert(pos); - snp_baf[pos] = baf; - } - } + // --------- Population allele frequency file --------- - // Clean up - bcf_sr_destroy(snp_reader); -} - -void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::unordered_map& snp_pfb_map) -{ - // Get the population frequency file for the chromosome + // Get the population allele frequency file path std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); - if (pfb_filepath == "") + if (pfb_filepath.empty()) { - // printError("No population frequency file provided for chromosome " + chr); + printError("ERROR: Population allele frequency file path is empty."); return; } @@ -785,48 +705,30 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos } } - // Remove the 'chr' prefix from the chromosome name for SNP data. 
All - // SNP data in this program does not use the 'chr' prefix - std::string chr_no_prefix = removeChrPrefix(chr); - // int thread_count = this->input_data.getThreadCount(); - - // Initialize the synced reader + // Initialize the population allele frequency reader bcf_srs_t *pfb_reader = bcf_sr_init(); if (!pfb_reader) { - // throw std::runtime_error("ERROR: Could not initialize synced reader - // for population frequency file: " + pfb_filepath); - printError("ERROR: Could not initialize synced reader for population frequency file: " + pfb_filepath); + bcf_sr_destroy(snp_reader); + printError("ERROR: Could not initialize population allele frequency reader."); return; } + pfb_reader->require_index = 1; - // Lock during reading - std::lock_guard lock(this->pfb_file_mtx); - - // Set the region for the synced reader - std::string region_str = chr_gnomad + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); - if (bcf_sr_set_regions(pfb_reader, region_str.c_str(), 0) < 0) + // Set multi-threading if running on a single chromosome + if (this->input_data.getChromosome() != "") { - bcf_sr_destroy(pfb_reader); - // throw std::runtime_error("ERROR: Could not set region for synced - // reader: " + region_str); - printError("ERROR: Could not set region for synced reader: " + region_str); - return; + int thread_count = this->input_data.getThreadCount() - 1; // Leave one thread for the main thread + printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2)); + bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); } - // Set multi-threading - // bcf_sr_set_threads(pfb_reader, thread_count); - - // Enable index usage - pfb_reader->require_index = 1; - - // Add the population frequency file to the synced reader + // Add the population allele frequency file to the reader if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { + bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); - // throw 
std::runtime_error("ERROR: Could not add population frequency - // file to synced reader: " + pfb_filepath); - printError("ERROR: Could not add population frequency file to synced reader: " + pfb_filepath); + printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); return; } @@ -834,72 +736,186 @@ void CNVCaller::readSNPPopulationFrequencies(std::string chr, uint32_t start_pos bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); if (!pfb_header) { + bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); - // throw std::runtime_error("ERROR: Could not get header for population - // frequency file: " + pfb_filepath); - printError("ERROR: Could not get header for population frequency file: " + pfb_filepath); + printError("ERROR: Could not get header for population allele frequency reader."); return; } - int record_count = 0; - while (bcf_sr_next_line(pfb_reader) > 0) + // Split the region into samples + int sample_size = snp_pos.size(); + std::vector region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size); + + // Loop through the samples and read the SNP data, storing the first + // SNP position and BAF value for each sample + int print_count = 0; + int current_region = 0; + for (size_t i = 0; i < region_chunks.size(); ++i) { - if (!bcf_sr_has_line(pfb_reader, 0)) + current_region++; + // Lock during reading + // std::lock_guard lock(this->snp_file_mtx); + + // Read the SNP data ---------------------------------------------- + + // Set the region + std::string region_str = region_chunks[i]; + if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { - continue; + bcf_sr_destroy(snp_reader); + printError("ERROR: Could not set region for SNP reader: " + region_str); + return; } - // pfb_record = bcf_sr_get_line(pfb_reader, 0); - bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); - // Do something with the record - if (pfb_record) + + // std::cout << "Iterating through SNPs in region " << 
region_str << + // "..." << std::endl; + // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp..."); + bool snp_found = false; + while (bcf_sr_next_line(snp_reader) > 0) { - record_count++; - // Skip if not a SNP - if (!bcf_is_snp(pfb_record)) + if (!bcf_sr_has_line(snp_reader, 0)) { continue; } + bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0); + if (snp_record) + { + uint32_t pos = (uint32_t)snp_record->pos + 1; - uint32_t pos = (uint32_t) pfb_record->pos + 1; // 0-based to 1-based + // Skip if not a SNP + if (!bcf_is_snp(snp_record)) + { + continue; + } - // Get the population frequency for the SNP - float *pfb_f = NULL; - int count = 0; - int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); - if (pfb_status < 0 || count == 0) - { - continue; + // Get the QUAL, DP, and AD values + if (bcf_float_is_missing(snp_record->qual) || snp_record->qual <= 30) + { + continue; + } + + // Extract DP from FORMAT field + int32_t *dp = 0; + // int dp_values[2]; + int dp_count = 0; + int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count); + if (dp_ret < 0 || dp[0] <= 10) + { + continue; + } + free(dp); + + // Skip if the SNP does not pass the filter + if (bcf_has_filter(snp_header, snp_record, const_cast("PASS")) != 1) + { + continue; + } + + // Extract AD from FORMAT field + int32_t *ad = 0; + // int ad_values[2]; + int ad_count = 0; + int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); + // int ad_ret = bcf_get_format_int32(snp_header, snp_record, + // "AD", &ad, &ad_count); + if (ad_ret < 0 || ad_count < 2) + { + continue; + } + + // Calculate the B-allele frequency (BAF) + // double baf = (double) ad_values[1] / (double) (ad_values[0] + + // ad_values[1]); + double baf = (double) ad[1] / (double) (ad[0] + 
ad[1]); + free(ad); + + // Add the SNP position and BAF information + snp_pos[i] = pos; + snp_baf[i] = baf; + is_snp[i] = true; + snp_found = true; + + break; // Only one SNP per region } - double pfb = (double) pfb_f[0]; - free(pfb_f); + } + + if (snp_reader->errnum) + { + printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum))); + } + + // Continue if no SNP was found in the region + if (!snp_found) + { + continue; + } + + // Read the population allele frequency data ---------------------- - // Continue if the population frequency is outside the threshold - if (pfb <= MIN_PFB || pfb >= MAX_PFB) + // Set the region as the SNP position + uint32_t target_snp_pos = snp_pos[i]; // Already 1-based + std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); + if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) + { + bcf_sr_destroy(snp_reader); + bcf_sr_destroy(pfb_reader); + printError("ERROR: Could not set region for population allele frequency reader: " + region_str); + return; + } + + // Find the SNP position in the population allele frequency file + while (bcf_sr_next_line(pfb_reader) > 0) + { + if (!bcf_sr_has_line(pfb_reader, 0)) { continue; } - - // Add the population frequency to the SNP data - if (snp_pfb_map.find(pos) == snp_pfb_map.end()) + // pfb_record = bcf_sr_get_line(pfb_reader, 0); + bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); + // Do something with the record + if (pfb_record) { - snp_pfb_map[pos] = pfb; - } else { - // Keep the larger population frequency - if (pfb > snp_pfb_map[pos]) + // Skip if not a SNP + if (!bcf_is_snp(pfb_record)) + { + continue; + } + + // Get the population frequency for the SNP + float *pfb_f = NULL; + int count = 0; + int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); + if (pfb_status < 0 || count == 0) { - snp_pfb_map[pos] = pfb; + continue; + } + double pfb = 
(double) pfb_f[0]; + free(pfb_f); + + // Continue if the population frequency is outside the threshold + if (pfb <= MIN_PFB || pfb >= MAX_PFB) + { + continue; + } + + // Add the population frequency to the SNP data + snp_pfb[i] = pfb; + + // Break after finding the SNP position + break; + + if (print_count < 20) { + printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")"); + print_count++; } } } + if (pfb_reader->errnum) + { + printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); + } } - if (pfb_reader->errnum) - { - // std::cerr << "ERROR: " <errnum) << - // std::endl; - printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); - } - - // Clean up + bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); } @@ -995,29 +1011,29 @@ void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, doubl snp_data.is_snp.emplace_back(is_snp); } -void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::set& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb) +void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) { std::string snp_chr = chr; chr = removeChrPrefix(chr); // Query the SNP allele frequencies for the SNPs - std::map> snp_map; - this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf); + // std::map> snp_map; + this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf, snp_pfb, is_snp); // Query the population frequencies for the SNPs - std::unordered_map pfb_map; - this->readSNPPopulationFrequencies(chr, start, end, pfb_map); + // std::unordered_map pfb_map; + // this->readSNPPopulationFrequencies(chr, start, end, snp_pfb); // Filter out the SNP population frequencies that are not in the SNP // position set - double pfb_default = 0.5; - for 
(auto& pos : snp_pos) - { - if (pfb_map.find(pos) != pfb_map.end()) - { - snp_pfb[pos] = pfb_map[pos]; - } else { - snp_pfb[pos] = pfb_default; - } - } + // double pfb_default = 0.5; + // for (auto& pos : snp_pos) + // { + // if (pfb_map.find(pos) != pfb_map.end()) + // { + // snp_pfb[pos] = pfb_map[pos]; + // } else { + // snp_pfb[pos] = pfb_default; + // } + // } } diff --git a/src/input_data.cpp b/src/input_data.cpp index 74dd9788..4f9ae124 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -24,7 +24,7 @@ InputData::InputData() this->start_end = std::make_pair(0, 0); this->region_set = false; this->output_dir = ""; - this->window_size = 2500; + this->sample_size = 100; this->min_cnv_length = 1000; this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; @@ -124,14 +124,14 @@ void InputData::setOutputDir(std::string dirpath) system(cmd.c_str()); } -int InputData::getWindowSize() +int InputData::getSampleSize() { - return this->window_size; + return this->sample_size; } -void InputData::setWindowSize(int window_size) +void InputData::setSampleSize(int sample_size) { - this->window_size = window_size; + this->sample_size = sample_size; } std::string InputData::getSNPFilepath() diff --git a/src/main.cpp b/src/main.cpp index 1d78ad5f..da0d8d93 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -40,8 +40,8 @@ void runContextSV(const std::unordered_map& args) if (args.find("hmm-file") != args.end()) { input_data.setHMMFilepath(args.at("hmm-file")); } - if (args.find("window-size") != args.end()) { - input_data.setWindowSize(std::stoi(args.at("window-size"))); + if (args.find("sample-size") != args.end()) { + input_data.setSampleSize(std::stoi(args.at("sample-size"))); } if (args.find("min-cnv") != args.end()) { input_data.setMinCNVLength(std::stoi(args.at("min-cnv"))); @@ -58,20 +58,6 @@ void runContextSV(const std::unordered_map& args) if (args.find("debug") != args.end()) { input_data.setVerbose(true); } - // input_data.setShortReadBam(bamFile); - // 
input_data.setLongReadBam(bamFile); - // input_data.setRefGenome(refFile); - // input_data.setSNPFilepath(vcfFile); - // //input_data.setChromosome("21"); - // //input_data.setRegion("14486099-14515105"); - // input_data.setThreadCount(threadCount); - // input_data.setAlleleFreqFilepaths(pfbFile); - // input_data.setHMMFilepath(hmmFile); - // input_data.setOutputDir(outputDir); - // input_data.saveCNVData(false); - // input_data.setThreadCount(threadCount); - // input_data.setWindowSize(windowSize); - // input_data.setMinCNVLength(minCNV); // Run ContextSV run(input_data); @@ -85,15 +71,15 @@ void printUsage(const std::string& programName) { << " -s, --snp SNPs VCF file (required)\n" << " -o, --outdir Output directory (required)\n" << " -c, --chr Chromosome\n" - << " -r, --region Region (e.g., 14486099-14515105)\n" + << " -r, --region Region (start-end)\n" << " -t, --threads Number of threads\n" << " -h, --hmm HMM file\n" - << " -w, --window Window size\n" + << " -n, --sample-size Sample size for HMM predictions\n" << " --min-cnv Minimum CNV length\n" << " -e, --eth ETH file\n" << " -p, --pfb PFB file\n" << " --save-cnv Save CNV data\n" - << " --debug Debug mode\n" + << " --debug Debug mode with verbose logging\n" << " --version Print version and exit\n" << " -h, --help Print usage and exit\n"; } @@ -120,8 +106,8 @@ std::unordered_map parseArguments(int argc, char* argv args["thread-count"] = argv[++i]; } else if ((arg == "-h" || arg == "--hmm") && i + 1 < argc) { args["hmm-file"] = argv[++i]; - } else if ((arg == "-w" || arg == "--window") && i + 1 < argc) { - args["window-size"] = argv[++i]; + } else if ((arg == "-n" || arg == "--sample-size") && i + 1 < argc) { + args["sample-size"] = argv[++i]; } else if (arg == "--min-cnv" && i + 1 < argc) { args["min-cnv"] = argv[++i]; } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 97672a91..2e2295da 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp 
@@ -78,11 +78,6 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, end--; // Adjust to the last position of the alignment bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - // Check for underflow - if (start > 4000000000 || end > 4000000000) { - throw std::runtime_error("ERROR: Integer underflow for alignment at position " + std::to_string(start) + "-" + std::to_string(end)); - } - // Call SVs directly from the CIGAR string std::tuple, uint32_t, uint32_t> query_info; this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, pos_depth_map); @@ -91,11 +86,6 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, uint32_t query_start = std::get<1>(query_info); uint32_t query_end = std::get<2>(query_info); - // Check for underflow - if (query_start > 4000000000 || query_end > 4000000000) { - throw std::runtime_error("ERROR: Integer underflow for query at position " + std::to_string(query_start) + "-" + std::to_string(query_end)); - } - // Add the primary alignment to the map AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); primary_alignments[qname] = alignment; @@ -481,9 +471,13 @@ void SVCaller::run() std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); - // Set up thread pool - const int max_threads = this->input_data.getThreadCount(); - std::cout << "Using " << max_threads << " threads for processing..." << std::endl; + // Use multi-threading across chromosomes unless a single chromosome is + // specified + int max_threads = 1; + if (this->input_data.getChromosome() == "") { + max_threads = this->input_data.getThreadCount(); + std::cout << "Using " << max_threads << " threads for processing..." 
<< std::endl; + } ThreadPool pool(max_threads); // Shared resources @@ -507,10 +501,8 @@ void SVCaller::run() } }; - // Futures vector - std::vector> futures; - // Submit tasks to the thread pool and track futures + std::vector> futures; for (const auto& chr : chromosomes) { futures.emplace_back(pool.enqueue([&, chr] { printMessage("Processing chromosome " + chr); @@ -553,16 +545,12 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // Find split-read SV evidence int sv_count = 0; uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); - int primary_count = primary_map.size(); - int current_primary = 0; for (const auto& entry : primary_map) { - current_primary++; std::string qname = entry.first; AlignmentData primary_alignment = entry.second; std::string primary_chr = std::get<0>(primary_alignment); uint32_t primary_start = std::get<1>(primary_alignment); uint32_t primary_end = std::get<2>(primary_alignment); - printMessage("Processing primary alignment " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " (Location: " + primary_chr + ":" + std::to_string(primary_start+1) + "-" + std::to_string(primary_end+1) + ")..."); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { @@ -571,7 +559,7 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // Find the largest supplementary alignment, and also identify // inversions - printMessage("Finding largest supplementary alignment..."); + // printMessage("Finding largest supplementary alignment..."); AlignmentData largest_supp_alignment = supp_map[qname][0]; uint32_t largest_supp_length = 0; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { @@ -599,7 +587,6 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap continue; } - printMessage("Running copy number prediction on inversion: " + primary_chr + ":" + 
std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1) + " of length " + std::to_string(supp_length) + " bp..."); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); if (std::get<1>(result) == SVType::UNKNOWN) { continue; @@ -607,20 +594,17 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); - printMessage("Calculating read depth for inversion (length: " + std::to_string(supp_length) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); if (supp_type == SVType::NEUTRAL) { addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "HMM", "./.", supp_lh, read_depth); sv_count++; } else if (supp_type == SVType::DUP) { - // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); } } else { // Add the inversion without running copy number predictions // (too small for predictions) - printMessage("Calculating read depth for small inversion (length: " + std::to_string(supp_length) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "REV", "./.", 0.0, read_depth); } @@ -628,14 +612,12 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap } // Trim overlapping alignments - printMessage("Trimming overlapping alignments..."); uint32_t supp_start = std::get<1>(largest_supp_alignment); uint32_t supp_end = std::get<2>(largest_supp_alignment); bool primary_before_supp = primary_start < supp_start; trimOverlappingAlignments(primary_alignment, largest_supp_alignment); // Create the SV candidate using both alignments - printMessage("Creating SV candidates..."); supp_start = std::get<1>(largest_supp_alignment); 
supp_end = std::get<2>(largest_supp_alignment); primary_start = std::get<1>(primary_alignment); @@ -669,7 +651,6 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap } // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); - printMessage("Running copy number prediction on boundary (Length: " + std::to_string(boundary_right - boundary_left) + " bp)..."); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -690,7 +671,6 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // printMessage("Running copy number prediction on gap: " + // primary_chr + ":" + std::to_string(gap_left) + "-" + // std::to_string(gap_right)); - printMessage("Running copy number prediction on gap (Length: " + std::to_string(gap_right - gap_left) + " bp)..."); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; @@ -701,19 +681,16 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); - printMessage("Calculating read depth for gap (length: " + std::to_string(gap_right - gap_left) + " bp)..."); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(gap_type) + ">"; addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call - printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call - printMessage("Calculating read depth for boundary (length: " + std::to_string(boundary_right - boundary_left) + " bp)..."); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 04201066..5330fd9f 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -16,11 +16,6 @@ bool SVCall::operator<(const SVCall & other) const void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) { - // Catch underflow errors - if (start > 4000000000 || end > 4000000000) { - throw std::runtime_error("ERROR: Integer underflow for SV call at position " + std::to_string(start) + "-" + std::to_string(end)); - } - // Ignore unknown SV types if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { return; From 9e4367834156e424511626042a04fde373fe5714 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 12 Dec 2024 14:15:01 -0500 Subject: [PATCH 050/134] log memory usage 
--- include/cnv_caller.h | 2 +- include/utils.h | 2 + src/cnv_caller.cpp | 337 +++++++++++++++++-------------------------- src/contextsv.cpp | 17 +-- src/main.cpp | 12 +- src/sv_caller.cpp | 32 +++- src/sv_object.cpp | 6 +- src/utils.cpp | 12 ++ 8 files changed, 190 insertions(+), 230 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 457336e1..f17e807e 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -94,7 +94,7 @@ class CNVCaller { void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); // Calculate the mean chromosome coverage - std::pair> calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len); + double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map); // Calculate the log2 ratio for a region given the read depths and mean // chromosome coverage diff --git a/include/utils.h b/include/utils.h index 4ec19138..7311efbc 100644 --- a/include/utils.h +++ b/include/utils.h @@ -25,4 +25,6 @@ std::string getElapsedTime(std::chrono::high_resolution_clock::time_point start, std::string removeChrPrefix(std::string chr); +void printMemoryUsage(const std::string &functionName); + #endif // UTILS_H diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index eb217728..2d2c5548 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -2,7 +2,6 @@ #include "cnv_caller.h" #include - #include #include #include @@ -65,8 +64,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end sample_size = region_length; } - printMessage("Querying SNPs for region length " + std::to_string(region_length) + " bp with sample size " + std::to_string(sample_size) + "..."); - // std::set snp_pos(sample_size); std::vector snp_pos(sample_size, 0); std::vector snp_baf(sample_size, -1.0); @@ -89,77 +86,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end 
snp_data.pfb = std::move(snp_pfb); snp_data.log2_cov = std::move(snp_log2_cov); snp_data.is_snp = std::move(is_snp); - - /* - // Loop through the range of the SV region and query the SNPs in a sliding - // window, then calculate the log2 ratio for each window - for (uint32_t i = start_pos; i <= end_pos; i += window_size) - { - // Run a sliding non-overlapping window of size window_size across - // the SV region and calculate the log2 ratio for each window - uint32_t window_start = i; - uint32_t window_end = std::min(i + window_size - 1, end_pos); - - // Get the SNP info for the window - std::vector snp_window_pos; - std::vector snp_window_bafs; - std::vector snp_window_pfbs; - auto it_start = snp_pos.lower_bound(window_start); - auto it_end = snp_pos.upper_bound(window_end); - for (auto it = it_start; it != it_end; it++) - { - snp_window_pos.push_back(*it); - snp_window_bafs.push_back(snp_baf[*it]); - snp_window_pfbs.push_back(snp_pfb[*it]); - } - - // Loop though the SNP positions and calculate the log2 ratio for - // the window up to the SNP, then calculate the log2 ratio centered - // at the SNP, and finally calculate the log2 ratio for the window - // after the SNP, and continue until the end of the window - // (If there are no SNPs in the window, then use the default BAF and - // PFB values, and the coverage log2 ratio) - // If no SNPs, then calculate the log2 ratio for the window - if (snp_window_pos.size() == 0) - { - double window_log2_ratio = calculateLog2Ratio(window_start, window_end, pos_depth_map, mean_chr_cov); - double pfb_default = 0.5; - double baf_default = -1.0; // Use -1.0 to indicate no BAF data - this->updateSNPData(snp_data, (window_start + window_end) / 2, pfb_default, baf_default, window_log2_ratio, false); - - } else { - snps_found = true; - - // Loop through the SNPs and calculate the log2 ratios - for (int j = 0; j < (int) snp_window_pos.size(); j++) - { - // Just use a window centered at the SNP position - uint32_t bin_start = 
snp_window_pos[j] - window_size / 2; - uint32_t bin_end = snp_window_pos[j] + window_size / 2; - - // Trim the bin start and end to 1/2 the distance from the - // neighboring SNPs (or the start/end of the window) - if (j > 0) - { - bin_start = std::max(bin_start, (snp_window_pos[j-1] + snp_window_pos[j]) / 2); - } - - if (j < (int) snp_window_pos.size() - 1) - { - bin_end = std::min(bin_end, (snp_window_pos[j] + snp_window_pos[j+1]) / 2); - } - - // Calculate the log2 ratio for the SNP bin - double bin_cov = calculateLog2Ratio(bin_start, bin_end, pos_depth_map, mean_chr_cov); - this->updateSNPData(snp_data, snp_window_pos[j], snp_window_pfbs[j], snp_window_bafs[j], bin_cov, true); - - // Update the previous bin start - bin_start = bin_end + 1; - } - } - } - */ - // return std::make_pair(snp_data, snps_found); } std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) @@ -195,13 +121,14 @@ std::tuple CNVCaller::runCopyNumberPrediction // bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm - printMessage("Running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - start_pos)) + " bp, snp data found: " + std::to_string(snp_data.pos.size()) + "..."); + printMemoryUsage("Before running Viterbi algorithm, "); std::pair, double> prediction; runViterbi(hmm, snp_data, prediction); if (prediction.first.size() == 0) { return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); } + printMemoryUsage("After running Viterbi algorithm, "); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -244,7 +171,6 @@ std::tuple CNVCaller::runCopyNumberPrediction genotype = cnv_genotype_map[max_state]; } snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data - printMessage("Finished running Viterbi algorithm for SV with length " + std::to_string((int) (end_pos - 
start_pos)) + " bp: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", returning..."); // Save the SV calls as a TSV file if enabled bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); @@ -315,7 +241,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& state_sequence = prediction.first; double likelihood = prediction.second; - // printMessage("Finished running Viterbi algorithm for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); // Get all the states in the SV region // printMessage("Getting states for SV " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + "..."); @@ -417,9 +342,9 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 } // Calculate the mean chromosome coverage -std::pair> CNVCaller::calculateMeanChromosomeCoverage(std::string chr, uint32_t chr_len) +double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map) { - std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index + printMemoryUsage("Before calculating mean chromosome coverage, "); { // Lock the bam file std::lock_guard lock(this->bam_file_mtx); @@ -429,10 +354,8 @@ std::pair> CNVCaller::calculateMeanChromosomeCover samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) { - // throw std::runtime_error("ERROR: Could not open BAM file: " + - // bam_filepath); printError("ERROR: Could not open BAM file: " + bam_filepath); - return std::make_pair(0.0, chr_pos_depth_map); + return 0.0; } // Enable multi-threading @@ -444,8 +367,7 @@ std::pair> CNVCaller::calculateMeanChromosomeCover { sam_close(bam_file); printError("ERROR: Could not read header from BAM file: " + bam_filepath); - return std::make_pair(0.0, chr_pos_depth_map); - // throw std::runtime_error("ERROR: Could not read header from BAM file: " + bam_filepath); + 
return 0.0; } // Load the index @@ -454,10 +376,8 @@ std::pair> CNVCaller::calculateMeanChromosomeCover { bam_hdr_destroy(bam_header); sam_close(bam_file); - // throw std::runtime_error("ERROR: Could not load index for BAM - // file: " + bam_filepath); printError("ERROR: Could not load index for BAM file: " + bam_filepath); - return std::make_pair(0.0, chr_pos_depth_map); + return 0.0; } // Create an iterator for the chromosome @@ -467,11 +387,8 @@ std::pair> CNVCaller::calculateMeanChromosomeCover hts_idx_destroy(bam_index); bam_hdr_destroy(bam_header); sam_close(bam_file); - // throw std::runtime_error("ERROR: Could not create iterator for - // chromosome: " + chr + ", check if the chromosome exists in the - // BAM file."); printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the chromosome exists in the BAM file."); - return std::make_pair(0.0, chr_pos_depth_map); + return 0.0; } // Initialize the record @@ -482,14 +399,11 @@ std::pair> CNVCaller::calculateMeanChromosomeCover hts_idx_destroy(bam_index); bam_hdr_destroy(bam_header); sam_close(bam_file); - // throw std::runtime_error("ERROR: Could not initialize BAM - // record."); printError("ERROR: Could not initialize BAM record."); - return std::make_pair(0.0, chr_pos_depth_map); + return 0.0; } // Iterate through the chromosome and update the depth map - // std::unordered_map chr_pos_depth_map; while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) { // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads @@ -516,11 +430,8 @@ std::pair> CNVCaller::calculateMeanChromosomeCover try { chr_pos_depth_map[ref_pos + j]++; } catch (const std::out_of_range& oor) { - // std::cerr << "Out of range error for " << chr << - // ":" << ref_pos+j << std::endl; printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j)); } - // chr_pos_depth_map[ref_pos + j]++; } } @@ -531,8 +442,6 @@ std::pair> CNVCaller::calculateMeanChromosomeCover } else if (op == BAM_CINS || op == 
BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { // Do nothing } else { - // throw std::runtime_error("ERROR: Unknown CIGAR operation: - // " + std::to_string(op)); printError("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } } @@ -545,6 +454,7 @@ std::pair> CNVCaller::calculateMeanChromosomeCover bam_hdr_destroy(bam_header); sam_close(bam_file); } + printMemoryUsage("After calculating mean chromosome coverage, "); // Calculate the mean chromosome coverage for positions with non-zero depth uint64_t cum_depth = 0; @@ -564,7 +474,8 @@ std::pair> CNVCaller::calculateMeanChromosomeCover mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); } - return std::make_pair(mean_chr_cov, chr_pos_depth_map); + // return std::make_pair(mean_chr_cov, chr_pos_depth_map); + return mean_chr_cov; } double CNVCaller::calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov) @@ -625,6 +536,8 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) { + printMemoryUsage("Reading SNP allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); + // --------- SNP file --------- // Get the SNP file path std::string snp_filepath = this->input_data.getSNPFilepath(); @@ -659,6 +572,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not add SNP file to reader: " + snp_filepath); return; } + printMemoryUsage("After adding SNP file to reader, "); // Get the header bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0); @@ -672,74 +586,82 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // --------- Population allele frequency file --------- // Get the 
population allele frequency file path + bool use_pfb = true; std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); if (pfb_filepath.empty()) { - printError("ERROR: Population allele frequency file path is empty."); - return; + use_pfb = false; + // printError("ERROR: Population allele frequency file path is empty."); + // return; } - // Determine the ethnicity-specific allele frequency key - std::string AF_key = "AF"; - if (this->input_data.getEthnicity() != "") - { - AF_key += "_" + this->input_data.getEthnicity(); - } - - // Check if the filepath uses the 'chr' prefix notations based on the - // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz) - std::string chr_gnomad = chr; // gnomAD data may or may not have the 'chr' prefix - std::string chr_prefix = "chr"; - if (pfb_filepath.find(chr_prefix) == std::string::npos) + bcf_srs_t *pfb_reader = bcf_sr_init(); + std::string chr_gnomad; + std::string AF_key; + if (use_pfb) { - // Remove the 'chr' prefix from the chromosome name - if (chr_gnomad.find(chr_prefix) != std::string::npos) + // Determine the ethnicity-specific allele frequency key + AF_key = "AF"; + if (this->input_data.getEthnicity() != "") { - chr_gnomad = chr_gnomad.substr(chr_prefix.length()); + AF_key += "_" + this->input_data.getEthnicity(); } - } else { - // Add the 'chr' prefix to the chromosome name - if (chr_gnomad.find(chr_prefix) == std::string::npos) + + // Check if the filepath uses the 'chr' prefix notations based on the + // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz) + chr_gnomad = chr; // gnomAD data may or may not have the 'chr' prefix + std::string chr_prefix = "chr"; + if (pfb_filepath.find(chr_prefix) == std::string::npos) { - chr_gnomad = chr_prefix + chr; + // Remove the 'chr' prefix from the chromosome name + if (chr_gnomad.find(chr_prefix) != std::string::npos) + { + chr_gnomad = chr_gnomad.substr(chr_prefix.length()); + } + } else { + // Add the 'chr' prefix to the chromosome name + if (chr_gnomad.find(chr_prefix) 
== std::string::npos) + { + chr_gnomad = chr_prefix + chr; + } } - } - // Initialize the population allele frequency reader - bcf_srs_t *pfb_reader = bcf_sr_init(); - if (!pfb_reader) - { - bcf_sr_destroy(snp_reader); - printError("ERROR: Could not initialize population allele frequency reader."); - return; - } - pfb_reader->require_index = 1; + // Initialize the population allele frequency reader + if (!pfb_reader) + { + bcf_sr_destroy(snp_reader); + printError("ERROR: Could not initialize population allele frequency reader."); + return; + } + pfb_reader->require_index = 1; - // Set multi-threading if running on a single chromosome - if (this->input_data.getChromosome() != "") - { - int thread_count = this->input_data.getThreadCount() - 1; // Leave one thread for the main thread - printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2)); - bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); - } + // Set multi-threading if running on a single chromosome + if (this->input_data.getChromosome() != "") + { + int thread_count = this->input_data.getThreadCount() - 1; // Leave one thread for the main thread + printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2)); + bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); + } - // Add the population allele frequency file to the reader - if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) - { - bcf_sr_destroy(snp_reader); - bcf_sr_destroy(pfb_reader); - printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); - return; - } + // Add the population allele frequency file to the reader + if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) + { + bcf_sr_destroy(snp_reader); + bcf_sr_destroy(pfb_reader); + printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); + return; + } + printMemoryUsage("After adding population 
allele frequency file to reader, "); - // Get the header - bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); - if (!pfb_header) - { - bcf_sr_destroy(snp_reader); - bcf_sr_destroy(pfb_reader); - printError("ERROR: Could not get header for population allele frequency reader."); - return; + // Get the header + bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); + if (!pfb_header) + { + bcf_sr_destroy(snp_reader); + bcf_sr_destroy(pfb_reader); + printError("ERROR: Could not get header for population allele frequency reader."); + return; + } } // Split the region into samples @@ -759,6 +681,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Read the SNP data ---------------------------------------------- // Set the region + printMemoryUsage("Before setting region for SNP reader, "); std::string region_str = region_chunks[i]; if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { @@ -766,6 +689,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not set region for SNP reader: " + region_str); return; } + printMemoryUsage("After setting region for SNP reader, and before reading SNPs, "); // std::cout << "Iterating through SNPs in region " << region_str << // "..." 
<< std::endl; @@ -839,6 +763,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } } + printMemoryUsage("After reading SNPs for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); + if (snp_reader->errnum) { printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum))); @@ -850,70 +776,75 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui continue; } - // Read the population allele frequency data ---------------------- - - // Set the region as the SNP position - uint32_t target_snp_pos = snp_pos[i]; // Already 1-based - std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); - if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) - { - bcf_sr_destroy(snp_reader); - bcf_sr_destroy(pfb_reader); - printError("ERROR: Could not set region for population allele frequency reader: " + region_str); - return; - } + printMemoryUsage("Before reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); - // Find the SNP position in the population allele frequency file - while (bcf_sr_next_line(pfb_reader) > 0) + // Read the population allele frequency data ---------------------- + if (use_pfb) { - if (!bcf_sr_has_line(pfb_reader, 0)) + // Set the region as the SNP position + uint32_t target_snp_pos = snp_pos[i]; // Already 1-based + std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); + if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) { - continue; + bcf_sr_destroy(snp_reader); + bcf_sr_destroy(pfb_reader); + printError("ERROR: Could not set region for population allele frequency reader: " + region_str); + return; } - // pfb_record = bcf_sr_get_line(pfb_reader, 0); - bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); - // Do 
something with the record - if (pfb_record) + + // Find the SNP position in the population allele frequency file + while (bcf_sr_next_line(pfb_reader) > 0) { - // Skip if not a SNP - if (!bcf_is_snp(pfb_record)) + if (!bcf_sr_has_line(pfb_reader, 0)) { continue; } - - // Get the population frequency for the SNP - float *pfb_f = NULL; - int count = 0; - int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); - if (pfb_status < 0 || count == 0) + // pfb_record = bcf_sr_get_line(pfb_reader, 0); + bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); + // Do something with the record + if (pfb_record) { - continue; - } - double pfb = (double) pfb_f[0]; - free(pfb_f); + // Skip if not a SNP + if (!bcf_is_snp(pfb_record)) + { + continue; + } - // Continue if the population frequency is outside the threshold - if (pfb <= MIN_PFB || pfb >= MAX_PFB) - { - continue; - } + // Get the population frequency for the SNP + float *pfb_f = NULL; + int count = 0; + int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); + if (pfb_status < 0 || count == 0) + { + continue; + } + double pfb = (double) pfb_f[0]; + free(pfb_f); + + // Continue if the population frequency is outside the threshold + if (pfb <= MIN_PFB || pfb >= MAX_PFB) + { + continue; + } - // Add the population frequency to the SNP data - snp_pfb[i] = pfb; + // Add the population frequency to the SNP data + snp_pfb[i] = pfb; - // Break after finding the SNP position - break; + // Break after finding the SNP position + break; - if (print_count < 20) { - printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")"); - print_count++; + if (print_count < 20) { + printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + 
snp_region_str + ")"); + print_count++; + } } } + if (pfb_reader->errnum) + { + printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); + } } - if (pfb_reader->errnum) - { - printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); - } + printMemoryUsage("After reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); } bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 5a9e7ffd..107a6dae 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -19,19 +19,10 @@ ContextSV::ContextSV(InputData& input_data) int ContextSV::run() { - ReferenceGenome ref_genome = this->input_data.getRefGenome(); // Load the reference genome - SVCaller sv_caller(this->input_data); // Create an SV caller object - // SVCaller sv_caller(*this->input_data); // Create an SV caller object - // SVData sv_calls = sv_caller.run(); // Run the SV caller - // std::unordered_map> sv_calls = - // sv_caller.run(); // Run the SV caller - sv_caller.run(); // Run the SV caller - // std::string output_dir = this->input_data->getOutputDir(); // Get the output directory - - // std::cout << "Writing SV calls to file " << output_dir << "/output.vcf..." << std::endl; - // sv_caller.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file - // sv_calls.saveToVCF(ref_genome, output_dir); // Save the SV calls to a VCF file - std::cout << "SV calling complete." 
<< std::endl; + printMemoryUsage("Before creating SV caller, "); + SVCaller sv_caller(this->input_data); + printMemoryUsage("After creating SV caller, "); + sv_caller.run(); return 0; } diff --git a/src/main.cpp b/src/main.cpp index da0d8d93..c622c34d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -1,7 +1,5 @@ #include "swig_interface.h" -#include "input_data.h" -#include "version.h" /// @cond DOXYGEN_IGNORE #include @@ -9,8 +7,10 @@ // #include /// @endcond -// Placeholder for ContextSV library includes -// #include "ContextSV.h" +#include "input_data.h" +#include "version.h" +#include "utils.h" + void runContextSV(const std::unordered_map& args) { @@ -22,10 +22,13 @@ void runContextSV(const std::unordered_map& args) } // Set up input data + printMemoryUsage("Before setting up input data, "); InputData input_data; input_data.setLongReadBam(args.at("bam-file")); input_data.setShortReadBam(args.at("bam-file")); + printMemoryUsage("Before reading reference genome, "); input_data.setRefGenome(args.at("ref-file")); + printMemoryUsage("After reading reference genome, "); input_data.setSNPFilepath(args.at("snps-file")); input_data.setOutputDir(args.at("output-dir")); if (args.find("chr") != args.end()) { @@ -58,6 +61,7 @@ void runContextSV(const std::unordered_map& args) if (args.find("debug") != args.end()) { input_data.setVerbose(true); } + printMemoryUsage("After setting up input data, "); // Run ContextSV run(input_data); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 2e2295da..74a4f891 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -41,6 +41,8 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector& pos_depth_map) { + printMemoryUsage("Before detecting SVs from CIGAR strings, "); + // Create a read and 
iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { @@ -121,6 +123,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_itr_destroy(itr); bam_destroy1(bam1); + printMemoryUsage("After detecting SVs from CIGAR strings, "); + // return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); } @@ -333,12 +337,14 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls) { + printMemoryUsage("Before opening BAM file, "); // Open the BAM file std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (!fp_in) { throw std::runtime_error("ERROR: failed to open " + bam_filepath); } + printMemoryUsage("After opening BAM file, "); // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); @@ -354,6 +360,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v sam_close(fp_in); throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); } + printMemoryUsage("After loading index, "); // Split the chromosome into chunks for memory efficiency std::vector region_chunks; @@ -383,13 +390,17 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "..."); } + printMemoryUsage("After splitting chromosome into chunks, "); // Load chromosome data for copy number predictions // std::cout << "Loading chromosome data for copy number predictions..." 
<< std::endl; printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); - std::pair> chr_data = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_len); - if (chr_data.first == 0.0 || chr_data.second.size() == 0) { + printMemoryUsage("Before calculating mean chromosome coverage (top), "); + std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index + double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map); + printMemoryUsage("After calculating mean chromosome coverage (top), "); + if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); @@ -404,10 +415,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v for (const auto& sub_region : region_chunks) { current_region++; printMessage(chr + ": CIGAR SVs..."); + printMemoryUsage("Before detecting CIGAR SVs, "); PrimaryMap primary_map; SuppMap supp_map; std::vector subregion_sv_calls; - this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_data.second); + printMemoryUsage("After creating primary and supplementary maps, "); + this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_pos_depth_map); // std::set& subregion_sv_calls = std::get<0>(region_data); // PrimaryMap& primary_map = std::get<1>(region_data); // SuppMap& supp_map = std::get<2>(region_data); @@ -423,13 +436,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v if (region_sv_count > 0) { // std::cout << "Running copy number variant detection from CIGAR string SVs..." 
<< std::endl; printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, chr_data.first, chr_data.second); + cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map); } // Run split-read SV and copy number variant predictions // std::cout << "Detecting copy number variants from split reads..." << std::endl; printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, chr_data.first, chr_data.second); + this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; @@ -458,6 +471,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v void SVCaller::run() { + printMemoryUsage("Before running SV caller, "); // Get the chromosomes to process std::vector chromosomes; if (this->input_data.getChromosome() != "") { @@ -465,11 +479,13 @@ void SVCaller::run() } else { chromosomes = this->input_data.getRefGenomeChromosomes(); } - + + printMemoryUsage("After getting chromosomes, "); // Read the HMM from the file std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); + printMemoryUsage("After reading HMM, "); // Use multi-threading across chromosomes unless a single chromosome is // specified @@ -542,6 +558,8 @@ void SVCaller::run() // Detect SVs from split read alignments void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) { + printMemoryUsage("Before detecting SVs from split reads, "); + // Find split-read SV evidence 
int sv_count = 0; uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); @@ -697,6 +715,8 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap } } } + + printMemoryUsage("After detecting SVs from split reads, "); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls) diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 5330fd9f..aedb187c 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -90,7 +90,7 @@ void mergeSVs(std::vector& sv_calls) } // Merge SV calls if they overlap - int initial_size = sv_calls.size(); + // int initial_size = sv_calls.size(); // Merge any SV calls that have >90% reciprocal overlap std::vector merged_sv_calls; @@ -142,8 +142,8 @@ void mergeSVs(std::vector& sv_calls) merged_sv_calls.push_back(current_merge); // Add the last SV call sv_calls = merged_sv_calls; // Update the SV calls - int updated_size = sv_calls.size(); - std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; + // int updated_size = sv_calls.size(); + // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; } void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) diff --git a/src/utils.cpp b/src/utils.cpp index db083f97..bb82abbc 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -1,6 +1,8 @@ #include "utils.h" /// @cond +#include // getrusage +#include #include #include #include @@ -109,3 +111,13 @@ std::string removeChrPrefix(std::string chr) } return chr; } + +void printMemoryUsage(const std::string& functionName) { + struct rusage usage; + getrusage(RUSAGE_SELF, &usage); + + // Convert from KB to GB + double mem_usage_gb = (double)usage.ru_maxrss / 1024.0 / 1024.0; + std::cout << functionName << " memory usage: " + << std::fixed << std::setprecision(2) << mem_usage_gb << " GB" << std::endl; +} From 2d1337c3941472fd78069cd4dbcccd538f9b8545 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 14 Dec 
2024 18:40:35 -0500 Subject: [PATCH 051/134] improve mem eff --- include/cnv_caller.h | 26 +- include/fasta_query.h | 4 +- include/input_data.h | 36 +-- include/sv_caller.h | 41 ++- src/cnv_caller.cpp | 83 +---- src/fasta_query.cpp | 6 +- src/input_data.cpp | 34 +- src/sv_caller.cpp | 734 ++++++++++++++++++++++++++++++++---------- 8 files changed, 638 insertions(+), 326 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index f17e807e..12bbce80 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -45,7 +45,7 @@ struct SNPData { // CNVCaller: Detect CNVs and return the state sequence by SNP position class CNVCaller { private: - InputData& input_data; + const InputData& input_data; mutable std::mutex snp_file_mtx; // SNP file mutex mutable std::mutex pfb_file_mtx; // Population frequency file mutex mutable std::mutex bam_file_mtx; // BAM file mutex @@ -73,40 +73,32 @@ class CNVCaller { void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); - void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction); + void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const; // Query a region for SNPs and return the SNP data - void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data); - - void querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp); + void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const; // Split a region into chunks for parallel processing - std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count); + std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) 
const; public: - explicit CNVCaller(InputData& input_data); + explicit CNVCaller(const InputData& input_data); // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map); + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); - // Calculate the mean chromosome coverage double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map); - // Calculate the log2 ratio for a region given the read depths and mean - // chromosome coverage - double calculateLog2Ratio(uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov); - - void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2); + void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; - void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp); - // void readSNPPopulationFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pfb); + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& 
is_snp) const; // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions - void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood); + void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const; }; #endif // CNV_CALLER_H diff --git a/include/fasta_query.h b/include/fasta_query.h index ffa88d8a..b130117a 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -25,10 +25,10 @@ class ReferenceGenome { std::string getContigHeader() const; // Get the list of chromosomes, used for whole genome analysis - std::vector getChromosomes(); + std::vector getChromosomes() const; // Get the length of a chromosome - uint32_t getChromosomeLength(std::string chr); + uint32_t getChromosomeLength(std::string chr) const; }; #endif // FASTA_QUERY_H diff --git a/include/input_data.h b/include/input_data.h index 65051e77..43a9790b 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -23,39 +23,35 @@ class InputData { public: InputData(); - std::string getShortReadBam(); + std::string getShortReadBam() const; void setShortReadBam(std::string filepath); - std::string getLongReadBam(); + std::string getLongReadBam() const; void setLongReadBam(std::string filepath); // Set the filepath to the HMM parameters. void setHMMFilepath(std::string filepath); - std::string getHMMFilepath(); + std::string getHMMFilepath() const; // Set the filepath to the reference genome FASTA file. void setRefGenome(std::string fasta_filepath); // Return a reference to the ReferenceGenome object. const ReferenceGenome& getRefGenome() const; - - // Query the reference genome for a sequence. std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; // Get the chromosomes in the reference genome. 
- std::vector getRefGenomeChromosomes(); + std::vector getRefGenomeChromosomes() const; // Get a chromosome's length in the reference genome. - uint32_t getRefGenomeChromosomeLength(std::string chr); + uint32_t getRefGenomeChromosomeLength(std::string chr) const; // Set the filepath to the text file containing the locations of the // VCF files with population frequencies for each chromosome. void setAlleleFreqFilepaths(std::string filepath); - - // Get the chromosome's VCF filepath with population frequencies. - std::string getAlleleFreqFilepath(std::string chr); + std::string getAlleleFreqFilepath(std::string chr) const; // Get the population frequency map. // PFBMap getPFBMap(); @@ -63,36 +59,36 @@ class InputData { // Set the filepath to the VCF file with SNP calls used for CNV // detection with the HMM. void setSNPFilepath(std::string filepath); - std::string getSNPFilepath(); + std::string getSNPFilepath() const; // Set the ethnicity for SNP population frequencies. void setEthnicity(std::string ethnicity); - std::string getEthnicity(); + std::string getEthnicity() const; // Set the sample size for HMM predictions. void setSampleSize(int sample_size); - int getSampleSize(); + int getSampleSize() const; // Set the minimum CNV length to use for copy number predictions. void setMinCNVLength(int min_cnv_length); - int getMinCNVLength(); + int getMinCNVLength() const; // Set the chromosome to analyze. void setChromosome(std::string chr); - std::string getChromosome(); + std::string getChromosome() const; // Set the region to analyze. void setRegion(std::string region); - std::pair getRegion(); - bool isRegionSet(); + std::pair getRegion() const; + bool isRegionSet() const; // Set the output directory where the results will be written. void setOutputDir(std::string dirpath); - std::string getOutputDir(); + std::string getOutputDir() const; // Set the number of threads to use when parallelization is possible. 
void setThreadCount(int thread_count); - int getThreadCount(); + int getThreadCount() const; // Set the verbose flag to true if verbose output is desired. void setVerbose(bool verbose); @@ -101,7 +97,7 @@ class InputData { // Set whether to extend the SNP CNV regions around the SV breakpoints // (+/- 1/2 SV length), save a TSV file, and generate HTML reports. void saveCNVData(bool save_cnv_data); - bool getSaveCNVData(); + bool getSaveCNVData() const; private: std::string short_read_bam; diff --git a/include/sv_caller.h b/include/sv_caller.h index c0f9ce23..abd397d1 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -16,47 +16,58 @@ #include /// @endcond -// SV candidate alignment data (chr, start, end, sequence, query start, query -// end, mismatch map, strand) -using AlignmentData = std::tuple, bool>; -using AlignmentVector = std::vector; +struct GenomicRegion { + int tid; + hts_pos_t start; + hts_pos_t end; + bool strand; +}; -// Query map (query name, alignment vector) -using PrimaryMap = std::unordered_map; -using SuppMap = std::unordered_map; -// using RegionData = std::tuple; +struct MismatchData { + uint32_t query_start; + uint32_t query_end; + std::vector match_map; +}; class SVCaller { private: int min_sv_size = 50; // Minimum SV size to be considered int min_mapq = 20; // Minimum mapping quality to be considered - InputData& input_data; + const InputData& input_data; + + void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data) const; + + void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const; // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, 
uint32_t>& query_info, bool is_primary, const std::vector& pos_depth_map); + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map); void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); - void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector& pos_depth_map); + void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map); // Read the next alignment from the BAM file in a thread-safe manner - int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); + int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const; // Detect SVs from split alignments - void detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); + void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence - double calculateMismatchRate(const std::vector& mismatch_map, int32_t start, int32_t end); + double calculateMismatchRate(const MismatchData& mismatch_data) const; + + std::pair generateMatchMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, hts_itr_t *itr, std::vector& match_map) const; void 
saveToVCF(const std::unordered_map>& sv_calls); - void trimOverlappingAlignments(AlignmentData& primary_alignment, AlignmentData& supp_alignment); + void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const; + + // void trimOverlappingAlignments(uint32_t& primary_start, uint32_t& primary_end, uint32_t& supp_start, uint32_t& supp_end, const std::vector& primary_match_map, const std::vector& supp_match_map); // Calculate the read depth (INFO/DP) for a region int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 2d2c5548..ac79d598 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -34,13 +34,13 @@ using namespace sv_types; -CNVCaller::CNVCaller(InputData &input_data) +CNVCaller::CNVCaller(const InputData& input_data) : input_data(input_data) // Initialize the input data { } // Function to call the Viterbi algorithm for the CHMM -void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) +void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const { int data_count = (int) snp_data.pos.size(); if (data_count == 0) @@ -52,7 +52,7 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair& pos_depth_map, double mean_chr_cov, SNPData& snp_data) +void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const { // uint32_t window_size = (uint32_t)this->input_data.getWindowSize(); @@ -70,11 +70,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::vector snp_pfb(sample_size, 0.5); std::vector snp_log2_cov(sample_size, 0.0); std::vector is_snp(sample_size, false); - // std::unordered_map snp_baf(sample_size, -1.0); - // 
std::unordered_map snp_pfb(sample_size, 0.5); - - // Query the SNPs for the entire region - this->querySNPs(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp); + this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp); // Get the log2 ratio for evenly spaced positions in the // region @@ -88,7 +84,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) const { // Check that the start position is less than the end position if (start_pos >= end_pos) @@ -168,7 +164,7 @@ std::tuple CNVCaller::runCopyNumberPrediction if ((double) max_count / (double) state_count > pct_threshold) { predicted_cnv_type = getSVTypeFromCNState(max_state); - genotype = cnv_genotype_map[max_state]; + genotype = cnv_genotype_map.at(max_state); } snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data @@ -316,7 +312,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) +std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const { // Split the region into chunks std::vector region_chunks; @@ -478,39 +474,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& pos_depth_map, double mean_chr_cov) -{ - // Use the position and depth map to calculate the log2 ratio - double cum_depth = 0; - int pos_count = 0; - for (uint32_t i = start_pos; i <= end_pos; i++) - { - if (i < 
pos_depth_map.size() && pos_depth_map[i] > 0) - { - cum_depth += pos_depth_map[i]; - pos_count++; - } - } - - // Calculate the window coverage log2 ratio (0 if no positions) - double window_mean_cov = 0; - if (pos_count > 0) - { - window_mean_cov = (double) cum_depth / (double) pos_count; - } - - // Calculate the log2 ratio for the window - // Avoid log2(0) by using a small value - if (window_mean_cov == 0) - { - window_mean_cov = 0.0001; - } - double window_log2_ratio = log2(window_mean_cov / mean_chr_cov); - - return window_log2_ratio; -} - -void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& log2_region) +void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& log2_region) const { uint32_t region_length = end_pos - start_pos + 1; for (int i = 0; i < sample_size; i++) @@ -534,7 +498,7 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i } } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) const { printMemoryUsage("Reading SNP allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); @@ -850,7 +814,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui bcf_sr_destroy(pfb_reader); } -void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) +void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string 
filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const { // Open the TSV file for writing std::ofstream tsv_file(filepath); @@ -941,30 +905,3 @@ void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, doubl snp_data.log2_cov.emplace_back(log2_cov); snp_data.is_snp.emplace_back(is_snp); } - -void CNVCaller::querySNPs(std::string chr, uint32_t start, uint32_t end, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) -{ - std::string snp_chr = chr; - chr = removeChrPrefix(chr); - - // Query the SNP allele frequencies for the SNPs - // std::map> snp_map; - this->readSNPAlleleFrequencies(snp_chr, start, end, snp_pos, snp_baf, snp_pfb, is_snp); - - // Query the population frequencies for the SNPs - // std::unordered_map pfb_map; - // this->readSNPPopulationFrequencies(chr, start, end, snp_pfb); - - // Filter out the SNP population frequencies that are not in the SNP - // position set - // double pfb_default = 0.5; - // for (auto& pos : snp_pos) - // { - // if (pfb_map.find(pos) != pfb_map.end()) - // { - // snp_pfb[pos] = pfb_map[pos]; - // } else { - // snp_pfb[pos] = pfb_default; - // } - // } -} diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index ee220d1c..212343f0 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -161,12 +161,12 @@ std::string ReferenceGenome::getContigHeader() const return contig_header; } -std::vector ReferenceGenome::getChromosomes() +std::vector ReferenceGenome::getChromosomes() const { return this->chromosomes; } -uint32_t ReferenceGenome::getChromosomeLength(std::string chr) +uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const { - return this->chr_to_seq[chr].length(); + return this->chr_to_seq.at(chr).length(); } diff --git a/src/input_data.cpp b/src/input_data.cpp index 4f9ae124..381d5ac5 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -32,7 +32,7 @@ InputData::InputData() this->save_cnv_data = 
false; } -std::string InputData::getShortReadBam() +std::string InputData::getShortReadBam() const { return this->short_read_bam; } @@ -58,7 +58,7 @@ void InputData::setShortReadBam(std::string filepath) } } -std::string InputData::getLongReadBam() +std::string InputData::getLongReadBam() const { return this->long_read_bam; } @@ -100,17 +100,17 @@ std::string InputData::queryRefGenome(const std::string& chr, uint32_t pos_start return this->fasta_query.query(chr, pos_start, pos_end); } -std::vector InputData::getRefGenomeChromosomes() +std::vector InputData::getRefGenomeChromosomes() const { return this->fasta_query.getChromosomes(); } -uint32_t InputData::getRefGenomeChromosomeLength(std::string chr) +uint32_t InputData::getRefGenomeChromosomeLength(std::string chr) const { return this->fasta_query.getChromosomeLength(chr); } -std::string InputData::getOutputDir() +std::string InputData::getOutputDir() const { return this->output_dir; } @@ -124,7 +124,7 @@ void InputData::setOutputDir(std::string dirpath) system(cmd.c_str()); } -int InputData::getSampleSize() +int InputData::getSampleSize() const { return this->sample_size; } @@ -134,7 +134,7 @@ void InputData::setSampleSize(int sample_size) this->sample_size = sample_size; } -std::string InputData::getSNPFilepath() +std::string InputData::getSNPFilepath() const { return this->snp_vcf_filepath; } @@ -144,7 +144,7 @@ void InputData::setSNPFilepath(std::string filepath) this->snp_vcf_filepath = filepath; } -std::string InputData::getEthnicity() +std::string InputData::getEthnicity() const { return this->ethnicity; } @@ -154,7 +154,7 @@ void InputData::setEthnicity(std::string ethnicity) this->ethnicity = ethnicity; } -int InputData::getMinCNVLength() +int InputData::getMinCNVLength() const { return this->min_cnv_length; } @@ -169,7 +169,7 @@ void InputData::setChromosome(std::string chr) this->chr = chr; } -std::string InputData::getChromosome() +std::string InputData::getChromosome() const { return this->chr; } @@ 
-205,12 +205,12 @@ void InputData::setRegion(std::string region) } } -std::pair InputData::getRegion() +std::pair InputData::getRegion() const { return this->start_end; } -bool InputData::isRegionSet() +bool InputData::isRegionSet() const { return this->region_set; } @@ -299,14 +299,14 @@ void InputData::setAlleleFreqFilepaths(std::string filepath) } } -std::string InputData::getAlleleFreqFilepath(std::string chr) +std::string InputData::getAlleleFreqFilepath(std::string chr) const { // Remove the chr notation if (chr.find("chr") != std::string::npos) { chr = chr.substr(3, chr.size() - 3); } - return this->pfb_filepaths[chr]; + return this->pfb_filepaths.at(chr); } void InputData::setThreadCount(int thread_count) @@ -314,12 +314,12 @@ void InputData::setThreadCount(int thread_count) this->thread_count = thread_count; } -int InputData::getThreadCount() +int InputData::getThreadCount() const { return this->thread_count; } -std::string InputData::getHMMFilepath() +std::string InputData::getHMMFilepath() const { return this->hmm_filepath; } @@ -361,7 +361,7 @@ void InputData::saveCNVData(bool save_cnv_data) this->save_cnv_data = save_cnv_data; } -bool InputData::getSaveCNVData() +bool InputData::getSaveCNVData() const { return this->save_cnv_data; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 74a4f891..db4da271 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -33,14 +33,16 @@ SVCaller::SVCaller(InputData &input_data) { } -int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) +int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const { int ret = sam_itr_next(fp_in, itr, bam1); return ret; } -void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, PrimaryMap& primary_alignments, SuppMap& supplementary_alignments, const std::vector& pos_depth_map) +void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* 
bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const { + // std::map primary_map; + // std::map> supplementary_map; printMemoryUsage("Before detecting SVs from CIGAR strings, "); // Create a read and iterator for the region @@ -60,8 +62,11 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, throw std::runtime_error("ERROR: failed to query region " + region); } + uint32_t primary_count = 0; + uint32_t supplementary_count = 0; + // Main loop to process the alignments - int num_alignments = 0; + uint32_t num_alignments = 0; while (readNextAlignment(fp_in, itr, bam1) >= 0) { // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality @@ -72,50 +77,20 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Process primary alignments if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { - - // Get the primary alignment information - std::string chr = bamHdr->target_name[bam1->core.tid]; - uint32_t start = (uint32_t)bam1->core.pos; - uint32_t end = (uint32_t)bam_endpos(bam1); // This is the first position after the alignment - end--; // Adjust to the last position of the alignment - bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - - // Call SVs directly from the CIGAR string - std::tuple, uint32_t, uint32_t> query_info; - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, true, pos_depth_map); - // std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); - const std::vector& match_map = std::get<0>(query_info); - uint32_t query_start = std::get<1>(query_info); - uint32_t query_end = std::get<2>(query_info); - - // Add the primary alignment to the map - AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); - primary_alignments[qname] = alignment; + // primary_map[qname] = itr; + // Store chromosome (TID), start, and end positions (1-based) of 
the + // primary alignment, and the strand + primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)}; + primary_count++; // Process supplementary alignments } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { - - // Get the supplementary alignment information - std::string chr = bamHdr->target_name[bam1->core.tid]; - uint32_t start = bam1->core.pos; - uint32_t end = bam_endpos(bam1); // This is the first position after the alignment - end--; // Adjust to the last position of the alignment - bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - - // Get CIGAR string information, but don't call SVs - // std::tuple, int32_t, int32_t> query_info = - // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); - std::tuple, uint32_t, uint32_t> query_info; - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, query_info, false, pos_depth_map); - const std::vector& match_map = std::get<0>(query_info); - uint32_t query_start = std::get<1>(query_info); - uint32_t query_end = std::get<2>(query_info); - - // Add the supplementary alignment to the map - AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); - supplementary_alignments[qname].emplace_back(alignment); + // supp_map[qname].push_back(itr); + // Store chromosome (TID), start, and end positions (1-based) of the + // supplementary alignment, and the strand + supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)}); + supplementary_count++; } - num_alignments++; } @@ -124,12 +99,215 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, bam_destroy1(bam1); printMemoryUsage("After detecting SVs from CIGAR strings, "); + printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); +} 
+ +void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data) const +{ + // Create a read and iterator for the region + bam1_t *bam1 = bam_init1(); + if (!bam1) { + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to initialize BAM record"); + } + hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end); + if (!itr) { + bam_destroy1(bam1); + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end)); + } + + // Read the alignment + if (readNextAlignment(fp_in, itr, bam1) < 0) { + bam_destroy1(bam1); + hts_itr_destroy(itr); + printError("ERROR: failed to read alignment"); + return; + } + + // Main loop to process the alignments + std::vector match_map; + uint32_t query_start = 0; + uint32_t query_end = 0; + uint32_t query_pos = 0; + bool first_op = true; + + // Process mismatches in the CIGAR string + const std::string chr = bamHdr->target_name[bam1->core.tid]; + hts_pos_t pos = bam1->core.pos; // 0-based position + uint32_t* cigar = bam_get_cigar(bam1); // CIGAR array + int cigar_len = bam1->core.n_cigar; + for (int i = 0; i < cigar_len; i++) { + int op = bam_cigar_op(cigar[i]); // CIGAR operation + int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length + + // Update match/mismatch query map + int MATCH = 1; + int MISMATCH = -1; + if (op == BAM_CEQUAL) { + for (int j = 0; j < op_len; j++) { + match_map[query_pos + j] = MATCH; + } + } else if (op == BAM_CDIFF) { + for (int j = 0; j < op_len; j++) { + match_map[query_pos + j] = MISMATCH; + } + } else if (op == BAM_CMATCH) { + // Get the read sequence + uint8_t* seq_ptr = bam_get_seq(bam1); + std::string cmatch_seq_str = ""; + for (int j = 0; j < op_len; j++) { + 
cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; + } - // return std::make_tuple(sv_calls, primary_alignments, supplementary_alignments); + // Get the corresponding reference sequence + int cmatch_pos = pos + 1; // Querying the reference genome is 1-based + // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); + std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); + + // Check that the two sequence lengths are equal + if (cmatch_seq_str.length() != cmatch_ref_str.length()) { + throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); + } + + // Compare the two sequences and update the mismatch map + for (int j = 0; j < op_len; j++) { + if (cmatch_seq_str[j] != cmatch_ref_str[j]) { + match_map[query_pos + j] = MISMATCH; + } else { + match_map[query_pos + j] = MATCH; + } + } + } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) { + query_start = query_pos + op_len; + first_op = false; + } + + // Update the reference position + // https://samtools.github.io/hts-specs/SAMv1.pdf + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + pos += op_len; + + // Update the query position + } else if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + query_pos += op_len; + } + } + query_end = query_pos; + + // Clean up the iterator and alignment + hts_itr_destroy(itr); + bam_destroy1(bam1); + + // Update the mismatch data + mismatch_data.query_start = query_start; + mismatch_data.query_end = query_end; + mismatch_data.match_map = std::move(match_map); } -double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, int32_t start, int32_t end) +void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& 
region, std::vector& sv_calls, const std::vector& pos_depth_map) { + printMemoryUsage("Before detecting SVs from CIGAR strings, "); + + // Create a read and iterator for the region + bam1_t *bam1 = bam_init1(); + if (!bam1) { + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to initialize BAM record"); + } + hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); + if (!itr) { + bam_destroy1(bam1); + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + throw std::runtime_error("ERROR: failed to query region " + region); + } + + // Main loop to process the alignments + while (readNextAlignment(fp_in, itr, bam1) >= 0) { + + // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality + if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { + continue; + } + // const std::string qname = bam_get_qname(bam1); // Query template name + + // Process the alignment + bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); + this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map); + // if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { + + // // Get the primary alignment information + // // std::string chr = bamHdr->target_name[bam1->core.tid]; + // // uint32_t start = (uint32_t)bam1->core.pos; + // // uint32_t end = (uint32_t)bam_endpos(bam1); // This is the first position after the alignment + // // end--; // Adjust to the last position of the alignment + // // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + + // // Call SVs directly from the CIGAR string + // // std::tuple, uint32_t, uint32_t> query_info; + // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true, pos_depth_map); + // // std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); + // // const std::vector& match_map = 
std::get<0>(query_info); + // // uint32_t query_start = std::get<1>(query_info); + // // uint32_t query_end = std::get<2>(query_info); + + // // Add the primary alignment to the map + // // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); + // // primary_alignments[qname] = alignment; + + // // Add the iterator to the primary map + // // primary_map[qname] = itr; + + // // Process supplementary alignments + // } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { + + // // Get the supplementary alignment information + // // std::string chr = bamHdr->target_name[bam1->core.tid]; + // // uint32_t start = bam1->core.pos; + // // uint32_t end = bam_endpos(bam1); // This is the first position after the alignment + // // end--; // Adjust to the last position of the alignment + // // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); + + // // Get CIGAR string information, but don't call SVs + // // std::tuple, int32_t, int32_t> query_info = + // // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); + // // std::tuple, uint32_t, uint32_t> query_info; + // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false, pos_depth_map); + // // const std::vector& match_map = std::get<0>(query_info); + // // uint32_t query_start = std::get<1>(query_info); + // // uint32_t query_end = std::get<2>(query_info); + + // // Add the supplementary alignment to the map + // // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); + // // supplementary_alignments[qname].emplace_back(alignment); + + // // Add the iterator to the supplementary map + // // supplementary_map[qname].push_back(itr); + // } + } + + // Clean up the iterator and alignment + hts_itr_destroy(itr); + bam_destroy1(bam1); + + printMemoryUsage("After detecting SVs from CIGAR strings, "); +} + +// double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, +// int32_t start, int32_t end) +double SVCaller::calculateMismatchRate(const 
MismatchData& mismatch_data) const +{ + int start = mismatch_data.query_start; + int end = mismatch_data.query_end; + const std::vector& mismatch_map = mismatch_data.match_map; start = std::max(start, 0); end = std::min(end, (int32_t)mismatch_map.size() - 1); int match_count = 0; @@ -153,8 +331,117 @@ double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, int return mismatch_rate; } +// { +// start = std::max(start, 0); +// end = std::min(end, (int32_t)mismatch_map.size() - 1); +// int match_count = 0; +// int mismatch_count = 0; +// int MATCH = 1; +// int MISMATCH = -1; +// for (int i = start; i <= end; i++) { +// if (mismatch_map[i] == MATCH) { +// match_count++; +// } else if (mismatch_map[i] == MISMATCH) { +// mismatch_count++; +// } +// } + +// // Avoid division by zero +// if (match_count + mismatch_count == 0) { +// return 0.0; +// } + +// double mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); + +// return mismatch_rate; +// } + +std::pair SVCaller::generateMatchMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_itr_t* itr, std::vector& match_map) const +{ + // Create a read and iterator for the region + bam1_t *bam1 = bam_init1(); + if (!bam1) { + printError("ERROR: failed to initialize BAM record"); + return std::make_pair(0, 0); + } + + // Read the alignment + if (readNextAlignment(fp_in, itr, bam1) < 0) { + bam_destroy1(bam1); + printError("ERROR: failed to read alignment"); + return std::make_pair(0, 0); + } + + // Main loop to process the alignments + std::string chr = bamHdr->target_name[bam1->core.tid]; // Chromosome name + uint32_t pos = (uint32_t)bam1->core.pos; // Leftmost position of the alignment in the reference genome (0-based) + uint32_t query_pos = 0; + uint32_t query_start = 0; + uint32_t query_end = 0; + bool first_op = true; + + // Get the CIGAR string + uint32_t* cigar = bam_get_cigar(bam1); // CIGAR array + int cigar_len = bam1->core.n_cigar; + for (int i = 0; i < 
cigar_len; i++) { + int op = bam_cigar_op(cigar[i]); // CIGAR operation + int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length + + // Update match/mismatch query map + int MATCH = 1; + int MISMATCH = -1; + if (op == BAM_CEQUAL) { + for (int j = 0; j < op_len; j++) { + match_map[query_pos + j] = MATCH; + } + } else if (op == BAM_CDIFF) { + for (int j = 0; j < op_len; j++) { + match_map[query_pos + j] = MISMATCH; + } + } else if (op == BAM_CMATCH) { + // Get the read sequence + uint8_t* seq_ptr = bam_get_seq(bam1); + std::string cmatch_seq_str = ""; + for (int j = 0; j < op_len; j++) { + cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; + } + + // Get the corresponding reference sequence + int cmatch_pos = pos + 1; // Querying the reference genome is 1-based + // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); + std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); + + // Check that the two sequence lengths are equal + if (cmatch_seq_str.length() != cmatch_ref_str.length()) { + throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); + } + + // Compare the two sequences and update the mismatch map + for (int j = 0; j < op_len; j++) { + if (cmatch_seq_str[j] != cmatch_ref_str[j]) { + match_map[query_pos + j] = MISMATCH; + } else { + match_map[query_pos + j] = MATCH; + } + } + } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) { + query_start = query_pos + op_len; + first_op = false; + } + + // Update the query position + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + query_pos += op_len; + } + } + query_end = query_pos; + + bam_destroy1(bam1); // Clean up the alignment + + return std::make_pair(query_start, query_end); +} -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, 
bam1_t* alignment, std::vector& sv_calls, std::tuple, uint32_t, uint32_t>& query_info, bool is_primary, const std::vector& pos_depth_map) +void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) @@ -163,7 +450,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec uint32_t query_pos = 0; // std::unordered_map query_match_map; // Query position to // match/mismatch (1/0) map - std::vector query_match_map(alignment->core.l_qseq, 0); // Query position to match/mismatch (1/0) map + // std::vector query_match_map(alignment->core.l_qseq, 0); // Query position to match/mismatch (1/0) map // Loop through the CIGAR string, process operations, detect SVs (primary // only), update clipped base support, calculate sequence identity for @@ -171,9 +458,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // the clipped base support and mismatch rate uint32_t ref_pos; uint32_t ref_end; - uint32_t query_start = 0; // First alignment position in the query - uint32_t query_end = 0; // Last alignment position in the query - bool first_op = false; // First alignment operation for the query + // uint32_t query_start = 0; // First alignment position in the query + // uint32_t query_end = 0; // Last alignment position in the query + // bool first_op = false; // First alignment operation for the query double default_lh = 0.0; for (int i = 0; i < cigar_len; i++) { @@ -265,50 +552,50 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // sv_calls.updateClippedBaseSupport(chr, pos); // Update clipped base support // Update the query alignment start position - if (!first_op) { - query_start = query_pos + op_len; - 
first_op = true; - } + // if (!first_op) { + // query_start = query_pos + op_len; + // first_op = true; + // } } - // Update match/mismatch query map - int MATCH = 1; - int MISMATCH = -1; - if (op == BAM_CEQUAL) { - for (int j = 0; j < op_len; j++) { - query_match_map[query_pos + j] = MATCH; - } - } else if (op == BAM_CDIFF) { - for (int j = 0; j < op_len; j++) { - query_match_map[query_pos + j] = MISMATCH; - } - } else if (op == BAM_CMATCH) { - // Get the read sequence - uint8_t* seq_ptr = bam_get_seq(alignment); - std::string cmatch_seq_str = ""; - for (int j = 0; j < op_len; j++) { - cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; - } - - // Get the corresponding reference sequence - int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); - std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); - - // Check that the two sequence lengths are equal - if (cmatch_seq_str.length() != cmatch_ref_str.length()) { - throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); - } - - // Compare the two sequences and update the mismatch map - for (int j = 0; j < op_len; j++) { - if (cmatch_seq_str[j] != cmatch_ref_str[j]) { - query_match_map[query_pos + j] = MISMATCH; - } else { - query_match_map[query_pos + j] = MATCH; - } - } - } + // // Update match/mismatch query map + // int MATCH = 1; + // int MISMATCH = -1; + // if (op == BAM_CEQUAL) { + // for (int j = 0; j < op_len; j++) { + // query_match_map[query_pos + j] = MATCH; + // } + // } else if (op == BAM_CDIFF) { + // for (int j = 0; j < op_len; j++) { + // query_match_map[query_pos + j] = MISMATCH; + // } + // } else if (op == BAM_CMATCH) { + // // Get the read sequence + // uint8_t* seq_ptr = bam_get_seq(alignment); + // std::string cmatch_seq_str = ""; 
+ // for (int j = 0; j < op_len; j++) { + // cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; + // } + + // // Get the corresponding reference sequence + // int cmatch_pos = pos + 1; // Querying the reference genome is 1-based + // // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); + // std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); + + // // Check that the two sequence lengths are equal + // if (cmatch_seq_str.length() != cmatch_ref_str.length()) { + // throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); + // } + + // // Compare the two sequences and update the mismatch map + // for (int j = 0; j < op_len; j++) { + // if (cmatch_seq_str[j] != cmatch_ref_str[j]) { + // query_match_map[query_pos + j] = MISMATCH; + // } else { + // query_match_map[query_pos + j] = MATCH; + // } + // } + // } // Update the reference coordinate based on the CIGAR operation // https://samtools.github.io/hts-specs/SAMv1.pdf @@ -330,9 +617,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } } - query_end = query_pos; // Last alignment position in the query + // query_end = query_pos; // Last alignment position in the query - query_info = std::tuple, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end); + // query_info = std::tuple, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end); } void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls) @@ -414,13 +701,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v int filter_threshold = 4; for (const auto& sub_region : region_chunks) { current_region++; + + // Detect SVs from the CIGAR strings printMessage(chr + ": CIGAR SVs..."); - printMemoryUsage("Before detecting CIGAR 
SVs, "); - PrimaryMap primary_map; - SuppMap supp_map; - std::vector subregion_sv_calls; - printMemoryUsage("After creating primary and supplementary maps, "); - this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, primary_map, supp_map, chr_pos_depth_map); + std::vector subregion_sv_calls; + this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, chr_pos_depth_map); + // std::set& subregion_sv_calls = std::get<0>(region_data); // PrimaryMap& primary_map = std::get<1>(region_data); // SuppMap& supp_map = std::get<2>(region_data); @@ -442,7 +728,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Run split-read SV and copy number variant predictions // std::cout << "Detecting copy number variants from split reads..." << std::endl; printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); + this->detectSVsFromSplitReads(sub_region, fp_in, idx, bamHdr, subregion_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); + // this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." 
<< std::endl; @@ -556,45 +843,105 @@ void SVCaller::run() // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap& primary_map, SuppMap& supp_map, CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) +void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) { printMemoryUsage("Before detecting SVs from split reads, "); + + printMessage("Getting split alignments..."); + // std::map primary_map; + // std::map> supp_map; + std::unordered_map primary_map; + std::unordered_map> supp_map; + this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); + + printMessage("[TEST] Primary map size: " + std::to_string(primary_map.size())); + printMessage("[TEST] Supplementary map size: " + std::to_string(supp_map.size())); + // Find split-read SV evidence int sv_count = 0; uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); - for (const auto& entry : primary_map) { - std::string qname = entry.first; - AlignmentData primary_alignment = entry.second; - std::string primary_chr = std::get<0>(primary_alignment); - uint32_t primary_start = std::get<1>(primary_alignment); - uint32_t primary_end = std::get<2>(primary_alignment); + for (auto& entry : primary_map) { + // std::string qname = entry.first; + const std::string& qname = entry.first; + GenomicRegion& primary_region = entry.second; + // AlignmentData primary_alignment = entry.second; + // std::string primary_chr = std::get<0>(primary_alignment); + // uint32_t primary_start = std::get<1>(primary_alignment); + // uint32_t primary_end = std::get<2>(primary_alignment); + + // Get the primary alignment information + // std::string primary_chr = bamHdr->target_name[primary_bam1->core.tid]; + // uint32_t primary_start 
= (uint32_t) primary_bam1->core.pos; + // uint32_t primary_end = (uint32_t) bam_endpos(primary_bam1) - 1; // Last alignment position + // bool primary_fwd_strand = !(primary_bam1->core.flag & BAM_FREVERSE); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { continue; } + // Get the read match/mismatch map + MismatchData primary_mismatches; + this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches); + // std::vector match_map(primary_region.end - primary_region.start + // + 1, 0); + // this->getMatchMismatchMap(fp_in, idx, bamHdr, primary_region, mismatch_data); + + // std::pair query_info = generateMatchMismatchMap(fp_in, idx, bamHdr, primary_itr, match_map); + // Find the largest supplementary alignment, and also identify // inversions // printMessage("Finding largest supplementary alignment..."); - AlignmentData largest_supp_alignment = supp_map[qname][0]; + // AlignmentData largest_supp_region = supp_map[qname][0]; + // hts_itr_t* largest_supp_itr = supp_map[qname][0]; + GenomicRegion largest_supp_region = supp_map[qname][0]; uint32_t largest_supp_length = 0; + const std::string& primary_chr = bamHdr->target_name[primary_region.tid]; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - const auto& supp_chr = std::get<0>(*it); - if (primary_chr != supp_chr) { - continue; // Skip supplementary alignments on different chromosomes + GenomicRegion& supp_region = *it; + // Get the supplementary alignment information + // bam1_t* supp_bam1 = bam_init1(); + // if (!supp_bam1) { + // throw std::runtime_error("ERROR: failed to initialize BAM record"); + // } + // if (sam_itr_next(fp_in, *it, supp_bam1) < 0) { + // bam_destroy1(supp_bam1); + // throw std::runtime_error("ERROR: failed to read alignment"); + // } + + // Skip if not on the primary chromosome + if (primary_region.tid != supp_region.tid) { + continue; } - uint32_t supp_start = std::get<1>(*it); 
- uint32_t supp_end = std::get<2>(*it); + + // std::string supp_chr = bamHdr->target_name[supp_bam1->core.tid]; + // uint32_t supp_start = (uint32_t) supp_bam1->core.pos; + // uint32_t supp_end = (uint32_t) bam_endpos(supp_bam1) - 1; // Last alignment position + // uint32_t supp_length = supp_end - supp_start + 1; + + // const auto& supp_chr = std::get<0>(*it); + // if (primary_chr != supp_chr) { + // continue; // Skip supplementary alignments on different chromosomes + // } + // uint32_t supp_start = std::get<1>(*it); + // uint32_t supp_end = std::get<2>(*it); + // uint32_t supp_length = supp_end - supp_start + 1; + + // Get the supplementary alignment information + uint32_t supp_start = (uint32_t) supp_region.start; + uint32_t supp_end = (uint32_t) supp_region.end; uint32_t supp_length = supp_end - supp_start + 1; if (supp_length > largest_supp_length) { largest_supp_length = supp_length; - largest_supp_alignment = *it; + largest_supp_region = *it; } // Inversion detection - bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it); + // bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it); + // bool supp_fwd_strand = !(supp_bam1->core.flag & BAM_FREVERSE); + // bool is_opposite_strand = primary_fwd_strand != supp_fwd_strand; + bool is_opposite_strand = primary_region.strand != supp_region.strand; if (is_opposite_strand) { if (supp_length >= min_cnv_length) { @@ -630,31 +977,42 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap } // Trim overlapping alignments - uint32_t supp_start = std::get<1>(largest_supp_alignment); - uint32_t supp_end = std::get<2>(largest_supp_alignment); - bool primary_before_supp = primary_start < supp_start; - trimOverlappingAlignments(primary_alignment, largest_supp_alignment); + MismatchData supp_mismatches; + this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches); + // uint32_t supp_start = std::get<1>(largest_supp_region); + // 
uint32_t supp_end = std::get<2>(largest_supp_region); + // bool primary_before_supp = primary_start < supp_start; + // trimOverlappingAlignments(primary_alignment, largest_supp_region); + trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); // Create the SV candidate using both alignments - supp_start = std::get<1>(largest_supp_alignment); - supp_end = std::get<2>(largest_supp_alignment); - primary_start = std::get<1>(primary_alignment); - primary_end = std::get<2>(primary_alignment); + // supp_start = std::get<1>(largest_supp_region); + // supp_end = std::get<2>(largest_supp_region); + // primary_start = std::get<1>(primary_alignment); + // primary_end = std::get<2>(primary_alignment); bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; - if (primary_before_supp) { - boundary_left = primary_start+1; - // boundary_right = supp_end+1; - boundary_right = std::max(primary_end, supp_end)+1; - gap_left = primary_end+1; - gap_right = supp_start+1; + if (primary_region.start < largest_supp_region.start) { // Primary before supp + // boundary_left = primary_start+1; + // boundary_right = std::max(primary_end, supp_end)+1; + // gap_left = primary_end+1; + // gap_right = supp_start+1; + // gap_exists = gap_left < gap_right; + boundary_left = primary_region.start + 1; + boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1; + gap_left = primary_region.end + 1; + gap_right = largest_supp_region.start + 1; gap_exists = gap_left < gap_right; } else { - boundary_left = supp_start+1; - // boundary_right = primary_end+1; - boundary_right = std::max(primary_end, supp_end)+1; - gap_left = supp_end+1; - gap_right = primary_start+1; + // boundary_left = supp_start+1; + // boundary_right = std::max(primary_end, supp_end)+1; + // gap_left = supp_end+1; + // gap_right = primary_start+1; + // gap_exists = gap_left < gap_right; + boundary_left = largest_supp_region.start + 1; + 
boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1; + gap_left = largest_supp_region.end + 1; + gap_right = primary_region.start + 1; gap_exists = gap_left < gap_right; } @@ -668,7 +1026,9 @@ void SVCaller::detectSVsFromSplitReads(std::vector& sv_calls, PrimaryMap continue; } - // printMessage("Running copy number prediction on boundary: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); + // printMessage("Running copy number prediction on boundary: " + + // primary_chr + ":" + std::to_string(boundary_left) + "-" + + // std::to_string(boundary_right)); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -888,85 +1248,101 @@ void SVCaller::saveToVCF(const std::unordered_map(primary_alignment); - uint32_t primary_alignment_end = std::get<2>(primary_alignment); - uint32_t supp_alignment_start = std::get<1>(supp_alignment); - uint32_t supp_alignment_end = std::get<2>(supp_alignment); - uint32_t primary_query_start = std::get<3>(primary_alignment); - uint32_t primary_query_end = std::get<4>(primary_alignment); - uint32_t supp_query_start = std::get<3>(supp_alignment); - uint32_t supp_query_end = std::get<4>(supp_alignment); - const std::vector& primary_match_map = std::get<5>(primary_alignment); - const std::vector& supp_match_map = std::get<5>(supp_alignment); + // uint32_t primary_alignment_start = std::get<1>(primary_alignment); + // uint32_t primary_alignment_end = std::get<2>(primary_alignment); + // uint32_t supp_alignment_start = std::get<1>(supp_alignment); + // uint32_t supp_alignment_end = std::get<2>(supp_alignment); + // uint32_t primary_query_start = std::get<3>(primary_alignment); + // uint32_t primary_query_end = std::get<4>(primary_alignment); + // uint32_t supp_query_start = std::get<3>(supp_alignment); + // uint32_t supp_query_end = 
std::get<4>(supp_alignment); + // const std::vector& primary_match_map = std::get<5>(primary_alignment); + // const std::vector& supp_match_map = std::get<5>(supp_alignment); // Check for overlapping read alignments - bool primary_before_supp = primary_query_start < supp_query_start; - if (primary_before_supp) { + // bool primary_before_supp = primary_query_start < supp_query_start; + if (primary_mismatches.query_start < supp_mismatches.query_start) { // Primary before supplementary in the query - if (primary_query_end >= supp_query_start) { + + // if (primary_query_end >= supp_query_start) { + if (primary_mismatches.query_end >= supp_mismatches.query_start) { // Calculate the mismatch rates at the overlapping region - double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, supp_query_start, primary_query_end); - double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, supp_query_start, primary_query_end); - uint32_t overlap_length = primary_query_end - supp_query_start + 1; + double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches); + double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches); + // uint32_t overlap_length = primary_query_end - supp_query_start + + // 1; + hts_pos_t overlap_length = primary_mismatches.query_end - supp_mismatches.query_start + 1; // Trim the ailgnment with the higher mismatch rate if (primary_mismatch_rate > supp_mismatch_rate) { // Trim the end of the primary alignment, ensuring that the new // end is not less than the start - if (primary_alignment_end > overlap_length && (primary_alignment_end - overlap_length) > primary_alignment_start) { + // if (primary_alignment_end > overlap_length && + // (primary_alignment_end - overlap_length) > + // primary_alignment_start) { + if (primary_alignment.end > overlap_length && (primary_alignment.end - overlap_length) > primary_alignment.start) { // Trim the end of the primary alignment - uint32_t new_end = 
primary_alignment_end - overlap_length; - std::get<2>(primary_alignment) = new_end; + // uint32_t new_end = primary_alignment_end - overlap_length; + // std::get<2>(primary_alignment) = new_end; + primary_alignment.end = primary_alignment.end - overlap_length; } - // std::get<2>(primary_alignment) = primary_alignment_end - overlap_length; } else { // Trim the beginning of the supplementary alignment, ensuring // that the new start is not greater than the end - if (supp_alignment_start + overlap_length < supp_alignment_end) { + // if (supp_alignment_start + overlap_length < + // supp_alignment_end) { + if (supp_alignment.start + overlap_length < supp_alignment.end) { // Trim the beginning of the supplementary alignment - uint32_t new_start = supp_alignment_start + overlap_length; - std::get<1>(supp_alignment) = new_start; + // uint32_t new_start = supp_alignment_start + overlap_length; + // std::get<1>(supp_alignment) = new_start; + supp_alignment.start = supp_alignment.start + overlap_length; } - // uint32_t new_start = supp_alignment_start + overlap_length; - // std::get<1>(supp_alignment) = new_start; - // std::get<1>(supp_alignment) = supp_alignment_start + overlap_length; } } + + // } else if (supp_mismatches.query_end >= primary_mismatches.query_start) { } else { // Supplementary before primary in the query - if (supp_query_end >= primary_query_start) { + if (primary_mismatches.query_start <= supp_mismatches.query_end) { // Calculate the mismatch rates at the overlapping region - double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end); - double supp_mismatch_rate = this->calculateMismatchRate(supp_match_map, primary_query_start, supp_query_end); - uint32_t overlap_length = supp_query_end - primary_query_start + 1; + // double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end); + // double supp_mismatch_rate = + // 
this->calculateMismatchRate(supp_match_map, primary_query_start, + // supp_query_end); + double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches); + double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches); + // hts_pos_t overlap_length = supp_query_end - primary_query_start + + // 1; + hts_pos_t overlap_length = supp_mismatches.query_end - primary_mismatches.query_start + 1; // Trim the ailgnment with the higher mismatch rate if (supp_mismatch_rate > primary_mismatch_rate) { // Trim the end of the supplementary alignment, ensuring that // the new end is not less than the start - if (supp_alignment_end > overlap_length && (supp_alignment_end - overlap_length) > supp_alignment_start) { + // if (supp_alignment_end > overlap_length && + // (supp_alignment_end - overlap_length) > supp_alignment_start) + // { + if (supp_alignment.end > overlap_length && (supp_alignment.end - overlap_length) > supp_alignment.start) { // Trim the end of the supplementary alignment - uint32_t new_end = supp_alignment_end - overlap_length; - std::get<2>(supp_alignment) = new_end; + // uint32_t new_end = supp_alignment_end - overlap_length; + // std::get<2>(supp_alignment) = new_end; + supp_alignment.end = supp_alignment.end - overlap_length; } - // uint32_t new_end = supp_alignment_end > overlap_length ? 
supp_alignment_end - overlap_length : 0; - // std::get<2>(supp_alignment) = new_end; - // std::get<2>(supp_alignment) = supp_alignment_end - overlap_length; } else { // Trim the beginning of the primary alignment, ensuring that // the new start is not greater than the end - if (primary_alignment_start + overlap_length < primary_alignment_end) { + // if (primary_alignment_start + overlap_length < + // primary_alignment_end) { + if (primary_alignment.start + overlap_length < primary_alignment.end) { // Trim the beginning of the primary alignment - uint32_t new_start = primary_alignment_start + overlap_length; - std::get<1>(primary_alignment) = new_start; + // uint32_t new_start = primary_alignment_start + overlap_length; + // std::get<1>(primary_alignment) = new_start; + primary_alignment.start = primary_alignment.start + overlap_length; } - // uint32_t new_start = primary_alignment_start + overlap_length; - // std::get<1>(primary_alignment) = new_start; - // std::get<1>(primary_alignment) = primary_alignment_start + overlap_length; } } } From 56507d49d44b032ac026e5ae8e419d4e0f91ea4a Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 15 Dec 2024 14:21:23 -0500 Subject: [PATCH 052/134] Fix vector error --- src/sv_caller.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index db4da271..1d966a19 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -130,7 +130,7 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t } // Main loop to process the alignments - std::vector match_map; + std::vector match_map(bam1->core.l_qseq, 0); // Query position to match/mismatch (1/0) map uint32_t query_start = 0; uint32_t query_end = 0; uint32_t query_pos = 0; From a9b6fcce253ffca243fd1e9fc18ec1787ac12135 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 15 Dec 2024 16:40:24 -0500 Subject: [PATCH 053/134] Fix alignments error --- include/sv_caller.h | 2 +- src/cnv_caller.cpp | 46 ++------- 
src/contextsv.cpp | 2 - src/main.cpp | 4 - src/sv_caller.cpp | 226 ++++++++++++-------------------------------- 5 files changed, 73 insertions(+), 207 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index abd397d1..f1028b70 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -35,7 +35,7 @@ class SVCaller { int min_mapq = 20; // Minimum mapping quality to be considered const InputData& input_data; - void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data) const; + void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const; void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index ac79d598..b380c150 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -94,9 +94,8 @@ std::tuple CNVCaller::runCopyNumberPrediction return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); } - // Run the Viterbi algorithm on SNPs in the SV region +/- 1/2 - // the SV length - // Only extened the region if "save CNV data" is enabled + // Run the Viterbi algorithm on SNPs in the SV region + // Only extend the region if "save CNV data" is enabled uint32_t snp_start_pos = start_pos; uint32_t snp_end_pos = end_pos; if (this->input_data.getSaveCNVData()) @@ -105,26 +104,18 @@ std::tuple CNVCaller::runCopyNumberPrediction snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; snp_end_pos = end_pos + sv_half_length; } - // uint32_t sv_half_length = (end_pos - start_pos) / 2.0; - // uint32_t snp_start_pos = start_pos > sv_half_length ? 
start_pos - sv_half_length : 1; - // uint32_t snp_end_pos = end_pos + sv_half_length; // Query the SNP region for the SV candidate SNPData snp_data; querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data); - // std::pair snp_call = querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov); - // SNPData& sv_snps = snp_call.first; - // bool sv_snps_found = snp_call.second; // Run the Viterbi algorithm - printMemoryUsage("Before running Viterbi algorithm, "); std::pair, double> prediction; runViterbi(hmm, snp_data, prediction); if (prediction.first.size() == 0) { return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); } - printMemoryUsage("After running Viterbi algorithm, "); std::vector& state_sequence = prediction.first; double likelihood = prediction.second; @@ -213,8 +204,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector sv_states; for (size_t i = 0; i < state_sequence.size(); i++) { @@ -276,7 +264,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 // Calculate the mean chromosome coverage double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map) { - printMemoryUsage("Before calculating mean chromosome coverage, "); { // Lock the bam file std::lock_guard lock(this->bam_file_mtx); @@ -354,7 +339,11 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vectorinput_data.getChromosome() != "") + { + hts_set_threads(bam_file, this->input_data.getThreadCount()); + } // hts_set_threads(bam_file, this->input_data.getThreadCount()); // Read the header @@ -450,7 +439,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) const -{ - printMemoryUsage("Reading SNP allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + 
std::to_string((int)end_pos) + ", "); - +{ // --------- SNP file --------- // Get the SNP file path std::string snp_filepath = this->input_data.getSNPFilepath(); @@ -515,7 +500,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui bcf_srs_t *snp_reader = bcf_sr_init(); if (!snp_reader) { - // throw std::runtime_error("ERROR: Could not initialize SNP reader."); printError("ERROR: Could not initialize SNP reader."); return; } @@ -536,7 +520,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not add SNP file to reader: " + snp_filepath); return; } - printMemoryUsage("After adding SNP file to reader, "); // Get the header bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0); @@ -555,8 +538,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (pfb_filepath.empty()) { use_pfb = false; - // printError("ERROR: Population allele frequency file path is empty."); - // return; + printMessage("WARNING: No population allele frequency file provided for chromosome " + chr); } bcf_srs_t *pfb_reader = bcf_sr_init(); @@ -615,7 +597,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); return; } - printMemoryUsage("After adding population allele frequency file to reader, "); // Get the header bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); @@ -645,7 +626,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Read the SNP data ---------------------------------------------- // Set the region - printMemoryUsage("Before setting region for SNP reader, "); std::string region_str = region_chunks[i]; if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { @@ -653,10 +633,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not 
set region for SNP reader: " + region_str); return; } - printMemoryUsage("After setting region for SNP reader, and before reading SNPs, "); - // std::cout << "Iterating through SNPs in region " << region_str << - // "..." << std::endl; // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp..."); bool snp_found = false; while (bcf_sr_next_line(snp_reader) > 0) @@ -727,8 +704,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } } - printMemoryUsage("After reading SNPs for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); - if (snp_reader->errnum) { printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum))); @@ -740,8 +715,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui continue; } - printMemoryUsage("Before reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); - // Read the population allele frequency data ---------------------- if (use_pfb) { @@ -808,7 +781,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); } } - printMemoryUsage("After reading population allele frequencies for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos) + ", "); } bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 107a6dae..01329c2c 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -19,9 +19,7 @@ ContextSV::ContextSV(InputData& input_data) int ContextSV::run() { - printMemoryUsage("Before creating SV caller, "); SVCaller sv_caller(this->input_data); - printMemoryUsage("After creating SV caller, "); sv_caller.run(); return 0; 
diff --git a/src/main.cpp b/src/main.cpp index c622c34d..bbdb8366 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -22,13 +22,10 @@ void runContextSV(const std::unordered_map& args) } // Set up input data - printMemoryUsage("Before setting up input data, "); InputData input_data; input_data.setLongReadBam(args.at("bam-file")); input_data.setShortReadBam(args.at("bam-file")); - printMemoryUsage("Before reading reference genome, "); input_data.setRefGenome(args.at("ref-file")); - printMemoryUsage("After reading reference genome, "); input_data.setSNPFilepath(args.at("snps-file")); input_data.setOutputDir(args.at("output-dir")); if (args.find("chr") != args.end()) { @@ -61,7 +58,6 @@ void runContextSV(const std::unordered_map& args) if (args.find("debug") != args.end()) { input_data.setVerbose(true); } - printMemoryUsage("After setting up input data, "); // Run ContextSV run(input_data); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 1d966a19..928e11b3 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -41,10 +41,6 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) co void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const { - // std::map primary_map; - // std::map> supplementary_map; - printMemoryUsage("Before detecting SVs from CIGAR strings, "); - // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { @@ -97,12 +93,10 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); - - printMemoryUsage("After detecting SVs from CIGAR strings, "); printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); } -void 
SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data) const +void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary) const { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -112,7 +106,8 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t sam_close(fp_in); throw std::runtime_error("ERROR: failed to initialize BAM record"); } - hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end); + // hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end); + hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end); if (!itr) { bam_destroy1(bam1); hts_idx_destroy(idx); @@ -121,11 +116,52 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t throw std::runtime_error("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end)); } - // Read the alignment - if (readNextAlignment(fp_in, itr, bam1) < 0) { - bam_destroy1(bam1); + // // Read the alignment + // if (readNextAlignment(fp_in, itr, bam1) < 0) { + // bam_destroy1(bam1); + // hts_itr_destroy(itr); + // printError("ERROR: failed to read alignment"); + // return; + // } + + // Find the correct alignment + bool success = false; + std::string fail_str = ""; + // printMessage("Looking for alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? 
"forward" : "reverse")); + while (readNextAlignment(fp_in, itr, bam1) >= 0) { + // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality + if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { + continue; + } + + // Skip if not the correct type of alignment + if (is_primary && (bam1->core.flag & BAM_FSUPPLEMENTARY)) { + continue; + } else if (!is_primary && !(bam1->core.flag & BAM_FSUPPLEMENTARY)) { + continue; + } + + // Check the alignment start and end positions, and strand + if (bam1->core.pos+1 == region.start && bam_endpos(bam1) == region.end && !(bam1->core.flag & BAM_FREVERSE) == region.strand) { + // printMessage("SUCCESS: Found alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " at position: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1))); + success = true; + break; + } else { + // std::string type_str = is_primary ? "primary" : "supplementary"; + // std::string strand_str = region.strand ? "forward" : "reverse"; + // fail_str = "ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1)) + " with type: " + type_str + " and strand: " + strand_str; + // printError(fail_str); + // printError("ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos) + "-" + std::to_string(bam_endpos(bam1))); + continue; + } + } + + // Check if the alignment was found + if (!success) { + printError("ERROR: Failed to find alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? 
"primary" : "supplementary") + " and strand: " + (region.strand ? "forward" : "reverse")); + // printError(fail_str); hts_itr_destroy(itr); - printError("ERROR: failed to read alignment"); + bam_destroy1(bam1); return; } @@ -177,7 +213,18 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t // Compare the two sequences and update the mismatch map for (int j = 0; j < op_len; j++) { if (cmatch_seq_str[j] != cmatch_ref_str[j]) { - match_map[query_pos + j] = MISMATCH; + try { + match_map.at(query_pos + j) = MISMATCH; + } catch (const std::out_of_range& e) { + printError("ERROR: Out of range exception for query position: " + std::to_string(query_pos + j) + " with read length: " + std::to_string(bam1->core.l_qseq) + " and array size: " + std::to_string(match_map.size()) + " for CIGAR operation: " + std::to_string(op) + " with length: " + std::to_string(op_len)); + + // Exit the program + hts_itr_destroy(itr); + bam_destroy1(bam1); + + return; + } + // match_map[query_pos + j] = MISMATCH; } else { match_map[query_pos + j] = MATCH; } @@ -211,8 +258,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map) { - printMemoryUsage("Before detecting SVs from CIGAR strings, "); - // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { @@ -237,68 +282,15 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { continue; } - // const std::string qname = bam_get_qname(bam1); // Query template name // Process the alignment bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, 
pos_depth_map); - // if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { - - // // Get the primary alignment information - // // std::string chr = bamHdr->target_name[bam1->core.tid]; - // // uint32_t start = (uint32_t)bam1->core.pos; - // // uint32_t end = (uint32_t)bam_endpos(bam1); // This is the first position after the alignment - // // end--; // Adjust to the last position of the alignment - // // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - - // // Call SVs directly from the CIGAR string - // // std::tuple, uint32_t, uint32_t> query_info; - // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true, pos_depth_map); - // // std::tuple, int32_t, int32_t> query_info = this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, true); - // // const std::vector& match_map = std::get<0>(query_info); - // // uint32_t query_start = std::get<1>(query_info); - // // uint32_t query_end = std::get<2>(query_info); - - // // Add the primary alignment to the map - // // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); - // // primary_alignments[qname] = alignment; - - // // Add the iterator to the primary map - // // primary_map[qname] = itr; - - // // Process supplementary alignments - // } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { - - // // Get the supplementary alignment information - // // std::string chr = bamHdr->target_name[bam1->core.tid]; - // // uint32_t start = bam1->core.pos; - // // uint32_t end = bam_endpos(bam1); // This is the first position after the alignment - // // end--; // Adjust to the last position of the alignment - // // bool fwd_strand = !(bam1->core.flag & BAM_FREVERSE); - - // // Get CIGAR string information, but don't call SVs - // // std::tuple, int32_t, int32_t> query_info = - // // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false); - // // std::tuple, uint32_t, uint32_t> query_info; - // this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, false, pos_depth_map); - // // const std::vector& match_map = 
std::get<0>(query_info); - // // uint32_t query_start = std::get<1>(query_info); - // // uint32_t query_end = std::get<2>(query_info); - - // // Add the supplementary alignment to the map - // // AlignmentData alignment(chr, start, end, query_start, query_end, match_map, fwd_strand); - // // supplementary_alignments[qname].emplace_back(alignment); - - // // Add the iterator to the supplementary map - // // supplementary_map[qname].push_back(itr); - // } } // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); - - printMemoryUsage("After detecting SVs from CIGAR strings, "); } // double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, @@ -624,14 +616,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls) { - printMemoryUsage("Before opening BAM file, "); // Open the BAM file std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (!fp_in) { throw std::runtime_error("ERROR: failed to open " + bam_filepath); } - printMemoryUsage("After opening BAM file, "); // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); @@ -647,7 +637,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v sam_close(fp_in); throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); } - printMemoryUsage("After loading index, "); // Split the chromosome into chunks for memory efficiency std::vector region_chunks; @@ -677,16 +666,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "..."); } - printMemoryUsage("After splitting chromosome into chunks, "); // Load chromosome data for copy number predictions // std::cout << "Loading 
chromosome data for copy number predictions..." << std::endl; printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); - printMemoryUsage("Before calculating mean chromosome coverage (top), "); std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map); - printMemoryUsage("After calculating mean chromosome coverage (top), "); if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); @@ -758,8 +744,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v void SVCaller::run() { - printMemoryUsage("Before running SV caller, "); - // Get the chromosomes to process + // Get the chromosomes std::vector chromosomes; if (this->input_data.getChromosome() != "") { chromosomes.push_back(this->input_data.getChromosome()); @@ -767,12 +752,10 @@ void SVCaller::run() chromosomes = this->input_data.getRefGenomeChromosomes(); } - printMemoryUsage("After getting chromosomes, "); // Read the HMM from the file std::string hmm_filepath = this->input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); - printMemoryUsage("After reading HMM, "); // Use multi-threading across chromosomes unless a single chromosome is // specified @@ -845,36 +828,17 @@ void SVCaller::run() // Detect SVs from split read alignments void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) { - printMemoryUsage("Before detecting SVs from split reads, "); - - printMessage("Getting split alignments..."); - // std::map primary_map; - // std::map> supp_map; std::unordered_map primary_map; std::unordered_map> supp_map; 
this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); - printMessage("[TEST] Primary map size: " + std::to_string(primary_map.size())); - printMessage("[TEST] Supplementary map size: " + std::to_string(supp_map.size())); - // Find split-read SV evidence int sv_count = 0; uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); for (auto& entry : primary_map) { - // std::string qname = entry.first; const std::string& qname = entry.first; GenomicRegion& primary_region = entry.second; - // AlignmentData primary_alignment = entry.second; - // std::string primary_chr = std::get<0>(primary_alignment); - // uint32_t primary_start = std::get<1>(primary_alignment); - // uint32_t primary_end = std::get<2>(primary_alignment); - - // Get the primary alignment information - // std::string primary_chr = bamHdr->target_name[primary_bam1->core.tid]; - // uint32_t primary_start = (uint32_t) primary_bam1->core.pos; - // uint32_t primary_end = (uint32_t) bam_endpos(primary_bam1) - 1; // Last alignment position - // bool primary_fwd_strand = !(primary_bam1->core.flag & BAM_FREVERSE); // Skip primary alignments that do not have supplementary alignments if (supp_map.find(qname) == supp_map.end()) { @@ -883,51 +847,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Get the read match/mismatch map MismatchData primary_mismatches; - this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches); - // std::vector match_map(primary_region.end - primary_region.start - // + 1, 0); - // this->getMatchMismatchMap(fp_in, idx, bamHdr, primary_region, mismatch_data); - - // std::pair query_info = generateMatchMismatchMap(fp_in, idx, bamHdr, primary_itr, match_map); - - // Find the largest supplementary alignment, and also identify - // inversions - // printMessage("Finding largest supplementary alignment..."); - // AlignmentData largest_supp_region = supp_map[qname][0]; - // hts_itr_t* 
largest_supp_itr = supp_map[qname][0]; + this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true); GenomicRegion largest_supp_region = supp_map[qname][0]; uint32_t largest_supp_length = 0; const std::string& primary_chr = bamHdr->target_name[primary_region.tid]; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { GenomicRegion& supp_region = *it; - // Get the supplementary alignment information - // bam1_t* supp_bam1 = bam_init1(); - // if (!supp_bam1) { - // throw std::runtime_error("ERROR: failed to initialize BAM record"); - // } - // if (sam_itr_next(fp_in, *it, supp_bam1) < 0) { - // bam_destroy1(supp_bam1); - // throw std::runtime_error("ERROR: failed to read alignment"); - // } // Skip if not on the primary chromosome if (primary_region.tid != supp_region.tid) { continue; } - // std::string supp_chr = bamHdr->target_name[supp_bam1->core.tid]; - // uint32_t supp_start = (uint32_t) supp_bam1->core.pos; - // uint32_t supp_end = (uint32_t) bam_endpos(supp_bam1) - 1; // Last alignment position - // uint32_t supp_length = supp_end - supp_start + 1; - - // const auto& supp_chr = std::get<0>(*it); - // if (primary_chr != supp_chr) { - // continue; // Skip supplementary alignments on different chromosomes - // } - // uint32_t supp_start = std::get<1>(*it); - // uint32_t supp_end = std::get<2>(*it); - // uint32_t supp_length = supp_end - supp_start + 1; - // Get the supplementary alignment information uint32_t supp_start = (uint32_t) supp_region.start; uint32_t supp_end = (uint32_t) supp_region.end; @@ -938,9 +869,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // Inversion detection - // bool is_opposite_strand = std::get<6>(primary_alignment) != std::get<6>(*it); - // bool supp_fwd_strand = !(supp_bam1->core.flag & BAM_FREVERSE); - // bool is_opposite_strand = primary_fwd_strand != supp_fwd_strand; bool is_opposite_strand = primary_region.strand != supp_region.strand; 
if (is_opposite_strand) { if (supp_length >= min_cnv_length) { @@ -978,37 +906,17 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Trim overlapping alignments MismatchData supp_mismatches; - this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches); - // uint32_t supp_start = std::get<1>(largest_supp_region); - // uint32_t supp_end = std::get<2>(largest_supp_region); - // bool primary_before_supp = primary_start < supp_start; - // trimOverlappingAlignments(primary_alignment, largest_supp_region); + this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false); trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); - - // Create the SV candidate using both alignments - // supp_start = std::get<1>(largest_supp_region); - // supp_end = std::get<2>(largest_supp_region); - // primary_start = std::get<1>(primary_alignment); - // primary_end = std::get<2>(primary_alignment); bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_region.start < largest_supp_region.start) { // Primary before supp - // boundary_left = primary_start+1; - // boundary_right = std::max(primary_end, supp_end)+1; - // gap_left = primary_end+1; - // gap_right = supp_start+1; - // gap_exists = gap_left < gap_right; boundary_left = primary_region.start + 1; boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1; gap_left = primary_region.end + 1; gap_right = largest_supp_region.start + 1; gap_exists = gap_left < gap_right; } else { - // boundary_left = supp_start+1; - // boundary_right = std::max(primary_end, supp_end)+1; - // gap_left = supp_end+1; - // gap_right = primary_start+1; - // gap_exists = gap_left < gap_right; boundary_left = largest_supp_region.start + 1; boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1; gap_left = largest_supp_region.end + 1; @@ -1026,9 
+934,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - // printMessage("Running copy number prediction on boundary: " + - // primary_chr + ":" + std::to_string(boundary_left) + "-" + - // std::to_string(boundary_right)); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -1046,9 +951,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - // printMessage("Running copy number prediction on gap: " + - // primary_chr + ":" + std::to_string(gap_left) + "-" + - // std::to_string(gap_right)); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; @@ -1075,8 +977,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } } } - - printMemoryUsage("After detecting SVs from split reads, "); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls) From d1756826fff65f900fc9e08ee1b9e5045f587c21 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 16 Dec 2024 11:28:18 -0500 Subject: [PATCH 054/134] work on error handling --- include/sv_caller.h | 2 - src/cnv_caller.cpp | 1 - src/khmm.cpp | 40 +++---- src/sv_caller.cpp | 246 ++++++-------------------------------------- src/sv_object.cpp | 8 +- 5 files changed, 52 insertions(+), 245 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index f1028b70..11b3919f 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -61,8 +61,6 @@ class SVCaller { // sequence double calculateMismatchRate(const MismatchData& mismatch_data) const; - std::pair generateMatchMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, hts_itr_t *itr, std::vector& match_map) const; - void saveToVCF(const std::unordered_map>& 
sv_calls); void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index b380c150..c0811d51 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -89,7 +89,6 @@ std::tuple CNVCaller::runCopyNumberPrediction // Check that the start position is less than the end position if (start_pos >= end_pos) { - // throw std::runtime_error("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); } diff --git a/src/khmm.cpp b/src/khmm.cpp index b5b5bf02..fcc4899d 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -425,7 +425,6 @@ CHMM ReadCHMM(const std::string filename) std::ifstream file(filename); if (!file.is_open()) { - // throw std::runtime_error("Error opening file"); printError("Error opening file"); return CHMM(); } @@ -437,7 +436,6 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (sscanf(line.c_str(), "M=%d", &hmm.M) != 1) { - // throw std::runtime_error("Error reading M"); printError("Error reading M"); return CHMM(); } @@ -446,7 +444,6 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (sscanf(line.c_str(), "N=%d", &hmm.N) != 1) { - // throw std::runtime_error("Error reading N"); printError("Error reading N"); return CHMM(); } @@ -455,14 +452,12 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "A:") { - // throw std::runtime_error("Error reading A"); printError("Error reading A"); return CHMM(); } hmm.A = readMatrix(file, hmm.N, hmm.N); if (hmm.A.size() != (size_t)hmm.N || hmm.A[0].size() != (size_t)hmm.N) { - // throw 
std::runtime_error("Error reading A"); printError("Error reading A"); return CHMM(); } @@ -471,14 +466,12 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "B:") { - // throw std::runtime_error("Error reading B"); printError("Error reading B"); return CHMM(); } hmm.B = readMatrix(file, hmm.N, hmm.M); if (hmm.B.size() != (size_t)hmm.N || hmm.B[0].size() != (size_t)hmm.M) { - // throw std::runtime_error("Error reading B"); printError("Error reading B"); return CHMM(); } @@ -487,14 +480,12 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "pi:") { - // throw std::runtime_error("Error reading pi"); printError("Error reading pi"); return CHMM(); } hmm.pi = readVector(file, hmm.N); if (hmm.pi.size() != (size_t)hmm.N) { - // throw std::runtime_error("Error reading pi"); printError("Error reading pi"); return CHMM(); } @@ -503,14 +494,12 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "B1_mean:") { - // throw std::runtime_error("Error reading B1_mean"); printError("Error reading B1_mean"); return CHMM(); } hmm.B1_mean = readVector(file, hmm.N); if (hmm.B1_mean.size() != (size_t)hmm.N) { - // throw std::runtime_error("Error reading B1_mean"); printError("Error reading B1_mean"); return CHMM(); } @@ -519,14 +508,12 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "B1_sd:") { - // throw std::runtime_error("Error reading B1_sd"); printError("Error reading B1_sd"); return CHMM(); } hmm.B1_sd = readVector(file, hmm.N); if (hmm.B1_sd.size() != (size_t)hmm.N) { - // throw std::runtime_error("Error reading B1_sd"); printError("Error reading B1_sd"); return CHMM(); } @@ -535,7 +522,6 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "B1_uf:") { - // throw std::runtime_error("Error reading B1_uf"); printError("Error reading B1_uf"); return CHMM(); } @@ -543,7 +529,6 @@ CHMM ReadCHMM(const std::string filename) try { 
hmm.B1_uf = std::stod(line); } catch (const std::invalid_argument& e) { - // throw std::runtime_error("Error reading B1_uf"); printError("Error reading B1_uf"); return CHMM(); } @@ -552,37 +537,44 @@ CHMM ReadCHMM(const std::string filename) std::getline(file, line); if (line != "B2_mean:") { - throw std::runtime_error("Error reading B2_mean"); + printError("Error reading B2_mean"); + return CHMM(); } hmm.B2_mean = readVector(file, 5); if (hmm.B2_mean.size() != (size_t)5) { - throw std::runtime_error("Error reading B2_mean"); + printError("Error reading B2_mean"); + return CHMM(); } // Read B2_sd std::getline(file, line); if (line != "B2_sd:") { - throw std::runtime_error("Error reading B2_sd"); + printError("Error reading B2_sd"); + return CHMM(); } hmm.B2_sd = readVector(file, 5); if (hmm.B2_sd.size() != (size_t)5) { - throw std::runtime_error("Error reading B2_sd"); + printError("Error reading B2_sd"); + return CHMM(); + } // Read B2_uf std::getline(file, line); if (line != "B2_uf:") { - throw std::runtime_error("Error reading B2_uf"); + printError("Error reading B2_uf"); + return CHMM(); } std::getline(file, line); try { hmm.B2_uf = std::stod(line); } catch (const std::invalid_argument& e) { - throw std::runtime_error("Error reading B2_uf"); + printError("Error reading B2_uf"); + return CHMM(); } return hmm; @@ -597,7 +589,8 @@ std::vector> readMatrix(std::ifstream &file, int rows, int c { if (!(file >> matrix[i][j])) { - throw std::runtime_error("Error reading matrix"); + printError("Error reading matrix"); + return std::vector>(); } } } @@ -612,7 +605,8 @@ std::vector readVector(std::ifstream &file, int size) { if (!(file >> vector[i])) { - throw std::runtime_error("Error reading vector"); + printError("Error reading vector"); + return std::vector(); } } file.ignore(std::numeric_limits::max(), '\n'); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 928e11b3..df54f4ef 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -47,7 +47,8 @@ void 
SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to initialize BAM record"); + printError("ERROR: failed to initialize BAM record"); + return; } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); if (!itr) { @@ -55,7 +56,8 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to query region " + region); + printError("ERROR: failed to query region " + region); + return; } uint32_t primary_count = 0; @@ -104,7 +106,8 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to initialize BAM record"); + printError("ERROR: failed to initialize BAM record"); + return; } // hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end); hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end); @@ -113,17 +116,10 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end)); + printError("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end)); + return; } - // // Read the alignment - // if (readNextAlignment(fp_in, itr, bam1) < 0) { - // bam_destroy1(bam1); - // hts_itr_destroy(itr); - // printError("ERROR: failed to read alignment"); - // return; - // } - // Find the correct alignment bool success = false; std::string fail_str = ""; @@ -147,11 +143,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, 
hts_idx_t *idx, bam_hdr_t success = true; break; } else { - // std::string type_str = is_primary ? "primary" : "supplementary"; - // std::string strand_str = region.strand ? "forward" : "reverse"; - // fail_str = "ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1)) + " with type: " + type_str + " and strand: " + strand_str; - // printError(fail_str); - // printError("ERROR: Incorrect alignment start and end positions for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + ", Got: " + std::to_string(bam1->core.pos) + "-" + std::to_string(bam_endpos(bam1))); continue; } } @@ -159,7 +150,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t // Check if the alignment was found if (!success) { printError("ERROR: Failed to find alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? 
"forward" : "reverse")); - // printError(fail_str); hts_itr_destroy(itr); bam_destroy1(bam1); return; @@ -207,7 +197,10 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t // Check that the two sequence lengths are equal if (cmatch_seq_str.length() != cmatch_ref_str.length()) { - throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); + printError("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); + hts_itr_destroy(itr); + bam_destroy1(bam1); + return; } // Compare the two sequences and update the mismatch map @@ -238,9 +231,10 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t // https://samtools.github.io/hts-specs/SAMv1.pdf if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { pos += op_len; + } // Update the query position - } else if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { query_pos += op_len; } } @@ -264,7 +258,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to initialize BAM record"); + printError("ERROR: failed to initialize BAM record"); + return; } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); if (!itr) { @@ -272,7 +267,8 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to query region " + region); + printError("ERROR: failed to query region " + region); + return; } // Main loop to process the alignments @@ -293,8 +289,6 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* 
bamHdr, bam_destroy1(bam1); } -// double SVCaller::calculateMismatchRate(const std::vector& mismatch_map, -// int32_t start, int32_t end) double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const { int start = mismatch_data.query_start; @@ -323,115 +317,6 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const return mismatch_rate; } -// { -// start = std::max(start, 0); -// end = std::min(end, (int32_t)mismatch_map.size() - 1); -// int match_count = 0; -// int mismatch_count = 0; -// int MATCH = 1; -// int MISMATCH = -1; -// for (int i = start; i <= end; i++) { -// if (mismatch_map[i] == MATCH) { -// match_count++; -// } else if (mismatch_map[i] == MISMATCH) { -// mismatch_count++; -// } -// } - -// // Avoid division by zero -// if (match_count + mismatch_count == 0) { -// return 0.0; -// } - -// double mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); - -// return mismatch_rate; -// } - -std::pair SVCaller::generateMatchMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, hts_itr_t* itr, std::vector& match_map) const -{ - // Create a read and iterator for the region - bam1_t *bam1 = bam_init1(); - if (!bam1) { - printError("ERROR: failed to initialize BAM record"); - return std::make_pair(0, 0); - } - - // Read the alignment - if (readNextAlignment(fp_in, itr, bam1) < 0) { - bam_destroy1(bam1); - printError("ERROR: failed to read alignment"); - return std::make_pair(0, 0); - } - - // Main loop to process the alignments - std::string chr = bamHdr->target_name[bam1->core.tid]; // Chromosome name - uint32_t pos = (uint32_t)bam1->core.pos; // Leftmost position of the alignment in the reference genome (0-based) - uint32_t query_pos = 0; - uint32_t query_start = 0; - uint32_t query_end = 0; - bool first_op = true; - - // Get the CIGAR string - uint32_t* cigar = bam_get_cigar(bam1); // CIGAR array - int cigar_len = bam1->core.n_cigar; - for (int i = 0; i < cigar_len; i++) { 
- int op = bam_cigar_op(cigar[i]); // CIGAR operation - int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length - - // Update match/mismatch query map - int MATCH = 1; - int MISMATCH = -1; - if (op == BAM_CEQUAL) { - for (int j = 0; j < op_len; j++) { - match_map[query_pos + j] = MATCH; - } - } else if (op == BAM_CDIFF) { - for (int j = 0; j < op_len; j++) { - match_map[query_pos + j] = MISMATCH; - } - } else if (op == BAM_CMATCH) { - // Get the read sequence - uint8_t* seq_ptr = bam_get_seq(bam1); - std::string cmatch_seq_str = ""; - for (int j = 0; j < op_len; j++) { - cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; - } - - // Get the corresponding reference sequence - int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); - std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); - - // Check that the two sequence lengths are equal - if (cmatch_seq_str.length() != cmatch_ref_str.length()) { - throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); - } - - // Compare the two sequences and update the mismatch map - for (int j = 0; j < op_len; j++) { - if (cmatch_seq_str[j] != cmatch_ref_str[j]) { - match_map[query_pos + j] = MISMATCH; - } else { - match_map[query_pos + j] = MATCH; - } - } - } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) { - query_start = query_pos + op_len; - first_op = false; - } - - // Update the query position - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { - query_pos += op_len; - } - } - query_end = query_pos; - - bam_destroy1(bam1); // Clean up the alignment - - return std::make_pair(query_start, query_end); -} void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, 
std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map) { @@ -440,19 +325,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; uint32_t query_pos = 0; - // std::unordered_map query_match_map; // Query position to - // match/mismatch (1/0) map - // std::vector query_match_map(alignment->core.l_qseq, 0); // Query position to match/mismatch (1/0) map // Loop through the CIGAR string, process operations, detect SVs (primary - // only), update clipped base support, calculate sequence identity for - // potential duplications (primary only), and calculate - // the clipped base support and mismatch rate + // only), and calculate sequence identity for potential duplications (primary only) uint32_t ref_pos; uint32_t ref_end; - // uint32_t query_start = 0; // First alignment position in the query - // uint32_t query_end = 0; // Last alignment position in the query - // bool first_op = false; // First alignment operation for the query double default_lh = 0.0; for (int i = 0; i < cigar_len; i++) { @@ -478,8 +355,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec bool is_duplication = false; int ins_ref_pos; uint32_t dup_start = std::max(0, (int)pos - op_len); - // int dup_start = std::max(0, pos - op_len); - // for (int j = pos - op_len; j <= pos; j++) { for (uint32_t j = dup_start; j <= pos; j++) { // Get the string for the window (1-based coordinates) @@ -537,81 +412,19 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); } - - // Check if the CIGAR operation is a clipped base - } else if (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP) { - - // sv_calls.updateClippedBaseSupport(chr, pos); // 
Update clipped base support - - // Update the query alignment start position - // if (!first_op) { - // query_start = query_pos + op_len; - // first_op = true; - // } } - // // Update match/mismatch query map - // int MATCH = 1; - // int MISMATCH = -1; - // if (op == BAM_CEQUAL) { - // for (int j = 0; j < op_len; j++) { - // query_match_map[query_pos + j] = MATCH; - // } - // } else if (op == BAM_CDIFF) { - // for (int j = 0; j < op_len; j++) { - // query_match_map[query_pos + j] = MISMATCH; - // } - // } else if (op == BAM_CMATCH) { - // // Get the read sequence - // uint8_t* seq_ptr = bam_get_seq(alignment); - // std::string cmatch_seq_str = ""; - // for (int j = 0; j < op_len; j++) { - // cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; - // } - - // // Get the corresponding reference sequence - // int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - // // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); - // std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); - - // // Check that the two sequence lengths are equal - // if (cmatch_seq_str.length() != cmatch_ref_str.length()) { - // throw std::runtime_error("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); - // } - - // // Compare the two sequences and update the mismatch map - // for (int j = 0; j < op_len; j++) { - // if (cmatch_seq_str[j] != cmatch_ref_str[j]) { - // query_match_map[query_pos + j] = MISMATCH; - // } else { - // query_match_map[query_pos + j] = MATCH; - // } - // } - // } - - // Update the reference coordinate based on the CIGAR operation + // Update the reference position // https://samtools.github.io/hts-specs/SAMv1.pdf if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { pos += op_len; - } else if (op == BAM_CINS || op == BAM_CSOFT_CLIP 
|| op == BAM_CHARD_CLIP || op == BAM_CPAD) { - // Do nothing - } else { - throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } - - // Update the query position based on the CIGAR operation (M, I, S, H) + + // Update the query position if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { query_pos += op_len; - } else if (op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CHARD_CLIP || op == BAM_CPAD) { - // Do nothing - } else { - throw std::runtime_error("ERROR: Unknown CIGAR operation: " + std::to_string(op)); } } - - // query_end = query_pos; // Last alignment position in the query - - // query_info = std::tuple, uint32_t, uint32_t>(std::move(query_match_map), query_start, query_end); } void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls) @@ -620,14 +433,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v std::string bam_filepath = this->input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (!fp_in) { - throw std::runtime_error("ERROR: failed to open " + bam_filepath); + printError("ERROR: failed to open " + bam_filepath); + return; } // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); if (!bamHdr) { sam_close(fp_in); - throw std::runtime_error("ERROR: failed to read header from " + bam_filepath); + printError("ERROR: failed to read header from " + bam_filepath); + return; } // Load the index @@ -635,7 +450,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v if (!idx) { bam_hdr_destroy(bamHdr); sam_close(fp_in); - throw std::runtime_error("ERROR: failed to load index for " + bam_filepath); + printError("ERROR: failed to load index for " + bam_filepath); + return; } // Split the chromosome into chunks for memory efficiency @@ -651,7 +467,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v 
int region_end = region.second; std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); region_chunks.push_back(chunk); - // std::cout << "Using specified region " << chunk << "..." << std::endl; } else { int chunk_size = std::ceil((double)chr_len / chunk_count); @@ -668,7 +483,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } // Load chromosome data for copy number predictions - // std::cout << "Loading chromosome data for copy number predictions..." << std::endl; printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index @@ -987,8 +801,10 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, uint32_t start, uint32_t end, std: return; } - // Set the alt allele to or if the SV type is DUP or DEL, throw - // an error otherwise + // Set the alt allele to or if the SV type is DUP or DEL if (sv_type == "DUP" && alt_allele == ".") { printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); alt_allele = ""; @@ -32,7 +31,8 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: } if (start >= end) { - throw std::runtime_error("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); + printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); + return; } // Insert the SV call in sorted order @@ -69,7 +69,7 @@ void updateSVType(std::vector& sv_calls, uint32_t start, uint32_t end, s it->genotype = genotype; it->hmm_likelihood = hmm_likelihood; } else { - throw std::runtime_error("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end)); + printError("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end)); } } From 
6b611174fab649c4c0ecf7364d21185447007ebe Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 17 Dec 2024 07:26:55 -0500 Subject: [PATCH 055/134] Fix 1 off breakpoint error --- include/sv_caller.h | 2 -- src/cnv_caller.cpp | 7 ++--- src/fasta_query.cpp | 14 +-------- src/input_data.cpp | 10 +++++- src/sv_caller.cpp | 77 +++++++++++---------------------------------- 5 files changed, 31 insertions(+), 79 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 11b3919f..70be7e33 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -65,8 +65,6 @@ class SVCaller { void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const; - // void trimOverlappingAlignments(uint32_t& primary_start, uint32_t& primary_end, uint32_t& supp_start, uint32_t& supp_end, const std::vector& primary_match_map, const std::vector& supp_match_map); - // Calculate the read depth (INFO/DP) for a region int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c0811d51..20e7189f 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -326,10 +326,8 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map) { { - // Lock the bam file - std::lock_guard lock(this->bam_file_mtx); - // Open the BAM file + std::lock_guard lock(this->bam_file_mtx); // Lock the BAM file std::string bam_filepath = this->input_data.getShortReadBam(); samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) @@ -398,7 +396,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vectorcore.pos + 1; // 0-based to 1-based + uint32_t pos = (uint32_t)bam_record->core.pos + 1; // 0-based to 1-based uint32_t ref_pos = pos; uint32_t cigar_len = 
bam_record->core.n_cigar; uint32_t *cigar = bam_get_cigar(bam_record); @@ -457,7 +455,6 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector(cum_depth) / static_cast(pos_count); } - // return std::make_pair(mean_chr_cov, chr_pos_depth_map); return mean_chr_cov; } diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index 212343f0..9a705a9a 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -104,26 +104,14 @@ std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, u pos_start--; pos_end--; - // Ensure that the start position is not negative, and the end position is - // not larger than the chromosome length - if (pos_start < 0) - { - return ""; - } - // if (pos_end >= (uint32_t)this->chr_to_seq[chr].length()) + // Ensure that the end position is not larger than the chromosome length if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) { return ""; } uint32_t length = pos_end - pos_start + 1; - - // Get the sequence const std::string& sequence = this->chr_to_seq.at(chr); - // const std::string& sequence = this->chr_to_seq[chr]; - - // Get the substring - // std::string subsequence = sequence.substr(pos_start, length); // If the subsequence is empty, return empty string if (sequence.substr(pos_start, length).empty()) diff --git a/src/input_data.cpp b/src/input_data.cpp index 381d5ac5..952329d1 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -306,7 +306,15 @@ std::string InputData::getAlleleFreqFilepath(std::string chr) const { chr = chr.substr(3, chr.size() - 3); } - return this->pfb_filepaths.at(chr); + + try + { + return this->pfb_filepaths.at(chr); + } + catch (const std::out_of_range& e) + { + return ""; + } } void InputData::setThreadCount(int thread_count) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index df54f4ef..61a236b4 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -485,8 +485,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, 
std::v // Load chromosome data for copy number predictions printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->input_data); + printMessage(chr + ": LENGTH: " + std::to_string(chr_len)); std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map); + printMessage(chr + ": POSDEPTH: " + std::to_string(chr_pos_depth_map.size())); if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { hts_idx_destroy(idx); bam_hdr_destroy(bamHdr); @@ -529,7 +531,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // std::cout << "Detecting copy number variants from split reads..." << std::endl; printMessage(chr + ": Split read SVs..."); this->detectSVsFromSplitReads(sub_region, fp_in, idx, bamHdr, subregion_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); - // this->detectSVsFromSplitReads(subregion_sv_calls, primary_map, supp_map, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); // Merge the SV calls from the current region // std::cout << "Merge SV calls from " << sub_region << "..." 
<< std::endl; @@ -689,31 +690,31 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Print error if the start position is greater than the end // position - if (supp_start+1 > supp_end+1) { - printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start+1) + "-" + std::to_string(supp_end+1)); + if (supp_start > supp_end) { + printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); continue; } - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start+1, supp_end+1, mean_chr_cov, pos_depth_map); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map); if (std::get<1>(result) == SVType::UNKNOWN) { continue; } double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); - int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); + int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); if (supp_type == SVType::NEUTRAL) { - addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "HMM", "./.", supp_lh, read_depth); + addSVCall(sv_calls, supp_start, supp_end, "INV", "", "HMM", "./.", supp_lh, read_depth); sv_count++; } else if (supp_type == SVType::DUP) { - addSVCall(sv_calls, supp_start+1, supp_end+1, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); + addSVCall(sv_calls, supp_start, supp_end, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); } } else { // Add the inversion without running copy number predictions // (too small for predictions) - int read_depth = this->calculateReadDepth(pos_depth_map, supp_start+1, supp_end+1); - addSVCall(sv_calls, supp_start+1, supp_end+1, "INV", "", "REV", "./.", 0.0, read_depth); + int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); + addSVCall(sv_calls, supp_start, supp_end, "INV", "", "REV", 
"./.", 0.0, read_depth); } } } @@ -725,16 +726,16 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_region.start < largest_supp_region.start) { // Primary before supp - boundary_left = primary_region.start + 1; - boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1; - gap_left = primary_region.end + 1; - gap_right = largest_supp_region.start + 1; + boundary_left = primary_region.start; + boundary_right = std::max(primary_region.end, largest_supp_region.end); + gap_left = primary_region.end; + gap_right = largest_supp_region.start; gap_exists = gap_left < gap_right; } else { - boundary_left = largest_supp_region.start + 1; - boundary_right = std::max(primary_region.end, largest_supp_region.end) + 1; - gap_left = largest_supp_region.end + 1; - gap_right = primary_region.start + 1; + boundary_left = largest_supp_region.start; + boundary_right = std::max(primary_region.end, largest_supp_region.end); + gap_left = largest_supp_region.end; + gap_right = primary_region.start; gap_exists = gap_left < gap_right; } @@ -966,21 +967,8 @@ void SVCaller::saveToVCF(const std::unordered_map(primary_alignment); - // uint32_t primary_alignment_end = std::get<2>(primary_alignment); - // uint32_t supp_alignment_start = std::get<1>(supp_alignment); - // uint32_t supp_alignment_end = std::get<2>(supp_alignment); - // uint32_t primary_query_start = std::get<3>(primary_alignment); - // uint32_t primary_query_end = std::get<4>(primary_alignment); - // uint32_t supp_query_start = std::get<3>(supp_alignment); - // uint32_t supp_query_end = std::get<4>(supp_alignment); - // const std::vector& primary_match_map = std::get<5>(primary_alignment); - // const std::vector& supp_match_map = std::get<5>(supp_alignment); // Check for overlapping read alignments - // bool primary_before_supp = primary_query_start < supp_query_start; if 
(primary_mismatches.query_start < supp_mismatches.query_start) { // Primary before supplementary in the query @@ -989,74 +977,47 @@ void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, Genom // Calculate the mismatch rates at the overlapping region double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches); double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches); - // uint32_t overlap_length = primary_query_end - supp_query_start + - // 1; hts_pos_t overlap_length = primary_mismatches.query_end - supp_mismatches.query_start + 1; // Trim the ailgnment with the higher mismatch rate if (primary_mismatch_rate > supp_mismatch_rate) { // Trim the end of the primary alignment, ensuring that the new // end is not less than the start - // if (primary_alignment_end > overlap_length && - // (primary_alignment_end - overlap_length) > - // primary_alignment_start) { if (primary_alignment.end > overlap_length && (primary_alignment.end - overlap_length) > primary_alignment.start) { // Trim the end of the primary alignment - // uint32_t new_end = primary_alignment_end - overlap_length; - // std::get<2>(primary_alignment) = new_end; primary_alignment.end = primary_alignment.end - overlap_length; } } else { // Trim the beginning of the supplementary alignment, ensuring // that the new start is not greater than the end - // if (supp_alignment_start + overlap_length < - // supp_alignment_end) { if (supp_alignment.start + overlap_length < supp_alignment.end) { // Trim the beginning of the supplementary alignment - // uint32_t new_start = supp_alignment_start + overlap_length; - // std::get<1>(supp_alignment) = new_start; supp_alignment.start = supp_alignment.start + overlap_length; } } } - // } else if (supp_mismatches.query_end >= primary_mismatches.query_start) { } else { // Supplementary before primary in the query if (primary_mismatches.query_start <= supp_mismatches.query_end) { // Calculate the mismatch rates at the 
overlapping region - // double primary_mismatch_rate = this->calculateMismatchRate(primary_match_map, primary_query_start, supp_query_end); - // double supp_mismatch_rate = - // this->calculateMismatchRate(supp_match_map, primary_query_start, - // supp_query_end); double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches); double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches); - // hts_pos_t overlap_length = supp_query_end - primary_query_start + - // 1; hts_pos_t overlap_length = supp_mismatches.query_end - primary_mismatches.query_start + 1; // Trim the ailgnment with the higher mismatch rate if (supp_mismatch_rate > primary_mismatch_rate) { // Trim the end of the supplementary alignment, ensuring that // the new end is not less than the start - // if (supp_alignment_end > overlap_length && - // (supp_alignment_end - overlap_length) > supp_alignment_start) - // { if (supp_alignment.end > overlap_length && (supp_alignment.end - overlap_length) > supp_alignment.start) { // Trim the end of the supplementary alignment - // uint32_t new_end = supp_alignment_end - overlap_length; - // std::get<2>(supp_alignment) = new_end; supp_alignment.end = supp_alignment.end - overlap_length; } } else { // Trim the beginning of the primary alignment, ensuring that // the new start is not greater than the end - // if (primary_alignment_start + overlap_length < - // primary_alignment_end) { if (primary_alignment.start + overlap_length < primary_alignment.end) { // Trim the beginning of the primary alignment - // uint32_t new_start = primary_alignment_start + overlap_length; - // std::get<1>(primary_alignment) = new_start; primary_alignment.start = primary_alignment.start + overlap_length; } } @@ -1077,7 +1038,7 @@ int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uin // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " 
+ std::to_string(end-start)); read_depth += pos_depth_map.at(end); } catch (const std::out_of_range& e) { - std::cerr << "Warning: End position " << end << " not found in depth map." << std::endl; + std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl; } // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); return read_depth; From b35c1077fd2ee4aa67d84a6ebda04b8f47307ef5 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 19 Dec 2024 10:31:24 -0500 Subject: [PATCH 056/134] thread safe ref genome --- Makefile-cpp | 7 + include/cnv_caller.h | 15 ++- include/contextsv.h | 8 +- include/fasta_query.h | 2 + include/input_data.h | 23 +--- include/sv_caller.h | 21 +-- include/swig_interface.h | 2 +- include/utils.h | 50 +++++++ src/cnv_caller.cpp | 176 +++++++++++-------------- src/contextsv.cpp | 10 +- src/fasta_query.cpp | 7 +- src/input_data.cpp | 35 ++--- src/sv_caller.cpp | 273 +++++++++++++++++++-------------------- src/sv_object.cpp | 15 --- src/swig_interface.cpp | 7 +- 15 files changed, 321 insertions(+), 330 deletions(-) diff --git a/Makefile-cpp b/Makefile-cpp index 55630e9b..3babecb3 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -22,6 +22,13 @@ CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedan LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries LDLIBS := -lhts # Link with libhts.a or libhts.so +# Enable thread sanitizer (TSan) +# ifeq ($(TSAN),1) +# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g +# CXXFLAGS += $(TSAN_FLAGS) +# LDFLAGS += $(TSAN_FLAGS) +# endif + # Sources and Output # SOURCES := $(wildcard $(SRC_DIR)/*.cpp) SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp)) # Filter out the SWIG wrapper from the sources diff --git a/include/cnv_caller.h 
b/include/cnv_caller.h index 12bbce80..0417b8ae 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -45,7 +45,6 @@ struct SNPData { // CNVCaller: Detect CNVs and return the state sequence by SNP position class CNVCaller { private: - const InputData& input_data; mutable std::mutex snp_file_mtx; // SNP file mutex mutable std::mutex pfb_file_mtx; // Population frequency file mutex mutable std::mutex bam_file_mtx; // BAM file mutex @@ -76,26 +75,28 @@ class CNVCaller { void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const; // Query a region for SNPs and return the SNP data - void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const; + void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const; // Split a region into chunks for parallel processing std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const; public: - explicit CNVCaller(const InputData& input_data); + // explicit CNVCaller(const InputData& input_data); + // Constructor with no arguments + CNVCaller() = default; // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) const; + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - void runCIGARCopyNumberPrediction(std::string chr, std::vector& 
sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); + void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; - double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map); + double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const; void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; - void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) const; + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const; // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const; diff --git a/include/contextsv.h b/include/contextsv.h index 97d7bce9..890748da 100644 --- a/include/contextsv.h +++ b/include/contextsv.h @@ -10,14 +10,12 @@ class ContextSV { - private: - InputData& input_data; - public: - explicit ContextSV(InputData& input_data); + // explicit ContextSV(InputData& input_data); + ContextSV() = default; // Entry point - int run(); + int run(const InputData& input_data) const; }; #endif // CONTEXTSV_H diff --git a/include/fasta_query.h b/include/fasta_query.h index b130117a..75259441 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -8,6 +8,7 @@ #include #include #include +#include /// @endcond class 
ReferenceGenome { @@ -15,6 +16,7 @@ class ReferenceGenome { std::string fasta_filepath; std::vector chromosomes; std::unordered_map chr_to_seq; + mutable std::mutex mtx; public: int setFilepath(std::string fasta_filepath); diff --git a/include/input_data.h b/include/input_data.h index 43a9790b..72bca5af 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -36,26 +36,14 @@ class InputData { std::string getHMMFilepath() const; // Set the filepath to the reference genome FASTA file. - void setRefGenome(std::string fasta_filepath); - - // Return a reference to the ReferenceGenome object. - const ReferenceGenome& getRefGenome() const; - std::string queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; - - // Get the chromosomes in the reference genome. - std::vector getRefGenomeChromosomes() const; - - // Get a chromosome's length in the reference genome. - uint32_t getRefGenomeChromosomeLength(std::string chr) const; + void setRefGenome(std::string filepath); + std::string getRefGenome() const; // Set the filepath to the text file containing the locations of the // VCF files with population frequencies for each chromosome. void setAlleleFreqFilepaths(std::string filepath); std::string getAlleleFreqFilepath(std::string chr) const; - // Get the population frequency map. - // PFBMap getPFBMap(); - // Set the filepath to the VCF file with SNP calls used for CNV // detection with the HMM. void setSNPFilepath(std::string filepath); @@ -71,11 +59,12 @@ class InputData { // Set the minimum CNV length to use for copy number predictions. void setMinCNVLength(int min_cnv_length); - int getMinCNVLength() const; + uint32_t getMinCNVLength() const; // Set the chromosome to analyze. void setChromosome(std::string chr); std::string getChromosome() const; + bool isSingleChr() const; // Set the region to analyze. 
void setRegion(std::string region); @@ -106,10 +95,9 @@ class InputData { std::string snp_vcf_filepath; std::string ethnicity; std::unordered_map pfb_filepaths; // Map of population frequency VCF filepaths by chromosome - ReferenceGenome fasta_query; std::string output_dir; int sample_size; - int min_cnv_length; + uint32_t min_cnv_length; std::string chr; // Chromosome to analyze std::pair start_end; // Region to analyze bool region_set; // True if a region is set @@ -118,6 +106,7 @@ class InputData { std::string cnv_filepath; bool verbose; // True if verbose output is enabled bool save_cnv_data; // True if SNP CNV regions should be extended around SV breakpoints, and saved to a TSV file (Large performance hit) + bool single_chr; }; #endif // INPUT_DATA_H diff --git a/include/sv_caller.h b/include/sv_caller.h index 70be7e33..9d967510 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -33,46 +33,47 @@ class SVCaller { private: int min_sv_size = 50; // Minimum SV size to be considered int min_mapq = 20; // Minimum mapping quality to be considered - const InputData& input_data; - void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const; + // void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const; void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const; // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map); + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const 
std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const; - void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls); + void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); - void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map); + void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const; // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const; // Detect SVs from split alignments - void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map); + void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence double calculateMismatchRate(const MismatchData& mismatch_data) const; - void saveToVCF(const std::unordered_map>& sv_calls); + void saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const; void trimOverlappingAlignments(GenomicRegion& 
primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const; // Calculate the read depth (INFO/DP) for a region - int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); + int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) const; public: - explicit SVCaller(InputData& input_data); + // explicit SVCaller(InputData& input_data); + // Constructor with no arguments + SVCaller() = default; // Detect SVs and predict SV type from long read alignments and CNV calls - void run(); + void run(const InputData& input_data); }; #endif // SV_CALLER_H diff --git a/include/swig_interface.h b/include/swig_interface.h index c7f163ae..578f4653 100644 --- a/include/swig_interface.h +++ b/include/swig_interface.h @@ -12,6 +12,6 @@ #include /// @endcond -int run(InputData input_data); +int run(const InputData& input_data); #endif // SWIG_INTERFACE_H diff --git a/include/utils.h b/include/utils.h index 7311efbc..2fb4a3b1 100644 --- a/include/utils.h +++ b/include/utils.h @@ -3,12 +3,62 @@ #ifndef UTILS_H #define UTILS_H +#include +#include + /// @cond #include #include #include /// @endcond + +// Guard to close the BAM file +struct BamFileGuard { + samFile* fp_in; + hts_idx_t* idx; + bam_hdr_t* bamHdr; + + BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr) + : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {} + + ~BamFileGuard() { + if (idx) { + hts_idx_destroy(idx); + } + if (bamHdr) { + bam_hdr_destroy(bamHdr); + } + if (fp_in) { + sam_close(fp_in); + } + } + + BamFileGuard(const BamFileGuard&) = delete; // Non-copyable + BamFileGuard& operator=(const BamFileGuard&) = delete; // Non-assignable +}; + +// Guard to close the BCF file +struct BcfFileGuard { + bcf_srs_t* reader; + bcf_hdr_t* hdr; + + BcfFileGuard(bcf_srs_t* reader, bcf_hdr_t* hdr) + : reader(reader), hdr(hdr) {} + + ~BcfFileGuard() { + if (hdr) { + bcf_hdr_destroy(hdr); + } + if 
(reader) { + bcf_sr_destroy(reader); + } + } + + BcfFileGuard(const BcfFileGuard&) = delete; // Non-copyable + BcfFileGuard& operator=(const BcfFileGuard&) = delete; // Non-assignable +}; + // Print the progress of a task void printProgress(int progress, int total); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 20e7189f..cc30ccae 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -34,10 +34,6 @@ using namespace sv_types; -CNVCaller::CNVCaller(const InputData& input_data) - : input_data(input_data) // Initialize the input data -{ -} // Function to call the Viterbi algorithm for the CHMM void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const @@ -52,25 +48,22 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair& pos_depth_map, double mean_chr_cov, SNPData& snp_data) const +void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const { - // uint32_t window_size = (uint32_t)this->input_data.getWindowSize(); - // Initialize the SNP data with default values and sample size length - int sample_size = this->input_data.getSampleSize(); + int sample_size = input_data.getSampleSize(); int region_length = (int) (end_pos - start_pos + 1); if (region_length < sample_size) { sample_size = region_length; } - // std::set snp_pos(sample_size); std::vector snp_pos(sample_size, 0); std::vector snp_baf(sample_size, -1.0); std::vector snp_pfb(sample_size, 0.5); std::vector snp_log2_cov(sample_size, 0.0); std::vector is_snp(sample_size, false); - this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp); + this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data); // Get the log2 ratio for evenly spaced positions in the // region @@ -84,7 +77,7 @@ void CNVCaller::querySNPRegion(std::string 
chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map) const +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Check that the start position is less than the end position if (start_pos >= end_pos) @@ -97,7 +90,7 @@ std::tuple CNVCaller::runCopyNumberPrediction // Only extend the region if "save CNV data" is enabled uint32_t snp_start_pos = start_pos; uint32_t snp_end_pos = end_pos; - if (this->input_data.getSaveCNVData()) + if (input_data.getSaveCNVData()) { uint32_t sv_half_length = (end_pos - start_pos) / 2.0; snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; @@ -106,7 +99,7 @@ std::tuple CNVCaller::runCopyNumberPrediction // Query the SNP region for the SV candidate SNPData snp_data; - querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data); + querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm std::pair, double> prediction; @@ -160,10 +153,12 @@ std::tuple CNVCaller::runCopyNumberPrediction // Save the SV calls as a TSV file if enabled bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); - if (this->input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000) + // if (save_cnv_data && copy_number_change && (end_pos - start_pos) > 10000) + if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000) { std::string cnv_type_str = getSVTypeString(predicted_cnv_type); - std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + 
std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; + const std::string output_dir = input_data.getOutputDir(); + std::string sv_filename = output_dir + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "..."); this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); } @@ -172,7 +167,7 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector &sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Map with counts for each CNV type std::map cnv_type_counts; @@ -182,7 +177,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorinput_data.getMinCNVLength(); for (auto& sv_call : sv_candidates) { @@ -193,31 +187,23 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector end if (start_pos >= end_pos) { - std::cerr << "Position error for CIGAR SV at " << chr << ":" << start_pos << "-" << end_pos << std::endl; + printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); continue; } // Skip if not the minimum length for CNV predictions - if ((end_pos - start_pos) < (uint32_t) min_length) + if ((end_pos - start_pos) < input_data.getMinCNVLength()) { continue; } // Only extend the region if "save CNV data" is enabled - uint32_t snp_start_pos = start_pos; - uint32_t snp_end_pos = end_pos; - if (this->input_data.getSaveCNVData()) - { - uint32_t sv_half_length = (end_pos - start_pos) / 2.0; - snp_start_pos 
= start_pos > sv_half_length ? start_pos - sv_half_length : 1; - snp_end_pos = end_pos + sv_half_length; - } SNPData snp_data; - this->querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data); + this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm if (snp_data.pos.size() == 0) { - std::cerr << "ERROR: No windows for SV " << chr << ":" << start_pos << "-" << end_pos << " (" << snp_start_pos << "," << snp_end_pos << std::endl; + printError("ERROR: No SNP data found for Viterbi algorithm for CIGAR SV at " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); continue; } @@ -260,13 +246,14 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorinput_data.getSaveCNVData() && updated_sv_type != SVType::UNKNOWN && (end_pos - start_pos) > 10000) - { - // Add the state sequence to the SNP data (avoid copying the data) - snp_data.state_sequence = std::move(state_sequence); - - // Save the SV calls as a TSV file - std::string cnv_type_str = getSVTypeString(updated_sv_type); - std::string sv_filename = this->input_data.getOutputDir() + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_CIGAR.tsv"; - printMessage("Saving SV CIGAR copy number predictions to " + sv_filename); - this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); - } } } @@ -323,12 +296,11 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 } // Calculate the mean chromosome coverage -double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map) +double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const { { // Open the BAM file std::lock_guard lock(this->bam_file_mtx); // Lock the 
BAM file - std::string bam_filepath = this->input_data.getShortReadBam(); samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) { @@ -337,11 +309,10 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vectorinput_data.getChromosome() != "") + if (single_chr) { - hts_set_threads(bam_file, this->input_data.getThreadCount()); + hts_set_threads(bam_file, thread_count); } - // hts_set_threads(bam_file, this->input_data.getThreadCount()); // Read the header bam_hdr_t *bam_header = sam_hdr_read(bam_file); @@ -361,14 +332,12 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp) const -{ +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const +{ // --------- SNP file --------- - // Get the SNP file path - std::string snp_filepath = this->input_data.getSNPFilepath(); + const std::string snp_filepath = input_data.getSNPFilepath(); if (snp_filepath.empty()) { printError("ERROR: SNP file path is empty."); @@ -502,10 +464,10 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui snp_reader->require_index = 1; // Set multi-threading if running on a single chromosome - if (this->input_data.getChromosome() != "") + int thread_count = input_data.getThreadCount(); + if (input_data.isSingleChr()) { - int thread_count = this->input_data.getThreadCount() - 1; // Leave one thread for the main thread - printMessage("Setting SNP reader threads to " + std::to_string(thread_count / 2)); + printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2))); bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2)); } @@ -518,35 +480,38 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } // Get the 
header - bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0); - if (!snp_header) - { - bcf_sr_destroy(snp_reader); - printError("ERROR: Could not get header for SNP reader."); - return; - } + // bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0); + // if (!snp_header) + // { + // bcf_sr_destroy(snp_reader); + // printError("ERROR: Could not get header for SNP reader."); + // return; + // } + // BcfFileGuard snp_guard(snp_reader, snp_header); // Guard to close the SNP file // --------- Population allele frequency file --------- // Get the population allele frequency file path bool use_pfb = true; - std::string pfb_filepath = this->input_data.getAlleleFreqFilepath(chr); + const std::string pfb_filepath = input_data.getAlleleFreqFilepath(chr); if (pfb_filepath.empty()) { use_pfb = false; - printMessage("WARNING: No population allele frequency file provided for chromosome " + chr); + // printMessage("WARNING: No population allele frequency file provided for chromosome " + chr); } bcf_srs_t *pfb_reader = bcf_sr_init(); std::string chr_gnomad; std::string AF_key; + // BcfFileGuard pfb_guard(nullptr, nullptr); // Guard to close the population allele frequency file if (use_pfb) { // Determine the ethnicity-specific allele frequency key AF_key = "AF"; - if (this->input_data.getEthnicity() != "") + const std::string eth = input_data.getEthnicity(); + if (eth != "") { - AF_key += "_" + this->input_data.getEthnicity(); + AF_key += "_" + eth; } // Check if the filepath uses the 'chr' prefix notations based on the @@ -571,38 +536,43 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Initialize the population allele frequency reader if (!pfb_reader) { - bcf_sr_destroy(snp_reader); printError("ERROR: Could not initialize population allele frequency reader."); + + // Clean up + // bcf_hdr_destroy(snp_header); + bcf_sr_destroy(snp_reader); return; } pfb_reader->require_index = 1; // Set multi-threading if running on a single chromosome 
- if (this->input_data.getChromosome() != "") + if (input_data.isSingleChr()) { - int thread_count = this->input_data.getThreadCount() - 1; // Leave one thread for the main thread - printMessage("Setting population allele frequency reader threads to " + std::to_string(thread_count / 2)); + printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2))); bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); } // Add the population allele frequency file to the reader if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { - bcf_sr_destroy(snp_reader); - bcf_sr_destroy(pfb_reader); printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); - return; - } - // Get the header - bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); - if (!pfb_header) - { - bcf_sr_destroy(snp_reader); + // Clean up bcf_sr_destroy(pfb_reader); - printError("ERROR: Could not get header for population allele frequency reader."); + // bcf_hdr_destroy(snp_header); + bcf_sr_destroy(snp_reader); return; } + + // Get the header + // bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); + // if (!pfb_header) + // { + // bcf_sr_destroy(pfb_reader); + // printError("ERROR: Could not get header for population allele frequency reader."); + // return; + // } + // pfb_guard = BcfFileGuard(pfb_reader, pfb_header); // Guard to close the population allele frequency file } // Split the region into samples @@ -617,7 +587,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { current_region++; // Lock during reading - // std::lock_guard lock(this->snp_file_mtx); + std::lock_guard lock(this->snp_file_mtx); // Read the SNP data ---------------------------------------------- @@ -625,9 +595,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui std::string region_str = region_chunks[i]; if (bcf_sr_set_regions(snp_reader, 
region_str.c_str(), 0) < 0) { - bcf_sr_destroy(snp_reader); printError("ERROR: Could not set region for SNP reader: " + region_str); - return; + break; } // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp..."); @@ -659,7 +628,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui int32_t *dp = 0; // int dp_values[2]; int dp_count = 0; - int dp_ret = bcf_get_format_int32(snp_header, snp_record, "DP", &dp, &dp_count); + // int dp_ret = bcf_get_format_int32(snp_header, snp_record, + // "DP", &dp, &dp_count); + int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count); if (dp_ret < 0 || dp[0] <= 10) { continue; @@ -667,7 +638,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui free(dp); // Skip if the SNP does not pass the filter - if (bcf_has_filter(snp_header, snp_record, const_cast("PASS")) != 1) + // if (bcf_has_filter(snp_header, snp_record, + // const_cast("PASS")) != 1) + if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast("PASS")) != 1) { continue; } @@ -676,7 +649,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui int32_t *ad = 0; // int ad_values[2]; int ad_count = 0; - int ad_ret = bcf_get_format_int32(snp_header, snp_record, "AD", &ad, &ad_count); + // int ad_ret = bcf_get_format_int32(snp_header, snp_record, + // "AD", &ad, &ad_count); + int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count); // int ad_ret = bcf_get_format_int32(snp_header, snp_record, // "AD", &ad, &ad_count); if (ad_ret < 0 || ad_count < 2) @@ -719,10 +694,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + 
std::to_string(target_snp_pos); if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) { - bcf_sr_destroy(snp_reader); - bcf_sr_destroy(pfb_reader); printError("ERROR: Could not set region for population allele frequency reader: " + region_str); - return; + break; } // Find the SNP position in the population allele frequency file @@ -732,9 +705,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { continue; } - // pfb_record = bcf_sr_get_line(pfb_reader, 0); bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); - // Do something with the record if (pfb_record) { // Skip if not a SNP @@ -778,6 +749,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } } } + + // Clean up + // bcf_hdr_destroy(snp_header); bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); } diff --git a/src/contextsv.cpp b/src/contextsv.cpp index 01329c2c..3957fe95 100644 --- a/src/contextsv.cpp +++ b/src/contextsv.cpp @@ -12,15 +12,11 @@ #include "utils.h" /// @endcond -ContextSV::ContextSV(InputData& input_data) - : input_data(input_data) // Initialize the input data -{ -} -int ContextSV::run() +int ContextSV::run(const InputData& input_data) const { - SVCaller sv_caller(this->input_data); - sv_caller.run(); + SVCaller sv_caller; + sv_caller.run(input_data); return 0; } diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index 9a705a9a..3cde27d3 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -99,7 +99,9 @@ std::string ReferenceGenome::getFilepath() const // Function to get the reference sequence at a given position range std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const -{ +{ + std::lock_guard lock(this->mtx); + // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) pos_start--; pos_end--; @@ -125,6 +127,7 @@ std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, u // Function to get the chromosome 
contig lengths in VCF header format std::string ReferenceGenome::getContigHeader() const { + std::lock_guard lock(this->mtx); std::string contig_header = ""; // Sort the chromosomes @@ -151,10 +154,12 @@ std::string ReferenceGenome::getContigHeader() const std::vector ReferenceGenome::getChromosomes() const { + std::lock_guard lock(this->mtx); return this->chromosomes; } uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const { + std::lock_guard lock(this->mtx); return this->chr_to_seq.at(chr).length(); } diff --git a/src/input_data.cpp b/src/input_data.cpp index 952329d1..a24efb9e 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -30,6 +30,7 @@ InputData::InputData() this->hmm_filepath = "data/wgs.hmm"; this->verbose = false; this->save_cnv_data = false; + this->single_chr = false; } std::string InputData::getShortReadBam() const @@ -84,30 +85,14 @@ void InputData::setLongReadBam(std::string filepath) } } -void InputData::setRefGenome(std::string fasta_filepath) +void InputData::setRefGenome(std::string filepath) { - // Set the reference genome - this->fasta_query.setFilepath(fasta_filepath); + this->ref_filepath = filepath; } -const ReferenceGenome& InputData::getRefGenome() const +std::string InputData::getRefGenome() const { - return this->fasta_query; -} - -std::string InputData::queryRefGenome(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const -{ - return this->fasta_query.query(chr, pos_start, pos_end); -} - -std::vector InputData::getRefGenomeChromosomes() const -{ - return this->fasta_query.getChromosomes(); -} - -uint32_t InputData::getRefGenomeChromosomeLength(std::string chr) const -{ - return this->fasta_query.getChromosomeLength(chr); + return this->ref_filepath; } std::string InputData::getOutputDir() const @@ -154,19 +139,20 @@ void InputData::setEthnicity(std::string ethnicity) this->ethnicity = ethnicity; } -int InputData::getMinCNVLength() const +uint32_t InputData::getMinCNVLength() const { return 
this->min_cnv_length; } void InputData::setMinCNVLength(int min_cnv_length) { - this->min_cnv_length = min_cnv_length; + this->min_cnv_length = (uint32_t) min_cnv_length; } void InputData::setChromosome(std::string chr) { this->chr = chr; + this->single_chr = true; } std::string InputData::getChromosome() const @@ -174,6 +160,11 @@ std::string InputData::getChromosome() const return this->chr; } +bool InputData::isSingleChr() const +{ + return this->single_chr; +} + void InputData::setRegion(std::string region) { // Check if the region is valid diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 61a236b4..6fafaf31 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -24,14 +24,11 @@ #include "utils.h" #include "sv_types.h" #include "version.h" +#include "fasta_query.h" /// @endcond # define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection -SVCaller::SVCaller(InputData &input_data) - : input_data(input_data) // Initialize the input data -{ -} int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const { @@ -44,18 +41,12 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); printError("ERROR: failed to initialize BAM record"); return; } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); if (!itr) { bam_destroy1(bam1); - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); printError("ERROR: failed to query region " + region); return; } @@ -98,6 +89,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); } +/* void SVCaller::getAlignmentMismatchMap(samFile 
*fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary) const { // Create a read and iterator for the region @@ -192,7 +184,6 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t // Get the corresponding reference sequence int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - // printMessage("Checking window for match: " + chr + ":" + std::to_string(cmatch_pos) + "-" + std::to_string(cmatch_pos + op_len - 1)); std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); // Check that the two sequence lengths are equal @@ -249,24 +240,19 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t mismatch_data.query_end = query_end; mismatch_data.match_map = std::move(match_map); } +*/ -void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map) +void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); printError("ERROR: failed to initialize BAM record"); return; } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); if (!itr) { bam_destroy1(bam1); - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); printError("ERROR: failed to query region " + region); return; } @@ -281,7 +267,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Process the alignment bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map); + this->detectSVsFromCIGAR(bamHdr, bam1, 
sv_calls, primary, pos_depth_map, ref_genome); } // Clean up the iterator and alignment @@ -318,7 +304,7 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const return mismatch_rate; } -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map) +void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) @@ -359,7 +345,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // Get the string for the window (1-based coordinates) ins_ref_pos = j + 1; - std::string window_str = this->input_data.queryRefGenome(chr, ins_ref_pos, ins_ref_pos + op_len - 1); + // std::string window_str = + // this->input_data.queryRefGenome(chr, ins_ref_pos, + // ins_ref_pos + op_len - 1); + std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1); // Continue if the window string is empty (out-of-range) if (window_str == "") { @@ -392,11 +381,24 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // Add to SV calls (1-based) with the appropriate SV type ref_pos = pos+1; - ref_end = ref_pos + op_len -1; - int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); + + // For insertions, the reference end position is the same as the + // reference position + // For duplications, the reference end position is the same as + // the reference position plus the length of the insertion + ref_end = ref_pos + op_len - 1; if (is_duplication) { + // ref_end = std::min(ref_pos + op_len - 1, + // ref_genome.getChromosomeLength(chr)); + uint32_t bp1 = ref_pos; + 
uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr)); + int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); } else { + // ref_end = ref_pos; + uint32_t bp1 = std::max(1, (int)ref_pos - 1); + uint32_t bp2 = ref_pos; + int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); } } @@ -409,6 +411,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec { ref_pos = pos+1; ref_end = ref_pos + op_len -1; + // printMessage("Test2"); int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); } @@ -427,10 +430,13 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } } -void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls) +void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome) { + int filter_threshold = 4; // Minimum number of supporting reads for an SV call + bool single_chr = input_data.getChromosome() != ""; + // Open the BAM file - std::string bam_filepath = this->input_data.getLongReadBam(); + std::string bam_filepath = input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); if (!fp_in) { printError("ERROR: failed to open " + bam_filepath); @@ -453,130 +459,91 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printError("ERROR: failed to load index for " + bam_filepath); return; } + BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file - // Split the chromosome into chunks for memory efficiency - std::vector 
region_chunks; - //int chunk_count = 100; - int chunk_count = 1; - uint32_t chr_len = this->input_data.getRefGenomeChromosomeLength(chr); - if (this->input_data.isRegionSet()) { + // Set the region to process + std::string region = chr; + uint32_t chr_len = ref_genome.getChromosomeLength(chr); + if (input_data.isRegionSet()) { // Use one chunk for the specified region - std::pair region = this->input_data.getRegion(); - int region_start = region.first; - int region_end = region.second; - std::string chunk = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); - region_chunks.push_back(chunk); + std::pair region_data = input_data.getRegion(); + int region_start = region_data.first; + int region_end = region_data.second; + region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); - } else { - int chunk_size = std::ceil((double)chr_len / chunk_count); - for (int i = 0; i < chunk_count; i++) { - int start = i * chunk_size + 1; // 1-based - int end = start + chunk_size; - if (i == chunk_count - 1) { - end = chr_len; - } - std::string chunk = chr + ":" + std::to_string(start) + "-" + std::to_string(end); - region_chunks.push_back(chunk); - } - printMessage("Split chromosome " + chr + " into " + std::to_string(region_chunks.size()) + " chunks of size " + std::to_string(chunk_size) + "..."); } // Load chromosome data for copy number predictions printMessage(chr + ": Loading chromosome data..."); - CNVCaller cnv_caller(this->input_data); - printMessage(chr + ": LENGTH: " + std::to_string(chr_len)); + CNVCaller cnv_caller; std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index - double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map); - printMessage(chr + ": POSDEPTH: " + std::to_string(chr_pos_depth_map.size())); + int thread_count = input_data.getThreadCount(); + double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count, 
single_chr); if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); return; } - // Process each chunk one at a time - // std::cout << "Processing " << region_chunks.size() << " region(s) for chromosome " << chr << "..." << std::endl; - int region_count = region_chunks.size(); - int current_region = 0; - int filter_threshold = 4; - for (const auto& sub_region : region_chunks) { - current_region++; - - // Detect SVs from the CIGAR strings - printMessage(chr + ": CIGAR SVs..."); - std::vector subregion_sv_calls; - this->detectCIGARSVs(fp_in, idx, bamHdr, sub_region, subregion_sv_calls, chr_pos_depth_map); - - // std::set& subregion_sv_calls = std::get<0>(region_data); - // PrimaryMap& primary_map = std::get<1>(region_data); - // SuppMap& supp_map = std::get<2>(region_data); - // std::cout << " CIGAR SV calls from " << sub_region << "..." << std::endl; - printMessage(chr + ": Merging CIGAR..."); - filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); - mergeSVs(subregion_sv_calls); - int region_sv_count = getSVCount(subregion_sv_calls); - // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); - - // Run copy number variant predictions on the SVs detected from the - // CIGAR string, using a minimum CNV length threshold - if (region_sv_count > 0) { - // std::cout << "Running copy number variant detection from CIGAR string SVs..." << std::endl; - printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, subregion_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map); - } - - // Run split-read SV and copy number variant predictions - // std::cout << "Detecting copy number variants from split reads..." 
<< std::endl; - printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(sub_region, fp_in, idx, bamHdr, subregion_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map); - - // Merge the SV calls from the current region - // std::cout << "Merge SV calls from " << sub_region << "..." << std::endl; - printMessage(chr + ": Merging split reads..."); - filterSVsWithLowSupport(subregion_sv_calls, filter_threshold); - mergeSVs(subregion_sv_calls); + // Detect SVs from the CIGAR strings + printMessage(chr + ": CIGAR SVs..."); + this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); + + /* + printMessage(chr + ": Merging CIGAR..."); + filterSVsWithLowSupport(chr_sv_calls, filter_threshold); + mergeSVs(chr_sv_calls); + int region_sv_count = getSVCount(chr_sv_calls); + // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + + // Run copy number variant predictions on the SVs detected from the + // CIGAR string, using a minimum CNV length threshold + if (region_sv_count > 0) { + printMessage(chr + ": CIGAR predictions..."); + cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + } - // Combine the SV calls from the current region - // std::cout << "Combining SV calls from " << sub_region << "..." 
<< std::endl; - printMessage(chr + ": Concatenating calls..."); - concatenateSVCalls(combined_sv_calls, subregion_sv_calls); + // Run split-read SV and copy number variant predictions + printMessage(chr + ": Split read SVs..."); + this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - printMessage("Completed " + std::to_string(current_region) + " of " + std::to_string(region_count) + " region(s) for chromosome " + chr + "..."); - } + // Merge the SV calls from the current region + printMessage(chr + ": Merging split reads..."); + filterSVsWithLowSupport(chr_sv_calls, filter_threshold); + mergeSVs(chr_sv_calls); // Run a final merge on the combined SV calls printMessage(chr + ": Merging final calls..."); - mergeSVs(combined_sv_calls); - // filterSVsWithLowSupport(combined_sv_calls, filter_threshold); - - // Clean up the BAM file, header, and index - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); + mergeSVs(chr_sv_calls); + */ + printMessage("Completed chromosome " + chr); } -void SVCaller::run() +void SVCaller::run(const InputData& input_data) { + // Set up the reference genome + printMessage("Loading the reference genome..."); + const std::string ref_filepath = input_data.getRefGenome(); + ReferenceGenome ref_genome; + ref_genome.setFilepath(ref_filepath); + // Get the chromosomes std::vector chromosomes; - if (this->input_data.getChromosome() != "") { - chromosomes.push_back(this->input_data.getChromosome()); + if (input_data.isSingleChr()) { + chromosomes.push_back(input_data.getChromosome()); } else { - chromosomes = this->input_data.getRefGenomeChromosomes(); + chromosomes = ref_genome.getChromosomes(); } // Read the HMM from the file - std::string hmm_filepath = this->input_data.getHMMFilepath(); + std::string hmm_filepath = input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = 
ReadCHMM(hmm_filepath.c_str()); // Use multi-threading across chromosomes unless a single chromosome is // specified int max_threads = 1; - if (this->input_data.getChromosome() == "") { - max_threads = this->input_data.getThreadCount(); + if (!input_data.isSingleChr()) { + max_threads = input_data.getThreadCount(); std::cout << "Using " << max_threads << " threads for processing..." << std::endl; } ThreadPool pool(max_threads); @@ -589,7 +556,8 @@ void SVCaller::run() auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; - this->processChromosome(chr, hmm, sv_calls); + InputData chr_input_data = input_data; // Use a thread-local copy + this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome); { std::lock_guard lock(sv_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); @@ -636,22 +604,27 @@ void SVCaller::run() // Save to VCF std::cout << "Saving SVs to VCF..." << std::endl; - this->saveToVCF(whole_genome_sv_calls); + const std::string output_dir = input_data.getOutputDir(); + this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome); } // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map) +void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { - printMessage("Getting split alignments..."); + printMessage(region + ": Getting split alignments..."); std::unordered_map primary_map; std::unordered_map> supp_map; this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); // Find split-read SV evidence + printMessage(region + ": Finding split-read SVs..."); int 
sv_count = 0; - uint32_t min_cnv_length = (uint32_t) this->input_data.getMinCNVLength(); + int current_primary = 0; + int primary_count = primary_map.size(); + uint32_t min_cnv_length = input_data.getMinCNVLength(); for (auto& entry : primary_map) { + current_primary++; const std::string& qname = entry.first; GenomicRegion& primary_region = entry.second; @@ -661,10 +634,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // Get the read match/mismatch map - MismatchData primary_mismatches; - this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true); + // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); + // MismatchData primary_mismatches; + // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true); GenomicRegion largest_supp_region = supp_map[qname][0]; uint32_t largest_supp_length = 0; + + printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); const std::string& primary_chr = bamHdr->target_name[primary_region.tid]; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { GenomicRegion& supp_region = *it; @@ -695,13 +671,15 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map); + printMessage(region + ": Running copy number prediction for inversion..."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(result) == SVType::UNKNOWN) { continue; } double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); + // printMessage("Test3"); 
int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); if (supp_type == SVType::NEUTRAL) { addSVCall(sv_calls, supp_start, supp_end, "INV", "", "HMM", "./.", supp_lh, read_depth); @@ -713,6 +691,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } else { // Add the inversion without running copy number predictions // (too small for predictions) + // printMessage("Test4"); int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); addSVCall(sv_calls, supp_start, supp_end, "INV", "", "REV", "./.", 0.0, read_depth); } @@ -720,9 +699,12 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // Trim overlapping alignments - MismatchData supp_mismatches; - this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false); - trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); + // MismatchData supp_mismatches; + // printMessage(region + ": Getting mismatch map for supplementary alignments..."); + // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false); + + // printMessage(region + ": Trimming overlapping alignments..."); + // trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_region.start < largest_supp_region.start) { // Primary before supp @@ -749,7 +731,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map); + printMessage(region + ": Running copy number prediction for boundary..."); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, 
input_data); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; } @@ -766,7 +749,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map); + printMessage(region + ": Running copy number prediction for gap..."); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; } @@ -775,17 +759,20 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { + // printMessage("Test5"); int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call + // printMessage("Test6"); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call + // printMessage("Test7"); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); @@ -794,11 +781,10 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } } -void SVCaller::saveToVCF(const std::unordered_map>& sv_calls) +void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const { std::cout << "Creating VCF writer..." << std::endl; - // std::string output_vcf = output_dir + "/output.vcf"; - std::string output_vcf = this->input_data.getOutputDir() + "/output.vcf"; + std::string output_vcf = output_dir + "/output.vcf"; std::cout << "Writing VCF file to " << output_vcf << std::endl; std::ofstream vcf_stream(output_vcf); if (!vcf_stream.is_open()) { @@ -810,7 +796,7 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.getRefGenome().getFilepath(); + std::string ref_fp = ref_genome.getFilepath(); std::cout << "Reference genome filepath: " << ref_fp << std::endl; } catch (const std::exception& e) { std::cerr << "Error: " << e.what() << std::endl; @@ -819,9 +805,9 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.getRefGenome().getContigHeader(); + const std::string contig_header = ref_genome.getContigHeader(); std::vector header_lines = { - std::string("##reference=") + this->input_data.getRefGenome().getFilepath(), + std::string("##reference=") + ref_genome.getFilepath(), contig_header, "##INFO=", "##INFO=", @@ -901,7 +887,9 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.queryRefGenome(chr, preceding_pos, end); + // ref_allele = this->input_data.queryRefGenome(chr, + // preceding_pos, end); + ref_allele = ref_genome.query(chr, preceding_pos, end); // Use the preceding base as the alternate allele if (ref_allele != "") { @@ -919,7 +907,9 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.queryRefGenome(chr, preceding_pos, 
preceding_pos); + // ref_allele = this->input_data.queryRefGenome(chr, + // preceding_pos, preceding_pos); + ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); // Update the start position to the preceding base start = preceding_pos; @@ -1025,20 +1015,23 @@ void SVCaller::trimOverlappingAlignments(GenomicRegion& primary_alignment, Genom } } -int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) +int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) const { int read_depth = 0; try { // printMessage("Read depth at start: " + std::to_string(pos_depth_map.at(start)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); read_depth += pos_depth_map.at(start); } catch (const std::out_of_range& e) { - std::cerr << "Warning: Start position " << start << " not found in depth map." << std::endl; + // std::cerr << "Warning: Start position " << start << " not found in + // depth map." << std::endl; + printError("Error: Start position " + std::to_string(start) + " not found in depth map."); } try { // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); read_depth += pos_depth_map.at(end); } catch (const std::out_of_range& e) { - std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl; + printError("Error: End position " + std::to_string(end) + " not found in depth map."); + // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." 
<< std::endl; } // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); return read_depth; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 7ef70223..1318f8d3 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -148,23 +148,8 @@ void mergeSVs(std::vector& sv_calls) void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) { - // int prev_size = sv_calls.size(); - // Filter SV calls with low read support sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { return sv_call.support < min_support; }), sv_calls.end()); - - // // Print read depth for each SV call - // for (const auto& sv_call : sv_calls) { - // std::cout << "SV call: " << sv_call.start << "-" << sv_call.end << " with depth " << sv_call.read_depth << " and length " << (sv_call.end - sv_call.start) << std::endl; - // } - - // // Remove SV calls with low read depth - // sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_depth](const SVCall& sv_call) { - // return sv_call.read_depth < min_depth; - // }), sv_calls.end()); - - // int updated_size = sv_calls.size(); - // printMessage("Filtered " + std::to_string(prev_size) + " SV calls to " + std::to_string(updated_size) + " SV calls with DP >= " + std::to_string(min_depth)); } diff --git a/src/swig_interface.cpp b/src/swig_interface.cpp index 8d2e7a42..76eb2151 100644 --- a/src/swig_interface.cpp +++ b/src/swig_interface.cpp @@ -7,14 +7,13 @@ // Run the CLI with the given parameters -int run(InputData input_data) +int run(const InputData& input_data) { - // Run ContextSV - ContextSV contextsv(input_data); + ContextSV contextsv; try { - contextsv.run(); + contextsv.run(input_data); } catch (std::exception& e) From 11180be5563900a8d321e9f497794ee25d1dab37 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 20 Dec 2024 11:17:13 -0500 Subject: [PATCH 
057/134] Fix snp thread locks --- include/cnv_caller.h | 8 ++++---- include/sv_caller.h | 5 ++--- src/cnv_caller.cpp | 40 ++++++++++++---------------------------- src/sv_caller.cpp | 30 ++++++++++++++---------------- 4 files changed, 32 insertions(+), 51 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 0417b8ae..ab7bb147 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -75,7 +75,7 @@ class CNVCaller { void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const; // Query a region for SNPs and return the SNP data - void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const; + void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; // Split a region into chunks for parallel processing std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const; @@ -87,16 +87,16 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& 
hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; + void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const; void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; - void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const; + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const; diff --git a/include/sv_caller.h b/include/sv_caller.h index 9d967510..2b035198 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -42,7 +42,7 @@ class SVCaller { // mismatch rate, and the start and end positions of the query sequence void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const; - void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome); + void processChromosome(const 
std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. @@ -53,7 +53,7 @@ class SVCaller { int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const; // Detect SVs from split alignments - void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; + void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query @@ -68,7 +68,6 @@ class SVCaller { int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) const; public: - // explicit SVCaller(InputData& input_data); // Constructor with no arguments SVCaller() = default; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index cc30ccae..569717b4 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -48,7 +48,7 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const +void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const { // Initialize the SNP data with default 
values and sample size length int sample_size = input_data.getSampleSize(); @@ -63,7 +63,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::vector snp_pfb(sample_size, 0.5); std::vector snp_log2_cov(sample_size, 0.0); std::vector is_snp(sample_size, false); - this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data); + this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data, snp_mutex, pfb_mutex); // Get the log2 ratio for evenly spaced positions in the // region @@ -77,7 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const { // Check that the start position is less than the end position if (start_pos >= end_pos) @@ -99,7 +99,7 @@ std::tuple CNVCaller::runCopyNumberPrediction // Query the SNP region for the SV candidate SNPData snp_data; - querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); + querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex); // Run the Viterbi algorithm std::pair, double> prediction; @@ -167,7 +167,7 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const +void 
CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const { // Map with counts for each CNV type std::map cnv_type_counts; @@ -199,7 +199,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorquerySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); + this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex); // Run the Viterbi algorithm if (snp_data.pos.size() == 0) { @@ -444,7 +444,7 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i } } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const { // --------- SNP file --------- const std::string snp_filepath = input_data.getSNPFilepath(); @@ -479,16 +479,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui return; } - // Get the header - // bcf_hdr_t *snp_header = bcf_sr_get_header(snp_reader, 0); - // if (!snp_header) - // { - // bcf_sr_destroy(snp_reader); - // printError("ERROR: Could not get header for SNP reader."); - // return; - // } - // BcfFileGuard snp_guard(snp_reader, snp_header); // Guard to close the SNP file - // --------- Population allele frequency file --------- // Get the population allele frequency file path @@ -563,16 +553,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui 
bcf_sr_destroy(snp_reader); return; } - - // Get the header - // bcf_hdr_t *pfb_header = bcf_sr_get_header(pfb_reader, 0); - // if (!pfb_header) - // { - // bcf_sr_destroy(pfb_reader); - // printError("ERROR: Could not get header for population allele frequency reader."); - // return; - // } - // pfb_guard = BcfFileGuard(pfb_reader, pfb_header); // Guard to close the population allele frequency file } // Split the region into samples @@ -587,7 +567,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { current_region++; // Lock during reading - std::lock_guard lock(this->snp_file_mtx); + // std::lock_guard lock(this->snp_file_mtx); + std::lock_guard lock(snp_mutex); // Read the SNP data ---------------------------------------------- @@ -689,6 +670,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Read the population allele frequency data ---------------------- if (use_pfb) { + // Lock during reading + std::lock_guard lock(pfb_mutex); + // Set the region as the SNP position uint32_t target_snp_pos = snp_pos[i]; // Already 1-based std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 6fafaf31..d057f89c 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -388,14 +388,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // the reference position plus the length of the insertion ref_end = ref_pos + op_len - 1; if (is_duplication) { - // ref_end = std::min(ref_pos + op_len - 1, - // ref_genome.getChromosomeLength(chr)); uint32_t bp1 = ref_pos; uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr)); int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); } else { - // ref_end = ref_pos; uint32_t bp1 = 
std::max(1, (int)ref_pos - 1); uint32_t bp2 = ref_pos; int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); @@ -430,7 +427,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } } -void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome) +void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex) { int filter_threshold = 4; // Minimum number of supporting reads for an SV call bool single_chr = input_data.getChromosome() != ""; @@ -488,23 +485,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": CIGAR SVs..."); this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); - /* printMessage(chr + ": Merging CIGAR..."); filterSVsWithLowSupport(chr_sv_calls, filter_threshold); mergeSVs(chr_sv_calls); int region_sv_count = getSVCount(chr_sv_calls); - // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + // Testing on HG002 whole genome // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold if (region_sv_count > 0) { printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex); } // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, 
cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex); // Merge the SV calls from the current region printMessage(chr + ": Merging split reads..."); @@ -514,7 +511,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Run a final merge on the combined SV calls printMessage(chr + ": Merging final calls..."); mergeSVs(chr_sv_calls); - */ printMessage("Completed chromosome " + chr); } @@ -551,18 +547,20 @@ void SVCaller::run(const InputData& input_data) // Shared resources std::unordered_map> whole_genome_sv_calls; std::mutex sv_mutex; + std::mutex snp_mutex; + std::mutex pfb_mutex; // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome); + this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, snp_mutex, pfb_mutex); { std::lock_guard lock(sv_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); } - printMessage("Completed chromosome " + chr); + // printMessage("Completed chromosome " + chr); } catch (const std::exception& e) { printError("Error processing chromosome " + chr + ": " + e.what()); } catch (...) 
{ @@ -610,7 +608,7 @@ void SVCaller::run(const InputData& input_data) // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const +void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const { printMessage(region + ": Getting split alignments..."); std::unordered_map primary_map; @@ -671,8 +669,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - printMessage(region + ": Running copy number prediction for inversion..."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); + printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); if (std::get<1>(result) == SVType::UNKNOWN) { continue; } @@ -732,7 +730,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } printMessage(region + ": Running copy number prediction for boundary..."); - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, 
pfb_mutex); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; } @@ -750,7 +748,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } printMessage(region + ": Running copy number prediction for gap..."); - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; } From d884e80f11af627261460428d4812e021b2bb71b Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 21 Dec 2024 10:43:52 -0500 Subject: [PATCH 058/134] Split read update --- src/sv_caller.cpp | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index d057f89c..b6794b16 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -83,6 +83,18 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam num_alignments++; } + // Remove primary alignments without supplementary alignments + std::vector to_remove; + for (const auto& entry : primary_map) { + const std::string& qname = entry.first; + if (supp_map.find(qname) == supp_map.end()) { + to_remove.push_back(qname); + } + } + for (const std::string& qname : to_remove) { + primary_map.erase(qname); + } + // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); @@ -627,9 +639,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in GenomicRegion& primary_region = entry.second; // Skip primary alignments that do not have supplementary alignments - if (supp_map.find(qname) == supp_map.end()) { - continue; - } + // if (supp_map.find(qname) == supp_map.end()) { + // continue; + // } // Get the read match/mismatch map // printMessage(region + ": Getting 
mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); @@ -638,7 +650,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in GenomicRegion largest_supp_region = supp_map[qname][0]; uint32_t largest_supp_length = 0; - printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); + // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); const std::string& primary_chr = bamHdr->target_name[primary_region.tid]; for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { GenomicRegion& supp_region = *it; @@ -669,7 +681,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); + // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); if (std::get<1>(result) == SVType::UNKNOWN) { continue; @@ -686,13 +698,14 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } else if (supp_type == SVType::DUP) { addSVCall(sv_calls, supp_start, supp_end, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); } - } else { - // Add the inversion without running copy number predictions - // (too small for predictions) - // printMessage("Test4"); - int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); - addSVCall(sv_calls, supp_start, supp_end, "INV", 
"", "REV", "./.", 0.0, read_depth); } + // } else { + // // Add the inversion without running copy number predictions + // // (too small for predictions) + // // printMessage("Test4"); + // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); + // addSVCall(sv_calls, supp_start, supp_end, "INV", "", "REV", "./.", 0.0, read_depth); + // } } } @@ -729,7 +742,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - printMessage(region + ": Running copy number prediction for boundary..."); + // printMessage(region + ": Running copy number prediction for boundary..."); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -747,7 +760,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - printMessage(region + ": Running copy number prediction for gap..."); + // printMessage(region + ": Running copy number prediction for gap..."); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; From 5d6a53c0c9bb8cf5652b4144fae83b17def0ebd1 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 29 Dec 2024 13:04:31 -0500 Subject: [PATCH 059/134] Use single mutex and update dup seqsim threshold --- include/cnv_caller.h | 18 +-- include/fasta_query.h | 5 +- include/sv_caller.h | 22 +-- include/sv_object.h | 4 +- src/cnv_caller.cpp | 24 ++-- src/fasta_query.cpp | 8 +- src/sv_caller.cpp | 309 +++++++++++++++++++++++++++--------------- src/sv_object.cpp | 16 +-- 8 files changed, 248 insertions(+), 158 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index ab7bb147..bed2a347 100644 --- a/include/cnv_caller.h +++ 
b/include/cnv_caller.h @@ -45,9 +45,10 @@ struct SNPData { // CNVCaller: Detect CNVs and return the state sequence by SNP position class CNVCaller { private: - mutable std::mutex snp_file_mtx; // SNP file mutex - mutable std::mutex pfb_file_mtx; // Population frequency file mutex - mutable std::mutex bam_file_mtx; // BAM file mutex + //mutable std::mutex snp_file_mtx; // SNP file mutex + //mutable std::mutex pfb_file_mtx; // Population frequency file mutex + //mutable std::mutex bam_file_mtx; // BAM file mutex + std::mutex& shared_mutex; // Define a map of CNV genotypes by HMM predicted state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. @@ -75,7 +76,7 @@ class CNVCaller { void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const; // Query a region for SNPs and return the SNP data - void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; + void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const; // Split a region into chunks for parallel processing std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const; @@ -83,20 +84,21 @@ class CNVCaller { public: // explicit CNVCaller(const InputData& input_data); // Constructor with no arguments - CNVCaller() = default; + //CNVCaller() = default; + CNVCaller(std::mutex& mtx) : shared_mutex(mtx) {} // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& 
input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings - void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; + void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const; void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; - void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const; // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const; diff --git a/include/fasta_query.h b/include/fasta_query.h index 75259441..b3cee253 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -16,9 +16,12 
@@ class ReferenceGenome { std::string fasta_filepath; std::vector chromosomes; std::unordered_map chr_to_seq; - mutable std::mutex mtx; + //mutable std::mutex mtx; + std::mutex& shared_mutex; public: + ReferenceGenome(std::mutex& mtx) : shared_mutex(mtx) {} + int setFilepath(std::string fasta_filepath); std::string getFilepath() const; std::string query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; diff --git a/include/sv_caller.h b/include/sv_caller.h index 2b035198..18c602c5 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -31,41 +31,41 @@ struct MismatchData { class SVCaller { private: - int min_sv_size = 50; // Minimum SV size to be considered int min_mapq = 20; // Minimum mapping quality to be considered + std::mutex shared_mutex; - // void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary) const; + void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary, const ReferenceGenome& ref_genome); - void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const; + void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map); // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const; + void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); - void 
processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex); + void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. // RegionData detectSVsFromRegion(std::string region); - void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const; + void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); // Read the next alignment from the BAM file in a thread-safe manner - int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const; + int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const; + void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence - double calculateMismatchRate(const MismatchData& 
mismatch_data) const; + double calculateMismatchRate(const MismatchData& mismatch_data); void saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const; - void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches) const; + void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches); // Calculate the read depth (INFO/DP) for a region - int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) const; + int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); public: // Constructor with no arguments diff --git a/include/sv_object.h b/include/sv_object.h index e36e8624..3fedee73 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -24,11 +24,11 @@ struct SVCall { bool operator<(const SVCall& other) const; // Constructor with parameters for all fields - SVCall(uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : + SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {} }; -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, 
std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); void mergeSVs(std::vector& sv_calls); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 569717b4..a1a93adf 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -48,7 +48,7 @@ void CNVCaller::runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const +void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const { // Initialize the SNP data with default values and sample size length int sample_size = input_data.getSampleSize(); @@ -63,7 +63,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::vector snp_pfb(sample_size, 0.5); std::vector snp_log2_cov(sample_size, 0.0); std::vector is_snp(sample_size, false); - this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data, snp_mutex, pfb_mutex); + this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data); // Get the log2 ratio for evenly spaced positions in the // region @@ -77,7 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Check that the start position is less than the end position if 
(start_pos >= end_pos) @@ -99,7 +99,7 @@ std::tuple CNVCaller::runCopyNumberPrediction // Query the SNP region for the SV candidate SNPData snp_data; - querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex); + querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm std::pair, double> prediction; @@ -124,7 +124,8 @@ std::tuple CNVCaller::runCopyNumberPrediction // Determine if there is a majority state within the SV region and if it // is greater than 75% - double pct_threshold = 0.75; + //double pct_threshold = 0.75; + double pct_threshold = 0.90; int max_state = 0; int max_count = 0; @@ -167,7 +168,7 @@ std::tuple CNVCaller::runCopyNumberPrediction } -void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const +void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Map with counts for each CNV type std::map cnv_type_counts; @@ -199,7 +200,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorquerySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data, snp_mutex, pfb_mutex); + this->querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm if (snp_data.pos.size() == 0) { @@ -300,7 +301,7 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector lock(this->bam_file_mtx); // Lock the BAM file + std::lock_guard lock(this->shared_mutex); // Lock the BAM file samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) { @@ -444,7 +445,7 @@ void 
CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i } } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const { // --------- SNP file --------- const std::string snp_filepath = input_data.getSNPFilepath(); @@ -567,8 +568,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { current_region++; // Lock during reading - // std::lock_guard lock(this->snp_file_mtx); - std::lock_guard lock(snp_mutex); + std::lock_guard lock(this->shared_mutex); // Read the SNP data ---------------------------------------------- @@ -671,7 +671,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (use_pfb) { // Lock during reading - std::lock_guard lock(pfb_mutex); + //std::lock_guard lock(this->shared_mutex); // Set the region as the SNP position uint32_t target_snp_pos = snp_pos[i]; // Already 1-based diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index 3cde27d3..0f2ce105 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -100,7 +100,7 @@ std::string ReferenceGenome::getFilepath() const // Function to get the reference sequence at a given position range std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const { - std::lock_guard lock(this->mtx); + std::lock_guard lock(this->shared_mutex); // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) pos_start--; @@ -127,7 +127,7 @@ std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, u // Function to get the chromosome 
contig lengths in VCF header format std::string ReferenceGenome::getContigHeader() const { - std::lock_guard lock(this->mtx); + std::lock_guard lock(this->shared_mutex); std::string contig_header = ""; // Sort the chromosomes @@ -154,12 +154,12 @@ std::string ReferenceGenome::getContigHeader() const std::vector ReferenceGenome::getChromosomes() const { - std::lock_guard lock(this->mtx); + std::lock_guard lock(this->shared_mutex); return this->chromosomes; } uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const { - std::lock_guard lock(this->mtx); + std::lock_guard lock(this->shared_mutex); return this->chr_to_seq.at(chr).length(); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b6794b16..18359b8a 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -27,16 +27,18 @@ #include "fasta_query.h" /// @endcond -# define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection +# define DUP_SEQSIM_THRESHOLD 0.99 // Sequence similarity threshold for duplication detection +//std::mutex bam_mutex; -int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) const +int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) { + std::lock_guard lock(this->shared_mutex); int ret = sam_itr_next(fp_in, itr, bam1); return ret; } -void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) const +void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -98,31 +100,33 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); - printMessage("Processed " + 
std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); + printMessage(region + ": Processed " + std::to_string(primary_map.size()) + " primary alignments with " + std::to_string(supplementary_count) + " supplementary alignments"); + // printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); } -/* -void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary) const + +void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary, const ReferenceGenome& ref_genome) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); printError("ERROR: failed to initialize BAM record"); return; } - // hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start, region.end); + + + //bam_mutex.lock(); + this->shared_mutex.lock(); hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end); if (!itr) { + this->shared_mutex.unlock(); bam_destroy1(bam1); - hts_idx_destroy(idx); - bam_hdr_destroy(bamHdr); - sam_close(fp_in); printError("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end)); return; } + this->shared_mutex.unlock(); + //bam_mutex.unlock(); + // Find the correct alignment bool success = false; @@ -196,7 +200,7 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t // Get the corresponding reference sequence int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - 
std::string cmatch_ref_str = this->input_data.queryRefGenome(chr, cmatch_pos, cmatch_pos + op_len - 1); + std::string cmatch_ref_str = ref_genome.query(chr, cmatch_pos, cmatch_pos + op_len - 1); // Check that the two sequence lengths are equal if (cmatch_seq_str.length() != cmatch_ref_str.length()) { @@ -252,9 +256,9 @@ void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t mismatch_data.query_end = query_end; mismatch_data.match_map = std::move(match_map); } -*/ -void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const + +void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -287,7 +291,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, bam_destroy1(bam1); } -double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const +double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) { int start = mismatch_data.query_start; int end = mismatch_data.query_end; @@ -316,7 +320,7 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) const return mismatch_rate; } -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) const +void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the 
reference genome (0-based) @@ -329,101 +333,177 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec uint32_t ref_pos; uint32_t ref_end; double default_lh = 0.0; + // List of ambiguous bases + const std::string amb_bases = "RYKMSWBDHV"; for (int i = 0; i < cigar_len; i++) { int op = bam_cigar_op(cigar[i]); // CIGAR operation int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length + + if (op_len == 0) { + printError("Warning: Encountered CIGAR operation with length 0 at position " + std::to_string(pos+1) + " in chromosome " + chr); + continue; + } // Process the CIGAR operation if (op == BAM_CINS && is_primary) { - if (op_len >= this->min_sv_size) { - // Get the sequence of the insertion from the query - std::string ins_seq_str(op_len, ' '); - for (int j = 0; j < op_len; j++) { - ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; + // Get the sequence of the insertion from the query + std::string ins_seq_str(op_len, ' '); + for (int j = 0; j < op_len; j++) { + // Replace ambiguous bases with N + char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; + if (amb_bases.find(base) != std::string::npos) { + ins_seq_str[j] = 'N'; + } else { + ins_seq_str[j] = base; } + // Get the sequence character from the query + // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; + } - // To determine whether the insertion is a duplication, check - // for sequence identity between the insertion and the - // reference genome (duplications are typically >= 90%): - // Loop through the reference sequence and calculate the - // sequence identity +/- insertion length from the insertion - // position. 
- bool is_duplication = false; - int ins_ref_pos; - uint32_t dup_start = std::max(0, (int)pos - op_len); - for (uint32_t j = dup_start; j <= pos; j++) { - - // Get the string for the window (1-based coordinates) - ins_ref_pos = j + 1; - // std::string window_str = - // this->input_data.queryRefGenome(chr, ins_ref_pos, - // ins_ref_pos + op_len - 1); - std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1); - - // Continue if the window string is empty (out-of-range) - if (window_str == "") { + // To determine whether the insertion is a duplication, check + // for sequence identity between the insertion and the + // reference genome (duplications are typically >= 90%): + // Loop through the reference sequence and calculate the + // sequence identity +/- insertion length from the insertion + // position. + // bool is_duplication = false; + // int ins_ref_pos; + // uint32_t dup_start = std::max(0, (int)pos - op_len); + // for (uint32_t j = dup_start; j <= pos; j++) { + + // // Get the string for the window (1-based coordinates) + // ins_ref_pos = j + 1; + // std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1); + + // // Continue if the window string is empty (out-of-range) + // if (window_str == "") { + // continue; + // } + + // // Calculate the sequence identity + // int num_matches = 0; + // for (int k = 0; k < op_len; k++) { + // if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') { + // num_matches++; + // } + // } + // float seq_identity = (float)num_matches / (float)op_len; + + // // Check if the target sequence identity is reached + // if (seq_identity >= DUP_SEQSIM_THRESHOLD) { + // is_duplication = true; + // break; + // } + // } + + // Calculate the sequence identity at the insertion position +/- + // length + // Before the insertion + if (pos >= (uint32_t)op_len-1) + { + uint32_t bp1 = pos - (op_len - 1); + uint32_t bp2 = pos; + const std::string& window_str 
= ref_genome.query(chr, bp1 + 1, bp2 + 1); + if (window_str.length() > 0) + { + int num_matches = 0; + for (int k = 0; k < op_len; k++) + { + if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') + { + num_matches++; + } + } + float seq_identity = (float)num_matches / (float)op_len; + if (seq_identity >= DUP_SEQSIM_THRESHOLD) + { + uint32_t dup_bp1 = bp1 + 1; + uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); + int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2); + addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "LSEQSIM", "./.", default_lh, read_depth); + + // Continue to the next CIGAR operation continue; } + } + } - // Calculate the sequence identity + // After the insertion + if (pos + op_len < ref_genome.getChromosomeLength(chr)) + { + uint32_t bp1 = pos + 1; + uint32_t bp2 = bp1 + op_len - 1; + const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1); + if (window_str.length() > 0) + { int num_matches = 0; - for (int k = 0; k < op_len; k++) { - if (ins_seq_str[k] == window_str[k]) { + for (int k = 0; k < op_len; k++) + { + if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') + { num_matches++; } } float seq_identity = (float)num_matches / (float)op_len; - - // Check if the target sequence identity is reached - if (seq_identity >= DUP_SEQSIM_THRESHOLD) { - is_duplication = true; - break; + if (seq_identity >= DUP_SEQSIM_THRESHOLD) + { + uint32_t dup_bp1 = bp1 + 1; + uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); + int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2); + addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "RSEQSIM", "./.", default_lh, read_depth); + + // Continue to the next CIGAR operation + continue; } } - - // Determine whether to use a symbolic allele (>50bp) or the - // actual sequence - if (op_len > 50) { - ins_seq_str = 
""; - } else { - ins_seq_str = ins_seq_str; - } - - // Add to SV calls (1-based) with the appropriate SV type - ref_pos = pos+1; - - // For insertions, the reference end position is the same as the - // reference position - // For duplications, the reference end position is the same as - // the reference position plus the length of the insertion - ref_end = ref_pos + op_len - 1; - if (is_duplication) { - uint32_t bp1 = ref_pos; - uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr)); - int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); - } else { - uint32_t bp1 = std::max(1, (int)ref_pos - 1); - uint32_t bp2 = ref_pos; - int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); - } } + // Add as an insertion + // For read depth calculation, use the previous and current + // positions (1-based) + int read_depth = this->calculateReadDepth(pos_depth_map, std::max(1, (int)pos), pos + 1); + uint32_t ins_pos = pos + 1; + uint32_t ins_end = ins_pos + op_len - 1; + addSVCall(sv_calls, ins_pos, ins_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); + + // Determine whether to use a symbolic allele (>50bp) or the + // actual sequence + // if (op_len > 50) { + // ins_seq_str = ""; + // } else { + // ins_seq_str = ins_seq_str; + // } + + // Add to SV calls (1-based) with the appropriate SV type + // ref_pos = pos+1; + + // // For insertions, the reference end position is the same as the + // // reference position + // // For duplications, the reference end position is the same as + // // the reference position plus the length of the insertion + // ref_end = ref_pos + op_len - 1; + // if (is_duplication) { + // uint32_t bp1 = ref_pos; + // uint32_t bp2 = std::min(ref_pos + op_len - 1, 
ref_genome.getChromosomeLength(chr)); + // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + // addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); + // } else { + // uint32_t bp1 = std::max(1, (int)ref_pos - 1); + // uint32_t bp2 = ref_pos; + // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + // addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); + // } + // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { - // Add the SV if greater than the minimum SV size - if (op_len >= this->min_sv_size) - { - ref_pos = pos+1; - ref_end = ref_pos + op_len -1; - // printMessage("Test2"); - int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); - } + ref_pos = pos+1; + ref_end = ref_pos + op_len -1; + // printMessage("Test2"); + int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); + addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); } // Update the reference position @@ -439,9 +519,11 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } } -void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, std::mutex& snp_mutex, std::mutex& pfb_mutex) +void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome) { - int filter_threshold = 4; // Minimum number of supporting reads for an SV call + // int filter_threshold = 4; // Minimum number of supporting reads for an + // SV call + int filter_threshold = 10; // Minimum number of supporting reads for an SV call bool single_chr = 
input_data.getChromosome() != ""; // Open the BAM file @@ -485,7 +567,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Load chromosome data for copy number predictions printMessage(chr + ": Loading chromosome data..."); - CNVCaller cnv_caller; + CNVCaller cnv_caller(this->shared_mutex); std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index int thread_count = input_data.getThreadCount(); double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count, single_chr); @@ -508,12 +590,12 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // CIGAR string, using a minimum CNV length threshold if (region_sv_count > 0) { printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex); + cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); } // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, snp_mutex, pfb_mutex); + this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, ref_genome); // Merge the SV calls from the current region printMessage(chr + ": Merging split reads..."); @@ -531,7 +613,7 @@ void SVCaller::run(const InputData& input_data) // Set up the reference genome printMessage("Loading the reference genome..."); const std::string ref_filepath = input_data.getRefGenome(); - ReferenceGenome ref_genome; + ReferenceGenome ref_genome(this->shared_mutex); ref_genome.setFilepath(ref_filepath); // Get the chromosomes @@ -558,18 +640,19 @@ void SVCaller::run(const InputData& input_data) // Shared resources 
std::unordered_map> whole_genome_sv_calls; - std::mutex sv_mutex; - std::mutex snp_mutex; - std::mutex pfb_mutex; + //std::mutex sv_mutex; + //std::mutex snp_mutex; + //std::mutex pfb_mutex; // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, snp_mutex, pfb_mutex); + this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome); { - std::lock_guard lock(sv_mutex); + //std::lock_guard lock(sv_mutex); + std::lock_guard lock(this->shared_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); } // printMessage("Completed chromosome " + chr); @@ -620,7 +703,7 @@ void SVCaller::run(const InputData& input_data) // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, std::mutex& snp_mutex, std::mutex& pfb_mutex) const +void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome) { printMessage(region + ": Getting split alignments..."); std::unordered_map primary_map; @@ -631,7 +714,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in printMessage(region + ": Finding split-read SVs..."); int sv_count = 0; int current_primary = 0; - int primary_count = primary_map.size(); + //int primary_count = primary_map.size(); uint32_t min_cnv_length = input_data.getMinCNVLength(); for (auto& entry : primary_map) { current_primary++; @@ -645,8 +728,10 @@ 
void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Get the read match/mismatch map // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); - // MismatchData primary_mismatches; - // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true); + //MismatchData primary_mismatches; + //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true, ref_genome); + + // Find the largest supplementary alignment GenomicRegion largest_supp_region = supp_map[qname][0]; uint32_t largest_supp_length = 0; @@ -682,7 +767,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(result) == SVType::UNKNOWN) { continue; } @@ -710,12 +795,12 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // Trim overlapping alignments - // MismatchData supp_mismatches; - // printMessage(region + ": Getting mismatch map for supplementary alignments..."); - // this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false); + //MismatchData supp_mismatches; + //printMessage(region + ": Getting mismatch map for supplementary alignments..."); + //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false, ref_genome); // printMessage(region + ": Trimming overlapping alignments..."); - // 
trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); + //trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; if (primary_region.start < largest_supp_region.start) { // Primary before supp @@ -743,7 +828,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // printMessage(region + ": Running copy number prediction for boundary..."); - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; } @@ -761,7 +846,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // printMessage(region + ": Running copy number prediction for gap..."); - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data, snp_mutex, pfb_mutex); + std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; } @@ -966,7 +1051,7 @@ void SVCaller::saveToVCF(const std::unordered_map& pos_depth_map, uint32_t start, uint32_t end) const +int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) { int read_depth = 0; try { diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 1318f8d3..dd896717 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -14,7 +14,7 @@ bool SVCall::operator<(const SVCall & other) const return start < other.start || (start == other.start && end < 
other.end); } -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) { // Ignore unknown SV types if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { @@ -22,13 +22,13 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: } // Set the alt allele to or if the SV type is DUP or DEL - if (sv_type == "DUP" && alt_allele == ".") { - printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); - alt_allele = ""; - } else if (sv_type == "DEL" && alt_allele == ".") { - printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); - alt_allele = ""; - } + // if (sv_type == "DUP" && alt_allele == ".") { + // printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); + // alt_allele = ""; + // } else if (sv_type == "DEL" && alt_allele == ".") { + // printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); + // alt_allele = ""; + // } if (start >= end) { printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); From 917a68c975e2b61d8c620dddf1799f9c3f7cc7cb Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 29 Dec 2024 13:20:15 -0500 Subject: [PATCH 060/134] update thresholds --- src/cnv_caller.cpp | 5 +++-- src/sv_caller.cpp | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index a1a93adf..44505994 
100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -124,8 +124,9 @@ std::tuple CNVCaller::runCopyNumberPrediction // Determine if there is a majority state within the SV region and if it // is greater than 75% - //double pct_threshold = 0.75; - double pct_threshold = 0.90; + double pct_threshold = 0.75; + // double pct_threshold = 0.90; + // double pct_threshold = 0.80; int max_state = 0; int max_count = 0; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 18359b8a..34200b34 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -27,7 +27,7 @@ #include "fasta_query.h" /// @endcond -# define DUP_SEQSIM_THRESHOLD 0.99 // Sequence similarity threshold for duplication detection +# define DUP_SEQSIM_THRESHOLD 0.90 // Sequence similarity threshold for duplication detection //std::mutex bam_mutex; From 5083aa21594e0fd176311bc787967f62f099d545 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 23 Jan 2025 17:36:31 -0500 Subject: [PATCH 061/134] Improve multi-threading and add min-reads parameter --- include/input_data.h | 5 + include/utils.h | 21 --- src/cnv_caller.cpp | 45 +++--- src/input_data.cpp | 22 ++- src/main.cpp | 6 + src/sv_caller.cpp | 329 +++++++++++++++++++++++-------------------- src/sv_object.cpp | 2 +- 7 files changed, 229 insertions(+), 201 deletions(-) diff --git a/include/input_data.h b/include/input_data.h index 72bca5af..3960d362 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -61,6 +61,10 @@ class InputData { void setMinCNVLength(int min_cnv_length); uint32_t getMinCNVLength() const; + // Set the minimum number of reads supporting an SV for filtering steps. + void setMinReadSupport(int min_reads); + int getMinReadSupport() const; + // Set the chromosome to analyze. 
void setChromosome(std::string chr); std::string getChromosome() const; @@ -98,6 +102,7 @@ class InputData { std::string output_dir; int sample_size; uint32_t min_cnv_length; + int min_reads; std::string chr; // Chromosome to analyze std::pair start_end; // Region to analyze bool region_set; // True if a region is set diff --git a/include/utils.h b/include/utils.h index 2fb4a3b1..6715b00e 100644 --- a/include/utils.h +++ b/include/utils.h @@ -38,27 +38,6 @@ struct BamFileGuard { BamFileGuard& operator=(const BamFileGuard&) = delete; // Non-assignable }; -// Guard to close the BCF file -struct BcfFileGuard { - bcf_srs_t* reader; - bcf_hdr_t* hdr; - - BcfFileGuard(bcf_srs_t* reader, bcf_hdr_t* hdr) - : reader(reader), hdr(hdr) {} - - ~BcfFileGuard() { - if (hdr) { - bcf_hdr_destroy(hdr); - } - if (reader) { - bcf_sr_destroy(reader); - } - } - - BcfFileGuard(const BcfFileGuard&) = delete; // Non-copyable - BcfFileGuard& operator=(const BcfFileGuard&) = delete; // Non-assignable -}; - // Print the progress of a task void printProgress(int progress, int total); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 44505994..a622325d 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -80,7 +80,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Check that the start position is less than the end position - if (start_pos >= end_pos) + if (start_pos > end_pos) { printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); @@ -187,7 +187,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector end - if (start_pos >= end_pos) + if (start_pos > 
end_pos) { printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); continue; @@ -310,11 +310,12 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vectorrequire_index = 1; - // Set multi-threading if running on a single chromosome + // Use multi-threading. This is possible here due to the lock int thread_count = input_data.getThreadCount(); - if (input_data.isSingleChr()) - { - printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2))); - bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2)); - } + bcf_sr_set_threads(snp_reader, thread_count); + // if (input_data.isSingleChr()) + // { + // printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2))); + // bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2)); + // } // Add the SNP file to the reader if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) @@ -495,7 +497,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui bcf_srs_t *pfb_reader = bcf_sr_init(); std::string chr_gnomad; std::string AF_key; - // BcfFileGuard pfb_guard(nullptr, nullptr); // Guard to close the population allele frequency file if (use_pfb) { // Determine the ethnicity-specific allele frequency key @@ -537,13 +538,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } pfb_reader->require_index = 1; - // Set multi-threading if running on a single chromosome - if (input_data.isSingleChr()) - { - printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2))); - bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); - } - // Add the population allele frequency file to the reader if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { @@ -551,10 +545,17 @@ void 
CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Clean up bcf_sr_destroy(pfb_reader); - // bcf_hdr_destroy(snp_header); bcf_sr_destroy(snp_reader); return; } + + // Use multi-threading. This is possible here due to the lock + bcf_sr_set_threads(pfb_reader, thread_count); + // if (input_data.isSingleChr()) + // { + // printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2))); + // bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); + // } } // Split the region into samples diff --git a/src/input_data.cpp b/src/input_data.cpp index a24efb9e..489a9894 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -26,6 +26,7 @@ InputData::InputData() this->output_dir = ""; this->sample_size = 100; this->min_cnv_length = 1000; + this->min_reads = 5; this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; this->verbose = false; @@ -103,10 +104,15 @@ std::string InputData::getOutputDir() const void InputData::setOutputDir(std::string dirpath) { this->output_dir = dirpath; - - // Create the output directory std::string cmd = "mkdir -p " + output_dir; - system(cmd.c_str()); + try + { + std::system(cmd.c_str()); + } catch (const std::exception& e) + { + std::cerr << "Error creating output directory: " << e.what() << std::endl; + exit(1); + } } int InputData::getSampleSize() const @@ -149,6 +155,16 @@ void InputData::setMinCNVLength(int min_cnv_length) this->min_cnv_length = (uint32_t) min_cnv_length; } +void InputData::setMinReadSupport(int min_reads) +{ + this->min_reads = min_reads; +} + +int InputData::getMinReadSupport() const +{ + return this->min_reads; +} + void InputData::setChromosome(std::string chr) { this->chr = chr; diff --git a/src/main.cpp b/src/main.cpp index bbdb8366..58d8fbdc 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -46,6 +46,9 @@ void runContextSV(const std::unordered_map& args) if (args.find("min-cnv") != args.end()) { 
input_data.setMinCNVLength(std::stoi(args.at("min-cnv"))); } + if (args.find("min-reads") != args.end()) { + input_data.setMinReadSupport(std::stoi(args.at("min-reads"))); + } if (args.find("eth") != args.end()) { input_data.setEthnicity(args.at("eth")); } @@ -76,6 +79,7 @@ void printUsage(const std::string& programName) { << " -h, --hmm HMM file\n" << " -n, --sample-size Sample size for HMM predictions\n" << " --min-cnv Minimum CNV length\n" + << " --min-reads Minimum read support\n" << " -e, --eth ETH file\n" << " -p, --pfb PFB file\n" << " --save-cnv Save CNV data\n" @@ -110,6 +114,8 @@ std::unordered_map parseArguments(int argc, char* argv args["sample-size"] = argv[++i]; } else if (arg == "--min-cnv" && i + 1 < argc) { args["min-cnv"] = argv[++i]; + } else if (arg == "--min-reads" && i + 1 < argc) { + args["min-reads"] = argv[++i]; } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) { args["eth"] = argv[++i]; } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 34200b34..6180612f 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -27,7 +27,7 @@ #include "fasta_query.h" /// @endcond -# define DUP_SEQSIM_THRESHOLD 0.90 // Sequence similarity threshold for duplication detection +# define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection //std::mutex bam_mutex; @@ -336,174 +336,188 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec // List of ambiguous bases const std::string amb_bases = "RYKMSWBDHV"; for (int i = 0; i < cigar_len; i++) { - - int op = bam_cigar_op(cigar[i]); // CIGAR operation int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length - - if (op_len == 0) { - printError("Warning: Encountered CIGAR operation with length 0 at position " + std::to_string(pos+1) + " in chromosome " + chr); - continue; - } - - // Process the CIGAR operation - if (op == BAM_CINS && is_primary) { - - // Get the 
sequence of the insertion from the query - std::string ins_seq_str(op_len, ' '); - for (int j = 0; j < op_len; j++) { - // Replace ambiguous bases with N - char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; - if (amb_bases.find(base) != std::string::npos) { - ins_seq_str[j] = 'N'; - } else { - ins_seq_str[j] = base; + int op = bam_cigar_op(cigar[i]); // CIGAR operation + if (op_len >= 50) { + // Process SVs + + // Process the CIGAR operation + if (op == BAM_CINS && is_primary) { + + // Get the sequence of the insertion from the query + std::string ins_seq_str(op_len, ' '); + for (int j = 0; j < op_len; j++) { + // Replace ambiguous bases with N + char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; + if (amb_bases.find(base) != std::string::npos) { + ins_seq_str[j] = 'N'; + } else { + ins_seq_str[j] = base; + } + // Get the sequence character from the query + // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; } - // Get the sequence character from the query - // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; - } + + // To determine whether the insertion is a duplication, check + // for sequence identity between the insertion and the + // reference genome (duplications are typically >= 90%): + // Loop through the reference sequence and calculate the + // sequence identity +/- insertion length from the insertion + // position. 
+ // bool is_duplication = false; + // int ins_ref_pos; + // uint32_t dup_start = std::max(0, (int)pos - op_len); + // for (uint32_t j = dup_start; j <= pos; j++) { + + // // Get the string for the window (1-based coordinates) + // ins_ref_pos = j + 1; + // std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1); + + // // Continue if the window string is empty (out-of-range) + // if (window_str == "") { + // continue; + // } + + // // Calculate the sequence identity + // int num_matches = 0; + // for (int k = 0; k < op_len; k++) { + // if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') { + // num_matches++; + // } + // } + // float seq_identity = (float)num_matches / (float)op_len; + + // // Check if the target sequence identity is reached + // if (seq_identity >= DUP_SEQSIM_THRESHOLD) { + // is_duplication = true; + // break; + // } + // } - // To determine whether the insertion is a duplication, check - // for sequence identity between the insertion and the - // reference genome (duplications are typically >= 90%): - // Loop through the reference sequence and calculate the - // sequence identity +/- insertion length from the insertion - // position. 
- // bool is_duplication = false; - // int ins_ref_pos; - // uint32_t dup_start = std::max(0, (int)pos - op_len); - // for (uint32_t j = dup_start; j <= pos; j++) { - - // // Get the string for the window (1-based coordinates) - // ins_ref_pos = j + 1; - // std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1); - - // // Continue if the window string is empty (out-of-range) - // if (window_str == "") { - // continue; - // } - - // // Calculate the sequence identity - // int num_matches = 0; - // for (int k = 0; k < op_len; k++) { - // if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') { - // num_matches++; - // } - // } - // float seq_identity = (float)num_matches / (float)op_len; - - // // Check if the target sequence identity is reached - // if (seq_identity >= DUP_SEQSIM_THRESHOLD) { - // is_duplication = true; - // break; - // } - // } - - // Calculate the sequence identity at the insertion position +/- - // length - // Before the insertion - if (pos >= (uint32_t)op_len-1) - { - uint32_t bp1 = pos - (op_len - 1); - uint32_t bp2 = pos; - const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1); - if (window_str.length() > 0) + // Calculate the sequence identity at the insertion position +/- + // length if >= 50bp + //if (op_len >= 50) { + + // Before the insertion + if (pos >= (uint32_t)op_len-1) { - int num_matches = 0; - for (int k = 0; k < op_len; k++) + uint32_t bp1 = pos - (op_len - 1) + 1; + uint32_t bp2 = bp1 + op_len - 1; //pos + 1; + const std::string& window_str = ref_genome.query(chr, bp1, bp2); + if (window_str.length() > 0) { - if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') + int num_matches = 0; + for (int k = 0; k < op_len; k++) { - num_matches++; + if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') + { + num_matches++; + } + } + float seq_identity = (float)num_matches / (float)op_len; + if 
(seq_identity >= DUP_SEQSIM_THRESHOLD) + { + //uint32_t dup_bp1 = bp1 + 1; + //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); + int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + //printMessage("TEST3"); + addSVCall(sv_calls, bp1, bp2, "DUP", "", "LSEQSIM", "./.", default_lh, read_depth); + + // Continue to the next CIGAR operation + continue; } - } - float seq_identity = (float)num_matches / (float)op_len; - if (seq_identity >= DUP_SEQSIM_THRESHOLD) - { - uint32_t dup_bp1 = bp1 + 1; - uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); - int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2); - addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "LSEQSIM", "./.", default_lh, read_depth); - - // Continue to the next CIGAR operation - continue; } } - } - // After the insertion - if (pos + op_len < ref_genome.getChromosomeLength(chr)) - { - uint32_t bp1 = pos + 1; - uint32_t bp2 = bp1 + op_len - 1; - const std::string& window_str = ref_genome.query(chr, bp1 + 1, bp2 + 1); - if (window_str.length() > 0) + // After the insertion + if (pos + op_len < ref_genome.getChromosomeLength(chr)) { - int num_matches = 0; - for (int k = 0; k < op_len; k++) + uint32_t bp1 = pos + 1; + //uint32_t bp2 = std::min(bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); + uint32_t bp2 = bp1 + op_len - 1; + const std::string& window_str = ref_genome.query(chr, bp1, bp2); + if (window_str.length() > 0) { - if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') + int num_matches = 0; + for (int k = 0; k < op_len; k++) + { + if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') + { + num_matches++; + } + } + float seq_identity = (float)num_matches / (float)op_len; + if (seq_identity >= DUP_SEQSIM_THRESHOLD) { - num_matches++; + //uint32_t dup_bp1 = bp1 + 1; + //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 
1, ref_genome.getChromosomeLength(chr)); + int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + //printMessage("TEST1"); + addSVCall(sv_calls, bp1, bp2, "DUP", "", "RSEQSIM", "./.", default_lh, read_depth); + + // Continue to the next CIGAR operation + continue; } } - float seq_identity = (float)num_matches / (float)op_len; - if (seq_identity >= DUP_SEQSIM_THRESHOLD) - { - uint32_t dup_bp1 = bp1 + 1; - uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); - int read_depth = this->calculateReadDepth(pos_depth_map, dup_bp1, dup_bp2); - addSVCall(sv_calls, dup_bp1, dup_bp2, "DUP", ins_seq_str, "RSEQSIM", "./.", default_lh, read_depth); + } - // Continue to the next CIGAR operation - continue; - } + // Add as an insertion + // For read depth calculation, use the previous and current + // positions (1-based) + + //uint32_t ins_pos = pos; + //uint32_t ins_end = ins_pos + op_len -1; + uint32_t ins_pos = pos + 1; + uint32_t ins_end = ins_pos + op_len - 1; + int read_depth = this->calculateReadDepth(pos_depth_map, ins_pos-1, ins_pos); + //printMessage("TEST2: " + std::to_string(ins_pos) + ", " + std::to_string(ins_end) + ", OPLEN=" + std::to_string(op_len)); + + // Determine the ALT allele format based on small vs. 
large insertion + std::string alt_allele = ""; + if (op_len <= 50) { + alt_allele = ins_seq_str; } - } + + addSVCall(sv_calls, ins_pos, ins_end, "INS", alt_allele, "CIGARINS", "./.", default_lh, read_depth); + + // Determine whether to use a symbolic allele (>50bp) or the + // actual sequence + // if (op_len > 50) { + // ins_seq_str = ""; + // } else { + // ins_seq_str = ins_seq_str; + // } + + // Add to SV calls (1-based) with the appropriate SV type + // ref_pos = pos+1; + + // // For insertions, the reference end position is the same as the + // // reference position + // // For duplications, the reference end position is the same as + // // the reference position plus the length of the insertion + // ref_end = ref_pos + op_len - 1; + // if (is_duplication) { + // uint32_t bp1 = ref_pos; + // uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr)); + // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + // addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); + // } else { + // uint32_t bp1 = std::max(1, (int)ref_pos - 1); + // uint32_t bp2 = ref_pos; + // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + // addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); + // } + + // Check if the CIGAR operation is a deletion + } else if (op == BAM_CDEL && is_primary) { - // Add as an insertion - // For read depth calculation, use the previous and current - // positions (1-based) - int read_depth = this->calculateReadDepth(pos_depth_map, std::max(1, (int)pos), pos + 1); - uint32_t ins_pos = pos + 1; - uint32_t ins_end = ins_pos + op_len - 1; - addSVCall(sv_calls, ins_pos, ins_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); - - // Determine whether to use a symbolic allele (>50bp) or the - // actual sequence - // if (op_len > 50) { - // ins_seq_str = ""; - // } else { - // ins_seq_str = 
ins_seq_str; - // } - - // Add to SV calls (1-based) with the appropriate SV type - // ref_pos = pos+1; - - // // For insertions, the reference end position is the same as the - // // reference position - // // For duplications, the reference end position is the same as - // // the reference position plus the length of the insertion - // ref_end = ref_pos + op_len - 1; - // if (is_duplication) { - // uint32_t bp1 = ref_pos; - // uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr)); - // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - // addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); - // } else { - // uint32_t bp1 = std::max(1, (int)ref_pos - 1); - // uint32_t bp2 = ref_pos; - // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - // addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); - // } - - // Check if the CIGAR operation is a deletion - } else if (op == BAM_CDEL && is_primary) { - - ref_pos = pos+1; - ref_end = ref_pos + op_len -1; - // printMessage("Test2"); - int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); + ref_pos = pos+1; + ref_end = ref_pos + op_len -1; + // printMessage("Test2"); + int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); + //printMessage("TEST4: " + std::to_string(ref_pos) + ", " + std::to_string(ref_end) + ", OPLEN=" + std::to_string(op_len)); + addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); + } } // Update the reference position @@ -521,9 +535,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const 
ReferenceGenome& ref_genome) { - // int filter_threshold = 4; // Minimum number of supporting reads for an + // int filter_threshold = 4; // Minimum number of supporting reads for an SV call + // int filter_threshold = 10; // Minimum number of supporting reads for an // SV call - int filter_threshold = 10; // Minimum number of supporting reads for an SV call + int filter_threshold = input_data.getMinReadSupport(); // Minimum number of supporting reads for an SV call bool single_chr = input_data.getChromosome() != ""; // Open the BAM file @@ -534,6 +549,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v return; } + // Set multi-threading + int num_threads = input_data.getThreadCount(); + hts_set_threads(fp_in, num_threads); + // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); if (!bamHdr) { @@ -962,10 +981,12 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, uint32_t start, uint32_t end, std: // alt_allele = ""; // } - if (start >= end) { + if (start > end) { printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); return; } From b31875c16c6a459dfd37704f7119e654fc5154de Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 27 Jan 2025 20:58:16 -0500 Subject: [PATCH 062/134] efficiency updates --- Makefile-cpp | 2 +- include/cnv_caller.h | 2 +- include/fasta_query.h | 5 +- include/sv_caller.h | 4 +- include/sv_object.h | 2 + setup.py | 2 +- src/cnv_caller.cpp | 127 ++++++----- src/fasta_query.cpp | 108 +++++++--- src/input_data.cpp | 5 + src/sv_caller.cpp | 476 +++++++++++------------------------------- src/sv_object.cpp | 18 +- 11 files changed, 286 insertions(+), 465 deletions(-) diff --git a/Makefile-cpp b/Makefile-cpp index 3babecb3..e77cf0a8 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -18,7 +18,7 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib # Compiler and Flags CXX := g++ -CXXFLAGS := -std=c++14 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic +CXXFLAGS 
:= -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries LDLIBS := -lhts # Link with libhts.a or libhts.so diff --git a/include/cnv_caller.h b/include/cnv_caller.h index bed2a347..055e3247 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -94,7 +94,7 @@ class CNVCaller { // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; - double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const; + double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const; void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; diff --git a/include/fasta_query.h b/include/fasta_query.h index b3cee253..a0697446 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -9,6 +9,7 @@ #include #include #include +#include /// @endcond class ReferenceGenome { @@ -16,6 +17,7 @@ class ReferenceGenome { std::string fasta_filepath; std::vector chromosomes; std::unordered_map chr_to_seq; + std::map chr_to_length; //mutable std::mutex mtx; std::mutex& shared_mutex; @@ -24,7 +26,8 @@ class ReferenceGenome { int setFilepath(std::string fasta_filepath); std::string getFilepath() const; - std::string query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; + std::string_view query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const; + bool compare(const std::string& chr, uint32_t pos_start, uint32_t 
pos_end, const std::string& compare_seq, float match_threshold) const; // Get the chromosome contig lengths in VCF header format std::string getContigHeader() const; diff --git a/include/sv_caller.h b/include/sv_caller.h index 18c602c5..5cfbb956 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -34,8 +34,6 @@ class SVCaller { int min_mapq = 20; // Minimum mapping quality to be considered std::mutex shared_mutex; - void getAlignmentMismatchMap(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const GenomicRegion& region, MismatchData& mismatch_data, bool is_primary, const ReferenceGenome& ref_genome); - void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map); // Detect SVs from the CIGAR string of a read alignment, and return the @@ -53,7 +51,7 @@ class SVCaller { int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome); + void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query diff --git a/include/sv_object.h b/include/sv_object.h index 3fedee73..e3e3a8ba 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -34,6 +34,8 @@ void mergeSVs(std::vector& sv_calls); void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth); +void 
filterSVsWithLowSupport(std::vector &sv_calls, int min_depth, const std::string& data_type); + uint32_t getSVCount(const std::vector& sv_calls); void concatenateSVCalls(std::vector& sv_calls, const std::vector& sv_calls_update); diff --git a/setup.py b/setup.py index ce0c428f..c8591523 100644 --- a/setup.py +++ b/setup.py @@ -40,7 +40,7 @@ name="_" + NAME, sources=SRC_FILES, include_dirs=[INCLUDE_DIR, conda_include_dir], - extra_compile_args=["-std=c++14"], + extra_compile_args=["-std=c++17"], language="c++", libraries=["hts"], library_dirs=[conda_lib_dir] diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index a622325d..344288c4 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -298,7 +298,7 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 } // Calculate the mean chromosome coverage -double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count, bool single_chr) const +double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const { { // Open the BAM file @@ -449,6 +449,9 @@ void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, i void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const { + // Lock during reading + std::lock_guard lock(this->shared_mutex); + // --------- SNP file --------- const std::string snp_filepath = input_data.getSNPFilepath(); if (snp_filepath.empty()) @@ -458,6 +461,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } // Initialize the SNP file reader + // printMessage("Initializing SNP reader..."); bcf_srs_t *snp_reader = bcf_sr_init(); if (!snp_reader) { @@ -469,19 +473,16 @@ void 
CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Use multi-threading. This is possible here due to the lock int thread_count = input_data.getThreadCount(); bcf_sr_set_threads(snp_reader, thread_count); - // if (input_data.isSingleChr()) - // { - // printMessage("Setting SNP reader threads to " + std::to_string(std::max(1, thread_count / 2))); - // bcf_sr_set_threads(snp_reader, std::max(1, thread_count / 2)); - // } // Add the SNP file to the reader + // printMessage("Adding SNP file to reader..."); if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) { bcf_sr_destroy(snp_reader); printError("ERROR: Could not add SNP file to reader: " + snp_filepath); return; } + // printMessage("SNP file added to reader."); // --------- Population allele frequency file --------- @@ -491,7 +492,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui if (pfb_filepath.empty()) { use_pfb = false; - // printMessage("WARNING: No population allele frequency file provided for chromosome " + chr); } bcf_srs_t *pfb_reader = bcf_sr_init(); @@ -532,13 +532,13 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not initialize population allele frequency reader."); // Clean up - // bcf_hdr_destroy(snp_header); bcf_sr_destroy(snp_reader); return; } pfb_reader->require_index = 1; // Add the population allele frequency file to the reader + // printMessage("Adding population allele frequency file to reader..."); if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); @@ -551,11 +551,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Use multi-threading. 
This is possible here due to the lock bcf_sr_set_threads(pfb_reader, thread_count); - // if (input_data.isSingleChr()) - // { - // printMessage("Setting population allele frequency reader threads to " + std::to_string(std::max(1, thread_count / 2))); - // bcf_sr_set_threads(pfb_reader, std::max(1, thread_count / 2)); - // } } // Split the region into samples @@ -564,25 +559,26 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Loop through the samples and read the SNP data, storing the first // SNP position and BAF value for each sample - int print_count = 0; + // int print_count = 0; int current_region = 0; for (size_t i = 0; i < region_chunks.size(); ++i) { current_region++; // Lock during reading - std::lock_guard lock(this->shared_mutex); + // std::lock_guard lock(this->shared_mutex); // Read the SNP data ---------------------------------------------- // Set the region + // printMessage("Setting region for SNP reader..."); std::string region_str = region_chunks[i]; if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) { printError("ERROR: Could not set region for SNP reader: " + region_str); break; } + // printMessage("Region set for SNP reader, loading SNP data..."); - // printMessage("Iterating through SNPs in region " + std::to_string(current_region) + " of " + std::to_string((int) region_chunks.size()) + " with length " + std::to_string((int) (end_pos - start_pos)) + " bp..."); bool snp_found = false; while (bcf_sr_next_line(snp_reader) > 0) { @@ -609,10 +605,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Extract DP from FORMAT field int32_t *dp = 0; - // int dp_values[2]; int dp_count = 0; - // int dp_ret = bcf_get_format_int32(snp_header, snp_record, - // "DP", &dp, &dp_count); int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count); if (dp_ret < 0 || dp[0] <= 10) { @@ -621,8 +614,6 @@ void 
CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui free(dp); // Skip if the SNP does not pass the filter - // if (bcf_has_filter(snp_header, snp_record, - // const_cast("PASS")) != 1) if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast("PASS")) != 1) { continue; @@ -630,21 +621,14 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Extract AD from FORMAT field int32_t *ad = 0; - // int ad_values[2]; int ad_count = 0; - // int ad_ret = bcf_get_format_int32(snp_header, snp_record, - // "AD", &ad, &ad_count); int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count); - // int ad_ret = bcf_get_format_int32(snp_header, snp_record, - // "AD", &ad, &ad_count); if (ad_ret < 0 || ad_count < 2) { continue; } // Calculate the B-allele frequency (BAF) - // double baf = (double) ad_values[1] / (double) (ad_values[0] + - // ad_values[1]); double baf = (double) ad[1] / (double) (ad[0] + ad[1]); free(ad); @@ -672,10 +656,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Read the population allele frequency data ---------------------- if (use_pfb) { - // Lock during reading - //std::lock_guard lock(this->shared_mutex); - // Set the region as the SNP position + // printMessage("Setting region for population allele frequency reader..."); uint32_t target_snp_pos = snp_pos[i]; // Already 1-based std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) @@ -683,61 +665,74 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not set region for population allele frequency reader: " + region_str); break; } + // printMessage("Region set for population allele frequency reader, loading population allele frequency data..."); // Find the SNP position in 
the population allele frequency file + float *pfb_f = NULL; + int count = 0; while (bcf_sr_next_line(pfb_reader) > 0) { - if (!bcf_sr_has_line(pfb_reader, 0)) - { - continue; - } + // Get the SNP record and validate bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); - if (pfb_record) + if (!pfb_record || !bcf_is_snp(pfb_record)) { - // Skip if not a SNP - if (!bcf_is_snp(pfb_record)) - { - continue; - } + continue; // Skip if not a SNP + } - // Get the population frequency for the SNP - float *pfb_f = NULL; - int count = 0; - int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); - if (pfb_status < 0 || count == 0) - { - continue; - } - double pfb = (double) pfb_f[0]; - free(pfb_f); + // if (!bcf_sr_has_line(pfb_reader, 0)) + // { + // continue; + // } + // bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); + // if (pfb_record) + // { + // // Skip if not a SNP + // if (!bcf_is_snp(pfb_record)) + // { + // continue; + // } + + // Get the population frequency for the SNP + // float *pfb_f = NULL; + // int count = 0; + int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); + if (pfb_status < 0 || count == 0) + { + continue; + } + // double pfb = (double) pfb_f[0]; + double pfb = static_cast(pfb_f[0]); + // free(pfb_f); - // Continue if the population frequency is outside the threshold - if (pfb <= MIN_PFB || pfb >= MAX_PFB) - { - continue; - } + // Skip if outside the acceptable range + if (pfb <= MIN_PFB || pfb >= MAX_PFB) + { + continue; + } - // Add the population frequency to the SNP data - snp_pfb[i] = pfb; + // Add the population frequency to the SNP data + snp_pfb[i] = pfb; - // Break after finding the SNP position - break; + // Break after finding the SNP position + break; - if (print_count < 20) { - printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + 
snp_region_str + ")"); - print_count++; - } - } + // if (print_count < 20) { + // printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")"); + // print_count++; + // } } + free(pfb_f); + + // } if (pfb_reader->errnum) { printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); } } + // printMessage("SNP region " + std::to_string(current_region) + " of " + std::to_string(region_chunks.size()) + " completed."); } // Clean up - // bcf_hdr_destroy(snp_header); bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); } diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index 0f2ce105..445643cc 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -12,6 +12,9 @@ /// @endcond +#include "utils.h" + + int ReferenceGenome::setFilepath(std::string fasta_filepath) { if (fasta_filepath == "") @@ -31,8 +34,8 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) } // Get the chromosomes and sequences - std::vector chromosomes; - std::unordered_map chr_to_seq; + // std::vector chromosomes; + // std::unordered_map chr_to_seq; std::string current_chr = ""; std::string sequence = ""; std::string line_str = ""; @@ -45,8 +48,11 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) // Store the previous chromosome and sequence if (current_chr != "") { - chromosomes.push_back(current_chr); // Add the chromosome to the list - chr_to_seq[current_chr] = sequence; // Add the sequence to the map + this->chromosomes.push_back(current_chr); // Add the chromosome to the list + this->chr_to_seq[current_chr] = sequence; // Add the sequence to the map + this->chr_to_length[current_chr] = sequence.length(); // Add the sequence length to the map + // chromosomes.push_back(current_chr); // Add the chromosome to the list + // chr_to_seq[current_chr] = sequence; // Add the sequence to the map sequence = ""; // Reset the sequence } @@ -61,11 +67,11 
@@ int ReferenceGenome::setFilepath(std::string fasta_filepath) } // Check if the chromosome is already in the map - if (chr_to_seq.find(current_chr) != chr_to_seq.end()) - { - std::cerr << "Duplicate chromosome " << current_chr << std::endl; - exit(1); - } + // if (chr_to_seq.find(current_chr) != chr_to_seq.end()) + // { + // std::cerr << "Duplicate chromosome " << current_chr << std::endl; + // exit(1); + // } } else { // Sequence line sequence += line_str; @@ -75,19 +81,23 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) // Add the last chromosome at the end of the file if (current_chr != "") { - chromosomes.push_back(current_chr); // Add the chromosome to the list - chr_to_seq[current_chr] = sequence; // Add the sequence to the map + this->chromosomes.push_back(current_chr); // Add the chromosome to the list + this->chr_to_seq[current_chr] = sequence; // Add the sequence to the map + this->chr_to_length[current_chr] = sequence.length(); // Add the sequence length to the map + // chromosomes.push_back(current_chr); // Add the chromosome to the list + // chr_to_seq[current_chr] = sequence; // Add the sequence to the map } // Close the file fasta_file.close(); // Sort the chromosomes - std::sort(chromosomes.begin(), chromosomes.end()); + // std::sort(chromosomes.begin(), chromosomes.end()); + std::sort(this->chromosomes.begin(), this->chromosomes.end()); // Set the chromosomes and sequences - this->chromosomes = chromosomes; - this->chr_to_seq = chr_to_seq; + // this->chromosomes = chromosomes; + // this->chr_to_seq = chr_to_seq; return 0; } @@ -98,30 +108,75 @@ std::string ReferenceGenome::getFilepath() const } // Function to get the reference sequence at a given position range -std::string ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const +std::string_view ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const { - std::lock_guard lock(this->shared_mutex); + // 
printMessage("Querying reference genome"); + // std::lock_guard lock(this->shared_mutex); // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) pos_start--; pos_end--; // Ensure that the end position is not larger than the chromosome length - if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) + // if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) + const std::string& sequence = this->chr_to_seq.at(chr); + if (pos_end >= sequence.length() || pos_start > pos_end) { - return ""; + return {}; } - uint32_t length = pos_end - pos_start + 1; - const std::string& sequence = this->chr_to_seq.at(chr); + // uint32_t length = pos_end - pos_start + 1; // If the subsequence is empty, return empty string - if (sequence.substr(pos_start, length).empty()) + // if (sequence.substr(pos_start, length).empty()) + // { + // return ""; + // } + + // return sequence.substr(pos_start, length); + return std::string_view(sequence).substr(pos_start, (pos_end - pos_start) + 1); +} + +// Function to compare the reference sequence at a given position range +bool ReferenceGenome::compare(const std::string& chr, uint32_t pos_start, uint32_t pos_end, const std::string& compare_seq, float match_threshold) const +{ + // std::lock_guard lock(this->shared_mutex); + + // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) + pos_start--; + pos_end--; + + // Ensure that the end position is not larger than the chromosome length + // if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) + const std::string& sequence = this->chr_to_seq.at(chr); + if (pos_end >= sequence.length() || pos_start >= pos_end) + { + return {}; + } + + // Get the subsequence + std::string_view subseq = std::string_view(sequence).substr(pos_start, pos_end - pos_start + 1); + + // Ensure the lengths are equal + if (subseq.length() != compare_seq.length()) { - return ""; + printError("ERROR: Sequence lengths do not match for comparison"); + return false; } - 
return sequence.substr(pos_start, length); + // Calculate the match rate + size_t num_matches = 0; + for (size_t i = 0; i < subseq.length(); i++) + { + if (subseq[i] == compare_seq[i]) + { + num_matches++; + } + } + float match_rate = (float)num_matches / (float)subseq.length(); + + // Check if the match rate is above the threshold + return match_rate >= match_threshold; } // Function to get the chromosome contig lengths in VCF header format @@ -154,12 +209,13 @@ std::string ReferenceGenome::getContigHeader() const std::vector ReferenceGenome::getChromosomes() const { - std::lock_guard lock(this->shared_mutex); + // std::lock_guard lock(this->shared_mutex); return this->chromosomes; } uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const { - std::lock_guard lock(this->shared_mutex); - return this->chr_to_seq.at(chr).length(); + // std::lock_guard lock(this->shared_mutex); + // return this->chr_to_seq.at(chr).length(); + return this->chr_to_length.at(chr); } diff --git a/src/input_data.cpp b/src/input_data.cpp index 489a9894..40a640a2 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -157,6 +157,11 @@ void InputData::setMinCNVLength(int min_cnv_length) void InputData::setMinReadSupport(int min_reads) { + // Ensure that the minimum read support is an integer and greater than 0 + if (min_reads < 1) + { + throw std::runtime_error("Minimum read support must be an integer greater than 0"); + } this->min_reads = min_reads; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 6180612f..03d3b718 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include "ThreadPool.h" #include "utils.h" @@ -100,164 +101,11 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // Clean up the iterator and alignment hts_itr_destroy(itr); bam_destroy1(bam1); - printMessage(region + ": Processed " + std::to_string(primary_map.size()) + " primary alignments with " + 
std::to_string(supplementary_count) + " supplementary alignments"); + printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); // printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); } -void SVCaller::getAlignmentMismatchMap(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const GenomicRegion& region, MismatchData &mismatch_data, bool is_primary, const ReferenceGenome& ref_genome) -{ - // Create a read and iterator for the region - bam1_t *bam1 = bam_init1(); - if (!bam1) { - printError("ERROR: failed to initialize BAM record"); - return; - } - - - //bam_mutex.lock(); - this->shared_mutex.lock(); - hts_itr_t *itr = sam_itr_queryi(idx, region.tid, region.start - 1, region.end); - if (!itr) { - this->shared_mutex.unlock(); - bam_destroy1(bam1); - printError("ERROR: failed to query region " + std::to_string(region.tid) + ":" + std::to_string(region.start) + "-" + std::to_string(region.end)); - return; - } - this->shared_mutex.unlock(); - //bam_mutex.unlock(); - - - // Find the correct alignment - bool success = false; - std::string fail_str = ""; - // printMessage("Looking for alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? 
"forward" : "reverse")); - while (readNextAlignment(fp_in, itr, bam1) >= 0) { - // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality - if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { - continue; - } - - // Skip if not the correct type of alignment - if (is_primary && (bam1->core.flag & BAM_FSUPPLEMENTARY)) { - continue; - } else if (!is_primary && !(bam1->core.flag & BAM_FSUPPLEMENTARY)) { - continue; - } - - // Check the alignment start and end positions, and strand - if (bam1->core.pos+1 == region.start && bam_endpos(bam1) == region.end && !(bam1->core.flag & BAM_FREVERSE) == region.strand) { - // printMessage("SUCCESS: Found alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " at position: " + std::to_string(bam1->core.pos + 1) + "-" + std::to_string(bam_endpos(bam1))); - success = true; - break; - } else { - continue; - } - } - - // Check if the alignment was found - if (!success) { - printError("ERROR: Failed to find alignment for region: " + std::to_string(region.start) + "-" + std::to_string(region.end) + " with type: " + (is_primary ? "primary" : "supplementary") + " and strand: " + (region.strand ? 
"forward" : "reverse")); - hts_itr_destroy(itr); - bam_destroy1(bam1); - return; - } - - // Main loop to process the alignments - std::vector match_map(bam1->core.l_qseq, 0); // Query position to match/mismatch (1/0) map - uint32_t query_start = 0; - uint32_t query_end = 0; - uint32_t query_pos = 0; - bool first_op = true; - - // Process mismatches in the CIGAR string - const std::string chr = bamHdr->target_name[bam1->core.tid]; - hts_pos_t pos = bam1->core.pos; // 0-based position - uint32_t* cigar = bam_get_cigar(bam1); // CIGAR array - int cigar_len = bam1->core.n_cigar; - for (int i = 0; i < cigar_len; i++) { - int op = bam_cigar_op(cigar[i]); // CIGAR operation - int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length - - // Update match/mismatch query map - int MATCH = 1; - int MISMATCH = -1; - if (op == BAM_CEQUAL) { - for (int j = 0; j < op_len; j++) { - match_map[query_pos + j] = MATCH; - } - } else if (op == BAM_CDIFF) { - for (int j = 0; j < op_len; j++) { - match_map[query_pos + j] = MISMATCH; - } - } else if (op == BAM_CMATCH) { - // Get the read sequence - uint8_t* seq_ptr = bam_get_seq(bam1); - std::string cmatch_seq_str = ""; - for (int j = 0; j < op_len; j++) { - cmatch_seq_str += seq_nt16_str[bam_seqi(seq_ptr, query_pos + j)]; - } - - // Get the corresponding reference sequence - int cmatch_pos = pos + 1; // Querying the reference genome is 1-based - std::string cmatch_ref_str = ref_genome.query(chr, cmatch_pos, cmatch_pos + op_len - 1); - - // Check that the two sequence lengths are equal - if (cmatch_seq_str.length() != cmatch_ref_str.length()) { - printError("ERROR: Sequence lengths do not match for CIGAR operation: " + std::to_string(op)); - hts_itr_destroy(itr); - bam_destroy1(bam1); - return; - } - - // Compare the two sequences and update the mismatch map - for (int j = 0; j < op_len; j++) { - if (cmatch_seq_str[j] != cmatch_ref_str[j]) { - try { - match_map.at(query_pos + j) = MISMATCH; - } catch (const std::out_of_range& e) { - 
printError("ERROR: Out of range exception for query position: " + std::to_string(query_pos + j) + " with read length: " + std::to_string(bam1->core.l_qseq) + " and array size: " + std::to_string(match_map.size()) + " for CIGAR operation: " + std::to_string(op) + " with length: " + std::to_string(op_len)); - - // Exit the program - hts_itr_destroy(itr); - bam_destroy1(bam1); - - return; - } - // match_map[query_pos + j] = MISMATCH; - } else { - match_map[query_pos + j] = MATCH; - } - } - } else if (first_op && (op == BAM_CSOFT_CLIP || op == BAM_CHARD_CLIP)) { - query_start = query_pos + op_len; - first_op = false; - } - - // Update the reference position - // https://samtools.github.io/hts-specs/SAMv1.pdf - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { - pos += op_len; - } - - // Update the query position - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { - query_pos += op_len; - } - } - query_end = query_pos; - - // Clean up the iterator and alignment - hts_itr_destroy(itr); - bam_destroy1(bam1); - - // Update the mismatch data - mismatch_data.query_start = query_start; - mismatch_data.query_end = query_end; - mismatch_data.match_map = std::move(match_map); -} - - void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) { // Create a read and iterator for the region @@ -333,13 +181,15 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec uint32_t ref_pos; uint32_t ref_end; double default_lh = 0.0; - // List of ambiguous bases - const std::string amb_bases = "RYKMSWBDHV"; + const std::string amb_bases = "RYKMSWBDHV"; // Ambiguous bases + std::bitset<256> amb_bases_bitset; + for (char base : amb_bases) { + amb_bases_bitset.set(base); + } for (int i = 0; i < cigar_len; i++) { int op_len 
= bam_cigar_oplen(cigar[i]); // CIGAR operation length int op = bam_cigar_op(cigar[i]); // CIGAR operation if (op_len >= 50) { - // Process SVs // Process the CIGAR operation if (op == BAM_CINS && is_primary) { @@ -349,83 +199,24 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec for (int j = 0; j < op_len; j++) { // Replace ambiguous bases with N char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; - if (amb_bases.find(base) != std::string::npos) { + if (amb_bases_bitset.test(base)) { ins_seq_str[j] = 'N'; } else { ins_seq_str[j] = base; } - // Get the sequence character from the query - // ins_seq_str[j] = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; } - // To determine whether the insertion is a duplication, check - // for sequence identity between the insertion and the - // reference genome (duplications are typically >= 90%): - // Loop through the reference sequence and calculate the - // sequence identity +/- insertion length from the insertion - // position. 
- // bool is_duplication = false; - // int ins_ref_pos; - // uint32_t dup_start = std::max(0, (int)pos - op_len); - // for (uint32_t j = dup_start; j <= pos; j++) { - - // // Get the string for the window (1-based coordinates) - // ins_ref_pos = j + 1; - // std::string window_str = ref_genome.query(chr, ins_ref_pos, ins_ref_pos + op_len - 1); - - // // Continue if the window string is empty (out-of-range) - // if (window_str == "") { - // continue; - // } - - // // Calculate the sequence identity - // int num_matches = 0; - // for (int k = 0; k < op_len; k++) { - // if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') { - // num_matches++; - // } - // } - // float seq_identity = (float)num_matches / (float)op_len; - - // // Check if the target sequence identity is reached - // if (seq_identity >= DUP_SEQSIM_THRESHOLD) { - // is_duplication = true; - // break; - // } - // } - - // Calculate the sequence identity at the insertion position +/- - // length if >= 50bp - //if (op_len >= 50) { - // Before the insertion if (pos >= (uint32_t)op_len-1) { uint32_t bp1 = pos - (op_len - 1) + 1; uint32_t bp2 = bp1 + op_len - 1; //pos + 1; - const std::string& window_str = ref_genome.query(chr, bp1, bp2); - if (window_str.length() > 0) + + if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { - int num_matches = 0; - for (int k = 0; k < op_len; k++) - { - if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') - { - num_matches++; - } - } - float seq_identity = (float)num_matches / (float)op_len; - if (seq_identity >= DUP_SEQSIM_THRESHOLD) - { - //uint32_t dup_bp1 = bp1 + 1; - //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); - int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - //printMessage("TEST3"); - addSVCall(sv_calls, bp1, bp2, "DUP", "", "LSEQSIM", "./.", default_lh, read_depth); - - // Continue to the next CIGAR operation - 
continue; - } + int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + addSVCall(sv_calls, bp1, bp2, "DUP", "", "LSEQSIM", "./.", default_lh, read_depth); + continue; } } @@ -433,44 +224,22 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (pos + op_len < ref_genome.getChromosomeLength(chr)) { uint32_t bp1 = pos + 1; - //uint32_t bp2 = std::min(bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); uint32_t bp2 = bp1 + op_len - 1; - const std::string& window_str = ref_genome.query(chr, bp1, bp2); - if (window_str.length() > 0) + + if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { - int num_matches = 0; - for (int k = 0; k < op_len; k++) - { - if (ins_seq_str[k] == window_str[k] && ins_seq_str[k] != 'N' && window_str[k] != 'N') - { - num_matches++; - } - } - float seq_identity = (float)num_matches / (float)op_len; - if (seq_identity >= DUP_SEQSIM_THRESHOLD) - { - //uint32_t dup_bp1 = bp1 + 1; - //uint32_t dup_bp2 = std::min(dup_bp1 + op_len - 1, ref_genome.getChromosomeLength(chr)); - int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - //printMessage("TEST1"); - addSVCall(sv_calls, bp1, bp2, "DUP", "", "RSEQSIM", "./.", default_lh, read_depth); - - // Continue to the next CIGAR operation - continue; - } + int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + addSVCall(sv_calls, bp1, bp2, "DUP", "", "RSEQSIM", "./.", default_lh, read_depth); + continue; } } // Add as an insertion // For read depth calculation, use the previous and current // positions (1-based) - - //uint32_t ins_pos = pos; - //uint32_t ins_end = ins_pos + op_len -1; uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; int read_depth = this->calculateReadDepth(pos_depth_map, ins_pos-1, ins_pos); - //printMessage("TEST2: " + std::to_string(ins_pos) + ", " + std::to_string(ins_end) + ", OPLEN=" + std::to_string(op_len)); // Determine the ALT allele format based on small vs. 
large insertion std::string alt_allele = ""; @@ -480,42 +249,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec addSVCall(sv_calls, ins_pos, ins_end, "INS", alt_allele, "CIGARINS", "./.", default_lh, read_depth); - // Determine whether to use a symbolic allele (>50bp) or the - // actual sequence - // if (op_len > 50) { - // ins_seq_str = ""; - // } else { - // ins_seq_str = ins_seq_str; - // } - - // Add to SV calls (1-based) with the appropriate SV type - // ref_pos = pos+1; - - // // For insertions, the reference end position is the same as the - // // reference position - // // For duplications, the reference end position is the same as - // // the reference position plus the length of the insertion - // ref_end = ref_pos + op_len - 1; - // if (is_duplication) { - // uint32_t bp1 = ref_pos; - // uint32_t bp2 = std::min(ref_pos + op_len - 1, ref_genome.getChromosomeLength(chr)); - // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - // addSVCall(sv_calls, ref_pos, ref_end, "DUP", ins_seq_str, "CIGARDUP", "./.", default_lh, read_depth); - // } else { - // uint32_t bp1 = std::max(1, (int)ref_pos - 1); - // uint32_t bp2 = ref_pos; - // int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - // addSVCall(sv_calls, ref_pos, ref_end, "INS", ins_seq_str, "CIGARINS", "./.", default_lh, read_depth); - // } - // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - // printMessage("Test2"); int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - //printMessage("TEST4: " + std::to_string(ref_pos) + ", " + std::to_string(ref_end) + ", OPLEN=" + std::to_string(op_len)); addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); } } @@ -538,8 +277,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // int filter_threshold = 4; // Minimum 
number of supporting reads for an SV call // int filter_threshold = 10; // Minimum number of supporting reads for an // SV call - int filter_threshold = input_data.getMinReadSupport(); // Minimum number of supporting reads for an SV call - bool single_chr = input_data.getChromosome() != ""; + int cigar_sv_support_threshold = input_data.getMinReadSupport(); // Minimum number of supporting reads for an SV call + int split_sv_support_threshold = 4; // Minimum number of supporting reads for an SV call + // printMessage("Processing chromosome " + chr + " with filter threshold: " + std::to_string(filter_threshold)); // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -589,7 +329,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v CNVCaller cnv_caller(this->shared_mutex); std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index int thread_count = input_data.getThreadCount(); - double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count, single_chr); + double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count); if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { return; } @@ -599,7 +339,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); printMessage(chr + ": Merging CIGAR..."); - filterSVsWithLowSupport(chr_sv_calls, filter_threshold); + filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); mergeSVs(chr_sv_calls); int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -614,11 +354,11 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read 
SVs..."); - this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data, ref_genome); + this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); // Merge the SV calls from the current region printMessage(chr + ": Merging split reads..."); - filterSVsWithLowSupport(chr_sv_calls, filter_threshold); + filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT"); mergeSVs(chr_sv_calls); // Run a final merge on the combined SV calls @@ -659,9 +399,6 @@ void SVCaller::run(const InputData& input_data) // Shared resources std::unordered_map> whole_genome_sv_calls; - //std::mutex sv_mutex; - //std::mutex snp_mutex; - //std::mutex pfb_mutex; // Lambda to process a chromosome auto process_chr = [&](const std::string& chr) { @@ -670,7 +407,6 @@ void SVCaller::run(const InputData& input_data) InputData chr_input_data = input_data; // Use a thread-local copy this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome); { - //std::lock_guard lock(sv_mutex); std::lock_guard lock(this->shared_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); } @@ -692,10 +428,13 @@ void SVCaller::run(const InputData& input_data) } // Wait for all tasks to complete + int total_chr_count = futures.size(); + int current_chr = 0; for (auto& future : futures) { try { + current_chr++; future.get(); - printMessage("Chromosome task completed."); + printMessage("Chromosome task "+ std::to_string(current_chr) + " of " + std::to_string(total_chr_count) + " completed."); } catch (const std::exception& e) { printError("Error processing chromosome task: " + std::string(e.what())); } catch (...) 
{ @@ -722,42 +461,44 @@ void SVCaller::run(const InputData& input_data) // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data, const ReferenceGenome& ref_genome) +void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) { - printMessage(region + ": Getting split alignments..."); + // printMessage(region + ": Getting split alignments..."); std::unordered_map primary_map; std::unordered_map> supp_map; this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); // Find split-read SV evidence - printMessage(region + ": Finding split-read SVs..."); + // printMessage(region + ": Finding split-read SVs..."); int sv_count = 0; int current_primary = 0; + int primary_count = primary_map.size(); //int primary_count = primary_map.size(); uint32_t min_cnv_length = input_data.getMinCNVLength(); for (auto& entry : primary_map) { current_primary++; const std::string& qname = entry.first; GenomicRegion& primary_region = entry.second; - - // Skip primary alignments that do not have supplementary alignments - // if (supp_map.find(qname) == supp_map.end()) { - // continue; - // } - - // Get the read match/mismatch map - // printMessage(region + ": Getting mismatch map for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); - //MismatchData primary_mismatches; - //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, primary_region, primary_mismatches, true, ref_genome); // Find the largest supplementary alignment - GenomicRegion largest_supp_region = 
supp_map[qname][0]; - uint32_t largest_supp_length = 0; + auto& supp_regions = supp_map[qname]; + GenomicRegion largest_supp_region = supp_regions[0]; + auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) { + return a.end - a.start < b.end - b.start; + }); + if (it != supp_regions.end()) { + largest_supp_region = *it; + } + + // GenomicRegion largest_supp_region = supp_map[qname][0]; + // uint32_t largest_supp_length = 0; // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); const std::string& primary_chr = bamHdr->target_name[primary_region.tid]; - for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); ++it) { - GenomicRegion& supp_region = *it; + // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); + // ++it) { + for (auto& supp_region : supp_regions) { + // GenomicRegion& supp_region = *it; // Skip if not on the primary chromosome if (primary_region.tid != supp_region.tid) { @@ -765,28 +506,33 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } // Get the supplementary alignment information - uint32_t supp_start = (uint32_t) supp_region.start; - uint32_t supp_end = (uint32_t) supp_region.end; - uint32_t supp_length = supp_end - supp_start + 1; - if (supp_length > largest_supp_length) { - largest_supp_length = supp_length; - largest_supp_region = *it; - } + // uint32_t supp_start = (uint32_t) supp_region.start; + // uint32_t supp_end = (uint32_t) supp_region.end; + // uint32_t supp_length = supp_end - supp_start + 1; + // if (supp_length > largest_supp_length) { + // largest_supp_length = supp_length; + // largest_supp_region = *it; + // } // Inversion detection bool is_opposite_strand = primary_region.strand != supp_region.strand; if (is_opposite_strand) { - if (supp_length >= min_cnv_length) { + // if 
(supp_length >= min_cnv_length) { + if (supp_region.end - supp_region.start >= min_cnv_length) { // Print error if the start position is greater than the end // position - if (supp_start > supp_end) { - printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); + // if (supp_start > supp_end) { + if (supp_region.start > supp_region.end) { + printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end)); + // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); continue; } // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); + // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); + // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start)); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(result) == SVType::UNKNOWN) { continue; } @@ -794,47 +540,44 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); // printMessage("Test3"); - int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); + int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end); + // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); if (supp_type == SVType::NEUTRAL) { - 
addSVCall(sv_calls, supp_start, supp_end, "INV", "", "HMM", "./.", supp_lh, read_depth); + // addSVCall(sv_calls, supp_start, supp_end, "INV", + // "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "", "SPLIT", "./.", supp_lh, read_depth); sv_count++; } else if (supp_type == SVType::DUP) { - addSVCall(sv_calls, supp_start, supp_end, "INVDUP", "", "HMM", "./.", supp_lh, read_depth); + // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", + // "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "", "SPLIT", "./.", supp_lh, read_depth); } } - // } else { - // // Add the inversion without running copy number predictions - // // (too small for predictions) - // // printMessage("Test4"); - // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); - // addSVCall(sv_calls, supp_start, supp_end, "INV", "", "REV", "./.", 0.0, read_depth); - // } } } - // Trim overlapping alignments - //MismatchData supp_mismatches; - //printMessage(region + ": Getting mismatch map for supplementary alignments..."); - //this->getAlignmentMismatchMap(fp_in, idx, bamHdr, largest_supp_region, supp_mismatches, false, ref_genome); - - // printMessage(region + ": Trimming overlapping alignments..."); - //trimOverlappingAlignments(primary_region, largest_supp_region, primary_mismatches, supp_mismatches); + // Analyze split-read evidence for deletions and duplications bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; - if (primary_region.start < largest_supp_region.start) { // Primary before supp - boundary_left = primary_region.start; - boundary_right = std::max(primary_region.end, largest_supp_region.end); - gap_left = primary_region.end; - gap_right = largest_supp_region.start; - gap_exists = gap_left < gap_right; - } else { - boundary_left = largest_supp_region.start; - boundary_right = std::max(primary_region.end, 
largest_supp_region.end); - gap_left = largest_supp_region.end; - gap_right = primary_region.start; - gap_exists = gap_left < gap_right; - } + boundary_left = std::min(primary_region.start, largest_supp_region.start); + boundary_right = std::max(primary_region.end, largest_supp_region.end); + gap_left = std::min(primary_region.end, largest_supp_region.start); + gap_right = std::max(primary_region.start, largest_supp_region.end); + gap_exists = gap_left < gap_right; + // if (primary_region.start < largest_supp_region.start) { // Primary before supp + // boundary_left = primary_region.start; + // boundary_right = std::max(primary_region.end, largest_supp_region.end); + // gap_left = primary_region.end; + // gap_right = largest_supp_region.start; + // gap_exists = gap_left < gap_right; + // } else { + // boundary_left = largest_supp_region.start; + // boundary_right = std::max(primary_region.end, largest_supp_region.end); + // gap_left = largest_supp_region.end; + // gap_right = primary_region.start; + // gap_exists = gap_left < gap_right; + // } // Run copy number variant predictions on the boundary if large enough if (boundary_right - boundary_left >= min_cnv_length) { @@ -846,7 +589,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - // printMessage(region + ": Running copy number prediction for boundary..."); + // printMessage(region + ": Running copy number prediction for + // boundary..."); + // printMessage("Running copy number prediction, length: " + std::to_string(boundary_right - boundary_left)); std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -864,7 +609,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - // printMessage(region + ": Running copy number prediction for gap..."); + // printMessage(region + 
": Running copy number prediction for + // gap..."); + // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left)); std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(gap_result) == SVType::UNKNOWN) { continue; @@ -874,25 +621,27 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // If higher likelihood than the boundary, add the gap as the SV call if (gap_lh > bd_lh) { - // printMessage("Test5"); int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; - addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "GAP", "./.", gap_lh, read_depth); + addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "SPLIT", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call - // printMessage("Test6"); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call - // printMessage("Test7"); int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "BOUNDARY", "./.", bd_lh, read_depth); + addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth); } } + + // Print progress every 1000 primary alignments + if (current_primary % 1000 == 0) { + printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); + } } } @@ -1050,6 +799,11 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, uint32_t start, uint32_t end, std: if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { return; } - - // Set the alt allele to or if the SV type is DUP or DEL - // if (sv_type == "DUP" && alt_allele == ".") { - // printError("ERROR: Invalid alt allele for duplication at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); - // alt_allele = ""; - // } else if (sv_type == "DEL" && alt_allele == ".") { - // printError("ERROR: Invalid alt allele for deletion at position " + std::to_string(start) + "-" + std::to_string(end) + ": " + alt_allele); - // alt_allele = ""; - // } if (start > end) { printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); @@ -44,7 +35,6 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: if (it != sv_calls.end() && it->start == start && it->end == end) { it->support += 1; // Update the read support - // printMessage("Updating SV call with length " + std::to_string(end - start) + " and type " + sv_type + " and support " + std::to_string(it->support)); if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood) { // Update the SV call @@ -153,3 +143,11 @@ void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) return sv_call.support < min_support; }), sv_calls.end()); } + +void filterSVsWithLowSupport(std::vector 
&sv_calls, int min_support, const std::string &data_type) +{ + // Filter SV calls with low read depth only for the specified data type, keeping the rest + sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support, data_type](const SVCall& sv_call) { + return sv_call.support < min_support && sv_call.data_type == data_type; + }), sv_calls.end()); +} From 249cb6465c56c83fa3b617b42bf43a7b12e5f260 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 28 Jan 2025 22:13:49 -0500 Subject: [PATCH 063/134] resolve primary overlaps to increase speed --- include/sv_object.h | 8 +- src/sv_caller.cpp | 292 ++++++++++++++++++++++++++++++-------------- src/sv_object.cpp | 17 ++- 3 files changed, 221 insertions(+), 96 deletions(-) diff --git a/include/sv_object.h b/include/sv_object.h index e3e3a8ba..0b2a489c 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -19,13 +19,17 @@ struct SVCall { double hmm_likelihood = 0.0; int read_depth = 0; // Breakpoint depth int support = 0; // Number of supporting reads + int cluster_size = 0; // Number of SV calls in the cluster // Comparison operator for std::set bool operator<(const SVCall& other) const; // Constructor with parameters for all fields - SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {} + SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), 
cluster_size(cluster_size) {} + + // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : + // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {} }; void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 03d3b718..2e24c2cf 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include "ThreadPool.h" #include "utils.h" @@ -58,6 +59,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam uint32_t supplementary_count = 0; // Main loop to process the alignments + std::unordered_map primary_map_qual; uint32_t num_alignments = 0; while (readNextAlignment(fp_in, itr, bam1) >= 0) { @@ -71,15 +73,16 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { // primary_map[qname] = itr; // Store chromosome (TID), start, and end positions (1-based) of the - // primary alignment, and the strand + // primary alignment, and the strand (true for forward, false for reverse) primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)}; + primary_map_qual[qname] = bam1->core.qual; primary_count++; // Process supplementary alignments } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { // supp_map[qname].push_back(itr); // Store chromosome (TID), start, and end positions (1-based) of the - // supplementary alignment, and the strand + // supplementary alignment, and the strand (true for forward, false 
for reverse) supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)}); supplementary_count++; } @@ -102,7 +105,46 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam hts_itr_destroy(itr); bam_destroy1(bam1); printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); - // printMessage("Processed " + std::to_string(num_alignments) + " alignments with " + std::to_string(primary_count) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments..."); + + // Filter overlapping primary alignments and keep the one with the highest mapping + // quality + // std::vector to_remove_overlapping; + std::unordered_set to_remove_overlapping; + for (const auto& entry1 : primary_map) { + const std::string& qname1 = entry1.first; + const GenomicRegion& primary1 = entry1.second; + for (const auto& entry2 : primary_map) { + const std::string& qname2 = entry2.first; + if (qname1 == qname2) { + continue; + } + const GenomicRegion& primary2 = entry2.second; + if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) { + // Overlapping primary alignments + // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2])); + if (primary_map_qual[qname1] < primary_map_qual[qname2]) { + // to_remove_overlapping.push_back(qname1); + to_remove_overlapping.insert(qname1); + } else { + // If equal, remove the shorter alignment + if (primary1.end - primary1.start < primary2.end - primary2.start) { + // to_remove_overlapping.push_back(qname1); + to_remove_overlapping.insert(qname1); + } else { + // to_remove_overlapping.push_back(qname2); + to_remove_overlapping.insert(qname2); + } + } + } + } + } + + for (const std::string& qname : 
to_remove_overlapping) { + primary_map.erase(qname); + supp_map.erase(qname); + } + printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments"); + printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supp_map.size()) + " supplementary alignments after filtering"); } @@ -278,7 +320,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // int filter_threshold = 10; // Minimum number of supporting reads for an // SV call int cigar_sv_support_threshold = input_data.getMinReadSupport(); // Minimum number of supporting reads for an SV call - int split_sv_support_threshold = 4; // Minimum number of supporting reads for an SV call + // int split_sv_support_threshold = 4; // Minimum number of supporting + // reads for an SV call + int split_sv_support_threshold = input_data.getMinReadSupport(); // printMessage("Processing chromosome " + chr + " with filter threshold: " + std::to_string(filter_threshold)); // Open the BAM file @@ -357,13 +401,14 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); // Merge the SV calls from the current region - printMessage(chr + ": Merging split reads..."); - filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT"); - mergeSVs(chr_sv_calls); + // printMessage(chr + ": Merging split reads..."); + // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold); + // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT"); + // mergeSVs(chr_sv_calls); // Run a final merge on the combined SV calls - printMessage(chr + ": Merging final calls..."); - mergeSVs(chr_sv_calls); + // printMessage(chr + ": Merging final calls..."); + // mergeSVs(chr_sv_calls); printMessage("Completed chromosome " + chr); } 
@@ -470,7 +515,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Find split-read SV evidence // printMessage(region + ": Finding split-read SVs..."); - int sv_count = 0; int current_primary = 0; int primary_count = primary_map.size(); //int primary_count = primary_map.size(); @@ -478,104 +522,170 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in for (auto& entry : primary_map) { current_primary++; const std::string& qname = entry.first; - GenomicRegion& primary_region = entry.second; - + GenomicRegion& primary = entry.second; + const std::string& primary_chr = bamHdr->target_name[primary.tid]; + // Find the largest supplementary alignment auto& supp_regions = supp_map[qname]; - GenomicRegion largest_supp_region = supp_regions[0]; + // GenomicRegion largest_supp = supp_regions[0]; auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) { return a.end - a.start < b.end - b.start; }); - if (it != supp_regions.end()) { - largest_supp_region = *it; - } - - // GenomicRegion largest_supp_region = supp_map[qname][0]; - // uint32_t largest_supp_length = 0; + GenomicRegion largest_supp = *it; + + // If on a different chromosome, label as a translocation + if (primary.tid != largest_supp.tid) { + // Note that these do not currently have a likelihood score or read depth + // Create two BND records for the translocation + // Create the alternate allele format for the first BND record + const std::string& supp_chr = bamHdr->target_name[largest_supp.tid]; + std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "["; + if (largest_supp.strand == false) { + // Reverse-oriented relative to the reference + alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; + } + addSVCall(sv_calls, primary.start, primary.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0); - // printMessage(region + ": 
Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); - const std::string& primary_chr = bamHdr->target_name[primary_region.tid]; - // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); - // ++it) { - for (auto& supp_region : supp_regions) { - // GenomicRegion& supp_region = *it; - - // Skip if not on the primary chromosome - if (primary_region.tid != supp_region.tid) { - continue; + // Create the alternate allele format for the second BND record + alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; + if (primary.strand == false) { + // Reverse-oriented relative to the reference + alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; } + addSVCall(sv_calls, largest_supp.start, largest_supp.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0); - // Get the supplementary alignment information - // uint32_t supp_start = (uint32_t) supp_region.start; - // uint32_t supp_end = (uint32_t) supp_region.end; - // uint32_t supp_length = supp_end - supp_start + 1; - // if (supp_length > largest_supp_length) { - // largest_supp_length = supp_length; - // largest_supp_region = *it; - // } - - // Inversion detection - bool is_opposite_strand = primary_region.strand != supp_region.strand; - if (is_opposite_strand) { - // if (supp_length >= min_cnv_length) { - if (supp_region.end - supp_region.start >= min_cnv_length) { - - // Print error if the start position is greater than the end - // position - // if (supp_start > supp_end) { - if (supp_region.start > supp_region.end) { - printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end)); - // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); - continue; - } + continue; + } - // printMessage(region + ": Running 
copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); - // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); - // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start)); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data); - if (std::get<1>(result) == SVType::UNKNOWN) { - continue; - } + // Inversion detection + bool is_opposite_strand = primary.strand != largest_supp.strand; + if (is_opposite_strand) { + // if (supp_length >= min_cnv_length) { + if (largest_supp.end - largest_supp.start >= min_cnv_length) { - double supp_lh = std::get<0>(result); - SVType supp_type = std::get<1>(result); - // printMessage("Test3"); - int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end); - // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); - if (supp_type == SVType::NEUTRAL) { - // addSVCall(sv_calls, supp_start, supp_end, "INV", - // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "", "SPLIT", "./.", supp_lh, read_depth); - - sv_count++; - } else if (supp_type == SVType::DUP) { - // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", - // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "", "SPLIT", "./.", supp_lh, read_depth); - } + // Print error if the start position is greater than the end + // position + // if (supp_start > supp_end) { + if (largest_supp.start > largest_supp.end) { + printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end)); + // printError("ERROR: Invalid inversion 
coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); + continue; + } + + // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); + // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); + // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start)); + std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data); + if (std::get<1>(result) == SVType::UNKNOWN) { + continue; + } + + double supp_lh = std::get<0>(result); + SVType supp_type = std::get<1>(result); + // printMessage("Test3"); + int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end); + // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); + if (supp_type == SVType::NEUTRAL) { + // addSVCall(sv_calls, supp_start, supp_end, "INV", + // "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INV", "", "SPLIT", "./.", supp_lh, read_depth); + continue; + + } else if (supp_type == SVType::DUP) { + // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", + // "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INVDUP", "", "SPLIT", "./.", supp_lh, read_depth); + continue; } } } + + // GenomicRegion largest_supp_region = supp_map[qname][0]; + // uint32_t largest_supp_length = 0; + + // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); + // const std::string& primary_chr = bamHdr->target_name[primary.tid]; + // // for (auto it = supp_map[qname].begin(); it != 
supp_map[qname].end(); + // // ++it) { + // for (auto& supp_region : supp_regions) { + // // GenomicRegion& supp_region = *it; + + // // Skip if not on the primary chromosome + // if (primary.tid != supp_region.tid) { + // continue; + // } + + // // Get the supplementary alignment information + // // uint32_t supp_start = (uint32_t) supp_region.start; + // // uint32_t supp_end = (uint32_t) supp_region.end; + // // uint32_t supp_length = supp_end - supp_start + 1; + // // if (supp_length > largest_supp_length) { + // // largest_supp_length = supp_length; + // // largest_supp_region = *it; + // // } + + // // Inversion detection + // bool is_opposite_strand = primary.strand != supp_region.strand; + // if (is_opposite_strand) { + // // if (supp_length >= min_cnv_length) { + // if (supp_region.end - supp_region.start >= min_cnv_length) { + + // // Print error if the start position is greater than the end + // // position + // // if (supp_start > supp_end) { + // if (supp_region.start > supp_region.end) { + // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end)); + // // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); + // continue; + // } + + // // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); + // // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); + // // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start)); + // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data); + // if (std::get<1>(result) == SVType::UNKNOWN) { + // continue; + // } + 
+ // double supp_lh = std::get<0>(result); + // SVType supp_type = std::get<1>(result); + // // printMessage("Test3"); + // int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end); + // // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); + // if (supp_type == SVType::NEUTRAL) { + // // addSVCall(sv_calls, supp_start, supp_end, "INV", + // // "", "SPLIT", "./.", supp_lh, read_depth); + // addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "", "SPLIT", "./.", supp_lh, read_depth); + + // sv_count++; + // } else if (supp_type == SVType::DUP) { + // // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", + // // "", "SPLIT", "./.", supp_lh, read_depth); + // addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "", "SPLIT", "./.", supp_lh, read_depth); + // } + // } + // } + // } // Analyze split-read evidence for deletions and duplications bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; - boundary_left = std::min(primary_region.start, largest_supp_region.start); - boundary_right = std::max(primary_region.end, largest_supp_region.end); - gap_left = std::min(primary_region.end, largest_supp_region.start); - gap_right = std::max(primary_region.start, largest_supp_region.end); + boundary_left = std::min(primary.start, largest_supp.start); + boundary_right = std::max(primary.end, largest_supp.end); + gap_left = std::min(primary.end, largest_supp.start); + gap_right = std::max(primary.start, largest_supp.end); gap_exists = gap_left < gap_right; - // if (primary_region.start < largest_supp_region.start) { // Primary before supp - // boundary_left = primary_region.start; - // boundary_right = std::max(primary_region.end, largest_supp_region.end); - // gap_left = primary_region.end; + // if (primary.start < largest_supp_region.start) { // Primary before supp + // boundary_left = primary.start; + // boundary_right = std::max(primary.end, 
largest_supp_region.end); + // gap_left = primary.end; // gap_right = largest_supp_region.start; // gap_exists = gap_left < gap_right; // } else { // boundary_left = largest_supp_region.start; - // boundary_right = std::max(primary_region.end, largest_supp_region.end); + // boundary_right = std::max(primary.end, largest_supp_region.end); // gap_left = largest_supp_region.end; - // gap_right = primary_region.start; + // gap_right = primary.start; // gap_exists = gap_left < gap_right; // } @@ -680,6 +790,7 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", "##INFO=", + "##INFO=", "##FILTER=", "##FILTER=", "##FORMAT=", @@ -731,6 +842,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, uint32_t start, uint32_t end, std: } // Insert the SV call in sorted order - SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1}; + // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, + // hmm_likelihood, read_depth, 1}; + SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); // Update the SV type if the SV call already exists (if likelihood is @@ -51,7 +53,9 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: void updateSVType(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood) { // Update the SV type for an existing SV call - auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall{start, end, "", "", "", "", 0.0, 0, 0}); + // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), + // SVCall{start, end, "", "", "", "", 0.0, 0, 0}); + auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall(start, end, "", "", "", "", 0.0, 0, 0, 0)); if (it != sv_calls.end() && it->start == start && it->end == end) { it->sv_type = sv_type; @@ 
-103,14 +107,17 @@ void mergeSVs(std::vector& sv_calls) //XprintMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction)); // Keep the SV call with the higher read support if (next.support > current_merge.support) { + next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size current_merge = next; } else if (next.support == current_merge.support) { // Keep the SV call with the higher likelihood if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) { + next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size current_merge = next; } else if (next.hmm_likelihood == current_merge.hmm_likelihood) { // Keep the SV call with the higher read depth if (next.read_depth > current_merge.read_depth) { + next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size current_merge = next; } } @@ -120,10 +127,12 @@ void mergeSVs(std::vector& sv_calls) uint32_t current_length = current_merge.end - current_merge.start; uint32_t next_length = next.end - next.start; if (next_length > current_length) { // And support meets threshold + next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size current_merge = next; } } } else { + // Store the merged SV call and move to the next SV call merged_sv_calls.push_back(current_merge); current_merge = next; } @@ -138,9 +147,9 @@ void mergeSVs(std::vector& sv_calls) void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) { - // Filter SV calls with low read support + // Filter SV calls with low read support or low cluster size sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { - return sv_call.support < min_support; + return sv_call.support < min_support && sv_call.cluster_size < min_support; }), sv_calls.end()); } From 29333bbae64388fadfbbb58878955581bb591de9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 29 
Jan 2025 12:29:44 -0500 Subject: [PATCH 064/134] fix errors preventing 1 recall for ins dup --- include/sv_object.h | 11 ++++++++--- include/sv_types.h | 12 ++++++++++++ src/cnv_caller.cpp | 48 +++++++++++++++++++++++++++++---------------- src/sv_caller.cpp | 29 ++++++++++++++------------- src/sv_object.cpp | 24 +++++------------------ 5 files changed, 71 insertions(+), 53 deletions(-) diff --git a/include/sv_object.h b/include/sv_object.h index 0b2a489c..43267e0e 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -8,11 +8,16 @@ #include #include +#include "sv_types.h" + +using namespace sv_types; + // Struct to represent a structural variant call struct SVCall { uint32_t start; uint32_t end; - std::string sv_type = "NA"; + // std::string sv_type = "NA"; + SVType sv_type = SVType::UNKNOWN; std::string alt_allele = "."; std::string data_type = "NA"; std::string genotype = "./."; @@ -25,14 +30,14 @@ struct SVCall { bool operator<(const SVCall& other) const; // Constructor with parameters for all fields - SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : + SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), 
read_depth(read_depth), support(support) {} }; -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); void mergeSVs(std::vector& sv_calls); diff --git a/include/sv_types.h b/include/sv_types.h index 60471a01..c0e1fabf 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -57,6 +57,18 @@ namespace sv_types { inline SVType getSVTypeFromCNState(int cn_state) { return CNVTypeMap.at(cn_state); } + + // Function to check if an SV type is a valid update from copy number predictions + inline bool isValidCopyNumberUpdate(SVType sv_type, SVType updated_sv_type) { + if (updated_sv_type == SVType::UNKNOWN) { + return false; + } else if (sv_type == SVType::DEL && updated_sv_type != SVType::DEL) { + return false; + } else if (sv_type == SVType::INS && updated_sv_type != SVType::DUP) { + return false; + } + return true; + } } #endif // SV_TYPES_H diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 344288c4..16d62d4f 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -246,29 +246,43 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorcalculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, "DUP", "", "LSEQSIM", "./.", default_lh, read_depth); + addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth); continue; } } @@ -271,7 +271,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, "DUP", "", "RSEQSIM", "./.", default_lh, read_depth); 
+ addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth); continue; } } @@ -289,7 +289,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec alt_allele = ins_seq_str; } - addSVCall(sv_calls, ins_pos, ins_end, "INS", alt_allele, "CIGARINS", "./.", default_lh, read_depth); + addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { @@ -297,7 +297,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, "DEL", "", "CIGARDEL", "./.", default_lh, read_depth); + addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth); } } @@ -383,8 +383,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); printMessage(chr + ": Merging CIGAR..."); - filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); - mergeSVs(chr_sv_calls); + // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); + // mergeSVs(chr_sv_calls); + // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -544,7 +545,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; } - addSVCall(sv_calls, primary.start, primary.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0); + addSVCall(sv_calls, primary.start, primary.end, SVType::BND, 
alt_allele, "SPLIT", "./.", 0.0, 0); // Create the alternate allele format for the second BND record alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; @@ -552,7 +553,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; } - addSVCall(sv_calls, largest_supp.start, largest_supp.end, "BND", alt_allele, "SPLIT", "./.", 0.0, 0); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); continue; } @@ -588,13 +589,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (supp_type == SVType::NEUTRAL) { // addSVCall(sv_calls, supp_start, supp_end, "INV", // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INV", "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth); continue; } else if (supp_type == SVType::DUP) { // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, largest_supp.start, largest_supp.end, "INVDUP", "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth); continue; } } @@ -733,18 +734,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(gap_type) + ">"; - addSVCall(sv_calls, gap_left, gap_right, getSVTypeString(gap_type), alt_allele, "SPLIT", "./.", gap_lh, read_depth); + addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth); + addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, getSVTypeString(bd_type), alt_allele, "SPLIT", "./.", bd_lh, read_depth); + addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); } } @@ -836,7 +837,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) { // Ignore unknown SV types - if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { + // if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { + // return; + // } + if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { return; } @@ -50,23 +53,6 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, std: } } -void 
updateSVType(std::vector& sv_calls, uint32_t start, uint32_t end, std::string sv_type, std::string data_type, std::string genotype, double hmm_likelihood) -{ - // Update the SV type for an existing SV call - // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), - // SVCall{start, end, "", "", "", "", 0.0, 0, 0}); - auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), SVCall(start, end, "", "", "", "", 0.0, 0, 0, 0)); - if (it != sv_calls.end() && it->start == start && it->end == end) - { - it->sv_type = sv_type; - it->data_type = data_type; - it->genotype = genotype; - it->hmm_likelihood = hmm_likelihood; - } else { - printError("ERROR: SV call not found for update at position " + std::to_string(start) + "-" + std::to_string(end)); - } -} - uint32_t getSVCount(const std::vector& sv_calls) { return (uint32_t) sv_calls.size(); From 0b3d205d4601cfcb779567b4549580ef7c3a23f0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 30 Jan 2025 00:11:43 -0500 Subject: [PATCH 065/134] high recall merging --- include/sv_caller.h | 1 + include/sv_object.h | 9 ++-- src/sv_caller.cpp | 48 +++++++++++------ src/sv_object.cpp | 125 ++++++++++++++++++++++++-------------------- 4 files changed, 106 insertions(+), 77 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 5cfbb956..1824015c 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -21,6 +21,7 @@ struct GenomicRegion { hts_pos_t start; hts_pos_t end; bool strand; + uint8_t qual; }; struct MismatchData { diff --git a/include/sv_object.h b/include/sv_object.h index 43267e0e..8ccbf79f 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -25,19 +25,22 @@ struct SVCall { int read_depth = 0; // Breakpoint depth int support = 0; // Number of supporting reads int cluster_size = 0; // Number of SV calls in the cluster + uint8_t qual = 0; // Alignment quality score // Comparison operator for std::set bool operator<(const SVCall& other) const; // Constructor with parameters for all 
fields - SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} + SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size, uint8_t qual) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size), qual(qual) {} + // SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : + // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {} }; -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string 
data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual); void mergeSVs(std::vector& sv_calls); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3da0dbe6..0839e310 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -59,7 +59,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam uint32_t supplementary_count = 0; // Main loop to process the alignments - std::unordered_map primary_map_qual; + // std::unordered_map primary_map_qual; uint32_t num_alignments = 0; while (readNextAlignment(fp_in, itr, bam1) >= 0) { @@ -68,14 +68,15 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam continue; } const std::string qname = bam_get_qname(bam1); // Query template name + uint8_t mapq = bam1->core.qual; // Mapping quality // Process primary alignments if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { // primary_map[qname] = itr; // Store chromosome (TID), start, and end positions (1-based) of the // primary alignment, and the strand (true for forward, false for reverse) - primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)}; - primary_map_qual[qname] = bam1->core.qual; + primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq}; + // primary_map_qual[qname] = bam1->core.qual; primary_count++; // Process supplementary alignments @@ -83,7 +84,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // supp_map[qname].push_back(itr); // Store chromosome (TID), start, and end positions (1-based) of the // supplementary alignment, and the strand (true for forward, false for reverse) - supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE)}); + supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), 
!(bam1->core.flag & BAM_FREVERSE), mapq}); supplementary_count++; } num_alignments++; @@ -122,7 +123,8 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) { // Overlapping primary alignments // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2])); - if (primary_map_qual[qname1] < primary_map_qual[qname2]) { + // if (primary_map_qual[qname1] < primary_map_qual[qname2]) { + if (primary1.qual < primary2.qual) { // to_remove_overlapping.push_back(qname1); to_remove_overlapping.insert(qname1); } else { @@ -217,6 +219,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; uint32_t query_pos = 0; + uint8_t qual = alignment->core.qual; // Loop through the CIGAR string, process operations, detect SVs (primary // only), and calculate sequence identity for potential duplications (primary only) @@ -257,7 +260,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth); + addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, qual); continue; } } @@ -271,7 +274,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth); + addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", 
"RSEQSIM", "./.", default_lh, read_depth, qual); continue; } } @@ -289,7 +292,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec alt_allele = ins_seq_str; } - addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth); + addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, qual); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { @@ -297,7 +300,12 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth); + addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth, qual); + + // Print if the ref pos is within the range 44007800-44007930 + if (ref_pos >= 44007800 && ref_pos <= 44007930) { + printMessage("DEL: " + chr + ":" + std::to_string(ref_pos) + "-" + std::to_string(ref_end) + " (LENGTH " + std::to_string(op_len) + ")"); + } } } @@ -384,7 +392,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": Merging CIGAR..."); // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); - // mergeSVs(chr_sv_calls); + mergeSVs(chr_sv_calls); // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -401,6 +409,11 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": Split read SVs..."); this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + // Sort the SV calls by start 
position + std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start; + }); + // Merge the SV calls from the current region // printMessage(chr + ": Merging split reads..."); // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold); @@ -545,7 +558,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; } - addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); + addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual); // Create the alternate allele format for the second BND record alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; @@ -553,7 +566,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; } - addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual); continue; } @@ -589,13 +602,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (supp_type == SVType::NEUTRAL) { // addSVCall(sv_calls, supp_start, supp_end, "INV", // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); continue; } else if (supp_type == SVType::DUP) { // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", // "", "SPLIT", "./.", supp_lh, read_depth); 
- addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth); + addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); continue; } } @@ -669,6 +682,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // } // Analyze split-read evidence for deletions and duplications + uint8_t mean_qual = (primary.qual + largest_supp.qual) / 2; bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; boundary_left = std::min(primary.start, largest_supp.start); @@ -734,18 +748,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; - addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth); + addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual); } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); + addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); } } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); + addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); } } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 2c980e60..748fe01d 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "utils.h" @@ -14,7 +15,7 @@ bool SVCall::operator<(const SVCall & other) const return start < other.start || (start == other.start && end < other.end); } -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual) { // Ignore unknown SV types // if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { @@ -32,14 +33,15 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVTy // Insert the SV call in sorted order // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, // hmm_likelihood, read_depth, 1}; - SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1}; + SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); - // Update the SV type if the SV call already exists (if likelihood is - // higher) + // Determine if the SV call already exists if (it != sv_calls.end() && it->start == start && it->end == end) { it->support += 1; // Update the read support + + // Update SV type if likelihood is higher if (hmm_likelihood != 0.0 && hmm_likelihood > 
it->hmm_likelihood) { // Update the SV call @@ -47,6 +49,7 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVTy it->data_type = data_type; it->genotype = genotype; it->hmm_likelihood = hmm_likelihood; + it->qual = qual; } } else { sv_calls.insert(it, sv_call); // Insert the new SV call @@ -68,67 +71,75 @@ void mergeSVs(std::vector& sv_calls) if (sv_calls.size() < 2) { return; } + int initial_size = sv_calls.size(); - // Merge SV calls if they overlap - // int initial_size = sv_calls.size(); - - // Merge any SV calls that have >90% reciprocal overlap + std::vector merged(sv_calls.size(), false); std::vector merged_sv_calls; - SVCall current_merge = sv_calls[0]; - for (size_t i = 1; i < sv_calls.size(); i++) { - SVCall& next = sv_calls[i]; - // Check for overlap - if (next.start <= current_merge.end) { - //XprintMessage("Comparing SV " + std::to_string(current_merge.start) + "-" + std::to_string(current_merge.end) + " (support " + std::to_string(current_merge.support) + ", length " + std::to_string(current_merge.end - current_merge.start) + ") with " + std::to_string(next.start) + "-" + std::to_string(next.end) + " (support " + std::to_string(next.support) + ", length " + std::to_string(next.end - next.start) + ")"); - - // if (current_merge.start <= next.end && next.start <= current_merge.end) { - // Calculate reciprocal overlap - uint32_t overlap = std::max(0, (int)std::min(current_merge.end, next.end) - (int)std::max(current_merge.start, next.start)); - uint32_t union_length = std::max(current_merge.end, next.end) - std::min(current_merge.start, next.start); - double overlap_fraction = static_cast(overlap) / union_length; - //XprintMessage("Overlap fraction: " + std::to_string(overlap_fraction)); - - // Merge if reciprocal overlap is >90% - if (overlap_fraction > 0.90) { - //XprintMessage("Merging SV calls with overlap " + std::to_string(overlap_fraction)); - // Keep the SV call with the higher read support - if (next.support > 
current_merge.support) { - next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size - current_merge = next; - } else if (next.support == current_merge.support) { - // Keep the SV call with the higher likelihood - if (next.hmm_likelihood != 0.0 && current_merge.hmm_likelihood != 0.0 && next.hmm_likelihood > current_merge.hmm_likelihood) { - next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size - current_merge = next; - } else if (next.hmm_likelihood == current_merge.hmm_likelihood) { - // Keep the SV call with the higher read depth - if (next.read_depth > current_merge.read_depth) { - next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size - current_merge = next; - } - } + + // Sort SVs by start position to improve efficiency + std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start; + }); + + for (size_t i = 0; i < sv_calls.size(); i++) { + if (merged[i]) continue; + + size_t best_index = i; + int total_cluster_size = sv_calls[i].cluster_size; // Track total cluster size + + for (size_t j = i + 1; j < sv_calls.size(); j++) { + if (merged[j]) continue; + + // Compute overlap + uint32_t overlap_start = std::max(sv_calls[i].start, sv_calls[j].start); + uint32_t overlap_end = std::min(sv_calls[i].end, sv_calls[j].end); + uint32_t overlap_length = (overlap_end > overlap_start) ? (overlap_end - overlap_start) : 0; + + // Compute union length correctly + uint32_t union_start = std::min(sv_calls[i].start, sv_calls[j].start); + uint32_t union_end = std::max(sv_calls[i].end, sv_calls[j].end); + uint32_t union_length = union_end - union_start; // No +1 to prevent off-by-one errors + + double overlap_fraction = (union_length > 0) ? 
(static_cast(overlap_length) / union_length) : 0.0; + + // Throw error if fraction > 1 + if (overlap_fraction > 1.0) { + throw std::runtime_error("Error: Overlap fraction = " + std::to_string(overlap_fraction) + " > 1.0"); + } + + // if (overlap_fraction > 0.5) { + if (overlap_fraction > 0.5) { // Changed from 0.5 + total_cluster_size += sv_calls[j].cluster_size; + if (sv_calls[j].support > sv_calls[best_index].support) { + best_index = j; } - } else { - // Continue with the larger length - uint32_t current_length = current_merge.end - current_merge.start; - uint32_t next_length = next.end - next.start; - if (next_length > current_length) { // And support meets threshold - next.cluster_size = current_merge.cluster_size + 1; // Update the cluster size - current_merge = next; - } + merged[j] = true; // Mark SV as merged } - } else { - // Store the merged SV call and move to the next SV call - merged_sv_calls.push_back(current_merge); - current_merge = next; } + + sv_calls[best_index].cluster_size = total_cluster_size; // Update best SV with total size + merged_sv_calls.push_back(sv_calls[best_index]); // Keep the strongest SV } - merged_sv_calls.push_back(current_merge); // Add the last SV call - sv_calls = merged_sv_calls; // Update the SV calls + // Filter out merged SVs with low support or cluster size + // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) { + // return sv_call.support < 2 && sv_call.cluster_size < 10; // Adjust thresholds as needed + // }), merged_sv_calls.end()); + // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) { + // return sv_call.support < 2 && sv_call.cluster_size < 3; // Adjust thresholds as needed + // }), merged_sv_calls.end()); + + sv_calls = std::move(merged_sv_calls); // Replace with filtered list + + // Print SVs that have length 2039 + for (const auto& sv_call : sv_calls) { + if 
(sv_call.end - sv_call.start == 2039) { + printMessage("Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")"); + } + } - // int updated_size = sv_calls.size(); - // std::cout << "Merged " << initial_size << " SV calls into " << updated_size << " SV calls" << std::endl; + int updated_size = sv_calls.size(); + printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); } void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) From 8e4da06e32d71a93886c7761aa1786466145ceb2 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 30 Jan 2025 18:43:51 -0500 Subject: [PATCH 066/134] improved merging to obtain 1 recall for sv types in chr21 --- src/sv_object.cpp | 82 +++++++++++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 35 deletions(-) diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 748fe01d..5e31f3e8 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -31,8 +31,6 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVTy } // Insert the SV call in sorted order - // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, - // hmm_likelihood, read_depth, 1}; SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); @@ -84,57 +82,71 @@ void mergeSVs(std::vector& sv_calls) for (size_t i = 0; i < sv_calls.size(); i++) { if (merged[i]) continue; - size_t best_index = i; - int total_cluster_size = sv_calls[i].cluster_size; // Track total cluster size + std::vector cluster; + cluster.push_back(sv_calls[i]); + merged[i] = true; + // Use 10% of the length of the first SV as the threshold + uint32_t sv_a_window = (uint32_t) std::ceil((double) (sv_calls[i].end - sv_calls[i].start + 1) * 0.1); + + // Find SVs that have 
start or end positions within 10% of each other's length for (size_t j = i + 1; j < sv_calls.size(); j++) { if (merged[j]) continue; - // Compute overlap - uint32_t overlap_start = std::max(sv_calls[i].start, sv_calls[j].start); - uint32_t overlap_end = std::min(sv_calls[i].end, sv_calls[j].end); - uint32_t overlap_length = (overlap_end > overlap_start) ? (overlap_end - overlap_start) : 0; + // Check if the SVs are within 10% of the largest SV's length + uint32_t sv_b_window = (uint32_t) std::ceil((double) (sv_calls[j].end - sv_calls[j].start + 1) * 0.1); + uint32_t sv_window = std::max(sv_a_window, sv_b_window); + bool start_within_window = std::abs((int) sv_calls[j].start - (int) sv_calls[i].start) <= (int) sv_window; + bool end_within_window = std::abs((int) sv_calls[j].end - (int) sv_calls[i].end) <= (int) sv_window; + if (start_within_window && end_within_window) { + cluster.push_back(sv_calls[j]); + merged[j] = true; + } + } - // Compute union length correctly - uint32_t union_start = std::min(sv_calls[i].start, sv_calls[j].start); - uint32_t union_end = std::max(sv_calls[i].end, sv_calls[j].end); - uint32_t union_length = union_end - union_start; // No +1 to prevent off-by-one errors + // Remove clusters with single SVs that have low support + if (cluster.size() < 2 && cluster[0].support < 2) { + continue; + } - double overlap_fraction = (union_length > 0) ? 
(static_cast(overlap_length) / union_length) : 0.0; + std::vector filtered_cluster = cluster; - // Throw error if fraction > 1 - if (overlap_fraction > 1.0) { - throw std::runtime_error("Error: Overlap fraction = " + std::to_string(overlap_fraction) + " > 1.0"); + // If any SV length equals 2039, print all the SV calls in the cluster + bool found_2039 = false; + for (const auto& sv : filtered_cluster) { + if (sv.end - sv.start == 2039) { + printMessage("[TEST] Found SV with length 2039 at " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ")"); + found_2039 = true; } - - // if (overlap_fraction > 0.5) { - if (overlap_fraction > 0.5) { // Changed from 0.5 - total_cluster_size += sv_calls[j].cluster_size; - if (sv_calls[j].support > sv_calls[best_index].support) { - best_index = j; - } - merged[j] = true; // Mark SV as merged + } + if (found_2039) { + std::cout << "[TEST] Cluster of SVs with size " << filtered_cluster.size() << ":" << std::endl; + for (const auto& sv : filtered_cluster) { + printMessage("SV: " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ", LEN=" + std::to_string(sv.end - sv.start) + ")"); } } - sv_calls[best_index].cluster_size = total_cluster_size; // Update best SV with total size - merged_sv_calls.push_back(sv_calls[best_index]); // Keep the strongest SV - } + // Find the median-length SV in the cluster and use it as the merged SV + // Sort the cluster by length + std::sort(filtered_cluster.begin(), filtered_cluster.end(), [](const SVCall& a, const SVCall& b) { + return (a.end - a.start) < (b.end - b.start); + }); - // Filter out merged SVs with low support or cluster size - // merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) { - // return sv_call.support < 2 && sv_call.cluster_size < 10; // Adjust thresholds as needed - // }), merged_sv_calls.end()); - // 
merged_sv_calls.erase(std::remove_if(merged_sv_calls.begin(), merged_sv_calls.end(), [initial_size](const SVCall& sv_call) { - // return sv_call.support < 2 && sv_call.cluster_size < 3; // Adjust thresholds as needed - // }), merged_sv_calls.end()); + // Get the median SV + size_t median_index = filtered_cluster.size() / 2; + SVCall median_sv = filtered_cluster[median_index]; + median_sv.cluster_size = (int) cluster.size(); + + // Add the merged SV to the list + merged_sv_calls.push_back(median_sv); + } sv_calls = std::move(merged_sv_calls); // Replace with filtered list // Print SVs that have length 2039 for (const auto& sv_call : sv_calls) { if (sv_call.end - sv_call.start == 2039) { - printMessage("Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")"); + printMessage("[TEST] Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")"); } } From 07551d975e6765089cfe936b2307d75a3f16025f Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 31 Jan 2025 20:16:52 -0500 Subject: [PATCH 067/134] dbscan in cpp --- include/dbscan.h | 31 +++++++++ include/sv_object.h | 5 -- src/dbscan.cpp | 79 +++++++++++++++++++++++ src/sv_caller.cpp | 1 - src/sv_object.cpp | 149 ++++++++++++++++++++++++-------------------- 5 files changed, 192 insertions(+), 73 deletions(-) create mode 100644 include/dbscan.h create mode 100644 src/dbscan.cpp diff --git a/include/dbscan.h b/include/dbscan.h new file mode 100644 index 00000000..923570c5 --- /dev/null +++ b/include/dbscan.h @@ -0,0 +1,31 @@ +#ifndef DBSCAN_H +#define DBSCAN_H + +#include +#include +#include +#include + +#include "sv_object.h" + +class DBSCAN { + public: + DBSCAN(double epsilon, int minPts) : epsilon(epsilon), minPts(minPts) {} + + void fit(const std::vector& sv_calls); + + const std::vector& getClusters() const; + + private: 
+ double epsilon; + int minPts; + std::vector clusters; + + bool expandCluster(const std::vector& sv_calls, size_t pointIdx, int clusterId); + + std::vector regionQuery(const std::vector& sv_calls, size_t pointIdx) const; + + double distance(const SVCall& a, const SVCall& b) const; +}; + +#endif // DBSCAN_H diff --git a/include/sv_object.h b/include/sv_object.h index 8ccbf79f..cf52cac6 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -33,11 +33,6 @@ struct SVCall { // Constructor with parameters for all fields SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size, uint8_t qual) : start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size), qual(qual) {} - // SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : - // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} - - // SVCall(uint32_t start, uint32_t end, std::string sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support) : - // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support) {} }; void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual); diff 
--git a/src/dbscan.cpp b/src/dbscan.cpp new file mode 100644 index 00000000..d7cc3aeb --- /dev/null +++ b/src/dbscan.cpp @@ -0,0 +1,79 @@ +#include "dbscan.h" + +#include +#include +#include +#include + + +void DBSCAN::fit(const std::vector& sv_calls) { + int clusterId = 0; + // clusters.assign(points.size(), -1); // -1 means unclassified + clusters.assign(sv_calls.size(), -1); // -1 means unclassified + + // for (size_t i = 0; i < points.size(); ++i) { + for (size_t i = 0; i < sv_calls.size(); ++i) { + if (clusters[i] == -1) { // if point is not yet classified + // if (expandCluster(points, i, clusterId)) { + if (expandCluster(sv_calls, i, clusterId)) { + ++clusterId; + } + } + } +} + +const std::vector& DBSCAN::getClusters() const { + return clusters; +} + +// bool DBSCAN::expandCluster(const std::vector>& +// points, size_t pointIdx, int clusterId) { +bool DBSCAN::expandCluster(const std::vector& sv_calls, size_t pointIdx, int clusterId) { + std::vector seeds = regionQuery(sv_calls, pointIdx); + if (static_cast(seeds.size()) < minPts) { + clusters[pointIdx] = -2; // mark as noise + return false; + } + + for (size_t seedIdx : seeds) { + clusters[seedIdx] = clusterId; + } + + seeds.erase(std::remove(seeds.begin(), seeds.end(), pointIdx), seeds.end()); + + while (!seeds.empty()) { + size_t currentPoint = seeds.back(); + seeds.pop_back(); + + std::vector result = regionQuery(sv_calls, currentPoint); + if (static_cast(result.size()) >= minPts) { + for (size_t resultPoint : result) { + if (clusters[resultPoint] == -1 || clusters[resultPoint] == -2) { + if (clusters[resultPoint] == -1) { + seeds.push_back(resultPoint); + } + clusters[resultPoint] = clusterId; + } + } + } + } + + return true; +} + +std::vector DBSCAN::regionQuery(const std::vector& sv_calls, size_t pointIdx) const { + std::vector neighbors; + for (size_t i = 0; i < sv_calls.size(); ++i) { + if (distance(sv_calls[pointIdx], sv_calls[i]) <= epsilon) { + neighbors.push_back(i); + } + } + return neighbors; 
+} + +double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const { + // return std::sqrt(std::pow(point1.first - point2.first, 2) + + // std::pow(point1.second - point2.second, 2)); + return std::sqrt(std::pow(static_cast(point1.start) - static_cast(point2.start), 2) + + std::pow(static_cast(point1.end) - static_cast(point2.end), 2)); +} diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 0839e310..d7f79d01 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -373,7 +373,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v int region_start = region_data.first; int region_end = region_data.second; region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); - } // Load chromosome data for copy number predictions diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 5e31f3e8..0d4cec97 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -8,6 +8,7 @@ #include #include +#include "dbscan.h" #include "utils.h" bool SVCall::operator<(const SVCall & other) const @@ -71,84 +72,98 @@ void mergeSVs(std::vector& sv_calls) } int initial_size = sv_calls.size(); - std::vector merged(sv_calls.size(), false); + // Cluster SVs using DBSCAN for each SV type std::vector merged_sv_calls; - // Sort SVs by start position to improve efficiency - std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start; - }); - - for (size_t i = 0; i < sv_calls.size(); i++) { - if (merged[i]) continue; - - std::vector cluster; - cluster.push_back(sv_calls[i]); - merged[i] = true; - - // Use 10% of the length of the first SV as the threshold - uint32_t sv_a_window = (uint32_t) std::ceil((double) (sv_calls[i].end - sv_calls[i].start + 1) * 0.1); - - // Find SVs that have start or end positions within 10% of each other's length - for (size_t j = i + 1; j < sv_calls.size(); j++) { - if (merged[j]) continue; - - // Check if the SVs are within 10% of the largest SV's 
length - uint32_t sv_b_window = (uint32_t) std::ceil((double) (sv_calls[j].end - sv_calls[j].start + 1) * 0.1); - uint32_t sv_window = std::max(sv_a_window, sv_b_window); - bool start_within_window = std::abs((int) sv_calls[j].start - (int) sv_calls[i].start) <= (int) sv_window; - bool end_within_window = std::abs((int) sv_calls[j].end - (int) sv_calls[i].end) <= (int) sv_window; - if (start_within_window && end_within_window) { - cluster.push_back(sv_calls[j]); - merged[j] = true; - } - } + // Create a set of size intervals and corresponding DBSCAN epsilons + std::map, double> size_to_eps; + size_to_eps[{0, 1000}] = 200; + size_to_eps[{1000, 5000}] = 500; + size_to_eps[{5000, 10000}] = 3000; + size_to_eps[{10000, 50000}] = 4000; + size_to_eps[{50000, 100000}] = 5000; + size_to_eps[{100000, 500000}] = 10000; + size_to_eps[{500000, 1000000}] = 20000; - // Remove clusters with single SVs that have low support - if (cluster.size() < 2 && cluster[0].support < 2) { - continue; - } + for (auto& size_interval : size_to_eps) { - std::vector filtered_cluster = cluster; + // Calculate epsilon as 20% of the size interval + double epsilon = 0.2 * (size_interval.first.second - size_interval.first.first); + printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon)); - // If any SV length equals 2039, print all the SV calls in the cluster - bool found_2039 = false; - for (const auto& sv : filtered_cluster) { - if (sv.end - sv.start == 2039) { - printMessage("[TEST] Found SV with length 2039 at " + std::to_string(sv.start) + "-" + std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ")"); - found_2039 = true; - } - } - if (found_2039) { - std::cout << "[TEST] Cluster of SVs with size " << filtered_cluster.size() << ":" << std::endl; - for (const auto& sv : filtered_cluster) { - printMessage("SV: " + std::to_string(sv.start) + "-" + 
std::to_string(sv.end) + " (SUP=" + std::to_string(sv.support) + ", LEN=" + std::to_string(sv.end - sv.start) + ")"); + DBSCAN dbscan(size_interval.second, 2); + + for ( const auto& sv_type : { + SVType::DEL, + SVType::DUP, + SVType::INV, + SVType::INS, + SVType::BND, + SVType::INV_DUP + }) + { + // DBSCAN dbscan(1000, 2); + + // Create a vector of SV calls for the current SV type and size interval + std::vector sv_type_calls; + + // If the final size interval, then don't set an upper bound + int lower_bound = size_interval.first.first; + int upper_bound = size_interval.first.second; + if (lower_bound == 500000) + { + std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound](const SVCall& sv_call) { + return sv_call.sv_type == sv_type && static_cast(sv_call.end - sv_call.start) >= lower_bound; + }); + // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { + // return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound; + // }); + } else { + std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound, upper_bound](const SVCall& sv_call) { + return sv_call.sv_type == sv_type && static_cast(sv_call.end - sv_call.start) >= lower_bound && static_cast(sv_call.end - sv_call.start) <= upper_bound; + }); + // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, &size_interval](const SVCall& sv_call) { + // return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound && (sv_call.end - sv_call.start) <= upper_bound; + // }); } - } + // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { + // return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= size_interval.first && (sv_call.end - sv_call.start) <= size_interval.second; + // }); - // Find the median-length SV 
in the cluster and use it as the merged SV - // Sort the cluster by length - std::sort(filtered_cluster.begin(), filtered_cluster.end(), [](const SVCall& a, const SVCall& b) { - return (a.end - a.start) < (b.end - b.start); - }); - - // Get the median SV - size_t median_index = filtered_cluster.size() / 2; - SVCall median_sv = filtered_cluster[median_index]; + if (sv_type_calls.size() < 2) { + continue; + } - median_sv.cluster_size = (int) cluster.size(); - - // Add the merged SV to the list - merged_sv_calls.push_back(median_sv); - } - sv_calls = std::move(merged_sv_calls); // Replace with filtered list + dbscan.fit(sv_type_calls); + const std::vector& clusters = dbscan.getClusters(); + std::map> cluster_map; + for (size_t i = 0; i < clusters.size(); ++i) { + cluster_map[clusters[i]].push_back(sv_type_calls[i]); + } - // Print SVs that have length 2039 - for (const auto& sv_call : sv_calls) { - if (sv_call.end - sv_call.start == 2039) { - printMessage("[TEST] Found merged SV with length 2039 at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " (SUP=" + std::to_string(sv_call.support) + ")"); + // Merge SVs in each cluster + for (auto& cluster : cluster_map) { + int cluster_id = cluster.first; + // const std::vector& cluster_sv_calls = cluster.second; + std::vector& cluster_sv_calls = cluster.second; + if (cluster_id < 0) { + continue; // Skip noise and unclassified points + } else { + // Use the median length SV + // Sort the SVs in the cluster by their length + std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return (a.end - a.start) < (b.end - b.start); + }); + int median_index = cluster_sv_calls.size() / 2; + SVCall median_sv_call = cluster_sv_calls[median_index]; + median_sv_call.cluster_size = (int) cluster_sv_calls.size(); + merged_sv_calls.push_back(median_sv_call); + } + } + printMessage("Completed clustering for " + getSVTypeString(sv_type)); } } + sv_calls = 
std::move(merged_sv_calls); // Replace with filtered list int updated_size = sv_calls.size(); printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); From 4793bbfc694a779150f10814de622d8a63ea25bd Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sun, 2 Feb 2025 20:27:54 -0500 Subject: [PATCH 068/134] update dbscan --- include/sv_object.h | 2 + src/dbscan.cpp | 17 +++- src/sv_caller.cpp | 110 ++++-------------------- src/sv_object.cpp | 199 ++++++++++++++++++++++---------------------- 4 files changed, 132 insertions(+), 196 deletions(-) diff --git a/include/sv_object.h b/include/sv_object.h index cf52cac6..da544571 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -47,4 +47,6 @@ uint32_t getSVCount(const std::vector& sv_calls); void concatenateSVCalls(std::vector& sv_calls, const std::vector& sv_calls_update); +void mergeSVs(std::vector &sv_calls, double epsilon, int min_pts); + #endif // SV_OBJECT_H diff --git a/src/dbscan.cpp b/src/dbscan.cpp index d7cc3aeb..d5310292 100644 --- a/src/dbscan.cpp +++ b/src/dbscan.cpp @@ -74,6 +74,19 @@ std::vector DBSCAN::regionQuery(const std::vector& sv_calls, siz double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const { // return std::sqrt(std::pow(point1.first - point2.first, 2) + // std::pow(point1.second - point2.second, 2)); - return std::sqrt(std::pow(static_cast(point1.start) - static_cast(point2.start), 2) + - std::pow(static_cast(point1.end) - static_cast(point2.end), 2)); + // return std::sqrt(std::pow(static_cast(point1.start) - static_cast(point2.start), 2) + + // std::pow(static_cast(point1.end) - + // static_cast(point2.end), 2)); + + // Calculate reciprocal overlap-based distance + // https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02840-6 + // https://link.springer.com/article/10.1186/gb-2009-10-10-r119 + int overlap = std::max(0, std::min(static_cast(point1.end), static_cast(point2.end)) - 
std::max(static_cast(point1.start), static_cast(point2.start))); + int length1 = static_cast(point1.end - point1.start); + int length2 = static_cast(point2.end - point2.start); + + // Minimum reciprocal overlap + double distance = 1.0 - std::min(static_cast(overlap) / static_cast(length1), static_cast(overlap) / static_cast(length2)); + // double distance = 1.0 - static_cast(overlap) / std::min(length1, length2); + return distance; // 0.0 means identical, 1.0 means no overlap } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index d7f79d01..6c137c8c 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -391,7 +391,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": Merging CIGAR..."); // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); - mergeSVs(chr_sv_calls); + mergeSVs(chr_sv_calls, 0.8, 5); // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -417,11 +417,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // printMessage(chr + ": Merging split reads..."); // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold); // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT"); - // mergeSVs(chr_sv_calls); // Run a final merge on the combined SV calls // printMessage(chr + ": Merging final calls..."); - // mergeSVs(chr_sv_calls); printMessage("Completed chromosome " + chr); } @@ -528,9 +526,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Find split-read SV evidence // printMessage(region + ": Finding split-read SVs..."); + std::vector split_sv_calls; int current_primary = 0; int primary_count = primary_map.size(); - //int primary_count = primary_map.size(); uint32_t min_cnv_length = input_data.getMinCNVLength(); for (auto& entry 
: primary_map) { current_primary++; @@ -557,7 +555,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; } - addSVCall(sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual); + addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual); // Create the alternate allele format for the second BND record alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; @@ -565,7 +563,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; } - addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual); + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual); continue; } @@ -585,9 +583,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in continue; } - // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); - // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); - // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start)); std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(result) == SVType::UNKNOWN) { continue; @@ -601,84 +596,17 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (supp_type 
== SVType::NEUTRAL) { // addSVCall(sv_calls, supp_start, supp_end, "INV", // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); continue; } else if (supp_type == SVType::DUP) { // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); continue; } } } - - // GenomicRegion largest_supp_region = supp_map[qname][0]; - // uint32_t largest_supp_length = 0; - - // printMessage(region + ": Processing supplementary alignments for " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); - // const std::string& primary_chr = bamHdr->target_name[primary.tid]; - // // for (auto it = supp_map[qname].begin(); it != supp_map[qname].end(); - // // ++it) { - // for (auto& supp_region : supp_regions) { - // // GenomicRegion& supp_region = *it; - - // // Skip if not on the primary chromosome - // if (primary.tid != supp_region.tid) { - // continue; - // } - - // // Get the supplementary alignment information - // // uint32_t supp_start = (uint32_t) supp_region.start; - // // uint32_t supp_end = (uint32_t) supp_region.end; - // // uint32_t supp_length = supp_end - supp_start + 1; - // // if (supp_length > largest_supp_length) { - // // largest_supp_length = supp_length; - // // largest_supp_region = *it; - // // } - - // // Inversion detection - // bool is_opposite_strand = primary.strand != supp_region.strand; - // if (is_opposite_strand) { - // // if 
(supp_length >= min_cnv_length) { - // if (supp_region.end - supp_region.start >= min_cnv_length) { - - // // Print error if the start position is greater than the end - // // position - // // if (supp_start > supp_end) { - // if (supp_region.start > supp_region.end) { - // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_region.start) + "-" + std::to_string(supp_region.end)); - // // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); - // continue; - // } - - // // printMessage(region + ": Running copy number prediction for inversion (position: " + std::to_string(supp_start) + "-" + std::to_string(supp_end) + ")..."); - // // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_start, supp_end, mean_chr_cov, pos_depth_map, input_data); - // // printMessage("Running copy number prediction, length: " + std::to_string(supp_region.end - supp_region.start)); - // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, supp_region.start, supp_region.end, mean_chr_cov, pos_depth_map, input_data); - // if (std::get<1>(result) == SVType::UNKNOWN) { - // continue; - // } - - // double supp_lh = std::get<0>(result); - // SVType supp_type = std::get<1>(result); - // // printMessage("Test3"); - // int read_depth = this->calculateReadDepth(pos_depth_map, supp_region.start, supp_region.end); - // // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); - // if (supp_type == SVType::NEUTRAL) { - // // addSVCall(sv_calls, supp_start, supp_end, "INV", - // // "", "SPLIT", "./.", supp_lh, read_depth); - // addSVCall(sv_calls, supp_region.start, supp_region.end, "INV", "", "SPLIT", "./.", supp_lh, read_depth); - - // sv_count++; - // } else if (supp_type == SVType::DUP) { - // // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", - // // "", "SPLIT", "./.", supp_lh, read_depth); - // 
addSVCall(sv_calls, supp_region.start, supp_region.end, "INVDUP", "", "SPLIT", "./.", supp_lh, read_depth); - // } - // } - // } - // } // Analyze split-read evidence for deletions and duplications uint8_t mean_qual = (primary.qual + largest_supp.qual) / 2; @@ -689,19 +617,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in gap_left = std::min(primary.end, largest_supp.start); gap_right = std::max(primary.start, largest_supp.end); gap_exists = gap_left < gap_right; - // if (primary.start < largest_supp_region.start) { // Primary before supp - // boundary_left = primary.start; - // boundary_right = std::max(primary.end, largest_supp_region.end); - // gap_left = primary.end; - // gap_right = largest_supp_region.start; - // gap_exists = gap_left < gap_right; - // } else { - // boundary_left = largest_supp_region.start; - // boundary_right = std::max(primary.end, largest_supp_region.end); - // gap_left = largest_supp_region.end; - // gap_right = primary.start; - // gap_exists = gap_left < gap_right; - // } // Run copy number variant predictions on the boundary if large enough if (boundary_right - boundary_left >= min_cnv_length) { @@ -747,18 +662,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; - addSVCall(sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual); + addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual); } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); + addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); } } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); + addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); } } @@ -767,6 +682,13 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); } } + + // Merge the split-read SV calls + printMessage(region + ": Merging split-read SVs..."); + mergeSVs(split_sv_calls, 0.1, 2); + + // Unify the SV calls + sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 0d4cec97..bf4e44a5 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -34,25 +34,26 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVTy // Insert the SV call in sorted order SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); - - // Determine if the SV call already exists - if (it != sv_calls.end() && it->start == start && it->end == end) - { - it->support += 1; // Update the read 
support - - // Update SV type if likelihood is higher - if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood) - { - // Update the SV call - it->sv_type = sv_type; - it->data_type = data_type; - it->genotype = genotype; - it->hmm_likelihood = hmm_likelihood; - it->qual = qual; - } - } else { - sv_calls.insert(it, sv_call); // Insert the new SV call - } + sv_calls.insert(it, sv_call); + + // // Determine if the SV call already exists + // if (it != sv_calls.end() && it->start == start && it->end == end) + // { + // it->support += 1; // Update the read support + + // // Update SV type if likelihood is higher + // if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood) + // { + // // Update the SV call + // it->sv_type = sv_type; + // it->data_type = data_type; + // it->genotype = genotype; + // it->hmm_likelihood = hmm_likelihood; + // it->qual = qual; + // } + // } else { + // sv_calls.insert(it, sv_call); // Insert the new SV call + // } } uint32_t getSVCount(const std::vector& sv_calls) @@ -65,7 +66,7 @@ void concatenateSVCalls(std::vector &target, const std::vector& target.insert(target.end(), source.begin(), source.end()); } -void mergeSVs(std::vector& sv_calls) +void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) { if (sv_calls.size() < 2) { return; @@ -77,91 +78,89 @@ void mergeSVs(std::vector& sv_calls) // Create a set of size intervals and corresponding DBSCAN epsilons std::map, double> size_to_eps; - size_to_eps[{0, 1000}] = 200; - size_to_eps[{1000, 5000}] = 500; - size_to_eps[{5000, 10000}] = 3000; - size_to_eps[{10000, 50000}] = 4000; - size_to_eps[{50000, 100000}] = 5000; - size_to_eps[{100000, 500000}] = 10000; - size_to_eps[{500000, 1000000}] = 20000; - - for (auto& size_interval : size_to_eps) { - - // Calculate epsilon as 20% of the size interval - double epsilon = 0.2 * (size_interval.first.second - size_interval.first.first); - printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + 
"-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon)); - - DBSCAN dbscan(size_interval.second, 2); - - for ( const auto& sv_type : { - SVType::DEL, - SVType::DUP, - SVType::INV, - SVType::INS, - SVType::BND, - SVType::INV_DUP - }) - { - // DBSCAN dbscan(1000, 2); - - // Create a vector of SV calls for the current SV type and size interval - std::vector sv_type_calls; - - // If the final size interval, then don't set an upper bound - int lower_bound = size_interval.first.first; - int upper_bound = size_interval.first.second; - if (lower_bound == 500000) - { - std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound](const SVCall& sv_call) { - return sv_call.sv_type == sv_type && static_cast(sv_call.end - sv_call.start) >= lower_bound; - }); - // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { - // return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound; - // }); - } else { - std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, lower_bound, upper_bound](const SVCall& sv_call) { - return sv_call.sv_type == sv_type && static_cast(sv_call.end - sv_call.start) >= lower_bound && static_cast(sv_call.end - sv_call.start) <= upper_bound; - }); - // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type, &size_interval](const SVCall& sv_call) { - // return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= lower_bound && (sv_call.end - sv_call.start) <= upper_bound; - // }); - } - // std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { - // return sv_call.sv_type == sv_type && (sv_call.end - sv_call.start) >= size_interval.first && (sv_call.end - sv_call.start) <= size_interval.second; - // }); + // size_to_eps[{0, 1000}] = 200; + // 
size_to_eps[{1000, 5000}] = 500; + // size_to_eps[{5000, 10000}] = 3000; + // size_to_eps[{10000, 50000}] = 4000; + // size_to_eps[{50000, 100000}] = 5000; + // size_to_eps[{100000, 500000}] = 10000; + // size_to_eps[{500000, 1000000}] = 20000; + + // Small SVs + // size_to_eps[{50, 200}] = 50; + + // // Medium SVs + // size_to_eps[{200, 1000}] = 200; + + // // Large SVs + // size_to_eps[{1000, 10000}] = 1000; + + // // Very large SVs + // size_to_eps[{10000, 100000}] = 10000; + + // // Extreme SVs + // size_to_eps[{100000, 1000000}] = 20000; + + // std::vector epsilons = {50, 200, 1000, 10000, 100000, 500000}; + // std::vector epsilons = {0.2}; + + // double epsilon = size_interval.second; + // // Calculate epsilon as 20% of the largest size in the interval + // double epsilon = 0.1 * size_interval.first.second; + // printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon)); + // int min_pts = 2; + // int min_pts = 2; + DBSCAN dbscan(epsilon, min_pts); + // DBSCAN dbscan(size_interval.second, 2); + + for ( const auto& sv_type : { + SVType::DEL, + SVType::DUP, + SVType::INV, + SVType::INS, + SVType::BND, + SVType::INV_DUP + }) + { + // DBSCAN dbscan(1000, 2); - if (sv_type_calls.size() < 2) { - continue; - } + // Create a vector of SV calls for the current SV type and size interval + std::vector sv_type_calls; + std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { + return sv_call.sv_type == sv_type; + }); - dbscan.fit(sv_type_calls); - const std::vector& clusters = dbscan.getClusters(); - std::map> cluster_map; - for (size_t i = 0; i < clusters.size(); ++i) { - cluster_map[clusters[i]].push_back(sv_type_calls[i]); - } + if (sv_type_calls.size() < 2) { + continue; + } - // Merge SVs in each cluster - for (auto& cluster : cluster_map) { - int cluster_id = cluster.first; - // const 
std::vector& cluster_sv_calls = cluster.second; - std::vector& cluster_sv_calls = cluster.second; - if (cluster_id < 0) { - continue; // Skip noise and unclassified points - } else { - // Use the median length SV - // Sort the SVs in the cluster by their length - std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return (a.end - a.start) < (b.end - b.start); - }); - int median_index = cluster_sv_calls.size() / 2; - SVCall median_sv_call = cluster_sv_calls[median_index]; - median_sv_call.cluster_size = (int) cluster_sv_calls.size(); - merged_sv_calls.push_back(median_sv_call); - } + dbscan.fit(sv_type_calls); + const std::vector& clusters = dbscan.getClusters(); + std::map> cluster_map; + for (size_t i = 0; i < clusters.size(); ++i) { + cluster_map[clusters[i]].push_back(sv_type_calls[i]); + } + + // Merge SVs in each cluster + int cluster_count = 0; + for (auto& cluster : cluster_map) { + int cluster_id = cluster.first; + std::vector& cluster_sv_calls = cluster.second; + if (cluster_id < 0) { + continue; // Skip noise and unclassified points + } else { + // Use the median length SV + std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return (a.end - a.start) < (b.end - b.start); + }); + int median_index = cluster_sv_calls.size() / 2; + SVCall median_sv_call = cluster_sv_calls[median_index]; + median_sv_call.cluster_size = (int) cluster_sv_calls.size(); + merged_sv_calls.push_back(median_sv_call); + cluster_count++; } - printMessage("Completed clustering for " + getSVTypeString(sv_type)); } + printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type)); } sv_calls = std::move(merged_sv_calls); // Replace with filtered list From 0c329a3aa8946eeff00356f7149da169fa96c881 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 3 Feb 2025 16:51:42 -0500 Subject: [PATCH 069/134] 
add dbscan parameters --- include/input_data.h | 10 ++++++++++ src/input_data.cpp | 22 ++++++++++++++++++++++ src/main.cpp | 15 +++++++++++++++ src/sv_caller.cpp | 7 +++++-- src/sv_object.cpp | 37 +------------------------------------ 5 files changed, 53 insertions(+), 38 deletions(-) diff --git a/include/input_data.h b/include/input_data.h index 3960d362..d88426f3 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -65,6 +65,14 @@ class InputData { void setMinReadSupport(int min_reads); int getMinReadSupport() const; + // Set the epsilon parameter for DBSCAN clustering. + void setDBSCAN_Epsilon(double epsilon); + double getDBSCAN_Epsilon() const; + + // Set the minimum number of points in a cluster for DBSCAN. + void setDBSCAN_MinPts(int min_pts); + int getDBSCAN_MinPts() const; + // Set the chromosome to analyze. void setChromosome(std::string chr); std::string getChromosome() const; @@ -103,6 +111,8 @@ class InputData { int sample_size; uint32_t min_cnv_length; int min_reads; + double dbscan_epsilon; + int dbscan_min_pts; std::string chr; // Chromosome to analyze std::pair start_end; // Region to analyze bool region_set; // True if a region is set diff --git a/src/input_data.cpp b/src/input_data.cpp index 40a640a2..649e8b1c 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -27,6 +27,8 @@ InputData::InputData() this->sample_size = 100; this->min_cnv_length = 1000; this->min_reads = 5; + this->dbscan_epsilon = 0.5; + this->dbscan_min_pts = 5; this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; this->verbose = false; @@ -170,6 +172,26 @@ int InputData::getMinReadSupport() const return this->min_reads; } +void InputData::setDBSCAN_Epsilon(double epsilon) +{ + this->dbscan_epsilon = epsilon; +} + +double InputData::getDBSCAN_Epsilon() const +{ + return this->dbscan_epsilon; +} + +void InputData::setDBSCAN_MinPts(int min_pts) +{ + this->dbscan_min_pts = min_pts; +} + +int InputData::getDBSCAN_MinPts() const +{ + return 
this->dbscan_min_pts; +} + void InputData::setChromosome(std::string chr) { this->chr = chr; diff --git a/src/main.cpp b/src/main.cpp index 58d8fbdc..5275f368 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -62,6 +62,15 @@ void runContextSV(const std::unordered_map& args) input_data.setVerbose(true); } + // DBSCAN parameters + if (args.find("epsilon") != args.end()) { + input_data.setDBSCAN_Epsilon(std::stod(args.at("epsilon"))); + } + + if (args.find("min-pts") != args.end()) { + input_data.setDBSCAN_MinPts(std::stoi(args.at("min-pts"))); + } + // Run ContextSV run(input_data); } @@ -80,6 +89,8 @@ void printUsage(const std::string& programName) { << " -n, --sample-size Sample size for HMM predictions\n" << " --min-cnv Minimum CNV length\n" << " --min-reads Minimum read support\n" + << " --eps DBSCAN epsilon\n" + << " --min-pts DBSCAN minimum points\n" << " -e, --eth ETH file\n" << " -p, --pfb PFB file\n" << " --save-cnv Save CNV data\n" @@ -116,6 +127,10 @@ std::unordered_map parseArguments(int argc, char* argv args["min-cnv"] = argv[++i]; } else if (arg == "--min-reads" && i + 1 < argc) { args["min-reads"] = argv[++i]; + } else if (arg == "--eps" && i + 1 < argc) { + args["epsilon"] = argv[++i]; + } else if (arg == "--min-pts" && i + 1 < argc) { + args["min-pts"] = argv[++i]; } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) { args["eth"] = argv[++i]; } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 6c137c8c..1d8115f5 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -331,7 +331,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // int split_sv_support_threshold = 4; // Minimum number of supporting // reads for an SV call int split_sv_support_threshold = input_data.getMinReadSupport(); - // printMessage("Processing chromosome " + chr + " with filter threshold: " + std::to_string(filter_threshold)); + // printMessage("Processing chromosome " 
+ chr + " with filter threshold: " + // + std::to_string(filter_threshold)); + double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); + int dbscan_min_pts = input_data.getDBSCAN_MinPts(); // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -391,7 +394,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": Merging CIGAR..."); // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); - mergeSVs(chr_sv_calls, 0.8, 5); + mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); diff --git a/src/sv_object.cpp b/src/sv_object.cpp index bf4e44a5..0dd23e79 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -77,42 +77,9 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) std::vector merged_sv_calls; // Create a set of size intervals and corresponding DBSCAN epsilons + printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); std::map, double> size_to_eps; - // size_to_eps[{0, 1000}] = 200; - // size_to_eps[{1000, 5000}] = 500; - // size_to_eps[{5000, 10000}] = 3000; - // size_to_eps[{10000, 50000}] = 4000; - // size_to_eps[{50000, 100000}] = 5000; - // size_to_eps[{100000, 500000}] = 10000; - // size_to_eps[{500000, 1000000}] = 20000; - - // Small SVs - // size_to_eps[{50, 200}] = 50; - - // // Medium SVs - // size_to_eps[{200, 1000}] = 200; - - // // Large SVs - // size_to_eps[{1000, 10000}] = 1000; - - // // Very large SVs - // size_to_eps[{10000, 100000}] = 10000; - - // // Extreme SVs - // size_to_eps[{100000, 1000000}] = 20000; - - // std::vector epsilons = {50, 200, 1000, 10000, 100000, 500000}; - // std::vector epsilons = {0.2}; - - // double epsilon = size_interval.second; - // // Calculate 
epsilon as 20% of the largest size in the interval - // double epsilon = 0.1 * size_interval.first.second; - // printMessage("Clustering SVs with size " + std::to_string(size_interval.first.first) + "-" + std::to_string(size_interval.first.second) + " with epsilon " + std::to_string(epsilon)); - // int min_pts = 2; - // int min_pts = 2; DBSCAN dbscan(epsilon, min_pts); - // DBSCAN dbscan(size_interval.second, 2); - for ( const auto& sv_type : { SVType::DEL, SVType::DUP, @@ -122,8 +89,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) SVType::INV_DUP }) { - // DBSCAN dbscan(1000, 2); - // Create a vector of SV calls for the current SV type and size interval std::vector sv_type_calls; std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { From 686fe9965fc1fd643fea46a4a39c17567f1ba437 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 6 Feb 2025 20:13:01 -0500 Subject: [PATCH 070/134] cluster primary alignments --- .gitignore | 1 + include/dbscan.h | 4 + include/sv_caller.h | 2 - python/plot_distributions.py | 14 ++- src/sv_caller.cpp | 206 +++++++++++++++++++---------------- src/sv_object.cpp | 15 ++- 6 files changed, 139 insertions(+), 103 deletions(-) diff --git a/.gitignore b/.gitignore index 343adf0f..9f6f43d4 100644 --- a/.gitignore +++ b/.gitignore @@ -87,6 +87,7 @@ data/hg19ToHg38.over.chain.gz # Test images python/dbscan_clustering*.png python/dist_plots +upset_plot*.png # Temporary files lib/.nfs* diff --git a/include/dbscan.h b/include/dbscan.h index 923570c5..9826a144 100644 --- a/include/dbscan.h +++ b/include/dbscan.h @@ -12,6 +12,7 @@ class DBSCAN { public: DBSCAN(double epsilon, int minPts) : epsilon(epsilon), minPts(minPts) {} + // Fit the DBSCAN algorithm to SV calls void fit(const std::vector& sv_calls); const std::vector& getClusters() const; @@ -21,10 +22,13 @@ class DBSCAN { int minPts; std::vector clusters; + // Expand the cluster for a given SV call bool 
expandCluster(const std::vector& sv_calls, size_t pointIdx, int clusterId); + // Find the region query for a given SV call std::vector regionQuery(const std::vector& sv_calls, size_t pointIdx) const; + // Calculate the distance between two SV calls double distance(const SVCall& a, const SVCall& b) const; }; diff --git a/include/sv_caller.h b/include/sv_caller.h index 1824015c..5eb96e17 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -61,8 +61,6 @@ class SVCaller { void saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const; - void trimOverlappingAlignments(GenomicRegion& primary_alignment, GenomicRegion& supp_alignment, const MismatchData& primary_mismatches, const MismatchData& supp_mismatches); - // Calculate the read depth (INFO/DP) for a region int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); diff --git a/python/plot_distributions.py b/python/plot_distributions.py index 7db7cb2a..8766a157 100644 --- a/python/plot_distributions.py +++ b/python/plot_distributions.py @@ -192,13 +192,21 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Add the bin edges to the x-axis ticks as a range fig.update_xaxes(tickvals=x_values, ticktext=bin_labels) - # Move the legend to the top right inside the plot + # # Move the legend to the top right inside the plot + # fig.update_layout(legend=dict( + # orientation='v', + # yanchor='top', + # y=0.75, + # xanchor='right', + # x=0.75, + # )) + # Move the legend to the bottom right outside the plot fig.update_layout(legend=dict( orientation='v', yanchor='top', - y=0.75, + y=1.0, xanchor='right', - x=0.75, + x=1.15, )) # Set a larger font size for all text in the plot diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 1d8115f5..e6ff793a 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -27,6 +27,7 @@ #include "sv_types.h" #include "version.h" #include "fasta_query.h" +#include 
"dbscan.h" /// @endcond # define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection @@ -107,45 +108,106 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam bam_destroy1(bam1); printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); - // Filter overlapping primary alignments and keep the one with the highest mapping - // quality - // std::vector to_remove_overlapping; - std::unordered_set to_remove_overlapping; - for (const auto& entry1 : primary_map) { - const std::string& qname1 = entry1.first; - const GenomicRegion& primary1 = entry1.second; - for (const auto& entry2 : primary_map) { - const std::string& qname2 = entry2.first; - if (qname1 == qname2) { - continue; - } - const GenomicRegion& primary2 = entry2.second; - if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) { - // Overlapping primary alignments - // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2])); - // if (primary_map_qual[qname1] < primary_map_qual[qname2]) { - if (primary1.qual < primary2.qual) { - // to_remove_overlapping.push_back(qname1); - to_remove_overlapping.insert(qname1); - } else { - // If equal, remove the shorter alignment - if (primary1.end - primary1.start < primary2.end - primary2.start) { - // to_remove_overlapping.push_back(qname1); - to_remove_overlapping.insert(qname1); - } else { - // to_remove_overlapping.push_back(qname2); - to_remove_overlapping.insert(qname2); - } - } + // Create a set of dummy SVs from the primary alignments for each chromosome + // and run DBSCAN to merge them + std::unordered_map> dummy_sv_map; + std::unordered_map> dummy_sv_qnames; + for (const auto& entry : primary_map) { + const std::string& chrom = 
bamHdr->target_name[entry.second.tid]; + uint32_t start = entry.second.start; + uint32_t end = entry.second.end; + const std::string& qname = entry.first; + SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0, 0); + dummy_sv_map[chrom].emplace_back(sv_call); + dummy_sv_qnames[chrom].emplace_back(entry.first); + } + + // Run DBSCAN to merge the dummy SVs + double epsilon = 0.65; + int min_pts = 2; + std::unordered_set qnames_to_keep; + for (const auto& entry : dummy_sv_map) { + const std::string& chrom = entry.first; + const std::vector& sv_calls = entry.second; + DBSCAN dbscan(epsilon, min_pts); + dbscan.fit(sv_calls); + const std::vector& clusters = dbscan.getClusters(); + std::map> cluster_map; + for (size_t i = 0; i < clusters.size(); ++i) { + cluster_map[clusters[i]].push_back(sv_calls[i]); + } + + // Merge the SVs in each cluster, using the median of the start and end + // positions of the SVs in each cluster + for (auto& cluster : cluster_map) { + int cluster_id = cluster.first; + std::vector& cluster_sv_calls = cluster.second; + if (cluster_id < 0) { + continue; // Skip noise and unclassified points } + + // Use the median length SV as the representative SV + std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return (a.end - a.start) < (b.end - b.start); + }); + SVCall median_sv = cluster_sv_calls[cluster_sv_calls.size() / 2]; + const std::string& qname = median_sv.data_type; + qnames_to_keep.insert(qname); } } - for (const std::string& qname : to_remove_overlapping) { + // Remove the SVs that are not in the qnames_to_keep set + std::unordered_set qnames_to_remove; + for (const auto& entry : primary_map) { + const std::string& qname = entry.first; + if (qnames_to_keep.find(qname) == qnames_to_keep.end()) { + qnames_to_remove.insert(qname); + } + } + + for (const std::string& qname : qnames_to_remove) { primary_map.erase(qname); supp_map.erase(qname); } - printMessage(region + ": Removed " + 
std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments"); + + // Filter overlapping primary alignments and keep the one with the highest mapping + // quality + + // for (const auto& entry1 : primary_map) { + // const std::string& qname1 = entry1.first; + // const GenomicRegion& primary1 = entry1.second; + // for (const auto& entry2 : primary_map) { + // const std::string& qname2 = entry2.first; + // if (qname1 == qname2) { + // continue; + // } + // const GenomicRegion& primary2 = entry2.second; + // if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) { + // // Overlapping primary alignments + // // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2])); + // // if (primary_map_qual[qname1] < primary_map_qual[qname2]) { + // if (primary1.qual < primary2.qual) { + // // to_remove_overlapping.push_back(qname1); + // to_remove_overlapping.insert(qname1); + // } else { + // // If equal, remove the shorter alignment + // if (primary1.end - primary1.start < primary2.end - primary2.start) { + // // to_remove_overlapping.push_back(qname1); + // to_remove_overlapping.insert(qname1); + // } else { + // // to_remove_overlapping.push_back(qname2); + // to_remove_overlapping.insert(qname2); + // } + // } + // } + // } + // } + + // for (const std::string& qname : to_remove_overlapping) { + // primary_map.erase(qname); + // supp_map.erase(qname); + // } + // printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments"); printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supp_map.size()) + " supplementary alignments after filtering"); } @@ -230,6 +292,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec std::bitset<256> amb_bases_bitset; for (char base : amb_bases) 
{ amb_bases_bitset.set(base); + amb_bases_bitset.set(std::tolower(base)); } for (int i = 0; i < cigar_len; i++) { int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length @@ -686,9 +749,9 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } } - // Merge the split-read SV calls - printMessage(region + ": Merging split-read SVs..."); - mergeSVs(split_sv_calls, 0.1, 2); + // // Merge the split-read SV calls + // printMessage(region + ": Merging split-read SVs..."); + // mergeSVs(split_sv_calls, 0.1, 2); // Unify the SV calls sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); @@ -855,6 +918,19 @@ void SVCaller::saveToVCF(const std::unordered_map amb_bases_bitset; + for (char base : amb_bases) { + amb_bases_bitset.set(base); + amb_bases_bitset.set(std::tolower(base)); + } + for (char& base : ref_allele) { + if (amb_bases_bitset.test(base)) { + base = 'N'; + } + } + // Create the VCF parameter strings std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + \ ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + \ @@ -887,66 +963,6 @@ void SVCaller::saveToVCF(const std::unordered_map= supp_query_start) { - if (primary_mismatches.query_end >= supp_mismatches.query_start) { - // Calculate the mismatch rates at the overlapping region - double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches); - double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches); - hts_pos_t overlap_length = primary_mismatches.query_end - supp_mismatches.query_start + 1; - - // Trim the ailgnment with the higher mismatch rate - if (primary_mismatch_rate > supp_mismatch_rate) { - // Trim the end of the primary alignment, ensuring that the new - // end is not less than the start - if (primary_alignment.end > overlap_length && (primary_alignment.end - overlap_length) > primary_alignment.start) { - // Trim the end of the primary alignment 
- primary_alignment.end = primary_alignment.end - overlap_length; - } - } else { - // Trim the beginning of the supplementary alignment, ensuring - // that the new start is not greater than the end - if (supp_alignment.start + overlap_length < supp_alignment.end) { - // Trim the beginning of the supplementary alignment - supp_alignment.start = supp_alignment.start + overlap_length; - } - } - } - - } else { - // Supplementary before primary in the query - if (primary_mismatches.query_start <= supp_mismatches.query_end) { - // Calculate the mismatch rates at the overlapping region - double primary_mismatch_rate = this->calculateMismatchRate(primary_mismatches); - double supp_mismatch_rate = this->calculateMismatchRate(supp_mismatches); - hts_pos_t overlap_length = supp_mismatches.query_end - primary_mismatches.query_start + 1; - - // Trim the ailgnment with the higher mismatch rate - if (supp_mismatch_rate > primary_mismatch_rate) { - // Trim the end of the supplementary alignment, ensuring that - // the new end is not less than the start - if (supp_alignment.end > overlap_length && (supp_alignment.end - overlap_length) > supp_alignment.start) { - // Trim the end of the supplementary alignment - supp_alignment.end = supp_alignment.end - overlap_length; - } - } else { - // Trim the beginning of the primary alignment, ensuring that - // the new start is not greater than the end - if (primary_alignment.start + overlap_length < primary_alignment.end) { - // Trim the beginning of the primary alignment - primary_alignment.start = primary_alignment.start + overlap_length; - } - } - } - } -} - int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) { int read_depth = 0; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 0dd23e79..fb3a17d9 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -77,9 +77,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) std::vector merged_sv_calls; // Create a set of size 
intervals and corresponding DBSCAN epsilons - printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); - std::map, double> size_to_eps; - DBSCAN dbscan(epsilon, min_pts); + // printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); + // DBSCAN dbscan(epsilon, min_pts); for ( const auto& sv_type : { SVType::DEL, SVType::DUP, @@ -89,6 +88,16 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) SVType::INV_DUP }) { + // Create a DBSCAN object for the current SV type + if (sv_type == SVType::DEL) { + epsilon = 0.45; + min_pts = 16; + } else { + epsilon = 0.65; + min_pts = 15; + } + DBSCAN dbscan(epsilon, min_pts); + // Create a vector of SV calls for the current SV type and size interval std::vector sv_type_calls; std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { From d6f1b6f4635ec24b9ad868d0b2c649c4410010d8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 8 Feb 2025 18:10:39 -0500 Subject: [PATCH 071/134] add invdel --- include/sv_caller.h | 2 +- include/sv_object.h | 7 +- include/sv_types.h | 6 +- src/sv_caller.cpp | 80 +++++++++++------------ src/sv_object.cpp | 153 ++++++++++++++++++++++++++++++-------------- 5 files changed, 154 insertions(+), 94 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 5eb96e17..fedd172f 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -52,7 +52,7 @@ class SVCaller { int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data); + void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, 
hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query diff --git a/include/sv_object.h b/include/sv_object.h index da544571..58ccd5dc 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -25,17 +25,16 @@ struct SVCall { int read_depth = 0; // Breakpoint depth int support = 0; // Number of supporting reads int cluster_size = 0; // Number of SV calls in the cluster - uint8_t qual = 0; // Alignment quality score // Comparison operator for std::set bool operator<(const SVCall& other) const; // Constructor with parameters for all fields - SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size, uint8_t qual) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size), qual(qual) {} + SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} }; -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual); +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const 
std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); void mergeSVs(std::vector& sv_calls); diff --git a/include/sv_types.h b/include/sv_types.h index c0e1fabf..26415935 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -20,8 +20,9 @@ namespace sv_types { INS = 3, BND = 4, NEUTRAL = 5, // Neutral copy number with unknown type - INV_DUP = 6, // Inversion duplication - COMPLEX = 7 // Complex SV + INV_DUP = 6, // Inverted duplication + INV_DEL = 7, // Inverted deletion + COMPLEX = 8 // Complex SV }; // Mapping of SV types to strings @@ -34,6 +35,7 @@ namespace sv_types { {SVType::BND, "BND"}, {SVType::NEUTRAL, "NEUTRAL"}, {SVType::INV_DUP, "INVDUP"}, + {SVType::INV_DEL, "INVDEL"}, {SVType::COMPLEX, "COMPLEX"} }; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index e6ff793a..a8e25c5f 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -117,7 +117,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam uint32_t start = entry.second.start; uint32_t end = entry.second.end; const std::string& qname = entry.first; - SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0, 0); + SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0); dummy_sv_map[chrom].emplace_back(sv_call); dummy_sv_qnames[chrom].emplace_back(entry.first); } @@ -323,7 +323,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, qual); + addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth); continue; } } @@ -337,7 +337,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { 
int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, qual); + addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth); continue; } } @@ -355,7 +355,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec alt_allele = ins_seq_str; } - addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, qual); + addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { @@ -363,7 +363,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth, qual); + addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth); // Print if the ref pos is within the range 44007800-44007930 if (ref_pos >= 44007800 && ref_pos <= 44007930) { @@ -394,8 +394,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // int split_sv_support_threshold = 4; // Minimum number of supporting // reads for an SV call int split_sv_support_threshold = input_data.getMinReadSupport(); - // printMessage("Processing chromosome " + chr + " with filter threshold: " - // + std::to_string(filter_threshold)); double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); int dbscan_min_pts = input_data.getDBSCAN_MinPts(); @@ -456,13 +454,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); printMessage(chr + ": Merging CIGAR..."); - // 
filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); - mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); - // filterSVsWithLowSupport(chr_sv_calls, cigar_sv_support_threshold); + double cigar_epsilon = 0.45; + int cigar_min_pts = 15; + mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts); + int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); - // Testing on HG002 whole genome // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold if (region_sv_count > 0) { @@ -472,7 +470,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); - this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, chr_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + std::vector split_sv_calls; + this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + + // Merge the split-read SVs separately + printMessage(chr + ": Merging split reads..."); + double split_epsilon = 0.45; + int split_min_pts = 2; + mergeSVs(split_sv_calls, split_epsilon, split_min_pts); + + // Unify the SV calls + printMessage(chr + ": Unifying SVs..."); + chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); + + // printMessage(chr + ": Final merge..."); + // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); + + // TODO: Merge subsets based on highest HMM likelihood // Sort the SV calls by start position std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { @@ -583,7 +597,7 @@ void SVCaller::run(const InputData& input_data) // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, 
hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) +void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) { // printMessage(region + ": Getting split alignments..."); std::unordered_map primary_map; @@ -592,7 +606,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Find split-read SV evidence // printMessage(region + ": Finding split-read SVs..."); - std::vector split_sv_calls; + // std::vector split_sv_calls; int current_primary = 0; int primary_count = primary_map.size(); uint32_t min_cnv_length = input_data.getMinCNVLength(); @@ -621,7 +635,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; } - addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, primary.qual); + addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); // Create the alternate allele format for the second BND record alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; @@ -629,7 +643,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Reverse-oriented relative to the reference alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; } - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, largest_supp.qual); + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 
0); continue; } @@ -637,15 +651,12 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // Inversion detection bool is_opposite_strand = primary.strand != largest_supp.strand; if (is_opposite_strand) { - // if (supp_length >= min_cnv_length) { if (largest_supp.end - largest_supp.start >= min_cnv_length) { // Print error if the start position is greater than the end // position - // if (supp_start > supp_end) { if (largest_supp.start > largest_supp.end) { printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end)); - // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(supp_start) + "-" + std::to_string(supp_end)); continue; } @@ -656,26 +667,22 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); - // printMessage("Test3"); int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end); - // int read_depth = this->calculateReadDepth(pos_depth_map, supp_start, supp_end); if (supp_type == SVType::NEUTRAL) { - // addSVCall(sv_calls, supp_start, supp_end, "INV", - // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth); continue; } else if (supp_type == SVType::DUP) { - // addSVCall(sv_calls, supp_start, supp_end, "INVDUP", - // "", "SPLIT", "./.", supp_lh, read_depth); - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth, largest_supp.qual); + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, 
read_depth); + continue; + } else if (supp_type == SVType::DEL) { + addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "", "SPLIT", "./.", supp_lh, read_depth); continue; } } } // Analyze split-read evidence for deletions and duplications - uint8_t mean_qual = (primary.qual + largest_supp.qual) / 2; bool gap_exists = false; uint32_t boundary_left, boundary_right, gap_left, gap_right; boundary_left = std::min(primary.start, largest_supp.start); @@ -693,10 +700,7 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); continue; } - - // printMessage(region + ": Running copy number prediction for - // boundary..."); - // printMessage("Running copy number prediction, length: " + std::to_string(boundary_right - boundary_left)); + std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(bd_result) == SVType::UNKNOWN) { continue; @@ -728,18 +732,18 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in if (gap_lh > bd_lh) { int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; - addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, mean_qual); + addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth); } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; - addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); + addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); } } else { // Add the boundary as the SV call int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, mean_qual); + addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); } } @@ -749,12 +753,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } } - // // Merge the split-read SV calls - // printMessage(region + ": Merging split-read SVs..."); - // mergeSVs(split_sv_calls, 0.1, 2); - // Unify the SV calls - sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); + // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const diff --git a/src/sv_object.cpp b/src/sv_object.cpp index fb3a17d9..e46afb77 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -16,12 +16,8 @@ bool SVCall::operator<(const SVCall & other) const return start < other.start || (start == other.start && end < other.end); } -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, uint8_t qual) +void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double 
hmm_likelihood, int read_depth) { - // Ignore unknown SV types - // if (sv_type == "UNKNOWN" || sv_type == "NEUTRAL") { - // return; - // } if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { return; } @@ -32,28 +28,9 @@ void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVTy } // Insert the SV call in sorted order - SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1, qual}; + SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); sv_calls.insert(it, sv_call); - - // // Determine if the SV call already exists - // if (it != sv_calls.end() && it->start == start && it->end == end) - // { - // it->support += 1; // Update the read support - - // // Update SV type if likelihood is higher - // if (hmm_likelihood != 0.0 && hmm_likelihood > it->hmm_likelihood) - // { - // // Update the SV call - // it->sv_type = sv_type; - // it->data_type = data_type; - // it->genotype = genotype; - // it->hmm_likelihood = hmm_likelihood; - // it->qual = qual; - // } - // } else { - // sv_calls.insert(it, sv_call); // Insert the new SV call - // } } uint32_t getSVCount(const std::vector& sv_calls) @@ -68,6 +45,8 @@ void concatenateSVCalls(std::vector &target, const std::vector& void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) { + printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); + if (sv_calls.size() < 2) { return; } @@ -76,9 +55,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) // Cluster SVs using DBSCAN for each SV type std::vector merged_sv_calls; - // Create a set of size intervals and corresponding DBSCAN epsilons - // printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); - // DBSCAN dbscan(epsilon, min_pts); + // Cluster SVs 
using DBSCAN for each SV type + DBSCAN dbscan(epsilon, min_pts); for ( const auto& sv_type : { SVType::DEL, SVType::DUP, @@ -89,14 +67,22 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) }) { // Create a DBSCAN object for the current SV type - if (sv_type == SVType::DEL) { - epsilon = 0.45; - min_pts = 16; - } else { - epsilon = 0.65; - min_pts = 15; - } - DBSCAN dbscan(epsilon, min_pts); + // epsilon = 0.45; + // min_pts = 15; + // if (sv_type == SVType::DEL) { + // epsilon = 0.45; + // min_pts = 16; + // } else { + // // epsilon = 0.65; + // // min_pts = 15; + // // epsilon = 0.45; + // // min_pts = 16; + // // epsilon = 0.45; + // // min_pts = 2; + // // epsilon = 0.45; + // // min_pts = 15; + // } + // DBSCAN dbscan(epsilon, min_pts); // Create a vector of SV calls for the current SV type and size interval std::vector sv_type_calls; @@ -120,22 +106,95 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) for (auto& cluster : cluster_map) { int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; - if (cluster_id < 0) { - continue; // Skip noise and unclassified points - } else { - // Use the median length SV - std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return (a.end - a.start) < (b.end - b.start); - }); - int median_index = cluster_sv_calls.size() / 2; - SVCall median_sv_call = cluster_sv_calls[median_index]; - median_sv_call.cluster_size = (int) cluster_sv_calls.size(); - merged_sv_calls.push_back(median_sv_call); + // if (cluster_id < 0) { + // continue; // Skip noise and unclassified points + // } else { + if (true) { + // Use the highest HMM likelihood normalized by SV size as the + // representative SV (if any non-zero likelihoods exist) + bool has_nonzero_likelihood = false; + if (cluster_sv_calls.size() > 0) { + for (const auto& sv_call : cluster_sv_calls) { + + // Check if any SV has a non-zero likelihood + if (sv_call.hmm_likelihood != 
0.0) { + has_nonzero_likelihood = true; + break; + } + } + } + + // [TEST] Check if any SV has a length greater than 600kb + bool found_large_sv = false; + for (const auto& sv_call : cluster_sv_calls) { + if (sv_call.end - sv_call.start > 600000) { + found_large_sv = true; + break; + } + } + if (found_large_sv) { + printMessage("Found large SV with length greater than 600kb"); + printMessage("Found " + std::to_string(cluster_sv_calls.size()) + " SVs in cluster " + std::to_string(cluster_id) + " of type " + getSVTypeString(sv_type) + ", with epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); + } + + SVCall merged_sv_call = cluster_sv_calls[0]; + if (has_nonzero_likelihood) { + // Use the highest HMM likelihood normalized by SV size as the + // representative SV + // std::vector likelihoods; + // Default very low log-likelihood for zero likelihoods + std::vector likelihoods(cluster_sv_calls.size(), -std::numeric_limits::infinity()); + // for (const auto& sv_call : cluster_sv_calls) { + int i = 0; + for (const auto& sv_call : cluster_sv_calls) { + if (sv_call.hmm_likelihood != 0.0) { + uint32_t sv_size = (uint32_t) (sv_call.end - sv_call.start); + if (sv_size > 0) { + likelihoods[i] = sv_call.hmm_likelihood / sv_size; + // likelihoods.push_back(sv_call.hmm_likelihood / sv_size); + } + } + + // Print the SV length, likelihood, and normalized + // likelihood + if (found_large_sv) { + printMessage("Start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", length: " + std::to_string(sv_call.end - sv_call.start)); + // printMessage("SV length: " + std::to_string(sv_call.end - sv_call.start) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", start: " + std::to_string(sv_call.start) + ", end: " + 
std::to_string(sv_call.end)); + } + i++; + } + + // Find the index of the maximum element in the likelihoods + // vector + auto max_likelihood_it = std::max_element(likelihoods.begin(), likelihoods.end()); + int max_likelihood_index = std::distance(likelihoods.begin(), max_likelihood_it); + merged_sv_call = cluster_sv_calls[max_likelihood_index]; + printMessage("Merged SV with highest normalized likelihood: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(merged_sv_call.hmm_likelihood / (merged_sv_call.end - merged_sv_call.start)) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start)); + + } else { + // Use the median length SV + std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return (a.end - a.start) < (b.end - b.start); + }); + int median_index = cluster_sv_calls.size() / 2; + merged_sv_call = cluster_sv_calls[median_index]; + printMessage("Merged SV with median length: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start)); + + if (cluster_id < 0) { + merged_sv_call.cluster_size = cluster_id; + } else { + merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); + } + // merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); + merged_sv_calls.push_back(merged_sv_call); cluster_count++; + } } } printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type)); } + + printMessage("[TEST] Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(merged_sv_calls.size()) + " SV calls"); sv_calls = std::move(merged_sv_calls); // Replace with filtered list int 
updated_size = sv_calls.size(); From ddf6d50f9b5bdee21879916f5d75d31a9d21bb37 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 10 Feb 2025 20:24:20 -0500 Subject: [PATCH 072/134] improve split read breakpoints --- include/dbscan1d.h | 34 +++ include/sv_caller.h | 46 +++- include/sv_object.h | 5 +- src/dbscan1d.cpp | 90 +++++++ src/sv_caller.cpp | 618 ++++++++++++++++++++++++++------------------ src/sv_object.cpp | 134 +++++----- 6 files changed, 591 insertions(+), 336 deletions(-) create mode 100644 include/dbscan1d.h create mode 100644 src/dbscan1d.cpp diff --git a/include/dbscan1d.h b/include/dbscan1d.h new file mode 100644 index 00000000..07692e65 --- /dev/null +++ b/include/dbscan1d.h @@ -0,0 +1,34 @@ +#ifndef DBSCAN1D_H +#define DBSCAN1D_H + + +#include +#include +#include +#include + + +class DBSCAN1D { + public: + DBSCAN1D(double epsilon, int minPts) : epsilon(epsilon), minPts(minPts) {} + + void fit(const std::vector& points); + + const std::vector& getClusters() const; + + std::vector getLargestCluster(const std::vector &points); + + private: + double epsilon; + int minPts; + std::vector clusters; + + bool expandCluster(const std::vector& points, size_t pointIdx, int clusterId); + + std::vector regionQuery(const std::vector& points, size_t pointIdx) const; + + double distance(int a, int b) const; + +}; + +#endif // DBSCAN1D_H diff --git a/include/sv_caller.h b/include/sv_caller.h index fedd172f..1998547f 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -22,8 +22,50 @@ struct GenomicRegion { hts_pos_t end; bool strand; uint8_t qual; + int cluster_size; // Number of alignments used for this region }; +// Interval Tree Node +struct IntervalNode { + GenomicRegion region; + std::string qname; + hts_pos_t max_end; // To optimize queries + IntervalNode* left; + IntervalNode* right; + + IntervalNode(GenomicRegion r, std::string name) + : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} +}; + +IntervalNode* 
insert(IntervalNode* root, GenomicRegion region, std::string qname) { + if (!root) + return new IntervalNode(region, qname); + + if (region.start < root->region.start) + root->left = insert(root->left, region, qname); + else + root->right = insert(root->right, region, qname); + + // Update max_end + root->max_end = std::max(root->max_end, region.end); + return root; +} + +void findOverlaps(IntervalNode* root, GenomicRegion query, std::vector& result) { + if (!root) return; + + // If overlapping, add to result + if (query.start <= root->region.end && query.end >= root->region.start) + result.push_back(root->qname); + + // If left subtree may have overlaps, search left + if (root->left && root->left->max_end >= query.start) + findOverlaps(root->left, query, result); + + // Always check the right subtree + findOverlaps(root->right, query, result); +} + struct MismatchData { uint32_t query_start; uint32_t query_end; @@ -35,7 +77,7 @@ class SVCaller { int min_mapq = 20; // Minimum mapping quality to be considered std::mutex shared_mutex; - void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map); + std::vector getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map); // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence @@ -64,6 +106,8 @@ class SVCaller { // Calculate the read depth (INFO/DP) for a region int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); + bool regionOverlaps(const GenomicRegion& a, const GenomicRegion& b); + public: // Constructor with no arguments SVCaller() = default; diff --git a/include/sv_object.h b/include/sv_object.h index 58ccd5dc..fc090166 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -34,10 
+34,13 @@ struct SVCall { start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} }; -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); +// void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); +void addSVCall(std::vector& sv_calls, SVCall& sv_call); void mergeSVs(std::vector& sv_calls); +void mergeSVSubsets(std::vector& sv_calls); + void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth); void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth, const std::string& data_type); diff --git a/src/dbscan1d.cpp b/src/dbscan1d.cpp new file mode 100644 index 00000000..90fc9458 --- /dev/null +++ b/src/dbscan1d.cpp @@ -0,0 +1,90 @@ +#include "dbscan1d.h" + +#include +#include +#include +#include + +void DBSCAN1D::fit(const std::vector& points) { + int clusterId = 0; + clusters.assign(points.size(), -1); // -1 means unclassified + + for (size_t i = 0; i < points.size(); ++i) { + if (clusters[i] == -1) { // if point is not yet classified + if (expandCluster(points, i, clusterId)) { + ++clusterId; + } + } + } +} + +const std::vector& DBSCAN1D::getClusters() const { + return clusters; +} + +bool DBSCAN1D::expandCluster(const std::vector& points, size_t pointIdx, int clusterId) { + std::vector seeds = regionQuery(points, pointIdx); + if (static_cast(seeds.size()) < minPts) { + clusters[pointIdx] = -2; // mark as noise + return false; + } + + for (size_t seedIdx : seeds) { + clusters[seedIdx] = clusterId; + } + + seeds.erase(std::remove(seeds.begin(), seeds.end(), pointIdx), seeds.end()); + + while (!seeds.empty()) { 
+ size_t currentPoint = seeds.back(); + seeds.pop_back(); + + std::vector result = regionQuery(points, currentPoint); + if (static_cast(result.size()) >= minPts) { + for (size_t resultPoint : result) { + if (clusters[resultPoint] == -1 || clusters[resultPoint] == -2) { + if (clusters[resultPoint] == -1) { + seeds.push_back(resultPoint); + } + clusters[resultPoint] = clusterId; + } + } + } + } + + return true; +} + +std::vector DBSCAN1D::regionQuery(const std::vector& points, size_t pointIdx) const { + std::vector neighbors; + for (size_t i = 0; i < points.size(); ++i) { + if (distance(points[pointIdx], points[i]) <= epsilon) { + neighbors.push_back(i); + } + } + return neighbors; +} + +double DBSCAN1D::distance(int point1, int point2) const { + return std::abs(point1 - point2); +} + +std::vector DBSCAN1D::getLargestCluster(const std::vector &points) +{ + std::vector clusters = getClusters(); + std::map> cluster_map; + for (size_t i = 0; i < clusters.size(); ++i) { + cluster_map[clusters[i]].push_back(points[i]); + } + + int largest_cluster_id = -1; + size_t largest_size = 0; + for (const auto &entry : cluster_map) { + if (entry.first >= 0 && entry.second.size() > largest_size) { + largest_size = entry.second.size(); + largest_cluster_id = entry.first; + } + } + + return cluster_map[largest_cluster_id]; +} diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index a8e25c5f..98877f3f 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -28,6 +28,7 @@ #include "version.h" #include "fasta_query.h" #include "dbscan.h" +#include "dbscan1d.h" /// @endcond # define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection @@ -41,19 +42,19 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) return ret; } -void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) +std::vector 
SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { printError("ERROR: failed to initialize BAM record"); - return; + return {}; } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); if (!itr) { bam_destroy1(bam1); printError("ERROR: failed to query region " + region); - return; + return {}; } uint32_t primary_count = 0; @@ -73,11 +74,9 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // Process primary alignments if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { - // primary_map[qname] = itr; // Store chromosome (TID), start, and end positions (1-based) of the // primary alignment, and the strand (true for forward, false for reverse) - primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq}; - // primary_map_qual[qname] = bam1->core.qual; + primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0}; primary_count++; // Process supplementary alignments @@ -85,7 +84,7 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam // supp_map[qname].push_back(itr); // Store chromosome (TID), start, and end positions (1-based) of the // supplementary alignment, and the strand (true for forward, false for reverse) - supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq}); + supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0}); supplementary_count++; } num_alignments++; @@ -109,106 +108,157 @@ void SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bam printMessage(region + ": 
Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); // Create a set of dummy SVs from the primary alignments for each chromosome - // and run DBSCAN to merge them - std::unordered_map> dummy_sv_map; - std::unordered_map> dummy_sv_qnames; + // and run DBSCAN to cluster them + // std::vector dummy_sv_map; + // std::vector dummy_sv_qnames; + // for (const auto& entry : primary_map) { + // const std::string& chrom = bamHdr->target_name[entry.second.tid]; + // if (chrom != region) { + // continue; // Skip alignments not in the same chromosome + // } + // uint32_t start = entry.second.start; + // uint32_t end = entry.second.end; + // const std::string& qname = entry.first; + // SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0); + // dummy_sv_map.emplace_back(sv_call); + // dummy_sv_qnames.emplace_back(entry.first); + // } + + // // Run DBSCAN to merge the dummy SVs + // // double epsilon = 0.65; + // double epsilon = 0.45; + // int min_pts = 2; + // std::vector> primary_clusters; + // DBSCAN dbscan(epsilon, min_pts); + // dbscan.fit(dummy_sv_map); + // const std::vector& cluster_ids = dbscan.getClusters(); + + // // Create the 2D vector of clusters + // for (int cluster_id : cluster_ids) { + // if (cluster_id < 0) { + // continue; // Skip noise and unclassified points + // } + // std::vector cluster; + // for (size_t i = 0; i < cluster_ids.size(); ++i) { + // if (cluster_ids[i] == cluster_id) { + // cluster.push_back(dummy_sv_qnames[i]); + // } + // } + // primary_clusters.push_back(cluster); + // } + + // Identify overlapping primary alignments and then cluster their primary + // start, end vs. 
supplementary alignment start, end positions, keeping the + // median of the largest cluster for the primary and supplementary positions + // as the final genome coordinates of the SV + IntervalNode* root = nullptr; for (const auto& entry : primary_map) { - const std::string& chrom = bamHdr->target_name[entry.second.tid]; - uint32_t start = entry.second.start; - uint32_t end = entry.second.end; const std::string& qname = entry.first; - SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0); - dummy_sv_map[chrom].emplace_back(sv_call); - dummy_sv_qnames[chrom].emplace_back(entry.first); - } - - // Run DBSCAN to merge the dummy SVs - double epsilon = 0.65; - int min_pts = 2; - std::unordered_set qnames_to_keep; - for (const auto& entry : dummy_sv_map) { - const std::string& chrom = entry.first; - const std::vector& sv_calls = entry.second; - DBSCAN dbscan(epsilon, min_pts); - dbscan.fit(sv_calls); - const std::vector& clusters = dbscan.getClusters(); - std::map> cluster_map; - for (size_t i = 0; i < clusters.size(); ++i) { - cluster_map[clusters[i]].push_back(sv_calls[i]); - } - - // Merge the SVs in each cluster, using the median of the start and end - // positions of the SVs in each cluster - for (auto& cluster : cluster_map) { - int cluster_id = cluster.first; - std::vector& cluster_sv_calls = cluster.second; - if (cluster_id < 0) { - continue; // Skip noise and unclassified points - } - - // Use the median length SV as the representative SV - std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return (a.end - a.start) < (b.end - b.start); - }); - SVCall median_sv = cluster_sv_calls[cluster_sv_calls.size() / 2]; - const std::string& qname = median_sv.data_type; - qnames_to_keep.insert(qname); - } + const GenomicRegion& region = entry.second; + root = insert(root, region, qname); } + std::vector> primary_clusters; + std::set processed; - // Remove the SVs that are not in the qnames_to_keep set - 
std::unordered_set qnames_to_remove; for (const auto& entry : primary_map) { const std::string& qname = entry.first; - if (qnames_to_keep.find(qname) == qnames_to_keep.end()) { - qnames_to_remove.insert(qname); + if (processed.find(qname) != processed.end()) { + continue; // Skip already processed primary alignments + } + const GenomicRegion& region = entry.second; + std::vector overlap_group; + findOverlaps(root, region, overlap_group); + for (const std::string& qname : overlap_group) { + processed.insert(qname); + } + if (overlap_group.size() > 1) { + primary_clusters.push_back(overlap_group); } } + printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments"); + + // For each primary alignment cluster the supplementary alignment start and + // end positions, keeping the median of the largest cluster + std::vector sv_candidates; + int current_group = 0; + int min_length = 2000; + int max_length = 1000000; + for (const auto& primary_group : primary_clusters) { + // Use DBSCAN to cluster primary alignment start, end positions + DBSCAN1D dbscan(100, 5); + current_group++; + std::vector starts; + std::vector ends; + for (const std::string& qname : primary_group) { + const GenomicRegion& region = primary_map[qname]; + starts.push_back(region.start); + ends.push_back(region.end); + } - for (const std::string& qname : qnames_to_remove) { - primary_map.erase(qname); - supp_map.erase(qname); - } + // Get the largest cluster of primary alignment start positions + dbscan.fit(starts); + std::vector primary_start_cluster = dbscan.getLargestCluster(starts); + + // Get the largest cluster of primary alignment end positions + dbscan.fit(ends); + std::vector primary_end_cluster = dbscan.getLargestCluster(ends); + + // Get the supplementary alignment positions + std::vector supp_starts; + std::vector supp_ends; + for (const std::string& qname : primary_group) { + const std::vector& regions = supp_map[qname]; + for (const 
GenomicRegion& region : regions) { + supp_starts.push_back(region.start); + supp_ends.push_back(region.end); + } + } - // Filter overlapping primary alignments and keep the one with the highest mapping - // quality + // Get the largest cluster of supplementary alignment start positions + dbscan.fit(supp_starts); + std::vector supp_start_cluster = dbscan.getLargestCluster(supp_starts); + + // Get the largest cluster of supplementary alignment end positions + dbscan.fit(supp_ends); + std::vector supp_end_cluster = dbscan.getLargestCluster(supp_ends); + + // Use the median of the largest cluster of primary and supplementary + // alignment start, end positions as the final genome coordinates of the + // SV + int primary_pos = -1; + if (primary_start_cluster.size() > primary_end_cluster.size()) { + std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + } else if (primary_end_cluster.size() > primary_start_cluster.size()) { + std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); + primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; + } - // for (const auto& entry1 : primary_map) { - // const std::string& qname1 = entry1.first; - // const GenomicRegion& primary1 = entry1.second; - // for (const auto& entry2 : primary_map) { - // const std::string& qname2 = entry2.first; - // if (qname1 == qname2) { - // continue; - // } - // const GenomicRegion& primary2 = entry2.second; - // if (primary1.tid == primary2.tid && primary1.start <= primary2.end && primary1.end >= primary2.start) { - // // Overlapping primary alignments - // // printMessage("Overlapping primary alignments with quality " + std::to_string(primary_map_qual[qname1]) + " and " + std::to_string(primary_map_qual[qname2])); - // // if (primary_map_qual[qname1] < primary_map_qual[qname2]) { - // if (primary1.qual < primary2.qual) { - // // to_remove_overlapping.push_back(qname1); - // 
to_remove_overlapping.insert(qname1); - // } else { - // // If equal, remove the shorter alignment - // if (primary1.end - primary1.start < primary2.end - primary2.start) { - // // to_remove_overlapping.push_back(qname1); - // to_remove_overlapping.insert(qname1); - // } else { - // // to_remove_overlapping.push_back(qname2); - // to_remove_overlapping.insert(qname2); - // } - // } - // } - // } - // } + // Get the supplementary alignment positions + int supp_pos = -1; + if (supp_start_cluster.size() > supp_end_cluster.size()) { + std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; + } else if (supp_end_cluster.size() > supp_start_cluster.size()) { + std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; + } - // for (const std::string& qname : to_remove_overlapping) { - // primary_map.erase(qname); - // supp_map.erase(qname); - // } - // printMessage(region + ": Removed " + std::to_string(to_remove_overlapping.size()) + " overlapping primary alignments"); - printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supp_map.size()) + " supplementary alignments after filtering"); + if (primary_pos == -1 || supp_pos == -1) { + continue; + } + + // Store the SV candidate if the length is between 2kb and 1Mb + int sv_start = std::min(primary_pos, supp_pos); + int sv_end = std::max(primary_pos, supp_pos); + int sv_length = sv_end - sv_start + 1; + if (sv_length >= min_length && sv_length <= max_length) { + SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "NA", "./.", 0.0, 0, 0, 0); + sv_candidates.push_back(sv_candidate); + } + } + + return sv_candidates; } @@ -323,7 +373,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = 
this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth); + // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", + // "LSEQSIM", "./.", default_lh, read_depth); + SVCall sv_call(bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, 1, 0); + addSVCall(sv_calls, sv_call); continue; } } @@ -337,7 +390,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth); + SVCall sv_call(bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, 1, 0); + addSVCall(sv_calls, sv_call); + // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth); continue; } } @@ -354,8 +409,9 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (op_len <= 50) { alt_allele = ins_seq_str; } - - addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0); + addSVCall(sv_calls, sv_call); + // addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { @@ -363,7 +419,10 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth); + // addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", + // "CIGARDEL", "./.", default_lh, read_depth); + 
SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth, 1, 0); + addSVCall(sv_calls, sv_call); // Print if the ref pos is within the range 44007800-44007930 if (ref_pos >= 44007800 && ref_pos <= 44007930) { @@ -450,23 +509,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } // Detect SVs from the CIGAR strings - printMessage(chr + ": CIGAR SVs..."); - this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); + // printMessage(chr + ": CIGAR SVs..."); + // this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); - printMessage(chr + ": Merging CIGAR..."); - double cigar_epsilon = 0.45; - int cigar_min_pts = 15; - mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts); + // printMessage(chr + ": Merging CIGAR..."); + // double cigar_epsilon = 0.45; + // int cigar_min_pts = 15; + // mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts); - int region_sv_count = getSVCount(chr_sv_calls); - printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + // int region_sv_count = getSVCount(chr_sv_calls); + // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - if (region_sv_count > 0) { - printMessage(chr + ": CIGAR predictions..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - } + // if (region_sv_count > 0) { + // printMessage(chr + ": CIGAR predictions..."); + // cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + // } // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); @@ -479,27 +538,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, 
std::v int split_min_pts = 2; mergeSVs(split_sv_calls, split_epsilon, split_min_pts); - // Unify the SV calls printMessage(chr + ": Unifying SVs..."); chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); - // printMessage(chr + ": Final merge..."); - // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); - - // TODO: Merge subsets based on highest HMM likelihood + mergeSVSubsets(chr_sv_calls); // Sort the SV calls by start position std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { return a.start < b.start; }); - // Merge the SV calls from the current region - // printMessage(chr + ": Merging split reads..."); - // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold); - // filterSVsWithLowSupport(chr_sv_calls, split_sv_support_threshold, "SPLIT"); - - // Run a final merge on the combined SV calls - // printMessage(chr + ": Merging final calls..."); printMessage("Completed chromosome " + chr); } @@ -602,156 +650,205 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in // printMessage(region + ": Getting split alignments..."); std::unordered_map primary_map; std::unordered_map> supp_map; - this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); - - // Find split-read SV evidence - // printMessage(region + ": Finding split-read SVs..."); - // std::vector split_sv_calls; - int current_primary = 0; - int primary_count = primary_map.size(); - uint32_t min_cnv_length = input_data.getMinCNVLength(); - for (auto& entry : primary_map) { - current_primary++; - const std::string& qname = entry.first; - GenomicRegion& primary = entry.second; - const std::string& primary_chr = bamHdr->target_name[primary.tid]; - - // Find the largest supplementary alignment - auto& supp_regions = supp_map[qname]; - // GenomicRegion largest_supp = supp_regions[0]; - auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const 
GenomicRegion& a, const GenomicRegion& b) { - return a.end - a.start < b.end - b.start; - }); - GenomicRegion largest_supp = *it; - - // If on a different chromosome, label as a translocation - if (primary.tid != largest_supp.tid) { - // Note that these do not currently have a likelihood score or read depth - // Create two BND records for the translocation - // Create the alternate allele format for the first BND record - const std::string& supp_chr = bamHdr->target_name[largest_supp.tid]; - std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "["; - if (largest_supp.strand == false) { - // Reverse-oriented relative to the reference - alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; - } - addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); - - // Create the alternate allele format for the second BND record - alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; - if (primary.strand == false) { - // Reverse-oriented relative to the reference - alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; - } - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); + std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); + + // Run copy number predictions on the SVs detected from the split reads + printMessage(region + ": Split read predictions..."); + int current_sv = 0; + int total_svs = sv_candidates.size(); + int min_cnv_length = input_data.getMinCNVLength(); + for (auto& sv_candidate : sv_candidates) { + // Skip if the SV is too small + if ((int)sv_candidate.end - (int)sv_candidate.start <= min_cnv_length) { + continue; + } + std::tuple result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); + if (std::get<1>(result) == 
SVType::UNKNOWN) { continue; } - // Inversion detection - bool is_opposite_strand = primary.strand != largest_supp.strand; - if (is_opposite_strand) { - if (largest_supp.end - largest_supp.start >= min_cnv_length) { + double supp_lh = std::get<0>(result); + SVType supp_type = std::get<1>(result); + // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + if (supp_type != SVType::NEUTRAL) { + int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + std::string alt_allele = supp_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(supp_type) + ">"; + SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", "./.", supp_lh, read_depth, 1, sv_candidate.cluster_size); + addSVCall(split_sv_calls, sv_call); + } + current_sv++; + if (current_sv % 1000 == 0) { + printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); + } + } - // Print error if the start position is greater than the end - // position - if (largest_supp.start > largest_supp.end) { - printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end)); - continue; - } + // // Find split-read SV evidence + // // printMessage(region + ": Finding split-read SVs..."); + // // std::vector split_sv_calls; + // int current_primary = 0; + // int primary_count = primary_map.size(); + // uint32_t min_cnv_length = input_data.getMinCNVLength(); + // for (auto& entry : primary_map) { + // current_primary++; + // const std::string& qname = entry.first; + // GenomicRegion& primary = entry.second; + // const std::string& primary_chr = bamHdr->target_name[primary.tid]; + // int primary_cluster_size = primary.cluster_size; + + // // Find the largest supplementary alignment + // auto& supp_regions = supp_map[qname]; + // // GenomicRegion largest_supp = supp_regions[0]; + // auto 
it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) { + // return a.end - a.start < b.end - b.start; + // }); + // GenomicRegion largest_supp = *it; + + // // If on a different chromosome, label as a translocation + // if (primary.tid != largest_supp.tid) { + // // Note that these do not currently have a likelihood score or read depth + // // Create two BND records for the translocation + // // Create the alternate allele format for the first BND record + // const std::string& supp_chr = bamHdr->target_name[largest_supp.tid]; + // std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "["; + // if (largest_supp.strand == false) { + // // Reverse-oriented relative to the reference + // alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; + // } + // // addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); + // SVCall sv_call1(primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call1); + + // // Create the alternate allele format for the second BND record + // alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; + // if (primary.strand == false) { + // // Reverse-oriented relative to the reference + // alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; + // } + // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, + // // SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); + // SVCall sv_call2(largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call2); - std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data); - if (std::get<1>(result) == SVType::UNKNOWN) { - 
continue; - } + // continue; + // } + + // // Inversion detection + // bool is_opposite_strand = primary.strand != largest_supp.strand; + // if (is_opposite_strand) { + // if (largest_supp.end - largest_supp.start >= min_cnv_length) { + + // // Print error if the start position is greater than the end + // // position + // if (largest_supp.start > largest_supp.end) { + // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end)); + // continue; + // } - double supp_lh = std::get<0>(result); - SVType supp_type = std::get<1>(result); - int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end); - if (supp_type == SVType::NEUTRAL) { - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth); - continue; + // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data); + // if (std::get<1>(result) == SVType::UNKNOWN) { + // continue; + // } + + // double supp_lh = std::get<0>(result); + // SVType supp_type = std::get<1>(result); + // int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end); + // if (supp_type == SVType::NEUTRAL) { + // SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call); + // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth); + // continue; - } else if (supp_type == SVType::DUP) { - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth); - continue; - } else if (supp_type == SVType::DEL) { - addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "", "SPLIT", 
"./.", supp_lh, read_depth); - continue; - } - } - } + // } else if (supp_type == SVType::DUP) { + // SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call); + // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth); + // continue; + // } else if (supp_type == SVType::DEL) { + // SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DEL, "", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call); + // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "", "SPLIT", "./.", supp_lh, read_depth); + // continue; + // } + // } + // } - // Analyze split-read evidence for deletions and duplications - bool gap_exists = false; - uint32_t boundary_left, boundary_right, gap_left, gap_right; - boundary_left = std::min(primary.start, largest_supp.start); - boundary_right = std::max(primary.end, largest_supp.end); - gap_left = std::min(primary.end, largest_supp.start); - gap_right = std::max(primary.start, largest_supp.end); - gap_exists = gap_left < gap_right; + // // Analyze split-read evidence for deletions and duplications + // bool gap_exists = false; + // uint32_t boundary_left, boundary_right, gap_left, gap_right; + // boundary_left = std::min(primary.start, largest_supp.start); + // boundary_right = std::max(primary.end, largest_supp.end); + // gap_left = std::min(primary.end, largest_supp.start); + // gap_right = std::max(primary.start, largest_supp.end); + // gap_exists = gap_left < gap_right; - // Run copy number variant predictions on the boundary if large enough - if (boundary_right - boundary_left >= min_cnv_length) { + // // Run copy number variant predictions on the boundary if large enough + // if (boundary_right - boundary_left >= min_cnv_length) { - // Print error if the start 
position is greater than the end - // position - if (boundary_left > boundary_right) { - printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); - continue; - } + // // Print error if the start position is greater than the end + // // position + // if (boundary_left > boundary_right) { + // printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); + // continue; + // } - std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data); - if (std::get<1>(bd_result) == SVType::UNKNOWN) { - continue; - } - double bd_lh = std::get<0>(bd_result); - SVType bd_type = std::get<1>(bd_result); + // std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, boundary_right, mean_chr_cov, pos_depth_map, input_data); + // if (std::get<1>(bd_result) == SVType::UNKNOWN) { + // continue; + // } + // double bd_lh = std::get<0>(bd_result); + // SVType bd_type = std::get<1>(bd_result); - // Run copy number variant predictions on the gap if it exists - if (gap_exists && gap_right - gap_left >= min_cnv_length) { + // // Run copy number variant predictions on the gap if it exists + // if (gap_exists && gap_right - gap_left >= min_cnv_length) { - // Print error if the start position is greater than the end - // position - if (gap_left > gap_right) { - printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); - continue; - } + // // Print error if the start position is greater than the end + // // position + // if (gap_left > gap_right) { + // printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); + // continue; + // } - // printMessage(region + ": Running copy 
number prediction for - // gap..."); - // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left)); - std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); - if (std::get<1>(gap_result) == SVType::UNKNOWN) { - continue; - } - double gap_lh = std::get<0>(gap_result); - SVType gap_type = std::get<1>(gap_result); - - // If higher likelihood than the boundary, add the gap as the SV call - if (gap_lh > bd_lh) { - int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); - std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; - addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth); - } else { - // Add the boundary as the SV call - int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); - std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); - } - } else { - // Add the boundary as the SV call - int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); - std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; - addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); - } - } + // // printMessage(region + ": Running copy number prediction for + // // gap..."); + // // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left)); + // std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); + // if (std::get<1>(gap_result) == SVType::UNKNOWN) { + // continue; + // } + // double gap_lh = std::get<0>(gap_result); + // SVType gap_type = std::get<1>(gap_result); + + // // If higher likelihood than the boundary, add the gap as the SV call + // if (gap_lh > bd_lh) { + // int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); + // std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; + // SVCall sv_call(gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call); + // // addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth); + // } else { + // // Add the boundary as the SV call + // int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); + // std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; + // SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call); + // // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); + // } + // } else { + // // Add the boundary as the SV call + // int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); + // std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; + // SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size); + // addSVCall(split_sv_calls, sv_call); + // // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); + // } + // } - // Print progress every 1000 primary alignments - if (current_primary % 1000 == 0) { - printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); - } - } + // // Print progress every 1000 primary alignments + // if (current_primary % 1000 == 0) { + // printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); + // } + // } // Unify the SV calls // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); @@ -984,3 +1081,8 @@ int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uin // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); return read_depth; } + +bool SVCaller::regionOverlaps(const GenomicRegion &a, const GenomicRegion &b) +{ + return a.tid == b.tid && a.start <= b.end && b.start <= a.end; +} diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 
e46afb77..32738adc 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -16,21 +16,26 @@ bool SVCall::operator<(const SVCall & other) const return start < other.start || (start == other.start && end < other.end); } -void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth) +void addSVCall(std::vector& sv_calls, SVCall& sv_call) { - if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { + if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) { return; } - - if (start > end) { - printError("ERROR: Invalid SV at position " + std::to_string(start) + "-" + std::to_string(end)); + + // Check if the SV call is valid + if (sv_call.start > sv_call.end) { + printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end)); return; } // Insert the SV call in sorted order - SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1}; auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); sv_calls.insert(it, sv_call); + + // Insert the SV call in sorted order + // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1}; + // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); + // sv_calls.insert(it, sv_call); } uint32_t getSVCount(const std::vector& sv_calls) @@ -66,24 +71,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) SVType::INV_DUP }) { - // Create a DBSCAN object for the current SV type - // epsilon = 0.45; - // min_pts = 15; - // if (sv_type == SVType::DEL) { - // epsilon = 0.45; - // min_pts = 16; - // } else { - // // epsilon = 0.65; - // // min_pts = 15; - // // epsilon = 0.45; - // // min_pts = 16; - // // epsilon = 0.45; - // // min_pts = 2; - // // epsilon = 0.45; - // // min_pts = 15; - // } 
- // DBSCAN dbscan(epsilon, min_pts); - // Create a vector of SV calls for the current SV type and size interval std::vector sv_type_calls; std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { @@ -110,8 +97,7 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) // continue; // Skip noise and unclassified points // } else { if (true) { - // Use the highest HMM likelihood normalized by SV size as the - // representative SV (if any non-zero likelihoods exist) + // Check if any SV has a non-zero likelihood bool has_nonzero_likelihood = false; if (cluster_sv_calls.size() > 0) { for (const auto& sv_call : cluster_sv_calls) { @@ -123,53 +109,20 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) } } } - - // [TEST] Check if any SV has a length greater than 600kb - bool found_large_sv = false; - for (const auto& sv_call : cluster_sv_calls) { - if (sv_call.end - sv_call.start > 600000) { - found_large_sv = true; - break; - } - } - if (found_large_sv) { - printMessage("Found large SV with length greater than 600kb"); - printMessage("Found " + std::to_string(cluster_sv_calls.size()) + " SVs in cluster " + std::to_string(cluster_id) + " of type " + getSVTypeString(sv_type) + ", with epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); - } SVCall merged_sv_call = cluster_sv_calls[0]; if (has_nonzero_likelihood) { - // Use the highest HMM likelihood normalized by SV size as the - // representative SV - // std::vector likelihoods; - // Default very low log-likelihood for zero likelihoods - std::vector likelihoods(cluster_sv_calls.size(), -std::numeric_limits::infinity()); - // for (const auto& sv_call : cluster_sv_calls) { - int i = 0; - for (const auto& sv_call : cluster_sv_calls) { - if (sv_call.hmm_likelihood != 0.0) { - uint32_t sv_size = (uint32_t) (sv_call.end - sv_call.start); - if (sv_size > 0) { - likelihoods[i] = sv_call.hmm_likelihood / sv_size; - 
// likelihoods.push_back(sv_call.hmm_likelihood / sv_size); - } - } + // These are detected from split reads, choose the one with + // the highest non-zero likelihood + std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.hmm_likelihood > b.hmm_likelihood; + }); - // Print the SV length, likelihood, and normalized - // likelihood - if (found_large_sv) { - printMessage("Start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", length: " + std::to_string(sv_call.end - sv_call.start)); - // printMessage("SV length: " + std::to_string(sv_call.end - sv_call.start) + ", likelihood: " + std::to_string(sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(likelihoods[i]) + ", start: " + std::to_string(sv_call.start) + ", end: " + std::to_string(sv_call.end)); - } - i++; - } - - // Find the index of the maximum element in the likelihoods - // vector - auto max_likelihood_it = std::max_element(likelihoods.begin(), likelihoods.end()); - int max_likelihood_index = std::distance(likelihoods.begin(), max_likelihood_it); - merged_sv_call = cluster_sv_calls[max_likelihood_index]; - printMessage("Merged SV with highest normalized likelihood: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", normalized likelihood: " + std::to_string(merged_sv_call.hmm_likelihood / (merged_sv_call.end - merged_sv_call.start)) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start)); + // Obtain the highest non-zero likelihood + auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) { + return sv_call.hmm_likelihood != 0.0; + }); + merged_sv_call = *it; } else { // Use the median length SV @@ -178,30 +131,59 @@ 
void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) }); int median_index = cluster_sv_calls.size() / 2; merged_sv_call = cluster_sv_calls[median_index]; - printMessage("Merged SV with median length: " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood) + ", size: " + std::to_string(merged_sv_call.end - merged_sv_call.start)); - + } + if (cluster_id < 0) { merged_sv_call.cluster_size = cluster_id; } else { merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); } - // merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); merged_sv_calls.push_back(merged_sv_call); cluster_count++; - } } } printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type)); } - - printMessage("[TEST] Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(merged_sv_calls.size()) + " SV calls"); sv_calls = std::move(merged_sv_calls); // Replace with filtered list int updated_size = sv_calls.size(); printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); } -void filterSVsWithLowSupport(std::vector& sv_calls, int min_support) +void mergeSVSubsets(std::vector &sv_calls) +{ + // Sort the SV calls by start position + int initial_size = sv_calls.size(); + std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start; + }); + + // Remove SVs that are subsets of other SVs + std::vector filtered_sv_calls; + // Since the input SV calls are sorted by start position, we can iterate + // through them in order and only keep the SVs that are not subsets of + // others + for (const auto& sv_call : sv_calls) { + // Check if the current SV call is a subset of any previously added + // SV call + bool is_subset = false; + for (const auto& filtered_sv_call : 
filtered_sv_calls) { + if (sv_call.start >= filtered_sv_call.start && sv_call.end <= filtered_sv_call.end) { + is_subset = true; + break; + } + } + // If it's not a subset, add it to the filtered list + if (!is_subset) { + filtered_sv_calls.push_back(sv_call); + } + } + sv_calls = std::move(filtered_sv_calls); // Replace with filtered list + int updated_size = sv_calls.size(); + printMessage("Filtered SV calls to remove subsets, from " + std::to_string(initial_size) + " to " + std::to_string(updated_size)); +} + +void filterSVsWithLowSupport(std::vector &sv_calls, int min_support) { // Filter SV calls with low read support or low cluster size sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { From 3341a9295016509419f07a9e61cd6fb85f87c0fd Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 10 Feb 2025 20:39:40 -0500 Subject: [PATCH 073/134] remove test code --- src/sv_caller.cpp | 66 +++++++++-------------------------------------- 1 file changed, 12 insertions(+), 54 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 98877f3f..362413f2 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -61,7 +61,6 @@ std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, uint32_t supplementary_count = 0; // Main loop to process the alignments - // std::unordered_map primary_map_qual; uint32_t num_alignments = 0; while (readNextAlignment(fp_in, itr, bam1) >= 0) { @@ -81,7 +80,6 @@ std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, // Process supplementary alignments } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { - // supp_map[qname].push_back(itr); // Store chromosome (TID), start, and end positions (1-based) of the // supplementary alignment, and the strand (true for forward, false for reverse) supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0}); @@ -107,46 +105,6 @@ 
std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_destroy1(bam1); printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); - // Create a set of dummy SVs from the primary alignments for each chromosome - // and run DBSCAN to cluster them - // std::vector dummy_sv_map; - // std::vector dummy_sv_qnames; - // for (const auto& entry : primary_map) { - // const std::string& chrom = bamHdr->target_name[entry.second.tid]; - // if (chrom != region) { - // continue; // Skip alignments not in the same chromosome - // } - // uint32_t start = entry.second.start; - // uint32_t end = entry.second.end; - // const std::string& qname = entry.first; - // SVCall sv_call(start, end, SVType::DUP, ".", qname, ".", 0.0, 0, 0, 0); - // dummy_sv_map.emplace_back(sv_call); - // dummy_sv_qnames.emplace_back(entry.first); - // } - - // // Run DBSCAN to merge the dummy SVs - // // double epsilon = 0.65; - // double epsilon = 0.45; - // int min_pts = 2; - // std::vector> primary_clusters; - // DBSCAN dbscan(epsilon, min_pts); - // dbscan.fit(dummy_sv_map); - // const std::vector& cluster_ids = dbscan.getClusters(); - - // // Create the 2D vector of clusters - // for (int cluster_id : cluster_ids) { - // if (cluster_id < 0) { - // continue; // Skip noise and unclassified points - // } - // std::vector cluster; - // for (size_t i = 0; i < cluster_ids.size(); ++i) { - // if (cluster_ids[i] == cluster_id) { - // cluster.push_back(dummy_sv_qnames[i]); - // } - // } - // primary_clusters.push_back(cluster); - // } - // Identify overlapping primary alignments and then cluster their primary // start, end vs. 
supplementary alignment start, end positions, keeping the // median of the largest cluster for the primary and supplementary positions @@ -509,23 +467,23 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } // Detect SVs from the CIGAR strings - // printMessage(chr + ": CIGAR SVs..."); - // this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); + printMessage(chr + ": CIGAR SVs..."); + this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); - // printMessage(chr + ": Merging CIGAR..."); - // double cigar_epsilon = 0.45; - // int cigar_min_pts = 15; - // mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts); + printMessage(chr + ": Merging CIGAR..."); + double cigar_epsilon = 0.45; + int cigar_min_pts = 15; + mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts); - // int region_sv_count = getSVCount(chr_sv_calls); - // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + int region_sv_count = getSVCount(chr_sv_calls); + printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - // if (region_sv_count > 0) { - // printMessage(chr + ": CIGAR predictions..."); - // cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - // } + if (region_sv_count > 0) { + printMessage(chr + ": CIGAR predictions..."); + cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + } // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); From 12ae0f76a7ba85f909abfb312ad067aab7a856b9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 15 Feb 2025 14:00:30 -0500 Subject: [PATCH 074/134] get chromosomes from bam --- include/sv_caller.h | 2 + 
src/sv_caller.cpp | 105 +++++++++++++++++++++++++++++++++----------- 2 files changed, 82 insertions(+), 25 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 1998547f..e0b477e6 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -77,6 +77,8 @@ class SVCaller { int min_mapq = 20; // Minimum mapping quality to be considered std::mutex shared_mutex; + std::vector getChromosomes(const std::string& bam_filepath); + std::vector getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map); // Detect SVs from the CIGAR string of a read alignment, and return the diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 362413f2..8449930e 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -42,7 +42,31 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) return ret; } -std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map) +std::vector SVCaller::getChromosomes(const std::string &bam_filepath) +{ + // Open the BAM file + samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); + if (!fp_in) { + printError("ERROR: failed to open BAM file " + bam_filepath); + return {}; + } + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); + if (!bamHdr) { + sam_close(fp_in); + printError("ERROR: failed to read header from " + bam_filepath); + return {}; + } + std::vector chromosomes; + for (int i = 0; i < bamHdr->n_targets; i++) { + chromosomes.push_back(bamHdr->target_name[i]); + // printMessage("Chromosome: " + std::string(bamHdr->target_name[i])); + } + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + return chromosomes; +} + +std::vector SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string ®ion, std::unordered_map &primary_map, std::unordered_map> &supp_map) { // 
Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -142,15 +166,34 @@ std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, int min_length = 2000; int max_length = 1000000; for (const auto& primary_group : primary_clusters) { + // Determine if the primary alignments are mostly on opposite strands to + // the corresponding supplementary alignments (potential inversions) + bool inversion = false; + for (const std::string& qname : primary_group) { + const std::vector& regions = supp_map[qname]; + int num_supp = (int) regions.size(); + int num_opposite_strand = 0; + for (const GenomicRegion& region : regions) { + if (region.strand != primary_map[qname].strand) { + num_opposite_strand++; + } + } + if (static_cast(num_opposite_strand) / static_cast(num_supp) > 0.5) { + inversion = true; + } + } + // Use DBSCAN to cluster primary alignment start, end positions DBSCAN1D dbscan(100, 5); current_group++; std::vector starts; std::vector ends; + std::vector primary_strands; for (const std::string& qname : primary_group) { const GenomicRegion& region = primary_map[qname]; starts.push_back(region.start); ends.push_back(region.end); + primary_strands.push_back(region.strand); } // Get the largest cluster of primary alignment start positions @@ -164,11 +207,13 @@ std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, // Get the supplementary alignment positions std::vector supp_starts; std::vector supp_ends; + std::vector supp_strands; for (const std::string& qname : primary_group) { const std::vector& regions = supp_map[qname]; for (const GenomicRegion& region : regions) { supp_starts.push_back(region.start); supp_ends.push_back(region.end); + supp_strands.push_back(region.strand); } } @@ -206,12 +251,13 @@ std::vector SVCaller::getSplitAlignments(samFile* fp_in, hts_idx_t* idx, continue; } - // Store the SV candidate if the length is between 2kb and 1Mb + // Store the SV candidate if the length is within the specified 
range int sv_start = std::min(primary_pos, supp_pos); int sv_end = std::max(primary_pos, supp_pos); int sv_length = sv_end - sv_start + 1; + SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; if (sv_length >= min_length && sv_length <= max_length) { - SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "NA", "./.", 0.0, 0, 0, 0); + SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "NA", "./.", 0.0, 0, 0, 0); sv_candidates.push_back(sv_candidate); } } @@ -289,7 +335,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; uint32_t query_pos = 0; - uint8_t qual = alignment->core.qual; // Loop through the CIGAR string, process operations, detect SVs (primary // only), and calculate sequence identity for potential duplications (primary only) @@ -404,13 +449,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome) { - // int filter_threshold = 4; // Minimum number of supporting reads for an SV call - // int filter_threshold = 10; // Minimum number of supporting reads for an - // SV call - int cigar_sv_support_threshold = input_data.getMinReadSupport(); // Minimum number of supporting reads for an SV call - // int split_sv_support_threshold = 4; // Minimum number of supporting - // reads for an SV call - int split_sv_support_threshold = input_data.getMinReadSupport(); double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); int dbscan_min_pts = input_data.getDBSCAN_MinPts(); @@ -447,6 +485,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Set the region to process std::string region = chr; uint32_t chr_len = ref_genome.getChromosomeLength(chr); + // uint32_t chr_len = 
bamHdr->target_len[bam_name2id(bamHdr, chr.c_str())]; if (input_data.isRegionSet()) { // Use one chunk for the specified region @@ -471,9 +510,9 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); printMessage(chr + ": Merging CIGAR..."); - double cigar_epsilon = 0.45; - int cigar_min_pts = 15; - mergeSVs(chr_sv_calls, cigar_epsilon, cigar_min_pts); + // double cigar_epsilon = 0.45; + // int cigar_min_pts = 15; + mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); @@ -509,6 +548,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage("Completed chromosome " + chr); } + + void SVCaller::run(const InputData& input_data) { // Set up the reference genome @@ -520,9 +561,12 @@ void SVCaller::run(const InputData& input_data) // Get the chromosomes std::vector chromosomes; if (input_data.isSingleChr()) { + // Get the chromosome from the user input argument chromosomes.push_back(input_data.getChromosome()); } else { - chromosomes = ref_genome.getChromosomes(); + // chromosomes = ref_genome.getChromosomes(); + // Get the chromosomes from the input BAM file + chromosomes = this->getChromosomes(input_data.getLongReadBam()); } // Read the HMM from the file @@ -614,12 +658,8 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in printMessage(region + ": Split read predictions..."); int current_sv = 0; int total_svs = sv_candidates.size(); - int min_cnv_length = input_data.getMinCNVLength(); for (auto& sv_candidate : sv_candidates) { - // Skip if the SV is too small - if ((int)sv_candidate.end - (int)sv_candidate.start <= min_cnv_length) { - continue; - } + bool is_inversion = sv_candidate.sv_type == SVType::INV; std::tuple result = 
cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(result) == SVType::UNKNOWN) { @@ -628,12 +668,27 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); + std::string genotype = std::get<2>(result); + // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - if (supp_type != SVType::NEUTRAL) { - int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - std::string alt_allele = supp_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(supp_type) + ">"; - SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", "./.", supp_lh, read_depth, 1, sv_candidate.cluster_size); - addSVCall(split_sv_calls, sv_call); + if (supp_type != SVType::UNKNOWN) { + if (is_inversion) { + if (supp_type == SVType::DEL) { + supp_type = SVType::INV_DEL; + } else if (supp_type == SVType::DUP) { + supp_type = SVType::INV_DUP; + } else if (supp_type == SVType::NEUTRAL) { + supp_type = SVType::INV; + } + printMessage("Inversion detected: " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " (LENGTH " + std::to_string(sv_candidate.end - sv_candidate.start + 1) + ")"); + } + + if (supp_type != SVType::NEUTRAL) { + int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; + SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + addSVCall(split_sv_calls, sv_call); + } } current_sv++; if (current_sv % 1000 == 0) { From f39fd8a6a3ca3f8a0ed2f2c1811ac7c63d4ad492 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 26 Feb 2025 19:47:06 -0500 Subject: [PATCH 075/134] Fix 
large cnv detection errors --- src/dbscan.cpp | 2 - src/sv_caller.cpp | 319 ++++++++++++++-------------------------------- src/sv_object.cpp | 3 +- 3 files changed, 98 insertions(+), 226 deletions(-) diff --git a/src/dbscan.cpp b/src/dbscan.cpp index d5310292..6fe97563 100644 --- a/src/dbscan.cpp +++ b/src/dbscan.cpp @@ -26,8 +26,6 @@ const std::vector& DBSCAN::getClusters() const { return clusters; } -// bool DBSCAN::expandCluster(const std::vector>& -// points, size_t pointIdx, int clusterId) { bool DBSCAN::expandCluster(const std::vector& sv_calls, size_t pointIdx, int clusterId) { std::vector seeds = regionQuery(sv_calls, pointIdx); if (static_cast(seeds.size()) < minPts) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 8449930e..fddc04d3 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -204,6 +204,11 @@ std::vector SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, dbscan.fit(ends); std::vector primary_end_cluster = dbscan.getLargestCluster(ends); + // Continue if no clusters were found + if (primary_start_cluster.empty() && primary_end_cluster.empty()) { + continue; + } + // Get the supplementary alignment positions std::vector supp_starts; std::vector supp_ends; @@ -225,26 +230,61 @@ std::vector SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, dbscan.fit(supp_ends); std::vector supp_end_cluster = dbscan.getLargestCluster(supp_ends); + // Continue if no clusters were found + if (supp_start_cluster.empty() && supp_end_cluster.empty()) { + continue; + } + // Use the median of the largest cluster of primary and supplementary // alignment start, end positions as the final genome coordinates of the // SV int primary_pos = -1; + int primary_pos2 = -1; if (primary_start_cluster.size() > primary_end_cluster.size()) { std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; } else if (primary_end_cluster.size() > primary_start_cluster.size()) { 
std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; + } else { + // Use both positions + std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); + primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; } // Get the supplementary alignment positions int supp_pos = -1; + int supp_pos2 = -1; if (supp_start_cluster.size() > supp_end_cluster.size()) { std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; } else if (supp_end_cluster.size() > supp_start_cluster.size()) { std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; + } else { + // Use both positions. This has been shown to occur in nested SVs + std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; + supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; + } + + // If two of either were found, use the larger SV candidate + if (primary_pos2 != -1) { + int sv_length1 = std::abs(primary_pos - supp_pos); + int sv_length2 = std::abs(primary_pos2 - supp_pos); + if (sv_length2 > sv_length1) { + primary_pos = primary_pos2; + } + } + if (supp_pos2 != -1) { + int sv_length1 = std::abs(primary_pos - supp_pos); + int sv_length2 = std::abs(primary_pos - supp_pos2); + if (sv_length2 > sv_length1) { + supp_pos = supp_pos2; + } } if (primary_pos == -1 || supp_pos == -1) { @@ -509,9 +549,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": CIGAR SVs..."); this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); + // 
Calculate the median read depth across the SV calls + printMessage(chr + ": Calculating median SV read depth..."); + uint32_t cumulative_depth = 0; + for (auto& sv_call : chr_sv_calls) { + cumulative_depth += sv_call.read_depth; + } + double median_sv_depth = (double)cumulative_depth / (double)chr_sv_calls.size(); + printMessage("Median SV read depth: " + std::to_string(median_sv_depth)); + printMessage(chr + ": Merging CIGAR..."); - // double cigar_epsilon = 0.45; - // int cigar_min_pts = 15; mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); int region_sv_count = getSVCount(chr_sv_calls); @@ -529,16 +576,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v std::vector split_sv_calls; this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - // Merge the split-read SVs separately + // // Merge the split-read SVs separately printMessage(chr + ": Merging split reads..."); double split_epsilon = 0.45; - int split_min_pts = 2; + int split_min_pts = 2; // This is low since split alignments were already previously merged mergeSVs(split_sv_calls, split_epsilon, split_min_pts); printMessage(chr + ": Unifying SVs..."); chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); - mergeSVSubsets(chr_sv_calls); + // mergeSVSubsets(chr_sv_calls); // Sort the SV calls by start position std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { @@ -548,8 +595,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage("Completed chromosome " + chr); } - - void SVCaller::run(const InputData& input_data) { // Set up the reference genome @@ -680,7 +725,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in } else if (supp_type == SVType::NEUTRAL) { supp_type = SVType::INV; } - printMessage("Inversion detected: " + 
std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " (LENGTH " + std::to_string(sv_candidate.end - sv_candidate.start + 1) + ")"); } if (supp_type != SVType::NEUTRAL) { @@ -695,176 +739,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); } } - - // // Find split-read SV evidence - // // printMessage(region + ": Finding split-read SVs..."); - // // std::vector split_sv_calls; - // int current_primary = 0; - // int primary_count = primary_map.size(); - // uint32_t min_cnv_length = input_data.getMinCNVLength(); - // for (auto& entry : primary_map) { - // current_primary++; - // const std::string& qname = entry.first; - // GenomicRegion& primary = entry.second; - // const std::string& primary_chr = bamHdr->target_name[primary.tid]; - // int primary_cluster_size = primary.cluster_size; - - // // Find the largest supplementary alignment - // auto& supp_regions = supp_map[qname]; - // // GenomicRegion largest_supp = supp_regions[0]; - // auto it = std::max_element(supp_regions.begin(), supp_regions.end(), [](const GenomicRegion& a, const GenomicRegion& b) { - // return a.end - a.start < b.end - b.start; - // }); - // GenomicRegion largest_supp = *it; - - // // If on a different chromosome, label as a translocation - // if (primary.tid != largest_supp.tid) { - // // Note that these do not currently have a likelihood score or read depth - // // Create two BND records for the translocation - // // Create the alternate allele format for the first BND record - // const std::string& supp_chr = bamHdr->target_name[largest_supp.tid]; - // std::string alt_allele = "N[" + supp_chr + ":" + std::to_string(largest_supp.start) + "["; - // if (largest_supp.strand == false) { - // // Reverse-oriented relative to the reference - // alt_allele = "N]" + supp_chr + ":" + std::to_string(largest_supp.start) + "]"; - // } - 
// // addSVCall(split_sv_calls, primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); - // SVCall sv_call1(primary.start, primary.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call1); - - // // Create the alternate allele format for the second BND record - // alt_allele = "N[" + primary_chr + ":" + std::to_string(primary.start) + "["; - // if (primary.strand == false) { - // // Reverse-oriented relative to the reference - // alt_allele = "N]" + primary_chr + ":" + std::to_string(primary.start) + "]"; - // } - // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, - // // SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0); - // SVCall sv_call2(largest_supp.start, largest_supp.end, SVType::BND, alt_allele, "SPLIT", "./.", 0.0, 0, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call2); - - // continue; - // } - - // // Inversion detection - // bool is_opposite_strand = primary.strand != largest_supp.strand; - // if (is_opposite_strand) { - // if (largest_supp.end - largest_supp.start >= min_cnv_length) { - - // // Print error if the start position is greater than the end - // // position - // if (largest_supp.start > largest_supp.end) { - // printError("ERROR: Invalid inversion coordinates: " + primary_chr + ":" + std::to_string(largest_supp.start) + "-" + std::to_string(largest_supp.end)); - // continue; - // } - - // std::tuple result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, largest_supp.start, largest_supp.end, mean_chr_cov, pos_depth_map, input_data); - // if (std::get<1>(result) == SVType::UNKNOWN) { - // continue; - // } - - // double supp_lh = std::get<0>(result); - // SVType supp_type = std::get<1>(result); - // int read_depth = this->calculateReadDepth(pos_depth_map, largest_supp.start, largest_supp.end); - // if (supp_type == SVType::NEUTRAL) { - // SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", 
"./.", supp_lh, read_depth, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call); - // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV, "", "SPLIT", "./.", supp_lh, read_depth); - // continue; - - // } else if (supp_type == SVType::DUP) { - // SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call); - // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DUP, "", "SPLIT", "./.", supp_lh, read_depth); - // continue; - // } else if (supp_type == SVType::DEL) { - // SVCall sv_call(largest_supp.start, largest_supp.end, SVType::INV_DEL, "", "SPLIT", "./.", supp_lh, read_depth, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call); - // // addSVCall(split_sv_calls, largest_supp.start, largest_supp.end, SVType::INV_DEL, "", "SPLIT", "./.", supp_lh, read_depth); - // continue; - // } - // } - // } - - // // Analyze split-read evidence for deletions and duplications - // bool gap_exists = false; - // uint32_t boundary_left, boundary_right, gap_left, gap_right; - // boundary_left = std::min(primary.start, largest_supp.start); - // boundary_right = std::max(primary.end, largest_supp.end); - // gap_left = std::min(primary.end, largest_supp.start); - // gap_right = std::max(primary.start, largest_supp.end); - // gap_exists = gap_left < gap_right; - - // // Run copy number variant predictions on the boundary if large enough - // if (boundary_right - boundary_left >= min_cnv_length) { - - // // Print error if the start position is greater than the end - // // position - // if (boundary_left > boundary_right) { - // printError("ERROR: Invalid boundary coordinates: " + primary_chr + ":" + std::to_string(boundary_left) + "-" + std::to_string(boundary_right)); - // continue; - // } - - // std::tuple bd_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, boundary_left, 
boundary_right, mean_chr_cov, pos_depth_map, input_data); - // if (std::get<1>(bd_result) == SVType::UNKNOWN) { - // continue; - // } - // double bd_lh = std::get<0>(bd_result); - // SVType bd_type = std::get<1>(bd_result); - - // // Run copy number variant predictions on the gap if it exists - // if (gap_exists && gap_right - gap_left >= min_cnv_length) { - - // // Print error if the start position is greater than the end - // // position - // if (gap_left > gap_right) { - // printError("ERROR: Invalid gap coordinates: " + primary_chr + ":" + std::to_string(gap_left) + "-" + std::to_string(gap_right)); - // continue; - // } - - // // printMessage(region + ": Running copy number prediction for - // // gap..."); - // // printMessage("Running copy number prediction, length: " + std::to_string(gap_right - gap_left)); - // std::tuple gap_result = cnv_caller.runCopyNumberPrediction(primary_chr, hmm, gap_left, gap_right, mean_chr_cov, pos_depth_map, input_data); - // if (std::get<1>(gap_result) == SVType::UNKNOWN) { - // continue; - // } - // double gap_lh = std::get<0>(gap_result); - // SVType gap_type = std::get<1>(gap_result); - - // // If higher likelihood than the boundary, add the gap as the SV call - // if (gap_lh > bd_lh) { - // int read_depth = this->calculateReadDepth(pos_depth_map, gap_left, gap_right); - // std::string alt_allele = gap_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(gap_type) + ">"; - // SVCall sv_call(gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call); - // // addSVCall(split_sv_calls, gap_left, gap_right, gap_type, alt_allele, "SPLIT", "./.", gap_lh, read_depth); - // } else { - // // Add the boundary as the SV call - // int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); - // std::string alt_allele = bd_type == SVType::NEUTRAL ? "." 
: "<" + getSVTypeString(bd_type) + ">"; - // SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call); - // // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); - // } - // } else { - // // Add the boundary as the SV call - // int read_depth = this->calculateReadDepth(pos_depth_map, boundary_left, boundary_right); - // std::string alt_allele = bd_type == SVType::NEUTRAL ? "." : "<" + getSVTypeString(bd_type) + ">"; - // SVCall sv_call(boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth, 1, primary_cluster_size); - // addSVCall(split_sv_calls, sv_call); - // // addSVCall(split_sv_calls, boundary_left, boundary_right, bd_type, alt_allele, "SPLIT", "./.", bd_lh, read_depth); - // } - // } - - // // Print progress every 1000 primary alignments - // if (current_primary % 1000 == 0) { - // printMessage(region + ": Processed " + std::to_string(current_primary) + " of " + std::to_string(primary_count) + " primary alignments..."); - // } - // } - - // Unify the SV calls - // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const @@ -897,6 +771,7 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", + "##INFO=", "##INFO=", "##INFO=", "##INFO=", @@ -938,7 +813,6 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.queryRefGenome(chr, - // preceding_pos, end); ref_allele = ref_genome.query(chr, preceding_pos, end); // Use the preceding base as the alternate allele @@ -990,44 +866,28 @@ void SVCaller::saveToVCF(const std::unordered_mapinput_data.queryRefGenome(chr, - // preceding_pos, preceding_pos); ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); - - // Update the start 
position to the preceding base start = preceding_pos; // Update the end position to the same base for duplications and insertions - if (sv_type_str == "DUP" || sv_type_str == "INS") { + if (sv_type == SVType::DUP || sv_type == SVType::INS) { end = start; } - if (sv_type_str == "INS") { - // Check if in symbolic form + if (sv_type == SVType::INS) { if (alt_allele != "") { - // Use the insertion sequence as the alternate allele + // Insert the reference allele before the insertion alt_allele.insert(0, ref_allele); } - // start = preceding_pos; // Update the position to the preceding base - - // // Update the end position to the start position to change from - // // query to reference coordinates for insertions - // end = start; } } - // Print the REF allele if SVTYPE = DUP and if it is empty or "." (symbolic) - if (sv_type_str == "DUP" && (ref_allele == "" || ref_allele == ".")) { - printMessage("REF allele for DUP at " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + ": " + ref_allele + ", ALT allele: " + alt_allele); - } - // Fix ambiguous bases in the reference allele const std::string amb_bases = "RYKMSWBDHV"; // Ambiguous bases std::bitset<256> amb_bases_bitset; @@ -1042,9 +902,18 @@ void SVCaller::saveToVCF(const std::unordered_map& pos_depth_map, uint32_t start, uint32_t end) @@ -1084,13 +955,15 @@ int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uin // depth map." 
<< std::endl; printError("Error: Start position " + std::to_string(start) + " not found in depth map."); } - try { - // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); - read_depth += pos_depth_map.at(end); - } catch (const std::out_of_range& e) { - printError("Error: End position " + std::to_string(end) + " not found in depth map."); - // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl; - } + + // UPDATE: Only use the start position for the read depth calculation + // try { + // // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); + // read_depth += pos_depth_map.at(end); + // } catch (const std::out_of_range& e) { + // printError("Error: End position " + std::to_string(end) + " not found in depth map."); + // // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." 
<< std::endl; + // } // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); return read_depth; } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 32738adc..ef12b2f6 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -68,7 +68,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) SVType::INV, SVType::INS, SVType::BND, - SVType::INV_DUP + SVType::INV_DUP, + SVType::INV_DEL, }) { // Create a vector of SV calls for the current SV type and size interval From 2c0a8a09c2999db1b256dd5738ee9a5f557e0dfa Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 1 Mar 2025 00:56:35 -0500 Subject: [PATCH 076/134] use pct mean coverage for minpts --- include/input_data.h | 10 ++++--- python/plot_distributions.py | 54 ++++++++++++++++++++++++------------ src/input_data.cpp | 30 +++++++++----------- src/main.cpp | 11 +++++--- src/sv_caller.cpp | 7 +++++ src/sv_object.cpp | 17 ++++-------- 6 files changed, 75 insertions(+), 54 deletions(-) diff --git a/include/input_data.h b/include/input_data.h index d88426f3..106c70b6 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -61,10 +61,6 @@ class InputData { void setMinCNVLength(int min_cnv_length); uint32_t getMinCNVLength() const; - // Set the minimum number of reads supporting an SV for filtering steps. - void setMinReadSupport(int min_reads); - int getMinReadSupport() const; - // Set the epsilon parameter for DBSCAN clustering. void setDBSCAN_Epsilon(double epsilon); double getDBSCAN_Epsilon() const; @@ -73,6 +69,11 @@ class InputData { void setDBSCAN_MinPts(int min_pts); int getDBSCAN_MinPts() const; + // Set the percentage of mean chromosome coverage to use for DBSCAN + // minimum points. + void setDBSCAN_MinPtsPct(double min_pts_pct); + double getDBSCAN_MinPtsPct() const; + // Set the chromosome to analyze. 
void setChromosome(std::string chr); std::string getChromosome() const; @@ -113,6 +114,7 @@ class InputData { int min_reads; double dbscan_epsilon; int dbscan_min_pts; + double dbscan_min_pts_pct; std::string chr; // Chromosome to analyze std::pair start_end; // Region to analyze bool region_set; // True if a region is set diff --git a/python/plot_distributions.py b/python/plot_distributions.py index 8766a157..37eb1638 100644 --- a/python/plot_distributions.py +++ b/python/plot_distributions.py @@ -26,11 +26,28 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Read VCF file into a pandas DataFrame - vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \ - names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE'], \ - dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \ - 'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE': str}) - + try: + vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \ + names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE'], \ + dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \ + 'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE': str}) + except Exception as e: + try: + print("[DEBUG] Caught TypeError") + # Truvari merged VCF format with different columns + vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \ + names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT', 'SAMPLE', 'SAMPLE2'], \ + dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \ + 'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE': str, 'SAMPLE2': str}) + except Exception as e: + print("[DEBUG] Caught Exception") + # Platinum pedigree VCF format with different columns + vcf_df = pd.read_csv(input_vcf, sep='\t', comment='#', header=None, \ + names=['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 
'INFO', 'FORMAT', 'SAMPLE', 'SAMPLE2', 'SAMPLE3', 'SAMPLE4', 'SAMPLE5', 'SAMPLE6', 'SAMPLE7'], \ + dtype={'CHROM': str, 'POS': np.int64, 'ID': str, 'REF': str, 'ALT': str, 'QUAL': str, \ + 'FILTER': str, 'INFO': str, 'FORMAT': str, 'SAMPLE1': str, 'SAMPLE2': str, 'SAMPLE3': str, 'SAMPLE4': str, \ + 'SAMPLE5': str, 'SAMPLE6': str, 'SAMPLE7': str}) + # Initialize dictionaries to store SV sizes for each type of SV sv_sizes = {} @@ -61,6 +78,7 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Continue if SV type is BND (no SV size) if sv_type == "BND": continue + # If the SV caller is DELLY, then we use the second SV size for non-INS # (they don't have SVLEN) and the first SV size for INS sv_size = None @@ -71,7 +89,9 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # If the plot title is GIAB, then we need to convert INS to DUP if # INFO/SVTYPE is INS and INFO/REPTYPE is DUP - if plot_title == "GIAB" and sv_type == "INS": + # if plot_title == "GIAB" and sv_type == "INS": + # Check if GIAB is a substring of the plot title + if "GIAB" in plot_title and sv_type == "INS": if 'REPTYPE=DUP' in record['INFO']: sv_type = "DUP" @@ -192,22 +212,22 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Add the bin edges to the x-axis ticks as a range fig.update_xaxes(tickvals=x_values, ticktext=bin_labels) - # # Move the legend to the top right inside the plot - # fig.update_layout(legend=dict( - # orientation='v', - # yanchor='top', - # y=0.75, - # xanchor='right', - # x=0.75, - # )) - # Move the legend to the bottom right outside the plot + # Move the legend to the top right inside the plot fig.update_layout(legend=dict( orientation='v', yanchor='top', - y=1.0, + y=0.75, xanchor='right', - x=1.15, + x=0.75, )) + # # Move the legend to the bottom right outside the plot + # fig.update_layout(legend=dict( + # orientation='v', + # yanchor='top', + # y=1.0, + # xanchor='right', + # x=1.15, + # )) # Set 
a larger font size for all text in the plot fig.update_layout(font=dict(size=26)) diff --git a/src/input_data.cpp b/src/input_data.cpp index 649e8b1c..7a073dae 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -27,8 +27,9 @@ InputData::InputData() this->sample_size = 100; this->min_cnv_length = 1000; this->min_reads = 5; - this->dbscan_epsilon = 0.5; - this->dbscan_min_pts = 5; + this->dbscan_epsilon = 0.99; + this->dbscan_min_pts = 15; + this->dbscan_min_pts_pct = 0.0; this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; this->verbose = false; @@ -157,21 +158,6 @@ void InputData::setMinCNVLength(int min_cnv_length) this->min_cnv_length = (uint32_t) min_cnv_length; } -void InputData::setMinReadSupport(int min_reads) -{ - // Ensure that the minimum read support is an integer and greater than 0 - if (min_reads < 1) - { - throw std::runtime_error("Minimum read support must be an integer greater than 0"); - } - this->min_reads = min_reads; -} - -int InputData::getMinReadSupport() const -{ - return this->min_reads; -} - void InputData::setDBSCAN_Epsilon(double epsilon) { this->dbscan_epsilon = epsilon; @@ -192,6 +178,16 @@ int InputData::getDBSCAN_MinPts() const return this->dbscan_min_pts; } +void InputData::setDBSCAN_MinPtsPct(double min_pts_pct) +{ + this->dbscan_min_pts_pct = min_pts_pct; +} + +double InputData::getDBSCAN_MinPtsPct() const +{ + return this->dbscan_min_pts_pct; +} + void InputData::setChromosome(std::string chr) { this->chr = chr; diff --git a/src/main.cpp b/src/main.cpp index 5275f368..e493cd4e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -46,9 +46,6 @@ void runContextSV(const std::unordered_map& args) if (args.find("min-cnv") != args.end()) { input_data.setMinCNVLength(std::stoi(args.at("min-cnv"))); } - if (args.find("min-reads") != args.end()) { - input_data.setMinReadSupport(std::stoi(args.at("min-reads"))); - } if (args.find("eth") != args.end()) { input_data.setEthnicity(args.at("eth")); } @@ -71,6 +68,10 @@ void 
runContextSV(const std::unordered_map& args) input_data.setDBSCAN_MinPts(std::stoi(args.at("min-pts"))); } + if (args.find("min-pts-pct") != args.end()) { + input_data.setDBSCAN_MinPtsPct(std::stod(args.at("min-pts-pct"))); + } + // Run ContextSV run(input_data); } @@ -88,9 +89,9 @@ void printUsage(const std::string& programName) { << " -h, --hmm HMM file\n" << " -n, --sample-size Sample size for HMM predictions\n" << " --min-cnv Minimum CNV length\n" - << " --min-reads Minimum read support\n" << " --eps DBSCAN epsilon\n" << " --min-pts DBSCAN minimum points\n" + << " --min-pts-pct Percentage of mean chr. coverage to use for DBSCAN minimum points\n" << " -e, --eth ETH file\n" << " -p, --pfb PFB file\n" << " --save-cnv Save CNV data\n" @@ -131,6 +132,8 @@ std::unordered_map parseArguments(int argc, char* argv args["epsilon"] = argv[++i]; } else if (arg == "--min-pts" && i + 1 < argc) { args["min-pts"] = argv[++i]; + } else if (arg == "--min-pts-pct" && i + 1 < argc) { + args["min-pts-pct"] = argv[++i]; } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) { args["eth"] = argv[++i]; } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index fddc04d3..4875bbc2 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -545,6 +545,13 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v return; } + // Estimate DBSCAN minimum points + double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct(); + if (dbscan_min_pts_pct > 0.0) { + dbscan_min_pts = (int)std::ceil(mean_chr_cov * dbscan_min_pts_pct); + printMessage(chr + ": Mean chr. cov.: " + std::to_string(mean_chr_cov) + " (DBSCAN min. pts.= " + std::to_string(dbscan_min_pts) + ", min. pts. 
pct.= " + std::to_string(dbscan_min_pts_pct) + ")"); + } + // Detect SVs from the CIGAR strings printMessage(chr + ": CIGAR SVs..."); this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); diff --git a/src/sv_object.cpp b/src/sv_object.cpp index ef12b2f6..41d787fc 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -31,11 +31,6 @@ void addSVCall(std::vector& sv_calls, SVCall& sv_call) // Insert the SV call in sorted order auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); sv_calls.insert(it, sv_call); - - // Insert the SV call in sorted order - // SVCall sv_call{start, end, sv_type, alt_allele, data_type, genotype, hmm_likelihood, read_depth, 1, 1}; - // auto it = std::lower_bound(sv_calls.begin(), sv_calls.end(), sv_call); - // sv_calls.insert(it, sv_call); } uint32_t getSVCount(const std::vector& sv_calls) @@ -55,12 +50,10 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) if (sv_calls.size() < 2) { return; } - int initial_size = sv_calls.size(); // Cluster SVs using DBSCAN for each SV type + int initial_size = sv_calls.size(); std::vector merged_sv_calls; - - // Cluster SVs using DBSCAN for each SV type DBSCAN dbscan(epsilon, min_pts); for ( const auto& sv_type : { SVType::DEL, @@ -94,10 +87,10 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) for (auto& cluster : cluster_map) { int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; - // if (cluster_id < 0) { - // continue; // Skip noise and unclassified points - // } else { - if (true) { + if (cluster_id < 0) { + continue; // Skip noise and unclassified points + } else { + // if (true) { // Check if any SV has a non-zero likelihood bool has_nonzero_likelihood = false; if (cluster_sv_calls.size() > 0) { From f282196c9c3a0010865bc27e31bfcfd3326ec1a8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 1 Mar 2025 16:19:29 -0500 Subject: [PATCH 077/134] remove unused code --- 
src/sv_caller.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 4875bbc2..cc72a247 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -556,15 +556,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": CIGAR SVs..."); this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); - // Calculate the median read depth across the SV calls - printMessage(chr + ": Calculating median SV read depth..."); - uint32_t cumulative_depth = 0; - for (auto& sv_call : chr_sv_calls) { - cumulative_depth += sv_call.read_depth; - } - double median_sv_depth = (double)cumulative_depth / (double)chr_sv_calls.size(); - printMessage("Median SV read depth: " + std::to_string(median_sv_depth)); - printMessage(chr + ": Merging CIGAR..."); mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); From 972a27126760c960d99dd418a1c1f3ad721dae0c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 5 Mar 2025 15:03:47 -0500 Subject: [PATCH 078/134] fix memory leaks --- include/cnv_caller.h | 13 +- include/fasta_query.h | 8 +- include/sv_caller.h | 48 ++-- include/utils.h | 3 + src/cnv_caller.cpp | 643 +++++++++++++++++++++++++++--------------- src/fasta_query.cpp | 57 +--- src/sv_caller.cpp | 276 +++++++++++++----- 7 files changed, 659 insertions(+), 389 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 055e3247..c2961cc8 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -14,7 +14,8 @@ #include #include #include -#include +// #include +#include #include /// @endcond @@ -48,7 +49,8 @@ class CNVCaller { //mutable std::mutex snp_file_mtx; // SNP file mutex //mutable std::mutex pfb_file_mtx; // Population frequency file mutex //mutable std::mutex bam_file_mtx; // BAM file mutex - std::mutex& shared_mutex; + // std::mutex& shared_mutex; + std::shared_mutex& shared_mutex; // Define a map of CNV genotypes by HMM predicted 
state. // We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. @@ -82,10 +84,7 @@ class CNVCaller { std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const; public: - // explicit CNVCaller(const InputData& input_data); - // Constructor with no arguments - //CNVCaller() = default; - CNVCaller(std::mutex& mtx) : shared_mutex(mtx) {} + CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {} // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found @@ -96,7 +95,7 @@ class CNVCaller { double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const; - void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; + // void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const; diff --git a/include/fasta_query.h b/include/fasta_query.h index a0697446..4486bdb0 100644 --- a/include/fasta_query.h +++ b/include/fasta_query.h @@ -8,7 +8,8 @@ #include #include #include -#include +// #include +#include #include /// @endcond @@ -18,11 +19,10 @@ class ReferenceGenome { std::vector chromosomes; std::unordered_map chr_to_seq; std::map chr_to_length; - //mutable std::mutex mtx; - std::mutex& shared_mutex; + std::shared_mutex& shared_mutex; public: - ReferenceGenome(std::mutex& mtx) : shared_mutex(mtx) {} + ReferenceGenome(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {} int 
setFilepath(std::string fasta_filepath); std::string getFilepath() const; diff --git a/include/sv_caller.h b/include/sv_caller.h index e0b477e6..d795f44b 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -11,7 +11,8 @@ #include /// @cond -#include +// #include +#include #include #include /// @endcond @@ -30,28 +31,41 @@ struct IntervalNode { GenomicRegion region; std::string qname; hts_pos_t max_end; // To optimize queries - IntervalNode* left; - IntervalNode* right; + // IntervalNode* left; + // IntervalNode* right; + std::unique_ptr left; + std::unique_ptr right; IntervalNode(GenomicRegion r, std::string name) : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} }; -IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string qname) { - if (!root) - return new IntervalNode(region, qname); +// IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string +// qname) { +void insert(std::unique_ptr& root, GenomicRegion region, std::string qname) { + if (!root) { + // return new IntervalNode(region, qname); + root = std::make_unique(region, qname); + return; + } if (region.start < root->region.start) - root->left = insert(root->left, region, qname); - else - root->right = insert(root->right, region, qname); + { + // root->left = insert(root->left, region, qname); + insert(root->left, region, qname); + } else { + // root->right = insert(root->right, region, qname); + insert(root->right, region, qname); + } // Update max_end root->max_end = std::max(root->max_end, region.end); - return root; + // return root; } -void findOverlaps(IntervalNode* root, GenomicRegion query, std::vector& result) { +// void findOverlaps(IntervalNode* root, GenomicRegion query, +// std::vector& result) { +void findOverlaps(const std::unique_ptr& root, GenomicRegion query, std::vector& result) { if (!root) return; // If overlapping, add to result @@ -75,17 +89,17 @@ struct MismatchData { class SVCaller { private: int min_mapq = 20; // 
Minimum mapping quality to be considered - std::mutex shared_mutex; + mutable std::shared_mutex shared_mutex; // Shared mutex for thread safety std::vector getChromosomes(const std::string& bam_filepath); - std::vector getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::unordered_map& primary_map, std::unordered_map>& supp_map); + void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls); // Detect SVs from the CIGAR string of a read alignment, and return the // mismatch rate, and the start and end positions of the query sequence void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); - void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome); + void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. 
@@ -96,14 +110,16 @@ class SVCaller { int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); // Detect SVs from split alignments - void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data); + // void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data); // Calculate the mismatch rate given a map of query positions to // match/mismatch (1/0) values within a specified range of the query // sequence double calculateMismatchRate(const MismatchData& mismatch_data); - void saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const; + void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector &pos_depth_map, const InputData &input_data); + + void saveToVCF(const std::unordered_map> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome) const; // Calculate the read depth (INFO/DP) for a region int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); diff --git a/include/utils.h b/include/utils.h index 6715b00e..6ec95610 100644 --- a/include/utils.h +++ b/include/utils.h @@ -25,12 +25,15 @@ struct BamFileGuard { ~BamFileGuard() { if (idx) { hts_idx_destroy(idx); + idx = nullptr; } if (bamHdr) { bam_hdr_destroy(bamHdr); + bamHdr = nullptr; } if (fp_in) { sam_close(fp_in); + fp_in = nullptr; } } diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 16d62d4f..d1a711e9 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp 
@@ -24,6 +24,9 @@ #include #include // std::max #include // std::pair +#include +#include // std::execution::par +// #include #include "utils.h" #include "sv_types.h" @@ -52,28 +55,116 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end { // Initialize the SNP data with default values and sample size length int sample_size = input_data.getSampleSize(); - int region_length = (int) (end_pos - start_pos + 1); - if (region_length < sample_size) - { - sample_size = region_length; - } - - std::vector snp_pos(sample_size, 0); - std::vector snp_baf(sample_size, -1.0); - std::vector snp_pfb(sample_size, 0.5); - std::vector snp_log2_cov(sample_size, 0.0); - std::vector is_snp(sample_size, false); + // int region_length = (int) (end_pos - start_pos + 1); + // if (region_length < sample_size) + // { + // sample_size = region_length; + // } + + // std::vector snp_pos(sample_size, 0); + // std::vector snp_baf(sample_size, -1.0); + // std::vector snp_pfb(sample_size, 0.5); + // std::vector snp_log2_cov(sample_size, 0.0); + // std::vector is_snp(sample_size, false); + std::vector snp_pos; + std::vector snp_baf; + std::vector snp_pfb; + std::vector is_snp; this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data); // Get the log2 ratio for evenly spaced positions in the // region - this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov); + // this->calculateSNPLog2Ratios(snp_pos, snp_log2_cov, pos_depth_map, + // mean_chr_cov, input_data); + sample_size = std::max((int) snp_pos.size(), sample_size); + // std::vector snp_pos_hmm(sample_size, 0); + // std::vector snp_baf_hmm(sample_size, -1.0); + // std::vector snp_pfb_hmm(sample_size, 0.5); + // std::vector snp_log2_hmm(sample_size, 0.0); + // std::vector is_snp_hmm(sample_size, false); + std::vector snp_pos_hmm; + std::vector snp_baf_hmm; + std::vector snp_pfb_hmm; + std::vector snp_log2_hmm; + std::vector 
is_snp_hmm; + + // Loop through evenly spaced positions in the region and get the log2 ratio + double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size; + // Convert SNP positions for faster access (convert to a set) + std::unordered_set snp_pos_set(snp_pos.begin(), snp_pos.end()); + + for (int i = 0; i < sample_size; i++) + { + // Calculate the mean depth for the window + double cov_sum = 0.0; + int pos_count = 0; + for (int j = 0; j < pos_step; j++) + { + uint32_t pos = (uint32_t) (start_pos + i * pos_step + j); + if (pos > end_pos) + { + break; + } + try + { + cov_sum += pos_depth_map.at(pos); + pos_count++; + } + catch (const std::out_of_range& e) + { + // Ignore out of range errors + } + } + double log2_cov = 0.0; + if (pos_count > 0) + { + log2_cov = log2((cov_sum / (double) pos_count) / mean_chr_cov); + } + + // Loop through positions and get the log2 ratio + bool snp_found_in_sample = false; + for (int j = 0; j < pos_step; j++) + { + uint32_t pos = (uint32_t) (start_pos + i * pos_step + j); + if (pos > end_pos) + { + break; + } + + // Check if the position is a SNP + if (snp_pos_set.find(pos) != snp_pos_set.end()) + { + // Update the SNP data + snp_pos_hmm.push_back(pos); + snp_baf_hmm.push_back(snp_baf[i]); + snp_pfb_hmm.push_back(snp_pfb[i]); + snp_log2_hmm.push_back(log2_cov); + is_snp_hmm.push_back(true); + snp_found_in_sample = true; + } + } + + // If no SNP was found in the sample, then use the middle of the window + // as a placeholder + // This is to ensure that the HMM has a value for every position in the + // sample + if (!snp_found_in_sample) + { + uint32_t pos = (uint32_t) (start_pos + (i * pos_step) + (pos_step / 2.0)); + snp_pos_hmm.push_back(pos); + snp_baf_hmm.push_back(-1.0); + snp_pfb_hmm.push_back(0.5); + snp_log2_hmm.push_back(log2_cov); + is_snp_hmm.push_back(false); + } + } + // this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov); // Update the SNP data with all 
information snp_data.pos = std::move(snp_pos); snp_data.baf = std::move(snp_baf); snp_data.pfb = std::move(snp_pfb); - snp_data.log2_cov = std::move(snp_log2_cov); + snp_data.log2_cov = std::move(snp_log2_hmm); snp_data.is_snp = std::move(is_snp); } @@ -249,8 +340,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector lock(this->shared_mutex); // Lock the BAM file + // std::shared_lock lock(this->shared_mutex); // + // Lock the BAM file + printMessage("Opening BAM file: " + bam_filepath); samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) { @@ -324,12 +393,8 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector= 0) { // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads - if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP) + uint16_t flag = bam_record->core.flag; + if (flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP)) { continue; } + // if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP) + // { + // continue; + // } // Parse the CIGAR string to get the depth (match, sequence match, and // mismatch) @@ -392,11 +468,17 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector= chr_pos_depth_map.size()) + { + printError("ERROR: Reference position out of range for " + chr + ":" + std::to_string(ref_pos+j)); + continue; } + chr_pos_depth_map[ref_pos + j]++; + // try { + // chr_pos_depth_map[ref_pos + j]++; + // } catch (const std::out_of_range& oor) { + // printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j)); + // } } } @@ -416,55 +498,111 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector 0) - { - cum_depth += pos_depth; - pos_count++; - } - } - - double mean_chr_cov = 0.0; - if (pos_count > 0) - { - 
mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); - } + printMessage("Finished reading BAM file, calculating mean chromosome coverage..."); + + // // Calculate the mean chromosome coverage for positions with non-zero depth + // uint64_t cum_depth = 0; + // uint32_t pos_count = 0; + // for (const auto& pos_depth : chr_pos_depth_map) + // { + // if (pos_depth > 0) + // { + // cum_depth += pos_depth; + // pos_count++; + // } + // } + + // double mean_chr_cov = 0.0; + // if (pos_count > 0) + // { + // mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); + // } + // printMessage("Completed calculating mean chromosome coverage: " + + // std::to_string(mean_chr_cov)); + + // Parallel sum of the depth map + uint64_t cum_depth = std::reduce( + std::execution::par, + chr_pos_depth_map.begin(), + chr_pos_depth_map.end(), + 0ULL + ); + + // Parallel count of the non-zero depth positions + uint32_t pos_count = std::count_if( + std::execution::par, + chr_pos_depth_map.begin(), + chr_pos_depth_map.end(), + [](uint32_t depth) { return depth > 0; } + ); + + printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count)); + printMessage("Total depth: " + std::to_string(cum_depth)); + + double mean_chr_cov = (pos_count > 0) ? 
static_cast(cum_depth) / static_cast(pos_count) : 0.0; + printMessage("Completed calculating mean chromosome coverage: " + std::to_string(mean_chr_cov)); return mean_chr_cov; } -void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& log2_region) const -{ - uint32_t region_length = end_pos - start_pos + 1; - for (int i = 0; i < sample_size; i++) - { - uint32_t pos = start_pos + ((double)region_length / sample_size) * i; - try { - uint32_t depth = pos_depth_map.at(pos); - - // Calculate the log2 ratio for the position - if (depth == 0) - { - log2_region[i] = 0.0; - } else { - log2_region[i] = log2((double) depth / mean_chr_cov); - } - - } catch (const std::out_of_range& e) { - log2_region[i] = 0.0; - } - } -} +// void CNVCaller::calculateSNPLog2Ratios(const std::vector& snp_pos, const std::vector& snp_log2_cov, const std::vector& pos_depth_map, double mean_chr_cov) const +// { +// // Calculate the log2 ratio for each SNP position +// for (size_t i = 0; i < snp_pos.size(); i++) +// { +// uint32_t pos = snp_pos[i]; +// try { +// uint32_t depth = pos_depth_map.at(pos); + +// // Calculate the log2 ratio for the position +// if (depth == 0) +// { +// snp_log2_cov[i] = 0.0; +// } else { +// snp_log2_cov[i] = log2((double) depth / mean_chr_cov); +// } + +// } catch (const std::out_of_range& e) { +// snp_log2_cov[i] = 0.0; +// } +// } +// } + +// void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& log2_region, std::vector& snp_pos) const +// { +// uint32_t region_length = end_pos - start_pos + 1; +// double step_size = (double) region_length / sample_size; +// std::set snp_pos_set(snp_pos.begin(), snp_pos.end()); + +// // Loop through each interval in the region and calculate the log2 ratio +// for (int i = 0; i < sample_size; i++) +// { +// uint32_t pos = 
start_pos + (uint32_t) (i * step_size); +// if (pos > end_pos) +// { +// pos = end_pos; +// } +// try { +// uint32_t depth = pos_depth_map.at(pos); + +// // Calculate the log2 ratio for the position +// if (depth == 0) +// { +// log2_region[i] = 0.0; +// } else { +// log2_region[i] = log2((double) depth / mean_chr_cov); +// } + +// } catch (const std::out_of_range& e) { +// log2_region[i] = 0.0; +// } +// } +// } void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const { // Lock during reading - std::lock_guard lock(this->shared_mutex); + std::shared_lock lock(this->shared_mutex); // --------- SNP file --------- const std::string snp_filepath = input_data.getSNPFilepath(); @@ -475,7 +613,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } // Initialize the SNP file reader - // printMessage("Initializing SNP reader..."); bcf_srs_t *snp_reader = bcf_sr_init(); if (!snp_reader) { @@ -484,19 +621,23 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui } snp_reader->require_index = 1; - // Use multi-threading. 
This is possible here due to the lock + // Use multi-threading if not threading by chromosome int thread_count = input_data.getThreadCount(); + // if (!input_data.isSingleChr()) + // { + // // Use half of the threads for SNP reading + // thread_count = std::max(1, input_data.getThreadCount() / 2); + // } + printMessage("Using " + std::to_string(thread_count) + " threads for SNP reading..."); bcf_sr_set_threads(snp_reader, thread_count); // Add the SNP file to the reader - // printMessage("Adding SNP file to reader..."); if (bcf_sr_add_reader(snp_reader, snp_filepath.c_str()) < 0) { bcf_sr_destroy(snp_reader); printError("ERROR: Could not add SNP file to reader: " + snp_filepath); return; } - // printMessage("SNP file added to reader."); // --------- Population allele frequency file --------- @@ -507,7 +648,16 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { use_pfb = false; } - + + // Ensure the file exists (ifsstream will throw an exception if the file + // does not exist) + std::ifstream pfb_file(pfb_filepath); + if (!pfb_file) + { + use_pfb = false; + } + pfb_file.close(); + bcf_srs_t *pfb_reader = bcf_sr_init(); std::string chr_gnomad; std::string AF_key; @@ -552,7 +702,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui pfb_reader->require_index = 1; // Add the population allele frequency file to the reader - // printMessage("Adding population allele frequency file to reader..."); if (bcf_sr_add_reader(pfb_reader, pfb_filepath.c_str()) < 0) { printError("ERROR: Could not add population allele frequency file to reader: " + pfb_filepath); @@ -563,188 +712,212 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui return; } - // Use multi-threading. 
This is possible here due to the lock + // Use multi-threading if not threading by chromosome + int thread_count = input_data.getThreadCount(); + // if (!input_data.isSingleChr()) + // { + // // Use half of the threads for population allele frequency reading + // thread_count = std::max(1, input_data.getThreadCount() / 2); + // } + printMessage("Using " + std::to_string(thread_count) + " threads for population allele frequency reading..."); bcf_sr_set_threads(pfb_reader, thread_count); } // Split the region into samples - int sample_size = snp_pos.size(); - std::vector region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size); + // int sample_size = snp_pos.size(); + // std::vector region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size); // Loop through the samples and read the SNP data, storing the first // SNP position and BAF value for each sample - // int print_count = 0; int current_region = 0; - for (size_t i = 0; i < region_chunks.size(); ++i) - { - current_region++; - // Lock during reading - // std::lock_guard lock(this->shared_mutex); + // for (size_t i = 0; i < region_chunks.size(); ++i) + // { + // current_region++; + + // Read the SNP data ---------------------------------------------- - // Read the SNP data ---------------------------------------------- + // Set the region + printMessage("Setting region for SNP reader..."); + // std::string region_str = region_chunks[i]; + if (bcf_sr_set_regions(snp_reader, chr.c_str(), 0) < 0) + { + printError("ERROR: Could not set region for SNP reader: " + chr); + bcf_sr_destroy(snp_reader); + bcf_sr_destroy(pfb_reader); + return; + } - // Set the region - // printMessage("Setting region for SNP reader..."); - std::string region_str = region_chunks[i]; - if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) + printMessage("Region set for SNP reader, loading SNP data..."); + bool snp_found = false; + while (bcf_sr_next_line(snp_reader) > 0) + { + if 
(!bcf_sr_has_line(snp_reader, 0)) { - printError("ERROR: Could not set region for SNP reader: " + region_str); - break; + continue; } - // printMessage("Region set for SNP reader, loading SNP data..."); - - bool snp_found = false; - while (bcf_sr_next_line(snp_reader) > 0) + bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0); + if (snp_record) { - if (!bcf_sr_has_line(snp_reader, 0)) + uint32_t pos = (uint32_t)snp_record->pos + 1; + + // Skip if not a SNP + if (!bcf_is_snp(snp_record)) { continue; } - bcf1_t *snp_record = bcf_sr_get_line(snp_reader, 0); - if (snp_record) - { - uint32_t pos = (uint32_t)snp_record->pos + 1; - - // Skip if not a SNP - if (!bcf_is_snp(snp_record)) - { - continue; - } - // Get the QUAL, DP, and AD values - if (bcf_float_is_missing(snp_record->qual) || snp_record->qual <= 30) - { - continue; - } + // Get the QUAL, DP, and AD values + if (bcf_float_is_missing(snp_record->qual) || snp_record->qual <= 30) + { + continue; + } - // Extract DP from FORMAT field - int32_t *dp = 0; - int dp_count = 0; - int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count); - if (dp_ret < 0 || dp[0] <= 10) - { - continue; - } - free(dp); + // Extract DP from FORMAT field + int32_t *dp = 0; + int dp_count = 0; + int dp_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "DP", &dp, &dp_count); + if (dp_ret < 0 || dp[0] <= 10) + { + continue; + } + free(dp); - // Skip if the SNP does not pass the filter - if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast("PASS")) != 1) - { - continue; - } + // Skip if the SNP does not pass the filter + if (bcf_has_filter(snp_reader->readers[0].header, snp_record, const_cast("PASS")) != 1) + { + continue; + } - // Extract AD from FORMAT field - int32_t *ad = 0; - int ad_count = 0; - int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count); - if (ad_ret < 0 || ad_count < 2) - { - continue; - } + // Extract 
AD from FORMAT field + int32_t *ad = 0; + int ad_count = 0; + int ad_ret = bcf_get_format_int32(snp_reader->readers[0].header, snp_record, "AD", &ad, &ad_count); + if (ad_ret < 0 || ad_count < 2) + { + continue; + } - // Calculate the B-allele frequency (BAF) - double baf = (double) ad[1] / (double) (ad[0] + ad[1]); - free(ad); + // Calculate the B-allele frequency (BAF) + double baf = (double) ad[1] / (double) (ad[0] + ad[1]); + free(ad); + + // Add the SNP position and BAF information + snp_pos.push_back(pos); + snp_baf.push_back(baf); + // is_snp.push_back(true); + snp_pfb.push_back(0.5); + // snp_pos[i] = pos; + // snp_baf[i] = baf; + // is_snp[i] = true; + snp_found = true; + // break; // Only one SNP per region + } + } - // Add the SNP position and BAF information - snp_pos[i] = pos; - snp_baf[i] = baf; - is_snp[i] = true; - snp_found = true; + if (snp_reader->errnum) + { + printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum))); + } - break; // Only one SNP per region - } - } + // Continue if no SNP was found in the region + if (!snp_found) + { + printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos)); + bcf_sr_destroy(snp_reader); + bcf_sr_destroy(pfb_reader); + return; + } - if (snp_reader->errnum) + // Read the population allele frequency data ---------------------- + // Get the minimum and maximum SNP positions + uint32_t min_snp_pos = *std::min_element(snp_pos.begin(), snp_pos.end()); + uint32_t max_snp_pos = *std::max_element(snp_pos.begin(), snp_pos.end()); + std::unordered_set snp_pos_set(snp_pos.begin(), snp_pos.end()); + std::unordered_map snp_index_map; + for (size_t i = 0; i < snp_pos.size(); i++) + { + snp_index_map[snp_pos[i]] = i; + } + if (use_pfb) + { + // Set the region for the population allele frequency reader + std::string pfb_region_str = chr_gnomad + ":" + std::to_string(min_snp_pos) + "-" + std::to_string(max_snp_pos); + printMessage("Setting region for 
population allele frequency reader: " + pfb_region_str); + if (bcf_sr_set_regions(pfb_reader, pfb_region_str.c_str(), 0) < 0) { - printError("ERROR: " + std::string(bcf_sr_strerror(snp_reader->errnum))); + printError("ERROR: Could not set region for population allele frequency reader: " + pfb_region_str); } - // Continue if no SNP was found in the region - if (!snp_found) - { - continue; - } + printMessage("Loading population allele frequency data..."); + // for (size_t i = 0; i < snp_pos.size(); ++i) + // { + // Set the region as the SNP position + // printMessage("Setting region for population allele frequency reader..."); + // uint32_t target_snp_pos = snp_pos[i]; // Already 1-based + // std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); + // if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) + // { + // printError("ERROR: Could not set region for population allele frequency reader: " + snp_region_str); + // } + // printMessage("Region set for population allele frequency reader, loading population allele frequency data..."); - // Read the population allele frequency data ---------------------- - if (use_pfb) + // Find the SNP position in the population allele frequency file + float *pfb_f = NULL; + int count = 0; + while (bcf_sr_next_line(pfb_reader) > 0) { - // Set the region as the SNP position - // printMessage("Setting region for population allele frequency reader..."); - uint32_t target_snp_pos = snp_pos[i]; // Already 1-based - std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); - if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) + // Get the SNP record and validate + bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); + if (!pfb_record || !bcf_is_snp(pfb_record)) { - printError("ERROR: Could not set region for population allele frequency reader: " + region_str); - break; + continue; // 
Skip if not a SNP } - // printMessage("Region set for population allele frequency reader, loading population allele frequency data..."); - // Find the SNP position in the population allele frequency file - float *pfb_f = NULL; - int count = 0; - while (bcf_sr_next_line(pfb_reader) > 0) + // Get the SNP position + uint32_t pfb_pos = (uint32_t) pfb_record->pos + 1; + if (snp_pos_set.find(pfb_pos) == snp_pos_set.end()) { - // Get the SNP record and validate - bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); - if (!pfb_record || !bcf_is_snp(pfb_record)) - { - continue; // Skip if not a SNP - } + continue; // Skip if the SNP position is not in the set + } - // if (!bcf_sr_has_line(pfb_reader, 0)) - // { - // continue; - // } - // bcf1_t *pfb_record = bcf_sr_get_line(pfb_reader, 0); - // if (pfb_record) - // { - // // Skip if not a SNP - // if (!bcf_is_snp(pfb_record)) - // { - // continue; - // } - - // Get the population frequency for the SNP - // float *pfb_f = NULL; - // int count = 0; - int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); - if (pfb_status < 0 || count == 0) - { - continue; - } - // double pfb = (double) pfb_f[0]; - double pfb = static_cast(pfb_f[0]); - // free(pfb_f); + // Get the SNP position index + size_t i = snp_index_map[pfb_pos]; - // Skip if outside the acceptable range - if (pfb <= MIN_PFB || pfb >= MAX_PFB) - { - continue; - } + // Get the population frequency for the SNP + int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); + if (pfb_status < 0 || count == 0) + { + continue; + } + // double pfb = (double) pfb_f[0]; + double pfb = static_cast(pfb_f[0]); + // free(pfb_f); - // Add the population frequency to the SNP data - snp_pfb[i] = pfb; + // Skip if outside the acceptable range + if (pfb <= MIN_PFB || pfb >= MAX_PFB) + { + continue; + } - // Break after finding the SNP position - break; + // Add the population 
frequency to the SNP data + snp_pfb[i] = pfb; - // if (print_count < 20) { - // printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")"); - // print_count++; - // } - } - free(pfb_f); + break; // Break after finding the SNP position + // if (print_count < 20) { + // printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")"); + // print_count++; // } - if (pfb_reader->errnum) - { - printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); - } } - // printMessage("SNP region " + std::to_string(current_region) + " of " + std::to_string(region_chunks.size()) + " completed."); + free(pfb_f); + + // if (pfb_reader->errnum) + // { + // printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); + // } + // } } + // } // Clean up bcf_sr_destroy(snp_reader); diff --git a/src/fasta_query.cpp b/src/fasta_query.cpp index 445643cc..84a2ae07 100644 --- a/src/fasta_query.cpp +++ b/src/fasta_query.cpp @@ -34,8 +34,6 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) } // Get the chromosomes and sequences - // std::vector chromosomes; - // std::unordered_map chr_to_seq; std::string current_chr = ""; std::string sequence = ""; std::string line_str = ""; @@ -51,13 +49,10 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) this->chromosomes.push_back(current_chr); // Add the chromosome to the list this->chr_to_seq[current_chr] = sequence; // Add the sequence to the map this->chr_to_length[current_chr] = sequence.length(); // Add the sequence length to the map - // chromosomes.push_back(current_chr); // Add the chromosome to the list - // chr_to_seq[current_chr] = sequence; // Add the sequence to the map sequence = ""; // Reset the sequence } - // Get the new chromosome - current_chr = line_str.substr(1); + 
current_chr = line_str.substr(1); // Remove the '>' character // Remove the description size_t space_pos = current_chr.find(" "); @@ -65,15 +60,7 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) { current_chr.erase(space_pos); } - - // Check if the chromosome is already in the map - // if (chr_to_seq.find(current_chr) != chr_to_seq.end()) - // { - // std::cerr << "Duplicate chromosome " << current_chr << std::endl; - // exit(1); - // } } else { - // Sequence line sequence += line_str; } } @@ -84,21 +71,11 @@ int ReferenceGenome::setFilepath(std::string fasta_filepath) this->chromosomes.push_back(current_chr); // Add the chromosome to the list this->chr_to_seq[current_chr] = sequence; // Add the sequence to the map this->chr_to_length[current_chr] = sequence.length(); // Add the sequence length to the map - // chromosomes.push_back(current_chr); // Add the chromosome to the list - // chr_to_seq[current_chr] = sequence; // Add the sequence to the map } - // Close the file fasta_file.close(); - - // Sort the chromosomes - // std::sort(chromosomes.begin(), chromosomes.end()); std::sort(this->chromosomes.begin(), this->chromosomes.end()); - // Set the chromosomes and sequences - // this->chromosomes = chromosomes; - // this->chr_to_seq = chr_to_seq; - return 0; } @@ -109,55 +86,36 @@ std::string ReferenceGenome::getFilepath() const // Function to get the reference sequence at a given position range std::string_view ReferenceGenome::query(const std::string& chr, uint32_t pos_start, uint32_t pos_end) const -{ - // printMessage("Querying reference genome"); - // std::lock_guard lock(this->shared_mutex); - +{ // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) pos_start--; pos_end--; // Ensure that the end position is not larger than the chromosome length - // if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) const std::string& sequence = this->chr_to_seq.at(chr); if (pos_end >= sequence.length() || pos_start > pos_end) { 
return {}; } - // uint32_t length = pos_end - pos_start + 1; - - // If the subsequence is empty, return empty string - // if (sequence.substr(pos_start, length).empty()) - // { - // return ""; - // } - - // return sequence.substr(pos_start, length); return std::string_view(sequence).substr(pos_start, (pos_end - pos_start) + 1); } // Function to compare the reference sequence at a given position range bool ReferenceGenome::compare(const std::string& chr, uint32_t pos_start, uint32_t pos_end, const std::string& compare_seq, float match_threshold) const -{ - // std::lock_guard lock(this->shared_mutex); - +{ // Convert positions from 1-indexed (reference) to 0-indexed (string indexing) pos_start--; pos_end--; // Ensure that the end position is not larger than the chromosome length - // if (pos_end >= (uint32_t)this->chr_to_seq.at(chr).length()) const std::string& sequence = this->chr_to_seq.at(chr); if (pos_end >= sequence.length() || pos_start >= pos_end) { return {}; } - // Get the subsequence std::string_view subseq = std::string_view(sequence).substr(pos_start, pos_end - pos_start + 1); - - // Ensure the lengths are equal if (subseq.length() != compare_seq.length()) { printError("ERROR: Sequence lengths do not match for comparison"); @@ -175,14 +133,13 @@ bool ReferenceGenome::compare(const std::string& chr, uint32_t pos_start, uint32 } float match_rate = (float)num_matches / (float)subseq.length(); - // Check if the match rate is above the threshold return match_rate >= match_threshold; } // Function to get the chromosome contig lengths in VCF header format std::string ReferenceGenome::getContigHeader() const { - std::lock_guard lock(this->shared_mutex); + std::shared_lock lock(this->shared_mutex); std::string contig_header = ""; // Sort the chromosomes @@ -192,13 +149,10 @@ std::string ReferenceGenome::getContigHeader() const chromosomes.push_back(chr_seq.first); } std::sort(chromosomes.begin(), chromosomes.end()); - - // Iterate over the chromosomes and add them 
to the contig header for (auto const& chr : chromosomes) { // Add the contig header line contig_header += "##contig=\n"; - // contig_header += "##contig=\n"; } // Remove the last newline character @@ -209,13 +163,10 @@ std::string ReferenceGenome::getContigHeader() const std::vector ReferenceGenome::getChromosomes() const { - // std::lock_guard lock(this->shared_mutex); return this->chromosomes; } uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const { - // std::lock_guard lock(this->shared_mutex); - // return this->chr_to_seq.at(chr).length(); return this->chr_to_length.at(chr); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index cc72a247..d2e5fa31 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -37,7 +37,8 @@ int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) { - std::lock_guard lock(this->shared_mutex); + // std::lock_guard lock(this->shared_mutex); + std::shared_lock lock(this->shared_mutex); int ret = sam_itr_next(fp_in, itr, bam1); return ret; } @@ -66,19 +67,22 @@ std::vector SVCaller::getChromosomes(const std::string &bam_filepat return chromosomes; } -std::vector SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string ®ion, std::unordered_map &primary_map, std::unordered_map> &supp_map) +void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string ®ion, std::vector& sv_calls) { + std::unordered_map primary_map; + std::unordered_map> supp_map; + // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { printError("ERROR: failed to initialize BAM record"); - return {}; + return; } hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); if (!itr) { bam_destroy1(bam1); printError("ERROR: failed to query region " + region); - return {}; + return; } uint32_t primary_count = 0; @@ -133,11 +137,13 @@ std::vector SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, // start, end vs. 
supplementary alignment start, end positions, keeping the // median of the largest cluster for the primary and supplementary positions // as the final genome coordinates of the SV - IntervalNode* root = nullptr; + // IntervalNode* root = nullptr; + std::unique_ptr root = nullptr; for (const auto& entry : primary_map) { const std::string& qname = entry.first; const GenomicRegion& region = entry.second; - root = insert(root, region, qname); + // root = insert(root, region, qname); + insert(root, region, qname); } std::vector> primary_clusters; std::set processed; @@ -302,7 +308,7 @@ std::vector SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, } } - return sv_candidates; + // return sv_candidates; } @@ -416,8 +422,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); - // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", - // "LSEQSIM", "./.", default_lh, read_depth); SVCall sv_call(bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); continue; @@ -435,7 +439,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); SVCall sv_call(bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); - // addSVCall(sv_calls, bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth); continue; } } @@ -454,7 +457,6 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); - // addSVCall(sv_calls, ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth); // Check if the CIGAR operation is a deletion } else 
if (op == BAM_CDEL && is_primary) { @@ -462,15 +464,8 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); - // addSVCall(sv_calls, ref_pos, ref_end, SVType::DEL, "", - // "CIGARDEL", "./.", default_lh, read_depth); SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); - - // Print if the ref pos is within the range 44007800-44007930 - if (ref_pos >= 44007800 && ref_pos <= 44007930) { - printMessage("DEL: " + chr + ":" + std::to_string(ref_pos) + "-" + std::to_string(ref_end) + " (LENGTH " + std::to_string(op_len) + ")"); - } } } @@ -487,7 +482,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } } -void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome) +void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls) { double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); int dbscan_min_pts = input_data.getDBSCAN_MinPts(); @@ -500,9 +495,16 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v return; } - // Set multi-threading - int num_threads = input_data.getThreadCount(); - hts_set_threads(fp_in, num_threads); + // Use multi-threading for the BAM file + // int thread_count = input_data.getThreadCount(); + // // if (!input_data.isSingleChr()) { + // // // Use half the threads for chromosomes, the other half for file I/O + // // thread_count = std::max(1, thread_count / 2); + // // } + // printMessage("Using " + std::to_string(thread_count) + " threads for BAM file I/O"); + // 
// int num_threads = input_data.getThreadCount(); + // hts_set_threads(fp_in, thread_count); + hts_set_threads(fp_in, 1); // Disable multi-threading for now // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); @@ -524,7 +526,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Set the region to process std::string region = chr; - uint32_t chr_len = ref_genome.getChromosomeLength(chr); + // uint32_t chr_len = ref_genome.getChromosomeLength(chr); // uint32_t chr_len = bamHdr->target_len[bam_name2id(bamHdr, chr.c_str())]; if (input_data.isRegionSet()) { @@ -536,14 +538,15 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } // Load chromosome data for copy number predictions - printMessage(chr + ": Loading chromosome data..."); + // printMessage(chr + ": Loading chromosome data..."); CNVCaller cnv_caller(this->shared_mutex); - std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index - int thread_count = input_data.getThreadCount(); - double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count); - if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { - return; - } + // std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index + + // // Use only half the threads for chromosomes, the other half for file I/O + // double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count); + // if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { + // return; + // } // Estimate DBSCAN minimum points double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct(); @@ -564,26 +567,30 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - if (region_sv_count > 0) { - printMessage(chr + ": CIGAR predictions..."); - 
cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - } + // if (region_sv_count > 0) { + // printMessage(chr + ": CIGAR predictions..."); + // cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + // } + // [TEST] Before this section has no memory leaks // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); - std::vector split_sv_calls; - this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - - // // Merge the split-read SVs separately - printMessage(chr + ": Merging split reads..."); - double split_epsilon = 0.45; - int split_min_pts = 2; // This is low since split alignments were already previously merged - mergeSVs(split_sv_calls, split_epsilon, split_min_pts); - - printMessage(chr + ": Unifying SVs..."); - chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); - - // mergeSVSubsets(chr_sv_calls); + // std::unordered_map primary_map; + // std::unordered_map> supp_map; + // std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, + // bamHdr, region, primary_map, supp_map); + this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls); + // std::vector split_sv_calls; + // this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); + + // // // Merge the split-read SVs separately + // printMessage(chr + ": Merging split reads..."); + // double split_epsilon = 0.45; + // int split_min_pts = 2; // This is low since split alignments were already previously merged + // mergeSVs(split_sv_calls, split_epsilon, split_min_pts); + + // printMessage(chr + ": Unifying SVs..."); + // chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); // Sort the SV calls by start position 
std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { @@ -598,7 +605,9 @@ void SVCaller::run(const InputData& input_data) // Set up the reference genome printMessage("Loading the reference genome..."); const std::string ref_filepath = input_data.getRefGenome(); - ReferenceGenome ref_genome(this->shared_mutex); + std::shared_mutex ref_mutex; // Dummy mutex (remove later) + // ReferenceGenome ref_genome(this->shared_mutex); + ReferenceGenome ref_genome(ref_mutex); ref_genome.setFilepath(ref_filepath); // Get the chromosomes @@ -611,33 +620,75 @@ void SVCaller::run(const InputData& input_data) // Get the chromosomes from the input BAM file chromosomes = this->getChromosomes(input_data.getLongReadBam()); } + + // [TEST] Keep only the first 6 chromosomes + chromosomes.resize(4); + // Remove the first chromosome + chromosomes.erase(chromosomes.begin()); + printMessage("Chromosomes: " + std::to_string(chromosomes.size())); + for (const auto& chr : chromosomes) { + printMessage(" " + chr); + } // Read the HMM from the file std::string hmm_filepath = input_data.getHMMFilepath(); std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); - // Use multi-threading across chromosomes unless a single chromosome is - // specified - int max_threads = 1; - if (!input_data.isSingleChr()) { - max_threads = input_data.getThreadCount(); - std::cout << "Using " << max_threads << " threads for processing..." 
<< std::endl; + // Calculate the mean chromosome coverage and generate the position depth + // maps for each chromosome (I/O is multi-threaded, which is more efficient + // than per-chromosome multi-threading in this case) + std::shared_mutex shared_mutex; + CNVCaller cnv_caller(shared_mutex); + std::unordered_map> chr_pos_depth_map; + std::unordered_map chr_mean_cov_map; + const std::string bam_filepath = input_data.getLongReadBam(); + int chr_thread_count = input_data.getThreadCount(); + std::cout << "Reading chromosome coverage..." << std::endl; + std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl; + int current_chr = 0; + int total_chr_count = chromosomes.size(); + for (const auto& chr : chromosomes) { + current_chr++; + uint32_t chr_len = ref_genome.getChromosomeLength(chr); + if (chr_len == 0) { + printError("ERROR: chromosome " + chr + " not found in reference genome"); + return; + } + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "..."); + std::vector pos_depth_map(chr_len+1, 0); // 1-based index + double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count); + if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) { + printError("ERROR: failed to calculate mean chromosome coverage for " + chr); + return; + } + chr_pos_depth_map[chr] = std::move(pos_depth_map); + chr_mean_cov_map[chr] = mean_chr_cov; + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov)); } - ThreadPool pool(max_threads); + printMessage("Completed reading chromosome coverage."); - // Shared resources + // Use multi-threading across chromosomes. 
If a single chromosome is + // specified, use a single main thread (multi-threading is used for file I/O) + int thread_count = 1; + if (!input_data.isSingleChr()) { + thread_count = input_data.getThreadCount(); + std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; + } + ThreadPool pool(thread_count); std::unordered_map> whole_genome_sv_calls; - - // Lambda to process a chromosome + std::unordered_map> whole_genome_split_sv_calls; auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; + std::vector split_sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome); + this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); { - std::lock_guard lock(this->shared_mutex); + // std::lock_guard lock(this->shared_mutex); + std::shared_lock lock(this->shared_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); + whole_genome_split_sv_calls[chr] = std::move(split_sv_calls); } // printMessage("Completed chromosome " + chr); } catch (const std::exception& e) { @@ -657,8 +708,7 @@ void SVCaller::run(const InputData& input_data) } // Wait for all tasks to complete - int total_chr_count = futures.size(); - int current_chr = 0; + current_chr = 0; for (auto& future : futures) { try { current_chr++; @@ -672,6 +722,42 @@ void SVCaller::run(const InputData& input_data) } printMessage("All tasks have finished."); + // Run copy number variant predictions on the SVs detected from the + // CIGAR string, using a minimum CNV length threshold + printMessage("Running copy number predictions on CIGAR SVs..."); + for (auto& entry : whole_genome_sv_calls) { + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { + printMessage("Running copy number predictions on " + chr + "..."); + 
cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + } + } + + printMessage("Running copy number predictions on split-read SVs..."); + for (auto& entry : whole_genome_split_sv_calls) { + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { + printMessage("Running copy number predictions on " + chr + "..."); + this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + + // Merge the split-read SVs separately + printMessage(chr + ": Merging split reads..."); + double split_epsilon = 0.45; + int split_min_pts = 2; // This is low since split alignments were already previously merged + mergeSVs(sv_calls, split_epsilon, split_min_pts); + } + } + + printMessage("Unifying SVs..."); + for (auto& entry : whole_genome_split_sv_calls) { + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end()); + } + // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); + // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; for (const auto& entry : whole_genome_sv_calls) { @@ -690,21 +776,64 @@ void SVCaller::run(const InputData& input_data) // Detect SVs from split read alignments -void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) -{ - // printMessage(region + ": Getting split alignments..."); - std::unordered_map primary_map; - std::unordered_map> supp_map; - std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); +// void 
SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) +// { +// // printMessage(region + ": Getting split alignments..."); +// // std::unordered_map primary_map; +// // std::unordered_map> supp_map; +// std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); + +// // Run copy number predictions on the SVs detected from the split reads +// printMessage(region + ": Split read predictions..."); +// int current_sv = 0; +// int total_svs = sv_candidates.size(); +// for (auto& sv_candidate : sv_candidates) { +// bool is_inversion = sv_candidate.sv_type == SVType::INV; + +// std::tuple result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); +// if (std::get<1>(result) == SVType::UNKNOWN) { +// continue; +// } + +// double supp_lh = std::get<0>(result); +// SVType supp_type = std::get<1>(result); +// std::string genotype = std::get<2>(result); +// if (supp_type != SVType::UNKNOWN) { +// if (is_inversion) { +// if (supp_type == SVType::DEL) { +// supp_type = SVType::INV_DEL; +// } else if (supp_type == SVType::DUP) { +// supp_type = SVType::INV_DUP; +// } else if (supp_type == SVType::NEUTRAL) { +// supp_type = SVType::INV; +// } +// } + +// if (supp_type != SVType::NEUTRAL) { +// int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); +// std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; +// SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); +// addSVCall(split_sv_calls, sv_call); +// } +// } +// current_sv++; +// if (current_sv % 1000 == 0) { +// printMessage("Processed " + 
std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); +// } +// } +// } +// Detect SVs from split read alignments +void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) +{ // Run copy number predictions on the SVs detected from the split reads - printMessage(region + ": Split read predictions..."); + printMessage("Split read predictions..."); int current_sv = 0; - int total_svs = sv_candidates.size(); - for (auto& sv_candidate : sv_candidates) { + int total_svs = split_sv_calls.size(); + for (auto& sv_candidate : split_sv_calls) { bool is_inversion = sv_candidate.sv_type == SVType::INV; - std::tuple result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); + std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); if (std::get<1>(result) == SVType::UNKNOWN) { continue; } @@ -712,8 +841,6 @@ void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); std::string genotype = std::get<2>(result); - - // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); if (supp_type != SVType::UNKNOWN) { if (is_inversion) { if (supp_type == SVType::DEL) { @@ -764,6 +891,7 @@ void SVCaller::saveToVCF(const std::unordered_map header_lines = { std::string("##reference=") + ref_genome.getFilepath(), contig_header, From f0f84275d2fd2246d16b8f5a9a8b9451e2d049eb Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 6 Mar 2025 09:05:44 -0500 Subject: [PATCH 079/134] Fix inversion detection --- include/cnv_caller.h | 3 +- src/cnv_caller.cpp | 207 
++++++++++++++++++++++--------------------- src/sv_caller.cpp | 119 ++++++++++++++++++------- 3 files changed, 196 insertions(+), 133 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index c2961cc8..6b10bc29 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -93,7 +93,8 @@ class CNVCaller { // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; - double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const; + // double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const; + void calculateMeanChromosomeCoverage(const std::vector& chromosomes, std::unordered_map>& chr_pos_depth_map, std::unordered_map& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const; // void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index d1a711e9..45f59ec4 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -77,6 +77,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end // this->calculateSNPLog2Ratios(snp_pos, snp_log2_cov, pos_depth_map, // mean_chr_cov, input_data); sample_size = std::max((int) snp_pos.size(), sample_size); + //printMessage("Sample size: " + std::to_string(sample_size)); // std::vector snp_pos_hmm(sample_size, 0); // std::vector snp_baf_hmm(sample_size, -1.0); // std::vector snp_pfb_hmm(sample_size, 0.5); @@ -161,11 +162,11 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end // 
this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov); // Update the SNP data with all information - snp_data.pos = std::move(snp_pos); - snp_data.baf = std::move(snp_baf); - snp_data.pfb = std::move(snp_pfb); + snp_data.pos = std::move(snp_pos_hmm); + snp_data.baf = std::move(snp_baf_hmm); + snp_data.pfb = std::move(snp_pfb_hmm); snp_data.log2_cov = std::move(snp_log2_hmm); - snp_data.is_snp = std::move(is_snp); + snp_data.is_snp = std::move(is_snp_hmm); } std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const @@ -379,67 +380,70 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 } // Calculate the mean chromosome coverage -double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const +// double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, +// std::vector& chr_pos_depth_map, const std::string& bam_filepath, +// int thread_count) const +void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& chromosomes, std::unordered_map>& chr_pos_depth_map, std::unordered_map& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const { + // Open the BAM file + // std::shared_lock lock(this->shared_mutex); // Lock the BAM file + printMessage("Opening BAM file: " + bam_filepath); + samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); + if (!bam_file) { - // Open the BAM file - // std::shared_lock lock(this->shared_mutex); // - // Lock the BAM file - printMessage("Opening BAM file: " + bam_filepath); - samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); - if (!bam_file) - { - printError("ERROR: Could not open BAM file: " + bam_filepath); - return 0.0; - } + printError("ERROR: Could not open BAM file: " + 
bam_filepath); + return; + } - // Enable multi-threading while opening the BAM file - hts_set_threads(bam_file, thread_count); + // Enable multi-threading while opening the BAM file + hts_set_threads(bam_file, thread_count); - // Read the header - bam_hdr_t *bam_header = sam_hdr_read(bam_file); - if (!bam_header) - { - sam_close(bam_file); - printError("ERROR: Could not read header from BAM file: " + bam_filepath); - return 0.0; - } + // Read the header + bam_hdr_t *bam_header = sam_hdr_read(bam_file); + if (!bam_header) + { + sam_close(bam_file); + printError("ERROR: Could not read header from BAM file: " + bam_filepath); + return; + } - // Load the index - hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str()); - if (!bam_index) - { - bam_hdr_destroy(bam_header); - sam_close(bam_file); - printError("ERROR: Could not load index for BAM file: " + bam_filepath); - return 0.0; - } - BamFileGuard bam_guard(bam_file, bam_index, bam_header); // Guard to close the BAM file + // Load the index + hts_idx_t *bam_index = sam_index_load(bam_file, bam_filepath.c_str()); + if (!bam_index) + { + bam_hdr_destroy(bam_header); + sam_close(bam_file); + printError("ERROR: Could not load index for BAM file: " + bam_filepath); + return; + } + BamFileGuard bam_guard(bam_file, bam_index, bam_header); // Guard to close the BAM file + + // Initialize the record + bam1_t *bam_record = bam_init1(); + if (!bam_record) + { + // Clean up the BAM file and index + bam_hdr_destroy(bam_header); + sam_close(bam_file); + printError("ERROR: Could not initialize BAM record."); + return; + } + // Iterate through each chromosome and update the depth map + int current_chr = 0; + int total_chr_count = chromosomes.size(); + for (const std::string& chr : chromosomes) + { // Create an iterator for the chromosome hts_itr_t *bam_iter = sam_itr_querys(bam_index, bam_header, chr.c_str()); if (!bam_iter) { printError("ERROR: Could not create iterator for chromosome: " + chr + ", check if the 
chromosome exists in the BAM file."); - return 0.0; - } - - // Initialize the record - bam1_t *bam_record = bam_init1(); - if (!bam_record) - { - hts_itr_destroy(bam_iter); - printError("ERROR: Could not initialize BAM record."); - return 0.0; + continue; } - // Set threading back to 1 for reading the BAM file - // printMessage("Setting threads to 1 for reading BAM file..."); - // hts_set_threads(bam_file, 1); - // printMessage("Threads set to 1 for reading BAM file."); - - // Iterate through the chromosome and update the depth map - printMessage("Iterating through BAM file reads..."); + printMessage("(" + std::to_string(++current_chr) + "/" + std::to_string(total_chr_count) + ") Reading BAM file for chromosome: " + chr); + std::vector& pos_depth_map = chr_pos_depth_map[chr]; while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) { // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads @@ -468,12 +472,12 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector= chr_pos_depth_map.size()) + if (ref_pos + j >= pos_depth_map.size()) { printError("ERROR: Reference position out of range for " + chr + ":" + std::to_string(ref_pos+j)); continue; } - chr_pos_depth_map[ref_pos + j]++; + pos_depth_map[ref_pos + j]++; // try { // chr_pos_depth_map[ref_pos + j]++; // } catch (const std::out_of_range& oor) { @@ -494,55 +498,58 @@ double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, std::vector 0) - // { - // cum_depth += pos_depth; - // pos_count++; - // } - // } + printMessage("Finished reading BAM file, calculating mean chromosome coverage..."); - // double mean_chr_cov = 0.0; - // if (pos_count > 0) - // { - // mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); - // } - // printMessage("Completed calculating mean chromosome coverage: " + - // std::to_string(mean_chr_cov)); - - // Parallel sum of the depth map - uint64_t cum_depth = std::reduce( - std::execution::par, - chr_pos_depth_map.begin(), - chr_pos_depth_map.end(), - 
0ULL - ); - - // Parallel count of the non-zero depth positions - uint32_t pos_count = std::count_if( - std::execution::par, - chr_pos_depth_map.begin(), - chr_pos_depth_map.end(), - [](uint32_t depth) { return depth > 0; } - ); - - printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count)); - printMessage("Total depth: " + std::to_string(cum_depth)); - - double mean_chr_cov = (pos_count > 0) ? static_cast(cum_depth) / static_cast(pos_count) : 0.0; - printMessage("Completed calculating mean chromosome coverage: " + std::to_string(mean_chr_cov)); - - return mean_chr_cov; + // // Calculate the mean chromosome coverage for positions with non-zero depth + // uint64_t cum_depth = 0; + // uint32_t pos_count = 0; + // for (const auto& pos_depth : chr_pos_depth_map) + // { + // if (pos_depth > 0) + // { + // cum_depth += pos_depth; + // pos_count++; + // } + // } + + // double mean_chr_cov = 0.0; + // if (pos_count > 0) + // { + // mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); + // } + // printMessage("Completed calculating mean chromosome coverage: " + + // std::to_string(mean_chr_cov)); + + // Parallel sum of the depth map + uint64_t cum_depth = std::reduce( + std::execution::par, + pos_depth_map.begin(), + pos_depth_map.end(), + 0ULL + ); + + // Parallel count of the non-zero depth positions + uint32_t pos_count = std::count_if( + std::execution::par, + pos_depth_map.begin(), + pos_depth_map.end(), + [](uint32_t depth) { return depth > 0; } + ); + + printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count)); + printMessage("Total depth: " + std::to_string(cum_depth)); + + double mean_chr_cov = (pos_count > 0) ? 
static_cast(cum_depth) / static_cast(pos_count) : 0.0; + chr_mean_cov_map[chr] = mean_chr_cov; + + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); + } + + // Clean up + // sam_close(bam_file); } // void CNVCaller::calculateSNPLog2Ratios(const std::vector& snp_pos, const std::vector& snp_log2_cov, const std::vector& pos_depth_map, double mean_chr_cov) const diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index d2e5fa31..fcfded24 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -167,7 +167,7 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam // For each primary alignment cluster the supplementary alignment start and // end positions, keeping the median of the largest cluster - std::vector sv_candidates; + // std::vector sv_candidates; int current_group = 0; int min_length = 2000; int max_length = 1000000; @@ -304,7 +304,7 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam SVType sv_type = inversion ? 
SVType::INV : SVType::UNKNOWN; if (sv_length >= min_length && sv_length <= max_length) { SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "NA", "./.", 0.0, 0, 0, 0); - sv_candidates.push_back(sv_candidate); + sv_calls.push_back(sv_candidate); } } @@ -592,10 +592,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // printMessage(chr + ": Unifying SVs..."); // chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); - // Sort the SV calls by start position - std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start; - }); + // // Sort the SV calls by start position + // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + // return a.start < b.start; + // }); printMessage("Completed chromosome " + chr); } @@ -622,13 +622,13 @@ void SVCaller::run(const InputData& input_data) } // [TEST] Keep only the first 6 chromosomes - chromosomes.resize(4); - // Remove the first chromosome - chromosomes.erase(chromosomes.begin()); - printMessage("Chromosomes: " + std::to_string(chromosomes.size())); - for (const auto& chr : chromosomes) { - printMessage(" " + chr); - } + // chromosomes.resize(4); + // // Remove the first chromosome + // chromosomes.erase(chromosomes.begin()); + // printMessage("Chromosomes: " + std::to_string(chromosomes.size())); + // for (const auto& chr : chromosomes) { + // printMessage(" " + chr); + // } // Read the HMM from the file std::string hmm_filepath = input_data.getHMMFilepath(); @@ -644,29 +644,63 @@ void SVCaller::run(const InputData& input_data) std::unordered_map chr_mean_cov_map; const std::string bam_filepath = input_data.getLongReadBam(); int chr_thread_count = input_data.getThreadCount(); - std::cout << "Reading chromosome coverage..." 
<< std::endl; - std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl; - int current_chr = 0; - int total_chr_count = chromosomes.size(); + + // Initialize the chromosome position depth map and mean coverage map for (const auto& chr : chromosomes) { - current_chr++; uint32_t chr_len = ref_genome.getChromosomeLength(chr); if (chr_len == 0) { - printError("ERROR: chromosome " + chr + " not found in reference genome"); - return; + printError("Chromosome " + chr + " not found in reference genome"); + continue; } - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "..."); - std::vector pos_depth_map(chr_len+1, 0); // 1-based index - double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count); - if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) { - printError("ERROR: failed to calculate mean chromosome coverage for " + chr); - return; + chr_pos_depth_map[chr] = std::vector(chr_len+1, 0); // 1-based index + chr_mean_cov_map[chr] = 0.0; + } + cnv_caller.calculateMeanChromosomeCoverage(chromosomes, chr_pos_depth_map, chr_mean_cov_map, bam_filepath, chr_thread_count); + + // Remove chromosomes with no reads (mean coverage is zero) + std::vector null_chr; + for (const auto& chr : chromosomes) { + if (chr_mean_cov_map[chr] == 0.0) { + null_chr.push_back(chr); } - chr_pos_depth_map[chr] = std::move(pos_depth_map); - chr_mean_cov_map[chr] = mean_chr_cov; - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov)); } - printMessage("Completed reading chromosome coverage."); + for (const auto& chr : null_chr) { + printMessage("Removing chromosome " + chr + " with no reads..."); + chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); + } + // std::cout << "Reading chromosome coverage..." 
<< std::endl; + // std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl; + // int current_chr = 0; + // int total_chr_count = chromosomes.size(); + // std::vector null_chr; + // for (const auto& chr : chromosomes) { + // current_chr++; + // uint32_t chr_len = ref_genome.getChromosomeLength(chr); + // if (chr_len == 0) { + // printError("ERROR: chromosome " + chr + " not found in reference genome"); + // return; + // } + // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "..."); + // std::vector pos_depth_map(chr_len+1, 0); // 1-based index + // double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count); + // if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) { + // // No reads, continue to the next chromosome + // null_chr.push_back(chr); + // continue; + // // printError("ERROR: failed to calculate mean chromosome coverage for " + chr); + // // return; + // } + // chr_pos_depth_map[chr] = std::move(pos_depth_map); + // chr_mean_cov_map[chr] = mean_chr_cov; + // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov)); + // } + // printMessage("Completed reading chromosome coverage."); + + // Remove chromosomes with no reads + // for (const auto& chr : null_chr) { + // printMessage("Removing chromosome " + chr + " with no reads..."); + // chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); + // } // Use multi-threading across chromosomes. 
If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) @@ -708,7 +742,8 @@ void SVCaller::run(const InputData& input_data) } // Wait for all tasks to complete - current_chr = 0; + int current_chr = 0; + int total_chr_count = chromosomes.size(); for (auto& future : futures) { try { current_chr++; @@ -756,7 +791,21 @@ void SVCaller::run(const InputData& input_data) std::vector& sv_calls = entry.second; whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end()); } - // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); + // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), + // split_sv_calls.end()); + + + // Sort the SV calls by start position + for (auto& entry : whole_genome_sv_calls) { + std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start; + }); + } + // // Sort the SV calls by start position + // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + // return a.start < b.start; + // }); + // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; @@ -858,6 +907,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); addSVCall(split_sv_calls, sv_call); } + } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) { + // Inversion with no CNV prediction + int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + std::string alt_allele = ""; + SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + addSVCall(split_sv_calls, sv_call); } current_sv++; if (current_sv % 1000 == 0) { 
From 96f18df80c3e230ba2bcb2e86d58b91fda067d8c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 6 Mar 2025 16:16:16 -0500 Subject: [PATCH 080/134] inversion fixes --- src/cnv_caller.cpp | 29 --------- src/sv_caller.cpp | 147 ++++++++++++++++++++++++++++++--------------- src/sv_object.cpp | 6 +- 3 files changed, 101 insertions(+), 81 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 45f59ec4..6db7a78b 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -630,12 +630,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Use multi-threading if not threading by chromosome int thread_count = input_data.getThreadCount(); - // if (!input_data.isSingleChr()) - // { - // // Use half of the threads for SNP reading - // thread_count = std::max(1, input_data.getThreadCount() / 2); - // } - printMessage("Using " + std::to_string(thread_count) + " threads for SNP reading..."); bcf_sr_set_threads(snp_reader, thread_count); // Add the SNP file to the reader @@ -721,31 +715,11 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Use multi-threading if not threading by chromosome int thread_count = input_data.getThreadCount(); - // if (!input_data.isSingleChr()) - // { - // // Use half of the threads for population allele frequency reading - // thread_count = std::max(1, input_data.getThreadCount() / 2); - // } - printMessage("Using " + std::to_string(thread_count) + " threads for population allele frequency reading..."); bcf_sr_set_threads(pfb_reader, thread_count); } - // Split the region into samples - // int sample_size = snp_pos.size(); - // std::vector region_chunks = splitRegionIntoChunks(chr, start_pos, end_pos, sample_size); - - // Loop through the samples and read the SNP data, storing the first - // SNP position and BAF value for each sample - int current_region = 0; - // for (size_t i = 0; i < region_chunks.size(); ++i) - // { - // current_region++; - // Read the SNP data 
---------------------------------------------- - // Set the region - printMessage("Setting region for SNP reader..."); - // std::string region_str = region_chunks[i]; if (bcf_sr_set_regions(snp_reader, chr.c_str(), 0) < 0) { printError("ERROR: Could not set region for SNP reader: " + chr); @@ -754,7 +728,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui return; } - printMessage("Region set for SNP reader, loading SNP data..."); bool snp_found = false; while (bcf_sr_next_line(snp_reader) > 0) { @@ -849,13 +822,11 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { // Set the region for the population allele frequency reader std::string pfb_region_str = chr_gnomad + ":" + std::to_string(min_snp_pos) + "-" + std::to_string(max_snp_pos); - printMessage("Setting region for population allele frequency reader: " + pfb_region_str); if (bcf_sr_set_regions(pfb_reader, pfb_region_str.c_str(), 0) < 0) { printError("ERROR: Could not set region for population allele frequency reader: " + pfb_region_str); } - printMessage("Loading population allele frequency data..."); // for (size_t i = 0; i < snp_pos.size(); ++i) // { // Set the region as the SNP position diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index fcfded24..1dae1e66 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -246,35 +246,43 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam // SV int primary_pos = -1; int primary_pos2 = -1; + int primary_cluster_size = 0; if (primary_start_cluster.size() > primary_end_cluster.size()) { std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + primary_cluster_size = primary_start_cluster.size(); } else if (primary_end_cluster.size() > primary_start_cluster.size()) { std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); primary_pos = 
primary_end_cluster[primary_end_cluster.size() / 2]; + primary_cluster_size = primary_end_cluster.size(); } else { // Use both positions std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; + primary_cluster_size = primary_start_cluster.size(); } // Get the supplementary alignment positions int supp_pos = -1; int supp_pos2 = -1; + int supp_cluster_size = 0; if (supp_start_cluster.size() > supp_end_cluster.size()) { std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; + supp_cluster_size = supp_start_cluster.size(); } else if (supp_end_cluster.size() > supp_start_cluster.size()) { std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; + supp_cluster_size = supp_end_cluster.size(); } else { // Use both positions. This has been shown to occur in nested SVs std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; + supp_cluster_size = supp_start_cluster.size(); } // If two of either were found, use the larger SV candidate @@ -301,13 +309,34 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam int sv_start = std::min(primary_pos, supp_pos); int sv_end = std::max(primary_pos, supp_pos); int sv_length = sv_end - sv_start + 1; + int cluster_size = std::max(primary_cluster_size, supp_cluster_size); SVType sv_type = inversion ? 
SVType::INV : SVType::UNKNOWN; if (sv_length >= min_length && sv_length <= max_length) { - SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "NA", "./.", 0.0, 0, 0, 0); - sv_calls.push_back(sv_candidate); + SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); + addSVCall(sv_calls, sv_candidate); + // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion)); } } + // Combine SVs with identical start and end positions, and sum the cluster + // sizes + std::vector combined_sv_calls; + std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start || (a.start == b.start && a.end < b.end); + }); + int merge_count = 0; + for (size_t i = 0; i < sv_calls.size(); i++) { + SVCall& sv_call = sv_calls[i]; + if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) { + sv_calls[i - 1].cluster_size += sv_call.cluster_size; + merge_count++; + } else { + combined_sv_calls.push_back(sv_call); + } + } + sv_calls = std::move(combined_sv_calls); + printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); + // return sv_candidates; } @@ -456,7 +485,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec alt_allele = ins_seq_str; } SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0); - addSVCall(sv_calls, sv_call); + addSVCall(sv_calls, sv_call); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { @@ -565,13 +594,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from 
CIGAR string: " + std::to_string(region_sv_count)); - // Run copy number variant predictions on the SVs detected from the - // CIGAR string, using a minimum CNV length threshold - // if (region_sv_count > 0) { - // printMessage(chr + ": CIGAR predictions..."); - // cnv_caller.runCIGARCopyNumberPrediction(chr, chr_sv_calls, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - // } - // [TEST] Before this section has no memory leaks // Run split-read SV and copy number variant predictions printMessage(chr + ": Split read SVs..."); @@ -580,6 +602,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, // bamHdr, region, primary_map, supp_map); this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls); + // std::vector split_sv_calls; // this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); @@ -773,15 +796,18 @@ void SVCaller::run(const InputData& input_data) for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { - printMessage("Running copy number predictions on " + chr + "..."); + printMessage("Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); // Merge the split-read SVs separately - printMessage(chr + ": Merging split reads..."); - double split_epsilon = 0.45; - int split_min_pts = 2; // This is low since split alignments were already previously merged - mergeSVs(sv_calls, split_epsilon, split_min_pts); + // printMessage(chr + ": Merging split reads..."); + // double split_epsilon = 0.45; + // // int split_min_pts = 2; // This is low since split alignments + // // were already previously 
merged + // int split_min_pts = 1; + // mergeSVs(sv_calls, split_epsilon, split_min_pts); } } @@ -796,11 +822,25 @@ void SVCaller::run(const InputData& input_data) // Sort the SV calls by start position - for (auto& entry : whole_genome_sv_calls) { - std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start; - }); - } + // printMessage("Sorting SVs..."); + // for (auto& entry : whole_genome_sv_calls) { + // std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) { + // return a.start < b.start || (a.start == b.start && a.end < b.end); + // }); + + // // Check that the SVs are sorted + // bool unsorted = false; + // for (size_t i = 1; i < entry.second.size(); i++) { + // if (entry.second[i].start < entry.second[i-1].start || (entry.second[i].start == entry.second[i-1].start && entry.second[i].end < entry.second[i-1].end)) { + // printError("ERROR: SVs are not sorted for chromosome " + entry.first); + // unsorted = true; + // break; + // } + // } + // if (!unsorted) { + // printMessage("SVs are sorted for chromosome " + entry.first); + // } + // } // // Sort the SV calls by start position // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { // return a.start < b.start; @@ -883,10 +923,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve bool is_inversion = sv_candidate.sv_type == SVType::INV; std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); - if (std::get<1>(result) == SVType::UNKNOWN) { - continue; - } - double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); std::string genotype = std::get<2>(result); @@ -905,6 +941,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, 
sv_candidate.end); std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); addSVCall(split_sv_calls, sv_call); } } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) { @@ -912,12 +949,13 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); std::string alt_allele = ""; SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); addSVCall(split_sv_calls, sv_call); } current_sv++; - if (current_sv % 1000 == 0) { - printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); - } + // if (current_sv % 1000 == 0) { + // printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); + // } } } @@ -1017,6 +1055,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, SVCall& sv_call) { - if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) { - return; - } + // if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) { + // return; + // } // Check if the SV call is valid if (sv_call.start > sv_call.end) { From 
766b220e478d432784d00f0ac2414e2e7837fc64 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 8 Mar 2025 22:13:39 -0500 Subject: [PATCH 081/134] improve insertion detection --- src/dbscan.cpp | 6 -- src/sv_caller.cpp | 233 ++++++++-------------------------------------- src/sv_object.cpp | 85 ++++++++++++++++- 3 files changed, 121 insertions(+), 203 deletions(-) diff --git a/src/dbscan.cpp b/src/dbscan.cpp index 6fe97563..d6c41346 100644 --- a/src/dbscan.cpp +++ b/src/dbscan.cpp @@ -70,11 +70,6 @@ std::vector DBSCAN::regionQuery(const std::vector& sv_calls, siz } double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const { - // return std::sqrt(std::pow(point1.first - point2.first, 2) + - // std::pow(point1.second - point2.second, 2)); - // return std::sqrt(std::pow(static_cast(point1.start) - static_cast(point2.start), 2) + - // std::pow(static_cast(point1.end) - - // static_cast(point2.end), 2)); // Calculate reciprocal overlap-based distance // https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02840-6 @@ -85,6 +80,5 @@ double DBSCAN::distance(const SVCall& point1, const SVCall& point2) const { // Minimum reciprocal overlap double distance = 1.0 - std::min(static_cast(overlap) / static_cast(length1), static_cast(overlap) / static_cast(length2)); - // double distance = 1.0 - static_cast(overlap) / std::min(length1, length2); return distance; // 0.0 means identical, 1.0 means no overlap } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 1dae1e66..1669e093 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -335,9 +335,10 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam } } sv_calls = std::move(combined_sv_calls); - printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); - // return sv_candidates; + // if (merge_count > 0) { + // printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with 
identical start and end positions"); + // } } @@ -441,6 +442,7 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec ins_seq_str[j] = base; } } + // std::string ins_seq_str_rc = reverseComplement(ins_seq_str); // Before the insertion if (pos >= (uint32_t)op_len-1) @@ -523,17 +525,7 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printError("ERROR: failed to open " + bam_filepath); return; } - - // Use multi-threading for the BAM file - // int thread_count = input_data.getThreadCount(); - // // if (!input_data.isSingleChr()) { - // // // Use half the threads for chromosomes, the other half for file I/O - // // thread_count = std::max(1, thread_count / 2); - // // } - // printMessage("Using " + std::to_string(thread_count) + " threads for BAM file I/O"); - // // int num_threads = input_data.getThreadCount(); - // hts_set_threads(fp_in, thread_count); - hts_set_threads(fp_in, 1); // Disable multi-threading for now + hts_set_threads(fp_in, 1); // Load the header bam_hdr_t *bamHdr = sam_hdr_read(fp_in); @@ -555,8 +547,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v // Set the region to process std::string region = chr; - // uint32_t chr_len = ref_genome.getChromosomeLength(chr); - // uint32_t chr_len = bamHdr->target_len[bam_name2id(bamHdr, chr.c_str())]; if (input_data.isRegionSet()) { // Use one chunk for the specified region @@ -566,17 +556,6 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); } - // Load chromosome data for copy number predictions - // printMessage(chr + ": Loading chromosome data..."); - CNVCaller cnv_caller(this->shared_mutex); - // std::vector chr_pos_depth_map(chr_len+1, 0); // 1-based index - - // // Use only half the threads for chromosomes, the other half for file I/O - // double mean_chr_cov = 
cnv_caller.calculateMeanChromosomeCoverage(chr, chr_pos_depth_map, bam_filepath, thread_count); - // if (mean_chr_cov == 0.0 || chr_pos_depth_map.size() == 0) { - // return; - // } - // Estimate DBSCAN minimum points double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct(); if (dbscan_min_pts_pct > 0.0) { @@ -584,6 +563,8 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v printMessage(chr + ": Mean chr. cov.: " + std::to_string(mean_chr_cov) + " (DBSCAN min. pts.= " + std::to_string(dbscan_min_pts) + ", min. pts. pct.= " + std::to_string(dbscan_min_pts_pct) + ")"); } + + // ----------------------------------------------------------------------- // Detect SVs from the CIGAR strings printMessage(chr + ": CIGAR SVs..."); this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); @@ -594,33 +575,10 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v int region_sv_count = getSVCount(chr_sv_calls); printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); - // [TEST] Before this section has no memory leaks - // Run split-read SV and copy number variant predictions + // ----------------------------------------------------------------------- + // Detect SVs from the split reads printMessage(chr + ": Split read SVs..."); - // std::unordered_map primary_map; - // std::unordered_map> supp_map; - // std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, - // bamHdr, region, primary_map, supp_map); this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls); - - // std::vector split_sv_calls; - // this->detectSVsFromSplitReads(region, fp_in, idx, bamHdr, split_sv_calls, cnv_caller, hmm, mean_chr_cov, chr_pos_depth_map, input_data); - - // // // Merge the split-read SVs separately - // printMessage(chr + ": Merging split reads..."); - // double split_epsilon = 0.45; - // int split_min_pts = 2; // This is low since split 
alignments were already previously merged - // mergeSVs(split_sv_calls, split_epsilon, split_min_pts); - - // printMessage(chr + ": Unifying SVs..."); - // chr_sv_calls.insert(chr_sv_calls.end(), split_sv_calls.begin(), split_sv_calls.end()); - - // // Sort the SV calls by start position - // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return a.start < b.start; - // }); - - printMessage("Completed chromosome " + chr); } void SVCaller::run(const InputData& input_data) @@ -643,15 +601,6 @@ void SVCaller::run(const InputData& input_data) // Get the chromosomes from the input BAM file chromosomes = this->getChromosomes(input_data.getLongReadBam()); } - - // [TEST] Keep only the first 6 chromosomes - // chromosomes.resize(4); - // // Remove the first chromosome - // chromosomes.erase(chromosomes.begin()); - // printMessage("Chromosomes: " + std::to_string(chromosomes.size())); - // for (const auto& chr : chromosomes) { - // printMessage(" " + chr); - // } // Read the HMM from the file std::string hmm_filepath = input_data.getHMMFilepath(); @@ -691,39 +640,6 @@ void SVCaller::run(const InputData& input_data) printMessage("Removing chromosome " + chr + " with no reads..."); chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); } - // std::cout << "Reading chromosome coverage..." 
<< std::endl; - // std::cout << "(Thread count: " << chr_thread_count << ")" << std::endl; - // int current_chr = 0; - // int total_chr_count = chromosomes.size(); - // std::vector null_chr; - // for (const auto& chr : chromosomes) { - // current_chr++; - // uint32_t chr_len = ref_genome.getChromosomeLength(chr); - // if (chr_len == 0) { - // printError("ERROR: chromosome " + chr + " not found in reference genome"); - // return; - // } - // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Reading " + chr + "..."); - // std::vector pos_depth_map(chr_len+1, 0); // 1-based index - // double mean_chr_cov = cnv_caller.calculateMeanChromosomeCoverage(chr, pos_depth_map, bam_filepath, chr_thread_count); - // if (mean_chr_cov == 0.0 || pos_depth_map.size() == 0) { - // // No reads, continue to the next chromosome - // null_chr.push_back(chr); - // continue; - // // printError("ERROR: failed to calculate mean chromosome coverage for " + chr); - // // return; - // } - // chr_pos_depth_map[chr] = std::move(pos_depth_map); - // chr_mean_cov_map[chr] = mean_chr_cov; - // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean cov. for " + chr + ": " + std::to_string(mean_chr_cov)); - // } - // printMessage("Completed reading chromosome coverage."); - - // Remove chromosomes with no reads - // for (const auto& chr : null_chr) { - // printMessage("Removing chromosome " + chr + " with no reads..."); - // chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); - // } // Use multi-threading across chromosomes. 
If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) @@ -742,7 +658,6 @@ void SVCaller::run(const InputData& input_data) InputData chr_input_data = input_data; // Use a thread-local copy this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); { - // std::lock_guard lock(this->shared_mutex); std::shared_lock lock(this->shared_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); whole_genome_split_sv_calls[chr] = std::move(split_sv_calls); @@ -759,7 +674,7 @@ void SVCaller::run(const InputData& input_data) std::vector> futures; for (const auto& chr : chromosomes) { futures.emplace_back(pool.enqueue([&, chr] { - printMessage("Processing chromosome " + chr); + // printMessage("Processing chromosome " + chr); process_chr(chr); })); } @@ -771,7 +686,6 @@ void SVCaller::run(const InputData& input_data) try { current_chr++; future.get(); - printMessage("Chromosome task "+ std::to_string(current_chr) + " of " + std::to_string(total_chr_count) + " completed."); } catch (const std::exception& e) { printError("Error processing chromosome task: " + std::string(e.what())); } catch (...) 
{ @@ -780,34 +694,34 @@ void SVCaller::run(const InputData& input_data) } printMessage("All tasks have finished."); + // ------------------------------------------------------- // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold + current_chr = 0; printMessage("Running copy number predictions on CIGAR SVs..."); for (auto& entry : whole_genome_sv_calls) { + current_chr++; const std::string& chr = entry.first; std::vector& sv_calls = entry.second; if (sv_calls.size() > 0) { - printMessage("Running copy number predictions on " + chr + "..."); + // printMessage("Running copy number predictions on " + chr + + // "..."); + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); } } + // ------------------------------------------------------- printMessage("Running copy number predictions on split-read SVs..."); + current_chr = 0; for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; if (sv_calls.size() > 0) { - printMessage("Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); + current_chr++; + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); - - // Merge the split-read SVs separately - // printMessage(chr + ": Merging split reads..."); - // double split_epsilon = 0.45; - // // int split_min_pts = 2; // This is low since split alignments - // // were already previously merged - // int 
split_min_pts = 1; - // mergeSVs(sv_calls, split_epsilon, split_min_pts); } } @@ -817,35 +731,6 @@ void SVCaller::run(const InputData& input_data) std::vector& sv_calls = entry.second; whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end()); } - // sv_calls.insert(sv_calls.end(), split_sv_calls.begin(), - // split_sv_calls.end()); - - - // Sort the SV calls by start position - // printMessage("Sorting SVs..."); - // for (auto& entry : whole_genome_sv_calls) { - // std::sort(entry.second.begin(), entry.second.end(), [](const SVCall& a, const SVCall& b) { - // return a.start < b.start || (a.start == b.start && a.end < b.end); - // }); - - // // Check that the SVs are sorted - // bool unsorted = false; - // for (size_t i = 1; i < entry.second.size(); i++) { - // if (entry.second[i].start < entry.second[i-1].start || (entry.second[i].start == entry.second[i-1].start && entry.second[i].end < entry.second[i-1].end)) { - // printError("ERROR: SVs are not sorted for chromosome " + entry.first); - // unsorted = true; - // break; - // } - // } - // if (!unsorted) { - // printMessage("SVs are sorted for chromosome " + entry.first); - // } - // } - // // Sort the SV calls by start position - // std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return a.start < b.start; - // }); - // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; @@ -853,9 +738,9 @@ void SVCaller::run(const InputData& input_data) std::string chr = entry.first; int sv_count = getSVCount(entry.second); total_sv_count += sv_count; - printMessage("Total SVs detected for chromosome " + chr + ": " + std::to_string(sv_count)); + printMessage("Total SVs detected for " + chr + ": " + std::to_string(sv_count)); } - printMessage("Total SVs detected for all chromosomes: " + std::to_string(total_sv_count)); + printMessage("Total SVs detected: " + std::to_string(total_sv_count)); // Save to VCF 
std::cout << "Saving SVs to VCF..." << std::endl; @@ -863,63 +748,14 @@ void SVCaller::run(const InputData& input_data) this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome); } - -// Detect SVs from split read alignments -// void SVCaller::detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) -// { -// // printMessage(region + ": Getting split alignments..."); -// // std::unordered_map primary_map; -// // std::unordered_map> supp_map; -// std::vector sv_candidates = this->getSplitAlignments(fp_in, idx, bamHdr, region, primary_map, supp_map); - -// // Run copy number predictions on the SVs detected from the split reads -// printMessage(region + ": Split read predictions..."); -// int current_sv = 0; -// int total_svs = sv_candidates.size(); -// for (auto& sv_candidate : sv_candidates) { -// bool is_inversion = sv_candidate.sv_type == SVType::INV; - -// std::tuple result = cnv_caller.runCopyNumberPrediction(region, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); -// if (std::get<1>(result) == SVType::UNKNOWN) { -// continue; -// } - -// double supp_lh = std::get<0>(result); -// SVType supp_type = std::get<1>(result); -// std::string genotype = std::get<2>(result); -// if (supp_type != SVType::UNKNOWN) { -// if (is_inversion) { -// if (supp_type == SVType::DEL) { -// supp_type = SVType::INV_DEL; -// } else if (supp_type == SVType::DUP) { -// supp_type = SVType::INV_DUP; -// } else if (supp_type == SVType::NEUTRAL) { -// supp_type = SVType::INV; -// } -// } - -// if (supp_type != SVType::NEUTRAL) { -// int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); -// std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; -// SVCall sv_call(sv_candidate.start, sv_candidate.end, 
supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); -// addSVCall(split_sv_calls, sv_call); -// } -// } -// current_sv++; -// if (current_sv % 1000 == 0) { -// printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); -// } -// } -// } - // Detect SVs from split read alignments void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) { // Run copy number predictions on the SVs detected from the split reads - printMessage("Split read predictions..."); - int current_sv = 0; - int total_svs = split_sv_calls.size(); - for (auto& sv_candidate : split_sv_calls) { + std::vector processed_calls; + for (const auto& sv_candidate : split_sv_calls) { + printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "..."); + bool is_inversion = sv_candidate.sv_type == SVType::INV; std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); @@ -928,6 +764,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve std::string genotype = std::get<2>(result); if (supp_type != SVType::UNKNOWN) { if (is_inversion) { + // Add an additional inversion separately + int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + std::string alt_allele = ""; + SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + processed_calls.push_back(sv_call); + /* if (supp_type == SVType::DEL) { supp_type = SVType::INV_DEL; } else if (supp_type == SVType::DUP) { @@ -935,6 +777,7 @@ void 
SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve } else if (supp_type == SVType::NEUTRAL) { supp_type = SVType::INV; } + */ } if (supp_type != SVType::NEUTRAL) { @@ -942,7 +785,8 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); - addSVCall(split_sv_calls, sv_call); + // addSVCall(split_sv_calls, sv_call); + processed_calls.push_back(sv_call); } } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) { // Inversion with no CNV prediction @@ -950,13 +794,16 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve std::string alt_allele = ""; SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); - addSVCall(split_sv_calls, sv_call); + // addSVCall(split_sv_calls, sv_call); + processed_calls.push_back(sv_call); } - current_sv++; // if (current_sv % 1000 == 0) { // printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); // } } + + // Replace with the processed calls + split_sv_calls = std::move(processed_calls); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& 
output_dir, const ReferenceGenome& ref_genome) const diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 371fa521..0c19468c 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -65,6 +65,11 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) SVType::INV_DEL, }) { + // [TEST] Skip if not insertions + // if (sv_type != SVType::INS) { + // continue; + // } + // Create a vector of SV calls for the current SV type and size interval std::vector sv_type_calls; std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { @@ -87,7 +92,49 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) for (auto& cluster : cluster_map) { int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; + + + // [TEST] If insertions, and if any SV has length between 9400 and + // 9500, print all SV coordinates in the cluster + bool print_all = false; + // if (sv_type == SVType::INS) { + // for (const auto& sv_call : cluster_sv_calls) { + // // printMessage("[TEST] SV call " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); + // // if (sv_call.end - sv_call.start >= 9400 && sv_call.end - + // // sv_call.start <= 9500) { + // // if (sv_call.end - sv_call.start >= 15100 && sv_call.end - + // // sv_call.start <= 15200) { + // // if (sv_call.end - sv_call.start >= 11200 && sv_call.end - + // // sv_call.start <= 11300) { + // // if (sv_call.end - sv_call.start >= 16800 && sv_call.end - + // // sv_call.start <= 17000) { + // // if (sv_call.end - sv_call.start >= 11300 && sv_call.end - + // // sv_call.start <= 11400) { + // // if (sv_call.end - sv_call.start >= 13100 && sv_call.end - + // // sv_call.start <= 13200) { + // if (sv_call.end - sv_call.start >= 28200 && sv_call.end - sv_call.start <= 28300) { + // print_all = true; + // break; + // } + // } + // } + if (print_all) { + printMessage("[TEST] 
Cluster " + std::to_string(cluster_id) + " has " + std::to_string(cluster_sv_calls.size()) + " SVs:"); + for (const auto& sv_call : cluster_sv_calls) { + printMessage(" " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); + } + } + if (cluster_id < 0) { + // Add all noise points to the merged list if >10 kb + // for (const auto& sv_call : cluster_sv_calls) { + // if ((sv_call.end - sv_call.start)+1 >= 10000) { + // SVCall noise_sv_call = sv_call; + // noise_sv_call.cluster_size = cluster_id; + // merged_sv_calls.push_back(noise_sv_call); + // printMessage("[TEST] Adding noise SV " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); + // } + // } continue; // Skip noise and unclassified points } else { // if (true) { @@ -118,13 +165,43 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) }); merged_sv_call = *it; + // [TEST] + if (print_all) { + printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); + } + } else { - // Use the median length SV + // Use the median length SV of the top 10% of the cluster + // (shorter reads are often noise) std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return (a.end - a.start) < (b.end - b.start); + return (a.end - a.start) > (b.end - b.start); }); - int median_index = cluster_sv_calls.size() / 2; - merged_sv_call = cluster_sv_calls[median_index]; + + // Get the top 10% of the cluster + size_t top_10_percent = std::max(1, (int) (cluster_sv_calls.size() * 0.1)); + std::vector top_10(cluster_sv_calls.begin(), cluster_sv_calls.begin() + top_10_percent); + + // Get the median SV for the top 10% of 
the cluster + size_t median_index = top_10.size() / 2; + merged_sv_call = top_10[median_index]; + + // // Get the starting index of the top 10% of the cluster + // // (Cluster is sorted by descending length) + // size_t start_index = std::max(0, (int) (cluster_sv_calls.size() * 0.9)); + + // // Get the top 10% of the cluster + // std::vector top_half(cluster_sv_calls.begin() + start_index, cluster_sv_calls.end()); + + // // Get the median SV for the top 50% of the cluster + // size_t median_index = top_half.size() / 2; + // merged_sv_call = top_half[median_index]; + // int median_index = cluster_sv_calls.size() / 2; + // merged_sv_call = cluster_sv_calls[median_index]; + + // [TEST] + if (print_all) { + printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with median SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); + } } if (cluster_id < 0) { From 0776822eb6bbb43a271e9a9b43735ec8cb4241d0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 12 Mar 2025 23:06:39 -0400 Subject: [PATCH 082/134] fix insertions --- include/sv_caller.h | 66 ++- src/sv_caller.cpp | 1335 ++++++++++++++++++++++++++++++++----------- 2 files changed, 1050 insertions(+), 351 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index d795f44b..1b420226 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -21,51 +21,70 @@ struct GenomicRegion { int tid; hts_pos_t start; hts_pos_t end; + int query_start; + int query_end; bool strand; - uint8_t qual; int cluster_size; // Number of alignments used for this region }; +struct PrimaryAlignment { + hts_pos_t start; + hts_pos_t end; + int query_start; + int query_end; + bool strand; + int cluster_size; // Number of alignments used for this region +}; + +struct SuppAlignment { + int tid; + hts_pos_t start; + hts_pos_t end; + int query_start; + int query_end; + bool strand; + int cluster_size; // 
Number of alignments used for this region +}; + +struct SplitSignature { + int tid; + hts_pos_t start; + hts_pos_t end; + bool strand; + hts_pos_t query_start; + hts_pos_t query_end; +}; + // Interval Tree Node struct IntervalNode { - GenomicRegion region; + PrimaryAlignment region; std::string qname; hts_pos_t max_end; // To optimize queries - // IntervalNode* left; - // IntervalNode* right; std::unique_ptr left; std::unique_ptr right; - IntervalNode(GenomicRegion r, std::string name) + IntervalNode(PrimaryAlignment r, std::string name) : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} }; -// IntervalNode* insert(IntervalNode* root, GenomicRegion region, std::string -// qname) { -void insert(std::unique_ptr& root, GenomicRegion region, std::string qname) { +void insert(std::unique_ptr& root, const PrimaryAlignment& region, std::string qname) { if (!root) { - // return new IntervalNode(region, qname); root = std::make_unique(region, qname); return; } if (region.start < root->region.start) { - // root->left = insert(root->left, region, qname); insert(root->left, region, qname); } else { - // root->right = insert(root->right, region, qname); insert(root->right, region, qname); } // Update max_end root->max_end = std::max(root->max_end, region.end); - // return root; } -// void findOverlaps(IntervalNode* root, GenomicRegion query, -// std::vector& result) { -void findOverlaps(const std::unique_ptr& root, GenomicRegion query, std::vector& result) { +void findOverlaps(const std::unique_ptr& root, const PrimaryAlignment& query, std::vector& result) { if (!root) return; // If overlapping, add to result @@ -93,18 +112,23 @@ class SVCaller { std::vector getChromosomes(const std::string& bam_filepath); - void getSplitAlignments(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls); + // void findSplitCNVBreakpoints(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& 
sv_calls); + + void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& input_data); + + void findSplitReadSVs(std::unordered_map>& sv_calls, const ReferenceGenome& ref_genome, const InputData& input_data); - // Detect SVs from the CIGAR string of a read alignment, and return the - // mismatch rate, and the start and end positions of the query sequence - void detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); + // Process a single CIGAR record and find candidate SVs + void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); + + std::pair getAlignmentReadPositions(bam1_t* alignment); void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls); // Detect SVs at a region from long read alignments. This is used for // whole genome analysis running in parallel. 
// RegionData detectSVsFromRegion(std::string region); - void detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); + void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); @@ -124,8 +148,6 @@ class SVCaller { // Calculate the read depth (INFO/DP) for a region int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); - bool regionOverlaps(const GenomicRegion& a, const GenomicRegion& b); - public: // Constructor with no arguments SVCaller() = default; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 1669e093..31b7270a 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -67,29 +67,355 @@ std::vector SVCaller::getChromosomes(const std::string &bam_filepat return chromosomes; } -void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string ®ion, std::vector& sv_calls) +// void SVCaller::findSplitCNVBreakpoints(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string ®ion, std::vector& sv_calls) +// { +// std::unordered_map primary_map; +// std::unordered_map> supp_map; + +// // Create a read and iterator for the region +// bam1_t *bam1 = bam_init1(); +// if (!bam1) { +// printError("ERROR: failed to initialize BAM record"); +// return; +// } +// hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); +// if (!itr) { +// bam_destroy1(bam1); +// printError("ERROR: failed to query region " + region); +// return; +// } + +// uint32_t primary_count = 0; +// uint32_t supplementary_count = 0; + +// // Main loop to process the alignments +// uint32_t num_alignments = 0; +// while 
(readNextAlignment(fp_in, itr, bam1) >= 0) { + +// // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality +// if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { +// continue; +// } +// const std::string qname = bam_get_qname(bam1); // Query template name + +// // Process primary alignments +// if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { +// // Store chromosome (TID), start, and end positions (1-based) of the +// // primary alignment, and the strand (true for forward, false for reverse) +// primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}; +// primary_count++; + +// // Process supplementary alignments +// } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { +// // Store chromosome (TID), start, and end positions (1-based) of the +// // supplementary alignment, and the strand (true for forward, false for reverse) +// supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}); +// supplementary_count++; +// } +// num_alignments++; +// } + +// // Remove primary alignments without supplementary alignments +// std::vector to_remove; +// for (const auto& entry : primary_map) { +// const std::string& qname = entry.first; +// if (supp_map.find(qname) == supp_map.end()) { +// to_remove.push_back(qname); +// } +// } +// for (const std::string& qname : to_remove) { +// primary_map.erase(qname); +// } + +// // // Clean up the iterator and alignment +// // hts_itr_destroy(itr); +// // bam_destroy1(bam1); +// // printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); + +// // Identify overlapping primary alignments and then cluster their primary +// // start, end vs. 
supplementary alignment start, end positions, keeping the +// // median of the largest cluster for the primary and supplementary positions +// // as the final genome coordinates of the SV +// // IntervalNode* root = nullptr; +// std::unique_ptr root = nullptr; +// for (const auto& entry : primary_map) { +// const std::string& qname = entry.first; +// const GenomicRegion& region = entry.second; +// // root = insert(root, region, qname); +// insert(root, region, qname); +// } +// std::vector> primary_clusters; +// std::set processed; + +// for (const auto& entry : primary_map) { +// const std::string& qname = entry.first; +// if (processed.find(qname) != processed.end()) { +// continue; // Skip already processed primary alignments +// } +// const GenomicRegion& region = entry.second; +// std::vector overlap_group; +// findOverlaps(root, region, overlap_group); +// for (const std::string& qname : overlap_group) { +// processed.insert(qname); +// } +// if (overlap_group.size() > 1) { +// primary_clusters.push_back(overlap_group); +// } +// } +// printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments"); + +// // For each primary alignment cluster the supplementary alignment start and +// // end positions, keeping the median of the largest cluster +// // std::vector sv_candidates; +// int current_group = 0; +// int min_length = 2000; +// int max_length = 1000000; +// for (const auto& primary_cluster : primary_clusters) { +// // Determine if the primary alignments are mostly on opposite strands to +// // the corresponding supplementary alignments (potential inversions) +// bool inversion = false; +// for (const std::string& qname : primary_cluster) { +// const std::vector& supp_alns = supp_map[qname]; +// int num_supp = (int) supp_alns.size(); +// int num_opposite_strand = 0; +// for (const GenomicRegion& supp_aln : supp_alns) { +// // Opposite-strand alignment on the same chromosome +// // (Since the 
iterator is single-chromosome, this is the case) +// if (supp_aln.strand != primary_map[qname].strand) { +// num_opposite_strand++; +// } +// } +// if (static_cast(num_opposite_strand) / static_cast(num_supp) > 0.5) { +// inversion = true; +// } +// } + +// // Use DBSCAN to cluster primary alignment start, end positions +// DBSCAN1D dbscan(100, 5); +// current_group++; +// std::vector starts; +// std::vector ends; +// std::vector primary_strands; +// for (const std::string& qname : primary_cluster) { +// const GenomicRegion& region = primary_map[qname]; +// starts.push_back(region.start); +// ends.push_back(region.end); +// primary_strands.push_back(region.strand); +// } + +// // Get the largest cluster of primary alignment start positions +// dbscan.fit(starts); +// std::vector primary_start_cluster = dbscan.getLargestCluster(starts); + +// // Get the largest cluster of primary alignment end positions +// dbscan.fit(ends); +// std::vector primary_end_cluster = dbscan.getLargestCluster(ends); + +// // Continue if no clusters were found +// if (primary_start_cluster.empty() && primary_end_cluster.empty()) { +// continue; +// } + +// // Get the supplementary alignment positions +// std::vector supp_starts; +// std::vector supp_ends; +// std::vector supp_strands; +// for (const std::string& qname : primary_cluster) { +// const std::vector& regions = supp_map[qname]; +// for (const GenomicRegion& region : regions) { +// supp_starts.push_back(region.start); +// supp_ends.push_back(region.end); +// supp_strands.push_back(region.strand); +// } +// } + +// // Get the largest cluster of supplementary alignment start positions +// dbscan.fit(supp_starts); +// std::vector supp_start_cluster = dbscan.getLargestCluster(supp_starts); + +// // Get the largest cluster of supplementary alignment end positions +// dbscan.fit(supp_ends); +// std::vector supp_end_cluster = dbscan.getLargestCluster(supp_ends); + +// // Continue if no clusters were found +// if 
(supp_start_cluster.empty() && supp_end_cluster.empty()) { +// continue; +// } + +// // Use the median of the largest cluster of primary and supplementary +// // alignment start, end positions as the final genome coordinates of the +// // SV +// int primary_pos = -1; +// int primary_pos2 = -1; +// int primary_cluster_size = 0; +// if (primary_start_cluster.size() > primary_end_cluster.size()) { +// std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); +// primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; +// primary_cluster_size = primary_start_cluster.size(); +// } else if (primary_end_cluster.size() > primary_start_cluster.size()) { +// std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); +// primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; +// primary_cluster_size = primary_end_cluster.size(); +// } else { +// // Use both positions +// std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); +// std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); +// primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; +// primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; +// primary_cluster_size = primary_start_cluster.size(); +// } + +// // Get the supplementary alignment positions +// int supp_pos = -1; +// int supp_pos2 = -1; +// int supp_cluster_size = 0; +// if (supp_start_cluster.size() > supp_end_cluster.size()) { +// std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); +// supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; +// supp_cluster_size = supp_start_cluster.size(); +// } else if (supp_end_cluster.size() > supp_start_cluster.size()) { +// std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); +// supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; +// supp_cluster_size = supp_end_cluster.size(); +// } else { +// // Use both positions. 
This has been shown to occur in nested SVs +// std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); +// std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); +// supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; +// supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; +// supp_cluster_size = supp_start_cluster.size(); +// } + +// // If two of either were found, use the larger SV candidate +// if (primary_pos2 != -1) { +// int sv_length1 = std::abs(primary_pos - supp_pos); +// int sv_length2 = std::abs(primary_pos2 - supp_pos); +// if (sv_length2 > sv_length1) { +// primary_pos = primary_pos2; +// } +// } +// if (supp_pos2 != -1) { +// int sv_length1 = std::abs(primary_pos - supp_pos); +// int sv_length2 = std::abs(primary_pos - supp_pos2); +// if (sv_length2 > sv_length1) { +// supp_pos = supp_pos2; +// } +// } + +// if (primary_pos == -1 || supp_pos == -1) { +// continue; +// } + +// // Store the SV candidate if the length is within the specified range +// int sv_start = std::min(primary_pos, supp_pos); +// int sv_end = std::max(primary_pos, supp_pos); +// int sv_length = sv_end - sv_start + 1; +// int cluster_size = std::max(primary_cluster_size, supp_cluster_size); + +// // Determine the SV type +// SVType sv_type = inversion ? 
SVType::INV : SVType::UNKNOWN; +// if (sv_length >= min_length && sv_length <= max_length) { +// SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); +// addSVCall(sv_calls, sv_candidate); +// // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion)); +// } +// } + +// // Combine SVs with identical start and end positions, and sum the cluster +// // sizes +// std::vector combined_sv_calls; +// std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { +// return a.start < b.start || (a.start == b.start && a.end < b.end); +// }); +// int merge_count = 0; +// for (size_t i = 0; i < sv_calls.size(); i++) { +// SVCall& sv_call = sv_calls[i]; +// if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) { +// sv_calls[i - 1].cluster_size += sv_call.cluster_size; +// merge_count++; +// } else { +// combined_sv_calls.push_back(sv_call); +// } +// } +// sv_calls = std::move(combined_sv_calls); + +// // if (merge_count > 0) { +// // printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); +// // } +// } + +void SVCaller::findSplitSVSignatures(std::unordered_map> &sv_calls, const InputData &input_data) { - std::unordered_map primary_map; - std::unordered_map> supp_map; + // Open the BAM file + std::string bam_filepath = input_data.getLongReadBam(); + samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); + if (!fp_in) { + printError("ERROR: failed to open " + bam_filepath); + return; + } + + // Set maximum thread count + int thread_count = input_data.getThreadCount(); + hts_set_threads(fp_in, thread_count); + printMessage("Using " + std::to_string(thread_count) + " threads for split read analysis"); + + // Load the 
header + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); + if (!bamHdr) { + sam_close(fp_in); + printError("ERROR: failed to read header from " + bam_filepath); + return; + } + + // Load the index + hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); + if (!idx) { + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + printError("ERROR: failed to load index for " + bam_filepath); + return; + } + BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file + + // Alignment data structures + // std::unordered_map primary_map; + // std::unordered_map> supp_map; + std::unordered_map> primary_map; // TID-> qname -> primary alignment + std::unordered_map> supp_map; // qname -> supplementary alignment - // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); if (!bam1) { printError("ERROR: failed to initialize BAM record"); return; } - hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); - if (!itr) { - bam_destroy1(bam1); - printError("ERROR: failed to query region " + region); - return; + + // Set the region to the whole genome, or a user-specified chromosome + hts_itr_t *itr = nullptr; + if (input_data.isSingleChr()) { + std::string chr = input_data.getChromosome(); + itr = sam_itr_querys(idx, bamHdr, chr.c_str()); + if (!itr) { + bam_destroy1(bam1); + printError("ERROR: failed to create iterator for " + chr); + return; + } + } else { + itr = sam_itr_queryi(idx, HTS_IDX_START, 0, 0); + if (!itr) { + bam_destroy1(bam1); + printError("ERROR: failed to create iterator for the whole genome"); + return; + } } uint32_t primary_count = 0; uint32_t supplementary_count = 0; // Main loop to process the alignments + printMessage("Processing alignments from " + bam_filepath); uint32_t num_alignments = 0; + std::unordered_set alignment_tids; // All unique chromosome IDs + std::unordered_set supp_qnames; // All unique query names while (readNextAlignment(fp_in, itr, bam1) >= 0) { // Skip secondary and unmapped alignments, duplicates, QC failures, 
and low mapping quality @@ -97,252 +423,404 @@ void SVCaller::getSplitAlignments(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bam continue; } const std::string qname = bam_get_qname(bam1); // Query template name - uint8_t mapq = bam1->core.qual; // Mapping quality // Process primary alignments if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { // Store chromosome (TID), start, and end positions (1-based) of the - // primary alignment, and the strand (true for forward, false for reverse) - primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0}; + // primary alignment, and the strand (true for forward, false for + // reverse) + std::pair qpos = getAlignmentReadPositions(bam1); + + primary_map[bam1->core.tid][qname] = PrimaryAlignment{bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0}; + // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0}; + // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}; + alignment_tids.insert(bam1->core.tid); primary_count++; // Process supplementary alignments } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { // Store chromosome (TID), start, and end positions (1-based) of the - // supplementary alignment, and the strand (true for forward, false for reverse) - supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), mapq, 0}); + // supplementary alignment, and the strand (true for forward, false + // for reverse) + std::pair qpos = getAlignmentReadPositions(bam1); + supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0}); + // 
supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}); + alignment_tids.insert(bam1->core.tid); + supp_qnames.insert(qname); supplementary_count++; } num_alignments++; + + if (num_alignments % 1000000 == 0) { + printMessage("Processed " + std::to_string(num_alignments) + " alignments"); + } } // Remove primary alignments without supplementary alignments - std::vector to_remove; - for (const auto& entry : primary_map) { - const std::string& qname = entry.first; - if (supp_map.find(qname) == supp_map.end()) { - to_remove.push_back(qname); + std::unordered_map> to_remove; + for (auto& chr_primary : primary_map) { + // Get the qnames for this chromosome + std::unordered_set qnames; + for (const auto& entry : chr_primary.second) { + if (supp_qnames.find(entry.first) == supp_qnames.end()) { + to_remove[chr_primary.first].insert(entry.first); + } } } - for (const std::string& qname : to_remove) { - primary_map.erase(qname); - } - // Clean up the iterator and alignment - hts_itr_destroy(itr); - bam_destroy1(bam1); - printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); - - // Identify overlapping primary alignments and then cluster their primary - // start, end vs. 
supplementary alignment start, end positions, keeping the - // median of the largest cluster for the primary and supplementary positions - // as the final genome coordinates of the SV - // IntervalNode* root = nullptr; - std::unique_ptr root = nullptr; - for (const auto& entry : primary_map) { - const std::string& qname = entry.first; - const GenomicRegion& region = entry.second; - // root = insert(root, region, qname); - insert(root, region, qname); + int total_removed = 0; + for (auto& chr_primary : primary_map) { + // Remove the qnames from the primary map + total_removed += to_remove[chr_primary.first].size(); + for (const auto& qname : to_remove[chr_primary.first]) { + chr_primary.second.erase(qname); + } } - std::vector> primary_clusters; - std::set processed; + printMessage("Removed " + std::to_string(total_removed) + " primary alignments without supplementary alignments"); + + // std::vector to_remove; + // for (const auto& entry : primary_map) { + // const std::string& qname = entry.first; + // if (supp_map.find(qname) == supp_map.end()) { + // to_remove.push_back(qname); + // } + // } + // for (const std::string& qname : to_remove) { + // primary_map.erase(qname); + // } - for (const auto& entry : primary_map) { - const std::string& qname = entry.first; - if (processed.find(qname) != processed.end()) { - continue; // Skip already processed primary alignments - } - const GenomicRegion& region = entry.second; - std::vector overlap_group; - findOverlaps(root, region, overlap_group); - for (const std::string& qname : overlap_group) { - processed.insert(qname); + + for (const auto& chr_primary : primary_map) { + int primary_tid = chr_primary.first; + std::string chr_name = bamHdr->target_name[primary_tid]; + printMessage("Processing chromosome " + chr_name + " with " + std::to_string(chr_primary.second.size()) + " primary alignments"); + + std::vector chr_sv_calls; + + // std::unordered_map> primary_map; // TID-> qname -> primary alignment + // const 
std::unordered_map>& + // chr_primary_map = chr_primary.second; + const std::unordered_map& chr_primary_map = chr_primary.second; + + // Identify overlapping primary alignments and then cluster their primary + // start, end vs. supplementary alignment start, end positions, keeping the + // median of the largest cluster for the primary and supplementary positions + // as the final genome coordinates of the SV + // IntervalNode* root = nullptr; + std::unique_ptr root = nullptr; + for (const auto& entry : chr_primary_map) { + const std::string& qname = entry.first; + const PrimaryAlignment& region = entry.second; + insert(root, region, qname); } - if (overlap_group.size() > 1) { - primary_clusters.push_back(overlap_group); + + std::vector> primary_clusters; + std::set processed; + for (const auto& entry : chr_primary_map) { + const std::string& qname = entry.first; + if (processed.find(qname) != processed.end()) { + continue; // Skip already processed primary alignments + } + const PrimaryAlignment& primary_aln = entry.second; + std::vector overlap_group; + findOverlaps(root, primary_aln, overlap_group); + for (const std::string& qname : overlap_group) { + processed.insert(qname); + } + if (overlap_group.size() > 1) { + primary_clusters.push_back(overlap_group); + } } - } - printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments"); - - // For each primary alignment cluster the supplementary alignment start and - // end positions, keeping the median of the largest cluster - // std::vector sv_candidates; - int current_group = 0; - int min_length = 2000; - int max_length = 1000000; - for (const auto& primary_group : primary_clusters) { - // Determine if the primary alignments are mostly on opposite strands to - // the corresponding supplementary alignments (potential inversions) - bool inversion = false; - for (const std::string& qname : primary_group) { - const std::vector& regions = supp_map[qname]; - int 
num_supp = (int) regions.size(); - int num_opposite_strand = 0; - for (const GenomicRegion& region : regions) { - if (region.strand != primary_map[qname].strand) { - num_opposite_strand++; + printMessage(chr_name + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments"); + + // For each primary alignment cluster the supplementary alignment start and + // end positions, keeping the median of the largest cluster + int current_group = 0; + int min_length = 2000; + int max_length = 1000000; + for (const auto& primary_cluster : primary_clusters) { + // Determine if the primary alignments are mostly on opposite strands to + // the corresponding supplementary alignments (potential inversions) + bool inversion = false; + int num_primary = (int) primary_cluster.size(); + int num_supp_opposite_strand = 0; + for (const std::string& qname : primary_cluster) { + const std::vector& supp_alns = supp_map[qname]; + bool primary_strand = chr_primary_map.at(qname).strand; + bool has_opposite_strand = false; + for (const SuppAlignment& supp_aln : supp_alns) { + // Analyze if on the same chromosome + if (supp_aln.tid == primary_tid && supp_aln.strand != primary_strand) { + has_opposite_strand = true; + } + } + if (has_opposite_strand) { + num_supp_opposite_strand++; } } - if (static_cast(num_opposite_strand) / static_cast(num_supp) > 0.5) { + if (static_cast(num_supp_opposite_strand) / static_cast(num_primary) > 0.5) { inversion = true; } - } - // Use DBSCAN to cluster primary alignment start, end positions - DBSCAN1D dbscan(100, 5); - current_group++; - std::vector starts; - std::vector ends; - std::vector primary_strands; - for (const std::string& qname : primary_group) { - const GenomicRegion& region = primary_map[qname]; - starts.push_back(region.start); - ends.push_back(region.end); - primary_strands.push_back(region.strand); - } + // Use DBSCAN to cluster primary alignment start, end positions + DBSCAN1D dbscan(100, 5); + 
current_group++; + std::vector starts; + std::vector ends; + std::vector primary_strands; + for (const std::string& qname : primary_cluster) { + const PrimaryAlignment& primary_aln = chr_primary_map.at(qname); + starts.push_back(primary_aln.start); + ends.push_back(primary_aln.end); + primary_strands.push_back(primary_aln.strand); + } - // Get the largest cluster of primary alignment start positions - dbscan.fit(starts); - std::vector primary_start_cluster = dbscan.getLargestCluster(starts); + // Get the largest cluster of primary alignment start positions + dbscan.fit(starts); + std::vector primary_start_cluster = dbscan.getLargestCluster(starts); - // Get the largest cluster of primary alignment end positions - dbscan.fit(ends); - std::vector primary_end_cluster = dbscan.getLargestCluster(ends); + // Get the largest cluster of primary alignment end positions + dbscan.fit(ends); + std::vector primary_end_cluster = dbscan.getLargestCluster(ends); - // Continue if no clusters were found - if (primary_start_cluster.empty() && primary_end_cluster.empty()) { - continue; - } + // Continue if no clusters were found + if (primary_start_cluster.empty() && primary_end_cluster.empty()) { + continue; + } - // Get the supplementary alignment positions - std::vector supp_starts; - std::vector supp_ends; - std::vector supp_strands; - for (const std::string& qname : primary_group) { - const std::vector& regions = supp_map[qname]; - for (const GenomicRegion& region : regions) { - supp_starts.push_back(region.start); - supp_ends.push_back(region.end); - supp_strands.push_back(region.strand); + // Get the supplementary alignment positions, and also the distances + // between the primary and supplementary alignments on the read + std::vector supp_starts; + std::vector supp_ends; + std::vector supp_strands; + std::vector split_distances; + for (const std::string& qname : primary_cluster) { + const PrimaryAlignment& primary_aln = chr_primary_map.at(qname); + const std::vector& 
supp_alns = supp_map.at(qname); + for (const SuppAlignment& supp_aln : supp_alns) { + if (supp_aln.tid == primary_tid) { + // Same chromosome + int distance = 0; + supp_starts.push_back(supp_aln.start); + supp_ends.push_back(supp_aln.end); + supp_strands.push_back(supp_aln.strand); + + // Calculate the distance between the primary and supplementary + // alignments on the read if on the same chromosome and same + // strand + if (supp_aln.strand == primary_aln.strand) { + // Same strand + // Calculate distance (negative if overlapping) + if (primary_aln.query_start <= supp_aln.query_start) { + distance = supp_aln.query_start - primary_aln.query_end; + } else { + distance = primary_aln.query_start - supp_aln.query_end; + } + split_distances.push_back(distance); + } else { + // TODO: INVERSIONS + } + } else { + // TODO: TRANSLOCATIONS + } + } } - } - // Get the largest cluster of supplementary alignment start positions - dbscan.fit(supp_starts); - std::vector supp_start_cluster = dbscan.getLargestCluster(supp_starts); + // Get the largest cluster of supplementary alignment start positions + dbscan.fit(supp_starts); + std::vector supp_start_cluster = dbscan.getLargestCluster(supp_starts); - // Get the largest cluster of supplementary alignment end positions - dbscan.fit(supp_ends); - std::vector supp_end_cluster = dbscan.getLargestCluster(supp_ends); + // Get the largest cluster of supplementary alignment end positions + dbscan.fit(supp_ends); + std::vector supp_end_cluster = dbscan.getLargestCluster(supp_ends); - // Continue if no clusters were found - if (supp_start_cluster.empty() && supp_end_cluster.empty()) { - continue; - } + // Get the largest cluster of split distances + dbscan.fit(split_distances); + std::vector split_distance_cluster = dbscan.getLargestCluster(split_distances); + // printMessage("Found " + std::to_string(split_distance_cluster.size()) + " split distances (cluster size)"); - // Use the median of the largest cluster of primary and supplementary 
- // alignment start, end positions as the final genome coordinates of the - // SV - int primary_pos = -1; - int primary_pos2 = -1; - int primary_cluster_size = 0; - if (primary_start_cluster.size() > primary_end_cluster.size()) { - std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); - primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; - primary_cluster_size = primary_start_cluster.size(); - } else if (primary_end_cluster.size() > primary_start_cluster.size()) { - std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); - primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; - primary_cluster_size = primary_end_cluster.size(); - } else { - // Use both positions - std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); - std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); - primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; - primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; - primary_cluster_size = primary_start_cluster.size(); - } + // Continue if no clusters were found + // if (supp_start_cluster.empty() && supp_end_cluster.empty()) { + if (supp_start_cluster.empty() && supp_end_cluster.empty() && split_distance_cluster.empty()) { + continue; + } - // Get the supplementary alignment positions - int supp_pos = -1; - int supp_pos2 = -1; - int supp_cluster_size = 0; - if (supp_start_cluster.size() > supp_end_cluster.size()) { - std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; - supp_cluster_size = supp_start_cluster.size(); - } else if (supp_end_cluster.size() > supp_start_cluster.size()) { - std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; - supp_cluster_size = supp_end_cluster.size(); - } else { - // Use both positions. 
This has been shown to occur in nested SVs - std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; - supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; - supp_cluster_size = supp_start_cluster.size(); - } + // Use the median of the largest cluster of primary and supplementary + // alignment start, end positions as the final genome coordinates of the + // SV + int primary_pos = -1; + int primary_pos2 = -1; + int primary_cluster_size = 0; + if (primary_start_cluster.size() > primary_end_cluster.size()) { + std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + primary_cluster_size = primary_start_cluster.size(); + } else if (primary_end_cluster.size() > primary_start_cluster.size()) { + std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); + primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; + primary_cluster_size = primary_end_cluster.size(); + } else { + // Use both positions + std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); + primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; + primary_cluster_size = primary_start_cluster.size(); + } - // If two of either were found, use the larger SV candidate - if (primary_pos2 != -1) { - int sv_length1 = std::abs(primary_pos - supp_pos); - int sv_length2 = std::abs(primary_pos2 - supp_pos); - if (sv_length2 > sv_length1) { - primary_pos = primary_pos2; + // ------------------------------- + // SPLIT INSERTION DETECTION + int read_distance = 0; + if (!split_distance_cluster.empty()) { + // Use the median of the largest cluster of split distances as the + // insertion size + 
std::sort(split_distance_cluster.begin(), split_distance_cluster.end()); + read_distance = split_distance_cluster[split_distance_cluster.size() / 2]; + + // Add an insertion SV call at the primary position + if (primary_pos != -1 && read_distance > 2000) { + if (primary_pos2 != -1) { + // If two positions were found, use the 5'most position + primary_pos = std::min(primary_pos, primary_pos2); + } + SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group)); + // continue; + } } - } - if (supp_pos2 != -1) { - int sv_length1 = std::abs(primary_pos - supp_pos); - int sv_length2 = std::abs(primary_pos - supp_pos2); - if (sv_length2 > sv_length1) { - supp_pos = supp_pos2; + + // TODO: After this classify deletions if negative (keep the rest + // the same) + + // -------------------------------- + + // Get the supplementary alignment positions + int supp_pos = -1; + int supp_pos2 = -1; + int supp_cluster_size = 0; + int supp_best_start = -1; + int supp_best_end = -1; + if (!supp_start_cluster.empty() && !supp_end_cluster.empty()) { + std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + int supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2]; + std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + int supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2]; + if (supp_start_cluster.size() > supp_end_cluster.size()) { + // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; + supp_pos = supp_best_start; + supp_cluster_size = supp_start_cluster.size(); + } else if (supp_end_cluster.size() > 
supp_start_cluster.size()) { + // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; + supp_pos = supp_best_end; + supp_cluster_size = supp_end_cluster.size(); + } else { + // Use both positions. This has been shown to occur in nested SVs + // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; + // supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; + supp_pos = supp_best_start; + supp_pos2 = supp_best_end; + supp_cluster_size = supp_start_cluster.size(); + } + + // Store the inversion as the supplementary start and end positions + if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) { + SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group)); + } } - } - if (primary_pos == -1 || supp_pos == -1) { - continue; - } + // If two of either were found, use the larger SV candidate + if (primary_pos2 != -1) { + int sv_length1 = std::abs(primary_pos - supp_pos); + int sv_length2 = std::abs(primary_pos2 - supp_pos); + if (sv_length2 > sv_length1) { + primary_pos = primary_pos2; + } + } + if (supp_pos2 != -1) { + int sv_length1 = std::abs(primary_pos - supp_pos); + int sv_length2 = std::abs(primary_pos - supp_pos2); + if (sv_length2 > sv_length1) { + supp_pos = supp_pos2; + } + } - // Store the SV candidate if the length is within the specified range - int sv_start = std::min(primary_pos, supp_pos); - int sv_end = std::max(primary_pos, supp_pos); - int sv_length = sv_end - sv_start + 1; - int cluster_size = 
std::max(primary_cluster_size, supp_cluster_size); - SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; - if (sv_length >= min_length && sv_length <= max_length) { - SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); - addSVCall(sv_calls, sv_candidate); - // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion)); + if (primary_pos == -1 || supp_pos == -1) { + continue; + } + + // Store the SV candidate if the length is within the specified range + int sv_start = std::min(primary_pos, supp_pos); + int sv_end = std::max(primary_pos, supp_pos); + int sv_length = sv_end - sv_start + 1; + int cluster_size = std::max(primary_cluster_size, supp_cluster_size); + + // If the read distance is < 30bp while the SV is > 2kb, then this is a + // potential deletion + if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { + printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group)); + // printMessage("Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group)); + // continue; + SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + } + + // Add a dummy SV call for CNV detection + else if (sv_length >= min_length && sv_length <= max_length) { + SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + } + + // Determine the SV type + // SVType 
sv_type = inversion ? SVType::INV : SVType::UNKNOWN; + // if (sv_length >= min_length && sv_length <= max_length) { + // SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + // // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion)); + // } } - } + // Combine SVs with identical start and end positions, and sum the cluster + // sizes + printMessage("Combining SVs with identical start and end positions"); + std::vector combined_sv_calls; + std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start || (a.start == b.start && a.end < b.end); + }); + int merge_count = 0; + for (size_t i = 0; i < chr_sv_calls.size(); i++) { + SVCall& sv_call = chr_sv_calls[i]; + // SVCall& sv_call = sv_calls[i]; + if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start && sv_call.end == chr_sv_calls[i - 1].end) { + chr_sv_calls[i - 1].cluster_size += sv_call.cluster_size; + merge_count++; + } else { + combined_sv_calls.push_back(sv_call); + } + } + + // Add the combined SV calls to the main vector + sv_calls[chr_name] = std::move(combined_sv_calls); - // Combine SVs with identical start and end positions, and sum the cluster - // sizes - std::vector combined_sv_calls; - std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start || (a.start == b.start && a.end < b.end); - }); - int merge_count = 0; - for (size_t i = 0; i < sv_calls.size(); i++) { - SVCall& sv_call = sv_calls[i]; - if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) { - sv_calls[i - 1].cluster_size += sv_call.cluster_size; - merge_count++; - } else { - combined_sv_calls.push_back(sv_call); + 
// Print the number of merged SV calls + printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates"); + + if (merge_count > 0) { + printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); } } - sv_calls = std::move(combined_sv_calls); // if (merge_count > 0) { // printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); // } } - -void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) +void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -367,7 +845,7 @@ void SVCaller::detectCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, // Process the alignment bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); - this->detectSVsFromCIGAR(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome); + this->processCIGARRecord(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome); } // Clean up the iterator and alignment @@ -404,7 +882,181 @@ double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) return mismatch_rate; } -void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) +void SVCaller::findSplitReadSVs(std::unordered_map> &sv_calls, const ReferenceGenome &ref_genome, const InputData& input_data) +{ + // Open the BAM file + std::string bam_filepath = input_data.getLongReadBam(); + samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); + if (!fp_in) { + printError("ERROR: failed to 
open " + bam_filepath); + return; + } + + // Set maximum thread count + int thread_count = input_data.getThreadCount(); + hts_set_threads(fp_in, thread_count); + printMessage("Using " + std::to_string(thread_count) + " threads for split read analysis"); + + // Load the header + bam_hdr_t *bamHdr = sam_hdr_read(fp_in); + if (!bamHdr) { + sam_close(fp_in); + printError("ERROR: failed to read header from " + bam_filepath); + return; + } + + // Load the index + hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); + if (!idx) { + bam_hdr_destroy(bamHdr); + sam_close(fp_in); + printError("ERROR: failed to load index for " + bam_filepath); + return; + } + BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file + + // Create a whole-genome iterator + hts_itr_t *itr = sam_itr_queryi(idx, HTS_IDX_START, 0, 0); + if (!itr) { + printError("ERROR: failed to query the whole genome"); + return; + } + + // Process the alignments + std::unordered_map primary_map; + std::unordered_map> supp_map; + bam1_t *bam1 = bam_init1(); + if (!bam1) { + printError("ERROR: failed to initialize BAM record"); + return; + } + uint32_t primary_count = 0; + uint32_t supplementary_count = 0; + uint32_t num_alignments = 0; + printMessage("Processing split read alignment records..."); + while (readNextAlignment(fp_in, itr, bam1) >= 0) { + + // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality + if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { + continue; + } + const std::string qname = bam_get_qname(bam1); // Query template name + + // Process primary alignments + if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { + + // Get the start and end positions in the read sequence + uint32_t query_start = 0; + uint32_t query_end = 0; + uint32_t* cigar = bam_get_cigar(bam1); + int cigar_len = bam1->core.n_cigar; + for (int i = 0; 
i < cigar_len; i++) { + int op_len = bam_cigar_oplen(cigar[i]); + int op = bam_cigar_op(cigar[i]); + + if (i == 0 && op == BAM_CSOFT_CLIP) { + query_start = op_len; + } + + // https://github.com/samtools/htslib/blob/develop/htslib/sam.h: + // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference + if (bam_cigar_type(op) & 1) { + query_end += op_len; + } + } + + // Store the SV signature + primary_map[qname] = SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end}; + primary_count++; + + // Process supplementary alignments + } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { + // Get the start and end positions in the read sequence + uint32_t query_start = 0; + uint32_t query_end = 0; + uint32_t* cigar = bam_get_cigar(bam1); + int cigar_len = bam1->core.n_cigar; + for (int i = 0; i < cigar_len; i++) { + int op_len = bam_cigar_oplen(cigar[i]); + int op = bam_cigar_op(cigar[i]); + + if (i == 0 && op == BAM_CSOFT_CLIP) { + query_start = op_len; + } + + // https://github.com/samtools/htslib/blob/develop/htslib/sam.h: + // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference + if (bam_cigar_type(op) & 1) { + query_end += op_len; + } + } + + // Store the SV signature + supp_map[qname].push_back(SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end}); + supplementary_count++; + } + num_alignments++; + + if (num_alignments % 100000 == 0) { + printMessage("Processed " + std::to_string(num_alignments) + " split read alignment records"); + } + } + + // Remove primary alignments without supplementary alignments + std::vector to_remove; + for (const auto& entry : primary_map) { + const std::string& qname = entry.first; + if (supp_map.find(qname) == supp_map.end()) { + to_remove.push_back(qname); + } + } + for (const std::string& qname : to_remove) { 
+ primary_map.erase(qname); + } + + // Clean up the iterator and alignment + hts_itr_destroy(itr); + bam_destroy1(bam1); + printMessage("Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); + + // Find insertions by comparing the primary vs. supplementary alignment + // distances in the read vs. reference genome on the same chromosome + int ins_count = 0; + std::vector sv_candidates; + for (const auto& entry : primary_map) { + const std::string& qname = entry.first; + const SplitSignature& primary = entry.second; + const std::vector& supp_alns = supp_map[qname]; + + // TODO: Cluster positions for improved performance + + for (const SplitSignature& supp : supp_alns) { + if (primary.tid == supp.tid) { + int ref_dist = std::abs(primary.start - supp.start); + int query_dist = std::abs(primary.query_start - supp.query_start); + + // If the reads are within 100 bp of each other, and the + // reference distance is greater than 2kb, then it is likely an + // insertion + if (query_dist <= 100 && ref_dist >= 2000) { + int sv_start = std::min(primary.start, supp.start); + int sv_end = std::max(primary.start, supp.start); + int sv_length = sv_end - sv_start + 1; + int cluster_size = 1; + printMessage("Found insertion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length)); + SVCall sv_candidate(sv_start, sv_end, SVType::INS, "", "SPLITINS", "./.", 0.0, 0, 0, cluster_size); + std::string chr = bamHdr->target_name[primary.tid]; + sv_calls[chr].push_back(sv_candidate); + ins_count++; + } + } + } + } + + printMessage("Found " + std::to_string(ins_count) + " insertions"); +} + +void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, bool is_primary, const std::vector &pos_depth_map, const ReferenceGenome &ref_genome) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t 
pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) @@ -513,11 +1165,31 @@ void SVCaller::detectSVsFromCIGAR(bam_hdr_t* header, bam1_t* alignment, std::vec } } -void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls) +std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) { - double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); - int dbscan_min_pts = input_data.getDBSCAN_MinPts(); + int query_start = 0; + int query_end = 0; + uint32_t* cigar = bam_get_cigar(alignment); + int cigar_len = alignment->core.n_cigar; + for (int i = 0; i < cigar_len; i++) { + int op_len = bam_cigar_oplen(cigar[i]); + int op = bam_cigar_op(cigar[i]); + + if (i == 0 && op == BAM_CSOFT_CLIP) { + query_start = op_len; + } + + // https://github.com/samtools/htslib/blob/develop/htslib/sam.h: + // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference + if (bam_cigar_type(op) & 1) { + query_end += op_len; + } + } + return std::make_pair(query_start, query_end); +} +void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls) +{ // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); @@ -545,40 +1217,31 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file - // Set the region to process - std::string region = chr; - if (input_data.isRegionSet()) { - - // Use one chunk for the specified region - std::pair region_data = 
input_data.getRegion(); - int region_start = region_data.first; - int region_end = region_data.second; - region = chr + ":" + std::to_string(region_start) + "-" + std::to_string(region_end); - } - - // Estimate DBSCAN minimum points + // Get DBSCAN parameters + double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); + int dbscan_min_pts = 5; double dbscan_min_pts_pct = input_data.getDBSCAN_MinPtsPct(); if (dbscan_min_pts_pct > 0.0) { dbscan_min_pts = (int)std::ceil(mean_chr_cov * dbscan_min_pts_pct); printMessage(chr + ": Mean chr. cov.: " + std::to_string(mean_chr_cov) + " (DBSCAN min. pts.= " + std::to_string(dbscan_min_pts) + ", min. pts. pct.= " + std::to_string(dbscan_min_pts_pct) + ")"); - } - + } // ----------------------------------------------------------------------- - // Detect SVs from the CIGAR strings - printMessage(chr + ": CIGAR SVs..."); - this->detectCIGARSVs(fp_in, idx, bamHdr, region, chr_sv_calls, chr_pos_depth_map, ref_genome); + // // Detect SVs from the CIGAR strings + // printMessage(chr + ": CIGAR SVs..."); + // this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome); - printMessage(chr + ": Merging CIGAR..."); - mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); + // printMessage(chr + ": Merging CIGAR..."); + // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); - int region_sv_count = getSVCount(chr_sv_calls); - printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + // int region_sv_count = getSVCount(chr_sv_calls); + // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); // ----------------------------------------------------------------------- - // Detect SVs from the split reads - printMessage(chr + ": Split read SVs..."); - this->getSplitAlignments(fp_in, idx, bamHdr, region, split_sv_calls); + + // // Detect SVs from the split reads + // printMessage(chr + ": Split read SVs..."); + // 
this->findSplitCNVBreakpoints(fp_in, idx, bamHdr, chr, split_sv_calls); } void SVCaller::run(const InputData& input_data) @@ -643,77 +1306,95 @@ void SVCaller::run(const InputData& input_data) // Use multi-threading across chromosomes. If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) - int thread_count = 1; - if (!input_data.isSingleChr()) { - thread_count = input_data.getThreadCount(); - std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; - } - ThreadPool pool(thread_count); + // int thread_count = 1; + // if (!input_data.isSingleChr()) { + // thread_count = input_data.getThreadCount(); + // std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; + // } + // ThreadPool pool(thread_count); std::unordered_map> whole_genome_sv_calls; std::unordered_map> whole_genome_split_sv_calls; - auto process_chr = [&](const std::string& chr) { - try { - std::vector sv_calls; - std::vector split_sv_calls; - InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); - { - std::shared_lock lock(this->shared_mutex); - whole_genome_sv_calls[chr] = std::move(sv_calls); - whole_genome_split_sv_calls[chr] = std::move(split_sv_calls); - } - // printMessage("Completed chromosome " + chr); - } catch (const std::exception& e) { - printError("Error processing chromosome " + chr + ": " + e.what()); - } catch (...) 
{ - printError("Unknown error processing chromosome " + chr); - } - }; - - // Submit tasks to the thread pool and track futures - std::vector> futures; - for (const auto& chr : chromosomes) { - futures.emplace_back(pool.enqueue([&, chr] { - // printMessage("Processing chromosome " + chr); - process_chr(chr); - })); - } + // auto process_chr = [&](const std::string& chr) { + // try { + // std::vector sv_calls; + // std::vector split_sv_calls; + // InputData chr_input_data = input_data; // Use a thread-local copy + // this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); + // { + // std::shared_lock lock(this->shared_mutex); + // whole_genome_sv_calls[chr] = std::move(sv_calls); + // whole_genome_split_sv_calls[chr] = std::move(split_sv_calls); + // } + // // printMessage("Completed chromosome " + chr); + // } catch (const std::exception& e) { + // printError("Error processing chromosome " + chr + ": " + e.what()); + // } catch (...) { + // printError("Unknown error processing chromosome " + chr); + // } + // }; + + // // Submit tasks to the thread pool and track futures + // std::vector> futures; + // for (const auto& chr : chromosomes) { + // futures.emplace_back(pool.enqueue([&, chr] { + // // printMessage("Processing chromosome " + chr); + // process_chr(chr); + // })); + // } - // Wait for all tasks to complete - int current_chr = 0; - int total_chr_count = chromosomes.size(); - for (auto& future : futures) { - try { - current_chr++; - future.get(); - } catch (const std::exception& e) { - printError("Error processing chromosome task: " + std::string(e.what())); - } catch (...) 
{ - printError("Unknown error processing chromosome task."); - } - } - printMessage("All tasks have finished."); + // // Wait for all tasks to complete + // int current_chr = 0; + // int total_chr_count = chromosomes.size(); + // for (auto& future : futures) { + // try { + // current_chr++; + // future.get(); + // } catch (const std::exception& e) { + // printError("Error processing chromosome task: " + std::string(e.what())); + // } catch (...) { + // printError("Unknown error processing chromosome task."); + // } + // } + // printMessage("All tasks have finished."); // ------------------------------------------------------- // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - current_chr = 0; - printMessage("Running copy number predictions on CIGAR SVs..."); - for (auto& entry : whole_genome_sv_calls) { - current_chr++; - const std::string& chr = entry.first; - std::vector& sv_calls = entry.second; - if (sv_calls.size() > 0) { - // printMessage("Running copy number predictions on " + chr + - // "..."); - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); - } - } + // current_chr = 0; + // printMessage("Running copy number predictions on CIGAR SVs..."); + // for (auto& entry : whole_genome_sv_calls) { + // current_chr++; + // const std::string& chr = entry.first; + // std::vector& sv_calls = entry.second; + // if (sv_calls.size() > 0) { + // // printMessage("Running copy number predictions on " + chr + + // // "..."); + // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); + // cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], 
input_data); + // } + // } // ------------------------------------------------------- + // printMessage("Running copy number predictions on split-read SVs..."); + // current_chr = 0; + // for (auto& entry : whole_genome_split_sv_calls) { + // const std::string& chr = entry.first; + // std::vector& sv_calls = entry.second; + + // if (sv_calls.size() > 0) { + // current_chr++; + // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); + // this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + // } + // } + + // Identify split-SV signatures + printMessage("Identifying split-SV signatures..."); + this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); + printMessage("Running copy number predictions on split-read SVs..."); - current_chr = 0; + int current_chr = 0; + int total_chr_count = whole_genome_split_sv_calls.size(); for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; @@ -725,11 +1406,18 @@ void SVCaller::run(const InputData& input_data) } } + // Detect inversions, insertions, and translocations from the split read + // alignments (no copy number predictions) + // printMessage("Detecting inversions, insertions, and translocations from split read alignments..."); + // std::unordered_map> neutral_sv_calls; + // this->findSplitReadSVs(neutral_sv_calls, ref_genome, input_data); + printMessage("Unifying SVs..."); for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end()); + // whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), neutral_sv_calls[chr].begin(), 
neutral_sv_calls[chr].end()); } // Print the total number of SVs detected for each chromosome @@ -756,54 +1444,48 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve for (const auto& sv_candidate : split_sv_calls) { printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "..."); - bool is_inversion = sv_candidate.sv_type == SVType::INV; + // bool is_inversion = sv_candidate.sv_type == SVType::INV; std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); std::string genotype = std::get<2>(result); - if (supp_type != SVType::UNKNOWN) { - if (is_inversion) { - // Add an additional inversion separately - int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - std::string alt_allele = ""; - SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); - processed_calls.push_back(sv_call); - /* - if (supp_type == SVType::DEL) { - supp_type = SVType::INV_DEL; - } else if (supp_type == SVType::DUP) { - supp_type = SVType::INV_DUP; - } else if (supp_type == SVType::NEUTRAL) { - supp_type = SVType::INV; - } - */ - } + if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) { + // if (is_inversion) { + // // Add an additional inversion separately + // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + // std::string alt_allele = ""; + // SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + // processed_calls.push_back(sv_call); + // } - if (supp_type != SVType::NEUTRAL) { - int read_depth = 
this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; - SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); - // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); - // addSVCall(split_sv_calls, sv_call); - processed_calls.push_back(sv_call); - } - } else if (supp_type == SVType::UNKNOWN && sv_candidate.sv_type == SVType::INV) { - // Inversion with no CNV prediction int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - std::string alt_allele = ""; - SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; + SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); // addSVCall(split_sv_calls, sv_call); processed_calls.push_back(sv_call); } + + // } else if (sv_candidate.sv_type == SVType::INV) { + // // SV with no copy number prediction, but is a potential inversion or insertion + // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + // // std::string alt_allele = ""; + // std::string alt_allele = "<" + getSVTypeString(sv_candidate.sv_type) + ">"; + // SVCall sv_call(sv_candidate.start, 
sv_candidate.end, sv_candidate.sv_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); + // printMessage("[TEST-SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); + // processed_calls.push_back(sv_call); + // } // if (current_sv % 1000 == 0) { // printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); // } } + // Insert the copy number predictions back into the split SV calls + printMessage("Inserting CNV calls..."); + split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end()); // Replace with the processed calls - split_sv_calls = std::move(processed_calls); + // split_sv_calls = std::move(processed_calls); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const @@ -1044,8 +1726,3 @@ int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uin // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); return read_depth; } - -bool SVCaller::regionOverlaps(const GenomicRegion &a, const GenomicRegion &b) -{ - return a.tid == b.tid && a.start <= b.end && b.start <= a.end; -} From 84d1d6e60b49bcb56dbcd8381b5a0b759af011f2 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 13 Mar 2025 16:58:08 -0400 Subject: [PATCH 083/134] fix sv duplicate merging --- include/sv_object.h | 3 + src/cnv_caller.cpp | 10 +- src/sv_caller.cpp | 290 +++++++++++++++++++++++++------------------- src/sv_object.cpp | 29 +++++ 4 files changed, 201 insertions(+), 131 deletions(-) diff --git a/include/sv_object.h b/include/sv_object.h index fc090166..7c1f5410 100644 --- 
a/include/sv_object.h +++ b/include/sv_object.h @@ -39,6 +39,9 @@ void addSVCall(std::vector& sv_calls, SVCall& sv_call); void mergeSVs(std::vector& sv_calls); +// Merge SVs with identical start positions, and sum the cluster sizes +void mergeDuplicateSVs(std::vector& sv_calls); + void mergeSVSubsets(std::vector& sv_calls); void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 6db7a78b..c23c146b 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -501,7 +501,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& // Clean up the iterator hts_itr_destroy(bam_iter); - printMessage("Finished reading BAM file, calculating mean chromosome coverage..."); + // printMessage("Finished reading BAM file, calculating mean chromosome coverage..."); // // Calculate the mean chromosome coverage for positions with non-zero depth // uint64_t cum_depth = 0; @@ -539,13 +539,13 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& [](uint32_t depth) { return depth > 0; } ); - printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count)); - printMessage("Total depth: " + std::to_string(cum_depth)); + // printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count)); + // printMessage("Total depth: " + std::to_string(cum_depth)); double mean_chr_cov = (pos_count > 0) ? 
static_cast(cum_depth) / static_cast(pos_count) : 0.0; chr_mean_cov_map[chr] = mean_chr_cov; - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); + // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); } // Clean up @@ -802,7 +802,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Continue if no SNP was found in the region if (!snp_found) { - printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos)); + // printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos)); bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); return; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 31b7270a..00a6f0ae 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -531,7 +531,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); - printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group)); + // printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group)); // continue; } } @@ -731,7 +731,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= 50) { SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size); 
addSVCall(chr_sv_calls, sv_candidate); - printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group)); + // printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group)); } } @@ -764,9 +764,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 2kb, then this is a // potential deletion if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group)); - // printMessage("Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group)); - // continue; + // printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group)); SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } @@ -785,34 +783,47 @@ void SVCaller::findSplitSVSignatures(std::unordered_map combined_sv_calls; + printMessage("Combining SVs with identical start positions"); std::sort(chr_sv_calls.begin(), chr_sv_calls.end(), [](const SVCall& a, const SVCall& b) { return a.start < b.start || (a.start == b.start && a.end < b.end); }); - int merge_count = 0; - for (size_t i = 0; i < chr_sv_calls.size(); i++) { - SVCall& sv_call = chr_sv_calls[i]; - // SVCall& sv_call = sv_calls[i]; - if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start && sv_call.end == chr_sv_calls[i - 1].end) { - 
chr_sv_calls[i - 1].cluster_size += sv_call.cluster_size; - merge_count++; - } else { - combined_sv_calls.push_back(sv_call); - } - } + + // Merge duplicate SV calls with identical start positions + mergeDuplicateSVs(chr_sv_calls); + + // int initial_size = chr_sv_calls.size(); + // std::vector combined_sv_calls; + // for (size_t i = 0; i < chr_sv_calls.size(); i++) { + // SVCall& sv_call = chr_sv_calls[i]; + // if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start) { + // // Keep the largest cluster size for the same start position + // if (sv_call.cluster_size > chr_sv_calls[i - 1].cluster_size) { + // combined_sv_calls.back() = sv_call; + // } + + // // Combine cluster sizes + // combined_sv_calls.back().cluster_size += sv_call.cluster_size; + // } else { + // // Add the SV call to the combined list + // combined_sv_calls.push_back(sv_call); + // } + // } + // int merge_count = initial_size - combined_sv_calls.size(); + // printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start positions"); // Add the combined SV calls to the main vector - sv_calls[chr_name] = std::move(combined_sv_calls); + // sv_calls[chr_name] = std::move(combined_sv_calls); + sv_calls[chr_name] = std::move(chr_sv_calls); // Print the number of merged SV calls printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates"); - if (merge_count > 0) { - printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); - } + // if (merge_count > 0) { + // printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); + // } } // if (merge_count > 0) { @@ -1154,6 +1165,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Update the reference position // https://samtools.github.io/hts-specs/SAMv1.pdf + // if (bam_cigar_type(op) & 2) { + // // bit 2: consume reference + 
// ref_pos += op_len; + // } if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { pos += op_len; } @@ -1167,7 +1182,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) { - int query_start = 0; + int query_start = -1; int query_end = 0; uint32_t* cigar = bam_get_cigar(alignment); int cigar_len = alignment->core.n_cigar; @@ -1175,16 +1190,26 @@ std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) int op_len = bam_cigar_oplen(cigar[i]); int op = bam_cigar_op(cigar[i]); - if (i == 0 && op == BAM_CSOFT_CLIP) { - query_start = op_len; + // if (i == 0 && op == BAM_CSOFT_CLIP) { + // query_start = op_len; + // } + // Set the query start position to the first non-soft clip operation + if (query_start == -1 && (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CEQUAL || op == BAM_CDIFF)) { + query_start = query_end; // First valid query position } // https://github.com/samtools/htslib/blob/develop/htslib/sam.h: // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference - if (bam_cigar_type(op) & 1) { + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { query_end += op_len; } } + + if (query_start == -1) { + // If no valid query start position was found, set it to 0 + query_start = 0; + } + return std::make_pair(query_start, query_end); } @@ -1227,21 +1252,15 @@ void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::v } // ----------------------------------------------------------------------- - // // Detect SVs from the CIGAR strings - // printMessage(chr + ": CIGAR SVs..."); - // this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome); - - // printMessage(chr + ": Merging CIGAR..."); - // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); + // Detect SVs from the 
CIGAR strings + printMessage(chr + ": CIGAR SVs..."); + this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome); - // int region_sv_count = getSVCount(chr_sv_calls); - // printMessage("Total SVs detected from CIGAR string: " + std::to_string(region_sv_count)); + printMessage(chr + ": Merging CIGAR..."); + mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); - // ----------------------------------------------------------------------- - - // // Detect SVs from the split reads - // printMessage(chr + ": Split read SVs..."); - // this->findSplitCNVBreakpoints(fp_in, idx, bamHdr, chr, split_sv_calls); + int region_sv_count = getSVCount(chr_sv_calls); + printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string"); } void SVCaller::run(const InputData& input_data) @@ -1250,7 +1269,6 @@ void SVCaller::run(const InputData& input_data) printMessage("Loading the reference genome..."); const std::string ref_filepath = input_data.getRefGenome(); std::shared_mutex ref_mutex; // Dummy mutex (remove later) - // ReferenceGenome ref_genome(this->shared_mutex); ReferenceGenome ref_genome(ref_mutex); ref_genome.setFilepath(ref_filepath); @@ -1260,7 +1278,6 @@ void SVCaller::run(const InputData& input_data) // Get the chromosome from the user input argument chromosomes.push_back(input_data.getChromosome()); } else { - // chromosomes = ref_genome.getChromosomes(); // Get the chromosomes from the input BAM file chromosomes = this->getChromosomes(input_data.getLongReadBam()); } @@ -1306,95 +1323,80 @@ void SVCaller::run(const InputData& input_data) // Use multi-threading across chromosomes. If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) - // int thread_count = 1; - // if (!input_data.isSingleChr()) { - // thread_count = input_data.getThreadCount(); - // std::cout << "Using " << thread_count << " threads for chr processing..." 
<< std::endl; - // } - // ThreadPool pool(thread_count); + int thread_count = 1; + if (!input_data.isSingleChr()) { + thread_count = input_data.getThreadCount(); + std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; + } + ThreadPool pool(thread_count); std::unordered_map> whole_genome_sv_calls; - std::unordered_map> whole_genome_split_sv_calls; - // auto process_chr = [&](const std::string& chr) { - // try { - // std::vector sv_calls; - // std::vector split_sv_calls; - // InputData chr_input_data = input_data; // Use a thread-local copy - // this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); - // { - // std::shared_lock lock(this->shared_mutex); - // whole_genome_sv_calls[chr] = std::move(sv_calls); - // whole_genome_split_sv_calls[chr] = std::move(split_sv_calls); - // } - // // printMessage("Completed chromosome " + chr); - // } catch (const std::exception& e) { - // printError("Error processing chromosome " + chr + ": " + e.what()); - // } catch (...) 
{ - // printError("Unknown error processing chromosome " + chr); - // } - // }; - - // // Submit tasks to the thread pool and track futures - // std::vector> futures; - // for (const auto& chr : chromosomes) { - // futures.emplace_back(pool.enqueue([&, chr] { - // // printMessage("Processing chromosome " + chr); - // process_chr(chr); - // })); - // } + auto process_chr = [&](const std::string& chr) { + try { + std::vector sv_calls; + std::vector split_sv_calls; + InputData chr_input_data = input_data; // Use a thread-local copy + this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); + { + std::shared_lock lock(this->shared_mutex); + whole_genome_sv_calls[chr] = std::move(sv_calls); + } + // printMessage("Completed chromosome " + chr); + } catch (const std::exception& e) { + printError("Error processing chromosome " + chr + ": " + e.what()); + } catch (...) { + printError("Unknown error processing chromosome " + chr); + } + }; + + // Submit tasks to the thread pool and track futures + std::vector> futures; + for (const auto& chr : chromosomes) { + futures.emplace_back(pool.enqueue([&, chr] { + // printMessage("Processing chromosome " + chr); + process_chr(chr); + })); + } // // Wait for all tasks to complete - // int current_chr = 0; - // int total_chr_count = chromosomes.size(); - // for (auto& future : futures) { - // try { - // current_chr++; - // future.get(); - // } catch (const std::exception& e) { - // printError("Error processing chromosome task: " + std::string(e.what())); - // } catch (...) 
{ - // printError("Unknown error processing chromosome task."); - // } - // } - // printMessage("All tasks have finished."); + int current_chr = 0; + int total_chr_count = chromosomes.size(); + for (auto& future : futures) { + try { + current_chr++; + future.get(); + } catch (const std::exception& e) { + printError("Error processing chromosome task: " + std::string(e.what())); + } catch (...) { + printError("Unknown error processing chromosome task."); + } + } + printMessage("All tasks have finished."); // ------------------------------------------------------- // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - // current_chr = 0; - // printMessage("Running copy number predictions on CIGAR SVs..."); - // for (auto& entry : whole_genome_sv_calls) { - // current_chr++; - // const std::string& chr = entry.first; - // std::vector& sv_calls = entry.second; - // if (sv_calls.size() > 0) { - // // printMessage("Running copy number predictions on " + chr + - // // "..."); - // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); - // cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); - // } - // } + current_chr = 0; + printMessage("Running copy number predictions on CIGAR SVs..."); + for (auto& entry : whole_genome_sv_calls) { + current_chr++; + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { + // printMessage("Running copy number predictions on " + chr + + // "..."); + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); + cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + } + } // 
------------------------------------------------------- - // printMessage("Running copy number predictions on split-read SVs..."); - // current_chr = 0; - // for (auto& entry : whole_genome_split_sv_calls) { - // const std::string& chr = entry.first; - // std::vector& sv_calls = entry.second; - - // if (sv_calls.size() > 0) { - // current_chr++; - // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); - // this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); - // } - // } - // Identify split-SV signatures printMessage("Identifying split-SV signatures..."); + std::unordered_map> whole_genome_split_sv_calls; this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); printMessage("Running copy number predictions on split-read SVs..."); - int current_chr = 0; - int total_chr_count = whole_genome_split_sv_calls.size(); + current_chr = 0; for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; @@ -1405,19 +1407,12 @@ void SVCaller::run(const InputData& input_data) this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); } } - - // Detect inversions, insertions, and translocations from the split read - // alignments (no copy number predictions) - // printMessage("Detecting inversions, insertions, and translocations from split read alignments..."); - // std::unordered_map> neutral_sv_calls; - // this->findSplitReadSVs(neutral_sv_calls, ref_genome, input_data); printMessage("Unifying SVs..."); for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), 
sv_calls.begin(), sv_calls.end()); - // whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), neutral_sv_calls[chr].begin(), neutral_sv_calls[chr].end()); } // Print the total number of SVs detected for each chromosome @@ -1442,7 +1437,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve // Run copy number predictions on the SVs detected from the split reads std::vector processed_calls; for (const auto& sv_candidate : split_sv_calls) { - printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "..."); + // printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "..."); // bool is_inversion = sv_candidate.sv_type == SVType::INV; @@ -1486,6 +1481,43 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end()); // Replace with the processed calls // split_sv_calls = std::move(processed_calls); + + mergeDuplicateSVs(split_sv_calls); + + // Combine SVs with identical start positions, keeping the largest cluster size + // printMessage("[2] Combining SVs with identical start and end positions..."); + // std::vector combined_calls; + // std::sort(split_sv_calls.begin(), split_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + // return (a.start < b.start) || (a.start == b.start && a.end < b.end); + // }); + // int initial_size = split_sv_calls.size(); + // for (size_t i = 0; i < split_sv_calls.size(); ++i) { + // const SVCall& current_call = split_sv_calls[i]; + // printMessage("Current start: " + std::to_string(current_call.start) + ", previous start: " + (i > 0 ? 
std::to_string(split_sv_calls[i-1].start) : "N/A")); + // if (i > 0 && current_call.start == split_sv_calls[i-1].start) { + // printMessage("Found identical start position: " + std::to_string(current_call.start) + " with end: " + std::to_string(current_call.end)); + // // Keep the largest cluster size + // if (current_call.cluster_size > split_sv_calls[i-1].cluster_size) { + // combined_calls.back() = current_call; + // printMessage("Replacing previous call with larger cluster size: " + std::to_string(current_call.cluster_size) + " > " + std::to_string(split_sv_calls[i-1].cluster_size)); + // } + + // // Merge the cluster sizes + // combined_calls.back().cluster_size += current_call.cluster_size; + // printMessage("Merged cluster size: " + std::to_string(combined_calls.back().cluster_size) + " (previous: " + std::to_string(split_sv_calls[i-1].cluster_size) + ")"); + // } else { + // // Add the current call to the combined calls + // combined_calls.push_back(current_call); + // } + // } + + + // printMessage("Merged " + std::to_string(merge_count) + " SVs with identical start and end positions"); + // // Replace the split SV calls with the combined calls + // printMessage("[TEST] Total SVs before merging: " + std::to_string(split_sv_calls.size())); + // split_sv_calls.clear(); + // split_sv_calls.insert(split_sv_calls.end(), combined_calls.begin(), combined_calls.end()); + // printMessage("[TEST] Total SVs after merging: " + std::to_string(split_sv_calls.size())); } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const @@ -1562,6 +1594,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls = pair.second; @@ -1583,7 +1616,9 @@ void SVCaller::saveToVCF(const std::unordered_map 0) { + std::cout << "Total unclassified SVs: " << unclassified_svs << std::endl; + } } int SVCaller::calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end) diff --git 
a/src/sv_object.cpp b/src/sv_object.cpp index 0c19468c..7eabc4c6 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -221,6 +221,35 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); } +void mergeDuplicateSVs(std::vector &sv_calls) +{ + int initial_size = sv_calls.size(); + std::vector combined_sv_calls; + std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start; + }); + for (size_t i = 0; i < sv_calls.size(); i++) { + SVCall& sv_call = sv_calls[i]; + if (i > 0 && sv_call.start == sv_calls[i - 1].start) { + // Keep the larger cluster size for the same start position + if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) { + combined_sv_calls.back() = sv_call; + } + + // Combine cluster sizes + combined_sv_calls.back().cluster_size += sv_call.cluster_size; + } else { + // Add the SV call to the combined list + combined_sv_calls.push_back(sv_call); + } + } + int merge_count = initial_size - combined_sv_calls.size(); + sv_calls = std::move(combined_sv_calls); // Replace with filtered list + if (merge_count > 0) { + printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); + } +} + void mergeSVSubsets(std::vector &sv_calls) { // Sort the SV calls by start position From 0f305baee2bbc98cb62045810960968d2a158f02 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 14 Mar 2025 11:51:39 -0400 Subject: [PATCH 084/134] improved ins and del --- src/sv_caller.cpp | 146 +++++++++++++++++----------------------------- src/sv_object.cpp | 34 +++++++++-- 2 files changed, 81 insertions(+), 99 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 00a6f0ae..d1244895 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -690,9 +690,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 
supp_end_cluster.size()) { - // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; - supp_pos = supp_best_start; - supp_cluster_size = supp_start_cluster.size(); - } else if (supp_end_cluster.size() > supp_start_cluster.size()) { - // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; - supp_pos = supp_best_end; - supp_cluster_size = supp_end_cluster.size(); - } else { - // Use both positions. This has been shown to occur in nested SVs - // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; - // supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; - supp_pos = supp_best_start; - supp_pos2 = supp_best_end; - supp_cluster_size = supp_start_cluster.size(); - } + supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2]; + } + + if (supp_start_cluster.size() > supp_end_cluster.size()) { + // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; + supp_pos = supp_best_start; + supp_cluster_size = supp_start_cluster.size(); + } else if (supp_end_cluster.size() > supp_start_cluster.size()) { + // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + // supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; + supp_pos = supp_best_end; + supp_cluster_size = supp_end_cluster.size(); + } else if (supp_best_end == -1 && supp_best_start == -1) { + // Use both positions. 
This has been shown to occur in some nested SVs + supp_pos = supp_best_start; + supp_pos2 = supp_best_end; + supp_cluster_size = supp_start_cluster.size(); + } - // Store the inversion as the supplementary start and end positions + // Store the inversion as the supplementary start and end positions + if (supp_best_start != -1 && supp_best_end != -1) { if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) { SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size); addSVCall(chr_sv_calls, sv_candidate); @@ -764,14 +762,22 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 2kb, then this is a // potential deletion if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - // printMessage(chr_name + ": Found potential deletion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group)); SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); + + // Add an inversion call if necessary + if (inversion) { + SVCall sv_candidate(sv_start, sv_end, SVType::INV, "", "INVDEL", "./.", 0.0, 0, 0, cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + } } // Add a dummy SV call for CNV detection else if (sv_length >= min_length && sv_length <= max_length) { - SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); + SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; + std::string alt = (sv_type == SVType::INV) ? 
"" : "."; + SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); + // SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } @@ -793,37 +799,15 @@ void SVCaller::findSplitSVSignatures(std::unordered_map combined_sv_calls; - // for (size_t i = 0; i < chr_sv_calls.size(); i++) { - // SVCall& sv_call = chr_sv_calls[i]; - // if (i > 0 && sv_call.start == chr_sv_calls[i - 1].start) { - // // Keep the largest cluster size for the same start position - // if (sv_call.cluster_size > chr_sv_calls[i - 1].cluster_size) { - // combined_sv_calls.back() = sv_call; - // } - - // // Combine cluster sizes - // combined_sv_calls.back().cluster_size += sv_call.cluster_size; - // } else { - // // Add the SV call to the combined list - // combined_sv_calls.push_back(sv_call); - // } - // } - // int merge_count = initial_size - combined_sv_calls.size(); - // printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start positions"); - - // Add the combined SV calls to the main vector - // sv_calls[chr_name] = std::move(combined_sv_calls); sv_calls[chr_name] = std::move(chr_sv_calls); // Print the number of merged SV calls printMessage(chr_name + ": Found " + std::to_string(sv_calls[chr_name].size()) + " SV candidates"); - - // if (merge_count > 0) { - // printMessage(chr_name + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); - // } + + // Print all SV calls + for (const SVCall& sv_call : sv_calls[chr_name]) { + printMessage("SV: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", length: " + std::to_string(sv_call.end - sv_call.start + 1) + ", cluster size: " + std::to_string(sv_call.cluster_size) + ", group: " + std::to_string(current_group)); + } } // if (merge_count > 0) { @@ -1320,6 +1304,9 @@ void 
SVCaller::run(const InputData& input_data) printMessage("Removing chromosome " + chr + " with no reads..."); chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); } + std::unordered_map> whole_genome_sv_calls; + int current_chr = 0; + int total_chr_count = chromosomes.size(); // Use multi-threading across chromosomes. If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) @@ -1329,7 +1316,6 @@ void SVCaller::run(const InputData& input_data) std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; } ThreadPool pool(thread_count); - std::unordered_map> whole_genome_sv_calls; auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; @@ -1358,8 +1344,6 @@ void SVCaller::run(const InputData& input_data) } // // Wait for all tasks to complete - int current_chr = 0; - int total_chr_count = chromosomes.size(); for (auto& future : futures) { try { current_chr++; @@ -1479,45 +1463,21 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve // Insert the copy number predictions back into the split SV calls printMessage("Inserting CNV calls..."); split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end()); - // Replace with the processed calls - // split_sv_calls = std::move(processed_calls); - mergeDuplicateSVs(split_sv_calls); - // Combine SVs with identical start positions, keeping the largest cluster size - // printMessage("[2] Combining SVs with identical start and end positions..."); - // std::vector combined_calls; - // std::sort(split_sv_calls.begin(), split_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return (a.start < b.start) || (a.start == b.start && a.end < b.end); - // }); - // int initial_size = split_sv_calls.size(); - // for (size_t i = 0; i < split_sv_calls.size(); ++i) { - // const SVCall& current_call = split_sv_calls[i]; - // 
printMessage("Current start: " + std::to_string(current_call.start) + ", previous start: " + (i > 0 ? std::to_string(split_sv_calls[i-1].start) : "N/A")); - // if (i > 0 && current_call.start == split_sv_calls[i-1].start) { - // printMessage("Found identical start position: " + std::to_string(current_call.start) + " with end: " + std::to_string(current_call.end)); - // // Keep the largest cluster size - // if (current_call.cluster_size > split_sv_calls[i-1].cluster_size) { - // combined_calls.back() = current_call; - // printMessage("Replacing previous call with larger cluster size: " + std::to_string(current_call.cluster_size) + " > " + std::to_string(split_sv_calls[i-1].cluster_size)); - // } - - // // Merge the cluster sizes - // combined_calls.back().cluster_size += current_call.cluster_size; - // printMessage("Merged cluster size: " + std::to_string(combined_calls.back().cluster_size) + " (previous: " + std::to_string(split_sv_calls[i-1].cluster_size) + ")"); + // Remove any deletions with no HMM predictions (HMM likelihood is zero) + // int failed_del_count = 0; + // for (auto it = split_sv_calls.begin(); it != split_sv_calls.end();) { + // if (it->hmm_likelihood == 0.0 && it->sv_type == SVType::DEL) { + // it = split_sv_calls.erase(it); + // failed_del_count++; // } else { - // // Add the current call to the combined calls - // combined_calls.push_back(current_call); + // ++it; // } // } - - - // printMessage("Merged " + std::to_string(merge_count) + " SVs with identical start and end positions"); - // // Replace the split SV calls with the combined calls - // printMessage("[TEST] Total SVs before merging: " + std::to_string(split_sv_calls.size())); - // split_sv_calls.clear(); - // split_sv_calls.insert(split_sv_calls.end(), combined_calls.begin(), combined_calls.end()); - // printMessage("[TEST] Total SVs after merging: " + std::to_string(split_sv_calls.size())); + // if (failed_del_count > 0) { + // printMessage("Removed " + 
std::to_string(failed_del_count) + " failed deletion candidates with no HMM predictions"); + // } } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 7eabc4c6..d316b726 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -225,19 +225,41 @@ void mergeDuplicateSVs(std::vector &sv_calls) { int initial_size = sv_calls.size(); std::vector combined_sv_calls; + // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { + // return a.start < b.start; + // }); + // Sort first by start position, then by SV type std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start; + return std::tie(a.start, a.sv_type) < std::tie(b.start, b.sv_type); }); for (size_t i = 0; i < sv_calls.size(); i++) { SVCall& sv_call = sv_calls[i]; - if (i > 0 && sv_call.start == sv_calls[i - 1].start) { - // Keep the larger cluster size for the same start position - if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) { + if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.sv_type == sv_calls[i - 1].sv_type) { + // Keep the SV call with a non-zero likelihood + // The HMM prediction is more reliable than the split read prediction + if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { + combined_sv_calls.back() = sv_call; + } + + // If the likelihoods are equal, keep the one with the larger cluster size + // This is to ensure that the SV call with more supporting reads is + // kept + else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size > sv_calls[i - 1].cluster_size) { combined_sv_calls.back() = sv_call; } + // // Keep the larger cluster size for the same start position + // if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) { + // combined_sv_calls.back() = sv_call; + // } + + // // If cluster sizes 
are equal, keep the one with non-zero likelihood + // // The HMM prediction is more reliable than the split read prediction + // else if (sv_call.cluster_size == sv_calls[i - 1].cluster_size && sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { + // combined_sv_calls.back() = sv_call; + // } - // Combine cluster sizes - combined_sv_calls.back().cluster_size += sv_call.cluster_size; + // // Combine cluster sizes + // combined_sv_calls.back().cluster_size += sv_call.cluster_size; } else { // Add the SV call to the combined list combined_sv_calls.push_back(sv_call); From 63ee46cba612bb383b36bb738bb7580600b3a0c0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 14 Mar 2025 13:13:14 -0400 Subject: [PATCH 085/134] remove comments and fix warnings --- include/sv_caller.h | 22 +- include/sv_object.h | 14 +- src/sv_caller.cpp | 652 +------------------------------------------- src/sv_object.cpp | 67 +---- 4 files changed, 15 insertions(+), 740 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 1b420226..6e446fa6 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -112,44 +112,28 @@ class SVCaller { std::vector getChromosomes(const std::string& bam_filepath); - // void findSplitCNVBreakpoints(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls); - void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& input_data); - void findSplitReadSVs(std::unordered_map>& sv_calls, const ReferenceGenome& ref_genome, const InputData& input_data); - // Process a single CIGAR record and find candidate SVs void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); std::pair getAlignmentReadPositions(bam1_t* alignment); - void processChromosome(const std::string& chr, const CHMM& hmm, std::vector& combined_sv_calls, const InputData& input_data, const 
ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls); + void processChromosome(const std::string& chr, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov); - // Detect SVs at a region from long read alignments. This is used for - // whole genome analysis running in parallel. - // RegionData detectSVsFromRegion(std::string region); void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); - // Detect SVs from split alignments - // void detectSVsFromSplitReads(const std::string& region, samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data); - - // Calculate the mismatch rate given a map of query positions to - // match/mismatch (1/0) values within a specified range of the query - // sequence - double calculateMismatchRate(const MismatchData& mismatch_data); - void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector &pos_depth_map, const InputData &input_data); void saveToVCF(const std::unordered_map> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome) const; - // Calculate the read depth (INFO/DP) for a region - int calculateReadDepth(const std::vector& pos_depth_map, uint32_t start, uint32_t end); + // Query the read depth (INFO/DP) at a position + int getReadDepth(const std::vector& pos_depth_map, uint32_t start); public: - // Constructor 
with no arguments SVCaller() = default; // Detect SVs and predict SV type from long read alignments and CNV calls diff --git a/include/sv_object.h b/include/sv_object.h index 7c1f5410..d838e968 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -12,11 +12,9 @@ using namespace sv_types; -// Struct to represent a structural variant call struct SVCall { uint32_t start; uint32_t end; - // std::string sv_type = "NA"; SVType sv_type = SVType::UNKNOWN; std::string alt_allele = "."; std::string data_type = "NA"; @@ -26,32 +24,22 @@ struct SVCall { int support = 0; // Number of supporting reads int cluster_size = 0; // Number of SV calls in the cluster - // Comparison operator for std::set bool operator<(const SVCall& other) const; - // Constructor with parameters for all fields SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} }; -// void addSVCall(std::vector& sv_calls, uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth); void addSVCall(std::vector& sv_calls, SVCall& sv_call); -void mergeSVs(std::vector& sv_calls); - // Merge SVs with identical start positions, and sum the cluster sizes void mergeDuplicateSVs(std::vector& sv_calls); -void mergeSVSubsets(std::vector& sv_calls); - -void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth); - -void filterSVsWithLowSupport(std::vector &sv_calls, int min_depth, const std::string& data_type); - uint32_t getSVCount(const std::vector& sv_calls); void concatenateSVCalls(std::vector& sv_calls, const std::vector& sv_calls_update); +// Merge SVs using 
DBSCAN clustering void mergeSVs(std::vector &sv_calls, double epsilon, int min_pts); #endif // SV_OBJECT_H diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index d1244895..a58d985d 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -33,11 +33,8 @@ # define DUP_SEQSIM_THRESHOLD 0.9 // Sequence similarity threshold for duplication detection -//std::mutex bam_mutex; - int SVCaller::readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1) { - // std::lock_guard lock(this->shared_mutex); std::shared_lock lock(this->shared_mutex); int ret = sam_itr_next(fp_in, itr, bam1); return ret; @@ -60,290 +57,12 @@ std::vector SVCaller::getChromosomes(const std::string &bam_filepat std::vector chromosomes; for (int i = 0; i < bamHdr->n_targets; i++) { chromosomes.push_back(bamHdr->target_name[i]); - // printMessage("Chromosome: " + std::string(bamHdr->target_name[i])); } bam_hdr_destroy(bamHdr); sam_close(fp_in); return chromosomes; } -// void SVCaller::findSplitCNVBreakpoints(samFile *fp_in, hts_idx_t *idx, bam_hdr_t *bamHdr, const std::string ®ion, std::vector& sv_calls) -// { -// std::unordered_map primary_map; -// std::unordered_map> supp_map; - -// // Create a read and iterator for the region -// bam1_t *bam1 = bam_init1(); -// if (!bam1) { -// printError("ERROR: failed to initialize BAM record"); -// return; -// } -// hts_itr_t *itr = sam_itr_querys(idx, bamHdr, region.c_str()); -// if (!itr) { -// bam_destroy1(bam1); -// printError("ERROR: failed to query region " + region); -// return; -// } - -// uint32_t primary_count = 0; -// uint32_t supplementary_count = 0; - -// // Main loop to process the alignments -// uint32_t num_alignments = 0; -// while (readNextAlignment(fp_in, itr, bam1) >= 0) { - -// // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality -// if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) 
{ -// continue; -// } -// const std::string qname = bam_get_qname(bam1); // Query template name - -// // Process primary alignments -// if (!(bam1->core.flag & BAM_FSUPPLEMENTARY)) { -// // Store chromosome (TID), start, and end positions (1-based) of the -// // primary alignment, and the strand (true for forward, false for reverse) -// primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}; -// primary_count++; - -// // Process supplementary alignments -// } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { -// // Store chromosome (TID), start, and end positions (1-based) of the -// // supplementary alignment, and the strand (true for forward, false for reverse) -// supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}); -// supplementary_count++; -// } -// num_alignments++; -// } - -// // Remove primary alignments without supplementary alignments -// std::vector to_remove; -// for (const auto& entry : primary_map) { -// const std::string& qname = entry.first; -// if (supp_map.find(qname) == supp_map.end()) { -// to_remove.push_back(qname); -// } -// } -// for (const std::string& qname : to_remove) { -// primary_map.erase(qname); -// } - -// // // Clean up the iterator and alignment -// // hts_itr_destroy(itr); -// // bam_destroy1(bam1); -// // printMessage(region + ": Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); - -// // Identify overlapping primary alignments and then cluster their primary -// // start, end vs. 
supplementary alignment start, end positions, keeping the -// // median of the largest cluster for the primary and supplementary positions -// // as the final genome coordinates of the SV -// // IntervalNode* root = nullptr; -// std::unique_ptr root = nullptr; -// for (const auto& entry : primary_map) { -// const std::string& qname = entry.first; -// const GenomicRegion& region = entry.second; -// // root = insert(root, region, qname); -// insert(root, region, qname); -// } -// std::vector> primary_clusters; -// std::set processed; - -// for (const auto& entry : primary_map) { -// const std::string& qname = entry.first; -// if (processed.find(qname) != processed.end()) { -// continue; // Skip already processed primary alignments -// } -// const GenomicRegion& region = entry.second; -// std::vector overlap_group; -// findOverlaps(root, region, overlap_group); -// for (const std::string& qname : overlap_group) { -// processed.insert(qname); -// } -// if (overlap_group.size() > 1) { -// primary_clusters.push_back(overlap_group); -// } -// } -// printMessage(region + ": Found " + std::to_string(primary_clusters.size()) + " groups of overlapping primary alignments"); - -// // For each primary alignment cluster the supplementary alignment start and -// // end positions, keeping the median of the largest cluster -// // std::vector sv_candidates; -// int current_group = 0; -// int min_length = 2000; -// int max_length = 1000000; -// for (const auto& primary_cluster : primary_clusters) { -// // Determine if the primary alignments are mostly on opposite strands to -// // the corresponding supplementary alignments (potential inversions) -// bool inversion = false; -// for (const std::string& qname : primary_cluster) { -// const std::vector& supp_alns = supp_map[qname]; -// int num_supp = (int) supp_alns.size(); -// int num_opposite_strand = 0; -// for (const GenomicRegion& supp_aln : supp_alns) { -// // Opposite-strand alignment on the same chromosome -// // (Since the 
iterator is single-chromosome, this is the case) -// if (supp_aln.strand != primary_map[qname].strand) { -// num_opposite_strand++; -// } -// } -// if (static_cast(num_opposite_strand) / static_cast(num_supp) > 0.5) { -// inversion = true; -// } -// } - -// // Use DBSCAN to cluster primary alignment start, end positions -// DBSCAN1D dbscan(100, 5); -// current_group++; -// std::vector starts; -// std::vector ends; -// std::vector primary_strands; -// for (const std::string& qname : primary_cluster) { -// const GenomicRegion& region = primary_map[qname]; -// starts.push_back(region.start); -// ends.push_back(region.end); -// primary_strands.push_back(region.strand); -// } - -// // Get the largest cluster of primary alignment start positions -// dbscan.fit(starts); -// std::vector primary_start_cluster = dbscan.getLargestCluster(starts); - -// // Get the largest cluster of primary alignment end positions -// dbscan.fit(ends); -// std::vector primary_end_cluster = dbscan.getLargestCluster(ends); - -// // Continue if no clusters were found -// if (primary_start_cluster.empty() && primary_end_cluster.empty()) { -// continue; -// } - -// // Get the supplementary alignment positions -// std::vector supp_starts; -// std::vector supp_ends; -// std::vector supp_strands; -// for (const std::string& qname : primary_cluster) { -// const std::vector& regions = supp_map[qname]; -// for (const GenomicRegion& region : regions) { -// supp_starts.push_back(region.start); -// supp_ends.push_back(region.end); -// supp_strands.push_back(region.strand); -// } -// } - -// // Get the largest cluster of supplementary alignment start positions -// dbscan.fit(supp_starts); -// std::vector supp_start_cluster = dbscan.getLargestCluster(supp_starts); - -// // Get the largest cluster of supplementary alignment end positions -// dbscan.fit(supp_ends); -// std::vector supp_end_cluster = dbscan.getLargestCluster(supp_ends); - -// // Continue if no clusters were found -// if 
(supp_start_cluster.empty() && supp_end_cluster.empty()) { -// continue; -// } - -// // Use the median of the largest cluster of primary and supplementary -// // alignment start, end positions as the final genome coordinates of the -// // SV -// int primary_pos = -1; -// int primary_pos2 = -1; -// int primary_cluster_size = 0; -// if (primary_start_cluster.size() > primary_end_cluster.size()) { -// std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); -// primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; -// primary_cluster_size = primary_start_cluster.size(); -// } else if (primary_end_cluster.size() > primary_start_cluster.size()) { -// std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); -// primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; -// primary_cluster_size = primary_end_cluster.size(); -// } else { -// // Use both positions -// std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); -// std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); -// primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; -// primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; -// primary_cluster_size = primary_start_cluster.size(); -// } - -// // Get the supplementary alignment positions -// int supp_pos = -1; -// int supp_pos2 = -1; -// int supp_cluster_size = 0; -// if (supp_start_cluster.size() > supp_end_cluster.size()) { -// std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); -// supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; -// supp_cluster_size = supp_start_cluster.size(); -// } else if (supp_end_cluster.size() > supp_start_cluster.size()) { -// std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); -// supp_pos = supp_end_cluster[supp_end_cluster.size() / 2]; -// supp_cluster_size = supp_end_cluster.size(); -// } else { -// // Use both positions. 
This has been shown to occur in nested SVs -// std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); -// std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); -// supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; -// supp_pos2 = supp_end_cluster[supp_end_cluster.size() / 2]; -// supp_cluster_size = supp_start_cluster.size(); -// } - -// // If two of either were found, use the larger SV candidate -// if (primary_pos2 != -1) { -// int sv_length1 = std::abs(primary_pos - supp_pos); -// int sv_length2 = std::abs(primary_pos2 - supp_pos); -// if (sv_length2 > sv_length1) { -// primary_pos = primary_pos2; -// } -// } -// if (supp_pos2 != -1) { -// int sv_length1 = std::abs(primary_pos - supp_pos); -// int sv_length2 = std::abs(primary_pos - supp_pos2); -// if (sv_length2 > sv_length1) { -// supp_pos = supp_pos2; -// } -// } - -// if (primary_pos == -1 || supp_pos == -1) { -// continue; -// } - -// // Store the SV candidate if the length is within the specified range -// int sv_start = std::min(primary_pos, supp_pos); -// int sv_end = std::max(primary_pos, supp_pos); -// int sv_length = sv_end - sv_start + 1; -// int cluster_size = std::max(primary_cluster_size, supp_cluster_size); - -// // Determine the SV type -// SVType sv_type = inversion ? 
SVType::INV : SVType::UNKNOWN; -// if (sv_length >= min_length && sv_length <= max_length) { -// SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); -// addSVCall(sv_calls, sv_candidate); -// // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion)); -// } -// } - -// // Combine SVs with identical start and end positions, and sum the cluster -// // sizes -// std::vector combined_sv_calls; -// std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { -// return a.start < b.start || (a.start == b.start && a.end < b.end); -// }); -// int merge_count = 0; -// for (size_t i = 0; i < sv_calls.size(); i++) { -// SVCall& sv_call = sv_calls[i]; -// if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) { -// sv_calls[i - 1].cluster_size += sv_call.cluster_size; -// merge_count++; -// } else { -// combined_sv_calls.push_back(sv_call); -// } -// } -// sv_calls = std::move(combined_sv_calls); - -// // if (merge_count > 0) { -// // printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); -// // } -// } - void SVCaller::findSplitSVSignatures(std::unordered_map> &sv_calls, const InputData &input_data) { // Open the BAM file @@ -378,8 +97,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map primary_map; - // std::unordered_map> supp_map; std::unordered_map> primary_map; // TID-> qname -> primary alignment std::unordered_map> supp_map; // qname -> supplementary alignment @@ -432,8 +149,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map qpos = getAlignmentReadPositions(bam1); primary_map[bam1->core.tid][qname] = PrimaryAlignment{bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, 
!(bam1->core.flag & BAM_FREVERSE), 0}; - // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0}; - // primary_map[qname] = GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}; alignment_tids.insert(bam1->core.tid); primary_count++; @@ -444,7 +159,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map qpos = getAlignmentReadPositions(bam1); supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0}); - // supp_map[qname].push_back(GenomicRegion{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), 0}); alignment_tids.insert(bam1->core.tid); supp_qnames.insert(qname); supplementary_count++; @@ -459,7 +173,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map> to_remove; for (auto& chr_primary : primary_map) { - // Get the qnames for this chromosome std::unordered_set qnames; for (const auto& entry : chr_primary.second) { if (supp_qnames.find(entry.first) == supp_qnames.end()) { @@ -478,35 +191,15 @@ void SVCaller::findSplitSVSignatures(std::unordered_map to_remove; - // for (const auto& entry : primary_map) { - // const std::string& qname = entry.first; - // if (supp_map.find(qname) == supp_map.end()) { - // to_remove.push_back(qname); - // } - // } - // for (const std::string& qname : to_remove) { - // primary_map.erase(qname); - // } - - for (const auto& chr_primary : primary_map) { int primary_tid = chr_primary.first; std::string chr_name = bamHdr->target_name[primary_tid]; printMessage("Processing chromosome " + chr_name + " with " + std::to_string(chr_primary.second.size()) + " primary alignments"); std::vector chr_sv_calls; - - // std::unordered_map> primary_map; // TID-> qname -> primary alignment - // const std::unordered_map>& - // chr_primary_map = 
chr_primary.second; const std::unordered_map& chr_primary_map = chr_primary.second; - // Identify overlapping primary alignments and then cluster their primary - // start, end vs. supplementary alignment start, end positions, keeping the - // median of the largest cluster for the primary and supplementary positions - // as the final genome coordinates of the SV - // IntervalNode* root = nullptr; + // Identify overlapping primary alignments and cluster endpoints std::unique_ptr root = nullptr; for (const auto& entry : chr_primary_map) { const std::string& qname = entry.first; @@ -531,7 +224,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map split_distance_cluster = dbscan.getLargestCluster(split_distances); - // printMessage("Found " + std::to_string(split_distance_cluster.size()) + " split distances (cluster size)"); // Continue if no clusters were found - // if (supp_start_cluster.empty() && supp_end_cluster.empty()) { if (supp_start_cluster.empty() && supp_end_cluster.empty() && split_distance_cluster.empty()) { continue; } @@ -685,8 +375,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); - // printMessage(chr_name + ": Found split insertion candidate " + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance-1)) + " with size " + std::to_string(read_distance) + " for group " + std::to_string(current_group)); - // continue; } } @@ -708,13 +396,9 @@ void SVCaller::findSplitSVSignatures(std::unordered_map supp_end_cluster.size()) { - // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - // supp_pos = supp_start_cluster[supp_start_cluster.size() / 2]; supp_pos = supp_best_start; supp_cluster_size = supp_start_cluster.size(); } else if (supp_end_cluster.size() > supp_start_cluster.size()) { - // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - // supp_pos = supp_end_cluster[supp_end_cluster.size() / 
2]; supp_pos = supp_best_end; supp_cluster_size = supp_end_cluster.size(); } else if (supp_best_end == -1 && supp_best_start == -1) { @@ -729,7 +413,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= 50) { SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, "", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size); addSVCall(chr_sv_calls, sv_candidate); - // printMessage(chr_name + ": Found inversion candidate " + std::to_string(supp_best_start) + "-" + std::to_string(supp_best_end) + " for group " + std::to_string(current_group)); } } @@ -777,17 +460,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map" : "."; SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); - // SVCall sv_candidate(sv_start, sv_end, SVType::UNKNOWN, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } - - // Determine the SV type - // SVType sv_type = inversion ? 
SVType::INV : SVType::UNKNOWN; - // if (sv_length >= min_length && sv_length <= max_length) { - // SVCall sv_candidate(sv_start, sv_end, sv_type, ".", "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); - // // printMessage(region + ": Found SV candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " for group " + std::to_string(current_group) + " with inversion status " + std::to_string(inversion)); - // } } // Combine SVs with identical start and end positions, and sum the cluster @@ -809,10 +483,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 0) { - // printMessage(region + ": Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); - // } } void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) @@ -848,209 +518,6 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c bam_destroy1(bam1); } -double SVCaller::calculateMismatchRate(const MismatchData& mismatch_data) -{ - int start = mismatch_data.query_start; - int end = mismatch_data.query_end; - const std::vector& mismatch_map = mismatch_data.match_map; - start = std::max(start, 0); - end = std::min(end, (int32_t)mismatch_map.size() - 1); - int match_count = 0; - int mismatch_count = 0; - int MATCH = 1; - int MISMATCH = -1; - for (int i = start; i <= end; i++) { - if (mismatch_map[i] == MATCH) { - match_count++; - } else if (mismatch_map[i] == MISMATCH) { - mismatch_count++; - } - } - - // Avoid division by zero - if (match_count + mismatch_count == 0) { - return 0.0; - } - - double mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); - - return mismatch_rate; -} - -void SVCaller::findSplitReadSVs(std::unordered_map> &sv_calls, const ReferenceGenome 
&ref_genome, const InputData& input_data) -{ - // Open the BAM file - std::string bam_filepath = input_data.getLongReadBam(); - samFile *fp_in = sam_open(bam_filepath.c_str(), "r"); - if (!fp_in) { - printError("ERROR: failed to open " + bam_filepath); - return; - } - - // Set maximum thread count - int thread_count = input_data.getThreadCount(); - hts_set_threads(fp_in, thread_count); - printMessage("Using " + std::to_string(thread_count) + " threads for split read analysis"); - - // Load the header - bam_hdr_t *bamHdr = sam_hdr_read(fp_in); - if (!bamHdr) { - sam_close(fp_in); - printError("ERROR: failed to read header from " + bam_filepath); - return; - } - - // Load the index - hts_idx_t *idx = sam_index_load(fp_in, bam_filepath.c_str()); - if (!idx) { - bam_hdr_destroy(bamHdr); - sam_close(fp_in); - printError("ERROR: failed to load index for " + bam_filepath); - return; - } - BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file - - // Create a whole-genome iterator - hts_itr_t *itr = sam_itr_queryi(idx, HTS_IDX_START, 0, 0); - if (!itr) { - printError("ERROR: failed to query the whole genome"); - return; - } - - // Process the alignments - std::unordered_map primary_map; - std::unordered_map> supp_map; - bam1_t *bam1 = bam_init1(); - if (!bam1) { - printError("ERROR: failed to initialize BAM record"); - return; - } - uint32_t primary_count = 0; - uint32_t supplementary_count = 0; - uint32_t num_alignments = 0; - printMessage("Processing split read alignment records..."); - while (readNextAlignment(fp_in, itr, bam1) >= 0) { - - // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality - if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { - continue; - } - const std::string qname = bam_get_qname(bam1); // Query template name - - // Process primary alignments - if (!(bam1->core.flag & 
BAM_FSUPPLEMENTARY)) { - - // Get the start and end positions in the read sequence - uint32_t query_start = 0; - uint32_t query_end = 0; - uint32_t* cigar = bam_get_cigar(bam1); - int cigar_len = bam1->core.n_cigar; - for (int i = 0; i < cigar_len; i++) { - int op_len = bam_cigar_oplen(cigar[i]); - int op = bam_cigar_op(cigar[i]); - - if (i == 0 && op == BAM_CSOFT_CLIP) { - query_start = op_len; - } - - // https://github.com/samtools/htslib/blob/develop/htslib/sam.h: - // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference - if (bam_cigar_type(op) & 1) { - query_end += op_len; - } - } - - // Store the SV signature - primary_map[qname] = SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end}; - primary_count++; - - // Process supplementary alignments - } else if (bam1->core.flag & BAM_FSUPPLEMENTARY) { - // Get the start and end positions in the read sequence - uint32_t query_start = 0; - uint32_t query_end = 0; - uint32_t* cigar = bam_get_cigar(bam1); - int cigar_len = bam1->core.n_cigar; - for (int i = 0; i < cigar_len; i++) { - int op_len = bam_cigar_oplen(cigar[i]); - int op = bam_cigar_op(cigar[i]); - - if (i == 0 && op == BAM_CSOFT_CLIP) { - query_start = op_len; - } - - // https://github.com/samtools/htslib/blob/develop/htslib/sam.h: - // bam_cigar_type(o) (BAM_CIGAR_TYPE>>((o)<<1)&3) // bit 1: consume query; bit 2: consume reference - if (bam_cigar_type(op) & 1) { - query_end += op_len; - } - } - - // Store the SV signature - supp_map[qname].push_back(SplitSignature{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), !(bam1->core.flag & BAM_FREVERSE), query_start, query_end}); - supplementary_count++; - } - num_alignments++; - - if (num_alignments % 100000 == 0) { - printMessage("Processed " + std::to_string(num_alignments) + " split read alignment records"); - } - } - - // Remove primary alignments without supplementary alignments - 
std::vector to_remove; - for (const auto& entry : primary_map) { - const std::string& qname = entry.first; - if (supp_map.find(qname) == supp_map.end()) { - to_remove.push_back(qname); - } - } - for (const std::string& qname : to_remove) { - primary_map.erase(qname); - } - - // Clean up the iterator and alignment - hts_itr_destroy(itr); - bam_destroy1(bam1); - printMessage("Found " + std::to_string(primary_map.size()) + " primary and " + std::to_string(supplementary_count) + " supplementary alignments"); - - // Find insertions by comparing the primary vs. supplementary alignment - // distances in the read vs. reference genome on the same chromosome - int ins_count = 0; - std::vector sv_candidates; - for (const auto& entry : primary_map) { - const std::string& qname = entry.first; - const SplitSignature& primary = entry.second; - const std::vector& supp_alns = supp_map[qname]; - - // TODO: Cluster positions for improved performance - - for (const SplitSignature& supp : supp_alns) { - if (primary.tid == supp.tid) { - int ref_dist = std::abs(primary.start - supp.start); - int query_dist = std::abs(primary.query_start - supp.query_start); - - // If the reads are within 100 bp of each other, and the - // reference distance is greater than 2kb, then it is likely an - // insertion - if (query_dist <= 100 && ref_dist >= 2000) { - int sv_start = std::min(primary.start, supp.start); - int sv_end = std::max(primary.start, supp.start); - int sv_length = sv_end - sv_start + 1; - int cluster_size = 1; - printMessage("Found insertion candidate " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length)); - SVCall sv_candidate(sv_start, sv_end, SVType::INS, "", "SPLITINS", "./.", 0.0, 0, 0, cluster_size); - std::string chr = bamHdr->target_name[primary.tid]; - sv_calls[chr].push_back(sv_candidate); - ins_count++; - } - } - } - } - - printMessage("Found " + std::to_string(ins_count) + " insertions"); -} - void 
SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, bool is_primary, const std::vector &pos_depth_map, const ReferenceGenome &ref_genome) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name @@ -1089,7 +556,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec ins_seq_str[j] = base; } } - // std::string ins_seq_str_rc = reverseComplement(ins_seq_str); // Before the insertion if (pos >= (uint32_t)op_len-1) @@ -1099,7 +565,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { - int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + int read_depth = this->getReadDepth(pos_depth_map, bp1); SVCall sv_call(bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); continue; @@ -1114,7 +580,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) { - int read_depth = this->calculateReadDepth(pos_depth_map, bp1, bp2); + int read_depth = this->getReadDepth(pos_depth_map, bp1); SVCall sv_call(bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); continue; @@ -1122,11 +588,9 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec } // Add as an insertion - // For read depth calculation, use the previous and current - // positions (1-based) uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - int read_depth = this->calculateReadDepth(pos_depth_map, ins_pos-1, ins_pos); + int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1); // Determine the ALT allele format based on small vs. 
large insertion std::string alt_allele = ""; @@ -1141,7 +605,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; - int read_depth = this->calculateReadDepth(pos_depth_map, ref_pos, ref_end); + int read_depth = this->getReadDepth(pos_depth_map, ref_pos); SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); } @@ -1149,10 +613,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Update the reference position // https://samtools.github.io/hts-specs/SAMv1.pdf - // if (bam_cigar_type(op) & 2) { - // // bit 2: consume reference - // ref_pos += op_len; - // } if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { pos += op_len; } @@ -1174,9 +634,6 @@ std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) int op_len = bam_cigar_oplen(cigar[i]); int op = bam_cigar_op(cigar[i]); - // if (i == 0 && op == BAM_CSOFT_CLIP) { - // query_start = op_len; - // } // Set the query start position to the first non-soft clip operation if (query_start == -1 && (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CEQUAL || op == BAM_CDIFF)) { query_start = query_end; // First valid query position @@ -1190,14 +647,13 @@ std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) } if (query_start == -1) { - // If no valid query start position was found, set it to 0 query_start = 0; } return std::make_pair(query_start, query_end); } -void SVCaller::processChromosome(const std::string& chr, const CHMM& hmm, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::vector& split_sv_calls) +void SVCaller::processChromosome(const std::string& chr, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const 
std::vector& chr_pos_depth_map, double mean_chr_cov) { // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -1321,12 +777,11 @@ void SVCaller::run(const InputData& input_data) std::vector sv_calls; std::vector split_sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, hmm, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], split_sv_calls); + this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); { std::shared_lock lock(this->shared_mutex); whole_genome_sv_calls[chr] = std::move(sv_calls); } - // printMessage("Completed chromosome " + chr); } catch (const std::exception& e) { printError("Error processing chromosome " + chr + ": " + e.what()); } catch (...) { @@ -1366,8 +821,6 @@ void SVCaller::run(const InputData& input_data) const std::string& chr = entry.first; std::vector& sv_calls = entry.second; if (sv_calls.size() > 0) { - // printMessage("Running copy number predictions on " + chr + - // "..."); printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); } @@ -1415,69 +868,27 @@ void SVCaller::run(const InputData& input_data) this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome); } -// Detect SVs from split read alignments +// Run copy number predictions on the SVs detected from the split reads void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) { - // Run copy number predictions on the SVs detected from the split reads std::vector processed_calls; for (const auto& sv_candidate : split_sv_calls) { - 
// printMessage("Processing SV candidate " + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " " + getSVTypeString(sv_candidate.sv_type) + "..."); - - // bool is_inversion = sv_candidate.sv_type == SVType::INV; - std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); std::string genotype = std::get<2>(result); if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) { - // if (is_inversion) { - // // Add an additional inversion separately - // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - // std::string alt_allele = ""; - // SVCall sv_call(sv_candidate.start, sv_candidate.end, SVType::INV, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); - // processed_calls.push_back(sv_call); - // } - - int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); + int read_depth = this->getReadDepth(pos_depth_map, sv_candidate.start); std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); - // printMessage("[SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); - // addSVCall(split_sv_calls, sv_call); processed_calls.push_back(sv_call); } - - // } else if (sv_candidate.sv_type == SVType::INV) { - // // SV with no copy number prediction, but is a potential inversion or insertion - // int read_depth = this->calculateReadDepth(pos_depth_map, sv_candidate.start, sv_candidate.end); - // // std::string alt_allele = ""; - // 
std::string alt_allele = "<" + getSVTypeString(sv_candidate.sv_type) + ">"; - // SVCall sv_call(sv_candidate.start, sv_candidate.end, sv_candidate.sv_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); - // printMessage("[TEST-SPLIT] Adding SV call: " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " " + getSVTypeString(sv_call.sv_type) + ", len=" + std::to_string(sv_call.end - sv_call.start) + ", type=" + getSVTypeString(sv_call.sv_type)); - // processed_calls.push_back(sv_call); - // } - // if (current_sv % 1000 == 0) { - // printMessage("Processed " + std::to_string(current_sv) + " of " + std::to_string(total_svs) + " SV candidates"); - // } } // Insert the copy number predictions back into the split SV calls printMessage("Inserting CNV calls..."); split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end()); mergeDuplicateSVs(split_sv_calls); - - // Remove any deletions with no HMM predictions (HMM likelihood is zero) - // int failed_del_count = 0; - // for (auto it = split_sv_calls.begin(); it != split_sv_calls.end();) { - // if (it->hmm_likelihood == 0.0 && it->sv_type == SVType::DEL) { - // it = split_sv_calls.erase(it); - // failed_del_count++; - // } else { - // ++it; - // } - // } - // if (failed_del_count > 0) { - // printMessage("Removed " + std::to_string(failed_del_count) + " failed deletion candidates with no HMM predictions"); - // } } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const @@ -1576,8 +987,6 @@ void SVCaller::saveToVCF(const std::unordered_map 0) { @@ -1701,26 +1081,14 @@ void SVCaller::saveToVCF(const std::unordered_map& pos_depth_map, uint32_t start, uint32_t end) +int SVCaller::getReadDepth(const std::vector& pos_depth_map, uint32_t start) { int read_depth = 0; try { - // printMessage("Read depth at start: " + std::to_string(pos_depth_map.at(start)) + " for SV at " 
+ std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); read_depth += pos_depth_map.at(start); } catch (const std::out_of_range& e) { - // std::cerr << "Warning: Start position " << start << " not found in - // depth map." << std::endl; printError("Error: Start position " + std::to_string(start) + " not found in depth map."); } - // UPDATE: Only use the start position for the read depth calculation - // try { - // // printMessage("Read depth at end: " + std::to_string(pos_depth_map.at(end)) + " for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start)); - // read_depth += pos_depth_map.at(end); - // } catch (const std::out_of_range& e) { - // printError("Error: End position " + std::to_string(end) + " not found in depth map."); - // // std::cerr << "Warning: End position " << end << " not found in depth map of size " << pos_depth_map.size() << "." << std::endl; - // } - // printMessage("Read depth for SV at " + std::to_string(start) + "-" + std::to_string(end) + " with length " + std::to_string(end-start) + ": " + std::to_string(read_depth)); return read_depth; } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index d316b726..e4bb4699 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -225,9 +225,7 @@ void mergeDuplicateSVs(std::vector &sv_calls) { int initial_size = sv_calls.size(); std::vector combined_sv_calls; - // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return a.start < b.start; - // }); + // Sort first by start position, then by SV type std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { return std::tie(a.start, a.sv_type) < std::tie(b.start, b.sv_type); @@ -247,21 +245,7 @@ void mergeDuplicateSVs(std::vector &sv_calls) else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size > sv_calls[i - 1].cluster_size) { combined_sv_calls.back() = sv_call; 
} - // // Keep the larger cluster size for the same start position - // if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) { - // combined_sv_calls.back() = sv_call; - // } - - // // If cluster sizes are equal, keep the one with non-zero likelihood - // // The HMM prediction is more reliable than the split read prediction - // else if (sv_call.cluster_size == sv_calls[i - 1].cluster_size && sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { - // combined_sv_calls.back() = sv_call; - // } - - // // Combine cluster sizes - // combined_sv_calls.back().cluster_size += sv_call.cluster_size; } else { - // Add the SV call to the combined list combined_sv_calls.push_back(sv_call); } } @@ -271,52 +255,3 @@ void mergeDuplicateSVs(std::vector &sv_calls) printMessage("Merged " + std::to_string(merge_count) + " SV candidates with identical start and end positions"); } } - -void mergeSVSubsets(std::vector &sv_calls) -{ - // Sort the SV calls by start position - int initial_size = sv_calls.size(); - std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start; - }); - - // Remove SVs that are subsets of other SVs - std::vector filtered_sv_calls; - // Since the input SV calls are sorted by start position, we can iterate - // through them in order and only keep the SVs that are not subsets of - // others - for (const auto& sv_call : sv_calls) { - // Check if the current SV call is a subset of any previously added - // SV call - bool is_subset = false; - for (const auto& filtered_sv_call : filtered_sv_calls) { - if (sv_call.start >= filtered_sv_call.start && sv_call.end <= filtered_sv_call.end) { - is_subset = true; - break; - } - } - // If it's not a subset, add it to the filtered list - if (!is_subset) { - filtered_sv_calls.push_back(sv_call); - } - } - sv_calls = std::move(filtered_sv_calls); // Replace with filtered list - int updated_size = sv_calls.size(); - printMessage("Filtered SV calls to remove 
subsets, from " + std::to_string(initial_size) + " to " + std::to_string(updated_size)); -} - -void filterSVsWithLowSupport(std::vector &sv_calls, int min_support) -{ - // Filter SV calls with low read support or low cluster size - sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support](const SVCall& sv_call) { - return sv_call.support < min_support && sv_call.cluster_size < min_support; - }), sv_calls.end()); -} - -void filterSVsWithLowSupport(std::vector &sv_calls, int min_support, const std::string &data_type) -{ - // Filter SV calls with low read depth only for the specified data type, keeping the rest - sv_calls.erase(std::remove_if(sv_calls.begin(), sv_calls.end(), [min_support, data_type](const SVCall& sv_call) { - return sv_call.support < min_support && sv_call.data_type == data_type; - }), sv_calls.end()); -} From c10a530c033a5639f4ef3e96135819c73f6eace2 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 15 Mar 2025 09:18:04 -0400 Subject: [PATCH 086/134] cnv merge fix --- src/cnv_caller.cpp | 170 ++------------------------------------------- src/sv_caller.cpp | 30 ++++---- 2 files changed, 22 insertions(+), 178 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c23c146b..f9f346a8 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -26,7 +26,6 @@ #include // std::pair #include #include // std::execution::par -// #include #include "utils.h" #include "sv_types.h" @@ -55,17 +54,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end { // Initialize the SNP data with default values and sample size length int sample_size = input_data.getSampleSize(); - // int region_length = (int) (end_pos - start_pos + 1); - // if (region_length < sample_size) - // { - // sample_size = region_length; - // } - - // std::vector snp_pos(sample_size, 0); - // std::vector snp_baf(sample_size, -1.0); - // std::vector snp_pfb(sample_size, 0.5); - // std::vector snp_log2_cov(sample_size, 0.0); - // 
std::vector is_snp(sample_size, false); std::vector snp_pos; std::vector snp_baf; std::vector snp_pfb; @@ -74,15 +62,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end // Get the log2 ratio for evenly spaced positions in the // region - // this->calculateSNPLog2Ratios(snp_pos, snp_log2_cov, pos_depth_map, - // mean_chr_cov, input_data); sample_size = std::max((int) snp_pos.size(), sample_size); - //printMessage("Sample size: " + std::to_string(sample_size)); - // std::vector snp_pos_hmm(sample_size, 0); - // std::vector snp_baf_hmm(sample_size, -1.0); - // std::vector snp_pfb_hmm(sample_size, 0.5); - // std::vector snp_log2_hmm(sample_size, 0.0); - // std::vector is_snp_hmm(sample_size, false); std::vector snp_pos_hmm; std::vector snp_baf_hmm; std::vector snp_pfb_hmm; @@ -91,9 +71,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end // Loop through evenly spaced positions in the region and get the log2 ratio double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size; - // Convert SNP positions for faster access (convert to a set) std::unordered_set snp_pos_set(snp_pos.begin(), snp_pos.end()); - for (int i = 0; i < sample_size; i++) { // Calculate the mean depth for the window @@ -145,10 +123,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end } } - // If no SNP was found in the sample, then use the middle of the window - // as a placeholder - // This is to ensure that the HMM has a value for every position in the - // sample + // If no SNP was found in the sample, then use the center position if (!snp_found_in_sample) { uint32_t pos = (uint32_t) (start_pos + (i * pos_step) + (pos_step / 2.0)); @@ -159,7 +134,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end is_snp_hmm.push_back(false); } } - // this->calculateRegionLog2Ratio(start_pos, end_pos, sample_size, pos_depth_map, mean_chr_cov, snp_log2_cov); // Update 
the SNP data with all information snp_data.pos = std::move(snp_pos_hmm); @@ -217,8 +191,6 @@ std::tuple CNVCaller::runCopyNumberPrediction // Determine if there is a majority state within the SV region and if it // is greater than 75% double pct_threshold = 0.75; - // double pct_threshold = 0.90; - // double pct_threshold = 0.80; int max_state = 0; int max_count = 0; @@ -345,7 +317,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 } // Calculate the mean chromosome coverage -// double CNVCaller::calculateMeanChromosomeCoverage(std::string chr, -// std::vector& chr_pos_depth_map, const std::string& bam_filepath, -// int thread_count) const void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& chromosomes, std::unordered_map>& chr_pos_depth_map, std::unordered_map& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const { // Open the BAM file @@ -452,11 +420,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& { continue; } - // if (bam_record->core.flag & BAM_FUNMAP || bam_record->core.flag & BAM_FSECONDARY || bam_record->core.flag & BAM_FQCFAIL || bam_record->core.flag & BAM_FDUP) - // { - // continue; - // } - + // Parse the CIGAR string to get the depth (match, sequence match, and // mismatch) uint32_t pos = (uint32_t)bam_record->core.pos + 1; // 0-based to 1-based @@ -478,11 +442,6 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& continue; } pos_depth_map[ref_pos + j]++; - // try { - // chr_pos_depth_map[ref_pos + j]++; - // } catch (const std::out_of_range& oor) { - // printError("Out of range error for " + chr + ":" + std::to_string(ref_pos+j)); - // } } } @@ -497,31 +456,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& } } } - - // Clean up the iterator hts_itr_destroy(bam_iter); - - // printMessage("Finished reading BAM file, calculating mean chromosome coverage..."); - - // // 
Calculate the mean chromosome coverage for positions with non-zero depth - // uint64_t cum_depth = 0; - // uint32_t pos_count = 0; - // for (const auto& pos_depth : chr_pos_depth_map) - // { - // if (pos_depth > 0) - // { - // cum_depth += pos_depth; - // pos_count++; - // } - // } - - // double mean_chr_cov = 0.0; - // if (pos_count > 0) - // { - // mean_chr_cov = static_cast(cum_depth) / static_cast(pos_count); - // } - // printMessage("Completed calculating mean chromosome coverage: " + - // std::to_string(mean_chr_cov)); // Parallel sum of the depth map uint64_t cum_depth = std::reduce( @@ -539,73 +474,11 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& [](uint32_t depth) { return depth > 0; } ); - // printMessage("Number of positions with non-zero depth: " + std::to_string(pos_count)); - // printMessage("Total depth: " + std::to_string(cum_depth)); - double mean_chr_cov = (pos_count > 0) ? static_cast(cum_depth) / static_cast(pos_count) : 0.0; chr_mean_cov_map[chr] = mean_chr_cov; - - // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Mean chromosome coverage for " + chr + ": " + std::to_string(mean_chr_cov)); } - - // Clean up - // sam_close(bam_file); } -// void CNVCaller::calculateSNPLog2Ratios(const std::vector& snp_pos, const std::vector& snp_log2_cov, const std::vector& pos_depth_map, double mean_chr_cov) const -// { -// // Calculate the log2 ratio for each SNP position -// for (size_t i = 0; i < snp_pos.size(); i++) -// { -// uint32_t pos = snp_pos[i]; -// try { -// uint32_t depth = pos_depth_map.at(pos); - -// // Calculate the log2 ratio for the position -// if (depth == 0) -// { -// snp_log2_cov[i] = 0.0; -// } else { -// snp_log2_cov[i] = log2((double) depth / mean_chr_cov); -// } - -// } catch (const std::out_of_range& e) { -// snp_log2_cov[i] = 0.0; -// } -// } -// } - -// void CNVCaller::calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const 
std::vector& pos_depth_map, double mean_chr_cov, std::vector& log2_region, std::vector& snp_pos) const -// { -// uint32_t region_length = end_pos - start_pos + 1; -// double step_size = (double) region_length / sample_size; -// std::set snp_pos_set(snp_pos.begin(), snp_pos.end()); - -// // Loop through each interval in the region and calculate the log2 ratio -// for (int i = 0; i < sample_size; i++) -// { -// uint32_t pos = start_pos + (uint32_t) (i * step_size); -// if (pos > end_pos) -// { -// pos = end_pos; -// } -// try { -// uint32_t depth = pos_depth_map.at(pos); - -// // Calculate the log2 ratio for the position -// if (depth == 0) -// { -// log2_region[i] = 0.0; -// } else { -// log2_region[i] = log2((double) depth / mean_chr_cov); -// } - -// } catch (const std::out_of_range& e) { -// log2_region[i] = 0.0; -// } -// } -// } - void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const { // Lock during reading @@ -784,13 +657,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Add the SNP position and BAF information snp_pos.push_back(pos); snp_baf.push_back(baf); - // is_snp.push_back(true); snp_pfb.push_back(0.5); - // snp_pos[i] = pos; - // snp_baf[i] = baf; - // is_snp[i] = true; snp_found = true; - // break; // Only one SNP per region } } @@ -802,7 +670,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Continue if no SNP was found in the region if (!snp_found) { - // printMessage("No SNP found in region: " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos)); bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); return; @@ -827,18 +694,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui printError("ERROR: Could not set region for population allele frequency reader: " 
+ pfb_region_str); } - // for (size_t i = 0; i < snp_pos.size(); ++i) - // { - // Set the region as the SNP position - // printMessage("Setting region for population allele frequency reader..."); - // uint32_t target_snp_pos = snp_pos[i]; // Already 1-based - // std::string snp_region_str = chr_gnomad + ":" + std::to_string(target_snp_pos) + "-" + std::to_string(target_snp_pos); - // if (bcf_sr_set_regions(pfb_reader, snp_region_str.c_str(), 0) < 0) - // { - // printError("ERROR: Could not set region for population allele frequency reader: " + snp_region_str); - // } - // printMessage("Region set for population allele frequency reader, loading population allele frequency data..."); - // Find the SNP position in the population allele frequency file float *pfb_f = NULL; int count = 0; @@ -867,36 +722,19 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { continue; } - // double pfb = (double) pfb_f[0]; double pfb = static_cast(pfb_f[0]); - // free(pfb_f); // Skip if outside the acceptable range if (pfb <= MIN_PFB || pfb >= MAX_PFB) { continue; } - - // Add the population frequency to the SNP data snp_pfb[i] = pfb; - - break; // Break after finding the SNP position - - // if (print_count < 20) { - // printMessage("SNP " + std::to_string(snp_pos[i]) + " BAF: " + std::to_string(snp_baf[i]) + " PFB: " + std::to_string(snp_pfb[i]) + " (Region: " + snp_region_str + ")"); - // print_count++; - // } + break; } free(pfb_f); - - // if (pfb_reader->errnum) - // { - // printError("ERROR: " + std::string(bcf_sr_strerror(pfb_reader->errnum))); - // } - // } } - // } - + // Clean up bcf_sr_destroy(snp_reader); bcf_sr_destroy(pfb_reader); diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index a58d985d..e2d429bb 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -872,23 +872,26 @@ void SVCaller::run(const InputData& input_data) void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const 
CNVCaller& cnv_caller, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) { std::vector processed_calls; - for (const auto& sv_candidate : split_sv_calls) { + for (auto& sv_candidate : split_sv_calls) { std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); std::string genotype = std::get<2>(result); - if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) { - int read_depth = this->getReadDepth(pos_depth_map, sv_candidate.start); - std::string alt_allele = "<" + getSVTypeString(supp_type) + ">"; - SVCall sv_call(sv_candidate.start, sv_candidate.end, supp_type, alt_allele, "SPLIT", genotype, supp_lh, read_depth, 1, sv_candidate.cluster_size); - processed_calls.push_back(sv_call); + + // For inversions with copy-neutral support, update the HMM likelihood + if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) { + sv_candidate.hmm_likelihood = supp_lh; } - } - // Insert the copy number predictions back into the split SV calls - printMessage("Inserting CNV calls..."); - split_sv_calls.insert(split_sv_calls.end(), processed_calls.begin(), processed_calls.end()); - mergeDuplicateSVs(split_sv_calls); + // Update the SV type if the support is not neutral or unknown + else if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) { + sv_candidate.sv_type = supp_type; + sv_candidate.alt_allele = "<" + getSVTypeString(supp_type) + ">"; + sv_candidate.data_type += "+HMM"; // Update the data type to include HMM + sv_candidate.genotype = genotype; + sv_candidate.hmm_likelihood = supp_lh; + } + } } void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const @@ -997,9 +1000,11 @@ void SVCaller::saveToVCF(const std::unordered_map Date: Tue, 18 Mar 2025 
15:19:34 -0400 Subject: [PATCH 087/134] fix genotypes --- include/cnv_caller.h | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 6b10bc29..4a250692 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -57,21 +57,30 @@ class CNVCaller { // Each of the 6 state predictions corresponds to a copy number state // (0=No predicted state) // 0: Unknown (No predicted state) - // 1: 0/0 (Two copy loss: homozygous deletion, GT: 0/0) - // 2: 1/0 (One copy loss: heterozygous deletion, GT: 0/1) - // 3: 1/1 (Normal diploid: no copy number change, GT: 1/1) - // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1) + // 1: 1/1 (Two copy loss: homozygous deletion, GT: 1/1 for homozygous variant) + // 2: 0/1 (One copy loss: heterozygous deletion, GT: 0/1) + // 3: 0/0 (Normal diploid: no copy number change, GT: 0/0 for homozygous reference) + // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1 for homozygous variant) // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1) // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1) - std ::map cnv_genotype_map = { + std::map cnv_genotype_map = { {0, "./."}, - {1, "0/0"}, + {1, "1/1"}, {2, "0/1"}, - {3, "1/1"}, + {3, "0/0"}, {4, "1/1"}, {5, "0/1"}, {6, "1/1"} }; + // std ::map cnv_genotype_map = { + // {0, "./."}, + // {1, "0/0"}, + // {2, "0/1"}, + // {3, "1/1"}, + // {4, "1/1"}, + // {5, "0/1"}, + // {6, "1/1"} + // }; void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); From 877d1ff7440a51557735aed989a9d80a93813004 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 17:02:03 -0400 Subject: [PATCH 088/134] clean up comments and save cnv in json format --- include/cnv_caller.h | 20 +---- include/input_data.h | 4 + include/utils.h | 2 + src/cnv_caller.cpp | 175 +++++++++++++++++++++++++++++++++++++++---- src/input_data.cpp | 11 +++ src/khmm.cpp | 
40 +--------- src/main.cpp | 13 ++++ src/sv_caller.cpp | 8 +- src/sv_object.cpp | 8 +- src/utils.cpp | 10 ++- 10 files changed, 210 insertions(+), 81 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 4a250692..b424b6c2 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -46,10 +46,6 @@ struct SNPData { // CNVCaller: Detect CNVs and return the state sequence by SNP position class CNVCaller { private: - //mutable std::mutex snp_file_mtx; // SNP file mutex - //mutable std::mutex pfb_file_mtx; // Population frequency file mutex - //mutable std::mutex bam_file_mtx; // BAM file mutex - // std::mutex& shared_mutex; std::shared_mutex& shared_mutex; // Define a map of CNV genotypes by HMM predicted state. @@ -72,15 +68,6 @@ class CNVCaller { {5, "0/1"}, {6, "1/1"} }; - // std ::map cnv_genotype_map = { - // {0, "./."}, - // {1, "0/0"}, - // {2, "0/1"}, - // {3, "1/1"}, - // {4, "1/1"}, - // {5, "0/1"}, - // {6, "1/1"} - // }; void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); @@ -102,15 +89,14 @@ class CNVCaller { // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; - // double calculateMeanChromosomeCoverage(std::string chr, std::vector& chr_pos_depth_map, const std::string& bam_filepath, int thread_count) const; void calculateMeanChromosomeCoverage(const std::vector& chromosomes, std::unordered_map>& chr_pos_depth_map, std::unordered_map& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const; - // void calculateRegionLog2Ratio(uint32_t start_pos, uint32_t end_pos, int sample_size, const std::vector& pos_depth_map, double mean_chr_cov, std::vector& pos_log2) const; - - void readSNPAlleleFrequencies(std::string chr, uint32_t 
start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const; + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, const InputData& input_data) const; // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const; + + void saveSVCopyNumberToJSON(SNPData& before_sv, SNPData& after_sv, SNPData& snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const; }; #endif // CNV_CALLER_H diff --git a/include/input_data.h b/include/input_data.h index 106c70b6..0687af76 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -100,6 +100,9 @@ class InputData { // (+/- 1/2 SV length), save a TSV file, and generate HTML reports. 
void saveCNVData(bool save_cnv_data); bool getSaveCNVData() const; + + void setCNVOutputFile(std::string filepath); + std::string getCNVOutputFile() const; private: std::string short_read_bam; @@ -124,6 +127,7 @@ class InputData { bool verbose; // True if verbose output is enabled bool save_cnv_data; // True if SNP CNV regions should be extended around SV breakpoints, and saved to a TSV file (Large performance hit) bool single_chr; + std::string cnv_output_file; }; #endif // INPUT_DATA_H diff --git a/include/utils.h b/include/utils.h index 6ec95610..e7bf164a 100644 --- a/include/utils.h +++ b/include/utils.h @@ -59,4 +59,6 @@ std::string removeChrPrefix(std::string chr); void printMemoryUsage(const std::string &functionName); +bool fileExists(const std::string &filepath); + #endif // UTILS_H diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index f9f346a8..b5d61745 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -57,8 +57,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::vector snp_pos; std::vector snp_baf; std::vector snp_pfb; - std::vector is_snp; - this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, is_snp, input_data); + this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, input_data); // Get the log2 ratio for evenly spaced positions in the // region @@ -97,6 +96,11 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end double log2_cov = 0.0; if (pos_count > 0) { + if (cov_sum == 0) + { + // Use a small value to avoid division by zero + cov_sum = 1e-9; + } log2_cov = log2((cov_sum / (double) pos_count) / mean_chr_cov); } @@ -156,11 +160,24 @@ std::tuple CNVCaller::runCopyNumberPrediction // Only extend the region if "save CNV data" is enabled uint32_t snp_start_pos = start_pos; uint32_t snp_end_pos = end_pos; + SNPData before_sv; + SNPData after_sv; if (input_data.getSaveCNVData()) { uint32_t sv_half_length = 
(end_pos - start_pos) / 2.0; - snp_start_pos = start_pos > sv_half_length ? start_pos - sv_half_length : 1; - snp_end_pos = end_pos + sv_half_length; + if (start_pos > 1) + { + uint32_t before_sv_start = std::max((uint32_t) 1, start_pos - sv_half_length); + uint32_t before_sv_end = start_pos - 1; + querySNPRegion(chr, before_sv_start, before_sv_end, pos_depth_map, mean_chr_cov, before_sv, input_data); + } + uint32_t chr_last_index = pos_depth_map.size() - 1; + if (end_pos < chr_last_index) + { + uint32_t after_sv_start = end_pos + 1; + uint32_t after_sv_end = std::min(chr_last_index, end_pos + sv_half_length); + querySNPRegion(chr, after_sv_start, after_sv_end, pos_depth_map, mean_chr_cov, after_sv, input_data); + } } // Query the SNP region for the SV candidate @@ -217,16 +234,17 @@ std::tuple CNVCaller::runCopyNumberPrediction } snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data - // Save the SV calls as a TSV file if enabled + // Save the SV calls if enabled bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); - // if (save_cnv_data && copy_number_change && (end_pos - start_pos) > 10000) - if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 10000) + if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 50000) { std::string cnv_type_str = getSVTypeString(predicted_cnv_type); - const std::string output_dir = input_data.getOutputDir(); - std::string sv_filename = output_dir + "/" + cnv_type_str + "_" + chr + "_" + std::to_string((int) start_pos) + "-" + std::to_string((int) end_pos) + "_SPLITALN.tsv"; - printMessage("Saving SV split-alignment copy number predictions to " + sv_filename + "..."); - this->saveSVCopyNumberToTSV(snp_data, sv_filename, chr, start_pos, end_pos, cnv_type_str, likelihood); + // const std::string output_dir = input_data.getOutputDir(); + // std::string json_filepath = output_dir + 
"/CNVCalls.json"; + std::string json_filepath = input_data.getCNVOutputFile(); + printMessage("Saving SV copy number predictions to " + json_filepath + "..."); + + this->saveSVCopyNumberToJSON(before_sv, after_sv, snp_data, chr, start_pos, end_pos, cnv_type_str, likelihood, json_filepath); } return std::make_tuple(likelihood, predicted_cnv_type, genotype, true); @@ -479,7 +497,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& } } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, std::vector& is_snp, const InputData& input_data) const +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, const InputData& input_data) const { // Lock during reading std::shared_lock lock(this->shared_mutex); @@ -593,7 +611,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Read the SNP data ---------------------------------------------- // Set the region - if (bcf_sr_set_regions(snp_reader, chr.c_str(), 0) < 0) + std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); + if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) //chr.c_str(), 0) < 0) { printError("ERROR: Could not set region for SNP reader: " + chr); bcf_sr_destroy(snp_reader); @@ -822,6 +841,136 @@ void CNVCaller::saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, s tsv_file.close(); } +void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SNPData &snp_data, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood, const std::string& filepath) const +{ + // Append the SV information to the JSON file + std::ofstream json_file(filepath, std::ios::app); + if (!json_file.is_open()) + { + std::cerr << "ERROR: Could not open JSON file for 
writing: " << filepath << std::endl; + exit(1); + } + json_file << "{\n"; + json_file << " \"chromosome\": \"" << chr << "\",\n"; + json_file << " \"start\": " << start << ",\n"; + json_file << " \"end\": " << end << ",\n"; + json_file << " \"sv_type\": \"" << sv_type << "\",\n"; + json_file << " \"likelihood\": " << likelihood << ",\n"; + json_file << " \"before_sv\": {\n"; + json_file << " \"positions\": ["; + for (size_t i = 0; i < before_sv.pos.size(); ++i) + { + json_file << before_sv.pos[i]; + if (i < before_sv.pos.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"b_allele_freq\": ["; + for (size_t i = 0; i < before_sv.baf.size(); ++i) + { + json_file << before_sv.baf[i]; + if (i < before_sv.baf.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"population_freq\": ["; + for (size_t i = 0; i < before_sv.pfb.size(); ++i) + { + json_file << before_sv.pfb[i]; + if (i < before_sv.pfb.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"log2_ratio\": ["; + for (size_t i = 0; i < before_sv.log2_cov.size(); ++i) + { + json_file << before_sv.log2_cov[i]; + if (i < before_sv.log2_cov.size() - 1) + json_file << ", "; + } + json_file << "]\n"; + json_file << " },\n"; + json_file << " \"after_sv\": {\n"; + json_file << " \"positions\": ["; + for (size_t i = 0; i < after_sv.pos.size(); ++i) + { + json_file << after_sv.pos[i]; + if (i < after_sv.pos.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"b_allele_freq\": ["; + for (size_t i = 0; i < after_sv.baf.size(); ++i) + { + json_file << after_sv.baf[i]; + if (i < after_sv.baf.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"population_freq\": ["; + for (size_t i = 0; i < after_sv.pfb.size(); ++i) + { + json_file << after_sv.pfb[i]; + if (i < after_sv.pfb.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"log2_ratio\": ["; + for (size_t i = 0; i < 
after_sv.log2_cov.size(); ++i) + { + json_file << after_sv.log2_cov[i]; + if (i < after_sv.log2_cov.size() - 1) + json_file << ", "; + } + json_file << "]\n"; + json_file << " },\n"; + json_file << " \"sv\": {\n"; + json_file << " \"positions\": ["; + for (size_t i = 0; i < snp_data.pos.size(); ++i) + { + json_file << snp_data.pos[i]; + if (i < snp_data.pos.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"b_allele_freq\": ["; + for (size_t i = 0; i < snp_data.baf.size(); ++i) + { + json_file << snp_data.baf[i]; + if (i < snp_data.baf.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"population_freq\": ["; + for (size_t i = 0; i < snp_data.pfb.size(); ++i) + { + json_file << snp_data.pfb[i]; + if (i < snp_data.pfb.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"log2_ratio\": ["; + for (size_t i = 0; i < snp_data.log2_cov.size(); ++i) + { + json_file << snp_data.log2_cov[i]; + if (i < snp_data.log2_cov.size() - 1) + json_file << ", "; + } + json_file << "],\n"; + json_file << " \"states\": ["; + for (size_t i = 0; i < snp_data.state_sequence.size(); ++i) + { + json_file << snp_data.state_sequence[i]; + if (i < snp_data.state_sequence.size() - 1) + json_file << ", "; + } + json_file << "]\n"; + json_file << " }\n"; + json_file << "}\n"; + json_file.close(); + printMessage("Saved copy number predictions for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + " to " + filepath); +} + void CNVCaller::updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp) { // Update the SNP data diff --git a/src/input_data.cpp b/src/input_data.cpp index 7a073dae..5661eb3b 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -35,6 +35,7 @@ InputData::InputData() this->verbose = false; this->save_cnv_data = false; this->single_chr = false; + this->cnv_output_file = ""; } std::string InputData::getShortReadBam() const @@ -403,3 
+404,13 @@ bool InputData::getSaveCNVData() const { return this->save_cnv_data; } + +void InputData::setCNVOutputFile(std::string filepath) +{ + this->cnv_output_file = filepath; +} + +std::string InputData::getCNVOutputFile() const +{ + return this->cnv_output_file; +} diff --git a/src/khmm.cpp b/src/khmm.cpp index fcc4899d..43ed3958 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -55,15 +55,8 @@ std::pair, double> testVit_CHMM(CHMM hmm, int T, std::vector mean, std::vector sd, double uf, double o) { - // if (o < mean[1]) - // { - // o = mean[1]; - // } - // double p = uf + ((1 - uf) * pdf_normal(o, mean[state], sd[state])); - // Get the values (0-based indexing) if (o < mean[0]) { @@ -74,22 +67,8 @@ double b1iot(int state, std::vector mean, std::vector sd, double return log(p); } -// double b2iot(int state, double *mean, double *sd, double uf, double pfb, double b) double b2iot(int state, const std::vector mean, const std::vector sd, double uf, double pfb, double b) { - // double p = 0; - // double mean0 = mean[1]; // mean[1] = 0 - // double mean25 = mean[2]; // mean[2] = 0.25 - // double mean33 = mean[3]; // mean[3] = 0.33 - // double mean50 = mean[4]; // mean[4] = 0.5 - // double mean50_state1 = mean[5]; // mean[5] = 0.5 - // double sd0 = sd[1]; // sd[1] = 0 - // double sd25 = sd[2]; // sd[2] = 0.25 - // double sd33 = sd[3]; // sd[3] = 0.33 - // double sd50 = sd[4]; // sd[4] = 0.5 - // double sd50_state1 = sd[5]; // sd[5] = 0.5 - // p = uf; // UF = previous alpha (transition probability) - // Get the values (0-based indexing) double p = 0; double mean0 = mean[0]; // mean[0] = 0 @@ -275,7 +254,6 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect { for (j = 1; j <= hmm.N; j++) { - // A1[i][j] = hmm.A[i][j]; // Update for 0-based indexing A1[i][j] = hmm.A[i-1][j-1]; } @@ -333,11 +311,7 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect /* 1. 
Initialization */ for (i = 1; i <= hmm.N; i++) - { - // delta[1][i] = hmm.pi[i] + biot[i][1]; // Initialize the delta matrix - // (log probability) to the initial state distribution + the emission - // probability - + { // Update to 0-based indexing delta[1][i] = hmm.pi[i-1] + biot[i][1]; // Initialize the delta matrix psi[1][i] = 0; // Initialize the psi matrix (state sequence) to 0 (no state) @@ -396,20 +370,8 @@ std::pair, double> ViterbiLogNP_CHMM(CHMM hmm, int T, std::vect q[t] = psi[t + 1][q[t + 1]]; } - // // Print t, the state, delta, biot, and psi - // for (t = 1; t <= T; t++) - // { - // std::cout << "Time " << t << " with state " << q[t] << ":" << std::endl; - // for (i = 1; i <= hmm.N; i++) - // { - // std::cout << "State " << i << ": delta = " << delta[t][i] << ", biot = " << biot[i][t] << ", psi = " << psi[t][i] << ", LRR = " << O1[t-1] << ", BAF = " << O2[t-1] << std::endl; - // } - // std::cout << std::endl; - // } - for (i = 1; i <= hmm.N; i++) { /*recover the HMM model as original*/ - // hmm.pi[i] = exp(hmm.pi[i]); // Update to 0-based indexing hmm.pi[i-1] = exp(hmm.pi[i-1]); } diff --git a/src/main.cpp b/src/main.cpp index e493cd4e..b793619b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -72,6 +72,19 @@ void runContextSV(const std::unordered_map& args) input_data.setDBSCAN_MinPtsPct(std::stod(args.at("min-pts-pct"))); } + // Set up the CNV JSON file if enabled + if (input_data.getSaveCNVData()) { + const std::string output_dir = input_data.getOutputDir(); + std::string json_filepath = output_dir + "/CNVCalls.json"; + int json_file_count = 1; + while (fileExists(json_filepath)) { + json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json"; + json_file_count++; + } + input_data.setCNVOutputFile(json_filepath); + std::cout << "Saving CNV data to: " << json_filepath << std::endl; + } + // Run ContextSV run(input_data); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index e2d429bb..3dc567bc 100644 --- 
a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -373,7 +373,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map", "SPLITINS", "./.", 0.0, 0, 0, primary_cluster_size); + SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "", "SPLITDIST1", "./.", 0.0, 0, 0, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -445,12 +445,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 2kb, then this is a // potential deletion if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDEL", "./.", 0.0, 0, 0, cluster_size); + SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); // Add an inversion call if necessary if (inversion) { - SVCall sv_candidate(sv_start, sv_end, SVType::INV, "", "INVDEL", "./.", 0.0, 0, 0, cluster_size); + SVCall sv_candidate(sv_start, sv_end, SVType::INV, "", "SPLITINV", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -459,7 +459,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; std::string alt = (sv_type == SVType::INV) ? 
"" : "."; - SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "PRIMSUPP", "./.", 0.0, 0, 0, cluster_size); + SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "SPLIT", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } diff --git a/src/sv_object.cpp b/src/sv_object.cpp index e4bb4699..9934805a 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -18,10 +18,6 @@ bool SVCall::operator<(const SVCall & other) const void addSVCall(std::vector& sv_calls, SVCall& sv_call) { - // if (sv_call.sv_type == SVType::UNKNOWN || sv_call.sv_type == SVType::NEUTRAL) { - // return; - // } - // Check if the SV call is valid if (sv_call.start > sv_call.end) { printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end)); @@ -232,7 +228,9 @@ void mergeDuplicateSVs(std::vector &sv_calls) }); for (size_t i = 0; i < sv_calls.size(); i++) { SVCall& sv_call = sv_calls[i]; - if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.sv_type == sv_calls[i - 1].sv_type) { + // For SVs at the same start position with the same SV type, keep the one + // with the highest likelihood + if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) { // Keep the SV call with a non-zero likelihood // The HMM prediction is more reliable than the split read prediction if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { diff --git a/src/utils.cpp b/src/utils.cpp index bb82abbc..884139a5 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -6,6 +6,7 @@ #include #include #include +#include /// @endcond @@ -15,10 +16,7 @@ std::mutex print_mtx; // Print a progress bar void printProgress(int progress, int total) { - // Get the percentage float percent = (float)progress / (float)total * 100.0; - - // Get the number of hashes int num_hashes = (int)(percent / 2.0); // Print 
the progress bar @@ -121,3 +119,9 @@ void printMemoryUsage(const std::string& functionName) { std::cout << functionName << " memory usage: " << std::fixed << std::setprecision(2) << mem_usage_gb << " GB" << std::endl; } + +bool fileExists(const std::string &filepath) +{ + std::ifstream file(filepath); + return file.is_open(); +} From 53cc23dc2a1a175aa603c4f29367e435c4a6b9e5 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 17:43:47 -0400 Subject: [PATCH 089/134] fix json format --- include/utils.h | 4 +++ python/cnv_plots_json.py | 78 ++++++++++++++++++++++++++++++++++++++++ src/sv_caller.cpp | 11 +++++- src/utils.cpp | 18 ++++++++++ 4 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 python/cnv_plots_json.py diff --git a/include/utils.h b/include/utils.h index e7bf164a..9f40d9d5 100644 --- a/include/utils.h +++ b/include/utils.h @@ -61,4 +61,8 @@ void printMemoryUsage(const std::string &functionName); bool fileExists(const std::string &filepath); +void openJSON(const std::string & filepath); + +void closeJSON(const std::string & filepath); + #endif // UTILS_H diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py new file mode 100644 index 00000000..5fad7319 --- /dev/null +++ b/python/cnv_plots_json.py @@ -0,0 +1,78 @@ +import plotly.graph_objs as go +import json +import argparse + +# Set up argument parser +parser = argparse.ArgumentParser(description='Generate CNV plots from JSON data.') +parser.add_argument('json_file', type=str, help='Path to the JSON file containing SV data') +args = parser.parse_args() + +# Load your JSON data +with open(args.json_file) as f: + sv_data = json.load(f) + +# Loop through each SV (assuming your JSON contains multiple SVs) +for sv in sv_data: + print(type(sv)) + + # Extract data for plotting + positions_before = sv['before_sv']['positions'] + b_allele_freq_before = sv['before_sv']['b_allele_freq'] + positions_after = sv['after_sv']['positions'] + b_allele_freq_after = 
sv['after_sv']['b_allele_freq'] + + # Generate hover text (optional, can be customized) + hover_text_before = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_before, b_allele_freq_before)] + hover_text_after = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_after, b_allele_freq_after)] + + # Plotting data for 'before_sv' and 'after_sv' + baf_trace_before = go.Scatter( + x=positions_before, + y=b_allele_freq_before, + mode="markers+lines", + name="B-Allele Frequency (Before SV)", + text=hover_text_before, + marker=dict( + color='blue', + size=10 + ), + line=dict( + color="black", + width=0 + ), + showlegend=False + ) + + baf_trace_after = go.Scatter( + x=positions_after, + y=b_allele_freq_after, + mode="markers+lines", + name="B-Allele Frequency (After SV)", + text=hover_text_after, + marker=dict( + color='red', + size=10 + ), + line=dict( + color="black", + width=0 + ), + showlegend=False + ) + + # Create layout for the plot + layout = go.Layout( + title=f"SV Plot: {sv['chromosome']} {sv['start']}-{sv['end']} ({sv['sv_type']})", + xaxis=dict(title="Position"), + yaxis=dict(title="B-Allele Frequency"), + hovermode='closest' + ) + + # Create figure with data and layout + fig = go.Figure(data=[baf_trace_before, baf_trace_after], layout=layout) + + # Save the plot to an HTML file (use a unique filename per SV) + file_name = f"output/SV_{sv['chromosome']}_{sv['start']}_{sv['end']}.html" + fig.write_html(file_name) + + print(f"Plot saved as {file_name}") diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3dc567bc..00aaff67 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -727,6 +727,12 @@ void SVCaller::run(const InputData& input_data) std::cout << "Reading HMM from file: " << hmm_filepath << std::endl; const CHMM& hmm = ReadCHMM(hmm_filepath.c_str()); + // Set up the JSON output file for CNV data + const std::string& json_fp = input_data.getCNVOutputFile(); + if (input_data.getSaveCNVData()) { + openJSON(json_fp); + } + // 
Calculate the mean chromosome coverage and generate the position depth // maps for each chromosome (I/O is multi-threaded, which is more efficient // than per-chromosome multi-threading in this case) @@ -844,7 +850,10 @@ void SVCaller::run(const InputData& input_data) this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); } } - + if (input_data.getSaveCNVData()) { + closeJSON(json_fp); + } + printMessage("Unifying SVs..."); for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; diff --git a/src/utils.cpp b/src/utils.cpp index 884139a5..553af91d 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -125,3 +125,21 @@ bool fileExists(const std::string &filepath) std::ifstream file(filepath); return file.is_open(); } + +void openJSON(const std::string &filepath) +{ + // Add the initial [ and close + std::ofstream + json_file(filepath); + json_file << "[\n"; + json_file.close(); +} + +void closeJSON(const std::string &filepath) +{ + // Add the final ] and close + std::ofstream + json_file(filepath, std::ios::app); + json_file << "]"; + json_file.close(); +} From 56fc8d2c7a34273519e24f2e4826535e7f2f393d Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 19:39:26 -0400 Subject: [PATCH 090/134] json plots --- python/cnv_plots_json.py | 239 +++++++++++++++++++++++++++++++-------- src/cnv_caller.cpp | 22 +++- 2 files changed, 209 insertions(+), 52 deletions(-) diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py index 5fad7319..9190b53a 100644 --- a/python/cnv_plots_json.py +++ b/python/cnv_plots_json.py @@ -1,4 +1,5 @@ -import plotly.graph_objs as go +import plotly +from plotly.subplots import make_subplots import json import argparse @@ -11,68 +12,206 @@ with open(args.json_file) as f: sv_data = json.load(f) +# State marker colors +# https://community.plotly.com/t/plotly-colours-list/11730/6 +state_colors_dict = { + '1': 'red', + '2': 
'darkred', + '3': 'darkgreen', + '4': 'green', + '5': 'darkblue', + '6': 'blue', +} + +sv_type_dict = { + 'DEL': 'Deletion', + 'DUP': 'Duplication', + 'INV': 'Inversion' +} + # Loop through each SV (assuming your JSON contains multiple SVs) for sv in sv_data: - print(type(sv)) # Extract data for plotting positions_before = sv['before_sv']['positions'] b_allele_freq_before = sv['before_sv']['b_allele_freq'] positions_after = sv['after_sv']['positions'] b_allele_freq_after = sv['after_sv']['b_allele_freq'] + + # Create a subplot for the CNV plot and the BAF plot. + fig = make_subplots( + rows=2, + cols=1, + shared_xaxes=True, + vertical_spacing=0.05, + subplot_titles=(r"SNP Log2 Ratio", "SNP B-Allele Frequency") + ) + + # Get the chromosome, start, end, and sv_type from the SV data + chromosome = sv['chromosome'] + start = sv['start'] + end = sv['end'] + sv_type = sv['sv_type'] + likelihood = sv['likelihood'] + sv_length = sv['size'] + + # Plot the data for 'before_sv', 'sv', and 'after_sv' + for section in ["before_sv", "sv", "after_sv"]: + positions = sv[section]['positions'] + b_allele_freq = sv[section]['b_allele_freq'] + population_freq = sv[section]['population_freq'] + log2_ratio = sv[section]['log2_ratio'] + + if section == "sv": + is_snp = sv[section]['is_snp'] + states = sv[section]['states'] + state_colors = [state_colors_dict[str(state)] for state in states] + marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp] + + # Set the hover text + hover_text = [] + for i, position in enumerate(positions): + # Add hover text for each point + hover_text.append( + f"Position: {position}
" + f"State: {states[i]}
" + f"Log2 Ratio: {log2_ratio[i]}
" + f"SNP: {is_snp[i]}
" + f"BAF: {b_allele_freq[i]}
" + f"Population Frequency: {population_freq[i]}
" + ) + else: + state_colors = ['black'] * len(positions) + marker_symbols = ['circle-open'] * len(positions) + hover_text = [] + for i, position in enumerate(positions): + # Add hover text for each point + hover_text.append( + f"Position: {position}
" + f"Log2 Ratio: {log2_ratio[i]}
" + f"BAF: {b_allele_freq[i]}
" + f"Population Frequency: {population_freq[i]}
" + ) + + # Create the log2 trace + log2_trace = plotly.graph_objs.Scatter( + x=positions, + y=log2_ratio, + mode='markers+lines', + name=r'Log2 Ratio', + text=hover_text, + hoverinfo='text', + marker=dict( + color=state_colors, + size=10, + symbol=marker_symbols, + ), + line=dict( + color='black', + width=0 + ), + showlegend=False + ) + + # Create the BAF trace + baf_trace = plotly.graph_objs.Scatter( + x=positions, + y=b_allele_freq, + mode='markers+lines', + name='B-Allele Frequency', + text=hover_text, + hoverinfo='text', + marker=dict( + color=state_colors, + size=10, + symbol=marker_symbols, + ), + line=dict( + color='black', + width=0 + ), + showlegend=False + ) + + if section == "sv": + # Create a shaded rectangle for the CNV, layering it below the CNV + # trace and labeling it with the CNV type. + fig.add_vrect( + x0 = start, + x1 = end, + fillcolor = "Black", + layer = "below", + line_width = 0, + opacity = 0.1, + annotation_text = '', + annotation_position = "top left", + annotation_font_size = 20, + annotation_font_color = "black" + ) + + # Add vertical lines at the start and end positions of the CNV. 
+ fig.add_vline( + x = start, + line_width = 2, + line_color = "black", + layer = "below" + ) + + fig.add_vline( + x = end, + line_width = 2, + line_color = "black", + layer = "below" + ) + + # Add traces to the figure + fig.append_trace(log2_trace, row=1, col=1) + fig.append_trace(baf_trace, row=2, col=1) - # Generate hover text (optional, can be customized) - hover_text_before = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_before, b_allele_freq_before)] - hover_text_after = [f"Position: {pos}, BAF: {baf}" for pos, baf in zip(positions_after, b_allele_freq_after)] - - # Plotting data for 'before_sv' and 'after_sv' - baf_trace_before = go.Scatter( - x=positions_before, - y=b_allele_freq_before, - mode="markers+lines", - name="B-Allele Frequency (Before SV)", - text=hover_text_before, - marker=dict( - color='blue', - size=10 - ), - line=dict( - color="black", - width=0 - ), - showlegend=False + # Set the x-axis title. + fig.update_xaxes( + title_text = "Chromosome Position", + row = 2, + col = 1 ) - baf_trace_after = go.Scatter( - x=positions_after, - y=b_allele_freq_after, - mode="markers+lines", - name="B-Allele Frequency (After SV)", - text=hover_text_after, - marker=dict( - color='red', - size=10 - ), - line=dict( - color="black", - width=0 - ), - showlegend=False + # Set the y-axis titles. + fig.update_yaxes( + title_text = r"Log2 Ratio", + row = 1, + col = 1 ) - - # Create layout for the plot - layout = go.Layout( - title=f"SV Plot: {sv['chromosome']} {sv['start']}-{sv['end']} ({sv['sv_type']})", - xaxis=dict(title="Position"), - yaxis=dict(title="B-Allele Frequency"), - hovermode='closest' + + fig.update_yaxes( + title_text = "B-Allele Frequency", + row = 2, + col = 1 ) - - # Create figure with data and layout - fig = go.Figure(data=[baf_trace_before, baf_trace_after], layout=layout) - + + # Set the Y-axis range for the log2 ratio plot. 
+ fig.update_yaxes( + range = [-2.0, 2.0], + row = 1, + col = 1 + ) + + # Set the Y-axis range for the BAF plot. + fig.update_yaxes( + range = [-0.2, 1.2], + row = 2, + col = 1 + ) + + # Set the title of the plot. + fig.update_layout( + title_text = f"{sv_type_dict[sv_type]} at {chromosome}:{start}-{end} ({sv_length} bp) (LLH={likelihood})", + title_x = 0.5, + showlegend = False, + ) + # height = 800, + # width = 800 + # ) # Save the plot to an HTML file (use a unique filename per SV) - file_name = f"output/SV_{sv['chromosome']}_{sv['start']}_{sv['end']}.html" + file_name = f"output/SV_{chromosome}_{start}_{end}.html" fig.write_html(file_name) - print(f"Plot saved as {file_name}") diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index b5d61745..7d6d6636 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -238,9 +238,18 @@ std::tuple CNVCaller::runCopyNumberPrediction bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 50000) { + // Set B-allele and population frequency values to 0 for non-SNPs + for (size_t i = 0; i < snp_data.pos.size(); i++) + { + if (!snp_data.is_snp[i]) + { + snp_data.baf[i] = 0.0; + snp_data.pfb[i] = 0.0; + } + } + + // Save the SNP data to JSON std::string cnv_type_str = getSVTypeString(predicted_cnv_type); - // const std::string output_dir = input_data.getOutputDir(); - // std::string json_filepath = output_dir + "/CNVCalls.json"; std::string json_filepath = input_data.getCNVOutputFile(); printMessage("Saving SV copy number predictions to " + json_filepath + "..."); @@ -856,6 +865,7 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN json_file << " \"end\": " << end << ",\n"; json_file << " \"sv_type\": \"" << sv_type << "\",\n"; json_file << " \"likelihood\": " << likelihood << ",\n"; + json_file << " \"size\": " << (end - start + 1) << ",\n"; json_file << " 
\"before_sv\": {\n"; json_file << " \"positions\": ["; for (size_t i = 0; i < before_sv.pos.size(); ++i) @@ -964,6 +974,14 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN if (i < snp_data.state_sequence.size() - 1) json_file << ", "; } + json_file << "],\n"; + json_file << " \"is_snp\": ["; + for (size_t i = 0; i < snp_data.is_snp.size(); ++i) + { + json_file << snp_data.is_snp[i]; + if (i < snp_data.is_snp.size() - 1) + json_file << ", "; + } json_file << "]\n"; json_file << " }\n"; json_file << "}\n"; From 2e23c50137c0f10ae4b2dd5361ea320e61193e09 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 20:43:02 -0400 Subject: [PATCH 091/134] fix multi-cnv json --- include/utils.h | 2 +- src/cnv_caller.cpp | 13 ++++++++++++- src/sv_caller.cpp | 6 +++--- src/utils.cpp | 14 ++++++-------- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/include/utils.h b/include/utils.h index 9f40d9d5..6eb1237d 100644 --- a/include/utils.h +++ b/include/utils.h @@ -61,7 +61,7 @@ void printMemoryUsage(const std::string &functionName); bool fileExists(const std::string &filepath); -void openJSON(const std::string & filepath); +bool isFileEmpty(const std::string &filepath); void closeJSON(const std::string & filepath); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 7d6d6636..08e4fb4d 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -859,6 +859,17 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN std::cerr << "ERROR: Could not open JSON file for writing: " << filepath << std::endl; exit(1); } + + // If not the first record, write the closing bracket + // Check if file is empty + if (isFileEmpty(filepath)) + { + json_file << "[\n"; + } else { + // Close the previous JSON object + json_file << "},\n"; + } + json_file << "{\n"; json_file << " \"chromosome\": \"" << chr << "\",\n"; json_file << " \"start\": " << start << ",\n"; @@ -984,7 +995,7 @@ void 
CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN } json_file << "]\n"; json_file << " }\n"; - json_file << "}\n"; + // json_file << "},\n"; json_file.close(); printMessage("Saved copy number predictions for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end) + " to " + filepath); } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 00aaff67..5fee517e 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -729,9 +729,9 @@ void SVCaller::run(const InputData& input_data) // Set up the JSON output file for CNV data const std::string& json_fp = input_data.getCNVOutputFile(); - if (input_data.getSaveCNVData()) { - openJSON(json_fp); - } + // if (input_data.getSaveCNVData()) { + // openJSON(json_fp); + // } // Calculate the mean chromosome coverage and generate the position depth // maps for each chromosome (I/O is multi-threaded, which is more efficient diff --git a/src/utils.cpp b/src/utils.cpp index 553af91d..a27263b7 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -7,6 +7,7 @@ #include #include #include +#include /// @endcond @@ -126,20 +127,17 @@ bool fileExists(const std::string &filepath) return file.is_open(); } -void openJSON(const std::string &filepath) +bool isFileEmpty(const std::string &filepath) { - // Add the initial [ and close - std::ofstream - json_file(filepath); - json_file << "[\n"; - json_file.close(); + return std::filesystem::file_size(filepath) == 0; } void closeJSON(const std::string &filepath) { - // Add the final ] and close std::ofstream json_file(filepath, std::ios::app); - json_file << "]"; + + json_file << "}\n"; // Close the last JSON object + json_file << "]"; // Close the JSON array json_file.close(); } From 347e5ad42077ff20c7c98108ade6672f5219d46f Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 21:07:42 -0400 Subject: [PATCH 092/134] update plots --- python/cnv_plots_json.py | 4 +++- src/cnv_caller.cpp | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 35 
insertions(+), 1 deletion(-) diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py index 9190b53a..31a59110 100644 --- a/python/cnv_plots_json.py +++ b/python/cnv_plots_json.py @@ -81,8 +81,10 @@ f"Population Frequency: {population_freq[i]}
" ) else: + is_snp = sv[section]['is_snp'] state_colors = ['black'] * len(positions) - marker_symbols = ['circle-open'] * len(positions) + # marker_symbols = ['circle-open'] * len(positions) + marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp] hover_text = [] for i, position in enumerate(positions): # Add hover text for each point diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 08e4fb4d..b5301a1e 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -247,6 +247,22 @@ std::tuple CNVCaller::runCopyNumberPrediction snp_data.pfb[i] = 0.0; } } + for (size_t i = 0; i < before_sv.pos.size(); i++) + { + if (!before_sv.is_snp[i]) + { + before_sv.baf[i] = 0.0; + before_sv.pfb[i] = 0.0; + } + } + for (size_t i = 0; i < after_sv.pos.size(); i++) + { + if (!after_sv.is_snp[i]) + { + after_sv.baf[i] = 0.0; + after_sv.pfb[i] = 0.0; + } + } // Save the SNP data to JSON std::string cnv_type_str = getSVTypeString(predicted_cnv_type); @@ -909,6 +925,14 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN if (i < before_sv.log2_cov.size() - 1) json_file << ", "; } + json_file << "],\n"; + json_file << " \"is_snp\": ["; + for (size_t i = 0; i < snp_data.is_snp.size(); ++i) + { + json_file << snp_data.is_snp[i]; + if (i < snp_data.is_snp.size() - 1) + json_file << ", "; + } json_file << "]\n"; json_file << " },\n"; json_file << " \"after_sv\": {\n"; @@ -943,6 +967,14 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN if (i < after_sv.log2_cov.size() - 1) json_file << ", "; } + json_file << "],\n"; + json_file << " \"is_snp\": ["; + for (size_t i = 0; i < snp_data.is_snp.size(); ++i) + { + json_file << snp_data.is_snp[i]; + if (i < snp_data.is_snp.size() - 1) + json_file << ", "; + } json_file << "]\n"; json_file << " },\n"; json_file << " \"sv\": {\n"; From 708e7823079740549a272d233b20c50f79f94be8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 22:47:30 
-0400 Subject: [PATCH 093/134] simplify snp analysis --- include/cnv_caller.h | 2 +- src/cnv_caller.cpp | 108 ++++++++++++++++++++--------------------- src/sv_caller.cpp | 111 +++++++++++++++++++++---------------------- 3 files changed, 110 insertions(+), 111 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index b424b6c2..609238ef 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -91,7 +91,7 @@ class CNVCaller { void calculateMeanChromosomeCoverage(const std::vector& chromosomes, std::unordered_map>& chr_pos_depth_map, std::unordered_map& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const; - void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, const InputData& input_data) const; + void readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb, const InputData& input_data) const; // Save a TSV with B-allele frequencies, log2 ratios, and copy number predictions void saveSVCopyNumberToTSV(SNPData& snp_data, std::string filepath, std::string chr, uint32_t start, uint32_t end, std::string sv_type, double likelihood) const; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index b5301a1e..dc978c22 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -55,24 +55,23 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end // Initialize the SNP data with default values and sample size length int sample_size = input_data.getSampleSize(); std::vector snp_pos; - std::vector snp_baf; - std::vector snp_pfb; - this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf, snp_pfb, input_data); + std::unordered_map snp_baf_map; + std::unordered_map snp_pfb_map; + this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf_map, snp_pfb_map, input_data); // Get the log2 
ratio for evenly spaced positions in the // region sample_size = std::max((int) snp_pos.size(), sample_size); - std::vector snp_pos_hmm; - std::vector snp_baf_hmm; - std::vector snp_pfb_hmm; - std::vector snp_log2_hmm; - std::vector is_snp_hmm; // Loop through evenly spaced positions in the region and get the log2 ratio double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size; std::unordered_set snp_pos_set(snp_pos.begin(), snp_pos.end()); + std::unordered_map window_log2_map; for (int i = 0; i < sample_size; i++) { + uint32_t window_start = (uint32_t) (start_pos + i * pos_step); + uint32_t window_end = (uint32_t) (start_pos + (i + 1) * pos_step); + // Calculate the mean depth for the window double cov_sum = 0.0; int pos_count = 0; @@ -83,15 +82,11 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end { break; } - try - { - cov_sum += pos_depth_map.at(pos); + if (pos < pos_depth_map.size()) { + cov_sum += pos_depth_map[pos]; pos_count++; } - catch (const std::out_of_range& e) - { - // Ignore out of range errors - } + } double log2_cov = 0.0; if (pos_count > 0) @@ -104,34 +99,47 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end log2_cov = log2((cov_sum / (double) pos_count) / mean_chr_cov); } - // Loop through positions and get the log2 ratio - bool snp_found_in_sample = false; - for (int j = 0; j < pos_step; j++) - { - uint32_t pos = (uint32_t) (start_pos + i * pos_step + j); - if (pos > end_pos) - { - break; - } + // Store the log2 ratio for the window + std::string window_key = std::to_string(window_start) + "-" + std::to_string(window_end); + window_log2_map[window_key] = log2_cov; + } - // Check if the position is a SNP - if (snp_pos_set.find(pos) != snp_pos_set.end()) + // Create new vectors for the SNP data + std::vector snp_pos_hmm; + std::vector snp_baf_hmm; + std::vector snp_pfb_hmm; + std::vector snp_log2_hmm; + std::vector is_snp_hmm; + + // Loop through the window ranges 
and append all SNPs in the range, using + // the log2 ratio for the window + for (const auto& window : window_log2_map) + { + uint32_t window_start = std::stoi(window.first.substr(0, window.first.find('-'))); + uint32_t window_end = std::stoi(window.first.substr(window.first.find('-') + 1)); + double log2_cov = window.second; + + // Loop through the SNP positions and add them to the SNP data + bool snp_found = false; + for (uint32_t pos : snp_pos) + { + if (pos >= window_start && pos <= window_end) { - // Update the SNP data snp_pos_hmm.push_back(pos); - snp_baf_hmm.push_back(snp_baf[i]); - snp_pfb_hmm.push_back(snp_pfb[i]); + snp_baf_hmm.push_back(snp_baf_map[pos]); + snp_pfb_hmm.push_back(snp_pfb_map[pos]); snp_log2_hmm.push_back(log2_cov); is_snp_hmm.push_back(true); - snp_found_in_sample = true; + snp_found = true; } } - - // If no SNP was found in the sample, then use the center position - if (!snp_found_in_sample) + if (!snp_found) { - uint32_t pos = (uint32_t) (start_pos + (i * pos_step) + (pos_step / 2.0)); - snp_pos_hmm.push_back(pos); + // If no SNPs were found in the window, add a dummy SNP with the + // log2 ratio for the window, using the window center as the SNP + // position + uint32_t window_center = (window_start + window_end) / 2; + snp_pos_hmm.push_back(window_center); snp_baf_hmm.push_back(-1.0); snp_pfb_hmm.push_back(0.5); snp_log2_hmm.push_back(log2_cov); @@ -522,7 +530,7 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& } } -void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::vector& snp_baf, std::vector& snp_pfb, const InputData& input_data) const +void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb, const InputData& input_data) const { // Lock during reading std::shared_lock lock(this->shared_mutex); @@ -700,8 +708,8 @@ void 
CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Add the SNP position and BAF information snp_pos.push_back(pos); - snp_baf.push_back(baf); - snp_pfb.push_back(0.5); + snp_baf[pos] = baf; + printMessage("SNP found: " + chr + ":" + std::to_string(pos) + " BAF: " + std::to_string(baf)); snp_found = true; } } @@ -724,11 +732,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui uint32_t min_snp_pos = *std::min_element(snp_pos.begin(), snp_pos.end()); uint32_t max_snp_pos = *std::max_element(snp_pos.begin(), snp_pos.end()); std::unordered_set snp_pos_set(snp_pos.begin(), snp_pos.end()); - std::unordered_map snp_index_map; - for (size_t i = 0; i < snp_pos.size(); i++) - { - snp_index_map[snp_pos[i]] = i; - } if (use_pfb) { // Set the region for the population allele frequency reader @@ -757,9 +760,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui continue; // Skip if the SNP position is not in the set } - // Get the SNP position index - size_t i = snp_index_map[pfb_pos]; - // Get the population frequency for the SNP int pfb_status = bcf_get_info_float(pfb_reader->readers[0].header, pfb_record, AF_key.c_str(), &pfb_f, &count); if (pfb_status < 0 || count == 0) @@ -773,7 +773,9 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { continue; } - snp_pfb[i] = pfb; + // snp_pfb[i] = pfb; + snp_pfb[pfb_pos] = pfb; + printMessage("Population frequency found: " + chr + ":" + std::to_string(pfb_pos) + " PFB: " + std::to_string(pfb)); break; } free(pfb_f); @@ -927,10 +929,10 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN } json_file << "],\n"; json_file << " \"is_snp\": ["; - for (size_t i = 0; i < snp_data.is_snp.size(); ++i) + for (size_t i = 0; i < before_sv.is_snp.size(); ++i) { - json_file << snp_data.is_snp[i]; - if (i < snp_data.is_snp.size() - 1) + json_file << before_sv.is_snp[i]; + if (i < 
before_sv.is_snp.size() - 1) json_file << ", "; } json_file << "]\n"; @@ -969,10 +971,10 @@ void CNVCaller::saveSVCopyNumberToJSON(SNPData &before_sv, SNPData &after_sv, SN } json_file << "],\n"; json_file << " \"is_snp\": ["; - for (size_t i = 0; i < snp_data.is_snp.size(); ++i) + for (size_t i = 0; i < after_sv.is_snp.size(); ++i) { - json_file << snp_data.is_snp[i]; - if (i < snp_data.is_snp.size() - 1) + json_file << after_sv.is_snp[i]; + if (i < after_sv.is_snp.size() - 1) json_file << ", "; } json_file << "]\n"; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 5fee517e..08530c16 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -729,9 +729,6 @@ void SVCaller::run(const InputData& input_data) // Set up the JSON output file for CNV data const std::string& json_fp = input_data.getCNVOutputFile(); - // if (input_data.getSaveCNVData()) { - // openJSON(json_fp); - // } // Calculate the mean chromosome coverage and generate the position depth // maps for each chromosome (I/O is multi-threaded, which is more efficient @@ -772,65 +769,65 @@ void SVCaller::run(const InputData& input_data) // Use multi-threading across chromosomes. If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) - int thread_count = 1; - if (!input_data.isSingleChr()) { - thread_count = input_data.getThreadCount(); - std::cout << "Using " << thread_count << " threads for chr processing..." 
<< std::endl; - } - ThreadPool pool(thread_count); - auto process_chr = [&](const std::string& chr) { - try { - std::vector sv_calls; - std::vector split_sv_calls; - InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); - { - std::shared_lock lock(this->shared_mutex); - whole_genome_sv_calls[chr] = std::move(sv_calls); - } - } catch (const std::exception& e) { - printError("Error processing chromosome " + chr + ": " + e.what()); - } catch (...) { - printError("Unknown error processing chromosome " + chr); - } - }; - - // Submit tasks to the thread pool and track futures - std::vector> futures; - for (const auto& chr : chromosomes) { - futures.emplace_back(pool.enqueue([&, chr] { - // printMessage("Processing chromosome " + chr); - process_chr(chr); - })); - } + // int thread_count = 1; + // if (!input_data.isSingleChr()) { + // thread_count = input_data.getThreadCount(); + // std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; + // } + // ThreadPool pool(thread_count); + // auto process_chr = [&](const std::string& chr) { + // try { + // std::vector sv_calls; + // std::vector split_sv_calls; + // InputData chr_input_data = input_data; // Use a thread-local copy + // this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); + // { + // std::shared_lock lock(this->shared_mutex); + // whole_genome_sv_calls[chr] = std::move(sv_calls); + // } + // } catch (const std::exception& e) { + // printError("Error processing chromosome " + chr + ": " + e.what()); + // } catch (...) 
{ + // printError("Unknown error processing chromosome " + chr); + // } + // }; + + // // Submit tasks to the thread pool and track futures + // std::vector> futures; + // for (const auto& chr : chromosomes) { + // futures.emplace_back(pool.enqueue([&, chr] { + // // printMessage("Processing chromosome " + chr); + // process_chr(chr); + // })); + // } - // // Wait for all tasks to complete - for (auto& future : futures) { - try { - current_chr++; - future.get(); - } catch (const std::exception& e) { - printError("Error processing chromosome task: " + std::string(e.what())); - } catch (...) { - printError("Unknown error processing chromosome task."); - } - } - printMessage("All tasks have finished."); + // // // Wait for all tasks to complete + // for (auto& future : futures) { + // try { + // current_chr++; + // future.get(); + // } catch (const std::exception& e) { + // printError("Error processing chromosome task: " + std::string(e.what())); + // } catch (...) { + // printError("Unknown error processing chromosome task."); + // } + // } + // printMessage("All tasks have finished."); // ------------------------------------------------------- // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - current_chr = 0; - printMessage("Running copy number predictions on CIGAR SVs..."); - for (auto& entry : whole_genome_sv_calls) { - current_chr++; - const std::string& chr = entry.first; - std::vector& sv_calls = entry.second; - if (sv_calls.size() > 0) { - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); - } - } + // current_chr = 0; + // printMessage("Running copy number predictions on CIGAR SVs..."); + // for (auto& entry : whole_genome_sv_calls) { + // current_chr++; + // const 
std::string& chr = entry.first; + // std::vector& sv_calls = entry.second; + // if (sv_calls.size() > 0) { + // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); + // cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + // } + // } // ------------------------------------------------------- // Identify split-SV signatures From c6fe7af61eea8052f445b07505d443ff9fc664fe Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 23:08:02 -0400 Subject: [PATCH 094/134] remove debug output --- src/cnv_caller.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index dc978c22..26bca566 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -709,7 +709,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Add the SNP position and BAF information snp_pos.push_back(pos); snp_baf[pos] = baf; - printMessage("SNP found: " + chr + ":" + std::to_string(pos) + " BAF: " + std::to_string(baf)); snp_found = true; } } @@ -773,9 +772,7 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui { continue; } - // snp_pfb[i] = pfb; snp_pfb[pfb_pos] = pfb; - printMessage("Population frequency found: " + chr + ":" + std::to_string(pfb_pos) + " PFB: " + std::to_string(pfb)); break; } free(pfb_f); From 255b9a3a8086a7153fa15cbc6066c57044078fef Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 19 Mar 2025 23:24:49 -0400 Subject: [PATCH 095/134] remove test code --- src/sv_caller.cpp | 110 +++++++++++++++++++++++----------------------- 1 file changed, 55 insertions(+), 55 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 08530c16..0abc413a 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -769,65 +769,65 @@ void SVCaller::run(const InputData& input_data) // Use multi-threading across 
chromosomes. If a single chromosome is // specified, use a single main thread (multi-threading is used for file I/O) - // int thread_count = 1; - // if (!input_data.isSingleChr()) { - // thread_count = input_data.getThreadCount(); - // std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; - // } - // ThreadPool pool(thread_count); - // auto process_chr = [&](const std::string& chr) { - // try { - // std::vector sv_calls; - // std::vector split_sv_calls; - // InputData chr_input_data = input_data; // Use a thread-local copy - // this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); - // { - // std::shared_lock lock(this->shared_mutex); - // whole_genome_sv_calls[chr] = std::move(sv_calls); - // } - // } catch (const std::exception& e) { - // printError("Error processing chromosome " + chr + ": " + e.what()); - // } catch (...) { - // printError("Unknown error processing chromosome " + chr); - // } - // }; - - // // Submit tasks to the thread pool and track futures - // std::vector> futures; - // for (const auto& chr : chromosomes) { - // futures.emplace_back(pool.enqueue([&, chr] { - // // printMessage("Processing chromosome " + chr); - // process_chr(chr); - // })); - // } - - // // // Wait for all tasks to complete - // for (auto& future : futures) { - // try { - // current_chr++; - // future.get(); - // } catch (const std::exception& e) { - // printError("Error processing chromosome task: " + std::string(e.what())); - // } catch (...) { - // printError("Unknown error processing chromosome task."); - // } - // } - // printMessage("All tasks have finished."); + int thread_count = 1; + if (!input_data.isSingleChr()) { + thread_count = input_data.getThreadCount(); + std::cout << "Using " << thread_count << " threads for chr processing..." 
<< std::endl; + } + ThreadPool pool(thread_count); + auto process_chr = [&](const std::string& chr) { + try { + std::vector sv_calls; + std::vector split_sv_calls; + InputData chr_input_data = input_data; // Use a thread-local copy + this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); + { + std::shared_lock lock(this->shared_mutex); + whole_genome_sv_calls[chr] = std::move(sv_calls); + } + } catch (const std::exception& e) { + printError("Error processing chromosome " + chr + ": " + e.what()); + } catch (...) { + printError("Unknown error processing chromosome " + chr); + } + }; + + // Submit tasks to the thread pool and track futures + std::vector> futures; + for (const auto& chr : chromosomes) { + futures.emplace_back(pool.enqueue([&, chr] { + // printMessage("Processing chromosome " + chr); + process_chr(chr); + })); + } + + // Wait for all tasks to complete + for (auto& future : futures) { + try { + current_chr++; + future.get(); + } catch (const std::exception& e) { + printError("Error processing chromosome task: " + std::string(e.what())); + } catch (...) 
{ + printError("Unknown error processing chromosome task."); + } + } + printMessage("All tasks have finished."); // ------------------------------------------------------- // Run copy number variant predictions on the SVs detected from the // CIGAR string, using a minimum CNV length threshold - // current_chr = 0; - // printMessage("Running copy number predictions on CIGAR SVs..."); - // for (auto& entry : whole_genome_sv_calls) { - // current_chr++; - // const std::string& chr = entry.first; - // std::vector& sv_calls = entry.second; - // if (sv_calls.size() > 0) { - // printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); - // cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); - // } - // } + current_chr = 0; + printMessage("Running copy number predictions on CIGAR SVs..."); + for (auto& entry : whole_genome_sv_calls) { + current_chr++; + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); + cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + } + } // ------------------------------------------------------- // Identify split-SV signatures From 57c6c0c751294fd7acafb5b2c17b9258117ffcda Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 21 Mar 2025 14:07:29 -0400 Subject: [PATCH 096/134] add clipped base insertions --- src/sv_caller.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 0abc413a..04338063 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -537,6 +537,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, 
std::vec amb_bases_bitset.set(base); amb_bases_bitset.set(std::tolower(base)); } + for (int i = 0; i < cigar_len; i++) { int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length int op = bam_cigar_op(cigar[i]); // CIGAR operation @@ -599,6 +600,33 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec } SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0); addSVCall(sv_calls, sv_call); + + // Process clipped bases as potential insertions + } else if (op == BAM_CSOFT_CLIP && is_primary) { + // Get the sequence of the insertion from the query + std::string ins_seq_str(op_len, ' '); + for (int j = 0; j < op_len; j++) { + // Replace ambiguous bases with N + char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; + if (amb_bases_bitset.test(base)) { + ins_seq_str[j] = 'N'; + } else { + ins_seq_str[j] = base; + } + } + + // Add as an insertion + uint32_t ins_pos = pos + 1; + uint32_t ins_end = ins_pos + op_len - 1; + int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1); + + // Determine the ALT allele format based on small vs. 
large insertion + std::string alt_allele = ""; + if (op_len <= 50) { + alt_allele = ins_seq_str; + } + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARCLIP", "./.", default_lh, read_depth, 1, 0); + addSVCall(sv_calls, sv_call); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL && is_primary) { @@ -778,7 +806,6 @@ void SVCaller::run(const InputData& input_data) auto process_chr = [&](const std::string& chr) { try { std::vector sv_calls; - std::vector split_sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); { From 190a699528e16da7bced58bce1c6ad52e82a677a Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 22 Mar 2025 15:22:44 -0400 Subject: [PATCH 097/134] improve split read merging --- include/sv_object.h | 2 +- src/dbscan.cpp | 3 - src/sv_caller.cpp | 206 ++++++++++++++++++++++---------------------- src/sv_object.cpp | 92 +++++++++++++------- 4 files changed, 165 insertions(+), 138 deletions(-) diff --git a/include/sv_object.h b/include/sv_object.h index d838e968..155b50bf 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -40,6 +40,6 @@ uint32_t getSVCount(const std::vector& sv_calls); void concatenateSVCalls(std::vector& sv_calls, const std::vector& sv_calls_update); // Merge SVs using DBSCAN clustering -void mergeSVs(std::vector &sv_calls, double epsilon, int min_pts); +void mergeSVs(std::vector &sv_calls, double epsilon, int min_pts, bool keep_noise); #endif // SV_OBJECT_H diff --git a/src/dbscan.cpp b/src/dbscan.cpp index d6c41346..c1f3f314 100644 --- a/src/dbscan.cpp +++ b/src/dbscan.cpp @@ -8,13 +8,10 @@ void DBSCAN::fit(const std::vector& sv_calls) { int clusterId = 0; - // clusters.assign(points.size(), -1); // -1 means unclassified clusters.assign(sv_calls.size(), -1); // -1 means unclassified - // for (size_t i = 0; i < points.size(); ++i) { for (size_t i = 0; i < 
sv_calls.size(); ++i) { if (clusters[i] == -1) { // if point is not yet classified - // if (expandCluster(points, i, clusterId)) { if (expandCluster(sv_calls, i, clusterId)) { ++clusterId; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 04338063..9969342f 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -445,13 +445,17 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 2kb, then this is a // potential deletion if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size); - addSVCall(chr_sv_calls, sv_candidate); + // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); // Add an inversion call if necessary if (inversion) { + // printMessage("[TEST] Found inversion at " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + ", length=" + std::to_string(sv_length)); SVCall sv_candidate(sv_start, sv_end, SVType::INV, "", "SPLITINV", "./.", 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); + } else { + SVCall sv_candidate(sv_start, sv_end, SVType::DEL, "", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size); + addSVCall(chr_sv_calls, sv_candidate); } } @@ -477,11 +481,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map& ch this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome); printMessage(chr + ": Merging CIGAR..."); - mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts); + mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); int region_sv_count = getSVCount(chr_sv_calls); printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string"); @@ -733,6 +732,9 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch void SVCaller::run(const InputData& input_data) { + bool cigar_svs = true; + bool split_svs = 
true; + // Set up the reference genome printMessage("Loading the reference genome..."); const std::string ref_filepath = input_data.getRefGenome(); @@ -795,96 +797,109 @@ void SVCaller::run(const InputData& input_data) int current_chr = 0; int total_chr_count = chromosomes.size(); - // Use multi-threading across chromosomes. If a single chromosome is - // specified, use a single main thread (multi-threading is used for file I/O) - int thread_count = 1; - if (!input_data.isSingleChr()) { - thread_count = input_data.getThreadCount(); - std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; - } - ThreadPool pool(thread_count); - auto process_chr = [&](const std::string& chr) { - try { - std::vector sv_calls; - InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); - { - std::shared_lock lock(this->shared_mutex); - whole_genome_sv_calls[chr] = std::move(sv_calls); + if (cigar_svs) { + // Use multi-threading across chromosomes. If a single chromosome is + // specified, use a single main thread (multi-threading is used for file I/O) + int thread_count = 1; + if (!input_data.isSingleChr()) { + thread_count = input_data.getThreadCount(); + std::cout << "Using " << thread_count << " threads for chr processing..." << std::endl; + } + ThreadPool pool(thread_count); + auto process_chr = [&](const std::string& chr) { + try { + std::vector sv_calls; + InputData chr_input_data = input_data; // Use a thread-local copy + this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); + { + std::shared_lock lock(this->shared_mutex); + whole_genome_sv_calls[chr] = std::move(sv_calls); + } + } catch (const std::exception& e) { + printError("Error processing chromosome " + chr + ": " + e.what()); + } catch (...) 
{ + printError("Unknown error processing chromosome " + chr); } - } catch (const std::exception& e) { - printError("Error processing chromosome " + chr + ": " + e.what()); - } catch (...) { - printError("Unknown error processing chromosome " + chr); + }; + + // Submit tasks to the thread pool and track futures + std::vector> futures; + for (const auto& chr : chromosomes) { + futures.emplace_back(pool.enqueue([&, chr] { + // printMessage("Processing chromosome " + chr); + process_chr(chr); + })); } - }; - // Submit tasks to the thread pool and track futures - std::vector> futures; - for (const auto& chr : chromosomes) { - futures.emplace_back(pool.enqueue([&, chr] { - // printMessage("Processing chromosome " + chr); - process_chr(chr); - })); - } - - // Wait for all tasks to complete - for (auto& future : futures) { - try { + // Wait for all tasks to complete + for (auto& future : futures) { + try { + current_chr++; + future.get(); + } catch (const std::exception& e) { + printError("Error processing chromosome task: " + std::string(e.what())); + } catch (...) { + printError("Unknown error processing chromosome task."); + } + } + printMessage("All tasks have finished."); + + // ------------------------------------------------------- + // Run copy number variant predictions on the SVs detected from the + // CIGAR string, using a minimum CNV length threshold + current_chr = 0; + printMessage("Running copy number predictions on CIGAR SVs..."); + for (auto& entry : whole_genome_sv_calls) { current_chr++; - future.get(); - } catch (const std::exception& e) { - printError("Error processing chromosome task: " + std::string(e.what())); - } catch (...) 
{ - printError("Unknown error processing chromosome task."); + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); + cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + } } + // ------------------------------------------------------- } - printMessage("All tasks have finished."); - - // ------------------------------------------------------- - // Run copy number variant predictions on the SVs detected from the - // CIGAR string, using a minimum CNV length threshold - current_chr = 0; - printMessage("Running copy number predictions on CIGAR SVs..."); - for (auto& entry : whole_genome_sv_calls) { - current_chr++; - const std::string& chr = entry.first; - std::vector& sv_calls = entry.second; - if (sv_calls.size() > 0) { - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + + if (split_svs) { + // Identify split-SV signatures + printMessage("Identifying split-SV signatures..."); + std::unordered_map> whole_genome_split_sv_calls; + this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); + + printMessage("Running copy number predictions on split-read SVs..."); + current_chr = 0; + for (auto& entry : whole_genome_split_sv_calls) { + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + + if (sv_calls.size() > 0) { + current_chr++; + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); + 
this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + } } - } - // ------------------------------------------------------- - - // Identify split-SV signatures - printMessage("Identifying split-SV signatures..."); - std::unordered_map> whole_genome_split_sv_calls; - this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); - printMessage("Running copy number predictions on split-read SVs..."); - current_chr = 0; - for (auto& entry : whole_genome_split_sv_calls) { - const std::string& chr = entry.first; - std::vector& sv_calls = entry.second; + printMessage("Merging split-read SVs..."); + int min_pts = 2; + for (auto& entry : whole_genome_split_sv_calls) { + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, true); + } - if (sv_calls.size() > 0) { - current_chr++; - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); - this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + printMessage("Unifying SVs..."); + for (auto& entry : whole_genome_split_sv_calls) { + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end()); } } + if (input_data.getSaveCNVData()) { closeJSON(json_fp); } - printMessage("Unifying SVs..."); - for (auto& entry : whole_genome_split_sv_calls) { - const std::string& chr = entry.first; - std::vector& sv_calls = entry.second; - whole_genome_sv_calls[chr].insert(whole_genome_sv_calls[chr].end(), sv_calls.begin(), sv_calls.end()); - } - // Print the total number of SVs detected for each chromosome uint32_t total_sv_count = 0; 
for (const auto& entry : whole_genome_sv_calls) { @@ -958,7 +973,6 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", - "##INFO=", "##INFO=", "##INFO=", "##INFO=", @@ -1029,19 +1043,6 @@ void SVCaller::saveToVCF(const std::unordered_map &target, const std::vector& target.insert(target.end(), source.begin(), source.end()); } -void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) +void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool keep_noise) { printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); @@ -78,9 +78,19 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) dbscan.fit(sv_type_calls); const std::vector& clusters = dbscan.getClusters(); - std::map> cluster_map; - for (size_t i = 0; i < clusters.size(); ++i) { - cluster_map[clusters[i]].push_back(sv_type_calls[i]); + std::map> cluster_map; // Cluster ID to SV calls + // Create a map of cluster IDs to SV calls + if (sv_type == SVType::INS) { + // Add only non-CIGARCLIP SVs to the cluster map + for (size_t i = 0; i < clusters.size(); ++i) { + if (sv_type_calls[i].data_type != "CIGARCLIP") { + cluster_map[clusters[i]].push_back(sv_type_calls[i]); + } + } + } else { + for (size_t i = 0; i < clusters.size(); ++i) { + cluster_map[clusters[i]].push_back(sv_type_calls[i]); + } } // Merge SVs in each cluster @@ -89,7 +99,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; - // [TEST] If insertions, and if any SV has length between 9400 and // 9500, print all SV coordinates in the cluster bool print_all = false; @@ -121,8 +130,14 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) } } - if (cluster_id < 0) { - // Add all noise points to the merged list if >10 kb + if (cluster_id < 0 && keep_noise) { + + // Add all unclustered points to the merged list + for (const auto& sv_call : 
cluster_sv_calls) { + SVCall noise_sv_call = sv_call; + merged_sv_calls.push_back(noise_sv_call); + } + // for (const auto& sv_call : cluster_sv_calls) { // if ((sv_call.end - sv_call.start)+1 >= 10000) { // SVCall noise_sv_call = sv_call; @@ -131,9 +146,14 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) // printMessage("[TEST] Adding noise SV " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); // } // } - continue; // Skip noise and unclassified points + // continue; // Skip noise and unclassified points } else { // if (true) { + + // ---------------------------- + // HMM-BASED MERGING + // ---------------------------- + // Check if any SV has a non-zero likelihood bool has_nonzero_likelihood = false; if (cluster_sv_calls.size() > 0) { @@ -159,14 +179,27 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) { return sv_call.hmm_likelihood != 0.0; }); + + // Add SV call merged_sv_call = *it; + merged_sv_calls.push_back(merged_sv_call); // [TEST] - if (print_all) { - printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); - } + // print_all = true; + // if (print_all) { + // printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); + // printMessage("SV type: " + getSVTypeString(merged_sv_call.sv_type)); + // printMessage("Cluster members:"); + // for (const auto& sv_call : cluster_sv_calls) { + // printMessage(" " + 
std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); + // } + // } + + // ---------------------------- + // CIGAR-BASED MERGING + // ---------------------------- - } else { + } else if (cluster_sv_calls.size() > 1) { // Could be low if all CIGARCLIP // Use the median length SV of the top 10% of the cluster // (shorter reads are often noise) std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { @@ -181,18 +214,9 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) size_t median_index = top_10.size() / 2; merged_sv_call = top_10[median_index]; - // // Get the starting index of the top 10% of the cluster - // // (Cluster is sorted by descending length) - // size_t start_index = std::max(0, (int) (cluster_sv_calls.size() * 0.9)); - - // // Get the top 10% of the cluster - // std::vector top_half(cluster_sv_calls.begin() + start_index, cluster_sv_calls.end()); - - // // Get the median SV for the top 50% of the cluster - // size_t median_index = top_half.size() / 2; - // merged_sv_call = top_half[median_index]; - // int median_index = cluster_sv_calls.size() / 2; - // merged_sv_call = cluster_sv_calls[median_index]; + // Add SV call + merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); + merged_sv_calls.push_back(merged_sv_call); // [TEST] if (print_all) { @@ -200,12 +224,12 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts) } } - if (cluster_id < 0) { - merged_sv_call.cluster_size = cluster_id; - } else { - merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); - } - merged_sv_calls.push_back(merged_sv_call); + // if (cluster_id < 0) { + // merged_sv_call.cluster_size = cluster_id; + // } else { + // merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); + // } + // merged_sv_calls.push_back(merged_sv_call); cluster_count++; } } @@ -213,6 +237,14 @@ void mergeSVs(std::vector& sv_calls, double 
epsilon, int min_pts) } sv_calls = std::move(merged_sv_calls); // Replace with filtered list + // Print an error if any have CIGARCLIP data type + for (const auto& sv_call : sv_calls) { + if (sv_call.data_type == "CIGARCLIP") { + printError("[ERROR1] Found CIGARCLIP SV in merged SVs"); + break; + } + } + int updated_size = sv_calls.size(); printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); } From c7f7f89e7c21bcd79190aa2fffaaff817d6d2b15 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 28 Mar 2025 16:16:19 -0400 Subject: [PATCH 098/134] fix split read merge and reduce svcall overhead --- include/cnv_caller.h | 47 ++-- include/sv_caller.h | 233 +++++++++++-------- include/sv_object.h | 17 +- include/sv_types.h | 69 ++++++ python/plot_distributions.py | 27 +-- src/cnv_caller.cpp | 17 +- src/sv_caller.cpp | 423 ++++++++++++++++++++++++++++------- src/sv_object.cpp | 27 +-- 8 files changed, 611 insertions(+), 249 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 609238ef..87a9f011 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -48,6 +48,19 @@ class CNVCaller { private: std::shared_mutex& shared_mutex; + void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); + + void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const; + + // Query a region for SNPs and return the SNP data + void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const; + + // Split a region into chunks for parallel processing + std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const; + + public: + CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {} + // Define a map of CNV genotypes by HMM predicted state. 
// We only use the first 3 genotypes (0/0, 0/1, 1/1) for the VCF output. // Each of the 6 state predictions corresponds to a copy number state @@ -59,32 +72,24 @@ class CNVCaller { // 4: 1/1 (Copy neutral LOH: no copy number change, GT: 1/1 for homozygous variant) // 5: 2/1 (One copy gain: heterozygous duplication, GT: 1/2->0/1) // 6: 2/2 (Two copy gain: homozygous duplication, GT: 2/2->1/1) - std::map cnv_genotype_map = { - {0, "./."}, - {1, "1/1"}, - {2, "0/1"}, - {3, "0/0"}, - {4, "1/1"}, - {5, "0/1"}, - {6, "1/1"} + const std::unordered_map StateGenotypeMap = { + {0, Genotype::UNKNOWN}, + {1, Genotype::HOMOZYGOUS_ALT}, + {2, Genotype::HETEROZYGOUS}, + {3, Genotype::HOMOZYGOUS_REF}, + {4, Genotype::HOMOZYGOUS_ALT}, + {5, Genotype::HETEROZYGOUS}, + {6, Genotype::HOMOZYGOUS_ALT} }; - void updateSNPData(SNPData& snp_data, uint32_t pos, double pfb, double baf, double log2_cov, bool is_snp); - - void runViterbi(const CHMM& hmm, SNPData& snp_data, std::pair, double>& prediction) const; - - // Query a region for SNPs and return the SNP data - void querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end_pos, const std::vector& pos_depth_map, double mean_chr_cov, SNPData& snp_data, const InputData& input_data) const; - - // Split a region into chunks for parallel processing - std::vector splitRegionIntoChunks(std::string chr, uint32_t start_pos, uint32_t end_pos, int chunk_count) const; - - public: - CNVCaller(std::shared_mutex& shared_mutex) : shared_mutex(shared_mutex) {} + // Function to get the genotype string from the state + inline Genotype getGenotypeFromCNState(int cn_state) const { + return StateGenotypeMap.at(cn_state); + } // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& 
input_data) const; + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; diff --git a/include/sv_caller.h b/include/sv_caller.h index 6e446fa6..247adf8c 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -17,111 +17,157 @@ #include /// @endcond -struct GenomicRegion { - int tid; - hts_pos_t start; - hts_pos_t end; - int query_start; - int query_end; - bool strand; - int cluster_size; // Number of alignments used for this region -}; - -struct PrimaryAlignment { - hts_pos_t start; - hts_pos_t end; - int query_start; - int query_end; - bool strand; - int cluster_size; // Number of alignments used for this region -}; - -struct SuppAlignment { - int tid; - hts_pos_t start; - hts_pos_t end; - int query_start; - int query_end; - bool strand; - int cluster_size; // Number of alignments used for this region -}; - -struct SplitSignature { - int tid; - hts_pos_t start; - hts_pos_t end; - bool strand; - hts_pos_t query_start; - hts_pos_t query_end; -}; - -// Interval Tree Node -struct IntervalNode { - PrimaryAlignment region; - std::string qname; - hts_pos_t max_end; // To optimize queries - std::unique_ptr left; - std::unique_ptr right; - - IntervalNode(PrimaryAlignment r, std::string name) - : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} -}; - -void insert(std::unique_ptr& root, const PrimaryAlignment& region, std::string qname) { - if (!root) { - root = std::make_unique(region, qname); - return; - } - - if (region.start < root->region.start) - { - insert(root->left, region, qname); - } else { - 
insert(root->right, region, qname); - } - - // Update max_end - root->max_end = std::max(root->max_end, region.end); -} - -void findOverlaps(const std::unique_ptr& root, const PrimaryAlignment& query, std::vector& result) { - if (!root) return; - - // If overlapping, add to result - if (query.start <= root->region.end && query.end >= root->region.start) - result.push_back(root->qname); - - // If left subtree may have overlaps, search left - if (root->left && root->left->max_end >= query.start) - findOverlaps(root->left, query, result); - - // Always check the right subtree - findOverlaps(root->right, query, result); -} - -struct MismatchData { - uint32_t query_start; - uint32_t query_end; - std::vector match_map; -}; +// struct GenomicRegion { +// int tid; +// hts_pos_t start; +// hts_pos_t end; +// int query_start; +// int query_end; +// bool strand; +// int cluster_size; // Number of alignments used for this region +// }; + +// struct PrimaryAlignment { +// hts_pos_t start; +// hts_pos_t end; +// int query_start; +// int query_end; +// bool strand; +// int cluster_size; // Number of alignments used for this region +// }; + +// struct SuppAlignment { +// int tid; +// hts_pos_t start; +// hts_pos_t end; +// int query_start; +// int query_end; +// bool strand; +// int cluster_size; // Number of alignments used for this region +// }; + +// struct SplitSignature { +// int tid; +// hts_pos_t start; +// hts_pos_t end; +// bool strand; +// hts_pos_t query_start; +// hts_pos_t query_end; +// }; + +// // Interval Tree Node +// struct IntervalNode { +// PrimaryAlignment region; +// std::string qname; +// hts_pos_t max_end; // To optimize queries +// std::unique_ptr left; +// std::unique_ptr right; + +// IntervalNode(PrimaryAlignment r, std::string name) +// : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} +// }; + +// void insert(std::unique_ptr& root, const PrimaryAlignment& region, std::string qname) { +// if (!root) { +// root = 
std::make_unique(region, qname); +// return; +// } + +// if (region.start < root->region.start) +// { +// insert(root->left, region, qname); +// } else { +// insert(root->right, region, qname); +// } + +// // Update max_end +// root->max_end = std::max(root->max_end, region.end); +// } + +// void findOverlaps(const std::unique_ptr& root, const PrimaryAlignment& query, std::vector& result) { +// if (!root) return; + +// // If overlapping, add to result +// if (query.start <= root->region.end && query.end >= root->region.start) +// result.push_back(root->qname); + +// // If left subtree may have overlaps, search left +// if (root->left && root->left->max_end >= query.start) +// findOverlaps(root->left, query, result); + +// // Always check the right subtree +// findOverlaps(root->right, query, result); +// } class SVCaller { private: + struct GenomicRegion { + int tid; + hts_pos_t start; + hts_pos_t end; + int query_start; + int query_end; + bool strand; + int cluster_size; // Number of alignments used for this region + }; + + struct PrimaryAlignment { + hts_pos_t start; + hts_pos_t end; + int query_start; + int query_end; + bool strand; + int cluster_size; // Number of alignments used for this region + }; + + struct SuppAlignment { + int tid; + hts_pos_t start; + hts_pos_t end; + int query_start; + int query_end; + bool strand; + int cluster_size; // Number of alignments used for this region + }; + + struct SplitSignature { + int tid; + hts_pos_t start; + hts_pos_t end; + bool strand; + hts_pos_t query_start; + hts_pos_t query_end; + }; + + // Interval Tree Node + struct IntervalNode { + PrimaryAlignment region; + std::string qname; + hts_pos_t max_end; // To optimize queries + std::unique_ptr left; + std::unique_ptr right; + + IntervalNode(PrimaryAlignment r, std::string name) + : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} + }; + int min_mapq = 20; // Minimum mapping quality to be considered mutable std::shared_mutex shared_mutex; // 
Shared mutex for thread safety std::vector getChromosomes(const std::string& bam_filepath); - void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& input_data); + void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& input_data, const std::unordered_map>& chr_pos_depth_map, const ReferenceGenome& ref_genome); // Process a single CIGAR record and find candidate SVs - void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, bool is_primary, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); + void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map& read_mismatch_rates); std::pair getAlignmentReadPositions(bam1_t* alignment); - void processChromosome(const std::string& chr, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov); + void processChromosome(const std::string& chr, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::unordered_map& read_mismatch_rates); + + void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map& read_mismatch_rates); - void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome); + double getReadMismatchRate(bam1_t * alignment, const std::string& chr, const ReferenceGenome & ref_genome); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); @@ -138,6 +184,11 @@ class SVCaller { 
// Detect SVs and predict SV type from long read alignments and CNV calls void run(const InputData& input_data); + + // Interval tree + void findOverlaps(const std::unique_ptr& root, const PrimaryAlignment& query, std::vector& result); + + void insert(std::unique_ptr& root, const PrimaryAlignment& region, std::string qname); }; #endif // SV_CALLER_H diff --git a/include/sv_object.h b/include/sv_object.h index 155b50bf..08fe8f70 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -7,27 +7,30 @@ #include #include #include +#include #include "sv_types.h" using namespace sv_types; struct SVCall { - uint32_t start; - uint32_t end; + uint32_t start = 0; + uint32_t end = 0; SVType sv_type = SVType::UNKNOWN; std::string alt_allele = "."; - std::string data_type = "NA"; - std::string genotype = "./."; + SVDataType data_type = SVDataType::UNKNOWN; + Genotype genotype = Genotype::UNKNOWN; double hmm_likelihood = 0.0; int read_depth = 0; // Breakpoint depth - int support = 0; // Number of supporting reads + double mismatch_rate = 0.0; // Highest mismatch rate in reads used for the SV call int cluster_size = 0; // Number of SV calls in the cluster bool operator<(const SVCall& other) const; - SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, std::string data_type, std::string genotype, double hmm_likelihood, int read_depth, int support, int cluster_size) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), support(support), cluster_size(cluster_size) {} + SVCall() = default; + + SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), 
read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {} }; void addSVCall(std::vector& sv_calls, SVCall& sv_call); diff --git a/include/sv_types.h b/include/sv_types.h index 26415935..58f6063b 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -39,6 +39,60 @@ namespace sv_types { {SVType::COMPLEX, "COMPLEX"} }; + // Mapping of SV types to symbols + const std::unordered_map SVTypeSymbol = { + {SVType::UNKNOWN, "."}, + {SVType::DEL, ""}, + {SVType::DUP, ""}, + {SVType::INV, ""}, + {SVType::INS, ""}, + {SVType::BND, ""}, + }; + + // Define constants for genotypes + enum class Genotype { + HOMOZYGOUS_REF = 0, + HETEROZYGOUS = 1, + HOMOZYGOUS_ALT = 2, + UNKNOWN = 3 + }; + + // Mapping of genotypes to strings + const std::unordered_map GenotypeString = { + {Genotype::HOMOZYGOUS_REF, "0/0"}, + {Genotype::HETEROZYGOUS, "0/1"}, + {Genotype::HOMOZYGOUS_ALT, "1/1"}, + {Genotype::UNKNOWN, "./."} + }; + + // Define constants for SV data types (evidence types) + enum class SVDataType { + CIGARINS = 0, + CIGARDEL = 1, + CIGARCLIP = 2, + SPLIT = 3, + SPLITDIST1 = 4, + SPLITDIST2 = 5, + SPLITINV = 6, + SUPPINV = 7, + HMM = 8, + UNKNOWN = 9 + }; + + // Mapping of SV data types to strings + const std::unordered_map SVDataTypeString = { + {SVDataType::CIGARINS, "CIGARINS"}, + {SVDataType::CIGARDEL, "CIGARDEL"}, + {SVDataType::CIGARCLIP, "CIGARCLIP"}, + {SVDataType::SPLIT, "SPLIT"}, + {SVDataType::SPLITDIST1, "SPLITDIST1"}, + {SVDataType::SPLITDIST2, "SPLITDIST2"}, + {SVDataType::SPLITINV, "SPLITINV"}, + {SVDataType::SUPPINV, "SUPPINV"}, + {SVDataType::HMM, "HMM"}, + {SVDataType::UNKNOWN, "UNKNOWN"} + }; + // Mapping of 6 copy number states to SV types const std::unordered_map CNVTypeMap = { {0, SVType::UNKNOWN}, @@ -60,6 +114,21 @@ namespace sv_types { return CNVTypeMap.at(cn_state); } + // Function to get the genotype string + inline std::string getGenotypeString(Genotype genotype) { + return GenotypeString.at(genotype); + } + + // Function to get 
the SV data type string + inline std::string getSVDataTypeString(SVDataType data_type) { + return SVDataTypeString.at(data_type); + } + + // Function to get the SV type symbol + inline std::string getSVTypeSymbol(SVType sv_type) { + return SVTypeSymbol.at(sv_type); + } + // Function to check if an SV type is a valid update from copy number predictions inline bool isValidCopyNumberUpdate(SVType sv_type, SVType updated_sv_type) { if (updated_sv_type == SVType::UNKNOWN) { diff --git a/python/plot_distributions.py b/python/plot_distributions.py index 37eb1638..c2644a8a 100644 --- a/python/plot_distributions.py +++ b/python/plot_distributions.py @@ -89,8 +89,6 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # If the plot title is GIAB, then we need to convert INS to DUP if # INFO/SVTYPE is INS and INFO/REPTYPE is DUP - # if plot_title == "GIAB" and sv_type == "INS": - # Check if GIAB is a substring of the plot title if "GIAB" in plot_title and sv_type == "INS": if 'REPTYPE=DUP' in record['INFO']: sv_type = "DUP" @@ -110,7 +108,6 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Create a dictionary of SV types and their corresponding colors. 
# From: https://davidmathlogic.com/colorblind/ - # sv_colors = {'DEL': '#D81B60', 'DUP': '#1E88E5', 'INV': '#FFC107', 'INS': '#004D40'} # WONG colors sv_colors = {'DEL': '#E69F00', 'DUP': '#56B4E9', 'INV': '#009E73', 'INS': '#F0E442', 'INVDUP': '#D55E00', 'COMPLEX': '#CC79A7'} @@ -163,16 +160,16 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): # Use a log scale for the y-axis axes[i].set_yscale('log') - # # In the same axis, plot a known duplication if within the range of the plot - if sv_type == 'DUP': - print("TEST: Found DUP") - cnv_size = 776237 / size_scale - x_min, x_max = axes[i].get_xlim() - if cnv_size > x_min and cnv_size < x_max: - axes[i].axvline(x=cnv_size, color='black', linestyle='--') - else: - # Print the values - print(f'CNV size: {cnv_size}, x_min: {x_min}, x_max: {x_max}') + # In the same axis, plot a known duplication if within the range of the plot + # if sv_type == 'DUP': + # print("TEST: Found DUP") + # cnv_size = 776237 / size_scale + # x_min, x_max = axes[i].get_xlim() + # if cnv_size > x_min and cnv_size < x_max: + # axes[i].axvline(x=cnv_size, color='black', linestyle='--') + # else: + # # Print the values + # print(f'CNV size: {cnv_size}, x_min: {x_min}, x_max: {x_max}') # Refresh the plot plt.draw() @@ -216,9 +213,9 @@ def generate_sv_size_plot(input_vcf, output_png, plot_title="SV Caller"): fig.update_layout(legend=dict( orientation='v', yanchor='top', - y=0.75, + y=0.9, xanchor='right', - x=0.75, + x=0.9, )) # # Move the legend to the bottom right outside the plot # fig.update_layout(legend=dict( diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 26bca566..c293d7fb 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -155,13 +155,13 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp_hmm); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double 
mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Check that the start position is less than the end position if (start_pos > end_pos) { printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); - return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false); } // Run the Viterbi algorithm on SNPs in the SV region @@ -197,7 +197,7 @@ std::tuple CNVCaller::runCopyNumberPrediction runViterbi(hmm, snp_data, prediction); if (prediction.first.size() == 0) { - return std::make_tuple(0.0, SVType::UNKNOWN, "./.", false); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false); } std::vector& state_sequence = prediction.first; @@ -233,12 +233,13 @@ std::tuple CNVCaller::runCopyNumberPrediction // Update SV type and genotype based on the majority state SVType predicted_cnv_type = SVType::UNKNOWN; - std::string genotype = "./."; + Genotype genotype = Genotype::UNKNOWN; int state_count = (int) sv_states.size(); if ((double) max_count / (double) state_count > pct_threshold) { predicted_cnv_type = getSVTypeFromCNState(max_state); - genotype = cnv_genotype_map.at(max_state); + // genotype = cnv_genotype_map.at(max_state); + genotype = getGenotypeFromCNState(max_state); } snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data @@ -360,18 +361,17 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector SVCaller::getChromosomes(const std::string &bam_filepat return chromosomes; } -void SVCaller::findSplitSVSignatures(std::unordered_map> &sv_calls, const InputData &input_data) +void 
SVCaller::findSplitSVSignatures(std::unordered_map> &sv_calls, const InputData &input_data, const std::unordered_map>& chr_pos_depth_map, const ReferenceGenome& ref_genome) { // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -133,6 +133,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map alignment_tids; // All unique chromosome IDs std::unordered_set supp_qnames; // All unique query names + std::unordered_map read_mismatch_rates; // Query name -> mismatch rate while (readNextAlignment(fp_in, itr, bam1) >= 0) { // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality @@ -154,6 +155,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_mapcore.flag & BAM_FSUPPLEMENTARY) { + // Get the mismatch rate for the read + const std::string supp_chr = bamHdr->target_name[bam1->core.tid]; + double mismatch_rate = getReadMismatchRate(bam1, supp_chr, ref_genome); + read_mismatch_rates[qname] = mismatch_rate; + // printMessage("[TEST] Mismatch rate for " + qname + ": " + std::to_string(mismatch_rate) + " at position " + std::to_string(bam1->core.pos + 1) + std::string(bam_endpos(bam1) > 0 ? 
"-" + std::to_string(bam_endpos(bam1)) : "")); + // Store chromosome (TID), start, and end positions (1-based) of the // supplementary alignment, and the strand (true for forward, false // for reverse) @@ -260,11 +267,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map starts; std::vector ends; std::vector primary_strands; + double min_mismatch_rate = 1.0; for (const std::string& qname : primary_cluster) { const PrimaryAlignment& primary_aln = chr_primary_map.at(qname); starts.push_back(primary_aln.start); ends.push_back(primary_aln.end); primary_strands.push_back(primary_aln.strand); + min_mismatch_rate = std::min(min_mismatch_rate, read_mismatch_rates[qname]); + // printMessage("[TEST-SPLIT] Mismatch rate for " + qname + ": " + std::to_string(read_mismatch_rates[qname])); } // Get the largest cluster of primary alignment start positions @@ -309,9 +319,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map", "SPLITDIST1", "./.", 0.0, 0, 0, primary_cluster_size); + int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos); + // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "", "SPLITDIST1", "./.", 0.0, read_depth, min_mismatch_rate, primary_cluster_size); + SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, primary_cluster_size); + + // Print if end position = 162908547 + if (primary_pos + (read_distance - 1) == 162908547) { + printMessage("[TEST] Adding insertion SV candidate at " + chr_name + ":" + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance - 1)) + " with length " + std::to_string(read_distance)); + } addSVCall(chr_sv_calls, sv_candidate); } } @@ -411,7 +427,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= 50) { - SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, 
supp_best_end), SVType::INV, "", "SUPPINV", "./.", 0.0, 0, 0, supp_cluster_size); + int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end)); + + // Print if end position = 162908547 + if (std::max(supp_best_start, supp_best_end) == 162908547) { + printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(std::min(supp_best_start, supp_best_end)) + "-" + std::to_string(std::max(supp_best_start, supp_best_end)) + " with length " + std::to_string(std::abs(supp_best_start - supp_best_end))); + } + + SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, supp_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -445,16 +468,24 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 2kb, then this is a // potential deletion if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, ".", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); // Add an inversion call if necessary + int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); if (inversion) { - // printMessage("[TEST] Found inversion at " + std::to_string(sv_start) + "-" + std::to_string(sv_end) + ", length=" + std::to_string(sv_length)); - SVCall sv_candidate(sv_start, sv_end, SVType::INV, "", "SPLITINV", "./.", 0.0, 0, 0, cluster_size); + SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size); addSVCall(chr_sv_calls, sv_candidate); + + // Print if end position = 162908547 + if (sv_end == 162908547) { + printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + 
"-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length)); + } } else { - SVCall sv_candidate(sv_start, sv_end, SVType::DEL, "", "SPLITDIST2", "./.", 0.0, 0, 0, cluster_size); + SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size); + + // Print if end position = 162908547 + if (sv_end == 162908547) { + printMessage("[TEST] Adding deletion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and mismatch rate " + std::to_string(min_mismatch_rate)); + } addSVCall(chr_sv_calls, sv_candidate); } } @@ -463,7 +494,13 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; std::string alt = (sv_type == SVType::INV) ? "" : "."; - SVCall sv_candidate(sv_start, sv_end, sv_type, alt, "SPLIT", "./.", 0.0, 0, 0, cluster_size); + int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); + + // Print if end position = 162908547 + if (sv_end == 162908547) { + printMessage("[TEST] Adding CNV SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and type " + getSVTypeSymbol(sv_type) + " and mismatch rate " + std::to_string(min_mismatch_rate)); + } + SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -477,6 +514,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome) +void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, 
const ReferenceGenome& ref_genome, std::unordered_map& read_mismatch_rates) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -502,14 +545,15 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c // Main loop to process the alignments while (readNextAlignment(fp_in, itr, bam1) >= 0) { - // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality - if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq) { + // Skip secondary and unmapped alignments, duplicates, QC failures, and + // low mapping quality, and supplementary alignments + if (bam1->core.flag & BAM_FSECONDARY || bam1->core.flag & BAM_FUNMAP || bam1->core.flag & BAM_FDUP || bam1->core.flag & BAM_FQCFAIL || bam1->core.qual < this->min_mapq || bam1->core.flag & BAM_FSUPPLEMENTARY) { continue; } // Process the alignment - bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); - this->processCIGARRecord(bamHdr, bam1, sv_calls, primary, pos_depth_map, ref_genome); + // bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); + this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map, ref_genome, read_mismatch_rates); } // Clean up the iterator and alignment @@ -517,10 +561,65 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c bam_destroy1(bam1); } -void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, bool is_primary, const std::vector &pos_depth_map, const ReferenceGenome &ref_genome) +double SVCaller::getReadMismatchRate(bam1_t *alignment, const std::string& chr, const ReferenceGenome & ref_genome) +{ + uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array + int cigar_len = alignment->core.n_cigar; + uint32_t query_pos = 0; + uint32_t pos = (uint32_t)alignment->core.pos; + uint32_t aln_start = pos; + uint32_t end = 
(uint32_t)bam_endpos(alignment) - 1; // Rightmost position of the alignment in the reference genome (0-based) + + // Get the reference sequence + std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1); + + // Loop through the CIGAR string and calculate the number of matches and + // mismatches + int match_count = 0; + int mismatch_count = 0; + for (int i = 0; i < cigar_len; i++) { + int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length + int op = bam_cigar_op(cigar[i]); // CIGAR operation + if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + for (int j = 0; j < op_len; j++) { + char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; + if (base == ref_seq[pos - aln_start + j]) { + match_count++; + } else { + mismatch_count++; + } + } + } + // Update the reference position + // https://samtools.github.io/hts-specs/SAMv1.pdf + if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + pos += op_len; + } + + // Update the query position + if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { + query_pos += op_len; + } + } + + // Calculate the mismatch rate + double mismatch_rate = 0.0; + if (match_count + mismatch_count > 0) { + mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); + } + return mismatch_rate; +} + +void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, const std::vector &pos_depth_map, const ReferenceGenome &ref_genome, std::unordered_map &read_mismatch_rates) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name - uint32_t pos = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) + uint32_t aln_start = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) + uint32_t pos = aln_start; + uint32_t end = 
(uint32_t)bam_endpos(alignment) - 1; // Rightmost position of the alignment in the reference genome (0-based) + + // Get the reference sequence (used for mismatch rate) + std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1); + uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; uint32_t query_pos = 0; @@ -537,13 +636,17 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec amb_bases_bitset.set(std::tolower(base)); } + int match_count = 0; + int mismatch_count = 0; + std::vector cigar_sv_calls; + cigar_sv_calls.reserve(1000); for (int i = 0; i < cigar_len; i++) { int op_len = bam_cigar_oplen(cigar[i]); // CIGAR operation length int op = bam_cigar_op(cigar[i]); // CIGAR operation if (op_len >= 50) { // Process the CIGAR operation - if (op == BAM_CINS && is_primary) { + if (op == BAM_CINS) { // Get the sequence of the insertion from the query std::string ins_seq_str(op_len, ' '); @@ -557,51 +660,57 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec } } - // Before the insertion - if (pos >= (uint32_t)op_len-1) - { - uint32_t bp1 = pos - (op_len - 1) + 1; - uint32_t bp2 = bp1 + op_len - 1; //pos + 1; - - if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) - { - int read_depth = this->getReadDepth(pos_depth_map, bp1); - SVCall sv_call(bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, 1, 0); - addSVCall(sv_calls, sv_call); - continue; - } - } - - // After the insertion - if (pos + op_len < ref_genome.getChromosomeLength(chr)) - { - uint32_t bp1 = pos + 1; - uint32_t bp2 = bp1 + op_len - 1; - - if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) - { - int read_depth = this->getReadDepth(pos_depth_map, bp1); - SVCall sv_call(bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, 1, 0); - addSVCall(sv_calls, sv_call); - continue; - } - } + // // Before the insertion + // if 
(pos >= (uint32_t)op_len-1) + // { + // uint32_t bp1 = pos - (op_len - 1) + 1; + // uint32_t bp2 = bp1 + op_len - 1; //pos + 1; + + // if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) + // { + // int read_depth = this->getReadDepth(pos_depth_map, bp1); + // SVCall sv_call(bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, 1, 0); + // // addSVCall(sv_calls, sv_call); + // addSVCall(cigar_sv_calls, sv_call); + // continue; + // } + // } + + // // After the insertion + // if (pos + op_len < ref_genome.getChromosomeLength(chr)) + // { + // uint32_t bp1 = pos + 1; + // uint32_t bp2 = bp1 + op_len - 1; + + // if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) + // { + // int read_depth = this->getReadDepth(pos_depth_map, bp1); + // SVCall sv_call(bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, 1, 0); + // // addSVCall(sv_calls, sv_call); + // addSVCall(cigar_sv_calls, sv_call); + // continue; + // } + // } // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1); + int read_depth = this->getReadDepth(pos_depth_map, ins_pos); // Determine the ALT allele format based on small vs. 
large insertion std::string alt_allele = ""; if (op_len <= 50) { alt_allele = ins_seq_str; } - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARINS", "./.", default_lh, read_depth, 1, 0); - addSVCall(sv_calls, sv_call); + // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, + // "CIGARINS", "./.", default_lh, read_depth, 1, 0); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, read_depth, 1, 0); + // addSVCall(sv_calls, sv_call); + // addSVCall(cigar_sv_calls, sv_call); + cigar_sv_calls.emplace_back(sv_call); // Process clipped bases as potential insertions - } else if (op == BAM_CSOFT_CLIP && is_primary) { + } else if (op == BAM_CSOFT_CLIP) { // Get the sequence of the insertion from the query std::string ins_seq_str(op_len, ' '); for (int j = 0; j < op_len; j++) { @@ -617,25 +726,51 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - int read_depth = this->getReadDepth(pos_depth_map, ins_pos-1); + int read_depth = this->getReadDepth(pos_depth_map, ins_pos); // Determine the ALT allele format based on small vs. 
large insertion std::string alt_allele = ""; if (op_len <= 50) { alt_allele = ins_seq_str; } - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, "CIGARCLIP", "./.", default_lh, read_depth, 1, 0); - addSVCall(sv_calls, sv_call); + // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, + // "CIGARCLIP", "./.", default_lh, read_depth, 0.0, 0); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, read_depth, 0.0, 0); + // addSVCall(sv_calls, sv_call); + cigar_sv_calls.emplace_back(sv_call); // Commented for testing + // printMessage("Completed adding SV: " + std::to_string(ins_pos) + "-" + std::to_string(ins_end) + " " + alt_allele + ", RD=" + std::to_string(read_depth) + ", data type=" + sv_call.data_type); // Check if the CIGAR operation is a deletion - } else if (op == BAM_CDEL && is_primary) { + } else if (op == BAM_CDEL) { ref_pos = pos+1; ref_end = ref_pos + op_len -1; int read_depth = this->getReadDepth(pos_depth_map, ref_pos); - SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", "CIGARDEL", "./.", default_lh, read_depth, 1, 0); - addSVCall(sv_calls, sv_call); + // SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", + // "CIGARDEL", "./.", default_lh, read_depth, 1, 0); + SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, read_depth, 1, 0); + // addSVCall(sv_calls, sv_call); + // addSVCall(cigar_sv_calls, sv_call); + cigar_sv_calls.emplace_back(sv_call); } + + // For matches, calculate the sequence identity + // } else if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { + // if (ref_seq.size() < static_cast(op_len)) { + // printError("ERROR: reference sequence length is less than the CIGAR operation length"); + // continue; + // } + + // // printMessage("Calculating sequence identity for matches"); + // for (int j = 0; j < op_len; j++) { + // char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), 
query_pos + j)]; + // if (base == ref_seq[pos - aln_start + j]) { + // match_count++; + // } else { + // mismatch_count++; + // } + // } + // } } // Update the reference position @@ -649,6 +784,27 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec query_pos += op_len; } } + + // Get the read name + // std::string read_name = bam_get_qname(alignment); + + // If read name starts with c08844a5 then print the read name and the number + // of matches and mismatches + // if (read_name.find("c08844a5") != std::string::npos) { + // printMessage(read_name + ": matches=" + std::to_string(match_count) + ", mismatches=" + std::to_string(mismatch_count) + ", mismatches/length=" + std::to_string((double)mismatch_count / (double)(match_count + mismatch_count))); + // } + // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); + // if (mismatch_rate > 0) { + // printMessage("Read name: " + read_name + ", mismatch rate: " + std::to_string(mismatch_rate) + ", matches: " + std::to_string(match_count) + ", mismatches: " + std::to_string(mismatch_count)); + // } + // read_mismatch_rates[read_name] = mismatch_rate; + // printMessage("Completed processing read: " + read_name); + + // Set the mismatch rate for all SVs from this read, and add the SV calls + for (SVCall& sv_call : cigar_sv_calls) { + // sv_call.mismatch_rate = mismatch_rate; + addSVCall(sv_calls, sv_call); + } } std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) @@ -680,7 +836,7 @@ std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) return std::make_pair(query_start, query_end); } -void SVCaller::processChromosome(const std::string& chr, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov) +void SVCaller::processChromosome(const std::string& chr, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, 
const std::vector& chr_pos_depth_map, double mean_chr_cov, std::unordered_map& read_mismatch_rates) { // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -721,7 +877,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch // ----------------------------------------------------------------------- // Detect SVs from the CIGAR strings printMessage(chr + ": CIGAR SVs..."); - this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome); + this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome, read_mismatch_rates); printMessage(chr + ": Merging CIGAR..."); mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); @@ -794,6 +950,7 @@ void SVCaller::run(const InputData& input_data) chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); } std::unordered_map> whole_genome_sv_calls; + std::unordered_map read_mismatch_rates; int current_chr = 0; int total_chr_count = chromosomes.size(); @@ -810,10 +967,18 @@ void SVCaller::run(const InputData& input_data) try { std::vector sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy - this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); + std::unordered_map chr_read_mismatch_rates; + this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], chr_read_mismatch_rates); { std::shared_lock lock(this->shared_mutex); + + // Update the SV calls for the chromosome whole_genome_sv_calls[chr] = std::move(sv_calls); + + // Update the mismatch rates for each read name + for (const auto& entry : chr_read_mismatch_rates) { + read_mismatch_rates[entry.first] = entry.second; + } } } catch (const std::exception& e) { printError("Error processing chromosome " + chr + ": " + e.what()); @@ -865,7 +1030,7 @@ void SVCaller::run(const InputData& input_data) // 
Identify split-SV signatures printMessage("Identifying split-SV signatures..."); std::unordered_map> whole_genome_split_sv_calls; - this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); + this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data, chr_pos_depth_map, ref_genome); printMessage("Running copy number predictions on split-read SVs..."); current_chr = 0; @@ -883,7 +1048,6 @@ void SVCaller::run(const InputData& input_data) printMessage("Merging split-read SVs..."); int min_pts = 2; for (auto& entry : whole_genome_split_sv_calls) { - const std::string& chr = entry.first; std::vector& sv_calls = entry.second; mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, true); } @@ -916,15 +1080,49 @@ void SVCaller::run(const InputData& input_data) this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome); } +void SVCaller::findOverlaps(const std::unique_ptr &root, const PrimaryAlignment &query, std::vector &result) +{ + if (!root) return; + + // If overlapping, add to result + if (query.start <= root->region.end && query.end >= root->region.start) + result.push_back(root->qname); + + // If left subtree may have overlaps, search left + if (root->left && root->left->max_end >= query.start) + findOverlaps(root->left, query, result); + + // Always check the right subtree + findOverlaps(root->right, query, result); +} + +void SVCaller::insert(std::unique_ptr &root, const PrimaryAlignment ®ion, std::string qname) +{ + if (!root) { + root = std::make_unique(region, qname); + return; + } + + if (region.start < root->region.start) + { + insert(root->left, region, qname); + } else { + insert(root->right, region, qname); + } + + // Update max_end + root->max_end = std::max(root->max_end, region.end); +} + // Run copy number predictions on the SVs detected from the split reads void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller& cnv_caller, const CHMM& hmm, double 
mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) { - std::vector processed_calls; + std::vector additional_calls; for (auto& sv_candidate : split_sv_calls) { - std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); + std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); - std::string genotype = std::get<2>(result); + Genotype genotype = std::get<2>(result); // For inversions with copy-neutral support, update the HMM likelihood if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) { @@ -933,11 +1131,57 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve // Update the SV type if the support is not neutral or unknown else if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) { - sv_candidate.sv_type = supp_type; - sv_candidate.alt_allele = "<" + getSVTypeString(supp_type) + ">"; - sv_candidate.data_type += "+HMM"; // Update the data type to include HMM - sv_candidate.genotype = genotype; - sv_candidate.hmm_likelihood = supp_lh; + // Update information if the SV call is unknown + if (sv_candidate.sv_type == SVType::UNKNOWN) { + sv_candidate.sv_type = supp_type; + sv_candidate.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format + sv_candidate.data_type = SVDataType::HMM; + sv_candidate.genotype = genotype; + sv_candidate.hmm_likelihood = supp_lh; + + // Print if end position = 162908547 + if (sv_candidate.end == 162908547) { + printMessage("SV at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + + " updated to type " + getSVTypeSymbol(supp_type) + + " with likelihood " + std::to_string(supp_lh) + + " and genotype " + getGenotypeString(genotype)); + } + // Add an 
additional SV call if the type is different + } else if (sv_candidate.sv_type != supp_type) { + SVCall new_sv_call = sv_candidate; // Copy the original SV call + new_sv_call.sv_type = supp_type; + new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format + new_sv_call.data_type = SVDataType::HMM; + new_sv_call.genotype = genotype; + new_sv_call.hmm_likelihood = supp_lh; + + // Print if end position = 162908547 + if (new_sv_call.end == 162908547) { + printMessage("Additional SV at " + chr + ":" + std::to_string(new_sv_call.start) + "-" + std::to_string(new_sv_call.end) + + " with type " + getSVTypeSymbol(supp_type) + + " and likelihood " + std::to_string(supp_lh) + + " and genotype " + getGenotypeString(genotype)); + } + additional_calls.push_back(new_sv_call); + } + } + } + + // Add the additional SV calls to the original list, replacing any existing + // ones + for (auto& new_sv_call : additional_calls) { + bool found = false; + for (auto& existing_sv_call : split_sv_calls) { + if (existing_sv_call.start == new_sv_call.start && existing_sv_call.end == new_sv_call.end && + existing_sv_call.sv_type == new_sv_call.sv_type) { + // Update the existing SV call with the new one + existing_sv_call = new_sv_call; + found = true; + break; + } + } + if (!found) { + addSVCall(split_sv_calls, new_sv_call); // Add as a new SV call } } } @@ -979,6 +1223,7 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", "##INFO=", + "##INFO=", "##FILTER=", "##FILTER=", "##FORMAT=", @@ -1016,6 +1261,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls = pair.second; @@ -1025,19 +1271,30 @@ void SVCaller::saveToVCF(const std::unordered_map max_mismatch_rate) { + filter = "LowQual"; + filtered_svs += 1; + } // If the SV type is unknown, print a warning and skip if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { - unclassified_svs += 1; + unclassified_svs += 1; continue; } else { total_count += 1; @@ -1095,17 +1352,16 @@ void 
SVCaller::saveToVCF(const std::unordered_map samples = {sample_str}; - // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, SAMPLES) - vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl; + // Write the SV call to the file (CHROM, POS, ID, REF, ALT, QUAL, + // FILTER, INFO, FORMAT, SAMPLES) + vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << filter << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl; + // vcf_stream << chr << "\t" << start << "\t" << "." << "\t" << ref_allele << "\t" << alt_allele << "\t" << "." << "\t" << "PASS" << "\t" << info_str << "\t" << format_str << "\t" << samples[0] << std::endl; } } vcf_stream.close(); @@ -1116,6 +1372,7 @@ void SVCaller::saveToVCF(const std::unordered_map 0) { std::cout << "Total unclassified SVs: " << unclassified_svs << std::endl; } + printMessage("Total PASS filtered SVs: " + std::to_string(filtered_svs)); } int SVCaller::getReadDepth(const std::vector& pos_depth_map, uint32_t start) @@ -1124,7 +1381,7 @@ int SVCaller::getReadDepth(const std::vector& pos_depth_map, uint32_t try { read_depth += pos_depth_map.at(start); } catch (const std::out_of_range& e) { - printError("Error: Start position " + std::to_string(start) + " not found in depth map."); + printError("Error: Start position " + std::to_string(start) + " not found in depth map of size " + std::to_string(pos_depth_map.size()) + ". 
Exception: " + e.what()); } return read_depth; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 89c2b0c3..b94b6a78 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -83,7 +83,7 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k if (sv_type == SVType::INS) { // Add only non-CIGARCLIP SVs to the cluster map for (size_t i = 0; i < clusters.size(); ++i) { - if (sv_type_calls[i].data_type != "CIGARCLIP") { + if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) { cluster_map[clusters[i]].push_back(sv_type_calls[i]); } } @@ -138,17 +138,7 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k merged_sv_calls.push_back(noise_sv_call); } - // for (const auto& sv_call : cluster_sv_calls) { - // if ((sv_call.end - sv_call.start)+1 >= 10000) { - // SVCall noise_sv_call = sv_call; - // noise_sv_call.cluster_size = cluster_id; - // merged_sv_calls.push_back(noise_sv_call); - // printMessage("[TEST] Adding noise SV " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); - // } - // } - // continue; // Skip noise and unclassified points } else { - // if (true) { // ---------------------------- // HMM-BASED MERGING @@ -170,9 +160,9 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k SVCall merged_sv_call = cluster_sv_calls[0]; if (has_nonzero_likelihood) { // These are detected from split reads, choose the one with - // the highest non-zero likelihood + // the highest non-zero likelihood normalized by the length of the SV std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.hmm_likelihood > b.hmm_likelihood; + return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1)); }); // Obtain the highest non-zero likelihood @@ -236,15 +226,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k 
printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type)); } sv_calls = std::move(merged_sv_calls); // Replace with filtered list - - // Print an error if any have CIGARCLIP data type - for (const auto& sv_call : sv_calls) { - if (sv_call.data_type == "CIGARCLIP") { - printError("[ERROR1] Found CIGARCLIP SV in merged SVs"); - break; - } - } - int updated_size = sv_calls.size(); printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); } @@ -272,7 +253,7 @@ void mergeDuplicateSVs(std::vector &sv_calls) // If the likelihoods are equal, keep the one with the larger cluster size // This is to ensure that the SV call with more supporting reads is // kept - else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size > sv_calls[i - 1].cluster_size) { + else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) { combined_sv_calls.back() = sv_call; } } else { From b1cf23f0bf349f19f8c50deba51c0beb44f71b6f Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 29 Mar 2025 19:40:58 -0400 Subject: [PATCH 099/134] remove mismatch filter --- src/sv_caller.cpp | 23 ++++++++++++----------- src/sv_object.cpp | 7 ------- 2 files changed, 12 insertions(+), 18 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index eac69390..7a7a3191 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -383,7 +383,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_mapgetReadDepth(chr_pos_depth_map.at(chr_name), primary_pos); - // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, "", "SPLITDIST1", "./.", 0.0, read_depth, min_mismatch_rate, primary_cluster_size); SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, 
Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, primary_cluster_size); // Print if end position = 162908547 @@ -711,6 +710,13 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Process clipped bases as potential insertions } else if (op == BAM_CSOFT_CLIP) { + // Soft-clipped bases are considered as potential insertions + // Skip if the position exceeds the reference genome length + if (pos + 1 >= pos_depth_map.size()) { + // printMessage("Skipping soft-clipped insertion at position " + std::to_string(pos + 1) + " as it exceeds the reference genome length"); + continue; + } + // Get the sequence of the insertion from the query std::string ins_seq_str(op_len, ' '); for (int j = 0; j < op_len; j++) { @@ -722,12 +728,12 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec ins_seq_str[j] = base; } } - + // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; int read_depth = this->getReadDepth(pos_depth_map, ins_pos); - + // Determine the ALT allele format based on small vs. large insertion std::string alt_allele = ""; if (op_len <= 50) { @@ -1283,14 +1289,7 @@ void SVCaller::saveToVCF(const std::unordered_map max_mismatch_rate) { - filter = "LowQual"; - filtered_svs += 1; - } // If the SV type is unknown, print a warning and skip if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { @@ -1381,7 +1380,9 @@ int SVCaller::getReadDepth(const std::vector& pos_depth_map, uint32_t try { read_depth += pos_depth_map.at(start); } catch (const std::out_of_range& e) { - printError("Error: Start position " + std::to_string(start) + " not found in depth map of size " + std::to_string(pos_depth_map.size()) + ". 
Exception: " + e.what()); + // Occurs with clipped reads (insertion evidence) that are outside the + // range of the depth map + printError("Warning: Read depth for position " + std::to_string(start) + " is out of range of size " + std::to_string(pos_depth_map.size())); } return read_depth; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index b94b6a78..ed3f8802 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -213,13 +213,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with median SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); } } - - // if (cluster_id < 0) { - // merged_sv_call.cluster_size = cluster_id; - // } else { - // merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); - // } - // merged_sv_calls.push_back(merged_sv_call); cluster_count++; } } From a0e713fece9c562c58f6a0914f3bc2de1faa5072 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 31 Mar 2025 15:31:31 -0400 Subject: [PATCH 100/134] update read depth and mismatch rate --- include/sv_caller.h | 93 ++-------------------------- src/sv_caller.cpp | 146 +++++++++++++------------------------------- 2 files changed, 48 insertions(+), 191 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 247adf8c..4a30820c 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -17,87 +17,6 @@ #include /// @endcond -// struct GenomicRegion { -// int tid; -// hts_pos_t start; -// hts_pos_t end; -// int query_start; -// int query_end; -// bool strand; -// int cluster_size; // Number of alignments used for this region -// }; - -// struct PrimaryAlignment { -// hts_pos_t start; -// hts_pos_t end; -// int query_start; -// int query_end; -// bool strand; -// int cluster_size; // Number of alignments used for this region -// }; - -// struct SuppAlignment { -// 
int tid; -// hts_pos_t start; -// hts_pos_t end; -// int query_start; -// int query_end; -// bool strand; -// int cluster_size; // Number of alignments used for this region -// }; - -// struct SplitSignature { -// int tid; -// hts_pos_t start; -// hts_pos_t end; -// bool strand; -// hts_pos_t query_start; -// hts_pos_t query_end; -// }; - -// // Interval Tree Node -// struct IntervalNode { -// PrimaryAlignment region; -// std::string qname; -// hts_pos_t max_end; // To optimize queries -// std::unique_ptr left; -// std::unique_ptr right; - -// IntervalNode(PrimaryAlignment r, std::string name) -// : region(r), qname(name), max_end(r.end), left(nullptr), right(nullptr) {} -// }; - -// void insert(std::unique_ptr& root, const PrimaryAlignment& region, std::string qname) { -// if (!root) { -// root = std::make_unique(region, qname); -// return; -// } - -// if (region.start < root->region.start) -// { -// insert(root->left, region, qname); -// } else { -// insert(root->right, region, qname); -// } - -// // Update max_end -// root->max_end = std::max(root->max_end, region.end); -// } - -// void findOverlaps(const std::unique_ptr& root, const PrimaryAlignment& query, std::vector& result) { -// if (!root) return; - -// // If overlapping, add to result -// if (query.start <= root->region.end && query.end >= root->region.start) -// result.push_back(root->qname); - -// // If left subtree may have overlaps, search left -// if (root->left && root->left->max_end >= query.start) -// findOverlaps(root->left, query, result); - -// // Always check the right subtree -// findOverlaps(root->right, query, result); -// } class SVCaller { private: @@ -127,7 +46,7 @@ class SVCaller { int query_start; int query_end; bool strand; - int cluster_size; // Number of alignments used for this region + double mismatch_rate; // Mismatch rate for this alignment }; struct SplitSignature { @@ -159,13 +78,13 @@ class SVCaller { void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& 
input_data, const std::unordered_map>& chr_pos_depth_map, const ReferenceGenome& ref_genome); // Process a single CIGAR record and find candidate SVs - void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map& read_mismatch_rates); + void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, const std::vector& pos_depth_map); std::pair getAlignmentReadPositions(bam1_t* alignment); - void processChromosome(const std::string& chr, std::vector& combined_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::unordered_map& read_mismatch_rates); + void processChromosome(const std::string& chr, std::vector& combined_sv_calls, const InputData& input_data, const std::vector& chr_pos_depth_map, double mean_chr_cov); - void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map& read_mismatch_rates); + void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map); double getReadMismatchRate(bam1_t * alignment, const std::string& chr, const ReferenceGenome & ref_genome); @@ -174,10 +93,10 @@ class SVCaller { void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector &pos_depth_map, const InputData &input_data); - void saveToVCF(const std::unordered_map> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome) const; + void saveToVCF(const std::unordered_map> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome, const std::unordered_map>& chr_pos_depth_map) const; 
// Query the read depth (INFO/DP) at a position - int getReadDepth(const std::vector& pos_depth_map, uint32_t start); + int getReadDepth(const std::vector& pos_depth_map, uint32_t start) const; public: SVCaller() = default; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 7a7a3191..b25991be 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -133,7 +133,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map alignment_tids; // All unique chromosome IDs std::unordered_set supp_qnames; // All unique query names - std::unordered_map read_mismatch_rates; // Query name -> mismatch rate while (readNextAlignment(fp_in, itr, bam1) >= 0) { // Skip secondary and unmapped alignments, duplicates, QC failures, and low mapping quality @@ -158,14 +157,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_maptarget_name[bam1->core.tid]; double mismatch_rate = getReadMismatchRate(bam1, supp_chr, ref_genome); - read_mismatch_rates[qname] = mismatch_rate; - // printMessage("[TEST] Mismatch rate for " + qname + ": " + std::to_string(mismatch_rate) + " at position " + std::to_string(bam1->core.pos + 1) + std::string(bam_endpos(bam1) > 0 ? 
"-" + std::to_string(bam_endpos(bam1)) : "")); // Store chromosome (TID), start, and end positions (1-based) of the // supplementary alignment, and the strand (true for forward, false // for reverse) std::pair qpos = getAlignmentReadPositions(bam1); - supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), 0}); + supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), mismatch_rate}); alignment_tids.insert(bam1->core.tid); supp_qnames.insert(qname); supplementary_count++; @@ -177,6 +174,10 @@ void SVCaller::findSplitSVSignatures(std::unordered_map> to_remove; for (auto& chr_primary : primary_map) { @@ -198,6 +199,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_maptarget_name[primary_tid]; @@ -267,14 +269,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_map starts; std::vector ends; std::vector primary_strands; - double min_mismatch_rate = 1.0; for (const std::string& qname : primary_cluster) { const PrimaryAlignment& primary_aln = chr_primary_map.at(qname); starts.push_back(primary_aln.start); ends.push_back(primary_aln.end); primary_strands.push_back(primary_aln.strand); - min_mismatch_rate = std::min(min_mismatch_rate, read_mismatch_rates[qname]); - // printMessage("[TEST-SPLIT] Mismatch rate for " + qname + ": " + std::to_string(read_mismatch_rates[qname])); } // Get the largest cluster of primary alignment start positions @@ -296,6 +295,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map supp_ends; std::vector supp_strands; std::vector split_distances; + std::vector supp_mismatch_rates; for (const std::string& qname : primary_cluster) { const PrimaryAlignment& primary_aln = chr_primary_map.at(qname); const std::vector& supp_alns = supp_map.at(qname); @@ -306,6 +306,7 @@ void 
SVCaller::findSplitSVSignatures(std::unordered_mapgetReadDepth(chr_pos_depth_map.at(chr_name), primary_pos); - SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, primary_cluster_size); + //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos); + SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, primary_cluster_size); // Print if end position = 162908547 if (primary_pos + (read_distance - 1) == 162908547) { @@ -426,14 +432,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= 50) { - int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end)); + //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end)); // Print if end position = 162908547 if (std::max(supp_best_start, supp_best_end) == 162908547) { printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(std::min(supp_best_start, supp_best_end)) + "-" + std::to_string(std::max(supp_best_start, supp_best_end)) + " with length " + std::to_string(std::abs(supp_best_start - supp_best_end))); } - SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, supp_cluster_size); + SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, supp_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -469,22 +475,12 @@ void 
SVCaller::findSplitSVSignatures(std::unordered_map 2000 && sv_length <= 1000000) { // Add an inversion call if necessary - int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); + //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); if (inversion) { - SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size); + SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size); addSVCall(chr_sv_calls, sv_candidate); - - // Print if end position = 162908547 - if (sv_end == 162908547) { - printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length)); - } } else { - SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size); - - // Print if end position = 162908547 - if (sv_end == 162908547) { - printMessage("[TEST] Adding deletion SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and mismatch rate " + std::to_string(min_mismatch_rate)); - } + SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -493,13 +489,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; std::string alt = (sv_type == SVType::INV) ? 
"" : "."; - int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); - - // Print if end position = 162908547 - if (sv_end == 162908547) { - printMessage("[TEST] Adding CNV SV candidate at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and type " + getSVTypeSymbol(sv_type) + " and mismatch rate " + std::to_string(min_mismatch_rate)); - } - SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, read_depth, min_mismatch_rate, cluster_size); + //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); + SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -526,7 +517,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map& sv_calls, const std::vector& pos_depth_map, const ReferenceGenome& ref_genome, std::unordered_map& read_mismatch_rates) +void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map) { // Create a read and iterator for the region bam1_t *bam1 = bam_init1(); @@ -552,7 +543,7 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c // Process the alignment // bool primary = !(bam1->core.flag & BAM_FSUPPLEMENTARY); - this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map, ref_genome, read_mismatch_rates); + this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map); } // Clean up the iterator and alignment @@ -609,15 +600,12 @@ double SVCaller::getReadMismatchRate(bam1_t *alignment, const std::string& chr, return mismatch_rate; } -void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, const std::vector &pos_depth_map, const ReferenceGenome &ref_genome, std::unordered_map 
&read_mismatch_rates) +void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, const std::vector &pos_depth_map) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t aln_start = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) uint32_t pos = aln_start; - uint32_t end = (uint32_t)bam_endpos(alignment) - 1; // Rightmost position of the alignment in the reference genome (0-based) - - // Get the reference sequence (used for mismatch rate) - std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1); + // uint32_t end = (uint32_t)bam_endpos(alignment) - 1; // Rightmost position of the alignment in the reference genome (0-based) uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; @@ -635,8 +623,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec amb_bases_bitset.set(std::tolower(base)); } - int match_count = 0; - int mismatch_count = 0; std::vector cigar_sv_calls; cigar_sv_calls.reserve(1000); for (int i = 0; i < cigar_len; i++) { @@ -658,54 +644,18 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec ins_seq_str[j] = base; } } - - // // Before the insertion - // if (pos >= (uint32_t)op_len-1) - // { - // uint32_t bp1 = pos - (op_len - 1) + 1; - // uint32_t bp2 = bp1 + op_len - 1; //pos + 1; - - // if (ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) - // { - // int read_depth = this->getReadDepth(pos_depth_map, bp1); - // SVCall sv_call(bp1, bp2, SVType::DUP, "", "LSEQSIM", "./.", default_lh, read_depth, 1, 0); - // // addSVCall(sv_calls, sv_call); - // addSVCall(cigar_sv_calls, sv_call); - // continue; - // } - // } - - // // After the insertion - // if (pos + op_len < ref_genome.getChromosomeLength(chr)) - // { - // uint32_t bp1 = pos + 1; - // uint32_t bp2 = bp1 + op_len - 1; - - // if 
(ref_genome.compare(chr, bp1, bp2, ins_seq_str, DUP_SEQSIM_THRESHOLD)) - // { - // int read_depth = this->getReadDepth(pos_depth_map, bp1); - // SVCall sv_call(bp1, bp2, SVType::DUP, "", "RSEQSIM", "./.", default_lh, read_depth, 1, 0); - // // addSVCall(sv_calls, sv_call); - // addSVCall(cigar_sv_calls, sv_call); - // continue; - // } - // } // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - int read_depth = this->getReadDepth(pos_depth_map, ins_pos); + //int read_depth = this->getReadDepth(pos_depth_map, ins_pos); // Determine the ALT allele format based on small vs. large insertion std::string alt_allele = ""; if (op_len <= 50) { alt_allele = ins_seq_str; } - // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, - // "CIGARINS", "./.", default_lh, read_depth, 1, 0); - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, read_depth, 1, 0); - // addSVCall(sv_calls, sv_call); - // addSVCall(cigar_sv_calls, sv_call); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 1, 0); cigar_sv_calls.emplace_back(sv_call); // Process clipped bases as potential insertions @@ -713,7 +663,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Soft-clipped bases are considered as potential insertions // Skip if the position exceeds the reference genome length if (pos + 1 >= pos_depth_map.size()) { - // printMessage("Skipping soft-clipped insertion at position " + std::to_string(pos + 1) + " as it exceeds the reference genome length"); continue; } @@ -732,29 +681,25 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - int read_depth = this->getReadDepth(pos_depth_map, ins_pos); + //int read_depth = this->getReadDepth(pos_depth_map, ins_pos); // Determine the 
ALT allele format based on small vs. large insertion std::string alt_allele = ""; if (op_len <= 50) { alt_allele = ins_seq_str; } - // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, - // "CIGARCLIP", "./.", default_lh, read_depth, 0.0, 0); - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, read_depth, 0.0, 0); - // addSVCall(sv_calls, sv_call); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0.0, 0); cigar_sv_calls.emplace_back(sv_call); // Commented for testing - // printMessage("Completed adding SV: " + std::to_string(ins_pos) + "-" + std::to_string(ins_end) + " " + alt_allele + ", RD=" + std::to_string(read_depth) + ", data type=" + sv_call.data_type); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL) { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - int read_depth = this->getReadDepth(pos_depth_map, ref_pos); + //int read_depth = this->getReadDepth(pos_depth_map, ref_pos); // SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", // "CIGARDEL", "./.", default_lh, read_depth, 1, 0); - SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, read_depth, 1, 0); + SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 1, 0); // addSVCall(sv_calls, sv_call); // addSVCall(cigar_sv_calls, sv_call); cigar_sv_calls.emplace_back(sv_call); @@ -842,7 +787,7 @@ std::pair SVCaller::getAlignmentReadPositions(bam1_t *alignment) return std::make_pair(query_start, query_end); } -void SVCaller::processChromosome(const std::string& chr, std::vector& chr_sv_calls, const InputData& input_data, const ReferenceGenome& ref_genome, const std::vector& chr_pos_depth_map, double mean_chr_cov, std::unordered_map& read_mismatch_rates) +void SVCaller::processChromosome(const 
std::string& chr, std::vector& chr_sv_calls, const InputData& input_data, const std::vector& chr_pos_depth_map, double mean_chr_cov) { // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -883,7 +828,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch // ----------------------------------------------------------------------- // Detect SVs from the CIGAR strings printMessage(chr + ": CIGAR SVs..."); - this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map, ref_genome, read_mismatch_rates); + this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map); printMessage(chr + ": Merging CIGAR..."); mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); @@ -956,7 +901,6 @@ void SVCaller::run(const InputData& input_data) chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); } std::unordered_map> whole_genome_sv_calls; - std::unordered_map read_mismatch_rates; int current_chr = 0; int total_chr_count = chromosomes.size(); @@ -973,18 +917,10 @@ void SVCaller::run(const InputData& input_data) try { std::vector sv_calls; InputData chr_input_data = input_data; // Use a thread-local copy - std::unordered_map chr_read_mismatch_rates; - this->processChromosome(chr, sv_calls, chr_input_data, ref_genome, chr_pos_depth_map[chr], chr_mean_cov_map[chr], chr_read_mismatch_rates); + this->processChromosome(chr, sv_calls, chr_input_data, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); { std::shared_lock lock(this->shared_mutex); - - // Update the SV calls for the chromosome whole_genome_sv_calls[chr] = std::move(sv_calls); - - // Update the mismatch rates for each read name - for (const auto& entry : chr_read_mismatch_rates) { - read_mismatch_rates[entry.first] = entry.second; - } } } catch (const std::exception& e) { printError("Error processing chromosome " + chr + ": " + e.what()); @@ -997,7 +933,6 @@ void SVCaller::run(const InputData& input_data) 
std::vector> futures; for (const auto& chr : chromosomes) { futures.emplace_back(pool.enqueue([&, chr] { - // printMessage("Processing chromosome " + chr); process_chr(chr); })); } @@ -1083,7 +1018,7 @@ void SVCaller::run(const InputData& input_data) // Save to VCF std::cout << "Saving SVs to VCF..." << std::endl; const std::string output_dir = input_data.getOutputDir(); - this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome); + this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map); } void SVCaller::findOverlaps(const std::unique_ptr &root, const PrimaryAlignment &query, std::vector &result) @@ -1192,7 +1127,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve } } -void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome) const +void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map>& chr_pos_depth_map) const { std::cout << "Creating VCF writer..." 
<< std::endl; std::string output_vcf = output_dir + "/output.vcf"; @@ -1286,7 +1221,7 @@ void SVCaller::saveToVCF(const std::unordered_mapgetReadDepth(chr_pos_depth_map.at(chr), start); // Create the VCF parameter strings std::string sv_type_str = getSVTypeString(sv_type); @@ -1374,7 +1312,7 @@ void SVCaller::saveToVCF(const std::unordered_map& pos_depth_map, uint32_t start) +int SVCaller::getReadDepth(const std::vector& pos_depth_map, uint32_t start) const { int read_depth = 0; try { From b7eed21fc6816d12e6a96df8622ef430594ce4b9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 11 Apr 2025 11:24:54 -0400 Subject: [PATCH 101/134] add cn state and read distance features --- include/cnv_caller.h | 2 +- include/sv_caller.h | 5 +- include/sv_object.h | 12 +- src/cnv_caller.cpp | 12 +- src/sv_caller.cpp | 530 +++++++++++++++++++++++-------------------- src/sv_object.cpp | 2 +- 6 files changed, 296 insertions(+), 267 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 87a9f011..03764da8 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -89,7 +89,7 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; diff --git a/include/sv_caller.h 
b/include/sv_caller.h index 4a30820c..4ef59700 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -46,7 +46,6 @@ class SVCaller { int query_start; int query_end; bool strand; - double mismatch_rate; // Mismatch rate for this alignment }; struct SplitSignature { @@ -75,7 +74,7 @@ class SVCaller { std::vector getChromosomes(const std::string& bam_filepath); - void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& input_data, const std::unordered_map>& chr_pos_depth_map, const ReferenceGenome& ref_genome); + void findSplitSVSignatures(std::unordered_map>& sv_calls, const InputData& input_data); // Process a single CIGAR record and find candidate SVs void processCIGARRecord(bam_hdr_t* header, bam1_t* alignment, std::vector& sv_calls, const std::vector& pos_depth_map); @@ -85,8 +84,6 @@ class SVCaller { void processChromosome(const std::string& chr, std::vector& combined_sv_calls, const InputData& input_data, const std::vector& chr_pos_depth_map, double mean_chr_cov); void findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, const std::string& region, std::vector& sv_calls, const std::vector& pos_depth_map); - - double getReadMismatchRate(bam1_t * alignment, const std::string& chr, const ReferenceGenome & ref_genome); // Read the next alignment from the BAM file in a thread-safe manner int readNextAlignment(samFile *fp_in, hts_itr_t *itr, bam1_t *bam1); diff --git a/include/sv_object.h b/include/sv_object.h index 08fe8f70..a99fb4fb 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -21,16 +21,20 @@ struct SVCall { SVDataType data_type = SVDataType::UNKNOWN; Genotype genotype = Genotype::UNKNOWN; double hmm_likelihood = 0.0; - int read_depth = 0; // Breakpoint depth - double mismatch_rate = 0.0; // Highest mismatch rate in reads used for the SV call + int cn_state = 0; // Copy number state + int aln_offset = 0; // Alignment offset (read vs. 
reference distance factor) + // int read_depth = 0; // Breakpoint depth + // double mismatch_rate = 0.0; // Highest mismatch rate in reads used for the SV call int cluster_size = 0; // Number of SV calls in the cluster bool operator<(const SVCall& other) const; SVCall() = default; - SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {} + SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {} + // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) : + // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {} }; void addSVCall(std::vector& sv_calls, SVCall& sv_call); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c293d7fb..c7832fd1 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -155,13 +155,13 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp_hmm); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, 
uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Check that the start position is less than the end position if (start_pos > end_pos) { printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); - return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0); } // Run the Viterbi algorithm on SNPs in the SV region @@ -197,7 +197,7 @@ std::tuple CNVCaller::runCopyNumberPrediction(st runViterbi(hmm, snp_data, prediction); if (prediction.first.size() == 0) { - return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0); } std::vector& state_sequence = prediction.first; @@ -238,7 +238,6 @@ std::tuple CNVCaller::runCopyNumberPrediction(st if ((double) max_count / (double) state_count > pct_threshold) { predicted_cnv_type = getSVTypeFromCNState(max_state); - // genotype = cnv_genotype_map.at(max_state); genotype = getGenotypeFromCNState(max_state); } snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data @@ -281,7 +280,7 @@ std::tuple CNVCaller::runCopyNumberPrediction(st this->saveSVCopyNumberToJSON(before_sv, after_sv, snp_data, chr, start_pos, end_pos, cnv_type_str, likelihood, json_filepath); } - return std::make_tuple(likelihood, predicted_cnv_type, genotype, true); + return std::make_tuple(likelihood, predicted_cnv_type, genotype, true, max_state); } @@ -369,9 +368,10 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector SVCaller::getChromosomes(const std::string 
&bam_filepat return chromosomes; } -void SVCaller::findSplitSVSignatures(std::unordered_map> &sv_calls, const InputData &input_data, const std::unordered_map>& chr_pos_depth_map, const ReferenceGenome& ref_genome) +void SVCaller::findSplitSVSignatures(std::unordered_map> &sv_calls, const InputData &input_data) { // Open the BAM file std::string bam_filepath = input_data.getLongReadBam(); @@ -154,15 +154,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_mapcore.flag & BAM_FSUPPLEMENTARY) { - // Get the mismatch rate for the read - const std::string supp_chr = bamHdr->target_name[bam1->core.tid]; - double mismatch_rate = getReadMismatchRate(bam1, supp_chr, ref_genome); - // Store chromosome (TID), start, and end positions (1-based) of the // supplementary alignment, and the strand (true for forward, false // for reverse) std::pair qpos = getAlignmentReadPositions(bam1); - supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE), mismatch_rate}); + supp_map[qname].push_back(SuppAlignment{bam1->core.tid, bam1->core.pos + 1, bam_endpos(bam1), qpos.first, qpos.second, !(bam1->core.flag & BAM_FREVERSE)}); alignment_tids.insert(bam1->core.tid); supp_qnames.insert(qname); supplementary_count++; @@ -206,6 +202,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map chr_sv_calls; + chr_sv_calls.reserve(1000); const std::unordered_map& chr_primary_map = chr_primary.second; // Identify overlapping primary alignments and cluster endpoints @@ -294,32 +291,33 @@ void SVCaller::findSplitSVSignatures(std::unordered_map supp_starts; std::vector supp_ends; std::vector supp_strands; - std::vector split_distances; - std::vector supp_mismatch_rates; + std::vector read_distances; + std::vector ref_distances; for (const std::string& qname : primary_cluster) { const PrimaryAlignment& primary_aln = chr_primary_map.at(qname); const std::vector& supp_alns = supp_map.at(qname); for 
(const SuppAlignment& supp_aln : supp_alns) { if (supp_aln.tid == primary_tid) { // Same chromosome - int distance = 0; + int read_distance = 0; + int ref_distance = 0; supp_starts.push_back(supp_aln.start); supp_ends.push_back(supp_aln.end); supp_strands.push_back(supp_aln.strand); - supp_mismatch_rates.push_back(supp_aln.mismatch_rate); // Calculate the distance between the primary and supplementary // alignments on the read if on the same chromosome and same // strand if (supp_aln.strand == primary_aln.strand) { // Same strand - // Calculate distance (negative if overlapping) - if (primary_aln.query_start <= supp_aln.query_start) { - distance = supp_aln.query_start - primary_aln.query_end; - } else { - distance = primary_aln.query_start - supp_aln.query_end; - } - split_distances.push_back(distance); + // Calculate distance between alignments on the read + read_distance = std::max(0, std::max(static_cast(supp_aln.query_start), static_cast(primary_aln.query_start)) - std::min(static_cast(supp_aln.query_end), static_cast(primary_aln.query_end))); + + // Calculate distance between alignments on the + // reference + ref_distance = std::max(0, std::max(static_cast(supp_aln.start), static_cast(primary_aln.start)) - std::min(static_cast(supp_aln.end), static_cast(primary_aln.end))); + read_distances.push_back(read_distance); + ref_distances.push_back(ref_distance); } } else { @@ -327,11 +325,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map supp_end_cluster = dbscan.getLargestCluster(supp_ends); - // Get the largest cluster of split distances - dbscan.fit(split_distances); - std::vector split_distance_cluster = dbscan.getLargestCluster(split_distances); + // Get the largest cluster of read distances + dbscan.fit(read_distances); + std::vector read_distance_cluster = dbscan.getLargestCluster(read_distances); + + // Get the largest cluster of reference distances + dbscan.fit(ref_distances); + std::vector ref_distance_cluster = 
dbscan.getLargestCluster(ref_distances); // Continue if no clusters were found - if (supp_start_cluster.empty() && supp_end_cluster.empty() && split_distance_cluster.empty()) { + if (supp_start_cluster.empty() && supp_end_cluster.empty() && read_distance_cluster.empty() && ref_distance_cluster.empty()) { continue; } // Use the median of the largest cluster of primary and supplementary // alignment start, end positions as the final genome coordinates of the // SV - int primary_pos = -1; - int primary_pos2 = -1; + // int primary_pos = -1; + // int primary_pos2 = -1; + std::vector primary_positions; int primary_cluster_size = 0; - if (primary_start_cluster.size() > primary_end_cluster.size()) { + if (!primary_start_cluster.empty()) { std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); - primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); primary_cluster_size = primary_start_cluster.size(); - } else if (primary_end_cluster.size() > primary_start_cluster.size()) { - std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); - primary_pos = primary_end_cluster[primary_end_cluster.size() / 2]; - primary_cluster_size = primary_end_cluster.size(); - } else { - // Use both positions - std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + } + + if (!primary_end_cluster.empty()) { std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); - primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; - primary_pos2 = primary_end_cluster[primary_end_cluster.size() / 2]; - primary_cluster_size = primary_start_cluster.size(); + primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]); + primary_cluster_size = std::max(primary_cluster_size, (int) primary_end_cluster.size()); } + // if (primary_start_cluster.size() > primary_end_cluster.size()) { + // 
std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + // // primary_pos = + // // primary_start_cluster[primary_start_cluster.size() / 2]; + // primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); + // primary_cluster_size = primary_start_cluster.size(); + // } else if (primary_end_cluster.size() > primary_start_cluster.size()) { + // std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); + // // primary_pos = primary_end_cluster[primary_end_cluster.size() + // // / 2]; + // primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]); + // primary_cluster_size = primary_end_cluster.size(); + // } else { + // // Use both positions + // std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); + // std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); + // // primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; + // // primary_pos2 = primary_end_cluster[primary_end_cluster.size() + // // / 2]; + // primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); + // primary_cluster_size = primary_start_cluster.size(); + // } // ------------------------------- - // SPLIT INSERTION DETECTION + // SPLIT INSERTION CALLS int read_distance = 0; - if (!split_distance_cluster.empty()) { + int ref_distance = 0; + if (!read_distance_cluster.empty() && !ref_distance_cluster.empty()) { // Use the median of the largest cluster of split distances as the // insertion size - std::sort(split_distance_cluster.begin(), split_distance_cluster.end()); - read_distance = split_distance_cluster[split_distance_cluster.size() / 2]; + std::sort(read_distance_cluster.begin(), read_distance_cluster.end()); + read_distance = read_distance_cluster[read_distance_cluster.size() / 2]; + + std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end()); + ref_distance = ref_distance_cluster[ref_distance_cluster.size() / 2]; // Add an insertion SV 
call at the primary position - if (primary_pos != -1 && read_distance > 2000) { - if (primary_pos2 != -1) { - // If two positions were found, use the 5'most position - primary_pos = std::min(primary_pos, primary_pos2); - } - //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), primary_pos); - SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), SVType::INS, getSVTypeSymbol(SVType::INS), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, primary_cluster_size); - - // Print if end position = 162908547 - if (primary_pos + (read_distance - 1) == 162908547) { - printMessage("[TEST] Adding insertion SV candidate at " + chr_name + ":" + std::to_string(primary_pos) + "-" + std::to_string(primary_pos + (read_distance - 1)) + " with length " + std::to_string(read_distance)); + // Using a minimum read distance of 2000bp since most insertions + // < 2kb can be identified more accurately using CIGAR-based + // methods + // if (primary_pos != -1 && read_distance > 2000) { + // if (primary_pos != -1) { + // if (primary_pos2 != -1) { + // // If two positions were found, use the 5'most position + // primary_pos = std::min(primary_pos, primary_pos2); + // } + if (!primary_positions.empty()) { + int aln_offset = static_cast(ref_distance - read_distance); + if (read_distance > ref_distance && read_distance >= min_length && read_distance <= max_length) { + // Add an insertion SV call at the primary positions + SVType sv_type = SVType::INS; + // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + for (int primary_pos : primary_positions) { + SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + addSVCall(chr_sv_calls, 
sv_candidate); + } + } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) { + for (int primary_pos : primary_positions) { + SVType sv_type = SVType::DEL; + SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + } + // Add a deletion SV call at the primary position + // SVType sv_type = SVType::DEL; + // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); } - addSVCall(chr_sv_calls, sv_candidate); } + + // if (ref_distance >= 50 && ref_distance < read_distance) { + // // Add an insertion SV call at the primary position + // SVType sv_type = SVType::INS; + // int aln_offset = static_cast(ref_distance - read_distance); + // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + // } + // SVType sv_type = SVType::INS; + // int aln_offset = static_cast(read_distance - ref_distance); + // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); } + // if (!ref_distance_cluster.empty()) { + // // Use the median of the largest cluster of split distances as the + // // insertion size + // std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end()); + // size_t median_index = ref_distance_cluster.size() / 2; + // ref_distance = ref_distance_cluster[median_index]; + // read_distance = read_distance_cluster[median_index]; + + // // Add a deletion SV call at the primary 
position + // if (primary_pos != -1 && ref_distance >= 50 && ref_distance > read_distance) { + // if (primary_pos2 != -1) { + // // If two positions were found, use the 5'most position + // primary_pos = std::min(primary_pos, primary_pos2); + // } + // SVType sv_type = SVType::DEL; + // int aln_offset = static_cast(ref_distance - read_distance); + // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + + // // Add an inversion if necessary (inverted deletion) + // if (inversion) { + // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + // } + // } + // } + // -------------------------------- // Get the supplementary alignment positions - int supp_pos = -1; - int supp_pos2 = -1; + // int supp_pos = -1; + // int supp_pos2 = -1; + std::vector supp_positions; int supp_cluster_size = 0; - int supp_best_start = -1; - int supp_best_end = -1; + // int supp_best_start = -1; + // int supp_best_end = -1; + // if (!supp_start_cluster.empty()) { + // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + // supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2]; + // } + // if (!supp_end_cluster.empty()) { + // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + // supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2]; + // } + if (!supp_start_cluster.empty()) { std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2]; + supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); + supp_cluster_size = supp_start_cluster.size(); } if (!supp_end_cluster.empty()) { 
std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2]; + supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); + supp_cluster_size = std::max(supp_cluster_size, (int) supp_end_cluster.size()); } - if (supp_start_cluster.size() > supp_end_cluster.size()) { - supp_pos = supp_best_start; - supp_cluster_size = supp_start_cluster.size(); - } else if (supp_end_cluster.size() > supp_start_cluster.size()) { - supp_pos = supp_best_end; - supp_cluster_size = supp_end_cluster.size(); - } else if (supp_best_end == -1 && supp_best_start == -1) { - // Use both positions. This has been shown to occur in some nested SVs - supp_pos = supp_best_start; - supp_pos2 = supp_best_end; - supp_cluster_size = supp_start_cluster.size(); - } + // if (supp_start_cluster.size() > supp_end_cluster.size()) { + // // supp_pos = supp_best_start; + // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + // supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); + // // supp_positions.push_back(supp_best_start); + // supp_cluster_size = supp_start_cluster.size(); + // } else if (supp_end_cluster.size() > supp_start_cluster.size()) { + // // supp_pos = supp_best_end; + // // supp_positions.push_back(supp_best_end); + // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + // supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); + // supp_cluster_size = supp_end_cluster.size(); + // } else if (supp_start_cluster.size() == supp_end_cluster.size() && !supp_start_cluster.empty() && !supp_end_cluster.empty()) { + // // Use both positions. 
This has been shown to occur in some nested SVs + // // supp_pos = supp_best_start; + // // supp_pos2 = supp_best_end; + // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + // supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); + // supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); + // supp_cluster_size = supp_start_cluster.size(); + // // supp_positions.push_back(supp_best_start); + // // supp_positions.push_back(supp_best_end); + // supp_cluster_size = supp_start_cluster.size(); + // } // Store the inversion as the supplementary start and end positions - if (supp_best_start != -1 && supp_best_end != -1) { - if (inversion && std::abs(supp_best_start - supp_best_end) >= 50) { - //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), std::min(supp_best_start, supp_best_end)); - - // Print if end position = 162908547 - if (std::max(supp_best_start, supp_best_end) == 162908547) { - printMessage("[TEST] Adding inversion SV candidate at " + chr_name + ":" + std::to_string(std::min(supp_best_start, supp_best_end)) + "-" + std::to_string(std::max(supp_best_start, supp_best_end)) + " with length " + std::to_string(std::abs(supp_best_start - supp_best_end))); - } - - SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, supp_cluster_size); + if (inversion && supp_positions.size() > 1) { + std::sort(supp_positions.begin(), supp_positions.end()); + int supp_start = supp_positions.front(); + int supp_end = supp_positions.back(); + int sv_length = std::abs(supp_start - supp_end); + if (sv_length >= min_length && sv_length <= max_length) { + SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 
0, supp_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } + // int sv_length = std::abs(supp_best_start - supp_best_end); + // if (inversion && sv_length >= min_length && sv_length <= max_length) { + // // SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); + // // addSVCall(chr_sv_calls, sv_candidate); + // } } // If two of either were found, use the larger SV candidate - if (primary_pos2 != -1) { - int sv_length1 = std::abs(primary_pos - supp_pos); - int sv_length2 = std::abs(primary_pos2 - supp_pos); - if (sv_length2 > sv_length1) { - primary_pos = primary_pos2; - } - } - if (supp_pos2 != -1) { - int sv_length1 = std::abs(primary_pos - supp_pos); - int sv_length2 = std::abs(primary_pos - supp_pos2); - if (sv_length2 > sv_length1) { - supp_pos = supp_pos2; - } - } + // if (primary_pos2 != -1) { + // int sv_length1 = std::abs(primary_pos - supp_pos); + // int sv_length2 = std::abs(primary_pos2 - supp_pos); + // if (sv_length2 > sv_length1) { + // primary_pos = primary_pos2; + // } + // } + // if (supp_pos2 != -1) { + // int sv_length1 = std::abs(primary_pos - supp_pos); + // int sv_length2 = std::abs(primary_pos - supp_pos2); + // if (sv_length2 > sv_length1) { + // supp_pos = supp_pos2; + // } + // } - if (primary_pos == -1 || supp_pos == -1) { - continue; - } + // if (primary_pos == -1 || supp_pos == -1) { + // continue; + // } // Store the SV candidate if the length is within the specified range - int sv_start = std::min(primary_pos, supp_pos); - int sv_end = std::max(primary_pos, supp_pos); - int sv_length = sv_end - sv_start + 1; - int cluster_size = std::max(primary_cluster_size, supp_cluster_size); + // int sv_start = std::min(primary_pos, supp_pos); + // int sv_end = std::max(primary_pos, supp_pos); + // int sv_length = sv_end - sv_start + 1; + // int cluster_size = 
std::max(primary_cluster_size, supp_cluster_size); // If the read distance is < 30bp while the SV is > 2kb, then this is a // potential deletion - if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - - // Add an inversion call if necessary - //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); - if (inversion) { - SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size); - addSVCall(chr_sv_calls, sv_candidate); - } else { - SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size); - addSVCall(chr_sv_calls, sv_candidate); - } - } + // if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { + + // // Add an inversion call if necessary + // if (inversion) { + // for (int primary_pos : primary_positions) { + // for (int supp_pos : supp_positions) { + // SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + // } + // } + // // SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); + // // addSVCall(chr_sv_calls, sv_candidate); + // } else { + // for (int primary_pos : primary_positions) { + // for (int supp_pos : supp_positions) { + // uint32_t sv_start = std::min(primary_pos, supp_pos); + // uint32_t sv_end = std::max(primary_pos, supp_pos); + // if (sv_end - sv_start + 1 >= 50) { + // SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); + // 
addSVCall(chr_sv_calls, sv_candidate); + // } + // } + // // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); + // // addSVCall(chr_sv_calls, sv_candidate); + // } + // } // Add a dummy SV call for CNV detection - else if (sv_length >= min_length && sv_length <= max_length) { - SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; - std::string alt = (sv_type == SVType::INV) ? "" : "."; - //int read_depth = this->getReadDepth(chr_pos_depth_map.at(chr_name), sv_start); - SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, mean_supp_mismatch_rate, cluster_size); - addSVCall(chr_sv_calls, sv_candidate); + // if (sv_length >= min_length && sv_length <= max_length) { + int cluster_size = std::max(primary_cluster_size, supp_cluster_size); + SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; + std::string alt = (sv_type == SVType::INV) ? 
"" : "."; + for (int primary_pos : primary_positions) { + for (int supp_pos : supp_positions) { + int sv_start = std::min(primary_pos, supp_pos); + int sv_end = std::max(primary_pos, supp_pos); + int sv_length = sv_end - sv_start + 1; + if (sv_length >= min_length && sv_length <= max_length) { + // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size)); + SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + } + } } + // SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + // } } // Combine SVs with identical start and end positions, and sum the cluster @@ -505,10 +633,10 @@ void SVCaller::findSplitSVSignatures(std::unordered_mapcore.flag & BAM_FSUPPLEMENTARY); this->processCIGARRecord(bamHdr, bam1, sv_calls, pos_depth_map); } @@ -551,61 +678,11 @@ void SVCaller::findCIGARSVs(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr, c bam_destroy1(bam1); } -double SVCaller::getReadMismatchRate(bam1_t *alignment, const std::string& chr, const ReferenceGenome & ref_genome) -{ - uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array - int cigar_len = alignment->core.n_cigar; - uint32_t query_pos = 0; - uint32_t pos = (uint32_t)alignment->core.pos; - uint32_t aln_start = pos; - uint32_t end = (uint32_t)bam_endpos(alignment) - 1; // Rightmost position of the alignment in the reference genome (0-based) - - // Get the reference sequence - std::string_view ref_seq = ref_genome.query(chr, pos + 1, end + 1); - - // Loop through the CIGAR string and calculate the number of matches and - // mismatches - int match_count = 0; - int mismatch_count = 0; - for (int i = 0; i < cigar_len; i++) { - int op_len = 
bam_cigar_oplen(cigar[i]); // CIGAR operation length - int op = bam_cigar_op(cigar[i]); // CIGAR operation - if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - for (int j = 0; j < op_len; j++) { - char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; - if (base == ref_seq[pos - aln_start + j]) { - match_count++; - } else { - mismatch_count++; - } - } - } - // Update the reference position - // https://samtools.github.io/hts-specs/SAMv1.pdf - if (op == BAM_CMATCH || op == BAM_CDEL || op == BAM_CREF_SKIP || op == BAM_CEQUAL || op == BAM_CDIFF) { - pos += op_len; - } - - // Update the query position - if (op == BAM_CMATCH || op == BAM_CINS || op == BAM_CSOFT_CLIP || op == BAM_CEQUAL || op == BAM_CDIFF) { - query_pos += op_len; - } - } - - // Calculate the mismatch rate - double mismatch_rate = 0.0; - if (match_count + mismatch_count > 0) { - mismatch_rate = static_cast(mismatch_count) / static_cast(match_count + mismatch_count); - } - return mismatch_rate; -} - void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vector &sv_calls, const std::vector &pos_depth_map) { std::string chr = header->target_name[alignment->core.tid]; // Chromosome name uint32_t aln_start = (uint32_t)alignment->core.pos; // Leftmost position of the alignment in the reference genome (0-based) uint32_t pos = aln_start; - // uint32_t end = (uint32_t)bam_endpos(alignment) - 1; // Rightmost position of the alignment in the reference genome (0-based) uint32_t* cigar = bam_get_cigar(alignment); // CIGAR array int cigar_len = alignment->core.n_cigar; @@ -648,7 +725,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - //int read_depth = this->getReadDepth(pos_depth_map, ins_pos); // Determine the ALT allele format based on small vs. 
large insertion std::string alt_allele = ""; @@ -681,7 +757,6 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec // Add as an insertion uint32_t ins_pos = pos + 1; uint32_t ins_end = ins_pos + op_len - 1; - //int read_depth = this->getReadDepth(pos_depth_map, ins_pos); // Determine the ALT allele format based on small vs. large insertion std::string alt_allele = ""; @@ -689,39 +764,16 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec alt_allele = ins_seq_str; } SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0.0, 0); - cigar_sv_calls.emplace_back(sv_call); // Commented for testing + cigar_sv_calls.emplace_back(sv_call); // Check if the CIGAR operation is a deletion } else if (op == BAM_CDEL) { ref_pos = pos+1; ref_end = ref_pos + op_len -1; - //int read_depth = this->getReadDepth(pos_depth_map, ref_pos); - // SVCall sv_call(ref_pos, ref_end, SVType::DEL, "", - // "CIGARDEL", "./.", default_lh, read_depth, 1, 0); SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 1, 0); - // addSVCall(sv_calls, sv_call); - // addSVCall(cigar_sv_calls, sv_call); cigar_sv_calls.emplace_back(sv_call); } - - // For matches, calculate the sequence identity - // } else if (op == BAM_CMATCH || op == BAM_CEQUAL || op == BAM_CDIFF) { - // if (ref_seq.size() < static_cast(op_len)) { - // printError("ERROR: reference sequence length is less than the CIGAR operation length"); - // continue; - // } - - // // printMessage("Calculating sequence identity for matches"); - // for (int j = 0; j < op_len; j++) { - // char base = seq_nt16_str[bam_seqi(bam_get_seq(alignment), query_pos + j)]; - // if (base == ref_seq[pos - aln_start + j]) { - // match_count++; - // } else { - // mismatch_count++; - // } - // } - // } } // Update the reference position @@ -736,24 +788,7 @@ void 
SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec } } - // Get the read name - // std::string read_name = bam_get_qname(alignment); - - // If read name starts with c08844a5 then print the read name and the number - // of matches and mismatches - // if (read_name.find("c08844a5") != std::string::npos) { - // printMessage(read_name + ": matches=" + std::to_string(match_count) + ", mismatches=" + std::to_string(mismatch_count) + ", mismatches/length=" + std::to_string((double)mismatch_count / (double)(match_count + mismatch_count))); - // } - // double mismatch_rate = (double)mismatch_count / (double)(match_count + mismatch_count); - // if (mismatch_rate > 0) { - // printMessage("Read name: " + read_name + ", mismatch rate: " + std::to_string(mismatch_rate) + ", matches: " + std::to_string(match_count) + ", mismatches: " + std::to_string(mismatch_count)); - // } - // read_mismatch_rates[read_name] = mismatch_rate; - // printMessage("Completed processing read: " + read_name); - - // Set the mismatch rate for all SVs from this read, and add the SV calls for (SVCall& sv_call : cigar_sv_calls) { - // sv_call.mismatch_rate = mismatch_rate; addSVCall(sv_calls, sv_call); } } @@ -858,6 +893,9 @@ void SVCaller::run(const InputData& input_data) // Get the chromosomes from the input BAM file chromosomes = this->getChromosomes(input_data.getLongReadBam()); } + + // [TEST] Use only the last 6 chromosomes + // chromosomes = {"chr6", "chr7", "chr8", "chr9", "chr10", "chr11"}; // Read the HMM from the file std::string hmm_filepath = input_data.getHMMFilepath(); @@ -971,7 +1009,7 @@ void SVCaller::run(const InputData& input_data) // Identify split-SV signatures printMessage("Identifying split-SV signatures..."); std::unordered_map> whole_genome_split_sv_calls; - this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data, chr_pos_depth_map, ref_genome); + this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); printMessage("Running copy 
number predictions on split-read SVs..."); current_chr = 0; @@ -1060,14 +1098,17 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve { std::vector additional_calls; for (auto& sv_candidate : split_sv_calls) { - std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); + std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); Genotype genotype = std::get<2>(result); + int cn_state = std::get<3>(result); // For inversions with copy-neutral support, update the HMM likelihood if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) { sv_candidate.hmm_likelihood = supp_lh; + sv_candidate.genotype = genotype; + sv_candidate.cn_state = cn_state; } // Update the SV type if the support is not neutral or unknown @@ -1077,32 +1118,21 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve sv_candidate.sv_type = supp_type; sv_candidate.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format sv_candidate.data_type = SVDataType::HMM; - sv_candidate.genotype = genotype; sv_candidate.hmm_likelihood = supp_lh; + sv_candidate.genotype = genotype; + sv_candidate.cn_state = cn_state; - // Print if end position = 162908547 - if (sv_candidate.end == 162908547) { - printMessage("SV at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + - " updated to type " + getSVTypeSymbol(supp_type) + - " with likelihood " + std::to_string(supp_lh) + - " and genotype " + getGenotypeString(genotype)); - } // Add an additional SV call if the type is different } else if (sv_candidate.sv_type != supp_type) { SVCall new_sv_call = sv_candidate; // Copy the original SV call new_sv_call.sv_type = supp_type; 
new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format new_sv_call.data_type = SVDataType::HMM; - new_sv_call.genotype = genotype; new_sv_call.hmm_likelihood = supp_lh; + new_sv_call.genotype = genotype; + new_sv_call.cn_state = cn_state; - // Print if end position = 162908547 - if (new_sv_call.end == 162908547) { - printMessage("Additional SV at " + chr + ":" + std::to_string(new_sv_call.start) + "-" + std::to_string(new_sv_call.end) + - " with type " + getSVTypeSymbol(supp_type) + - " and likelihood " + std::to_string(supp_lh) + - " and genotype " + getGenotypeString(genotype)); - } + // Add the new SV call to the list additional_calls.push_back(new_sv_call); } } @@ -1164,7 +1194,8 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", "##INFO=", - "##INFO=", + "##INFO=", + "##INFO=", "##FILTER=", "##FILTER=", "##FORMAT=", @@ -1208,23 +1239,21 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls = pair.second; std::cout << "Saving SV calls for " << chr << "..." 
<< std::endl; for (const auto& sv_call : sv_calls) { - // Get the SV candidate and SV info uint32_t start = sv_call.start; uint32_t end = sv_call.end; + int sv_length = end - start + 1; + std::string ref_allele = "."; + std::string alt_allele = sv_call.alt_allele; SVType sv_type = sv_call.sv_type; - // std::string genotype = sv_call.genotype; - // std::string data_type_str = sv_call.data_type; - // std::string alt_allele = sv_call.alt_allele; std::string genotype = getGenotypeString(sv_call.genotype); std::string data_type_str = getSVDataTypeString(sv_call.data_type); - std::string alt_allele = sv_call.alt_allele; double hmm_likelihood = sv_call.hmm_likelihood; - int sv_length = end - start + 1; int cluster_size = sv_call.cluster_size; //int read_depth = sv_call.read_depth; - std::string ref_allele = "."; - double mismatch_rate = sv_call.mismatch_rate; + // double mismatch_rate = sv_call.mismatch_rate; std::string filter = "PASS"; + int aln_offset = sv_call.aln_offset; + int cn_state = sv_call.cn_state; // If the SV type is unknown, print a warning and skip if (sv_type == SVType::UNKNOWN || sv_type == SVType::NEUTRAL) { @@ -1289,8 +1318,8 @@ void SVCaller::saveToVCF(const std::unordered_map samples = {sample_str}; @@ -1298,7 +1327,6 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls, SVCall& sv_call) { // Check if the SV call is valid if (sv_call.start > sv_call.end) { - printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end)); + printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type)); return; } From 153db48e2e670922f6bd9ecb5142270860f191d4 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 14 Apr 2025 20:15:57 -0400 Subject: [PATCH 102/134] fix save cnv error --- include/cnv_caller.h | 9 +++++- include/sv_types.h | 10 ++----- src/cnv_caller.cpp | 45 
+++++++++++++++++++---------- src/fasta_query.cpp | 10 ++++++- src/main.cpp | 29 +++++++++++++++++++ src/sv_caller.cpp | 51 +++++++++++++++++++++----------- src/sv_object.cpp | 69 ++++++++++---------------------------------- 7 files changed, 128 insertions(+), 95 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index 03764da8..deb9187d 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -8,6 +8,7 @@ #include "input_data.h" #include "sv_types.h" #include "sv_object.h" +#include "utils.h" /// @cond #include @@ -84,7 +85,13 @@ class CNVCaller { // Function to get the genotype string from the state inline Genotype getGenotypeFromCNState(int cn_state) const { - return StateGenotypeMap.at(cn_state); + // return StateGenotypeMap.at(cn_state); + try { + return StateGenotypeMap.at(cn_state); + } catch (const std::out_of_range& e) { + printError("ERROR: Invalid CN state: " + std::to_string(cn_state)); + return Genotype::UNKNOWN; + } } // Run copy number prediction for a single SV candidate, returning the diff --git a/include/sv_types.h b/include/sv_types.h index 58f6063b..dd67c2a4 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -20,9 +20,7 @@ namespace sv_types { INS = 3, BND = 4, NEUTRAL = 5, // Neutral copy number with unknown type - INV_DUP = 6, // Inverted duplication - INV_DEL = 7, // Inverted deletion - COMPLEX = 8 // Complex SV + LOH = 6 // Loss of heterozygosity }; // Mapping of SV types to strings @@ -34,9 +32,7 @@ namespace sv_types { {SVType::INS, "INS"}, {SVType::BND, "BND"}, {SVType::NEUTRAL, "NEUTRAL"}, - {SVType::INV_DUP, "INVDUP"}, - {SVType::INV_DEL, "INVDEL"}, - {SVType::COMPLEX, "COMPLEX"} + {SVType::LOH, "LOH"} }; // Mapping of SV types to symbols @@ -99,7 +95,7 @@ namespace sv_types { {1, SVType::DEL}, {2, SVType::DEL}, {3, SVType::NEUTRAL}, - {4, SVType::NEUTRAL}, + {4, SVType::LOH}, {5, SVType::DUP}, {6, SVType::DUP} }; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c7832fd1..dd602c1c 100644 --- 
a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -57,15 +57,24 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::vector snp_pos; std::unordered_map snp_baf_map; std::unordered_map snp_pfb_map; + printMessage("Reading SNP data for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf_map, snp_pfb_map, input_data); // Get the log2 ratio for evenly spaced positions in the // region sample_size = std::max((int) snp_pos.size(), sample_size); + // Print an error if the end position is less than or equal to the start + // position + if (start_pos > end_pos) + { + printError("ERROR: Invalid SNP region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + return; + } + // Loop through evenly spaced positions in the region and get the log2 ratio - double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size; - std::unordered_set snp_pos_set(snp_pos.begin(), snp_pos.end()); + double pos_step = static_cast(end_pos - start_pos + 1) / static_cast(sample_size); + // double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size; std::unordered_map window_log2_map; for (int i = 0; i < sample_size; i++) { @@ -166,31 +175,30 @@ std::tuple CNVCaller::runCopyNumberPredicti // Run the Viterbi algorithm on SNPs in the SV region // Only extend the region if "save CNV data" is enabled - uint32_t snp_start_pos = start_pos; - uint32_t snp_end_pos = end_pos; SNPData before_sv; SNPData after_sv; if (input_data.getSaveCNVData()) { - uint32_t sv_half_length = (end_pos - start_pos) / 2.0; - if (start_pos > 1) + int sv_half_length = (static_cast(end_pos) - static_cast(start_pos)) / 2; + int before_sv_start = std::max(1, static_cast(start_pos) - sv_half_length); + int before_sv_end = std::max(1, static_cast(start_pos) - 1); + if 
(before_sv_start < before_sv_end) { - uint32_t before_sv_start = std::max((uint32_t) 1, start_pos - sv_half_length); - uint32_t before_sv_end = start_pos - 1; querySNPRegion(chr, before_sv_start, before_sv_end, pos_depth_map, mean_chr_cov, before_sv, input_data); } - uint32_t chr_last_index = pos_depth_map.size() - 1; - if (end_pos < chr_last_index) + + int chr_last_index = static_cast(pos_depth_map.size()) - 1; + int after_sv_start = std::min(chr_last_index, static_cast(end_pos) + 1); + int after_sv_end = std::min(chr_last_index, static_cast(end_pos) + sv_half_length); + if (after_sv_start < after_sv_end) { - uint32_t after_sv_start = end_pos + 1; - uint32_t after_sv_end = std::min(chr_last_index, end_pos + sv_half_length); querySNPRegion(chr, after_sv_start, after_sv_end, pos_depth_map, mean_chr_cov, after_sv, input_data); } } // Query the SNP region for the SV candidate SNPData snp_data; - querySNPRegion(chr, snp_start_pos, snp_end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); + querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm std::pair, double> prediction; @@ -316,6 +324,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorquerySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm @@ -324,6 +333,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector, double> prediction; runViterbi(hmm, snp_data, prediction); std::vector& state_sequence = prediction.first; @@ -364,6 +374,11 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector ReferenceGenome::getChromosomes() const uint32_t ReferenceGenome::getChromosomeLength(std::string chr) const { - return this->chr_to_length.at(chr); + try + { + return this->chr_to_length.at(chr); + } + catch (const std::out_of_range& e) + { + printError("Chromosome " + chr + " not found in reference genome"); + return 0; + } 
} diff --git a/src/main.cpp b/src/main.cpp index b793619b..64fc4e5f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,11 @@ /// @cond DOXYGEN_IGNORE #include #include + +// For signal handling +#include +#include + // #include /// @endcond @@ -12,8 +17,32 @@ #include "utils.h" +void printStackTrace(int sig) +{ + void *array[10]; + size_t size; + + // get void*'s for all entries on the stack + size = backtrace(array, 10); + + // print out all the frames to stderr + fprintf(stderr, "Error: signal %d:\n", sig); + backtrace_symbols_fd(array, size, STDERR_FILENO); + exit(1); +} + + void runContextSV(const std::unordered_map& args) { + // Set up signal handling + signal(SIGSEGV, printStackTrace); + signal(SIGABRT, printStackTrace); + signal(SIGINT, printStackTrace); + signal(SIGTERM, printStackTrace); + signal(SIGILL, printStackTrace); + signal(SIGFPE, printStackTrace); + signal(SIGBUS, printStackTrace); + // Placeholder for setting up input data and running ContextSV std::cout << "ContextSV version " << VERSION << std::endl; std::cout << "Input parameters:" << std::endl; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index c5607e31..e10b36e7 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -895,7 +895,7 @@ void SVCaller::run(const InputData& input_data) } // [TEST] Use only the last 6 chromosomes - // chromosomes = {"chr6", "chr7", "chr8", "chr9", "chr10", "chr11"}; + //chromosomes = {"chr1", "chrX"}; // Read the HMM from the file std::string hmm_filepath = input_data.getHMMFilepath(); @@ -920,7 +920,8 @@ void SVCaller::run(const InputData& input_data) uint32_t chr_len = ref_genome.getChromosomeLength(chr); if (chr_len == 0) { printError("Chromosome " + chr + " not found in reference genome"); - continue; + return; + // continue; } chr_pos_depth_map[chr] = std::vector(chr_len+1, 0); // 1-based index chr_mean_cov_map[chr] = 0.0; @@ -954,6 +955,7 @@ void SVCaller::run(const InputData& input_data) auto process_chr = [&](const std::string& chr) { try { 
std::vector sv_calls; + sv_calls.reserve(1000); InputData chr_input_data = input_data; // Use a thread-local copy this->processChromosome(chr, sv_calls, chr_input_data, chr_pos_depth_map[chr], chr_mean_cov_map[chr]); { @@ -1104,17 +1106,22 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve Genotype genotype = std::get<2>(result); int cn_state = std::get<3>(result); - // For inversions with copy-neutral support, update the HMM likelihood - if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) { - sv_candidate.hmm_likelihood = supp_lh; - sv_candidate.genotype = genotype; - sv_candidate.cn_state = cn_state; - } + // // For inversions with copy-neutral support, update the HMM likelihood + // if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) { + // sv_candidate.hmm_likelihood = supp_lh; + // sv_candidate.genotype = genotype; + // sv_candidate.cn_state = cn_state; + // } - // Update the SV type if the support is not neutral or unknown - else if (supp_type != SVType::UNKNOWN && supp_type != SVType::NEUTRAL) { - // Update information if the SV call is unknown - if (sv_candidate.sv_type == SVType::UNKNOWN) { + // // Update the SV type if the state is not neutral or unknown + // else if (supp_type != SVType::UNKNOWN && supp_type != + // SVType::NEUTRAL) { + + // Update the SV type if the predicted type is not unknown + if (supp_type != SVType::UNKNOWN) { + // Update all information if the current SV call is not known and + // there is a predicted CNV type + if (sv_candidate.sv_type == SVType::UNKNOWN && (supp_type == SVType::DEL || supp_type == SVType::DUP)) { sv_candidate.sv_type = supp_type; sv_candidate.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format sv_candidate.data_type = SVDataType::HMM; @@ -1122,8 +1129,15 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve sv_candidate.genotype = genotype; sv_candidate.cn_state = cn_state; + // For 
predictions with the same type, or LOH predictions, update the + // prediction information + } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) { + sv_candidate.hmm_likelihood = supp_lh; + sv_candidate.genotype = genotype; + sv_candidate.cn_state = cn_state; + // Add an additional SV call if the type is different - } else if (sv_candidate.sv_type != supp_type) { + } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) { SVCall new_sv_call = sv_candidate; // Copy the original SV call new_sv_call.sv_type = supp_type; new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format @@ -1131,8 +1145,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve new_sv_call.hmm_likelihood = supp_lh; new_sv_call.genotype = genotype; new_sv_call.cn_state = cn_state; - - // Add the new SV call to the list additional_calls.push_back(new_sv_call); } } @@ -1192,6 +1204,7 @@ void SVCaller::saveToVCF(const std::unordered_map", "##INFO=", "##INFO=", + "##INFO=", "##INFO=", "##INFO=", "##INFO=", @@ -1255,6 +1268,9 @@ void SVCaller::saveToVCF(const std::unordered_map(start)-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, end); // Use the preceding base as the alternate allele @@ -1319,7 +1335,8 @@ void SVCaller::saveToVCF(const std::unordered_map samples = {sample_str}; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 7ee8f3bf..fc847f37 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -57,15 +57,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k SVType::INV, SVType::INS, SVType::BND, - SVType::INV_DUP, - SVType::INV_DEL, }) { - // [TEST] Skip if not insertions - // if (sv_type != SVType::INS) { - // continue; - // } - // Create a vector of SV calls for the current SV type and size interval 
std::vector sv_type_calls; std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { @@ -73,13 +66,19 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k }); if (sv_type_calls.size() < 2) { + // Add all unclustered points to the merged list + for (const auto& sv_call : sv_type_calls) { + SVCall noise_sv_call = sv_call; + merged_sv_calls.push_back(noise_sv_call); + } continue; } dbscan.fit(sv_type_calls); + + // Create a map of cluster IDs to SV calls const std::vector& clusters = dbscan.getClusters(); std::map> cluster_map; // Cluster ID to SV calls - // Create a map of cluster IDs to SV calls if (sv_type == SVType::INS) { // Add only non-CIGARCLIP SVs to the cluster map for (size_t i = 0; i < clusters.size(); ++i) { @@ -99,37 +98,12 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; - // [TEST] If insertions, and if any SV has length between 9400 and - // 9500, print all SV coordinates in the cluster - bool print_all = false; - // if (sv_type == SVType::INS) { - // for (const auto& sv_call : cluster_sv_calls) { - // // printMessage("[TEST] SV call " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); - // // if (sv_call.end - sv_call.start >= 9400 && sv_call.end - - // // sv_call.start <= 9500) { - // // if (sv_call.end - sv_call.start >= 15100 && sv_call.end - - // // sv_call.start <= 15200) { - // // if (sv_call.end - sv_call.start >= 11200 && sv_call.end - - // // sv_call.start <= 11300) { - // // if (sv_call.end - sv_call.start >= 16800 && sv_call.end - - // // sv_call.start <= 17000) { - // // if (sv_call.end - sv_call.start >= 11300 && sv_call.end - - // // sv_call.start <= 11400) { - // // if (sv_call.end - sv_call.start >= 13100 && sv_call.end - - // // sv_call.start <= 13200) { - // if 
(sv_call.end - sv_call.start >= 28200 && sv_call.end - sv_call.start <= 28300) { - // print_all = true; - // break; - // } - // } - // } - if (print_all) { - printMessage("[TEST] Cluster " + std::to_string(cluster_id) + " has " + std::to_string(cluster_sv_calls.size()) + " SVs:"); - for (const auto& sv_call : cluster_sv_calls) { - printMessage(" " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); - } + // Continue if fewer than 2 SV calls in the cluster (due to CIGARCLIP filter) + if (cluster_sv_calls.size() < 2) { + continue; } + // Add unmerged SV calls if (cluster_id < 0 && keep_noise) { // Add all unclustered points to the merged list @@ -138,6 +112,7 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k merged_sv_calls.push_back(noise_sv_call); } + // Merge clustered SV calls } else { // ---------------------------- @@ -174,22 +149,13 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k merged_sv_call = *it; merged_sv_calls.push_back(merged_sv_call); - // [TEST] - // print_all = true; - // if (print_all) { - // printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with highest likelihood SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); - // printMessage("SV type: " + getSVTypeString(merged_sv_call.sv_type)); - // printMessage("Cluster members:"); - // for (const auto& sv_call : cluster_sv_calls) { - // printMessage(" " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", length=" + std::to_string((sv_call.end - sv_call.start) + 1)); - // } - // } - // ---------------------------- // CIGAR-BASED MERGING // ---------------------------- - } else if (cluster_sv_calls.size() > 1) { // Could be low if all CIGARCLIP + // } else if (cluster_sv_calls.size() > 1) { // Could be low if + 
// all CIGARCLIP + } else { // Use the median length SV of the top 10% of the cluster // (shorter reads are often noise) std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { @@ -207,11 +173,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add SV call merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); merged_sv_calls.push_back(merged_sv_call); - - // [TEST] - if (print_all) { - printMessage("[TEST] Merging cluster " + std::to_string(cluster_id) + " with median SV " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + ", length=" + std::to_string((merged_sv_call.end - merged_sv_call.start) + 1)); - } } cluster_count++; } From 883bfc7e1ffd66d4f014969e9494bf6777e2ed87 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 19 Apr 2025 21:15:03 -0400 Subject: [PATCH 103/134] improve cnv state predictions --- include/cnv_caller.h | 2 +- src/cnv_caller.cpp | 107 ++++++++++++++++++++++++++++++++++--------- src/main.cpp | 8 ++++ src/sv_caller.cpp | 16 +------ src/sv_object.cpp | 3 ++ 5 files changed, 99 insertions(+), 37 deletions(-) diff --git a/include/cnv_caller.h b/include/cnv_caller.h index deb9187d..afdd78b3 100644 --- a/include/cnv_caller.h +++ b/include/cnv_caller.h @@ -96,7 +96,7 @@ class CNVCaller { // Run copy number prediction for a single SV candidate, returning the // likelihood, predicted CNV type, genotype, and whether SNPs were found - std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; + std::tuple runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; // Run copy number prediction for SVs meeting the minimum length threshold obtained from CIGAR strings void 
runCIGARCopyNumberPrediction(std::string chr, std::vector& sv_candidates, const CHMM& hmm, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const; diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index dd602c1c..6fd4ce64 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -57,7 +57,7 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end std::vector snp_pos; std::unordered_map snp_baf_map; std::unordered_map snp_pfb_map; - printMessage("Reading SNP data for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + // printMessage("Reading SNP data for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); this->readSNPAlleleFrequencies(chr, start_pos, end_pos, snp_pos, snp_baf_map, snp_pfb_map, input_data); // Get the log2 ratio for evenly spaced positions in the @@ -164,13 +164,13 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end snp_data.is_snp = std::move(is_snp_hmm); } -std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const +std::tuple CNVCaller::runCopyNumberPrediction(std::string chr, const CHMM& hmm, uint32_t start_pos, uint32_t end_pos, double mean_chr_cov, const std::vector& pos_depth_map, const InputData& input_data) const { // Check that the start position is less than the end position if (start_pos > end_pos) { printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); - return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0); } // Run the Viterbi algorithm on SNPs in the SV region @@ -205,50 +205,113 @@ 
std::tuple CNVCaller::runCopyNumberPredicti runViterbi(hmm, snp_data, prediction); if (prediction.first.size() == 0) { - return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, false, 0); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0); } std::vector& state_sequence = prediction.first; double likelihood = prediction.second; + // Check whether the start position begins with 225835 + bool debug = false; + // std::string start_pos_str = std::to_string(start_pos); + // if (start_pos_str.find("225835") != std::string::npos) + // { + // printMessage("Found 225835 in the start position: " + start_pos_str); + // debug = true; + // } + + // Print all states if debug is enabled + if (debug) + { + printMessage("State sequence length: " + std::to_string(state_sequence.size())); + printMessage("State sequence: "); + for (size_t i = 0; i < state_sequence.size(); i++) + { + printMessage(std::to_string(state_sequence[i]) + " "); + } + printMessage(""); + } + // Get all the states in the SV region - std::vector sv_states; - for (size_t i = 0; i < state_sequence.size(); i++) + // std::vector sv_states; + // for (size_t i = 0; i < state_sequence.size(); i++) + // { + // if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos) + // { + // sv_states.push_back(state_sequence[i]); + // } + // } + + // Print all states in the SV region if debug is enabled + if (debug) { - if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos) + printMessage("SV state length: " + std::to_string(state_sequence.size())); + printMessage("SV states: "); + for (size_t i = 0; i < state_sequence.size(); i++) { - sv_states.push_back(state_sequence[i]); + printMessage(std::to_string(state_sequence[i]) + " "); } + printMessage(""); } - // Determine if there is a majority state within the SV region and if it - // is greater than 75% - double pct_threshold = 0.75; + // Determine if there is a majority state within the SV region + // double pct_threshold = 0.75; int 
max_state = 0; int max_count = 0; + int non_normal_count = 0; - // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6 - for (int i = 0; i < 6; i += 2) + std::vector state_counts(6, 0); + for (int state : state_sequence) { - // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6 - int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2); - if (state_count > max_count) + // Skip state 3 (normal state) + if (state != 3) { - max_state = i+1; // Set the state to the first state in the pair (sequence remains intact) - max_count = state_count; + state_counts[state - 1]++; + non_normal_count++; } + // state_counts[state - 1]++; + } + + // Determine the maximum state and count + int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end())); + max_state = max_state_index + 1; + max_count = state_counts[max_state_index]; + + // Find the state with the maximum count + // for (int i = 0; i < 6; i += 2) + // { + // // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6 + // int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2); + // if (state_count > max_count) + // { + // max_state = i+1; // Set the state to the first state in the pair (sequence remains intact) + // max_count = state_count; + // } + // } + + int state_count = static_cast(state_sequence.size()); + if (debug) + { + printMessage("Max state: " + std::to_string(max_state)); + printMessage("Max count: " + std::to_string(max_count)); + printMessage("Max count percentage: " + std::to_string((double) max_count / (double) state_count)); + printMessage("Non-normal count: " + std::to_string(non_normal_count)); + printMessage("Non-normal count percentage: " + std::to_string((double) max_count / (double) non_normal_count)); + printMessage("Predicted CNV type: " + 
getSVTypeString(getSVTypeFromCNState(max_state))); } // Update SV type and genotype based on the majority state + // SVType predicted_cnv_type = getSVTypeFromCNState(max_state); + // Genotype genotype = getGenotypeFromCNState(max_state); SVType predicted_cnv_type = SVType::UNKNOWN; Genotype genotype = Genotype::UNKNOWN; - int state_count = (int) sv_states.size(); - if ((double) max_count / (double) state_count > pct_threshold) + // int state_count = (int) sv_states.size(); + if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5) { predicted_cnv_type = getSVTypeFromCNState(max_state); genotype = getGenotypeFromCNState(max_state); + snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data } - snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data // Save the SV calls if enabled bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); @@ -288,7 +351,7 @@ std::tuple CNVCaller::runCopyNumberPredicti this->saveSVCopyNumberToJSON(before_sv, after_sv, snp_data, chr, start_pos, end_pos, cnv_type_str, likelihood, json_filepath); } - return std::make_tuple(likelihood, predicted_cnv_type, genotype, true, max_state); + return std::make_tuple(likelihood, predicted_cnv_type, genotype, max_state); } diff --git a/src/main.cpp b/src/main.cpp index 64fc4e5f..89a835c9 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -43,6 +43,14 @@ void runContextSV(const std::unordered_map& args) signal(SIGFPE, printStackTrace); signal(SIGBUS, printStackTrace); + std::cout << R"(` + ___ _ _ _____ __ + / __|___ _ _| |_ _____ _| |_/ __\ \ / / + | (__/ _ \ ' \ _/ -_) \ / _\__ \\ V / + \___\___/_||_\__\___/_\_\\__|___/ \_/ + + )" << std::endl; + // Placeholder for setting up input data and running ContextSV std::cout << "ContextSV version " << VERSION << std::endl; std::cout << "Input parameters:" << std::endl; diff --git a/src/sv_caller.cpp 
b/src/sv_caller.cpp index e10b36e7..10f2ca92 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -893,9 +893,6 @@ void SVCaller::run(const InputData& input_data) // Get the chromosomes from the input BAM file chromosomes = this->getChromosomes(input_data.getLongReadBam()); } - - // [TEST] Use only the last 6 chromosomes - //chromosomes = {"chr1", "chrX"}; // Read the HMM from the file std::string hmm_filepath = input_data.getHMMFilepath(); @@ -1100,22 +1097,13 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve { std::vector additional_calls; for (auto& sv_candidate : split_sv_calls) { - std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); + std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); Genotype genotype = std::get<2>(result); int cn_state = std::get<3>(result); - // // For inversions with copy-neutral support, update the HMM likelihood - // if (supp_type == SVType::NEUTRAL && sv_candidate.sv_type == SVType::INV) { - // sv_candidate.hmm_likelihood = supp_lh; - // sv_candidate.genotype = genotype; - // sv_candidate.cn_state = cn_state; - // } - - // // Update the SV type if the state is not neutral or unknown - // else if (supp_type != SVType::UNKNOWN && supp_type != - // SVType::NEUTRAL) { + // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type)); // Update the SV type if the predicted type is not unknown if (supp_type != SVType::UNKNOWN) { diff --git a/src/sv_object.cpp b/src/sv_object.cpp index fc847f37..6c7890a7 
100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -18,6 +18,9 @@ bool SVCall::operator<(const SVCall & other) const void addSVCall(std::vector& sv_calls, SVCall& sv_call) { + // Print the SV call + // printMessage("Adding SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with length " + std::to_string(sv_call.end - sv_call.start + 1) + " and cluster size " + std::to_string(sv_call.cluster_size) + " from data type " + getSVDataTypeString(sv_call.data_type) + " and type " + getSVTypeString(sv_call.sv_type)); + // Check if the SV call is valid if (sv_call.start > sv_call.end) { printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type)); From 98d41d49fb9212478d65c3441dcc975f6e5215e0 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 1 May 2025 16:43:04 -0400 Subject: [PATCH 104/134] remove test code --- src/cnv_caller.cpp | 71 +-------------- src/main.cpp | 32 ++++--- src/sv_caller.cpp | 216 ++++----------------------------------------- src/sv_object.cpp | 5 -- 4 files changed, 36 insertions(+), 288 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 6fd4ce64..f04093af 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -211,51 +211,7 @@ std::tuple CNVCaller::runCopyNumberPrediction(std std::vector& state_sequence = prediction.first; double likelihood = prediction.second; - // Check whether the start position begins with 225835 - bool debug = false; - // std::string start_pos_str = std::to_string(start_pos); - // if (start_pos_str.find("225835") != std::string::npos) - // { - // printMessage("Found 225835 in the start position: " + start_pos_str); - // debug = true; - // } - - // Print all states if debug is enabled - if (debug) - { - printMessage("State sequence length: " + std::to_string(state_sequence.size())); - printMessage("State sequence: "); - for (size_t i = 0; i < 
state_sequence.size(); i++) - { - printMessage(std::to_string(state_sequence[i]) + " "); - } - printMessage(""); - } - - // Get all the states in the SV region - // std::vector sv_states; - // for (size_t i = 0; i < state_sequence.size(); i++) - // { - // if (snp_data.pos[i] >= start_pos && snp_data.pos[i] <= end_pos) - // { - // sv_states.push_back(state_sequence[i]); - // } - // } - - // Print all states in the SV region if debug is enabled - if (debug) - { - printMessage("SV state length: " + std::to_string(state_sequence.size())); - printMessage("SV states: "); - for (size_t i = 0; i < state_sequence.size(); i++) - { - printMessage(std::to_string(state_sequence[i]) + " "); - } - printMessage(""); - } - // Determine if there is a majority state within the SV region - // double pct_threshold = 0.75; int max_state = 0; int max_count = 0; int non_normal_count = 0; @@ -269,43 +225,18 @@ std::tuple CNVCaller::runCopyNumberPrediction(std state_counts[state - 1]++; non_normal_count++; } - // state_counts[state - 1]++; } // Determine the maximum state and count int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end())); max_state = max_state_index + 1; max_count = state_counts[max_state_index]; - - // Find the state with the maximum count - // for (int i = 0; i < 6; i += 2) - // { - // // Combine counts for states 1 and 2, states 3 and 4, and states 5 and 6 - // int state_count = std::count(sv_states.begin(), sv_states.end(), i+1) + std::count(sv_states.begin(), sv_states.end(), i+2); - // if (state_count > max_count) - // { - // max_state = i+1; // Set the state to the first state in the pair (sequence remains intact) - // max_count = state_count; - // } - // } - - int state_count = static_cast(state_sequence.size()); - if (debug) - { - printMessage("Max state: " + std::to_string(max_state)); - printMessage("Max count: " + std::to_string(max_count)); - printMessage("Max count percentage: " + 
std::to_string((double) max_count / (double) state_count)); - printMessage("Non-normal count: " + std::to_string(non_normal_count)); - printMessage("Non-normal count percentage: " + std::to_string((double) max_count / (double) non_normal_count)); - printMessage("Predicted CNV type: " + getSVTypeString(getSVTypeFromCNState(max_state))); - } - + // Update SV type and genotype based on the majority state // SVType predicted_cnv_type = getSVTypeFromCNState(max_state); // Genotype genotype = getGenotypeFromCNState(max_state); SVType predicted_cnv_type = SVType::UNKNOWN; Genotype genotype = Genotype::UNKNOWN; - // int state_count = (int) sv_states.size(); if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5) { predicted_cnv_type = getSVTypeFromCNState(max_state); diff --git a/src/main.cpp b/src/main.cpp index 89a835c9..6f2c1b69 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -4,6 +4,7 @@ /// @cond DOXYGEN_IGNORE #include #include +#include // For signal handling #include @@ -32,6 +33,18 @@ void printStackTrace(int sig) } +void printBanner() +{ + std::time_t now = std::time(nullptr); + char date_str[100]; + std::strftime(date_str, sizeof(date_str), "%Y-%m-%d", std::localtime(&now)); + std::cout << "═══════════════════════════════════════════════════════════════" << std::endl; + std::cout << " ContextSV - Long-read Structural Variant Caller" << std::endl; + std::cout << " Version: " << VERSION << std::endl; + std::cout << " Date: " << date_str << std::endl; + std::cout << "═══════════════════════════════════════════════════════════════" << std::endl; +} + void runContextSV(const std::unordered_map& args) { // Set up signal handling @@ -43,20 +56,13 @@ void runContextSV(const std::unordered_map& args) signal(SIGFPE, printStackTrace); signal(SIGBUS, printStackTrace); - std::cout << R"(` - ___ _ _ _____ __ - / __|___ _ _| |_ _____ _| |_/ __\ \ / / - | (__/ _ \ ' \ _/ -_) \ / _\__ \\ V / - \___\___/_||_\__\___/_\_\\__|___/ \_/ - - )" << std::endl; - // 
Placeholder for setting up input data and running ContextSV - std::cout << "ContextSV version " << VERSION << std::endl; - std::cout << "Input parameters:" << std::endl; - for (const auto& arg : args) { - std::cout << arg.first << ": " << arg.second << std::endl; - } + // std::cout << "ContextSV version " << VERSION << std::endl; + // std::cout << "Input parameters:" << std::endl; + // for (const auto& arg : args) { + // std::cout << arg.first << ": " << arg.second << std::endl; + // } + printBanner(); // Set up input data InputData input_data; diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 10f2ca92..06504a77 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -350,8 +350,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map primary_positions; int primary_cluster_size = 0; if (!primary_start_cluster.empty()) { @@ -365,28 +363,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map primary_end_cluster.size()) { - // std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); - // // primary_pos = - // // primary_start_cluster[primary_start_cluster.size() / 2]; - // primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); - // primary_cluster_size = primary_start_cluster.size(); - // } else if (primary_end_cluster.size() > primary_start_cluster.size()) { - // std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); - // // primary_pos = primary_end_cluster[primary_end_cluster.size() - // // / 2]; - // primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]); - // primary_cluster_size = primary_end_cluster.size(); - // } else { - // // Use both positions - // std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); - // std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); - // // primary_pos = primary_start_cluster[primary_start_cluster.size() / 2]; - // // primary_pos2 = primary_end_cluster[primary_end_cluster.size() - // // / 2]; - // 
primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); - // primary_cluster_size = primary_start_cluster.size(); - // } // ------------------------------- // SPLIT INSERTION CALLS @@ -402,96 +378,29 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 2000) { - // if (primary_pos != -1) { - // if (primary_pos2 != -1) { - // // If two positions were found, use the 5'most position - // primary_pos = std::min(primary_pos, primary_pos2); - // } if (!primary_positions.empty()) { int aln_offset = static_cast(ref_distance - read_distance); if (read_distance > ref_distance && read_distance >= min_length && read_distance <= max_length) { // Add an insertion SV call at the primary positions SVType sv_type = SVType::INS; - // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); for (int primary_pos : primary_positions) { SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) { + // Add a deletion SV call at the primary positions for (int primary_pos : primary_positions) { SVType sv_type = SVType::DEL; SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } - // Add a deletion SV call at the primary position - // SVType sv_type = SVType::DEL; - // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); 
} } - - // if (ref_distance >= 50 && ref_distance < read_distance) { - // // Add an insertion SV call at the primary position - // SVType sv_type = SVType::INS; - // int aln_offset = static_cast(ref_distance - read_distance); - // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); - // } - // SVType sv_type = SVType::INS; - // int aln_offset = static_cast(read_distance - ref_distance); - // SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); } - // if (!ref_distance_cluster.empty()) { - // // Use the median of the largest cluster of split distances as the - // // insertion size - // std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end()); - // size_t median_index = ref_distance_cluster.size() / 2; - // ref_distance = ref_distance_cluster[median_index]; - // read_distance = read_distance_cluster[median_index]; - - // // Add a deletion SV call at the primary position - // if (primary_pos != -1 && ref_distance >= 50 && ref_distance > read_distance) { - // if (primary_pos2 != -1) { - // // If two positions were found, use the 5'most position - // primary_pos = std::min(primary_pos, primary_pos2); - // } - // SVType sv_type = SVType::DEL; - // int aln_offset = static_cast(ref_distance - read_distance); - // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); - - // // Add an inversion if necessary (inverted deletion) - // if (inversion) { - // SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), SVType::INV, 
getSVTypeSymbol(SVType::INV), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); - // } - // } - // } - - // -------------------------------- - // Get the supplementary alignment positions - // int supp_pos = -1; - // int supp_pos2 = -1; std::vector supp_positions; int supp_cluster_size = 0; - // int supp_best_start = -1; - // int supp_best_end = -1; - // if (!supp_start_cluster.empty()) { - // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - // supp_best_start = supp_start_cluster[supp_start_cluster.size() / 2]; - // } - // if (!supp_end_cluster.empty()) { - // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - // supp_best_end = supp_end_cluster[supp_end_cluster.size() / 2]; - // } - if (!supp_start_cluster.empty()) { std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); @@ -503,32 +412,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map supp_end_cluster.size()) { - // // supp_pos = supp_best_start; - // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - // supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); - // // supp_positions.push_back(supp_best_start); - // supp_cluster_size = supp_start_cluster.size(); - // } else if (supp_end_cluster.size() > supp_start_cluster.size()) { - // // supp_pos = supp_best_end; - // // supp_positions.push_back(supp_best_end); - // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - // supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); - // supp_cluster_size = supp_end_cluster.size(); - // } else if (supp_start_cluster.size() == supp_end_cluster.size() && !supp_start_cluster.empty() && !supp_end_cluster.empty()) { - // // Use both positions. 
This has been shown to occur in some nested SVs - // // supp_pos = supp_best_start; - // // supp_pos2 = supp_best_end; - // std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - // std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - // supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); - // supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); - // supp_cluster_size = supp_start_cluster.size(); - // // supp_positions.push_back(supp_best_start); - // // supp_positions.push_back(supp_best_end); - // supp_cluster_size = supp_start_cluster.size(); - // } - // Store the inversion as the supplementary start and end positions if (inversion && supp_positions.size() > 1) { std::sort(supp_positions.begin(), supp_positions.end()); @@ -539,70 +422,9 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { - // // SVCall sv_candidate(std::min(supp_best_start, supp_best_end), std::max(supp_best_start, supp_best_end), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); - // // addSVCall(chr_sv_calls, sv_candidate); - // } } - // If two of either were found, use the larger SV candidate - // if (primary_pos2 != -1) { - // int sv_length1 = std::abs(primary_pos - supp_pos); - // int sv_length2 = std::abs(primary_pos2 - supp_pos); - // if (sv_length2 > sv_length1) { - // primary_pos = primary_pos2; - // } - // } - // if (supp_pos2 != -1) { - // int sv_length1 = std::abs(primary_pos - supp_pos); - // int sv_length2 = std::abs(primary_pos - supp_pos2); - // if (sv_length2 > sv_length1) { - // supp_pos = supp_pos2; - // } - // } - - // if (primary_pos == -1 || supp_pos == -1) { - // continue; - // } - - // Store the SV candidate if the length is within the specified range - // int sv_start = std::min(primary_pos, supp_pos); - // int sv_end = std::max(primary_pos, supp_pos); - // int sv_length = sv_end - sv_start + 
1; - // int cluster_size = std::max(primary_cluster_size, supp_cluster_size); - - // If the read distance is < 30bp while the SV is > 2kb, then this is a - // potential deletion - // if (std::abs(read_distance) < 30 && sv_length > 2000 && sv_length <= 1000000) { - - // // Add an inversion call if necessary - // if (inversion) { - // for (int primary_pos : primary_positions) { - // for (int supp_pos : supp_positions) { - // SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); - // } - // } - // // SVCall sv_candidate(sv_start, sv_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); - // // addSVCall(chr_sv_calls, sv_candidate); - // } else { - // for (int primary_pos : primary_positions) { - // for (int supp_pos : supp_positions) { - // uint32_t sv_start = std::min(primary_pos, supp_pos); - // uint32_t sv_end = std::max(primary_pos, supp_pos); - // if (sv_end - sv_start + 1 >= 50) { - // SVCall sv_candidate(std::min(primary_pos, supp_pos), std::max(primary_pos, supp_pos), SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITINV, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); - // addSVCall(chr_sv_calls, sv_candidate); - // } - // } - // // SVCall sv_candidate(sv_start, sv_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); - // // addSVCall(chr_sv_calls, sv_candidate); - // } - // } - // Add a dummy SV call for CNV detection - // if (sv_length >= min_length && sv_length <= max_length) { int cluster_size = std::max(primary_cluster_size, supp_cluster_size); SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; std::string alt = (sv_type == SVType::INV) ? 
"" : "."; @@ -618,9 +440,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map& ch void SVCaller::run(const InputData& input_data) { bool cigar_svs = true; + bool cigar_cn = true; bool split_svs = true; // Set up the reference genome @@ -987,18 +801,20 @@ void SVCaller::run(const InputData& input_data) } printMessage("All tasks have finished."); - // ------------------------------------------------------- - // Run copy number variant predictions on the SVs detected from the - // CIGAR string, using a minimum CNV length threshold - current_chr = 0; - printMessage("Running copy number predictions on CIGAR SVs..."); - for (auto& entry : whole_genome_sv_calls) { - current_chr++; - const std::string& chr = entry.first; - std::vector& sv_calls = entry.second; - if (sv_calls.size() > 0) { - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); - cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + if (cigar_cn) { + // ------------------------------------------------------- + // Run copy number variant predictions on the SVs detected from the + // CIGAR string, using a minimum CNV length threshold + current_chr = 0; + printMessage("Running copy number predictions on CIGAR SVs..."); + for (auto& entry : whole_genome_sv_calls) { + current_chr++; + const std::string& chr = entry.first; + std::vector& sv_calls = entry.second; + if (sv_calls.size() > 0) { + printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + "..."); + cnv_caller.runCIGARCopyNumberPrediction(chr, sv_calls, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); + } } } // ------------------------------------------------------- diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 6c7890a7..ddc0cf4b 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp 
@@ -18,9 +18,6 @@ bool SVCall::operator<(const SVCall & other) const void addSVCall(std::vector& sv_calls, SVCall& sv_call) { - // Print the SV call - // printMessage("Adding SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " with length " + std::to_string(sv_call.end - sv_call.start + 1) + " and cluster size " + std::to_string(sv_call.cluster_size) + " from data type " + getSVDataTypeString(sv_call.data_type) + " and type " + getSVTypeString(sv_call.sv_type)); - // Check if the SV call is valid if (sv_call.start > sv_call.end) { printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type)); @@ -156,8 +153,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // CIGAR-BASED MERGING // ---------------------------- - // } else if (cluster_sv_calls.size() > 1) { // Could be low if - // all CIGARCLIP } else { // Use the median length SV of the top 10% of the cluster // (shorter reads are often noise) From fe77d3d0d72b9e320aaf71c3b283a2bb03db8fcc Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 5 May 2025 16:45:33 -0400 Subject: [PATCH 105/134] fix false positive in split reads --- src/sv_caller.cpp | 74 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 57 insertions(+), 17 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 06504a77..684dcdf8 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -379,21 +379,23 @@ void SVCaller::findSplitSVSignatures(std::unordered_map(ref_distance - read_distance); if (read_distance > ref_distance && read_distance >= min_length && read_distance <= max_length) { - // Add an insertion SV call at the primary positions + // Add an insertion SV call at the 5'-most primary position SVType sv_type = SVType::INS; - for (int primary_pos : primary_positions) { - SVCall sv_candidate(primary_pos, primary_pos + (read_distance-1), 
sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - addSVCall(chr_sv_calls, sv_candidate); - } + // for (int primary_pos : primary_positions) { + SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + // } } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) { // Add a deletion SV call at the primary positions - for (int primary_pos : primary_positions) { - SVType sv_type = SVType::DEL; - SVCall sv_candidate(primary_pos, primary_pos + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); - addSVCall(chr_sv_calls, sv_candidate); - } + // for (int primary_pos : primary_positions) { + SVType sv_type = SVType::DEL; + SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + // } } } } @@ -1091,8 +1093,12 @@ void SVCaller::saveToVCF(const std::unordered_map(start) > 1) { + uint32_t preceding_pos = start - 1; + ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); + start = preceding_pos; + if (ref_allele != "") { + if (alt_allele != "") { + // Insert the reference allele before the insertion + alt_allele.insert(0, ref_allele); + } + } else { + // If the reference allele is empty, use a symbolic allele + ref_allele = "N"; // Convention for INS + alt_allele = ""; // Symbolic allele + std::cerr << "Warning: Reference allele is empty for insertion at " << chr << ":" << start << "-" << end << std::endl; + } + } else { + // ref_allele = "N"; // No preceding base for the first + // position + // Throw an error if the insertion is at the 
first position + std::cerr << "Error: Insertion at the first position " << chr << ":" << start << "-" << end << std::endl; + continue; } + // int64_t preceding_pos = (int64_t) std::max(1, (int) start-1); // Make sure the position is not negative + // ref_allele = ref_genome.query(chr, preceding_pos, preceding_pos); + // start = preceding_pos; + // if (ref_allele != "") { + // if (alt_allele != "") { + // // Insert the reference allele before the insertion + // alt_allele.insert(0, ref_allele); + // } + // } else { + // // If the reference allele is empty, use a symbolic allele + // ref_allele = "N"; // Convention for INS + // alt_allele = ""; // Symbolic allele + // std::cerr << "Warning: Reference allele is empty for insertion at " << chr << ":" << start << "-" << end << std::endl; + // } end = start; // Update the end position to the same base } else { @@ -1136,6 +1170,12 @@ void SVCaller::saveToVCF(const std::unordered_mapgetReadDepth(chr_pos_depth_map.at(chr), start); + // If read depth equals zero, then set the filter to LowQual + if (read_depth == 0) { + filter = "LowQual"; + filtered_svs += 1; + } + // Create the VCF parameter strings std::string sv_type_str = getSVTypeString(sv_type); // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate); From fd75f7fed6ba8b8f72d74f71aac11d60d1b47b2e Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 12 May 2025 16:51:20 -0400 Subject: [PATCH 106/134] fix split sv detection errors --- __main__.py | 1 - include/input_data.h | 10 +- include/utils.h | 52 +++++----- src/cnv_caller.cpp | 88 ++++++++++++---- src/input_data.cpp | 49 +++------ src/main.cpp | 14 --- src/sv_caller.cpp | 230 +++++++++++++++++++++++++++++++----------- 
tests/test_general.py | 1 - 8 files changed, 279 insertions(+), 166 deletions(-) diff --git a/__main__.py b/__main__.py index a888cdbf..3821b8d1 100644 --- a/__main__.py +++ b/__main__.py @@ -214,7 +214,6 @@ def main(): # Set input parameters input_data = contextsv.InputData() input_data.setVerbose(args.debug) - input_data.setShortReadBam(args.short_read) input_data.setLongReadBam(args.long_read) input_data.setRefGenome(args.reference) input_data.setSNPFilepath(args.snps) diff --git a/include/input_data.h b/include/input_data.h index 0687af76..452b5e6c 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -23,9 +23,7 @@ class InputData { public: InputData(); - std::string getShortReadBam() const; - - void setShortReadBam(std::string filepath); + void printParameters() const; std::string getLongReadBam() const; @@ -65,10 +63,6 @@ class InputData { void setDBSCAN_Epsilon(double epsilon); double getDBSCAN_Epsilon() const; - // Set the minimum number of points in a cluster for DBSCAN. - void setDBSCAN_MinPts(int min_pts); - int getDBSCAN_MinPts() const; - // Set the percentage of mean chromosome coverage to use for DBSCAN // minimum points. 
void setDBSCAN_MinPtsPct(double min_pts_pct); @@ -105,7 +99,6 @@ class InputData { std::string getCNVOutputFile() const; private: - std::string short_read_bam; std::string long_read_bam; std::string ref_filepath; std::string snp_vcf_filepath; @@ -116,7 +109,6 @@ class InputData { uint32_t min_cnv_length; int min_reads; double dbscan_epsilon; - int dbscan_min_pts; double dbscan_min_pts_pct; std::string chr; // Chromosome to analyze std::pair start_end; // Region to analyze diff --git a/include/utils.h b/include/utils.h index 6eb1237d..d95f0a8a 100644 --- a/include/utils.h +++ b/include/utils.h @@ -14,32 +14,32 @@ // Guard to close the BAM file -struct BamFileGuard { - samFile* fp_in; - hts_idx_t* idx; - bam_hdr_t* bamHdr; - - BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr) - : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {} - - ~BamFileGuard() { - if (idx) { - hts_idx_destroy(idx); - idx = nullptr; - } - if (bamHdr) { - bam_hdr_destroy(bamHdr); - bamHdr = nullptr; - } - if (fp_in) { - sam_close(fp_in); - fp_in = nullptr; - } - } - - BamFileGuard(const BamFileGuard&) = delete; // Non-copyable - BamFileGuard& operator=(const BamFileGuard&) = delete; // Non-assignable -}; +// struct BamFileGuard { +// samFile* fp_in; +// hts_idx_t* idx; +// bam_hdr_t* bamHdr; + +// BamFileGuard(samFile* fp_in, hts_idx_t* idx, bam_hdr_t* bamHdr) +// : fp_in(fp_in), idx(idx), bamHdr(bamHdr) {} + +// ~BamFileGuard() { +// if (idx) { +// hts_idx_destroy(idx); +// idx = nullptr; +// } +// if (bamHdr) { +// bam_hdr_destroy(bamHdr); +// bamHdr = nullptr; +// } +// if (fp_in) { +// sam_close(fp_in); +// fp_in = nullptr; +// } +// } + +// BamFileGuard(const BamFileGuard&) = delete; // Non-copyable +// BamFileGuard& operator=(const BamFileGuard&) = delete; // Non-assignable +// }; // Print the progress of a task void printProgress(int progress, int total); diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index f04093af..0222fb7c 100644 --- a/src/cnv_caller.cpp +++ 
b/src/cnv_caller.cpp @@ -318,7 +318,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorquerySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm @@ -327,7 +327,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector, double> prediction; runViterbi(hmm, snp_data, prediction); std::vector& state_sequence = prediction.first; @@ -371,7 +371,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector& printError("ERROR: Could not load index for BAM file: " + bam_filepath); return; } - BamFileGuard bam_guard(bam_file, bam_index, bam_header); // Guard to close the BAM file + // BamFileGuard bam_guard(bam_file, bam_index, bam_header); // Guard to close the BAM file // Initialize the record bam1_t *bam_record = bam_init1(); @@ -472,6 +472,20 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& printMessage("(" + std::to_string(++current_chr) + "/" + std::to_string(total_chr_count) + ") Reading BAM file for chromosome: " + chr); std::vector& pos_depth_map = chr_pos_depth_map[chr]; + int tid = bam_name2id(bam_header, chr.c_str()); + if (tid < 0) + { + printError("ERROR: Could not find chromosome " + chr + " in BAM file."); + continue; + } + // Resize the depth map to the length of the chromosome + uint32_t chr_length = bam_header->target_len[tid] + 1; + if (pos_depth_map.size() != static_cast(chr_length)) + { + printError("ERROR: Chromosome length mismatch for " + chr + ": expected " + std::to_string(chr_length) + ", found " + std::to_string(pos_depth_map.size()) + ", resizing to " + std::to_string(chr_length)); + // Resize the depth map to the length of the chromosome + pos_depth_map.resize(chr_length, 0); + } while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) { // Ignore UNMAP, SECONDARY, QCFAIL, and DUP reads @@ -517,26 +531,58 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& } } 
hts_itr_destroy(bam_iter); - - // Parallel sum of the depth map - uint64_t cum_depth = std::reduce( - std::execution::par, - pos_depth_map.begin(), - pos_depth_map.end(), - 0ULL - ); - - // Parallel count of the non-zero depth positions - uint32_t pos_count = std::count_if( - std::execution::par, - pos_depth_map.begin(), - pos_depth_map.end(), - [](uint32_t depth) { return depth > 0; } - ); + // You can parallelize the depth map calculation here but first close the + // BAM file and index + // Bam cleanup (delete guard if using this) + // bam_destroy1(bam_record); + // bam_hdr_destroy(bam_header); + // sam_close(bam_file); + // bam_index_destroy(bam_index); + // bam_record = nullptr; + // bam_header = nullptr; + // bam_file = nullptr; + // bam_index = nullptr; + + // // Parallel sum of the depth map + // uint64_t cum_depth = std::reduce( + // std::execution::par, + // pos_depth_map.begin(), + // pos_depth_map.end(), + // 0ULL + // ); + + // // Parallel count of the non-zero depth positions + // uint32_t pos_count = std::count_if( + // std::execution::par, + // pos_depth_map.begin(), + // pos_depth_map.end(), + // [](uint32_t depth) { return depth > 0; } + // ); + + // Sum without parallelization + uint64_t cum_depth = std::accumulate(pos_depth_map.begin(), pos_depth_map.end(), 0ULL); + uint32_t pos_count = std::count_if(pos_depth_map.begin(), pos_depth_map.end(), [](uint32_t depth) { return depth > 0; }); + + // Calculate the mean coverage for the chromosome double mean_chr_cov = (pos_count > 0) ? 
static_cast(cum_depth) / static_cast(pos_count) : 0.0; - chr_mean_cov_map[chr] = mean_chr_cov; + printMessage("Mean coverage for chromosome " + chr + ": " + std::to_string(mean_chr_cov)); + if (mean_chr_cov != 0.0) { + chr_mean_cov_map[chr] = mean_chr_cov; + } } + + // Clean up the BAM file and index + printMessage("Closing BAM file " + bam_filepath); + bam_destroy1(bam_record); + hts_idx_destroy(bam_index); + bam_hdr_destroy(bam_header); + sam_close(bam_file); + bam_record = nullptr; + bam_index = nullptr; + bam_header = nullptr; + bam_file = nullptr; + printMessage("BAM file closed."); } void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, uint32_t end_pos, std::vector& snp_pos, std::unordered_map& snp_baf, std::unordered_map& snp_pfb, const InputData& input_data) const diff --git a/src/input_data.cpp b/src/input_data.cpp index 5661eb3b..cd55f67e 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -16,7 +16,6 @@ // Constructor InputData::InputData() { - this->short_read_bam = ""; this->long_read_bam = ""; this->ref_filepath = ""; this->snp_vcf_filepath = ""; @@ -28,7 +27,6 @@ InputData::InputData() this->min_cnv_length = 1000; this->min_reads = 5; this->dbscan_epsilon = 0.99; - this->dbscan_min_pts = 15; this->dbscan_min_pts_pct = 0.0; this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; @@ -38,29 +36,24 @@ InputData::InputData() this->cnv_output_file = ""; } -std::string InputData::getShortReadBam() const +void InputData::printParameters() const { - return this->short_read_bam; -} - -void InputData::setShortReadBam(std::string filepath) -{ - this->short_read_bam = filepath; - - // Check if empty string - if (filepath.empty()) + std::cout << "Input parameters:" << std::endl; + std::cout << "Long read BAM: " << this->long_read_bam << std::endl; + std::cout << "Reference genome: " << this->ref_filepath << std::endl; + std::cout << "SNP VCF: " << this->snp_vcf_filepath << std::endl; + std::cout << "Output directory: " << 
this->output_dir << std::endl; + std::cout << "Sample size: " << this->sample_size << std::endl; + std::cout << "Minimum CNV length: " << this->min_cnv_length << std::endl; + std::cout << "DBSCAN epsilon: " << this->dbscan_epsilon << std::endl; + std::cout << "DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f << "%" << std::endl; + if (this->region_set) { - return; - - } else { - // Check if the file exists - FILE *fp = fopen(filepath.c_str(), "r"); - if (fp == NULL) - { - throw std::runtime_error("Short read BAM file does not exist: " + filepath); - } else { - fclose(fp); - } + std::cout << "Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second) + "\n"; + } + else + { + std::cout << "Running on whole genome" << std::endl; } } @@ -169,16 +162,6 @@ double InputData::getDBSCAN_Epsilon() const return this->dbscan_epsilon; } -void InputData::setDBSCAN_MinPts(int min_pts) -{ - this->dbscan_min_pts = min_pts; -} - -int InputData::getDBSCAN_MinPts() const -{ - return this->dbscan_min_pts; -} - void InputData::setDBSCAN_MinPtsPct(double min_pts_pct) { this->dbscan_min_pts_pct = min_pts_pct; diff --git a/src/main.cpp b/src/main.cpp index 6f2c1b69..5ba0fb1b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -56,18 +56,11 @@ void runContextSV(const std::unordered_map& args) signal(SIGFPE, printStackTrace); signal(SIGBUS, printStackTrace); - // Placeholder for setting up input data and running ContextSV - // std::cout << "ContextSV version " << VERSION << std::endl; - // std::cout << "Input parameters:" << std::endl; - // for (const auto& arg : args) { - // std::cout << arg.first << ": " << arg.second << std::endl; - // } printBanner(); // Set up input data InputData input_data; input_data.setLongReadBam(args.at("bam-file")); - input_data.setShortReadBam(args.at("bam-file")); input_data.setRefGenome(args.at("ref-file")); input_data.setSNPFilepath(args.at("snps-file")); 
input_data.setOutputDir(args.at("output-dir")); @@ -107,10 +100,6 @@ void runContextSV(const std::unordered_map& args) input_data.setDBSCAN_Epsilon(std::stod(args.at("epsilon"))); } - if (args.find("min-pts") != args.end()) { - input_data.setDBSCAN_MinPts(std::stoi(args.at("min-pts"))); - } - if (args.find("min-pts-pct") != args.end()) { input_data.setDBSCAN_MinPtsPct(std::stod(args.at("min-pts-pct"))); } @@ -146,7 +135,6 @@ void printUsage(const std::string& programName) { << " -n, --sample-size Sample size for HMM predictions\n" << " --min-cnv Minimum CNV length\n" << " --eps DBSCAN epsilon\n" - << " --min-pts DBSCAN minimum points\n" << " --min-pts-pct Percentage of mean chr. coverage to use for DBSCAN minimum points\n" << " -e, --eth ETH file\n" << " -p, --pfb PFB file\n" @@ -186,8 +174,6 @@ std::unordered_map parseArguments(int argc, char* argv args["min-reads"] = argv[++i]; } else if (arg == "--eps" && i + 1 < argc) { args["epsilon"] = argv[++i]; - } else if (arg == "--min-pts" && i + 1 < argc) { - args["min-pts"] = argv[++i]; } else if (arg == "--min-pts-pct" && i + 1 < argc) { args["min-pts-pct"] = argv[++i]; } else if ((arg == "-e" || arg == "--eth") && i + 1 < argc) { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 684dcdf8..c08c3c32 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -94,7 +94,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map> primary_map; // TID-> qname -> primary alignment @@ -173,6 +172,11 @@ void SVCaller::findSplitSVSignatures(std::unordered_map> to_remove; @@ -310,12 +314,35 @@ void SVCaller::findSplitSVSignatures(std::unordered_map(supp_aln.query_start), static_cast(primary_aln.query_start)) - std::min(static_cast(supp_aln.query_end), static_cast(primary_aln.query_end))); // Calculate distance between alignments on the // reference ref_distance = std::max(0, std::max(static_cast(supp_aln.start), static_cast(primary_aln.start)) - std::min(static_cast(supp_aln.end), static_cast(primary_aln.end))); + + // 
Throw an error if the read distance is negative + if (read_distance < 0) { + printError("ERROR: negative read distance between primary and supplementary alignments for " + qname); + } + // Throw an error if the reference distance is + // negative + if (ref_distance < 0) { + printError("ERROR: negative reference distance between primary and supplementary alignments for " + qname); + } + + // Use a negative read distance to indicate that the + // primary alignment is not 5'-most + if (!primary_5p) { + read_distance = -read_distance; + } read_distances.push_back(read_distance); ref_distances.push_back(ref_distance); } @@ -352,16 +379,52 @@ void SVCaller::findSplitSVSignatures(std::unordered_map primary_positions; int primary_cluster_size = 0; + bool primary_start = false; + bool primary_end = false; if (!primary_start_cluster.empty()) { std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); primary_cluster_size = primary_start_cluster.size(); + primary_start = true; } if (!primary_end_cluster.empty()) { std::sort(primary_end_cluster.begin(), primary_end_cluster.end()); primary_positions.push_back(primary_end_cluster[primary_end_cluster.size() / 2]); primary_cluster_size = std::max(primary_cluster_size, (int) primary_end_cluster.size()); + primary_end = true; + } + + // Get the supplementary alignment positions + std::vector supp_positions; + bool supp_start = false; + bool supp_end = false; + int supp_cluster_size = 0; + if (!supp_start_cluster.empty()) { + std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); + supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); + supp_cluster_size = supp_start_cluster.size(); + supp_start = true; + } + if (!supp_end_cluster.empty()) { + std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); + supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); + supp_cluster_size = 
std::max(supp_cluster_size, (int) supp_end_cluster.size()); + supp_end = true; + } + + // Store the inversion as the supplementary start and end positions + if (inversion && supp_positions.size() > 1) { + std::sort(supp_positions.begin(), supp_positions.end()); + int supp_start = supp_positions.front(); + int supp_end = supp_positions.back(); + int sv_length = std::abs(supp_start - supp_end); + + // Use 50bp as the minimum length for an inversion + if (sv_length >= 50 && sv_length <= max_length) { + SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); + addSVCall(chr_sv_calls, sv_candidate); + } } // ------------------------------- @@ -373,59 +436,76 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 0; + read_distance = std::abs(read_distance); std::sort(ref_distance_cluster.begin(), ref_distance_cluster.end()); ref_distance = ref_distance_cluster[ref_distance_cluster.size() / 2]; // Add an insertion SV call at the primary position - if (!primary_positions.empty()) { + + // bool print_debug = false; + bool print_debug = true; + + // int sv_start = primary_positions[0]; + // Use the 3'-most primary position as the start position + int sv_start; + bool split_candidate_sv = false; + if (primary_5p_most && primary_end) { std::sort(primary_positions.begin(), primary_positions.end()); - int sv_start = primary_positions[0]; + // Supplementary alignment is downstream with the + // insertion sequence, starting at the 3'-most + // primary position + sv_start = primary_positions.back(); + + // Print debug if SV start equals 223608935 + // if (sv_start == 223608936) { + // print_debug = true; + // printMessage("DEBUG: SV start is" + std::to_string(sv_start) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance) + " and primary 5p_most is " + std::to_string(primary_5p_most) + " and primary positions are " + 
std::to_string(primary_positions[0]) + " and " + std::to_string(primary_positions.back()) + " and number of primary positions is " + std::to_string(primary_positions.size()) + " and start bool is " + std::to_string(primary_start) + " and end bool is " + std::to_string(primary_end)); + // } + split_candidate_sv = true; + } else if (!primary_5p_most && supp_end) { + + // Supplementary alignment is upstream with the + // insertion sequence, starting at the 5'-most + // primary position + // sv_start = primary_positions.front(); + std::sort(supp_positions.begin(), supp_positions.end()); + sv_start = supp_positions.back(); + + // Print debug if SV start equals 223608935 + // if (sv_start == 223608936) { + // print_debug = true; + // printMessage("DEBUG: SV start is " + std::to_string(sv_start) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance) + " and primary 5p_most is " + std::to_string(primary_5p_most) + " and primary positions are " + std::to_string(primary_positions[0]) + " and " + std::to_string(primary_positions.back()) + " and number of primary positions is " + std::to_string(primary_positions.size()) + " and start bool is " + std::to_string(primary_start) + " and end bool is " + std::to_string(primary_end)); + // } + split_candidate_sv = true; + } + if (split_candidate_sv) { int aln_offset = static_cast(ref_distance - read_distance); if (read_distance > ref_distance && read_distance >= min_length && read_distance <= max_length) { // Add an insertion SV call at the 5'-most primary position SVType sv_type = SVType::INS; - // for (int primary_pos : primary_positions) { SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); // } } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) { // Add a 
deletion SV call at the primary positions - // for (int primary_pos : primary_positions) { SVType sv_type = SVType::DEL; + + // if (print_debug) { + // printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size)); + // } + + // Add a dummy SV call before and after the start + // position for HMM predictions + // SVType sv_type = SVType::UNKNOWN; SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); - // } + // SVCall sv_candidate2(sv_start + (ref_distance-1), sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST2, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); } } } - // Get the supplementary alignment positions - std::vector supp_positions; - int supp_cluster_size = 0; - if (!supp_start_cluster.empty()) { - std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); - supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); - supp_cluster_size = supp_start_cluster.size(); - } - if (!supp_end_cluster.empty()) { - std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); - supp_positions.push_back(supp_end_cluster[supp_end_cluster.size() / 2]); - supp_cluster_size = std::max(supp_cluster_size, (int) supp_end_cluster.size()); - } - - // Store the inversion as the supplementary start and end positions - if (inversion && supp_positions.size() > 1) { - std::sort(supp_positions.begin(), supp_positions.end()); - int supp_start = supp_positions.front(); - int supp_end = supp_positions.back(); - int sv_length = std::abs(supp_start - supp_end); - if (sv_length >= min_length && sv_length <= max_length) { - SVCall sv_candidate(supp_start, supp_end, 
SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); - addSVCall(chr_sv_calls, sv_candidate); - } - } - // Add a dummy SV call for CNV detection int cluster_size = std::max(primary_cluster_size, supp_cluster_size); SVType sv_type = inversion ? SVType::INV : SVType::UNKNOWN; @@ -446,7 +526,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map& sv_calls, const std::vector& pos_depth_map) @@ -546,7 +629,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec if (op_len <= 50) { alt_allele = ins_seq_str; } - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 1, 0); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 0, 0); cigar_sv_calls.emplace_back(sv_call); // Process clipped bases as potential insertions @@ -578,7 +661,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec if (op_len <= 50) { alt_allele = ins_seq_str; } - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0.0, 0); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0, 0); cigar_sv_calls.emplace_back(sv_call); // Check if the CIGAR operation is a deletion @@ -586,7 +669,7 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; - SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 1, 0); + SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 0, 0); cigar_sv_calls.emplace_back(sv_call); } } @@ -664,7 +747,7 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch 
printError("ERROR: failed to load index for " + bam_filepath); return; } - BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file + // BamFileGuard bam_guard(fp_in, idx, bamHdr); // Guard to close the BAM file // Get DBSCAN parameters double dbscan_epsilon = input_data.getDBSCAN_Epsilon(); @@ -680,6 +763,11 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch printMessage(chr + ": CIGAR SVs..."); this->findCIGARSVs(fp_in, idx, bamHdr, chr, chr_sv_calls, chr_pos_depth_map); + // Clean up the BAM file and index + sam_close(fp_in); + hts_idx_destroy(idx); + bam_hdr_destroy(bamHdr); + printMessage(chr + ": Merging CIGAR..."); mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); @@ -693,6 +781,9 @@ void SVCaller::run(const InputData& input_data) bool cigar_cn = true; bool split_svs = true; + // Print the input data + input_data.printParameters(); + // Set up the reference genome printMessage("Loading the reference genome..."); const std::string ref_filepath = input_data.getRefGenome(); @@ -742,16 +833,34 @@ void SVCaller::run(const InputData& input_data) cnv_caller.calculateMeanChromosomeCoverage(chromosomes, chr_pos_depth_map, chr_mean_cov_map, bam_filepath, chr_thread_count); // Remove chromosomes with no reads (mean coverage is zero) - std::vector null_chr; + printMessage("Removing chromosomes with no reads..."); + std::vector valid_chr; for (const auto& chr : chromosomes) { + if (chr_mean_cov_map.find(chr) != chr_mean_cov_map.end()) { + valid_chr.push_back(chr); + } + chromosomes = valid_chr; + /* + try { + if (chr_mean_cov_map.at(chr) == 0.0) { + printMessage("Chromosome " + chr + " has no reads"); + } + } catch (const std::out_of_range& e) { + printError("Chromosome " + chr + " not found in mean coverage map: " + std::string(e.what())); + }*/ + /* + // Check if the chromosome has no reads if (chr_mean_cov_map[chr] == 0.0) { null_chr.push_back(chr); } + */ } + /* + printMessage("Removing " + 
std::to_string(null_chr.size()) + " chromosomes with no reads..."); for (const auto& chr : null_chr) { printMessage("Removing chromosome " + chr + " with no reads..."); chromosomes.erase(std::remove(chromosomes.begin(), chromosomes.end(), chr), chromosomes.end()); - } + }*/ std::unordered_map> whole_genome_sv_calls; int current_chr = 0; int total_chr_count = chromosomes.size(); @@ -1017,6 +1126,7 @@ void SVCaller::saveToVCF(const std::unordered_map", "##FILTER=", "##FILTER=", + "##FILTER=", "##FORMAT=", "##FORMAT=", }; @@ -1053,6 +1163,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls = pair.second; @@ -1068,8 +1179,6 @@ void SVCaller::saveToVCF(const std::unordered_map(start)-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, end); + + // Use the preceding base as the alternate allele if (ref_allele != "") { + // If the sequence is >90% N, skip the SV call (assembly + // gap) + int allele_length_90pct = static_cast(ref_allele.size() * 0.9); + if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) { + assembly_gaps += 1; + // continue; + + // Don't skip but set the filter to assembly gap + filter = "AssemblyGap"; + } + // The alt allele is the preceding base, and the reference // allele is the deleted sequence including the preceding base alt_allele = ref_allele.at(0); @@ -1127,26 +1249,10 @@ void SVCaller::saveToVCF(const std::unordered_mapgetReadDepth(chr_pos_depth_map.at(chr), start); // If read depth equals zero, then set the filter to LowQual - if (read_depth == 0) { - filter = "LowQual"; - filtered_svs += 1; - } + // if (read_depth == 0) { + // printError("Warning: Read depth is zero for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end)); + // filter = "LowQual"; + // filtered_svs += 1; + // } // Create the VCF parameter strings std::string sv_type_str = getSVTypeString(sv_type); @@ -1199,6 +1306,7 @@ void SVCaller::saveToVCF(const std::unordered_map& 
pos_depth_map, uint32_t start) const diff --git a/tests/test_general.py b/tests/test_general.py index ac7d5d8d..ff65faba 100644 --- a/tests/test_general.py +++ b/tests/test_general.py @@ -40,7 +40,6 @@ def test_run(): # Set input parameters. input_data = contextsv.InputData() - input_data.setShortReadBam(TEST_BAM_FILE) input_data.setLongReadBam(TEST_BAM_FILE) input_data.setRefGenome(TEST_REF_FILE) input_data.setSNPFilepath(TEST_SNPS_FILE) From d86e1c7384c73aee0dcf230a8fb5874b1bfba1a7 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 16 May 2025 16:09:42 -0400 Subject: [PATCH 107/134] filter assembly gaps --- include/input_data.h | 5 + include/sv_caller.h | 3 +- include/sv_object.h | 14 +-- include/sv_types.h | 23 ++++- src/cnv_caller.cpp | 3 +- src/input_data.cpp | 29 ++++++ src/main.cpp | 5 + src/sv_caller.cpp | 217 +++++++++++++++++++++++++++++++++++++------ src/sv_object.cpp | 8 +- 9 files changed, 262 insertions(+), 45 deletions(-) diff --git a/include/input_data.h b/include/input_data.h index 452b5e6c..1e2c3c1e 100644 --- a/include/input_data.h +++ b/include/input_data.h @@ -51,6 +51,10 @@ class InputData { void setEthnicity(std::string ethnicity); std::string getEthnicity() const; + // Set the assembly gaps file. + void setAssemblyGaps(std::string filepath); + std::string getAssemblyGaps() const; + // Set the sample size for HMM predictions. 
void setSampleSize(int sample_size); int getSampleSize() const; @@ -116,6 +120,7 @@ class InputData { int thread_count; std::string hmm_filepath; std::string cnv_filepath; + std::string assembly_gaps; // Assembly gaps file bool verbose; // True if verbose output is enabled bool save_cnv_data; // True if SNP CNV regions should be extended around SV breakpoints, and saved to a TSV file (Large performance hit) bool single_chr; diff --git a/include/sv_caller.h b/include/sv_caller.h index 4ef59700..997603ef 100644 --- a/include/sv_caller.h +++ b/include/sv_caller.h @@ -90,7 +90,8 @@ class SVCaller { void runSplitReadCopyNumberPredictions(const std::string& chr, std::vector& split_sv_calls, const CNVCaller &cnv_caller, const CHMM &hmm, double mean_chr_cov, const std::vector &pos_depth_map, const InputData &input_data); - void saveToVCF(const std::unordered_map> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome, const std::unordered_map>& chr_pos_depth_map) const; + void saveToVCF(const std::unordered_map> &sv_calls, const InputData &input_data, const ReferenceGenome &ref_genome, const std::unordered_map> &chr_pos_depth_map) const; + // void saveToVCF(const std::unordered_map> &sv_calls, const std::string &output_dir, const ReferenceGenome &ref_genome, const std::unordered_map>& chr_pos_depth_map) const; // Query the read depth (INFO/DP) at a position int getReadDepth(const std::vector& pos_depth_map, uint32_t start) const; diff --git a/include/sv_object.h b/include/sv_object.h index a99fb4fb..4fd34c56 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -18,23 +18,23 @@ struct SVCall { uint32_t end = 0; SVType sv_type = SVType::UNKNOWN; std::string alt_allele = "."; - SVDataType data_type = SVDataType::UNKNOWN; + // SVDataType data_type = SVDataType::UNKNOWN; + SVEvidenceFlags aln_type; Genotype genotype = Genotype::UNKNOWN; double hmm_likelihood = 0.0; int cn_state = 0; // Copy number state int aln_offset = 0; // Alignment offset (read 
vs. reference distance factor) - // int read_depth = 0; // Breakpoint depth - // double mismatch_rate = 0.0; // Highest mismatch rate in reads used for the SV call int cluster_size = 0; // Number of SV calls in the cluster bool operator<(const SVCall& other) const; SVCall() = default; - SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) : - start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {} - // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int read_depth, double mismatch_rate, int cluster_size) : - // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), read_depth(read_depth), mismatch_rate(mismatch_rate), cluster_size(cluster_size) {} + SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, SVEvidenceFlags aln_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) : + start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), aln_type(aln_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {} + + // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) : + // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {} }; void addSVCall(std::vector& 
sv_calls, SVCall& sv_call); diff --git a/include/sv_types.h b/include/sv_types.h index dd67c2a4..359d0dc9 100644 --- a/include/sv_types.h +++ b/include/sv_types.h @@ -7,6 +7,7 @@ #include #include #include +#include /// @endcond namespace sv_types { @@ -75,6 +76,8 @@ namespace sv_types { UNKNOWN = 9 }; + using SVEvidenceFlags = std::bitset<10>; // Bitset for SV data types + // Mapping of SV data types to strings const std::unordered_map SVDataTypeString = { {SVDataType::CIGARINS, "CIGARINS"}, @@ -105,6 +108,20 @@ namespace sv_types { return SVTypeString.at(sv_type); } + // Function to get the SV alignment type string from the bitset + inline std::string getSVAlignmentTypeString(SVEvidenceFlags aln_type) { + std::string result; + for (size_t i = 0; i < SVDataTypeString.size(); ++i) { + if (aln_type.test(i)) { + result += SVDataTypeString.at(static_cast(i)) + ","; + } + } + if (!result.empty()) { + result.pop_back(); // Remove the trailing comma + } + return result; + } + // Function to get the SV type from the CNV state inline SVType getSVTypeFromCNState(int cn_state) { return CNVTypeMap.at(cn_state); @@ -116,9 +133,9 @@ namespace sv_types { } // Function to get the SV data type string - inline std::string getSVDataTypeString(SVDataType data_type) { - return SVDataTypeString.at(data_type); - } + // inline std::string getSVDataTypeString(SVDataType data_type) { + // return SVDataTypeString.at(data_type); + // } // Function to get the SV type symbol inline std::string getSVTypeSymbol(SVType sv_type) { diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 0222fb7c..60dca02f 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -377,7 +377,8 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector(SVDataType::HMM)); sv_call.hmm_likelihood = likelihood; sv_call.genotype = genotype; sv_call.cn_state = max_state; diff --git a/src/input_data.cpp b/src/input_data.cpp index cd55f67e..3e7ad69d 100644 --- a/src/input_data.cpp +++ 
b/src/input_data.cpp @@ -34,6 +34,7 @@ InputData::InputData() this->save_cnv_data = false; this->single_chr = false; this->cnv_output_file = ""; + this->assembly_gaps = ""; } void InputData::printParameters() const @@ -142,6 +143,34 @@ void InputData::setEthnicity(std::string ethnicity) this->ethnicity = ethnicity; } +void InputData::setAssemblyGaps(std::string filepath) +{ + // Check if the file exists + FILE *fp = fopen(filepath.c_str(), "r"); + if (fp == NULL) + { + std::cerr << "Assembly gaps file does not exist: " << filepath << std::endl; + exit(1); + } + + // Check if the file is a BED file + std::string ext = filepath.substr(filepath.find_last_of(".") + 1); + if (ext != "bed") + { + std::cerr << "Assembly gaps file is not a BED file: " << filepath << std::endl; + exit(1); + } + fclose(fp); + + // Set the assembly gaps file + this->assembly_gaps = filepath; +} + +std::string InputData::getAssemblyGaps() const +{ + return this->assembly_gaps; +} + uint32_t InputData::getMinCNVLength() const { return this->min_cnv_length; diff --git a/src/main.cpp b/src/main.cpp index 5ba0fb1b..f7f5db0a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -88,6 +88,9 @@ void runContextSV(const std::unordered_map& args) if (args.find("pfb-file") != args.end()) { input_data.setAlleleFreqFilepaths(args.at("pfb-file")); } + if (args.find("assembly-gaps") != args.end()) { + input_data.setAssemblyGaps(args.at("assembly-gaps")); + } if (args.find("save-cnv") != args.end()) { input_data.saveCNVData(true); } @@ -180,6 +183,8 @@ std::unordered_map parseArguments(int argc, char* argv args["eth"] = argv[++i]; } else if ((arg == "-p" || arg == "--pfb") && i + 1 < argc) { args["pfb-file"] = argv[++i]; + } else if (arg == "--assembly-gaps" && i + 1 < argc) { + args["assembly-gaps"] = argv[++i]; } else if (arg == "--save-cnv") { args["save-cnv"] = "true"; } else if (arg == "--debug") { diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index c08c3c32..b1e4be60 100644 --- a/src/sv_caller.cpp +++ 
b/src/sv_caller.cpp @@ -21,6 +21,7 @@ #include #include #include +#include #include "ThreadPool.h" #include "utils.h" @@ -379,13 +380,13 @@ void SVCaller::findSplitSVSignatures(std::unordered_map primary_positions; int primary_cluster_size = 0; - bool primary_start = false; + // bool primary_start = false; bool primary_end = false; if (!primary_start_cluster.empty()) { std::sort(primary_start_cluster.begin(), primary_start_cluster.end()); primary_positions.push_back(primary_start_cluster[primary_start_cluster.size() / 2]); primary_cluster_size = primary_start_cluster.size(); - primary_start = true; + // primary_start = true; } if (!primary_end_cluster.empty()) { @@ -397,14 +398,14 @@ void SVCaller::findSplitSVSignatures(std::unordered_map supp_positions; - bool supp_start = false; + // bool supp_start = false; bool supp_end = false; int supp_cluster_size = 0; if (!supp_start_cluster.empty()) { std::sort(supp_start_cluster.begin(), supp_start_cluster.end()); supp_positions.push_back(supp_start_cluster[supp_start_cluster.size() / 2]); supp_cluster_size = supp_start_cluster.size(); - supp_start = true; + // supp_start = true; } if (!supp_end_cluster.empty()) { std::sort(supp_end_cluster.begin(), supp_end_cluster.end()); @@ -422,7 +423,10 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= 50 && sv_length <= max_length) { - SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); + SVEvidenceFlags aln_type; + aln_type.set(static_cast(SVDataType::SUPPINV)); + SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); + // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -480,17 +484,24 @@ void 
SVCaller::findSplitSVSignatures(std::unordered_map(SVDataType::SPLITDIST1)); if (split_candidate_sv) { int aln_offset = static_cast(ref_distance - read_distance); if (read_distance > ref_distance && read_distance >= min_length && read_distance <= max_length) { // Add an insertion SV call at the 5'-most primary position SVType sv_type = SVType::INS; - SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); + // SVCall sv_candidate(sv_start, sv_start + (read_distance-1), sv_type, getSVTypeSymbol(sv_type), SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); // } } else if (ref_distance > read_distance && ref_distance >= min_length && ref_distance <= max_length) { // Add a deletion SV call at the primary positions - SVType sv_type = SVType::DEL; + // SVType sv_type = SVType::DEL; + + // Set it to unknown, SV type will be determined by the + // HMM prediction + SVType sv_type = SVType::UNKNOWN; // if (print_debug) { // printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size)); @@ -499,9 +510,12 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size)); - SVCall sv_candidate(sv_start, sv_end, sv_type, alt, SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, 
cluster_size); + // SVCall sv_candidate(sv_start, sv_end, sv_type, alt, + // SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, + // cluster_size); + SVEvidenceFlags aln_type; + aln_type.set(static_cast(SVDataType::SPLIT)); + SVCall sv_candidate(sv_start, sv_end, sv_type, alt, aln_type, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); addSVCall(chr_sv_calls, sv_candidate); } } @@ -629,7 +648,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec if (op_len <= 50) { alt_allele = ins_seq_str; } - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 0, 0); + SVEvidenceFlags aln_type; + aln_type.set(static_cast(SVDataType::CIGARINS)); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, aln_type, Genotype::UNKNOWN, default_lh, 0, 0, 0); + // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARINS, Genotype::UNKNOWN, default_lh, 0, 0, 0); cigar_sv_calls.emplace_back(sv_call); // Process clipped bases as potential insertions @@ -661,7 +683,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec if (op_len <= 50) { alt_allele = ins_seq_str; } - SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0, 0); + SVEvidenceFlags aln_type; + aln_type.set(static_cast(SVDataType::CIGARCLIP)); + SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, aln_type, Genotype::UNKNOWN, default_lh, 0, 0, 0); + // SVCall sv_call(ins_pos, ins_end, SVType::INS, alt_allele, SVDataType::CIGARCLIP, Genotype::UNKNOWN, default_lh, 0, 0, 0); cigar_sv_calls.emplace_back(sv_call); // Check if the CIGAR operation is a deletion @@ -669,7 +694,10 @@ void SVCaller::processCIGARRecord(bam_hdr_t *header, bam1_t *alignment, std::vec ref_pos = pos+1; ref_end = ref_pos + op_len -1; - SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, 
default_lh, 0, 0, 0); + SVEvidenceFlags aln_type; + aln_type.set(static_cast(SVDataType::CIGARDEL)); + SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), aln_type, Genotype::UNKNOWN, default_lh, 0, 0, 0); + // SVCall sv_call(ref_pos, ref_end, SVType::DEL, getSVTypeSymbol(SVType::DEL), SVDataType::CIGARDEL, Genotype::UNKNOWN, default_lh, 0, 0, 0); cigar_sv_calls.emplace_back(sv_call); } } @@ -965,6 +993,38 @@ void SVCaller::run(const InputData& input_data) } } + // Merge any duplicate SV calls from the CIGAR and split-read + // detections (same start positions) + printMessage("Merging CIGAR and split read SV calls..."); + for (auto& entry : whole_genome_sv_calls) { + std::vector& sv_calls = entry.second; + // mergeDuplicateSVs(sv_calls); + // mergeSVs(sv_calls, 0.1, 2, false); + + // [TEST 1] Keep noise and use the DBSCAN epsilon from the + // command line + // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, true); + + // [TEST 2] Remove noise and use the DBSCAN epsilon from the + // command line (= really low recall, and low precision) + // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, false); + + // [TEST 3] Remove noise and use a DBSCAN epsilon of 0.1 (low recall, + // higher precision) + // mergeSVs(sv_calls, 0.1, 2, false); + + // [TEST 4] Keep noise and use a DBSCAN epsilon of 0.1 (slightly better + // recall) + // Using a more aggressive epsilon works better for the final merge + mergeSVs(sv_calls, 0.1, 2, true); + + // [TEST 5] Keep noise and use a DBSCAN epsilon of 0.01 (1 more FP) + // mergeSVs(sv_calls, 0.01, 2, true); + + // [TEST 6] do nothing (reduced precision, same recall as #4) + // continue; + } + if (input_data.getSaveCNVData()) { closeJSON(json_fp); } @@ -981,8 +1041,9 @@ void SVCaller::run(const InputData& input_data) // Save to VCF std::cout << "Saving SVs to VCF..." 
<< std::endl; - const std::string output_dir = input_data.getOutputDir(); - this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map); + // const std::string output_dir = input_data.getOutputDir(); + // this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map); + this->saveToVCF(whole_genome_sv_calls, input_data, ref_genome, chr_pos_depth_map); } void SVCaller::findOverlaps(const std::unique_ptr &root, const PrimaryAlignment &query, std::vector &result) @@ -1030,6 +1091,14 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve Genotype genotype = std::get<2>(result); int cn_state = std::get<3>(result); + bool print_debug = false; + if (sv_candidate.start == 15287019) { + // if (true) { + print_debug = true; + + printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); + } + // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type)); // Update the SV type if the predicted type is not unknown @@ -1039,11 +1108,16 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve if (sv_candidate.sv_type == SVType::UNKNOWN && (supp_type == SVType::DEL || supp_type == SVType::DUP)) { sv_candidate.sv_type = supp_type; sv_candidate.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format - sv_candidate.data_type = SVDataType::HMM; + sv_candidate.aln_type.set(static_cast(SVDataType::HMM)); + // sv_candidate.data_type = SVDataType::HMM; sv_candidate.hmm_likelihood = 
supp_lh; sv_candidate.genotype = genotype; sv_candidate.cn_state = cn_state; + if (print_debug) { + printMessage("DEBUG [1]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); + } + // For predictions with the same type, or LOH predictions, update the // prediction information } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) { @@ -1051,16 +1125,25 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve sv_candidate.genotype = genotype; sv_candidate.cn_state = cn_state; + if (print_debug) { + printMessage("DEBUG [2]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); + } + // Add an additional SV call if the type is different } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) { SVCall new_sv_call = sv_candidate; // Copy the original SV call new_sv_call.sv_type = supp_type; new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format - new_sv_call.data_type = SVDataType::HMM; + // new_sv_call.aln_type = SVDataType::HMM; + new_sv_call.aln_type.set(static_cast(SVDataType::HMM)); new_sv_call.hmm_likelihood = supp_lh; new_sv_call.genotype = genotype; new_sv_call.cn_state = cn_state; additional_calls.push_back(new_sv_call); + + if (print_debug) { + printMessage("DEBUG [3]: Adding additional SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM 
likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); + } } } } @@ -1084,9 +1167,46 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve } } -void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map>& chr_pos_depth_map) const +// void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map>& chr_pos_depth_map) const +void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const InputData &input_data, const ReferenceGenome& ref_genome, const std::unordered_map>& chr_pos_depth_map) const { + // Check if an assembly gap file was provided + std::string assembly_gap_file = input_data.getAssemblyGaps(); + std::unordered_map>> assembly_gaps; + if (!assembly_gap_file.empty()) { + std::cout << "Loading assembly gap file: " << assembly_gap_file << std::endl; + // Load the assembly gap file and process it + std::ifstream gap_stream(assembly_gap_file); + if (!gap_stream.is_open()) { + printError("Failed to open assembly gap file: " + assembly_gap_file); + return; + } + std::string line; + while (std::getline(gap_stream, line)) { + // Skip empty lines and comments + if (line.empty() || line[0] == '#') { + continue; + } + + // Parse the line (assuming tab-separated values) + std::istringstream iss(line); + std::string chr; + uint32_t start, end; + if (!(iss >> chr >> start >> end)) { + printError("Failed to parse assembly gap file line: " + line); + continue; + } + // Add the assembly gap to the map + assembly_gaps[chr].emplace_back(start, end); + // Print the assembly gap information + // std::cout << "Assembly gap: " << chr << ":" << start << "-" << end << std::endl; + } + gap_stream.close(); + std::cout << "Loaded " << assembly_gaps.size() << " assembly 
gaps." << std::endl; + } + std::cout << "Creating VCF writer..." << std::endl; + std::string output_dir = input_data.getOutputDir(); std::string output_vcf = output_dir + "/output.vcf"; std::cout << "Writing VCF file to " << output_vcf << std::endl; std::ofstream vcf_stream(output_vcf); @@ -1163,7 +1283,7 @@ void SVCaller::saveToVCF(const std::unordered_map& sv_calls = pair.second; @@ -1176,7 +1296,7 @@ void SVCaller::saveToVCF(const std::unordered_map(overlap_length) / static_cast(sv_length); + if (overlap_pct > 0.2) { + in_assembly_gap = true; + break; + } + } + // double overlap = 0.0; + // uint32_t gap_start = gap.first + 1; // Convert to 1-based + // uint32_t gap_end = gap.second + 1; // Convert to 1-based + // overlap = static_cast(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast(sv_length); + // if (overlap > 0.2) { + // std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl; + // in_assembly_gap = true; + // break; + // } + } + if (in_assembly_gap) { + filter = "AssemblyGap"; + assembly_gap_filtered_svs += 1; + } + } + } + // Get the deleted sequence from the reference genome, also including the preceding base uint32_t preceding_pos = (uint32_t) std::max(1, static_cast(start)-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, end); - - // Use the preceding base as the alternate allele if (ref_allele != "") { // If the sequence is >90% N, skip the SV call (assembly // gap) - int allele_length_90pct = static_cast(ref_allele.size() * 0.9); - if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) { - assembly_gaps += 1; - // continue; + // int allele_length_90pct = static_cast(ref_allele.size() * 0.9); + // if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) { + // assembly_gaps += 1; + // // continue; - // Don't skip but set the filter to assembly gap - filter = "AssemblyGap"; - } + 
// // Don't skip but set the filter to assembly gap + // filter = "AssemblyGap"; + // } // The alt allele is the preceding base, and the reference // allele is the deleted sequence including the preceding base @@ -1306,7 +1463,7 @@ void SVCaller::saveToVCF(const std::unordered_map& pos_depth_map, uint32_t start) const diff --git a/src/sv_object.cpp b/src/sv_object.cpp index ddc0cf4b..2621b785 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -20,7 +20,7 @@ void addSVCall(std::vector& sv_calls, SVCall& sv_call) { // Check if the SV call is valid if (sv_call.start > sv_call.end) { - printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVDataTypeString(sv_call.data_type)); + printError("ERROR: Invalid SV call at position " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + " from data type " + getSVAlignmentTypeString(sv_call.aln_type)); return; } @@ -82,7 +82,9 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k if (sv_type == SVType::INS) { // Add only non-CIGARCLIP SVs to the cluster map for (size_t i = 0; i < clusters.size(); ++i) { - if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) { + // if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) { + // Use the SVEvidenceFlags to check for CIGARCLIP + if (!sv_type_calls[i].aln_type.test(static_cast(SVDataType::CIGARCLIP))) { cluster_map[clusters[i]].push_back(sv_type_calls[i]); } } @@ -175,7 +177,7 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k cluster_count++; } } - printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type)); + printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs"); } sv_calls = std::move(merged_sv_calls); // 
Replace with filtered list int updated_size = sv_calls.size(); From 0acbf130daf2125b380e3a83f0ccb004b780cab4 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 16 May 2025 19:34:38 -0400 Subject: [PATCH 108/134] improve split sv merging --- src/sv_caller.cpp | 27 ++++----------------- src/sv_object.cpp | 60 ++++++++++++++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 41 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index b1e4be60..3e9f2ddc 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -514,6 +514,7 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size)); @@ -550,7 +551,8 @@ void SVCaller::findSplitSVSignatures(std::unordered_map> whole_genome_sv_calls; int current_chr = 0; int total_chr_count = chromosomes.size(); @@ -1017,6 +999,7 @@ void SVCaller::run(const InputData& input_data) // recall) // Using a more aggressive epsilon works better for the final merge mergeSVs(sv_calls, 0.1, 2, true); + // continue; // [TEST 5] Keep noise and use a DBSCAN epsilon of 0.01 (1 more FP) // mergeSVs(sv_calls, 0.01, 2, true); diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 2621b785..1fe60b31 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -136,13 +136,21 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k SVCall merged_sv_call = cluster_sv_calls[0]; if (has_nonzero_likelihood) { - // These are detected from split reads, choose the one with - // the highest non-zero likelihood normalized by the length of the SV + // // These are detected from split reads, choose the one with + // // the highest non-zero likelihood normalized by the length of the SV + // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, 
const SVCall& b) { + // return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1)); + // }); + + // // Obtain the highest non-zero likelihood + // auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) { + // return sv_call.hmm_likelihood != 0.0; + // }); + + // Choose the SV with the highest cluster size of all SVs with non-zero likelihood std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1)); + return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood); }); - - // Obtain the highest non-zero likelihood auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) { return sv_call.hmm_likelihood != 0.0; }); @@ -195,24 +203,38 @@ void mergeDuplicateSVs(std::vector &sv_calls) }); for (size_t i = 0; i < sv_calls.size(); i++) { SVCall& sv_call = sv_calls[i]; - // For SVs at the same start position with the same SV type, keep the one - // with the highest likelihood - if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) { - // Keep the SV call with a non-zero likelihood - // The HMM prediction is more reliable than the split read prediction - if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { - combined_sv_calls.back() = sv_call; - } - // If the likelihoods are equal, keep the one with the larger cluster size - // This is to ensure that the SV call with more supporting reads is - // kept - else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) { - combined_sv_calls.back() = sv_call; - } + // Merge 
cluster sizes if start and end positions are the same + if (i > 0 && sv_call.start == sv_calls[i - 1].start && sv_call.end == sv_calls[i - 1].end) { + // Combine the cluster sizes + sv_call.cluster_size += sv_calls[i - 1].cluster_size; + combined_sv_calls.back() = sv_call; } else { combined_sv_calls.push_back(sv_call); } + // SVCall& sv_call = sv_calls[i]; + // // For SVs at the same start position with the same SV type, keep the one + // // with the highest likelihood + // if (i > 0 && sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) { + // // Keep the SV call with a non-zero likelihood + // // The HMM prediction is more reliable than the split read prediction + // if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { + // // Combine the cluster sizes + // sv_call.cluster_size += sv_calls[i - 1].cluster_size; + // combined_sv_calls.back() = sv_call; + // } + + // // If the likelihoods are equal, keep the one with the larger cluster size + // // This is to ensure that the SV call with more supporting reads is + // // kept + // else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) { + // // Combine the cluster sizes + // sv_call.cluster_size += sv_calls[i - 1].cluster_size; + // combined_sv_calls.back() = sv_call; + // } + // } else { + // combined_sv_calls.push_back(sv_call); + // } } int merge_count = initial_size - combined_sv_calls.size(); sv_calls = std::move(combined_sv_calls); // Replace with filtered list From 5affdd3edeef080d15c88bd33fb2133c6d43f141 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 16 May 2025 23:00:57 -0400 Subject: [PATCH 109/134] fix dup prediction error --- src/cnv_caller.cpp | 29 ++++++++++++++++++++++++++++- src/khmm.cpp | 10 ++++++++++ src/main.cpp | 13 +++++++++---- src/sv_caller.cpp | 16 +++++++++++----- 4 files 
changed, 58 insertions(+), 10 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 60dca02f..52c6ea67 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -173,6 +173,13 @@ std::tuple CNVCaller::runCopyNumberPrediction(std return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0); } + // bool print_debug = (start_pos == 62971016 || start_pos == 62971017); + bool print_debug = false; + if (print_debug) + { + printMessage("DEBUG: Running copy number prediction for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + } + // Run the Viterbi algorithm on SNPs in the SV region // Only extend the region if "save CNV data" is enabled SNPData before_sv; @@ -200,6 +207,20 @@ std::tuple CNVCaller::runCopyNumberPrediction(std SNPData snp_data; querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); + if (print_debug) + { + printMessage("DEBUG: SNP data size: " + std::to_string(snp_data.pos.size())); + printMessage("DEBUG: SNP data baf size: " + std::to_string(snp_data.baf.size())); + printMessage("DEBUG: SNP data pfb size: " + std::to_string(snp_data.pfb.size())); + printMessage("DEBUG: SNP data log2_cov size: " + std::to_string(snp_data.log2_cov.size())); + printMessage("DEBUG: mean_chr_cov: " + std::to_string(mean_chr_cov)); + // Print all log2_cov values + for (size_t i = 0; i < snp_data.log2_cov.size(); i++) + { + printMessage("DEBUG: SNP data log2_cov[" + std::to_string(i) + "]: " + std::to_string(snp_data.log2_cov[i])); + } + } + // Run the Viterbi algorithm std::pair, double> prediction; runViterbi(hmm, snp_data, prediction); @@ -219,6 +240,11 @@ std::tuple CNVCaller::runCopyNumberPrediction(std std::vector state_counts(6, 0); for (int state : state_sequence) { + if (print_debug) + { + printMessage("DEBUG: State: " + std::to_string(state)); + } + // Skip state 3 (normal state) if (state != 3) { @@ -245,8 +271,9 @@ std::tuple CNVCaller::runCopyNumberPrediction(std } 
// Save the SV calls if enabled + uint32_t min_length = 30000; bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); - if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) > 50000) + if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) >= min_length) { // Set B-allele and population frequency values to 0 for non-SNPs for (size_t i = 0; i < snp_data.pos.size(); i++) diff --git a/src/khmm.cpp b/src/khmm.cpp index 43ed3958..7dfce96e 100644 --- a/src/khmm.cpp +++ b/src/khmm.cpp @@ -58,12 +58,22 @@ std::pair, double> testVit_CHMM(CHMM hmm, int T, std::vector mean, std::vector sd, double uf, double o) { // Get the values (0-based indexing) + + // Fix within the expected normalized coverage range if (o < mean[0]) { o = mean[0]; + } else if (o > mean[5]) + { + o = mean[5]; } + double p = uf + ((1 - uf) * pdf_normal(o, mean[state-1], sd[state-1])); + // Print the equation and the result + // printMessage("b1iot: state = " + std::to_string(state) + ", mean = " + std::to_string(mean[state-1]) + ", sd = " + std::to_string(sd[state-1]) + ", uf = " + std::to_string(uf) + ", o = " + std::to_string(o) + ", p = " + std::to_string(p)); + // printMessage("Equation: b1iot = uf + ((1 - uf) * pdf_normal(o, mean[state-1], sd[state-1]))"); + return log(p); } diff --git a/src/main.cpp b/src/main.cpp index f7f5db0a..4755a0e4 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -111,11 +111,16 @@ void runContextSV(const std::unordered_map& args) if (input_data.getSaveCNVData()) { const std::string output_dir = input_data.getOutputDir(); std::string json_filepath = output_dir + "/CNVCalls.json"; - int json_file_count = 1; - while (fileExists(json_filepath)) { - json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json"; - json_file_count++; + + // Remove the old JSON file if it exists + if (fileExists(json_filepath)) { + remove(json_filepath.c_str()); } + // int 
json_file_count = 1; + // while (fileExists(json_filepath)) { + // json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json"; + // json_file_count++; + // } input_data.setCNVOutputFile(json_filepath); std::cout << "Saving CNV data to: " << json_filepath << std::endl; } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3e9f2ddc..bf3936b9 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -1068,6 +1068,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve { std::vector additional_calls; for (auto& sv_candidate : split_sv_calls) { + + // [TEST] Skip the SV start is not 62971016 or 62971017 + // if (sv_candidate.start != 62971016 && sv_candidate.start != 62971017) { + // continue; + // } + std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); @@ -1075,12 +1081,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve int cn_state = std::get<3>(result); bool print_debug = false; - if (sv_candidate.start == 15287019) { - // if (true) { - print_debug = true; + // if (sv_candidate.start == 15287019) { + // // if (true) { + // print_debug = true; - printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); - } + // printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); + // } // printMessage("Running 
copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type)); From 34f8a9a80ee8eba5fa597e07745a23eddba0fb4a Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Sat, 17 May 2025 14:20:28 -0400 Subject: [PATCH 110/134] reduce cnv false positives --- src/sv_caller.cpp | 53 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 44 insertions(+), 9 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index bf3936b9..55c2a663 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -1110,6 +1110,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve // For predictions with the same type, or LOH predictions, update the // prediction information } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) { + sv_candidate.aln_type.set(static_cast(SVDataType::HMM)); sv_candidate.hmm_likelihood = supp_lh; sv_candidate.genotype = genotype; sv_candidate.cn_state = cn_state; @@ -1120,15 +1121,49 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve // Add an additional SV call if the type is different } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) { - SVCall new_sv_call = sv_candidate; // Copy the original SV call - new_sv_call.sv_type = supp_type; - new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format - // new_sv_call.aln_type = SVDataType::HMM; - new_sv_call.aln_type.set(static_cast(SVDataType::HMM)); - new_sv_call.hmm_likelihood = supp_lh; - new_sv_call.genotype = genotype; - new_sv_call.cn_state = cn_state; - additional_calls.push_back(new_sv_call); + // For inversions, just update the 
alignment type, copy number + // state, and HMM likelihood. Coverage changes for these may be + // predicted as CNVs + if (sv_candidate.sv_type == SVType::INV) { + sv_candidate.aln_type.set(static_cast(SVDataType::HMM)); + sv_candidate.hmm_likelihood = supp_lh; + sv_candidate.genotype = genotype; + sv_candidate.cn_state = cn_state; + // For insertions predicted as duplications, update all information + } else if (sv_candidate.sv_type == SVType::INS && supp_type == SVType::DUP) { + sv_candidate.sv_type = supp_type; + sv_candidate.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format + sv_candidate.aln_type.set(static_cast(SVDataType::HMM)); + sv_candidate.hmm_likelihood = supp_lh; + sv_candidate.genotype = genotype; + sv_candidate.cn_state = cn_state; + } else { + // Add a new SV call with the conflicting type + SVCall new_sv_call = sv_candidate; // Copy the original SV call + new_sv_call.sv_type = supp_type; + new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format + new_sv_call.aln_type.set(static_cast(SVDataType::HMM)); + new_sv_call.hmm_likelihood = supp_lh; + new_sv_call.genotype = genotype; + new_sv_call.cn_state = cn_state; + additional_calls.push_back(new_sv_call); + } + // SVCall new_sv_call = sv_candidate; // Copy the original SV call + // // new_sv_call.sv_type = supp_type; + + // // Update the SV type unless the current type is inversion + // if (sv_candidate.sv_type != SVType::INV) { + // new_sv_call.sv_type = supp_type; + // new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format + // new_sv_call.genotype = genotype; + // } + + // // new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format + // new_sv_call.aln_type.set(static_cast(SVDataType::HMM)); + // new_sv_call.hmm_likelihood = supp_lh; + // // new_sv_call.genotype = genotype; + // new_sv_call.cn_state = cn_state; + // additional_calls.push_back(new_sv_call); if (print_debug) { 
printMessage("DEBUG [3]: Adding additional SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); From 7948062a5269075219cf01b343d892705bee4484 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 19 May 2025 10:35:04 -0400 Subject: [PATCH 111/134] assembly gaps --- src/cnv_caller.cpp | 75 ++++++++++++++++++++++++++++++---------------- src/sv_caller.cpp | 74 ++++++++++++++++++++++----------------------- 2 files changed, 87 insertions(+), 62 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 52c6ea67..3bc2016e 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -235,46 +235,71 @@ std::tuple CNVCaller::runCopyNumberPrediction(std // Determine if there is a majority state within the SV region int max_state = 0; int max_count = 0; - int non_normal_count = 0; - - std::vector state_counts(6, 0); - for (int state : state_sequence) + for (int i = 0; i < 6; i++) { - if (print_debug) + int state_count = std::count(state_sequence.begin(), state_sequence.end(), i+1); + if (state_count > max_count) { - printMessage("DEBUG: State: " + std::to_string(state)); + max_state = i+1; + max_count = state_count; } + } - // Skip state 3 (normal state) - if (state != 3) - { - state_counts[state - 1]++; - non_normal_count++; - } + // If there is no majority state, then set the state to unknown + double pct_threshold = 0.50; + int state_count = (int) state_sequence.size(); + if ((double) max_count / (double) state_count < pct_threshold) + { + max_state = 0; } + Genotype genotype = getGenotypeFromCNState(max_state); + SVType predicted_cnv_type = getSVTypeFromCNState(max_state); + // snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data + + + // int non_normal_count = 0; + + // std::vector state_counts(6, 0); + 
// for (int state : state_sequence) + // { + // if (print_debug) + // { + // printMessage("DEBUG: State: " + std::to_string(state)); + // } + + // // Skip state 3 (normal state) + // if (state != 3) + // { + // state_counts[state - 1]++; + // non_normal_count++; + // } + // } // Determine the maximum state and count - int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end())); - max_state = max_state_index + 1; - max_count = state_counts[max_state_index]; + // int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end())); + // max_state = max_state_index + 1; + // max_count = state_counts[max_state_index]; // Update SV type and genotype based on the majority state // SVType predicted_cnv_type = getSVTypeFromCNState(max_state); // Genotype genotype = getGenotypeFromCNState(max_state); - SVType predicted_cnv_type = SVType::UNKNOWN; - Genotype genotype = Genotype::UNKNOWN; - if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5) - { - predicted_cnv_type = getSVTypeFromCNState(max_state); - genotype = getGenotypeFromCNState(max_state); - snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data - } + // SVType predicted_cnv_type = SVType::UNKNOWN; + // Genotype genotype = Genotype::UNKNOWN; + // if (max_count > 0 && ((double) max_count / (double) non_normal_count) > 0.5) + // { + // predicted_cnv_type = getSVTypeFromCNState(max_state); + // genotype = getGenotypeFromCNState(max_state); + // snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data + // } // Save the SV calls if enabled uint32_t min_length = 30000; bool copy_number_change = (predicted_cnv_type != SVType::UNKNOWN && predicted_cnv_type != SVType::NEUTRAL); if (input_data.getSaveCNVData() && copy_number_change && (end_pos - start_pos) >= min_length) { + // Move the state sequence to the SNP data + 
snp_data.state_sequence = std::move(state_sequence); + // Set B-allele and population frequency values to 0 for non-SNPs for (size_t i = 0; i < snp_data.pos.size(); i++) { @@ -371,7 +396,7 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vector(overlap_length) / static_cast(sv_length); - if (overlap_pct > 0.2) { - in_assembly_gap = true; - break; - } + // Check if the SV is in an assembly gap (0-based) + if (assembly_gap_file != "") { + bool in_assembly_gap = false; + // if (assembly_gaps.find(chr) != assembly_gaps.end()) { + auto it = assembly_gaps.find(chr); + if (it != assembly_gaps.end()) { + // Check if the deletion overlaps with any assembly gaps + for (const auto& gap : assembly_gaps[chr]) { + // Determine if the deletion overlaps with the + // assembly gap by greater than 50% + uint32_t overlap_start = std::max(start, gap.first + 1); // Convert to 1-based + uint32_t overlap_end = std::min(end, gap.second + 1); // Convert to 1-based + if (overlap_start <= overlap_end) { + // Calculate the overlap length + uint32_t overlap_length = overlap_end - overlap_start + 1; + // Calculate the percentage of overlap + double overlap_pct = static_cast(overlap_length) / static_cast(sv_length); + if (overlap_pct > 0.2) { + in_assembly_gap = true; + break; } - // double overlap = 0.0; - // uint32_t gap_start = gap.first + 1; // Convert to 1-based - // uint32_t gap_end = gap.second + 1; // Convert to 1-based - // overlap = static_cast(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast(sv_length); - // if (overlap > 0.2) { - // std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl; - // in_assembly_gap = true; - // break; - // } - } - if (in_assembly_gap) { - filter = "AssemblyGap"; - assembly_gap_filtered_svs += 1; } + // double overlap = 0.0; + // uint32_t gap_start = gap.first + 1; // Convert to 1-based + // uint32_t gap_end = gap.second + 1; // Convert to 1-based + // overlap 
= static_cast(std::min(end, gap_end) - std::max(start, gap_start) + 1) / static_cast(sv_length); + // if (overlap > 0.2) { + // std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl; + // in_assembly_gap = true; + // break; + // } + } + if (in_assembly_gap) { + filter = "AssemblyGap"; + assembly_gap_filtered_svs += 1; } } + } + // Deletion + if (sv_type == SVType::DEL) { // Get the deleted sequence from the reference genome, also including the preceding base uint32_t preceding_pos = (uint32_t) std::max(1, static_cast(start)-1); // Make sure the position is not negative ref_allele = ref_genome.query(chr, preceding_pos, end); From 32e14eacd9b481b7663ebcebf778526124316ad2 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 19 May 2025 13:27:32 -0400 Subject: [PATCH 112/134] add inversion hmm prediction and improve merging --- src/cnv_caller.cpp | 39 +++++++++++++++++---------------------- src/sv_caller.cpp | 34 +++++----------------------------- src/sv_object.cpp | 8 ++++++-- 3 files changed, 28 insertions(+), 53 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 3bc2016e..7f14aec1 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -173,13 +173,6 @@ std::tuple CNVCaller::runCopyNumberPrediction(std return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0); } - // bool print_debug = (start_pos == 62971016 || start_pos == 62971017); - bool print_debug = false; - if (print_debug) - { - printMessage("DEBUG: Running copy number prediction for " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); - } - // Run the Viterbi algorithm on SNPs in the SV region // Only extend the region if "save CNV data" is enabled SNPData before_sv; @@ -207,20 +200,6 @@ std::tuple CNVCaller::runCopyNumberPrediction(std SNPData snp_data; querySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); - if (print_debug) - { - 
printMessage("DEBUG: SNP data size: " + std::to_string(snp_data.pos.size())); - printMessage("DEBUG: SNP data baf size: " + std::to_string(snp_data.baf.size())); - printMessage("DEBUG: SNP data pfb size: " + std::to_string(snp_data.pfb.size())); - printMessage("DEBUG: SNP data log2_cov size: " + std::to_string(snp_data.log2_cov.size())); - printMessage("DEBUG: mean_chr_cov: " + std::to_string(mean_chr_cov)); - // Print all log2_cov values - for (size_t i = 0; i < snp_data.log2_cov.size(); i++) - { - printMessage("DEBUG: SNP data log2_cov[" + std::to_string(i) + "]: " + std::to_string(snp_data.log2_cov[i])); - } - } - // Run the Viterbi algorithm std::pair, double> prediction; runViterbi(hmm, snp_data, prediction); @@ -245,6 +224,13 @@ std::tuple CNVCaller::runCopyNumberPrediction(std } } + bool print_debug = false; + if (start_pos == 70955983) // || start_pos == 70955984) + { + print_debug = true; + printMessage("Max state for " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " is " + std::to_string(max_state) + " with count " + std::to_string(max_count) + " of " + std::to_string(state_sequence.size())); + } + // If there is no majority state, then set the state to unknown double pct_threshold = 0.50; int state_count = (int) state_sequence.size(); @@ -252,11 +238,20 @@ std::tuple CNVCaller::runCopyNumberPrediction(std { max_state = 0; } + + if (print_debug) + { + printMessage("Pct max count: " + std::to_string((double) max_count / (double) state_count)); + } + Genotype genotype = getGenotypeFromCNState(max_state); SVType predicted_cnv_type = getSVTypeFromCNState(max_state); // snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data - + if (print_debug) + { + printMessage("Predicted CNV type: " + getSVTypeString(predicted_cnv_type) + " with genotype " + getGenotypeString(genotype) + " and likelihood " + std::to_string(likelihood)); + } // int non_normal_count = 0; // std::vector state_counts(6, 0); 
diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 1238f660..3ba9628b 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -799,7 +799,8 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch bam_hdr_destroy(bamHdr); printMessage(chr + ": Merging CIGAR..."); - mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); + // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); + mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false); int region_sv_count = getSVCount(chr_sv_calls); printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string"); @@ -980,32 +981,7 @@ void SVCaller::run(const InputData& input_data) printMessage("Merging CIGAR and split read SV calls..."); for (auto& entry : whole_genome_sv_calls) { std::vector& sv_calls = entry.second; - // mergeDuplicateSVs(sv_calls); - // mergeSVs(sv_calls, 0.1, 2, false); - - // [TEST 1] Keep noise and use the DBSCAN epsilon from the - // command line - // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, true); - - // [TEST 2] Remove noise and use the DBSCAN epsilon from the - // command line (= really low recall, and low precision) - // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), 2, false); - - // [TEST 3] Remove noise and use a DBSCAN epsilon of 0.1 (low recall, - // higher precision) - // mergeSVs(sv_calls, 0.1, 2, false); - - // [TEST 4] Keep noise and use a DBSCAN epsilon of 0.1 (slightly better - // recall) - // Using a more aggressive epsilon works better for the final merge - mergeSVs(sv_calls, 0.1, 2, true); - // continue; - - // [TEST 5] Keep noise and use a DBSCAN epsilon of 0.01 (1 more FP) - // mergeSVs(sv_calls, 0.01, 2, true); - - // [TEST 6] do nothing (reduced precision, same recall as #4) - // continue; + // mergeSVs(sv_calls, 0.1, 2, true); } if (input_data.getSaveCNVData()) { @@ -1107,9 +1083,9 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve printMessage("DEBUG [1]: 
Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); } - // For predictions with the same type, or LOH predictions, update the + // For predictions with the same type, or LOH, neutral predictions, update the // prediction information - } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH)) { + } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH || supp_type == SVType::NEUTRAL)) { sv_candidate.aln_type.set(static_cast(SVDataType::HMM)); sv_candidate.hmm_likelihood = supp_lh; sv_candidate.genotype = genotype; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 1fe60b31..5a80e39d 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -147,10 +147,14 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // return sv_call.hmm_likelihood != 0.0; // }); - // Choose the SV with the highest cluster size of all SVs with non-zero likelihood + // Choose the SV with the highest cluster size of all SVs + // with non-zero likelihood (if equal, choose the larger SV) std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood); + return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.end - a.start > b.end - b.start); }); + // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { + // return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood); + // }); auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), 
[](const SVCall& sv_call) { return sv_call.hmm_likelihood != 0.0; }); From ab2743675334b49101ce6d51e951860ca35bbad4 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Mon, 19 May 2025 15:44:38 -0400 Subject: [PATCH 113/134] achieve highest recall for large svs --- src/sv_caller.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 3ba9628b..70d0e86e 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -965,7 +965,9 @@ void SVCaller::run(const InputData& input_data) int min_pts = 2; for (auto& entry : whole_genome_split_sv_calls) { std::vector& sv_calls = entry.second; - mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, true); + // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, + // true); + mergeSVs(sv_calls, 0.1, min_pts, true); } printMessage("Unifying SVs..."); @@ -981,7 +983,7 @@ void SVCaller::run(const InputData& input_data) printMessage("Merging CIGAR and split read SV calls..."); for (auto& entry : whole_genome_sv_calls) { std::vector& sv_calls = entry.second; - // mergeSVs(sv_calls, 0.1, 2, true); + mergeSVs(sv_calls, 0.1, 2, true); } if (input_data.getSaveCNVData()) { From 940f61e86737dce87caa553bee7dfd9df3f0d6f5 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Tue, 20 May 2025 21:27:26 -0400 Subject: [PATCH 114/134] revert merge --- src/sv_caller.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 70d0e86e..4c8aeee0 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -799,8 +799,9 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch bam_hdr_destroy(bamHdr); printMessage(chr + ": Merging CIGAR..."); - // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); - mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false); + mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); + // mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false); + // mergeSVs(chr_sv_calls, 
0.3, dbscan_min_pts, false); int region_sv_count = getSVCount(chr_sv_calls); printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string"); @@ -962,12 +963,13 @@ void SVCaller::run(const InputData& input_data) } printMessage("Merging split-read SVs..."); - int min_pts = 2; + // int min_pts = 2; for (auto& entry : whole_genome_split_sv_calls) { std::vector& sv_calls = entry.second; // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, // true); - mergeSVs(sv_calls, 0.1, min_pts, true); + mergeSVs(sv_calls, 0.1, 2, true); + // mergeSVs(sv_calls, 0.3, min_pts, true); } printMessage("Unifying SVs..."); @@ -984,6 +986,7 @@ void SVCaller::run(const InputData& input_data) for (auto& entry : whole_genome_sv_calls) { std::vector& sv_calls = entry.second; mergeSVs(sv_calls, 0.1, 2, true); + // mergeSVs(sv_calls, 0.3, 2, true); } if (input_data.getSaveCNVData()) { From 87be75f5eb9b51ef9af52a5f36378d85b92be4f6 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 21 May 2025 14:18:01 -0400 Subject: [PATCH 115/134] remove comments --- src/cnv_caller.cpp | 89 +----------------------------- src/sv_caller.cpp | 131 +-------------------------------------------- src/sv_object.cpp | 34 ------------ 3 files changed, 3 insertions(+), 251 deletions(-) diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index 7f14aec1..c7045955 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -74,7 +74,6 @@ void CNVCaller::querySNPRegion(std::string chr, uint32_t start_pos, uint32_t end // Loop through evenly spaced positions in the region and get the log2 ratio double pos_step = static_cast(end_pos - start_pos + 1) / static_cast(sample_size); - // double pos_step = (double) (end_pos - start_pos + 1) / (double) sample_size; std::unordered_map window_log2_map; for (int i = 0; i < sample_size; i++) { @@ -224,13 +223,6 @@ std::tuple CNVCaller::runCopyNumberPrediction(std } } - bool print_debug = false; - if (start_pos == 70955983) // || start_pos 
== 70955984) - { - print_debug = true; - printMessage("Max state for " + chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos) + " is " + std::to_string(max_state) + " with count " + std::to_string(max_count) + " of " + std::to_string(state_sequence.size())); - } - // If there is no majority state, then set the state to unknown double pct_threshold = 0.50; int state_count = (int) state_sequence.size(); @@ -239,53 +231,8 @@ std::tuple CNVCaller::runCopyNumberPrediction(std max_state = 0; } - if (print_debug) - { - printMessage("Pct max count: " + std::to_string((double) max_count / (double) state_count)); - } - Genotype genotype = getGenotypeFromCNState(max_state); SVType predicted_cnv_type = getSVTypeFromCNState(max_state); - // snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data - - if (print_debug) - { - printMessage("Predicted CNV type: " + getSVTypeString(predicted_cnv_type) + " with genotype " + getGenotypeString(genotype) + " and likelihood " + std::to_string(likelihood)); - } - // int non_normal_count = 0; - - // std::vector state_counts(6, 0); - // for (int state : state_sequence) - // { - // if (print_debug) - // { - // printMessage("DEBUG: State: " + std::to_string(state)); - // } - - // // Skip state 3 (normal state) - // if (state != 3) - // { - // state_counts[state - 1]++; - // non_normal_count++; - // } - // } - - // Determine the maximum state and count - // int max_state_index = std::distance(state_counts.begin(), std::max_element(state_counts.begin(), state_counts.end())); - // max_state = max_state_index + 1; - // max_count = state_counts[max_state_index]; - - // Update SV type and genotype based on the majority state - // SVType predicted_cnv_type = getSVTypeFromCNState(max_state); - // Genotype genotype = getGenotypeFromCNState(max_state); - // SVType predicted_cnv_type = SVType::UNKNOWN; - // Genotype genotype = Genotype::UNKNOWN; - // if (max_count > 0 && ((double) max_count / 
(double) non_normal_count) > 0.5) - // { - // predicted_cnv_type = getSVTypeFromCNState(max_state); - // genotype = getGenotypeFromCNState(max_state); - // snp_data.state_sequence = std::move(state_sequence); // Move the state sequence to the SNP data - // } // Save the SV calls if enabled uint32_t min_length = 30000; @@ -365,7 +312,6 @@ void CNVCaller::runCIGARCopyNumberPrediction(std::string chr, std::vectorquerySNPRegion(chr, start_pos, end_pos, pos_depth_map, mean_chr_cov, snp_data, input_data); // Run the Viterbi algorithm @@ -462,7 +408,6 @@ std::vector CNVCaller::splitRegionIntoChunks(std::string chr, uint3 void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& chromosomes, std::unordered_map>& chr_pos_depth_map, std::unordered_map& chr_mean_cov_map, const std::string& bam_filepath, int thread_count) const { // Open the BAM file - // std::shared_lock lock(this->shared_mutex); // Lock the BAM file printMessage("Opening BAM file: " + bam_filepath); samFile *bam_file = sam_open(bam_filepath.c_str(), "r"); if (!bam_file) @@ -531,7 +476,6 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& if (pos_depth_map.size() != static_cast(chr_length)) { printError("ERROR: Chromosome length mismatch for " + chr + ": expected " + std::to_string(chr_length) + ", found " + std::to_string(pos_depth_map.size()) + ", resizing to " + std::to_string(chr_length)); - // Resize the depth map to the length of the chromosome pos_depth_map.resize(chr_length, 0); } while (sam_itr_next(bam_file, bam_iter, bam_record) >= 0) @@ -580,35 +524,6 @@ void CNVCaller::calculateMeanChromosomeCoverage(const std::vector& } hts_itr_destroy(bam_iter); - // You can parallelize the depth map calculation here but first close the - // BAM file and index - // Bam cleanup (delete guard if using this) - // bam_destroy1(bam_record); - // bam_hdr_destroy(bam_header); - // sam_close(bam_file); - // bam_index_destroy(bam_index); - // bam_record = nullptr; - // bam_header = nullptr; - 
// bam_file = nullptr; - // bam_index = nullptr; - - // // Parallel sum of the depth map - // uint64_t cum_depth = std::reduce( - // std::execution::par, - // pos_depth_map.begin(), - // pos_depth_map.end(), - // 0ULL - // ); - - // // Parallel count of the non-zero depth positions - // uint32_t pos_count = std::count_if( - // std::execution::par, - // pos_depth_map.begin(), - // pos_depth_map.end(), - // [](uint32_t depth) { return depth > 0; } - // ); - - // Sum without parallelization uint64_t cum_depth = std::accumulate(pos_depth_map.begin(), pos_depth_map.end(), 0ULL); uint32_t pos_count = std::count_if(pos_depth_map.begin(), pos_depth_map.end(), [](uint32_t depth) { return depth > 0; }); @@ -701,7 +616,6 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui // Check if the filepath uses the 'chr' prefix notations based on the // chromosome name (*.chr1.vcf.gz vs *.1.vcf.gz) - // chr_gnomad = chr; // gnomAD data may or may not have the 'chr' prefix std::string chr_prefix = "chr"; if (pfb_filepath.find(chr_prefix) == std::string::npos) { @@ -745,7 +659,8 @@ void CNVCaller::readSNPAlleleFrequencies(std::string chr, uint32_t start_pos, ui bcf_sr_set_threads(pfb_reader, thread_count); } - // Read the SNP data ---------------------------------------------- + // Read the SNP data + // Set the region std::string region_str = chr + ":" + std::to_string(start_pos) + "-" + std::to_string(end_pos); if (bcf_sr_set_regions(snp_reader, region_str.c_str(), 0) < 0) //chr.c_str(), 0) < 0) diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index 4c8aeee0..cf39c5de 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -446,42 +446,20 @@ void SVCaller::findSplitSVSignatures(std::unordered_map read_distance && ref_distance >= min_length && ref_distance <= max_length) { - // Add a deletion SV call at the primary positions - // SVType sv_type = SVType::DEL; // Set it to unknown, SV type will be determined by the // HMM prediction SVType sv_type = 
SVType::UNKNOWN; - - // if (print_debug) { - // printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size)); - // } - - // Add a dummy SV call before and after the start - // position for HMM predictions - // SVType sv_type = SVType::UNKNOWN; - // SVCall sv_candidate(sv_start, sv_start + - // (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), - // SVDataType::SPLITDIST1, Genotype::UNKNOWN, 0.0, 0, - // aln_offset, primary_cluster_size); - // printMessage("DEBUG: Adding deletion SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_start + (ref_distance-1)) + " with length " + std::to_string(ref_distance) + " and cluster size " + std::to_string(primary_cluster_size) + " and 5p-most is " + std::to_string(primary_5p_most) + " and read distance is " + std::to_string(read_distance) + " and ref distance is " + std::to_string(ref_distance)); SVCall sv_candidate(sv_start, sv_start + (ref_distance-1), sv_type, getSVTypeSymbol(sv_type), aln_type, Genotype::UNKNOWN, 0.0, 0, aln_offset, primary_cluster_size); addSVCall(chr_sv_calls, sv_candidate); } @@ -531,10 +493,6 @@ void SVCaller::findSplitSVSignatures(std::unordered_map= min_length && sv_length <= max_length) { - // printMessage("Adding SV call at " + chr_name + ":" + std::to_string(sv_start) + "-" + std::to_string(sv_end) + " with length " + std::to_string(sv_length) + " and cluster size " + std::to_string(cluster_size)); - // SVCall sv_candidate(sv_start, sv_end, sv_type, alt, - // SVDataType::SPLIT, Genotype::UNKNOWN, 0.0, 0, 0, - // cluster_size); SVEvidenceFlags aln_type; aln_type.set(static_cast(SVDataType::SPLIT)); SVCall sv_candidate(sv_start, sv_end, sv_type, alt, aln_type, Genotype::UNKNOWN, 0.0, 0, 0, cluster_size); @@ -546,7 +504,6 @@ void 
SVCaller::findSplitSVSignatures(std::unordered_map& ch printMessage(chr + ": Merging CIGAR..."); mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); - // mergeSVs(chr_sv_calls, 0.1, dbscan_min_pts, false); - // mergeSVs(chr_sv_calls, 0.3, dbscan_min_pts, false); int region_sv_count = getSVCount(chr_sv_calls); printMessage(chr + ": Found " + std::to_string(region_sv_count) + " SV candidates in the CIGAR string"); @@ -963,13 +916,9 @@ void SVCaller::run(const InputData& input_data) } printMessage("Merging split-read SVs..."); - // int min_pts = 2; for (auto& entry : whole_genome_split_sv_calls) { std::vector& sv_calls = entry.second; - // mergeSVs(sv_calls, input_data.getDBSCAN_Epsilon(), min_pts, - // true); mergeSVs(sv_calls, 0.1, 2, true); - // mergeSVs(sv_calls, 0.3, min_pts, true); } printMessage("Unifying SVs..."); @@ -986,7 +935,6 @@ void SVCaller::run(const InputData& input_data) for (auto& entry : whole_genome_sv_calls) { std::vector& sv_calls = entry.second; mergeSVs(sv_calls, 0.1, 2, true); - // mergeSVs(sv_calls, 0.3, 2, true); } if (input_data.getSaveCNVData()) { @@ -1050,27 +998,12 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve std::vector additional_calls; for (auto& sv_candidate : split_sv_calls) { - // [TEST] Skip the SV start is not 62971016 or 62971017 - // if (sv_candidate.start != 62971016 && sv_candidate.start != 62971017) { - // continue; - // } - std::tuple result = cnv_caller.runCopyNumberPrediction(chr, hmm, sv_candidate.start, sv_candidate.end, mean_chr_cov, pos_depth_map, input_data); double supp_lh = std::get<0>(result); SVType supp_type = std::get<1>(result); Genotype genotype = std::get<2>(result); int cn_state = std::get<3>(result); - bool print_debug = false; - // if (sv_candidate.start == 15287019) { - // // if (true) { - // print_debug = true; - - // printMessage("DEBUG: Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + 
std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); - // } - - // printMessage("Running copy number prediction on " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVDataTypeString(sv_candidate.data_type)); - // Update the SV type if the predicted type is not unknown if (supp_type != SVType::UNKNOWN) { // Update all information if the current SV call is not known and @@ -1079,15 +1012,10 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve sv_candidate.sv_type = supp_type; sv_candidate.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format sv_candidate.aln_type.set(static_cast(SVDataType::HMM)); - // sv_candidate.data_type = SVDataType::HMM; sv_candidate.hmm_likelihood = supp_lh; sv_candidate.genotype = genotype; sv_candidate.cn_state = cn_state; - if (print_debug) { - printMessage("DEBUG [1]: Updating SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); - } - // For predictions with the same type, or LOH, neutral predictions, update the // prediction information } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type == sv_candidate.sv_type || supp_type == SVType::LOH || supp_type == SVType::NEUTRAL)) { @@ -1096,10 +1024,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve sv_candidate.genotype = genotype; sv_candidate.cn_state = cn_state; - if (print_debug) { - printMessage("DEBUG [2]: Updating SV call at " + chr + ":" + 
std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); - } - // Add an additional SV call if the type is different } else if (sv_candidate.sv_type != SVType::UNKNOWN && (supp_type != sv_candidate.sv_type && (supp_type == SVType::DEL || supp_type == SVType::DUP))) { // For inversions, just update the alignment type, copy number @@ -1129,26 +1053,6 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve new_sv_call.cn_state = cn_state; additional_calls.push_back(new_sv_call); } - // SVCall new_sv_call = sv_candidate; // Copy the original SV call - // // new_sv_call.sv_type = supp_type; - - // // Update the SV type unless the current type is inversion - // if (sv_candidate.sv_type != SVType::INV) { - // new_sv_call.sv_type = supp_type; - // new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format - // new_sv_call.genotype = genotype; - // } - - // // new_sv_call.alt_allele = getSVTypeSymbol(supp_type); // Update the ALT allele format - // new_sv_call.aln_type.set(static_cast(SVDataType::HMM)); - // new_sv_call.hmm_likelihood = supp_lh; - // // new_sv_call.genotype = genotype; - // new_sv_call.cn_state = cn_state; - // additional_calls.push_back(new_sv_call); - - if (print_debug) { - printMessage("DEBUG [3]: Adding additional SV call at " + chr + ":" + std::to_string(sv_candidate.start) + "-" + std::to_string(sv_candidate.end) + " with HMM likelihood " + std::to_string(supp_lh) + " and type " + getSVTypeString(supp_type) + " and data type " + getSVAlignmentTypeString(sv_candidate.aln_type)); - } } } } @@ -1203,8 +1107,6 @@ void SVCaller::saveToVCF(const std::unordered_map(overlap_length) / static_cast(sv_length); if (overlap_pct > 0.2) { @@ -1341,15 +1243,6 @@ void SVCaller::saveToVCF(const std::unordered_map(std::min(end, 
gap_end) - std::max(start, gap_start) + 1) / static_cast(sv_length); - // if (overlap > 0.2) { - // std::cout << "Assembly gap overlap is " << overlap << " for " << chr << ":" << start << "-" << end << std::endl; - // in_assembly_gap = true; - // break; - // } } if (in_assembly_gap) { filter = "AssemblyGap"; @@ -1366,17 +1259,6 @@ void SVCaller::saveToVCF(const std::unordered_map90% N, skip the SV call (assembly - // gap) - // int allele_length_90pct = static_cast(ref_allele.size() * 0.9); - // if (std::count(ref_allele.begin(), ref_allele.end(), 'N') > allele_length_90pct) { - // assembly_gaps += 1; - // // continue; - - // // Don't skip but set the filter to assembly gap - // filter = "AssemblyGap"; - // } - // The alt allele is the preceding base, and the reference // allele is the deleted sequence including the preceding base alt_allele = ref_allele.at(0); @@ -1434,21 +1316,10 @@ void SVCaller::saveToVCF(const std::unordered_mapgetReadDepth(chr_pos_depth_map.at(chr), start); - // If read depth equals zero, then set the filter to LowQual - // if (read_depth == 0) { - // printError("Warning: Read depth is zero for " + chr + ":" + std::to_string(start) + "-" + std::to_string(end)); - // filter = "LowQual"; - // filtered_svs += 1; - // } - // Create the VCF parameter strings std::string sv_type_str = getSVTypeString(sv_type); - // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";MISMATCH=" + std::to_string(mismatch_rate); - // std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + 
std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state); std::string info_str = "END=" + std::to_string(end) + ";SVTYPE=" + sv_type_str + ";SVLEN=" + std::to_string(sv_length) + ";SVMETHOD=" + sv_method + ";ALN=" + data_type_str + ";HMM=" + std::to_string(hmm_likelihood) + ";SUPPORT=" + std::to_string(read_depth) + ";CLUSTER=" + std::to_string(cluster_size) + ";ALNOFFSET=" + std::to_string(aln_offset) + ";CN=" + std::to_string(cn_state) + loh; std::string format_str = "GT:DP"; std::string sample_str = genotype + ":" + std::to_string(read_depth); diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 5a80e39d..9b4172b7 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -136,17 +136,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k SVCall merged_sv_call = cluster_sv_calls[0]; if (has_nonzero_likelihood) { - // // These are detected from split reads, choose the one with - // // the highest non-zero likelihood normalized by the length of the SV - // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return (a.hmm_likelihood / (double)(a.end - a.start + 1)) > (b.hmm_likelihood / (double)(b.end - b.start + 1)); - // }); - - // // Obtain the highest non-zero likelihood - // auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) { - // return sv_call.hmm_likelihood != 0.0; - // }); - // Choose the SV with the highest cluster size of all SVs // with non-zero likelihood (if equal, choose the larger SV) std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { @@ -216,29 +205,6 @@ void mergeDuplicateSVs(std::vector &sv_calls) } else { combined_sv_calls.push_back(sv_call); } - // SVCall& sv_call = sv_calls[i]; - // // For SVs at the same start position with the same SV type, keep the one - // // with the highest likelihood - // if (i > 0 && 
sv_call.start == sv_calls[i - 1].start && ((sv_call.sv_type == sv_calls[i - 1].sv_type) || sv_call.sv_type == SVType::UNKNOWN || sv_calls[i - 1].sv_type == SVType::UNKNOWN)) { - // // Keep the SV call with a non-zero likelihood - // // The HMM prediction is more reliable than the split read prediction - // if (sv_call.hmm_likelihood != 0.0 && sv_calls[i - 1].hmm_likelihood == 0.0) { - // // Combine the cluster sizes - // sv_call.cluster_size += sv_calls[i - 1].cluster_size; - // combined_sv_calls.back() = sv_call; - // } - - // // If the likelihoods are equal, keep the one with the larger cluster size - // // This is to ensure that the SV call with more supporting reads is - // // kept - // else if (sv_call.hmm_likelihood == sv_calls[i - 1].hmm_likelihood && sv_call.cluster_size >= sv_calls[i - 1].cluster_size) { - // // Combine the cluster sizes - // sv_call.cluster_size += sv_calls[i - 1].cluster_size; - // combined_sv_calls.back() = sv_call; - // } - // } else { - // combined_sv_calls.push_back(sv_call); - // } } int merge_count = initial_size - combined_sv_calls.size(); sv_calls = std::move(combined_sv_calls); // Replace with filtered list From b62c9d8762245f1f3dbec1fa34bb6195e9b8b7bc Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 21 May 2025 17:47:19 -0400 Subject: [PATCH 116/134] save cluster plot data and merge duplicates --- include/sv_object.h | 10 ++- src/sv_caller.cpp | 11 +++- src/sv_object.cpp | 147 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 152 insertions(+), 16 deletions(-) diff --git a/include/sv_object.h b/include/sv_object.h index 4fd34c56..8b9b2347 100644 --- a/include/sv_object.h +++ b/include/sv_object.h @@ -32,9 +32,6 @@ struct SVCall { SVCall(uint32_t start, uint32_t end, SVType sv_type, const std::string& alt_allele, SVEvidenceFlags aln_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) : start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), aln_type(aln_type), 
genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {} - - // SVCall(uint32_t start, uint32_t end, SVType sv_type, std::string alt_allele, SVDataType data_type, Genotype genotype, double hmm_likelihood, int cn_state, int aln_offset, int cluster_size) : - // start(start), end(end), sv_type(sv_type), alt_allele(alt_allele), data_type(data_type), genotype(genotype), hmm_likelihood(hmm_likelihood), cn_state(cn_state), aln_offset(aln_offset), cluster_size(cluster_size) {} }; void addSVCall(std::vector& sv_calls, SVCall& sv_call); @@ -44,9 +41,10 @@ void mergeDuplicateSVs(std::vector& sv_calls); uint32_t getSVCount(const std::vector& sv_calls); -void concatenateSVCalls(std::vector& sv_calls, const std::vector& sv_calls_update); - // Merge SVs using DBSCAN clustering -void mergeSVs(std::vector &sv_calls, double epsilon, int min_pts, bool keep_noise); +void mergeSVs(std::vector &sv_calls, double epsilon, int min_pts, bool keep_noise, const std::string& json_filepath = ""); + +// Save clusters of SV calls to a JSON file +void saveClustersToJSON(const std::string& filename, const std::map>& clusters); #endif // SV_OBJECT_H diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index cf39c5de..a1a52e53 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -754,6 +754,13 @@ void SVCaller::processChromosome(const std::string& chr, std::vector& ch bam_hdr_destroy(bamHdr); printMessage(chr + ": Merging CIGAR..."); + // Save JSON if chr21 + // if (chr == "chr21") { + // std::string json_fp = input_data.getOutputDir() + "/" + chr + ".json"; + // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, true, json_fp); + // } else { + // mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); + // } mergeSVs(chr_sv_calls, dbscan_epsilon, dbscan_min_pts, false); int region_sv_count = getSVCount(chr_sv_calls); @@ -953,8 +960,6 @@ void SVCaller::run(const InputData& input_data) // Save to VCF std::cout << 
"Saving SVs to VCF..." << std::endl; - // const std::string output_dir = input_data.getOutputDir(); - // this->saveToVCF(whole_genome_sv_calls, output_dir, ref_genome, chr_pos_depth_map); this->saveToVCF(whole_genome_sv_calls, input_data, ref_genome, chr_pos_depth_map); } @@ -1076,7 +1081,7 @@ void SVCaller::runSplitReadCopyNumberPredictions(const std::string& chr, std::ve } } -// void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const std::string& output_dir, const ReferenceGenome& ref_genome, const std::unordered_map>& chr_pos_depth_map) const + void SVCaller::saveToVCF(const std::unordered_map>& sv_calls, const InputData &input_data, const ReferenceGenome& ref_genome, const std::unordered_map>& chr_pos_depth_map) const { // Check if an assembly gap file was provided diff --git a/src/sv_object.cpp b/src/sv_object.cpp index 9b4172b7..b763efd2 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -7,6 +7,8 @@ #include #include #include +#include +#include #include "dbscan.h" #include "utils.h" @@ -39,7 +41,7 @@ void concatenateSVCalls(std::vector &target, const std::vector& target.insert(target.end(), source.begin(), source.end()); } -void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool keep_noise) +void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool keep_noise, const std::string& json_filepath) { printMessage("Merging SVs with DBSCAN, eps=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts)); @@ -59,6 +61,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k SVType::BND, }) { + std::vector merged_sv_type_calls; + // Create a vector of SV calls for the current SV type and size interval std::vector sv_type_calls; std::copy_if(sv_calls.begin(), sv_calls.end(), std::back_inserter(sv_type_calls), [sv_type](const SVCall& sv_call) { @@ -69,7 +73,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add all unclustered points to the merged list for 
(const auto& sv_call : sv_type_calls) { SVCall noise_sv_call = sv_call; - merged_sv_calls.push_back(noise_sv_call); + // merged_sv_calls.push_back(noise_sv_call); + merged_sv_type_calls.push_back(noise_sv_call); } continue; } @@ -82,7 +87,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k if (sv_type == SVType::INS) { // Add only non-CIGARCLIP SVs to the cluster map for (size_t i = 0; i < clusters.size(); ++i) { - // if (sv_type_calls[i].data_type != SVDataType::CIGARCLIP) { // Use the SVEvidenceFlags to check for CIGARCLIP if (!sv_type_calls[i].aln_type.test(static_cast(SVDataType::CIGARCLIP))) { cluster_map[clusters[i]].push_back(sv_type_calls[i]); @@ -94,6 +98,23 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k } } + // Save clusters to JSON if requested + if (!json_filepath.empty()) { + // Create the directory if it doesn't exist + std::string dir = json_filepath.substr(0, json_filepath.find_last_of('/')); + if (!fileExists(dir)) { + std::string command = "mkdir -p " + dir; + system(command.c_str()); + } + // Save the clusters to a JSON file + // Prepend the SV type before the extension + // Remove the file extension from the JSON filename + std::string json_filename_no_ext = json_filepath.substr(0, json_filepath.find_last_of('.')); + std::string json_filename = json_filename_no_ext + "_" + getSVTypeString(sv_type) + ".json"; + // std::string json_filename = json_filepath + "/clusters_" + getSVTypeString(sv_type) + ".json"; + saveClustersToJSON(json_filename, cluster_map); + } + // Merge SVs in each cluster int cluster_count = 0; for (auto& cluster : cluster_map) { @@ -111,7 +132,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add all unclustered points to the merged list for (const auto& sv_call : cluster_sv_calls) { SVCall noise_sv_call = sv_call; - merged_sv_calls.push_back(noise_sv_call); + // merged_sv_calls.push_back(noise_sv_call); + 
merged_sv_type_calls.push_back(noise_sv_call); } // Merge clustered SV calls @@ -150,7 +172,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add SV call merged_sv_call = *it; - merged_sv_calls.push_back(merged_sv_call); + // merged_sv_calls.push_back(merged_sv_call); + merged_sv_type_calls.push_back(merged_sv_call); // ---------------------------- // CIGAR-BASED MERGING @@ -173,16 +196,126 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add SV call merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); - merged_sv_calls.push_back(merged_sv_call); + // merged_sv_calls.push_back(merged_sv_call); + merged_sv_type_calls.push_back(merged_sv_call); } cluster_count++; } } - printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs"); + printMessage("Merged " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + ", found " + std::to_string(merged_sv_type_calls.size()) + " merged SV calls"); + + // Merge overlapping SVs by cluster size + std::sort(merged_sv_type_calls.begin(), merged_sv_type_calls.end(), [](const SVCall& a, const SVCall& b) { + return a.start < b.start || (a.start == b.start && a.end < b.end); + }); + std::vector merged_sv_calls_final; + for (size_t i = 0; i < merged_sv_type_calls.size(); i++) { + SVCall& sv_call = merged_sv_type_calls[i]; + + // Merge cluster sizes if they overlap + if (i > 0 && sv_call.start <= merged_sv_type_calls[i - 1].end) { + // Keep the larger cluster size + if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) { + merged_sv_calls_final.push_back(sv_call); + } + } else { + merged_sv_calls_final.push_back(sv_call); + } + } + printMessage("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls"); + + // Insert merged SV 
calls into the final list + merged_sv_calls.insert(merged_sv_calls.end(), merged_sv_calls_final.begin(), merged_sv_calls_final.end()); + + // printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs"); } sv_calls = std::move(merged_sv_calls); // Replace with filtered list int updated_size = sv_calls.size(); printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); + + // // Merge overlapping SVs by cluster size + // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { + // return a.start < b.start || (a.start == b.start && a.end < b.end); + // }); + // std::vector merged_sv_calls_final; + // for (size_t i = 0; i < sv_calls.size(); i++) { + // SVCall& sv_call = sv_calls[i]; + + // // Merge cluster sizes if they overlap + // if (i > 0 && sv_call.start <= sv_calls[i - 1].end) { + // // Keep the larger cluster size + // if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) { + // sv_calls[i - 1] = sv_call; + // } + // } else { + // merged_sv_calls_final.push_back(sv_call); + // } + // } + // sv_calls = std::move(merged_sv_calls_final); // Replace with filtered list + // int final_size = sv_calls.size(); + // printMessage("Merged " + std::to_string(updated_size) + " overlapping SV calls into " + std::to_string(final_size) + " SV calls"); +} + +void saveClustersToJSON(const std::string &filename, const std::map> &clusters) +{ + // Check if the filename is empty + if (filename.empty()) { + printError("ERROR: Filename is empty"); + return; + } + + // Remove the file if it already exists + if (fileExists(filename)) { + std::remove(filename.c_str()); + } + + // Open the JSON file for writing + std::ofstream json_file(filename); + + if (!json_file.is_open()) { + printError("ERROR: Unable to open JSON file for writing: " + filename); + return; + } + json_file << "{\n"; + 
json_file << " \"clusters\": [\n"; + size_t count = 0; + // for (size_t i = 0; i < clusters.size(); ++i) { + for (const auto& [cluster_id, cluster] : clusters) { + if (cluster_id < 0) { + continue; // Skip noise points + } + + // const auto& cluster = clusters.at(i); + // const auto& cluster = sv_list; + json_file << " {\n"; + json_file << " \"cluster_id\": " << cluster_id << ",\n"; + json_file << " \"cluster_size\": " << cluster.size() << ",\n"; + json_file << " \"sv_calls\": [\n"; + for (size_t j = 0; j < cluster.size(); ++j) { + const auto& sv_call = cluster[j]; + json_file << " {\n"; + json_file << " \"start\": " << sv_call.start << ",\n"; + json_file << " \"end\": " << sv_call.end << "\n"; + // json_file << " \"sv_type\": \"" << getSVTypeString(sv_call.sv_type) << "\",\n"; + // json_file << " \"alt_allele\": \"" << sv_call.alt_allele << "\",\n"; + // json_file << " \"genotype\": \"" << getGenotypeString(sv_call.genotype) << "\",\n"; + // json_file << " \"hmm_likelihood\": " << sv_call.hmm_likelihood << "\n"; + json_file << " }" << (j < cluster.size() - 1 ? "," : "") << "\n"; + } + json_file << " ]\n"; + // json_file << " }" << (i < clusters.size() - 1 ? 
"," : "") << "\n"; + count++; + if (count < clusters.size() - 1) { + json_file << " }," << "\n"; + } else { + json_file << " }\n"; + printMessage("JSON found last cluster: " + std::to_string(cluster_id)); + } + } + json_file << " ]\n"; + json_file << "}\n"; + json_file.close(); + printMessage("Saved clusters to JSON file: " + filename); } void mergeDuplicateSVs(std::vector &sv_calls) From 17ebf23ee93790684a3787d31893bb6287720a97 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Thu, 26 Jun 2025 14:54:12 -0400 Subject: [PATCH 117/134] add debug mode --- Makefile | 3 + Makefile-cpp | 5 ++ include/debug.h | 23 +++++ src/cnv_caller.cpp | 18 +++- src/debug.cpp | 4 + src/input_data.cpp | 23 ++--- src/sv_caller.cpp | 66 +++++++------- src/sv_object.cpp | 208 +++++++++++++++++++++++++++++++-------------- 8 files changed, 242 insertions(+), 108 deletions(-) create mode 100644 include/debug.h create mode 100644 src/debug.cpp diff --git a/Makefile b/Makefile index 6b0170ae..b8186167 100644 --- a/Makefile +++ b/Makefile @@ -9,6 +9,9 @@ python: cpp: $(MAKE) -f Makefile-cpp +debug: + $(MAKE) -f Makefile-cpp DEBUG=1 + clean: $(MAKE) -f Makefile-python clean $(MAKE) -f Makefile-cpp clean diff --git a/Makefile-cpp b/Makefile-cpp index e77cf0a8..ceda4018 100644 --- a/Makefile-cpp +++ b/Makefile-cpp @@ -19,6 +19,11 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib # Compiler and Flags CXX := g++ CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic + +ifdef DEBUG + CXXFLAGS += -DDEBUG +endif + LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries LDLIBS := -lhts # Link with libhts.a or libhts.so diff --git a/include/debug.h b/include/debug.h new file mode 100644 index 00000000..08038b3c --- /dev/null +++ b/include/debug.h @@ -0,0 +1,23 @@ +// debug.h +#pragma once + +#include +#include +#include +#include +#include + +extern std::mutex debug_mutex; + +#ifdef DEBUG + #define DEBUG_PRINT(x) do { \ + std::lock_guard 
lock(debug_mutex); \ + auto now = std::chrono::system_clock::now(); \ + std::time_t now_time = std::chrono::system_clock::to_time_t(now); \ + std::ostringstream oss; \ + oss << std::put_time(std::localtime(&now_time), "%Y-%m-%d %H:%M:%S"); \ + std::cerr << oss.str() << " - " << x << std::endl; \ + } while (0) +#else + #define DEBUG_PRINT(x) +#endif diff --git a/src/cnv_caller.cpp b/src/cnv_caller.cpp index c7045955..66f1f146 100644 --- a/src/cnv_caller.cpp +++ b/src/cnv_caller.cpp @@ -171,7 +171,23 @@ std::tuple CNVCaller::runCopyNumberPrediction(std printError("ERROR: Invalid SV region for copy number prediction: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0); } - + /* + // Check that there is no large number of zero-depth positions in the region + int zero_depth_count = 0; + for (uint32_t pos = start_pos; pos <= end_pos; pos++) + { + if (pos < pos_depth_map.size() && pos_depth_map[pos] == 0) + { + zero_depth_count++; + } + } + if (zero_depth_count > 0.1 * (end_pos - start_pos + 1)) + { + printError("WARNING: Too many zero-depth positions in the SV region for copy number prediction, skipping: " + chr + ":" + std::to_string((int)start_pos) + "-" + std::to_string((int)end_pos)); + return std::make_tuple(0.0, SVType::UNKNOWN, Genotype::UNKNOWN, 0); + } + */ + // Run the Viterbi algorithm on SNPs in the SV region // Only extend the region if "save CNV data" is enabled SNPData before_sv; diff --git a/src/debug.cpp b/src/debug.cpp new file mode 100644 index 00000000..2028e5f6 --- /dev/null +++ b/src/debug.cpp @@ -0,0 +1,4 @@ +// debug.cpp +#include "debug.h" + +std::mutex debug_mutex; diff --git a/src/input_data.cpp b/src/input_data.cpp index 3e7ad69d..4e0211df 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -8,6 +8,7 @@ #include #include "utils.h" +#include "debug.h" // For DEBUG_PRINT /// @endcond #define MIN_PFB 0.01 // Minimum SNP population allele 
frequency @@ -39,22 +40,22 @@ InputData::InputData() void InputData::printParameters() const { - std::cout << "Input parameters:" << std::endl; - std::cout << "Long read BAM: " << this->long_read_bam << std::endl; - std::cout << "Reference genome: " << this->ref_filepath << std::endl; - std::cout << "SNP VCF: " << this->snp_vcf_filepath << std::endl; - std::cout << "Output directory: " << this->output_dir << std::endl; - std::cout << "Sample size: " << this->sample_size << std::endl; - std::cout << "Minimum CNV length: " << this->min_cnv_length << std::endl; - std::cout << "DBSCAN epsilon: " << this->dbscan_epsilon << std::endl; - std::cout << "DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f << "%" << std::endl; + DEBUG_PRINT("Input parameters:"); + DEBUG_PRINT("Long read BAM: " << this->long_read_bam); + DEBUG_PRINT("Reference genome: " << this->ref_filepath); + DEBUG_PRINT("SNP VCF: " << this->snp_vcf_filepath); + DEBUG_PRINT("Output directory: " << this->output_dir); + DEBUG_PRINT("Sample size: " << this->sample_size); + DEBUG_PRINT("Minimum CNV length: " << this->min_cnv_length); + DEBUG_PRINT("DBSCAN epsilon: " << this->dbscan_epsilon); + DEBUG_PRINT("DBSCAN minimum points percentage: " << this->dbscan_min_pts_pct * 100.0f << "%"); if (this->region_set) { - std::cout << "Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second) + "\n"; + DEBUG_PRINT("Region set to: chr" + this->chr + ":" + std::to_string(this->start_end.first) + "-" + std::to_string(this->start_end.second)); } else { - std::cout << "Running on whole genome" << std::endl; + DEBUG_PRINT("Running on whole genome"); } } diff --git a/src/sv_caller.cpp b/src/sv_caller.cpp index a1a52e53..29dab604 100644 --- a/src/sv_caller.cpp +++ b/src/sv_caller.cpp @@ -30,6 +30,7 @@ #include "fasta_query.h" #include "dbscan.h" #include "dbscan1d.h" +#include "debug.h" /// @endcond # define DUP_SEQSIM_THRESHOLD 0.9 // 
Sequence similarity threshold for duplication detection @@ -415,21 +416,21 @@ void SVCaller::findSplitSVSignatures(std::unordered_map 1) { - std::sort(supp_positions.begin(), supp_positions.end()); - int supp_start = supp_positions.front(); - int supp_end = supp_positions.back(); - int sv_length = std::abs(supp_start - supp_end); - - // Use 50bp as the minimum length for an inversion - if (sv_length >= 50 && sv_length <= max_length) { - SVEvidenceFlags aln_type; - aln_type.set(static_cast(SVDataType::SUPPINV)); - SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); - // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); - addSVCall(chr_sv_calls, sv_candidate); - } - } + // if (inversion && supp_positions.size() > 1) { + // std::sort(supp_positions.begin(), supp_positions.end()); + // int supp_start = supp_positions.front(); + // int supp_end = supp_positions.back(); + // int sv_length = std::abs(supp_start - supp_end); + + // // Use 50bp as the minimum length for an inversion + // if (sv_length >= 50 && sv_length <= max_length) { + // SVEvidenceFlags aln_type; + // aln_type.set(static_cast(SVDataType::SUPPINV)); + // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), aln_type, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); + // // SVCall sv_candidate(supp_start, supp_end, SVType::INV, getSVTypeSymbol(SVType::INV), SVDataType::SUPPINV, Genotype::UNKNOWN, 0.0, 0, 0, supp_cluster_size); + // addSVCall(chr_sv_calls, sv_candidate); + // } + // } // ------------------------------- // SPLIT INSERTION CALLS @@ -772,6 +773,8 @@ void SVCaller::run(const InputData& input_data) bool cigar_svs = true; bool cigar_cn = true; bool split_svs = true; + bool merge_split_svs = true; + bool merge_final_svs = true; // Print the input data 
input_data.printParameters(); @@ -904,12 +907,11 @@ void SVCaller::run(const InputData& input_data) } if (split_svs) { - // Identify split-SV signatures - printMessage("Identifying split-SV signatures..."); + DEBUG_PRINT("Identifying split-SV signatures..."); std::unordered_map> whole_genome_split_sv_calls; this->findSplitSVSignatures(whole_genome_split_sv_calls, input_data); - printMessage("Running copy number predictions on split-read SVs..."); + DEBUG_PRINT("Running copy number predictions on split-read SVs..."); current_chr = 0; for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; @@ -917,18 +919,20 @@ void SVCaller::run(const InputData& input_data) if (sv_calls.size() > 0) { current_chr++; - printMessage("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); + DEBUG_PRINT("(" + std::to_string(current_chr) + "/" + std::to_string(total_chr_count) + ") Running copy number predictions on " + chr + " with " + std::to_string(sv_calls.size()) + " SV candidates..."); this->runSplitReadCopyNumberPredictions(chr, sv_calls, cnv_caller, hmm, chr_mean_cov_map[chr], chr_pos_depth_map[chr], input_data); } } - printMessage("Merging split-read SVs..."); - for (auto& entry : whole_genome_split_sv_calls) { - std::vector& sv_calls = entry.second; - mergeSVs(sv_calls, 0.1, 2, true); + if (merge_split_svs) { + DEBUG_PRINT("Merging split-read SVs..."); + for (auto& entry : whole_genome_split_sv_calls) { + std::vector& sv_calls = entry.second; + mergeSVs(sv_calls, 0.1, 2, true); + } } - printMessage("Unifying SVs..."); + DEBUG_PRINT("Unifying SVs..."); for (auto& entry : whole_genome_split_sv_calls) { const std::string& chr = entry.first; std::vector& sv_calls = entry.second; @@ -936,12 +940,14 @@ void SVCaller::run(const InputData& input_data) } } - // Merge any duplicate SV calls from the CIGAR and split-read - // 
detections (same start positions) - printMessage("Merging CIGAR and split read SV calls..."); - for (auto& entry : whole_genome_sv_calls) { - std::vector& sv_calls = entry.second; - mergeSVs(sv_calls, 0.1, 2, true); + if (merge_final_svs) { + // Merge any duplicate SV calls from the CIGAR and split-read + // detections (same start positions) + DEBUG_PRINT("Merging CIGAR and split read SV calls..."); + for (auto& entry : whole_genome_sv_calls) { + std::vector& sv_calls = entry.second; + mergeSVs(sv_calls, 0.1, 2, true); + } } if (input_data.getSaveCNVData()) { diff --git a/src/sv_object.cpp b/src/sv_object.cpp index b763efd2..d09bd6fe 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -12,6 +12,7 @@ #include "dbscan.h" #include "utils.h" +#include "debug.h" bool SVCall::operator<(const SVCall & other) const { @@ -49,6 +50,14 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k return; } + // Set this to print cluster information for a specific SV call for debugging + // This is useful for debugging purposes to see how the SVs are merged + bool debug_mode = false; + int debug_start = 10414914; // Set to -1 to disable + int debug_svlen_min = 15000; + int debug_svlen_max = 16000; + SVType debug_sv_type = SVType::INV; + // Cluster SVs using DBSCAN for each SV type int initial_size = sv_calls.size(); std::vector merged_sv_calls; @@ -61,6 +70,13 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k SVType::BND, }) { + // Skip if not the debug SV type + if (debug_mode && (sv_type != debug_sv_type)) { + DEBUG_PRINT("DEBUG: Skipping SV type " + getSVTypeString(sv_type) + " for debug mode"); + continue; + } + + DEBUG_PRINT("Merging SV type: " + getSVTypeString(sv_type) + " (epsilon=" + std::to_string(epsilon) + ", min_pts=" + std::to_string(min_pts) + ", num SVs=" + std::to_string(sv_calls.size()) + ")"); std::vector merged_sv_type_calls; // Create a vector of SV calls for the current SV type and size interval @@ -73,7 +89,6 
@@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add all unclustered points to the merged list for (const auto& sv_call : sv_type_calls) { SVCall noise_sv_call = sv_call; - // merged_sv_calls.push_back(noise_sv_call); merged_sv_type_calls.push_back(noise_sv_call); } continue; @@ -84,18 +99,8 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Create a map of cluster IDs to SV calls const std::vector& clusters = dbscan.getClusters(); std::map> cluster_map; // Cluster ID to SV calls - if (sv_type == SVType::INS) { - // Add only non-CIGARCLIP SVs to the cluster map - for (size_t i = 0; i < clusters.size(); ++i) { - // Use the SVEvidenceFlags to check for CIGARCLIP - if (!sv_type_calls[i].aln_type.test(static_cast(SVDataType::CIGARCLIP))) { - cluster_map[clusters[i]].push_back(sv_type_calls[i]); - } - } - } else { - for (size_t i = 0; i < clusters.size(); ++i) { - cluster_map[clusters[i]].push_back(sv_type_calls[i]); - } + for (size_t i = 0; i < clusters.size(); ++i) { + cluster_map[clusters[i]].push_back(sv_type_calls[i]); } // Save clusters to JSON if requested @@ -111,7 +116,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Remove the file extension from the JSON filename std::string json_filename_no_ext = json_filepath.substr(0, json_filepath.find_last_of('.')); std::string json_filename = json_filename_no_ext + "_" + getSVTypeString(sv_type) + ".json"; - // std::string json_filename = json_filepath + "/clusters_" + getSVTypeString(sv_type) + ".json"; saveClustersToJSON(json_filename, cluster_map); } @@ -121,6 +125,30 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; + // Continue unless the debug SV call is in the cluster + // if (debug_mode && cluster_id >= 0) { + // if (!cluster_sv_calls.empty() && + // std::any_of(cluster_sv_calls.begin(), cluster_sv_calls.end(), + // 
[debug_start, debug_sv_type, debug_svlen_min, debug_svlen_max](const SVCall& sv_call) { + // const int len = std::abs(static_cast(sv_call.end - sv_call.start)); + + // const bool start_ok = (debug_start < 0 || static_cast(sv_call.start) == debug_start); + + // const bool len_ok = (debug_svlen_min == -1 || len >= debug_svlen_min) && + // (debug_svlen_max == -1 || len <= debug_svlen_max); + + // const bool type_ok = (debug_sv_type == SVType::UNKNOWN || sv_call.sv_type == debug_sv_type); + + // return start_ok && len_ok && type_ok; + // } + // )) { + // DEBUG_PRINT("DEBUG: Found SV call in noise cluster " + std::to_string(cluster_id) + " with type " + getSVTypeString(debug_sv_type)); + + // } else { + // continue; + // } + // } + // Continue if fewer than 2 SV calls in the cluster (due to CIGARCLIP filter) if (cluster_sv_calls.size() < 2) { continue; @@ -132,8 +160,16 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Add all unclustered points to the merged list for (const auto& sv_call : cluster_sv_calls) { SVCall noise_sv_call = sv_call; - // merged_sv_calls.push_back(noise_sv_call); merged_sv_type_calls.push_back(noise_sv_call); + + // Print the added SV calls if >10 kb and the debug SV type + if (debug_mode && noise_sv_call.sv_type == debug_sv_type && (noise_sv_call.end - noise_sv_call.start) > 10000) { + DEBUG_PRINT("DEBUG: Adding noise SV call at " + std::to_string(noise_sv_call.start) + "-" + std::to_string(noise_sv_call.end) + + ", type: " + getSVTypeString(noise_sv_call.sv_type) + + ", length: " + std::to_string(noise_sv_call.end - noise_sv_call.start) + + ", cluster size: " + std::to_string(noise_sv_call.cluster_size) + + ", likelihood: " + std::to_string(noise_sv_call.hmm_likelihood)); + } } // Merge clustered SV calls @@ -163,16 +199,12 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { return 
a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.end - a.start > b.end - b.start); }); - // std::sort(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return a.cluster_size > b.cluster_size || (a.cluster_size == b.cluster_size && a.hmm_likelihood > b.hmm_likelihood); - // }); auto it = std::find_if(cluster_sv_calls.begin(), cluster_sv_calls.end(), [](const SVCall& sv_call) { return sv_call.hmm_likelihood != 0.0; }); // Add SV call merged_sv_call = *it; - // merged_sv_calls.push_back(merged_sv_call); merged_sv_type_calls.push_back(merged_sv_call); // ---------------------------- @@ -186,25 +218,79 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k return (a.end - a.start) > (b.end - b.start); }); - // Get the top 10% of the cluster - size_t top_10_percent = std::max(1, (int) (cluster_sv_calls.size() * 0.1)); - std::vector top_10(cluster_sv_calls.begin(), cluster_sv_calls.begin() + top_10_percent); + // Print the added SV calls if >10 kb and the debug SV type + if (debug_mode && sv_type == debug_sv_type) { + DEBUG_PRINT("DEBUG: Cluster " + std::to_string(cluster_id) + " with " + std::to_string(cluster_sv_calls.size()) + " SV calls (length sorted):"); + for (const auto& sv_call : cluster_sv_calls) { + if ((sv_call.end - sv_call.start) > 10000) { + DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + + ", type: " + getSVTypeString(sv_call.sv_type) + + ", length: " + std::to_string(sv_call.end - sv_call.start) + + ", cluster size: " + std::to_string(sv_call.cluster_size) + + ", likelihood: " + std::to_string(sv_call.hmm_likelihood)); + } + } + } + + // Get the top % of the cluster + double top_pct = 0.2; + size_t top_pct_size = std::max(1, (int) (cluster_sv_calls.size() * top_pct)); + std::vector top_pct_calls(cluster_sv_calls.begin(), cluster_sv_calls.begin() + top_pct_size); + + // Print the added SV calls if >10 kb and the 
debug SV type + if (debug_mode && sv_type == debug_sv_type) { + DEBUG_PRINT("DEBUG: Top " + std::to_string((int)(top_pct * 100)) + "% of cluster " + std::to_string(cluster_id) + " with " + + std::to_string(top_pct_calls.size()) + " SV calls (length sorted):"); + for (const auto& sv_call : top_pct_calls) { + if ((sv_call.end - sv_call.start) > 10000) { + DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + + ", type: " + getSVTypeString(sv_call.sv_type) + + ", length: " + std::to_string(sv_call.end - sv_call.start) + + ", cluster size: " + std::to_string(sv_call.cluster_size) + + ", likelihood: " + std::to_string(sv_call.hmm_likelihood)); + } + } + } - // Get the median SV for the top 10% of the cluster - size_t median_index = top_10.size() / 2; - merged_sv_call = top_10[median_index]; + // Get the median SV for the top % of the cluster + size_t median_index = top_pct_calls.size() / 2; + merged_sv_call = top_pct_calls[median_index]; + + // Print the merged SV call + if (debug_mode && sv_type == debug_sv_type) { + DEBUG_PRINT("DEBUG: Merged SV call at " + std::to_string(merged_sv_call.start) + "-" + std::to_string(merged_sv_call.end) + + ", type: " + getSVTypeString(merged_sv_call.sv_type) + + ", length: " + std::to_string(merged_sv_call.end - merged_sv_call.start) + + ", cluster size: " + std::to_string(merged_sv_call.cluster_size) + + ", likelihood: " + std::to_string(merged_sv_call.hmm_likelihood)); + } // Add SV call merged_sv_call.cluster_size = (int) cluster_sv_calls.size(); - // merged_sv_calls.push_back(merged_sv_call); merged_sv_type_calls.push_back(merged_sv_call); } cluster_count++; } } - printMessage("Merged " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + ", found " + std::to_string(merged_sv_type_calls.size()) + " merged SV calls"); + DEBUG_PRINT("Merged " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + ", found " + 
std::to_string(merged_sv_type_calls.size()) + " merged SV calls"); + + // Print SV call start, end, type, and length for debugging if > 10 kb + if (debug_mode && sv_type == debug_sv_type) { + DEBUG_PRINT("DEBUG: Merged SV calls for " + getSVTypeString(sv_type) + ":"); + for (const auto& sv_call : merged_sv_type_calls) { + // if ((int)sv_call.start == debug_start) { + if ((sv_call.end - sv_call.start) > 10000) { + DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + + ", type: " + getSVTypeString(sv_call.sv_type) + + ", length: " + std::to_string(sv_call.end - sv_call.start) + + ", cluster size: " + std::to_string(sv_call.cluster_size) + + ", likelihood: " + std::to_string(sv_call.hmm_likelihood)); + } + } + } - // Merge overlapping SVs by cluster size + /* + // Merge overlapping SVs by SV length std::sort(merged_sv_type_calls.begin(), merged_sv_type_calls.end(), [](const SVCall& a, const SVCall& b) { return a.start < b.start || (a.start == b.start && a.end < b.end); }); @@ -214,46 +300,45 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Merge cluster sizes if they overlap if (i > 0 && sv_call.start <= merged_sv_type_calls[i - 1].end) { - // Keep the larger cluster size - if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) { - merged_sv_calls_final.push_back(sv_call); + // Keep the larger SV call (end - start) if they overlap + if ((sv_call.end - sv_call.start) > (merged_sv_type_calls[i - 1].end - merged_sv_type_calls[i - 1].start)) { + merged_sv_type_calls[i - 1] = sv_call; // Replace the previous SV call with the current one } + // Keep the larger cluster size + // if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) { + // merged_sv_calls_final.push_back(sv_call); + // } } else { merged_sv_calls_final.push_back(sv_call); } } - printMessage("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + 
std::to_string(merged_sv_calls_final.size()) + " merged SV calls"); + DEBUG_PRINT("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls"); - // Insert merged SV calls into the final list - merged_sv_calls.insert(merged_sv_calls.end(), merged_sv_calls_final.begin(), merged_sv_calls_final.end()); + // Print merged SV calls for debugging + if (debug_mode) { + DEBUG_PRINT("DEBUG: Final merged SV calls for " + getSVTypeString(sv_type) + ":"); + for (const auto& sv_call : merged_sv_calls_final) { + // if ((int)sv_call.start == debug_start) { + if (sv_call.sv_type == SVType::DUP) { + DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + + ", type: " + getSVTypeString(sv_call.sv_type) + + ", length: " + std::to_string(sv_call.end - sv_call.start) + + ", cluster size: " + std::to_string(sv_call.cluster_size) + + ", likelihood: " + std::to_string(sv_call.hmm_likelihood)); + } + } + } - // printMessage("Completed DBSCAN with epsilon " + std::to_string(epsilon) + " for " + std::to_string(cluster_count) + " clusters of " + getSVTypeString(sv_type) + " SVs"); + // Insert merged SV calls into the final list + merged_sv_calls.insert(merged_sv_calls.end(), + merged_sv_calls_final.begin(), merged_sv_calls_final.end()); + */ + merged_sv_calls.insert(merged_sv_calls.end(), + merged_sv_type_calls.begin(), merged_sv_type_calls.end()); } sv_calls = std::move(merged_sv_calls); // Replace with filtered list int updated_size = sv_calls.size(); printMessage("Merged " + std::to_string(initial_size) + " SV calls into " + std::to_string(updated_size) + " SV calls"); - - // // Merge overlapping SVs by cluster size - // std::sort(sv_calls.begin(), sv_calls.end(), [](const SVCall& a, const SVCall& b) { - // return a.start < b.start || (a.start == b.start && a.end < b.end); - // }); - // std::vector merged_sv_calls_final; - // for (size_t i = 0; i < 
sv_calls.size(); i++) { - // SVCall& sv_call = sv_calls[i]; - - // // Merge cluster sizes if they overlap - // if (i > 0 && sv_call.start <= sv_calls[i - 1].end) { - // // Keep the larger cluster size - // if (sv_call.cluster_size > sv_calls[i - 1].cluster_size) { - // sv_calls[i - 1] = sv_call; - // } - // } else { - // merged_sv_calls_final.push_back(sv_call); - // } - // } - // sv_calls = std::move(merged_sv_calls_final); // Replace with filtered list - // int final_size = sv_calls.size(); - // printMessage("Merged " + std::to_string(updated_size) + " overlapping SV calls into " + std::to_string(final_size) + " SV calls"); } void saveClustersToJSON(const std::string &filename, const std::map> &clusters) @@ -271,7 +356,6 @@ void saveClustersToJSON(const std::string &filename, const std::map Date: Fri, 27 Jun 2025 12:04:52 -0400 Subject: [PATCH 118/134] Update .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 9f6f43d4..b7c96918 100644 --- a/.gitignore +++ b/.gitignore @@ -97,3 +97,6 @@ valgrind.log *.log *.err *.out + +# Snakemake files +.snakemake From 81b09ac17cea7c11f7e62a0ee615b2e28648cded Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Wed, 9 Jul 2025 17:05:11 -0400 Subject: [PATCH 119/134] update makefile --- .gitignore | 1 + Makefile | 68 +++++++++++++++++++++++++++++++++++++++++-------- Makefile-cpp | 59 ------------------------------------------ Makefile-python | 15 ----------- environment.yml | 9 ------- 5 files changed, 58 insertions(+), 94 deletions(-) delete mode 100644 Makefile-cpp delete mode 100644 Makefile-python diff --git a/.gitignore b/.gitignore index b7c96918..73f38beb 100644 --- a/.gitignore +++ b/.gitignore @@ -100,3 +100,4 @@ valgrind.log # Snakemake files .snakemake +snakemake_bench/results/ diff --git a/Makefile b/Makefile index b8186167..cfab069c 100644 --- a/Makefile +++ b/Makefile @@ -1,17 +1,63 @@ -# Top-Level Makefile +# Directories +INCL_DIR := $(CURDIR)/include +SRC_DIR := 
$(CURDIR)/src +BUILD_DIR := $(CURDIR)/build +LIB_DIR := $(CURDIR)/lib -.PHONY: python cpp clean +# Version header +VERSION := $(shell git describe --tags --always) +VERSION_HEADER := $(INCL_DIR)/version.h +.PHONY: $(VERSION_HEADER) + @echo "#pragma once" > $@ + @echo "#define VERSION \"$(VERSION)\"" >> $@ -# Targets for the sub-makefiles -python: - $(MAKE) -f Makefile-python +# Conda environment directories +CONDA_PREFIX := $(shell echo $$CONDA_PREFIX) +CONDA_INCL_DIR := $(CONDA_PREFIX)/include +CONDA_LIB_DIR := $(CONDA_PREFIX)/lib -cpp: - $(MAKE) -f Makefile-cpp +# Compiler and Flags +CXX := g++ +CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic -debug: - $(MAKE) -f Makefile-cpp DEBUG=1 +# ifdef DEBUG +# CXXFLAGS += -DDEBUG +# endif +LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries +LDLIBS := -lhts # Link with libhts.a or libhts.so + +# Enable thread sanitizer (TSan) +# ifeq ($(TSAN),1) +# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g +# CXXFLAGS += $(TSAN_FLAGS) +# LDFLAGS += $(TSAN_FLAGS) +# endif + +# Sources and Output +# SOURCES := $(wildcard $(SRC_DIR)/*.cpp) +SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp)) # Filter out the SWIG wrapper from the sources +OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES)) +TARGET := $(BUILD_DIR)/cpp_module + +# Default target +all: $(TARGET) + +# Debug target +debug: CXXFLAGS += -DDEBUG +debug: all + +# Link the executable +$(TARGET): $(OBJECTS) + @mkdir -p $(BUILD_DIR) + $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS) + +# Compile source files +$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp + @mkdir -p $(BUILD_DIR) + $(CXX) $(CXXFLAGS) -c $< -o $@ + +# Clean the build directory clean: - $(MAKE) -f Makefile-python clean - $(MAKE) -f Makefile-cpp clean + rm -rf $(BUILD_DIR) + \ No newline at end of file diff --git a/Makefile-cpp b/Makefile-cpp deleted file mode 100644 index ceda4018..00000000 --- 
a/Makefile-cpp +++ /dev/null @@ -1,59 +0,0 @@ -# Directories -INCL_DIR := $(CURDIR)/include -SRC_DIR := $(CURDIR)/src -BUILD_DIR := $(CURDIR)/build -LIB_DIR := $(CURDIR)/lib - -# Version header -VERSION := $(shell git describe --tags --always) -VERSION_HEADER := $(INCL_DIR)/version.h -.PHONY: $(VERSION_HEADER) - @echo "#pragma once" > $@ - @echo "#define VERSION \"$(VERSION)\"" >> $@ - -# Conda environment directories -CONDA_PREFIX := $(shell echo $$CONDA_PREFIX) -CONDA_INCL_DIR := $(CONDA_PREFIX)/include -CONDA_LIB_DIR := $(CONDA_PREFIX)/lib - -# Compiler and Flags -CXX := g++ -CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic - -ifdef DEBUG - CXXFLAGS += -DDEBUG -endif - -LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries -LDLIBS := -lhts # Link with libhts.a or libhts.so - -# Enable thread sanitizer (TSan) -# ifeq ($(TSAN),1) -# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g -# CXXFLAGS += $(TSAN_FLAGS) -# LDFLAGS += $(TSAN_FLAGS) -# endif - -# Sources and Output -# SOURCES := $(wildcard $(SRC_DIR)/*.cpp) -SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp)) # Filter out the SWIG wrapper from the sources -OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES)) -TARGET := $(BUILD_DIR)/cpp_module - -# Default target -all: $(TARGET) - -# Link the executable -$(TARGET): $(OBJECTS) - @mkdir -p $(BUILD_DIR) - $(CXX) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(LDLIBS) - -# Compile source files -$(BUILD_DIR)/%.o: $(SRC_DIR)/%.cpp - @mkdir -p $(BUILD_DIR) - $(CXX) $(CXXFLAGS) -c $< -o $@ - -# Clean the build directory -clean: - rm -rf $(BUILD_DIR) - \ No newline at end of file diff --git a/Makefile-python b/Makefile-python deleted file mode 100644 index 361ba11b..00000000 --- a/Makefile-python +++ /dev/null @@ -1,15 +0,0 @@ -INCL_DIR := $(CURDIR)/include -SRC_DIR := $(CURDIR)/src -LIB_DIR := $(CURDIR)/lib - - -all: - # Generate the SWIG wrapper (C++ -> 
Python) - swig -c++ -python -I$(INCL_DIR) -o $(SRC_DIR)/swig_wrapper.cpp -outdir $(LIB_DIR) $(SRC_DIR)/swig_wrapper.i - - # Compile the SWIG wrapper using setuptools - python3 setup.py build_ext --build-lib $(LIB_DIR) - -clean: - rm -rf $(LIB_DIR)/*.so $(LIB_DIR)/contextsv.py - \ No newline at end of file diff --git a/environment.yml b/environment.yml index 26f46822..538f5bc3 100644 --- a/environment.yml +++ b/environment.yml @@ -8,14 +8,5 @@ dependencies: - python - numpy - htslib - - swig - pytest - plotly - -# [A] Generate directly from the file: -# conda env create -f environment.yml -n contextsv -# [B] Generate after creating a new environment: -# conda create -n contextsv -# conda activate contextsv -# conda env update -f environment.yml --prune # Prune removes unused packages - From 9f2aea2e4bd874a3ff90b42f8796a3eaf9afe99e Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 12:53:00 -0400 Subject: [PATCH 120/134] update cnv plots --- .gitignore | 1 + python/cnv_plots_json.py | 40 +++++++++++++++++++++++++++++++--------- src/main.cpp | 5 ----- 3 files changed, 32 insertions(+), 14 deletions(-) diff --git a/.gitignore b/.gitignore index 73f38beb..b7478d26 100644 --- a/.gitignore +++ b/.gitignore @@ -55,6 +55,7 @@ CMakeSettings.json # Output folder output/ +python/ # Doxygen docs/html/ diff --git a/python/cnv_plots_json.py b/python/cnv_plots_json.py index 31a59110..768058e9 100644 --- a/python/cnv_plots_json.py +++ b/python/cnv_plots_json.py @@ -1,11 +1,16 @@ +import os +import argparse +import json +import numpy as np import plotly from plotly.subplots import make_subplots -import json -import argparse + +min_sv_length = 200000 # Minimum SV length in base pairs # Set up argument parser parser = argparse.ArgumentParser(description='Generate CNV plots from JSON data.') parser.add_argument('json_file', type=str, help='Path to the JSON file containing SV data') +parser.add_argument('chromosome', type=str, help='Chromosome to filter the SVs by (e.g., 
"chr3")', nargs='?', default=None) args = parser.parse_args() # Load your JSON data @@ -32,6 +37,15 @@ # Loop through each SV (assuming your JSON contains multiple SVs) for sv in sv_data: + # If a chromosome is specified, filter the SVs by that chromosome + if args.chromosome and sv['chromosome'] != args.chromosome: + continue + + # Filter out SVs that are smaller than the minimum length + if np.abs(sv['size']) < min_sv_length: + print(f"Skipping SV {sv['chromosome']}:{sv['start']}-{sv['end']} of type {sv['sv_type']} with size {sv['size']} bp (smaller than {min_sv_length} bp)") + continue + # Extract data for plotting positions_before = sv['before_sv']['positions'] b_allele_freq_before = sv['before_sv']['b_allele_freq'] @@ -61,9 +75,13 @@ b_allele_freq = sv[section]['b_allele_freq'] population_freq = sv[section]['population_freq'] log2_ratio = sv[section]['log2_ratio'] + is_snp = sv[section]['is_snp'] + + # Set all b-allele frequencies to NaN if not SNPs + b_allele_freq = [freq if is_snp_val else float('nan') for freq, is_snp_val in zip(b_allele_freq, is_snp)] if section == "sv": - is_snp = sv[section]['is_snp'] + # is_snp = sv[section]['is_snp'] states = sv[section]['states'] state_colors = [state_colors_dict[str(state)] for state in states] marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp] @@ -81,7 +99,7 @@ f"Population Frequency: {population_freq[i]}
" ) else: - is_snp = sv[section]['is_snp'] + # is_snp = sv[section]['is_snp'] state_colors = ['black'] * len(positions) # marker_symbols = ['circle-open'] * len(positions) marker_symbols = ['circle' if is_snp_val else 'circle-open' for is_snp_val in is_snp] @@ -105,7 +123,7 @@ hoverinfo='text', marker=dict( color=state_colors, - size=10, + size=5, symbol=marker_symbols, ), line=dict( @@ -125,7 +143,7 @@ hoverinfo='text', marker=dict( color=state_colors, - size=10, + size=5, symbol=marker_symbols, ), line=dict( @@ -214,6 +232,10 @@ # width = 800 # ) # Save the plot to an HTML file (use a unique filename per SV) - file_name = f"output/SV_{chromosome}_{start}_{end}.html" - fig.write_html(file_name) - print(f"Plot saved as {file_name}") + # Use the input filepath directory as the output directory + output_dir = os.path.dirname(args.json_file) + svlen_kb = sv_length // 1000 + file_name = f"SV_{chromosome}_{start}_{end}_{sv_type}_{svlen_kb}kb.html" + file_path = os.path.join(output_dir, file_name) + fig.write_html(file_path) + print(f"Plot saved as {file_path}") diff --git a/src/main.cpp b/src/main.cpp index 4755a0e4..5425a7c1 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -116,11 +116,6 @@ void runContextSV(const std::unordered_map& args) if (fileExists(json_filepath)) { remove(json_filepath.c_str()); } - // int json_file_count = 1; - // while (fileExists(json_filepath)) { - // json_filepath = output_dir + "/CNVCalls_" + std::to_string(json_file_count) + ".json"; - // json_file_count++; - // } input_data.setCNVOutputFile(json_filepath); std::cout << "Saving CNV data to: " << json_filepath << std::endl; } From 83570521d77b27724cd8227b8e748b051d2d3312 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 13:39:49 -0400 Subject: [PATCH 121/134] simplify environment and update installation --- Makefile | 16 ++----- README.md | 115 ++++++++++++++------------------------------- environment.yml | 9 ++-- src/input_data.cpp | 8 ++-- src/sv_object.cpp | 73 
---------------------------- 5 files changed, 44 insertions(+), 177 deletions(-) diff --git a/Makefile b/Makefile index cfab069c..207f7e09 100644 --- a/Makefile +++ b/Makefile @@ -20,25 +20,15 @@ CONDA_LIB_DIR := $(CONDA_PREFIX)/lib CXX := g++ CXXFLAGS := -std=c++17 -g -I$(INCL_DIR) -I$(CONDA_INCL_DIR) -Wall -Wextra -pedantic -# ifdef DEBUG -# CXXFLAGS += -DDEBUG -# endif - +# Linker Flags +# Ensure that the library paths are set correctly for linking LDFLAGS := -L$(LIB_DIR) -L$(CONDA_LIB_DIR) -Wl,-rpath=$(CONDA_LIB_DIR) # Add rpath for shared libraries LDLIBS := -lhts # Link with libhts.a or libhts.so -# Enable thread sanitizer (TSan) -# ifeq ($(TSAN),1) -# TSAN_FLAGS := -fsanitize=thread -fPIE -pie -g -# CXXFLAGS += $(TSAN_FLAGS) -# LDFLAGS += $(TSAN_FLAGS) -# endif - # Sources and Output -# SOURCES := $(wildcard $(SRC_DIR)/*.cpp) SOURCES := $(filter-out $(SRC_DIR)/swig_wrapper.cpp, $(wildcard $(SRC_DIR)/*.cpp)) # Filter out the SWIG wrapper from the sources OBJECTS := $(patsubst $(SRC_DIR)/%.cpp,$(BUILD_DIR)/%.o,$(SOURCES)) -TARGET := $(BUILD_DIR)/cpp_module +TARGET := $(BUILD_DIR)/contextsv # Default target all: $(TARGET) diff --git a/README.md b/README.md index e84f006d..707f6f64 100644 --- a/README.md +++ b/README.md @@ -12,33 +12,51 @@ corresponding reference genome (FASTA), a VCF with high-quality SNPs Class documentation is available at https://wglab.openbioinformatics.org/ContextSV

-## Installation (Linux) -### Using Anaconda (recommended) -First, install [Anaconda](https://www.anaconda.com/). +## Installation -Next, create a new environment. This installation has been tested with Python 3.11: - -``` -conda create -n contextsv python=3.11 -conda activate contextsv -``` - -ContextSV can then be installed using the following command: +### Building from source (for testing/development) +ContextSV requires HTSLib as a dependency that can be installed using [Anaconda](https://www.anaconda.com/). Create an environment +containing HTSLib: ``` -conda install -c bioconda -c wglab contextsv=1.0.0 +conda create -n htsenv -c bioconda -c conda-forge htslib +conda activate htsenv ``` -### Building from source (for testing/development) -First install [Anaconda](https://www.anaconda.com/). Then follow the instructions below to install LongReadSum and its dependencies: +Then follow the instructions below to build ContextSV: ``` git clone https://github.com/WGLab/ContextSV cd ContextSV -conda env create -f environment.yml make ``` +ContextSV can then be run: +``` +./build/contextsv --help + +Usage: ./build/contextsv [options] +Options: + -b, --bam Long-read BAM file (required) + -r, --ref Reference genome FASTA file (required) + -s, --snp SNPs VCF file (required) + -o, --outdir Output directory (required) + -c, --chr Chromosome + -r, --region Region (start-end) + -t, --threads Number of threads + -h, --hmm HMM file + -n, --sample-size Sample size for HMM predictions + --min-cnv Minimum CNV length + --eps DBSCAN epsilon + --min-pts-pct Percentage of mean chr. 
coverage to use for DBSCAN minimum points + -e, --eth ETH file + -p, --pfb PFB file + --save-cnv Save CNV data + --debug Debug mode with verbose logging + --version Print version and exit + -h, --help Print usage and exit +``` + ## Downloading gnomAD SNP population frequencies SNP population allele frequency information is used for copy number predictions in this tool (see @@ -53,7 +71,7 @@ Download links for genome VCF files are located here (last updated April 3, - **gnomAD v2.1.1 (GRCh37)**: https://gnomad.broadinstitute.org/downloads#2 -### Example download +### Script for downloading gnomAD VCFs ``` download_dir="~/data/gnomad/v4.0.0/" @@ -78,71 +96,6 @@ X=~/data/gnomad/v4.0.0/gnomad.genomes.v4.0.sites.chrX.vcf.bgz Y=~/data/gnomad/v4.0.0/gnomad.genomes.v4.0.sites.chrY.vcf.bgz ``` -## Calling structural variants -### Example full script generating a merged VCF of structural variants -``` -# Activate the environment -conda activate contextsv - -# Set the input reference genome -ref_file="~/data/GRCh38.fa" - -# Set the input alignment file (e.g. from minimap2) -long_read_bam="~/data/HG002.GRCh38.bam" - -# Set the input SNPs file (e.g. 
from NanoCaller) -snps_file="~/data/variant_calls.snps.vcf.gz" - -# Set the SNP population frequencies filepath -pfb_file="~/data/gnomadv4_filepaths.txt" - -# Set the output directory -output_dir=~/data/contextSV_output - -# Specify the number of threads (system-specific) -thread_count=40 - -# Run SV calling (~3-4 hours for whole-genome, 40 cores) -python contextsv --threads $thread_count -o $output_dir -lr $long_read_bam --snps $snps_file --reference $ref_file --pfb $pfb_file - -# The output VCF filepath is located here: -output_vcf=$output_dir/sv_calls.vcf - -# Merge SVs (~3-4 hours for whole-genome, 40 cores) -python contextsv --merge $output_vcf - -# The final merged VCF filepath is located here: -merged_vcf=$output_dir/sv_calls.merged.vcf -``` - -## Input arguments - -``` -python contextsv --help - -ContextSV: A tool for integrative structural variant detection. - -options: - -h, --help show this help message and exit - -lr LONG_READ, --long-read LONG_READ - path to the long read alignment BAM file - -g REFERENCE, --reference REFERENCE - path to the reference genome FASTA file - -s SNPS, --snps SNPS path to the SNPs VCF file - --pfb PFB path to the file with SNP population frequency VCF filepaths (see docs for format) - -o OUTPUT, --output OUTPUT - path to the output directory - -r REGION, --region REGION - region to analyze (e.g. chr1, chr1:1000-2000). If not provided, the entire genome will be analyzed - -t THREADS, --threads THREADS - number of threads to use - --hmm HMM path to the PennCNV HMM file - --window-size WINDOW_SIZE - window size for calculating log2 ratios for CNV predictions (default: 10 kb) - -d, --debug debug mode (verbose logging) - -v, --version print the version number and exit -``` - ## Revision history For release history, please visit [here](https://github.com/WGLab/ContextSV/releases). 
diff --git a/environment.yml b/environment.yml index 538f5bc3..1dd41ce2 100644 --- a/environment.yml +++ b/environment.yml @@ -1,12 +1,9 @@ name: contextsv channels: - - defaults - - anaconda - - conda-forge - bioconda + - conda-forge dependencies: - - python - - numpy + - python=3.11 + - numpy=1.26 - htslib - pytest - - plotly diff --git a/src/input_data.cpp b/src/input_data.cpp index 4e0211df..3b53a7d7 100644 --- a/src/input_data.cpp +++ b/src/input_data.cpp @@ -24,11 +24,11 @@ InputData::InputData() this->start_end = std::make_pair(0, 0); this->region_set = false; this->output_dir = ""; - this->sample_size = 100; - this->min_cnv_length = 1000; + this->sample_size = 20; + this->min_cnv_length = 2000; // Default minimum CNV length this->min_reads = 5; - this->dbscan_epsilon = 0.99; - this->dbscan_min_pts_pct = 0.0; + this->dbscan_epsilon = 0.1; + this->dbscan_min_pts_pct = 0.1; this->thread_count = 1; this->hmm_filepath = "data/wgs.hmm"; this->verbose = false; diff --git a/src/sv_object.cpp b/src/sv_object.cpp index d09bd6fe..d6f46b82 100644 --- a/src/sv_object.cpp +++ b/src/sv_object.cpp @@ -53,9 +53,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k // Set this to print cluster information for a specific SV call for debugging // This is useful for debugging purposes to see how the SVs are merged bool debug_mode = false; - int debug_start = 10414914; // Set to -1 to disable - int debug_svlen_min = 15000; - int debug_svlen_max = 16000; SVType debug_sv_type = SVType::INV; // Cluster SVs using DBSCAN for each SV type @@ -125,30 +122,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k int cluster_id = cluster.first; std::vector& cluster_sv_calls = cluster.second; - // Continue unless the debug SV call is in the cluster - // if (debug_mode && cluster_id >= 0) { - // if (!cluster_sv_calls.empty() && - // std::any_of(cluster_sv_calls.begin(), cluster_sv_calls.end(), - // [debug_start, debug_sv_type, debug_svlen_min, 
debug_svlen_max](const SVCall& sv_call) { - // const int len = std::abs(static_cast(sv_call.end - sv_call.start)); - - // const bool start_ok = (debug_start < 0 || static_cast(sv_call.start) == debug_start); - - // const bool len_ok = (debug_svlen_min == -1 || len >= debug_svlen_min) && - // (debug_svlen_max == -1 || len <= debug_svlen_max); - - // const bool type_ok = (debug_sv_type == SVType::UNKNOWN || sv_call.sv_type == debug_sv_type); - - // return start_ok && len_ok && type_ok; - // } - // )) { - // DEBUG_PRINT("DEBUG: Found SV call in noise cluster " + std::to_string(cluster_id) + " with type " + getSVTypeString(debug_sv_type)); - - // } else { - // continue; - // } - // } - // Continue if fewer than 2 SV calls in the cluster (due to CIGARCLIP filter) if (cluster_sv_calls.size() < 2) { continue; @@ -278,7 +251,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k if (debug_mode && sv_type == debug_sv_type) { DEBUG_PRINT("DEBUG: Merged SV calls for " + getSVTypeString(sv_type) + ":"); for (const auto& sv_call : merged_sv_type_calls) { - // if ((int)sv_call.start == debug_start) { if ((sv_call.end - sv_call.start) > 10000) { DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + ", type: " + getSVTypeString(sv_call.sv_type) + @@ -288,51 +260,6 @@ void mergeSVs(std::vector& sv_calls, double epsilon, int min_pts, bool k } } } - - /* - // Merge overlapping SVs by SV length - std::sort(merged_sv_type_calls.begin(), merged_sv_type_calls.end(), [](const SVCall& a, const SVCall& b) { - return a.start < b.start || (a.start == b.start && a.end < b.end); - }); - std::vector merged_sv_calls_final; - for (size_t i = 0; i < merged_sv_type_calls.size(); i++) { - SVCall& sv_call = merged_sv_type_calls[i]; - - // Merge cluster sizes if they overlap - if (i > 0 && sv_call.start <= merged_sv_type_calls[i - 1].end) { - // Keep the larger SV call (end - start) if they overlap - if ((sv_call.end - 
sv_call.start) > (merged_sv_type_calls[i - 1].end - merged_sv_type_calls[i - 1].start)) { - merged_sv_type_calls[i - 1] = sv_call; // Replace the previous SV call with the current one - } - // Keep the larger cluster size - // if (sv_call.cluster_size > merged_sv_type_calls[i - 1].cluster_size) { - // merged_sv_calls_final.push_back(sv_call); - // } - } else { - merged_sv_calls_final.push_back(sv_call); - } - } - DEBUG_PRINT("Merged " + std::to_string(merged_sv_type_calls.size()) + " overlapping SV calls into " + std::to_string(merged_sv_calls_final.size()) + " merged SV calls"); - - // Print merged SV calls for debugging - if (debug_mode) { - DEBUG_PRINT("DEBUG: Final merged SV calls for " + getSVTypeString(sv_type) + ":"); - for (const auto& sv_call : merged_sv_calls_final) { - // if ((int)sv_call.start == debug_start) { - if (sv_call.sv_type == SVType::DUP) { - DEBUG_PRINT("DEBUG: SV call at " + std::to_string(sv_call.start) + "-" + std::to_string(sv_call.end) + - ", type: " + getSVTypeString(sv_call.sv_type) + - ", length: " + std::to_string(sv_call.end - sv_call.start) + - ", cluster size: " + std::to_string(sv_call.cluster_size) + - ", likelihood: " + std::to_string(sv_call.hmm_likelihood)); - } - } - } - - // Insert merged SV calls into the final list - merged_sv_calls.insert(merged_sv_calls.end(), - merged_sv_calls_final.begin(), merged_sv_calls_final.end()); - */ merged_sv_calls.insert(merged_sv_calls.end(), merged_sv_type_calls.begin(), merged_sv_type_calls.end()); } From f4f964b3eb687c562bb80cd9de85c335ef6a6ad9 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 13:44:52 -0400 Subject: [PATCH 122/134] update environment --- environment.yml | 1 + src/main.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 1dd41ce2..f99a29cd 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,6 @@ name: contextsv channels: + - defaults - bioconda - conda-forge dependencies: diff --git a/src/main.cpp 
b/src/main.cpp index 5425a7c1..874f444f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -141,6 +141,7 @@ void printUsage(const std::string& programName) { << " --min-pts-pct Percentage of mean chr. coverage to use for DBSCAN minimum points\n" << " -e, --eth ETH file\n" << " -p, --pfb PFB file\n" + << " --assembly-gaps Assembly gaps file\n" << " --save-cnv Save CNV data\n" << " --debug Debug mode with verbose logging\n" << " --version Print version and exit\n" From 94ae67dc6b136eaff1860395cee1eb1bb7da2632 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:17:00 -0400 Subject: [PATCH 123/134] update python version --- environment.yml | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/environment.yml b/environment.yml index f99a29cd..867f41a4 100644 --- a/environment.yml +++ b/environment.yml @@ -1,10 +1,9 @@ name: contextsv channels: - - defaults - bioconda - conda-forge dependencies: - - python=3.11 - - numpy=1.26 + - python=3.10 + - numpy - htslib - pytest From 8ff85bf48122be52c7946f5aa7157580fd6aca49 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:20:47 -0400 Subject: [PATCH 124/134] update build yml --- .github/workflows/build-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 11dc3093..3ba12930 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -28,7 +28,7 @@ jobs: with: activate-environment: contextsv environment-file: environment.yml - python-version: 3.9 + python-version: 3.10 auto-activate-base: false - name: Install samtools and bcftools using sudo apt-get From d13f68caff5e2f6806538732f33882e10f5ca73a Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:29:07 -0400 Subject: [PATCH 125/134] update build yml --- .github/workflows/build-tests.yml | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git 
a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 3ba12930..ca2fac4f 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -23,18 +23,23 @@ jobs: shell: bash --login {0} run: unzip TestData.zip - - name: Set up conda environment + - name: Set up conda (miniconda) uses: conda-incubator/setup-miniconda@v2 with: - activate-environment: contextsv - environment-file: environment.yml - python-version: 3.10 - auto-activate-base: false + auto-activate-base: true + miniforge-variant: Mambaforge + miniforge-version: latest + use-mamba: true # Use mamba natively - - name: Install samtools and bcftools using sudo apt-get + - name: Configure conda channels and create environment with mamba + shell: bash --login {0} run: | - sudo apt-get update - sudo apt-get install -y samtools bcftools + conda config --remove channels defaults || true + conda config --add channels conda-forge + conda config --add channels bioconda + conda config --set channel_priority strict + mamba env create -f environment.yml + conda activate contextsv - name: Build C++ code shell: bash --login {0} # --login enables PATH variable access From 05ff3f8a78eb8690f107da135e6b28741102487c Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:33:22 -0400 Subject: [PATCH 126/134] update build yml --- .github/workflows/build-tests.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index ca2fac4f..25eebe21 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -26,10 +26,10 @@ jobs: - name: Set up conda (miniconda) uses: conda-incubator/setup-miniconda@v2 with: - auto-activate-base: true - miniforge-variant: Mambaforge + use-mamba: true + miniforge-variant: Miniforge miniforge-version: latest - use-mamba: true # Use mamba natively + auto-activate-base: true - name: Configure conda channels and create environment 
with mamba shell: bash --login {0} From 562a040070ef22b53477456718ef2b284363c0fb Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:38:40 -0400 Subject: [PATCH 127/134] update build yml --- .github/workflows/build-tests.yml | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 25eebe21..e269e900 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -23,23 +23,22 @@ jobs: shell: bash --login {0} run: unzip TestData.zip - - name: Set up conda (miniconda) + - name: Set up conda environment uses: conda-incubator/setup-miniconda@v2 with: - use-mamba: true - miniforge-variant: Miniforge - miniforge-version: latest - auto-activate-base: true + activate-environment: contextsv + environment-file: environment.yml + python-version: 3.10 + auto-activate-base: false - - name: Configure conda channels and create environment with mamba - shell: bash --login {0} + - name: Configure conda channels (remove defaults) + shell: bash -l {0} run: | - conda config --remove channels defaults || true + conda config --remove channels defaults conda config --add channels conda-forge conda config --add channels bioconda conda config --set channel_priority strict - mamba env create -f environment.yml - conda activate contextsv + conda info - name: Build C++ code shell: bash --login {0} # --login enables PATH variable access From 3b1f02a760b3797606cce4a37fe6f6c6a483330e Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:45:54 -0400 Subject: [PATCH 128/134] build yml update --- .github/workflows/build-tests.yml | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index e269e900..52b9cca1 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -23,22 +23,21 @@ jobs: shell: bash --login {0} run: 
unzip TestData.zip - - name: Set up conda environment + - name: Set up conda (Miniconda only) uses: conda-incubator/setup-miniconda@v2 with: - activate-environment: contextsv - environment-file: environment.yml + auto-activate-base: true python-version: 3.10 - auto-activate-base: false - - name: Configure conda channels (remove defaults) + - name: Configure conda channels and create environment shell: bash -l {0} run: | - conda config --remove channels defaults + conda config --remove channels defaults || true conda config --add channels conda-forge conda config --add channels bioconda conda config --set channel_priority strict - conda info + conda info # confirm the change + conda env create -f environment.yml - name: Build C++ code shell: bash --login {0} # --login enables PATH variable access From b58db5116bcf8d2b7283ed89ab62b7c40db317e5 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:50:02 -0400 Subject: [PATCH 129/134] update build yml --- .github/workflows/build-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 52b9cca1..f3358d34 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -27,7 +27,6 @@ jobs: uses: conda-incubator/setup-miniconda@v2 with: auto-activate-base: true - python-version: 3.10 - name: Configure conda channels and create environment shell: bash -l {0} From d4b6c0c63723db4482667b5b25d72062e1dc738e Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 14:53:55 -0400 Subject: [PATCH 130/134] set build env --- .github/workflows/build-tests.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index f3358d34..274ce4ae 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -41,10 +41,14 @@ jobs: - name: Build C++ code shell: bash --login {0} # --login enables PATH variable access run: | + 
source $(conda info --base)/etc/profile.d/conda.sh + conda activate contextsv make - name: Run unit tests shell: bash --login {0} run: | + source $(conda info --base)/etc/profile.d/conda.sh + conda activate contextsv mkdir -p tests/output python -m pytest -s -v tests/test_general.py From a93076af17c15f35c39cd512e1b57331083d8427 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 15:04:24 -0400 Subject: [PATCH 131/134] update build yml --- .github/workflows/build-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 274ce4ae..5100a6ad 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -43,7 +43,7 @@ jobs: run: | source $(conda info --base)/etc/profile.d/conda.sh conda activate contextsv - make + make CONDA_PREFIX=$CONDA_PREFIX - name: Run unit tests shell: bash --login {0} From 82ad9b231c930b82538c098ae88f3c89bf9adee8 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 15:36:00 -0400 Subject: [PATCH 132/134] htslib debug output --- .github/workflows/build-tests.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 5100a6ad..79d9bdd2 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -43,6 +43,8 @@ jobs: run: | source $(conda info --base)/etc/profile.d/conda.sh conda activate contextsv + echo "CONDA_PREFIX=$CONDA_PREFIX" + ls -l $CONDA_PREFIX/include/htslib make CONDA_PREFIX=$CONDA_PREFIX - name: Run unit tests From 7e2b09f062784823e7e4cd87fac7b2a6b1902510 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 15:39:14 -0400 Subject: [PATCH 133/134] update integer type --- include/sv_caller.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/sv_caller.h b/include/sv_caller.h index 997603ef..a0883e6c 100644 --- a/include/sv_caller.h +++ 
b/include/sv_caller.h @@ -22,8 +22,8 @@ class SVCaller { private: struct GenomicRegion { int tid; - hts_pos_t start; - hts_pos_t end; + int start; + int end; int query_start; int query_end; bool strand; @@ -31,8 +31,8 @@ class SVCaller { }; struct PrimaryAlignment { - hts_pos_t start; - hts_pos_t end; + int start; + int end; int query_start; int query_end; bool strand; @@ -41,8 +41,8 @@ class SVCaller { struct SuppAlignment { int tid; - hts_pos_t start; - hts_pos_t end; + int start; + int end; int query_start; int query_end; bool strand; @@ -50,18 +50,18 @@ class SVCaller { struct SplitSignature { int tid; - hts_pos_t start; - hts_pos_t end; + int start; + int end; bool strand; - hts_pos_t query_start; - hts_pos_t query_end; + int query_start; + int query_end; }; // Interval Tree Node struct IntervalNode { PrimaryAlignment region; std::string qname; - hts_pos_t max_end; // To optimize queries + int max_end; // To optimize queries std::unique_ptr left; std::unique_ptr right; From be814572c53703d6a036e56d5c733c93d7fbbc29 Mon Sep 17 00:00:00 2001 From: jonperdomo Date: Fri, 1 Aug 2025 15:49:47 -0400 Subject: [PATCH 134/134] update unit test --- .github/workflows/build-tests.yml | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-tests.yml b/.github/workflows/build-tests.yml index 79d9bdd2..5c4bbb13 100644 --- a/.github/workflows/build-tests.yml +++ b/.github/workflows/build-tests.yml @@ -52,5 +52,11 @@ jobs: run: | source $(conda info --base)/etc/profile.d/conda.sh conda activate contextsv - mkdir -p tests/output - python -m pytest -s -v tests/test_general.py + ./build/contextsv --version + ./build/contextsv --help + + # run: | + # source $(conda info --base)/etc/profile.d/conda.sh + # conda activate contextsv + # mkdir -p tests/output + # python -m pytest -s -v tests/test_general.py