diff --git a/flexiplex.c++ b/flexiplex.c++ index 9b661d7..5ca81a2 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -17,9 +17,14 @@ #include #include #include -#include +#include +#include +#include +#include +#include #include "edlib.h" +#include using namespace std; @@ -27,6 +32,17 @@ using namespace std; // e.g. 1.00.1 (dev) goes to 1.01 (release) const static string VERSION="1.02.5"; +enum SegmentType { FIXED, MATCHED, RANDOM }; + +struct Segment { + SegmentType type; + std::string pattern; + std::string name; // e.g., "BC1", "UMI", "Primer1" + std::string bc_list_name; // Only for MATCHED, refers to a key in known_barcodes_map + int buffer_size; // Only for MATCHED + int max_edit_distance; // Only for MATCHED +}; + struct PredefinedStruct { string description; string params; @@ -43,7 +59,7 @@ static const map< string, PredefinedStruct> predefinedMap = { {"10x5v2",{"10x version 2 chemistry 5'", "-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ?????????? -x TTTCTTATATGGG -f 8 -e 2"}}, {"grep",{"Simple grep-like search (edit distance up to 2)", - "-f 2 -k ? -b \'\' -u \'\' -i false"}} + "-f 2 -k ? -b '' -u '' -i false"}} }; // the help information which is printed when a user puts in the wrong @@ -67,6 +83,9 @@ void print_usage(){ cerr << " @BC_UMI#READID_+1of2\n"; cerr << " is chimeric, it will become:\n"; cerr << " @BC_UMI#READID_+1of2_C\n"; + cerr << " Reads are considered chimeric if barcodes are found on both\n"; + cerr << " forward and reverse strands.\n"; + // what about when only one strand has multiple barcodes? cerr << " -n prefix Prefix for output filenames.\n"; cerr << " -e N Maximum edit distance to barcode (default 2).\n"; cerr << " -f N Maximum edit distance to primer+polyT (default 8).\n"; @@ -74,15 +93,19 @@ void print_usage(){ cerr << " Specifying adaptor / barcode structure : \n"; cerr << " -x sequence Append flanking sequence to search for\n"; - cerr << " -b sequence Append the barcode pattern to search for\n"; - cerr << " -u sequence Append the UMI pattern to search for\n"; - cerr << " Notes:\n"; - cerr << " The order of these options matters\n"; - cerr << " ? - can be used as a wildcard\n"; + cerr << " -b sequence Append the barcode pattern to search for (named CB by default)\n"; + cerr << " -u sequence[,name:] Append the UMI pattern to search for\n"; + cerr << " (named UB, UB2, UB3... by default if name not provided)\n"; + cerr << " -B sequence[,name:][,list:][,buffer:][,max_ed:]\n"; + cerr << " Append a barcode pattern with optional parameters\n"; + cerr << " - name: specify the name of the barcode segment (defaults to BC1, BC2...)\n"; + cerr << " - list: specify a file containing expected barcodes for this segment\n"; + cerr << " - buffer: number of bases to extend search region on either side (default 5)\n"; + cerr << " - max_ed: maximum edit distance for this segment (default is global -e value)\n"; cerr << " When no search pattern x,b,u option is provided, the following default pattern is used: \n"; cerr << " primer: CTACACGACGCTCTTCCGATCT\n"; - cerr << " barcode: ????????????????\n"; - cerr << " UMI: ????????????\n"; + cerr << " barcode (CB): ????????????????\n"; + cerr << " UMI (UB): ????????????\n"; cerr << " polyT: TTTTTTTTT\n"; cerr << " which is the same as providing: \n"; cerr << " -x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ???????????? -x TTTTTTTTT\n\n"; @@ -122,13 +145,11 @@ void reverse_complement(string & seq){ //Holds the found barcode and associated information struct Barcode { - string barcode; - string umi; - int editd; + std::map features; // Map of segment name to extracted sequence int flank_editd; int flank_start; int flank_end; - bool unambiguous; + bool found_all_matched_segments; } ; struct SearchResult { @@ -142,6 +163,45 @@ struct SearchResult { bool chimeric; }; +// A thread-safe queue for our producer-consumer model +template +class ThreadSafeQueue { +private: + std::queue queue_; + mutable std::mutex mutex_; + std::condition_variable cond_; + +public: + void push(T value) { + std::lock_guard lock(mutex_); + queue_.push(std::move(value)); + cond_.notify_one(); + } + + // Pop an element from the queue. Returns true on success, false if the queue is empty and likely to remain so. + bool pop(T& value) { + std::unique_lock lock(mutex_); + // Wait until the queue is not empty. + cond_.wait(lock, [this] { return !queue_.empty(); }); + + // Although we waited, it's possible to be woken up spuriously. + // Or, another thread could have popped the element. + if (queue_.empty()) { + return false; + } + + value = std::move(queue_.front()); + queue_.pop(); + return true; + } + + void notify_all_finish() { + std::lock_guard lock(mutex_); + cond_.notify_all(); + } +}; + + // Code for fast edit distance calculation for short sequences modified from // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++ // s2 is always assumned to be the shorter string (barcode) @@ -202,86 +262,22 @@ unsigned int edit_distance(const std::string& s1, const std::string& s2, unsigne return best; // return edit distance } -// extract UMI from the read after barcode matching -std::string get_umi(const std::string &seq, - const std::vector> &search_pattern, - const std::vector &read_to_subpatterns, - const int umi_index, const int bc_index, - const bool sliding_window_match, // if true, use left_bound and endDistance - const int left_bound, - const int endDistance - ) { - - int umi_start, umi_length; - std::string umi_pad = ""; - umi_length = search_pattern[umi_index].second.length(); - - if (umi_index == -1) { - return ""; // protocol does not have UMI - - } else if (umi_index == bc_index + 1) { - // UMI right after BC - if (sliding_window_match) { - umi_start = left_bound + endDistance; - } else { - umi_start = read_to_subpatterns[bc_index] + search_pattern[bc_index].second.length(); - } - if (seq.length() < umi_start + umi_length) { - // read not long enough, pad N to the end - umi_length = seq.length() - umi_start; - umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); - } - return seq.substr(umi_start, umi_length) + umi_pad; - - } else if (umi_index == bc_index - 1) { - // UMI right before BC - int bc_start = sliding_window_match ? left_bound + endDistance : read_to_subpatterns[bc_index]; - // umi should start umi_offset bases before BC - int umi_offset = search_pattern[bc_index].second.length() + search_pattern[umi_index].second.length(); - if (bc_start < umi_offset) { - // not enough bases before BC - umi_pad = std::string(umi_offset - bc_start, 'N'); - umi_start = 0; - umi_length -= umi_offset - bc_start; - } else { - umi_start = bc_start - umi_offset; - } - return umi_pad + seq.substr(umi_start, umi_length); - } else { - // UMI not next to BC, no idea which side was truncated - if (read_to_subpatterns.size() > umi_index + 1) { - // UMI is not the last subpattern - // use the start of the next subpattern as the end of UMI - umi_start = read_to_subpatterns[umi_index]; - umi_length = min(read_to_subpatterns[umi_index + 1] - umi_start, umi_length); - umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); - return seq.substr(umi_start, umi_length) + umi_pad; - } else { - // UMI is the last subpattern - umi_start = read_to_subpatterns[umi_index]; - umi_length = min((int) seq.length() - umi_start, umi_length); - umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); - return seq.substr(umi_start, umi_length) + umi_pad; - } - } -} // Given a string 'seq' search for substring with primer and polyT sequence followed by // a targeted search in the region for barcode // Sequence seearch is performed using edlib -Barcode get_barcode(string & seq, - unordered_set *known_barcodes, - int flank_max_editd, - int barcode_max_editd, - const std::vector> &search_pattern) { - - const int OFFSET=5; //wiggle room in bases of the expected barcode start site to search. +Barcode extract_features(string & seq, + const std::vector &segments, + const std::unordered_map> *known_barcodes_map, + int global_flank_max_editd, + int global_barcode_edit_distance) { //initialise struct variables for return: - Barcode barcode; - barcode.editd=100; barcode.flank_editd=100; barcode.unambiguous = false; + Barcode barcode_result; + barcode_result.found_all_matched_segments = true; + barcode_result.flank_editd=100; //initialise edlib configuration // Use IUPAC codes @@ -298,171 +294,199 @@ Barcode get_barcode(string & seq, {'D', 'A'}, {'D', 'G'}, {'D', 'T'}, {'V', 'A'}, {'V', 'C'}, {'V', 'G'} }; - EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 28}; + EdlibAlignConfig edlibConf = {global_flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 28}; // concatenate patterns in search_pattern in insertion order - std::string search_string; - for (const auto &pair : search_pattern) { - search_string += pair.second; + std::string search_string_template; + std::vector segment_lengths; + for (const auto &segment : segments) { + search_string_template += segment.pattern; + segment_lengths.push_back(segment.pattern.length()); } // shorter than the pattern, skip search - if (seq.length() < search_string.length()) { - return barcode; + if (seq.length() < search_string_template.length()) { + return barcode_result; } //search for the concatenated pattern - EdlibAlignResult result = edlibAlign(search_string.c_str(), search_string.length(), seq.c_str(), seq.length(), edlibConf); - if(result.status != EDLIB_STATUS_OK | result.numLocations==0 ){ + EdlibAlignResult result = edlibAlign(search_string_template.c_str(), search_string_template.length(), seq.c_str(), seq.length(), edlibConf); + if(result.status != EDLIB_STATUS_OK || result.numLocations==0 ){ edlibFreeAlignResult(result); - return(barcode); // no match found - return + return(barcode_result); // no match found - return } //fill in info about found primer and polyT location - barcode.flank_editd=result.editDistance; - barcode.flank_start=result.startLocations[0]; - barcode.flank_end=result.endLocations[0]; - - // Extract sub-patterns from aligment directly - std::vector subpattern_lengths; - for (const auto &pair : search_pattern) { - subpattern_lengths.push_back(pair.second.length()); - } + barcode_result.flank_editd=result.editDistance; + barcode_result.flank_start=result.startLocations[0]; + barcode_result.flank_end=result.endLocations[0]; - std::vector subpattern_ends; - subpattern_ends.resize(subpattern_lengths.size()); - std::partial_sum(subpattern_lengths.begin(), subpattern_lengths.end(), subpattern_ends.begin()); + std::vector segment_template_ends; + segment_template_ends.resize(segment_lengths.size()); + std::partial_sum(segment_lengths.begin(), segment_lengths.end(), segment_template_ends.begin()); - vector read_to_subpatterns; - read_to_subpatterns.reserve(subpattern_ends.size() + 1); - read_to_subpatterns.emplace_back(barcode.flank_start); + vector read_to_segment_starts; + read_to_segment_starts.reserve(segment_template_ends.size() + 1); + read_to_segment_starts.emplace_back(barcode_result.flank_start); // initialise pointers - int i_read = barcode.flank_start; + int i_read = barcode_result.flank_start; int i_pattern = 0; - int i_subpattern = 0; + int i_segment = 0; // walk through edlib aligment // 0 for match - // 1 for insertion to target - // 2 for insertion to query + // 1 for insertion to target (read) + // 2 for insertion to query (template) // 3 for mismatch std::vector alignment_vector(result.alignment, result.alignment + result.alignmentLength); for (const auto& value : alignment_vector) { - if (value != 1) { + if (value != EDLIB_EDOP_INSERT) { // If not an insertion in the read i_read++; } - if (value != 2) { + if (value != EDLIB_EDOP_DELETE) { // If not an insertion in the template i_pattern++; } - if (i_pattern >= subpattern_ends[i_subpattern]) { - read_to_subpatterns.emplace_back(i_read); - i_subpattern++; + if (i_segment < segment_template_ends.size() && i_pattern >= segment_template_ends[i_segment]) { + read_to_segment_starts.emplace_back(i_read); + i_segment++; } } - edlibFreeAlignResult(result); + // Storage for refined positions from matched segments + std::vector refined_segment_starts(segments.size(), -1); + std::vector refined_segment_ends(segments.size(), -1); - // Work out the index of BC and UMI in the pattern - // TODO: Should this be done in main to be more efficient? - // TODO: Handle edge cases where BC / UMI is not speficied in the pattern - int bc_index = -1, umi_index = -1; - auto it_pattern = - std::find_if(search_pattern.begin(), search_pattern.end(), - [](const std::pair &pair) { - return pair.first == "BC"; - }); - if (it_pattern != search_pattern.end()) { - bc_index = std::distance(search_pattern.begin(), it_pattern); - } else { - // error - } - it_pattern = - std::find_if(search_pattern.begin(), search_pattern.end(), - [](const std::pair &pair) { - return pair.first == "UMI"; - }); - if (it_pattern != search_pattern.end()) { - umi_index = std::distance(search_pattern.begin(), it_pattern); - } else { - // error - } - - //if not checking against known list of barcodes, return sequence after the primer - //also check for a perfect match straight up as this will save computer later. - // - // read_to_subpatterns[subpattern_index] gives start of subpattern in the read - std::string exact_bc = seq.substr( - read_to_subpatterns[bc_index], - search_pattern[bc_index].second.length()); - if (known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){ - barcode.barcode=exact_bc; - barcode.editd=0; - barcode.unambiguous=true; - barcode.umi = get_umi(seq, search_pattern, read_to_subpatterns, umi_index, bc_index, false, 0, 0); - return(barcode); - } - - // otherwise widen our search space and the look for matches with errors - - int left_bound = max( - read_to_subpatterns[bc_index] - OFFSET, // widen the search by using an OFFSET - 0 // set a maximum starting character index of 0 - ); - int max_length = search_pattern[bc_index].second.length() + 2 * OFFSET; + // Pass 1: Process MATCHED segments + for (size_t i = 0; i < segments.size(); ++i) { + const Segment& s = segments[i]; + if (s.type != MATCHED) continue; - std::string barcode_seq = seq.substr(left_bound, max_length); + int segment_read_start = read_to_segment_starts[i]; + int segment_read_end = (i + 1 < read_to_segment_starts.size()) ? read_to_segment_starts[i+1] : barcode_result.flank_end; - //iterate over all the known barcodes, checking each sequentially - unordered_set::iterator known_barcodes_itr=known_barcodes->begin(); - unsigned int editDistance, endDistance; + const unordered_set* current_bclist = nullptr; - for(; known_barcodes_itr != known_barcodes->end(); known_barcodes_itr++){ - search_string = *known_barcodes_itr; //known barcode to check against - editDistance = edit_distance(barcode_seq, search_string, endDistance, barcode_max_editd); - - if (editDistance == barcode.editd) { - barcode.unambiguous = false; - } else if (editDistance < barcode.editd && editDistance <= barcode_max_editd) { // if best so far, update - barcode.unambiguous = true; - barcode.editd = editDistance; - barcode.barcode = *known_barcodes_itr; - barcode.umi = get_umi(seq, search_pattern, read_to_subpatterns, umi_index, bc_index, true, left_bound, endDistance); + if (!s.bc_list_name.empty()) { + auto it = known_barcodes_map->find(s.bc_list_name); + if (it != known_barcodes_map->end()) { + current_bclist = &(it->second); + } + } else if (known_barcodes_map->count("global")) { + current_bclist = &(known_barcodes_map->at("global")); + } - //if perfect match is found we're done. - if (editDistance == 0) { - return(barcode); + if (current_bclist && !current_bclist->empty()) { + // Apply buffer for search + int search_start = max(0, segment_read_start - s.buffer_size); + int search_end = min((int)seq.length(), segment_read_end + s.buffer_size); + std::string search_region = seq.substr(search_start, search_end - search_start); + + unsigned int best_edit_distance = 100; // Higher than any reasonable max_ed + unsigned int best_end_distance = 0; + std::string best_barcode_match = ""; + bool current_segment_unambiguous = false; + + for (const auto& known_bc : *current_bclist) { + unsigned int editDistance, endDistance; + editDistance = edit_distance(search_region, known_bc, endDistance, s.max_edit_distance); + + if (editDistance == best_edit_distance) { + current_segment_unambiguous = false; + } else if (editDistance < best_edit_distance && editDistance <= s.max_edit_distance) { + current_segment_unambiguous = true; + best_edit_distance = editDistance; + best_barcode_match = known_bc; + best_end_distance = endDistance; + + // TODO: multiplex perfect match can exits due to the buffering around barcode segments + if (editDistance == 0) { + break; // perfect match found + } + } + } + if (best_edit_distance <= s.max_edit_distance && current_segment_unambiguous) { + barcode_result.features[s.name] = best_barcode_match; + + // Refined positions + refined_segment_ends[i] = search_start + best_end_distance - 1; + // Approximate start based on match length (assuming not many indels) + refined_segment_starts[i] = refined_segment_ends[i] - best_barcode_match.length() + 1; + } else { + barcode_result.found_all_matched_segments = false; } + } else { // No barcode list found + cerr << "Error: No barcode list found for segment " << s.name << ".\n"; + exit(1); } + } + if (!barcode_result.found_all_matched_segments) { + return(barcode_result); // return early if any matched segments failed } - return(barcode); //return the best matched barcode and associated information -} -//search a read for one or more barcodes (parent function that calls get_barcode) -vector big_barcode_search(string & sequence, unordered_set & known_barcodes, int max_flank_editd, int max_editd, const std::vector> &search_pattern) { - - vector return_vec; //vector of all the barcodes found - - //search for barcode - Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); - if(result.editd<=max_editd && result.unambiguous) //add to return vector if edit distance small enough - return_vec.push_back(result); - - //if a result was found, mask out the flanking sequence and search again in case there are more. - if(return_vec.size()>0){ - string masked_sequence = sequence; - for(int i=0; i masked_res; - masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); - return_vec.insert(return_vec.end(),masked_res.begin(),masked_res.end()); //add to result + // Pass 2: Process RANDOM and other segments + for (size_t i = 0; i < segments.size(); ++i) { + const Segment& s = segments[i]; + if (s.type == MATCHED) continue; + + int extract_start = read_to_segment_starts[i]; + int extract_end = (i + 1 < read_to_segment_starts.size()) ? read_to_segment_starts[i+1] : barcode_result.flank_end; + + if (s.type == RANDOM) { + // Check if there is a MATCHED segment immediately before + if (i > 0 && segments[i-1].type == MATCHED && refined_segment_ends[i-1] != -1) { + extract_start = refined_segment_ends[i-1] + 1; + extract_end = extract_start + s.pattern.length(); + } + // Otherwise check if there is one immediately after + else if (i + 1 < segments.size() && segments[i+1].type == MATCHED && refined_segment_starts[i+1] != -1) { + extract_end = refined_segment_starts[i+1]; + extract_start = extract_end - s.pattern.length(); + } + + int actual_extract_start = max(0, extract_start); + int actual_extract_end = min((int)seq.length(), extract_end); + + if (actual_extract_end < actual_extract_start) actual_extract_end = actual_extract_start; + + std::string extracted_seq = seq.substr(actual_extract_start, actual_extract_end - actual_extract_start); + barcode_result.features[s.name] = extracted_seq; + } } - return(return_vec); + return(barcode_result); //return the best matched barcode and associated information +} + +vector big_barcode_search(const string & sequence, + const std::unordered_map> *known_barcodes_map, + int global_flank_max_editd, + int global_barcode_edit_distance, + const std::vector &segments) { + vector return_vec; + string masked_sequence = sequence; // Work on a copy + + while (true) { + Barcode result = extract_features(masked_sequence, segments, known_barcodes_map, global_flank_max_editd, global_barcode_edit_distance); + if (result.flank_editd <= global_flank_max_editd && result.found_all_matched_segments) { + return_vec.push_back(result); + + // Mask the found region to prevent re-finding it + // result.flank_end is inclusive, so length is end - start + 1 + int match_length = result.flank_end - result.flank_start + 1; + + if (match_length > 0) { + masked_sequence.replace(result.flank_start, match_length, string(match_length, 'X')); + } else { + break; // Should not happen for valid match, but prevents infinite loop + } + } else { + break; + } + } + return return_vec; } + // utility function to check true/false input options bool get_bool_opt_arg(string value){ transform(value.begin(), value.end(), value.begin(), ::tolower); @@ -478,17 +502,27 @@ bool get_bool_opt_arg(string value){ } // print information about barcodes -void print_stats(string read_id, vector & vec_bc, ostream & out_stream){ +void print_stats(const string& read_id, const vector & vec_bc, ostream & out_stream, const std::vector &segments){ for(int b=0; bsecond; + } else { + out_stream << "\t"; + } + } + } + out_stream << "\t" << vec_bc.at(b).flank_editd << "\n"; } } -void print_line(string id, string read, string quals, bool is_fastq, ostream & out_stream){ +void print_line(const string& id, const string& read, const string& quals, ostream & out_stream){ + + //flag for read format + bool is_fastq=!(quals==""); //no quality scores passed = fasta //output to the new read file if(is_fastq) @@ -503,122 +537,136 @@ void print_line(string id, string read, string quals, bool is_fastq, ostream & o } //print fastq or fasta lines.. -void print_read(string read_id, string read, string qual, - vector & vec_bc, string prefix, - bool split, unordered_set & found_barcodes, +void print_read(const string& read_id, const string& read, string qual, + const vector & vec_bc, const string& prefix, + bool split, map & barcode_file_streams, + mutex & cout_mutex, mutex & file_mutex, bool trim_barcodes, - bool chimeric, bool is_fastq){ + bool chimeric, + const std::vector &segments){ auto vec_size = vec_bc.size(); - //loop over the barcodes found... usually will just be one for (int b = 0; b < vec_size; b++) { - - // format the new read id. Using FLAMES format. - stringstream ss; - ss << (b + 1) << "of" << vec_size; + stringstream ss_id_suffix; + ss_id_suffix << (b + 1) << "of" << vec_size; if (chimeric) { - ss << "_" << "C"; + ss_id_suffix << "_" << "C"; } - string barcode = vec_bc.at(b).barcode; - // also add the proper FASTQ way: \tCB:Z:barcode\tUB:Z:umi - string new_read_id = - barcode + "_" + vec_bc.at(b).umi + "#" + read_id + ss.str() + "\tCB:Z:" + barcode + "\tUB:Z:" + vec_bc[b].umi; + stringstream new_read_id_ss; + string primary_barcode_for_filename = ""; + + for (const auto& s : segments) { + if (s.type != FIXED) { + auto it = vec_bc.at(b).features.find(s.name); + if (it != vec_bc.at(b).features.end()) { + new_read_id_ss << it->second << "_"; + if (s.type == MATCHED && primary_barcode_for_filename.empty()) { + primary_barcode_for_filename = it->second; + } + } else { + new_read_id_ss << "NA_"; + } + } + } + string new_read_id_prefix = new_read_id_ss.str(); + if (!new_read_id_prefix.empty() && new_read_id_prefix.back() == '_') { + new_read_id_prefix.pop_back(); + } - // work out the start and end base in case multiple barcodes - // note: read_start+1, see issue #63 - int read_start = vec_bc.at(b).flank_end + 1; - int read_length = read.length() - read_start; + string new_read_id = new_read_id_prefix + "#" + read_id + ss_id_suffix.str(); - for (int f = 0; f < vec_size; f++) { - int temp_read_length = vec_bc.at(f).flank_start - read_start; - if (temp_read_length >= 0 && temp_read_length < read_length) - read_length = temp_read_length; + for (const auto& s : segments) { + if (s.type != FIXED) { + auto it = vec_bc.at(b).features.find(s.name); + if (it != vec_bc.at(b).features.end()) { + new_read_id += "\t" + s.name + ":Z:" + it->second; + } + } } - string qual_new = ""; // don't trim the quality scores if it's a fasta file + int read_start = vec_bc.at(b).flank_end + 1; + // work out the start and end base in case multiple barcodes + int next_barcode_start = read.length(); + for (int k = 0; k < vec_size; k++) { + if (b == k) continue; + if (vec_bc.at(k).flank_start >= read_start) { + if (vec_bc.at(k).flank_start < next_barcode_start) { + next_barcode_start = vec_bc.at(k).flank_start; + } + } + } + + int read_length = next_barcode_start - read_start; + + string qual_new = ""; if (qual != "") { - if((read_start+read_length)>(qual.length())) { - cerr << "WARNING: sequence and quality lengths diff for read: " << read_id << ". Ignoring read." << endl; - return; + if((read_start+read_length)>(qual.length())){ + lock_guard lock(cout_mutex); + cerr << "WARNING: sequence and quality lengths diff for read: " << read_id << ". Ignoring read." << endl; + return; } qual_new = qual.substr(read_start, read_length); } string read_new = read.substr(read_start, read_length); - if (b == 0 && !trim_barcodes) { // override if read shouldn't be cut + if (b == 0 && !trim_barcodes) { new_read_id = read_id; read_new = read; qual_new = qual; - b = vec_size; // force loop to exit after this iteration } - // Skip reads that become empty after trimming if (read_new.length() == 0) { continue; } - if (split) { // to a file if splitting by barcode - string outname = prefix + "_" + barcode + "."; - if (qual == "") - outname += "fasta"; - else - outname += "fastq"; - ofstream outstream; - if (found_barcodes.insert(barcode).second) - outstream.open(outname); // override file if this is the first read - // for the barcode - else - outstream.open(outname, ofstream::app); // remove file if this is the - // first read for the barcode - print_line(new_read_id, read_new, qual_new, is_fastq, outstream); - outstream.close(); + if (split) { + lock_guard lock(file_mutex); + string barcode_for_split = primary_barcode_for_filename.empty() ? "UNKNOWN" : primary_barcode_for_filename; + if (barcode_file_streams.find(barcode_for_split) == barcode_file_streams.end()) { + string outname = prefix + "_" + barcode_for_split + (qual.empty() ? ".fasta" : ".fastq"); + barcode_file_streams[barcode_for_split].open(outname); + } + print_line(new_read_id, read_new, qual_new, barcode_file_streams[barcode_for_split]); } else { - print_line(new_read_id, read_new, qual_new, is_fastq, std::cout); + lock_guard lock(cout_mutex); + print_line(new_read_id, read_new, qual_new, std::cout); } } } -// separated out from main so that this can be run with threads -void search_read(vector & reads, unordered_set & known_barcodes, - int flank_edit_distance, int edit_distance, - const std::vector> &search_pattern) { - - for (int r=0; r &work_queue, + ThreadSafeQueue &results_queue, + const std::unordered_map> &known_barcodes_map, + int flank_edit_distance, + int global_barcode_edit_distance, + const std::vector &search_pattern +) { + SearchResult* sr_ptr; + while (work_queue.pop(sr_ptr) && sr_ptr) { + //forward search + sr_ptr->vec_bc_for = big_barcode_search(sr_ptr->line, &known_barcodes_map, flank_edit_distance, global_barcode_edit_distance, search_pattern); + + // get reverse complement + sr_ptr->rev_line = sr_ptr->line; + reverse_complement(sr_ptr->rev_line); + + //Check the reverse complement of the read + sr_ptr->vec_bc_rev = big_barcode_search(sr_ptr->rev_line, &known_barcodes_map, flank_edit_distance, global_barcode_edit_distance, search_pattern); + + sr_ptr->count = sr_ptr->vec_bc_for.size() + sr_ptr->vec_bc_rev.size(); + sr_ptr->chimeric = !sr_ptr->vec_bc_for.empty() && !sr_ptr->vec_bc_rev.empty(); + + results_queue.push(sr_ptr); + } + // Push a sentinel to the results queue to signal this worker is done + results_queue.push(nullptr); } + // check if file already exists bool file_exists(const std::string &filename) { std::ifstream infile(filename); @@ -633,7 +681,6 @@ int main(int argc, char **argv) { // Variables to store user options // Set these to their defaults - int expected_cells = 0; //(d) int edit_distance = 2; //(e) int flank_edit_distance = 8; //(f) @@ -643,31 +690,37 @@ int main(int argc, char **argv) { string out_filename_prefix = "flexiplex"; //(n) bool split_file_by_barcode = false; //(s) - bool remove_barcodes = true; //(r) + bool remove_barcodes = true; //(i) bool print_chimeric = false; //(c) - std::vector> search_pattern; + std::vector search_pattern; - // Set of known barcodes - unordered_set known_barcodes; - unordered_set found_barcodes; + // Set of known barcodes (can be multiple, identified by name) + std::unordered_map> known_barcodes_map; + // Flag to track if any -B segment specified its own barcode list + bool individual_bclist_specified = false; + + // Counters for automatic naming + int barcode_count = 0; + int umi_count = 0; // threads int n_threads = 1; - /*** Pass command line option *****/ + /*** Pass command line option *****/\ int c; int params = 1; ifstream file; string line; vector myArgs(argv, argv + argc); + vector allocated_args; + while ((c = getopt(myArgs.size(), myArgs.data(), - "d:k:i:b:u:x:e:f:n:s:hp:c:")) != EOF) { + "d:k:i:b:u:x:e:f:n:s:hp:c:B:")) != EOF) { switch (c) { - case 'd': { // d=predefined list of settings for various search/barcode - // schemes + case 'd': { // d=predefined list of settings for various search/barcode schemes if (predefinedMap.find(optarg) == predefinedMap.end()) { cerr << "Unknown argument, " << optarg << ", passed to option -d. See list below for options.\n"; @@ -680,41 +733,47 @@ int main(int argc, char **argv) { vector newArgv; while (settingsStream >> token) { // code created with the help of chatGPT token.erase(remove(token.begin(), token.end(), '\''), token.end()); - char *newArg = - new char[token.size() + - 1]; // +1 for null terminator //need to delete these later. + char *newArg = new char[token.size() + 1]; // +1 for null terminator strcpy(newArg, token.c_str()); newArgv.push_back(newArg); // Append the token to the new argv } params += 2; - myArgs.insert(myArgs.begin() += params, newArgv.begin(), newArgv.end()); + myArgs.insert(myArgs.begin() + params, newArgv.begin(), newArgv.end()); + allocated_args.insert(allocated_args.end(), newArgv.begin(), newArgv.end()); break; } - case 'k': { // k=list of known barcodes + case 'k': { // k=list of known barcodes (global barcode list) + if (individual_bclist_specified) { + cerr << "Error: Cannot use -k when -B segments have specified their own barcode files.\n"; + print_usage(); + exit(1); + } string file_name(optarg); string bc; + unordered_set global_bclist; /**** READ BARCODE LIST FROM FILE ******/ file.open(file_name); if (!(file.good())) { // if the string given isn't a file cerr << "Reading known barcodes from string: " << file_name << "\n"; stringstream bc_list(file_name); while (getline(bc_list, bc, ',')) // tokenize - known_barcodes.insert(bc); + global_bclist.insert(bc); } else { // otherwise get the barcodes from the file.. cerr << "Setting known barcodes from " << file_name << "\n"; while (getline(file, line)) { istringstream line_stream(line); line_stream >> bc; - known_barcodes.insert(bc); + global_bclist.insert(bc); } file.close(); } - cerr << "Number of known barcodes: " << known_barcodes.size() << "\n"; - if (known_barcodes.size() == 0) { + cerr << "Number of known barcodes in global list: " << global_bclist.size() << "\n"; + if (global_bclist.empty()) { print_usage(); exit(1); // case barcode file is empty } + known_barcodes_map["global"] = global_bclist; params += 2; break; } @@ -726,7 +785,7 @@ int main(int argc, char **argv) { } case 'e': { edit_distance = atoi(optarg); - cerr << "Setting max barcode edit distance to " << edit_distance << "\n"; + cerr << "Setting max barcode edit distance to " << edit_distance << " (global default).\n"; params += 2; break; } @@ -737,22 +796,174 @@ int main(int argc, char **argv) { params += 2; break; } - // x, u, b arguments inserts subpatterns to search_pattern - case 'x': { - search_pattern.push_back(std::make_pair("Unnamed Seq", optarg)); - cerr << "Adding flank sequence to search for: " << optarg << "\n"; + case 'x': { // Fixed segment + Segment s; + s.type = FIXED; + s.pattern = optarg; + s.name = "Fixed_" + to_string(search_pattern.size()); + search_pattern.push_back(s); + cerr << "Adding fixed sequence to search for: " << optarg << "\n"; + params += 2; + break; + } + case 'u': { // Random (UMI) segment + Segment s; + s.type = RANDOM; + s.name = ""; // Will be set later if not provided + s.buffer_size = 0; // Default buffer for UMI + + string arg_str(optarg); + size_t current_pos = 0; + size_t next_comma = arg_str.find(','); + + // First part is always the pattern + s.pattern = arg_str.substr(current_pos, next_comma - current_pos); + cerr << "Setting UMI to search for: " << s.pattern; + + while (next_comma != string::npos) { + current_pos = next_comma + 1; + next_comma = arg_str.find(',', current_pos); + string param = arg_str.substr(current_pos, next_comma - current_pos); + + size_t colon_pos = param.find(':'); + if (colon_pos == string::npos) { + cerr << "Error: Malformed -u parameter: " << param << "\n"; + print_usage(); + exit(1); + } + string key = param.substr(0, colon_pos); + string value = param.substr(colon_pos + 1); + + if (key == "name") { + s.name = value; + cerr << " (name: " << s.name << ")"; + } else if (key == "buffer") { + s.buffer_size = stoi(value); + cerr << " (buffer: " << s.buffer_size << ")"; + } else { + cerr << "Error: Unknown -u parameter: " << key << "\n"; + print_usage(); + exit(1); + } + } + + // Assign default name if not provided + if (s.name.empty()) { + umi_count++; + if (umi_count == 1) { + s.name = "UB"; + } else { + s.name = "UB" + to_string(umi_count); + } + } else { + umi_count++; // Still increment counter for user-named UMIs + } + + search_pattern.push_back(s); + cerr << " -> named: " << s.name << "\n"; params += 2; break; } - case 'u': { - search_pattern.push_back(std::make_pair("UMI", optarg)); - cerr << "Setting UMI to search for: " << optarg << "\n"; + case 'b': { // Old-style Barcode segment + Segment s; + s.type = MATCHED; + s.pattern = optarg; + barcode_count++; + s.name = (barcode_count == 1) ? "CB" : "CB" + to_string(barcode_count); + s.buffer_size = 5; // Default buffer for barcode + s.max_edit_distance = edit_distance; // Will use global -e + s.bc_list_name = "global"; // Will use global -k + search_pattern.push_back(s); + cerr << "Setting old-style barcode to search for: " << optarg + << " -> named: " << s.name + << ". Will use global barcode list (-k) and edit distance (-e).\n"; params += 2; break; } - case 'b': { - search_pattern.push_back(std::make_pair("BC", optarg)); - cerr << "Setting barcode to search for: " << optarg << "\n"; + case 'B': { // New-style Barcode segment + Segment s; + s.type = MATCHED; + s.name = ""; // Will be set later if not provided + s.buffer_size = 5; // Default buffer + s.max_edit_distance = edit_distance; // Default to global -e + + string arg_str(optarg); + size_t current_pos = 0; + size_t next_comma = arg_str.find(','); + + // First part is always the pattern + s.pattern = arg_str.substr(current_pos, next_comma - current_pos); + cerr << "Adding new-style barcode segment: " << s.pattern; + + while (next_comma != string::npos) { + current_pos = next_comma + 1; + next_comma = arg_str.find(',', current_pos); + string param = arg_str.substr(current_pos, next_comma - current_pos); + + size_t colon_pos = param.find(':'); + if (colon_pos == string::npos) { + cerr << "Error: Malformed -B parameter: " << param << "\n"; + print_usage(); + exit(1); + } + string key = param.substr(0, colon_pos); + string value = param.substr(colon_pos + 1); + + if (key == "name") { + s.name = value; + cerr << " (name: " << s.name << ")"; + } else if (key == "list") { + s.bc_list_name = value; + individual_bclist_specified = true; + // Load barcode file + unordered_set current_bclist; + file.open(value); + if (!(file.good())) { + cerr << "Error: Unable to open barcode file: " << value << "\n"; + print_usage(); + exit(1); + } + while (getline(file, line)) { + istringstream line_stream(line); + string bc_entry; + line_stream >> bc_entry; + current_bclist.insert(bc_entry); + } + file.close(); + if (current_bclist.empty()) { + cerr << "Error: barcode file " << value << " is empty.\n"; + print_usage(); + exit(1); + } + known_barcodes_map[value] = current_bclist; + cerr << " (barcode list: " << value << ")"; + } else if (key == "buffer") { + s.buffer_size = stoi(value); + cerr << " (buffer: " << s.buffer_size << ")"; + } else if (key == "max_ed") { + s.max_edit_distance = stoi(value); + cerr << " (max_ed: " << s.max_edit_distance << ")"; + } else { + cerr << "Error: Unknown -B parameter: " << key << "\n"; + print_usage(); + exit(1); + } + } + + // Assign default name if not provided + if (s.name.empty()) { + barcode_count++; + if (barcode_count == 1) { + s.name = "CB"; + } else { + s.name = "CB" + to_string(barcode_count); + } + } else { + barcode_count++; // Still increment counter for user-named barcodes + } + + search_pattern.push_back(s); + cerr << " -> named: " << s.name << "\n"; params += 2; break; } @@ -771,18 +982,12 @@ int main(int argc, char **argv) { split_file_by_barcode = get_bool_opt_arg(optarg); cerr << "Split read output into separate files by barcode: " << split_file_by_barcode << "\n"; - int max_split_bc = 50; - if (known_barcodes.size() > max_split_bc) { - cerr << "Too many barcodes to split into separate files: " - << known_barcodes.size() << "> " << max_split_bc << "\n"; - split_file_by_barcode = false; - } + params += 2; break; } case 'c': { print_chimeric = get_bool_opt_arg(optarg); - params += 2; break; } @@ -799,15 +1004,42 @@ int main(int argc, char **argv) { } } - // default case when no x, u, b is speficied + // free memory from -d option + for (auto arg : allocated_args) { + delete[] arg; + } + + // default case when no x, u, b, B is speficied if (search_pattern.empty()) { cerr << "Using default search pattern: " << endl; - search_pattern = {{"primer", "CTACACGACGCTCTTCCGATCT"}, - {"BC", std::string(16, '?')}, - {"UMI", std::string(12, '?')}, - {"polyA", std::string(9, 'T')}}; - for (auto i : search_pattern) - std::cerr << i.first << ": " << i.second << "\n"; + Segment s_primer, s_bc, s_umi, s_polya; + + s_primer.type = FIXED; + s_primer.pattern = "CTACACGACGCTCTTCCGATCT"; + s_primer.name = "primer"; + search_pattern.push_back(s_primer); + + s_bc.type = MATCHED; + s_bc.pattern = std::string(16, '?'); + s_bc.name = "CB"; + s_bc.buffer_size = 5; + s_bc.max_edit_distance = edit_distance; + s_bc.bc_list_name = "global"; + search_pattern.push_back(s_bc); + + s_umi.type = RANDOM; + s_umi.pattern = std::string(12, '?'); + s_umi.name = "UB"; + s_umi.buffer_size = 0; + search_pattern.push_back(s_umi); + + s_polya.type = FIXED; + s_polya.pattern = std::string(9, 'T'); + s_polya.name = "polyA"; + search_pattern.push_back(s_polya); + + for (auto const& s : search_pattern) + std::cerr << s.name << ": " << s.pattern << "\n"; } cerr << "For usage information type: flexiplex -h" << endl; @@ -832,22 +1064,27 @@ int main(int argc, char **argv) { } /********* FIND BARCODE IN READS ********/ - string sequence; - int bc_count = 0; - int r_count = 0; - int multi_bc_count = 0; + atomic r_count(0); + atomic bc_count(0); + atomic multi_bc_count(0); ofstream out_stat_file; out_stat_filename = out_filename_prefix + "_" + out_stat_filename; out_bc_filename = out_filename_prefix + "_" + out_bc_filename; - if (known_barcodes.size() > 0) { + if (!known_barcodes_map.empty()) { if (file_exists(out_stat_filename)) { cerr << "File " << out_stat_filename << " already exists, overwriting." << endl; } out_stat_file.open(out_stat_filename, std::ios::trunc); - out_stat_file << "Read\tCellBarcode\tFlankEditDist\tBarcodeEditDist\tUMI\n"; + out_stat_file << "Read"; + for (const auto& s : search_pattern) { + if (s.type != FIXED) { + out_stat_file << "\t" << s.name; + } + } + out_stat_file << "\tFlankEditDist\n"; } cerr << "Searching for barcodes...\n"; @@ -859,146 +1096,133 @@ int main(int argc, char **argv) { if (getline(*in, read_id_line)) { // check the first line for file type if (read_id_line[0] == '>') { is_fastq = false; - } else if (read_id_line[0] == '@') { // fasta - ignore! - } else { + } else if (read_id_line[0] != '@') { cerr << "Unknown read format... exiting" << endl; exit(1); } + } else { // Empty file + return 0; } - while (getline(*in, line)) { - const int buffer_size = 2000; // number of reads to pass to one thread. + // --- Threading and Pipeline Setup --- + ThreadSafeQueue work_queue; + ThreadSafeQueue results_queue; + vector workers; - vector> sr_v(n_threads); - for (int i = 0; i < n_threads; i++) { - sr_v[i] = vector(buffer_size); - } + // Mutexes for synchronized output + mutex cout_mutex; + mutex file_mutex; + map barcode_file_streams; - vector threads(n_threads); + // Start worker threads + for (int i = 0; i < n_threads; ++i) { + workers.emplace_back(search_worker, ref(work_queue), ref(results_queue), ref(known_barcodes_map), + flank_edit_distance, edit_distance, ref(search_pattern)); + } - // get n_threads*buffer number of reads.. - for (int t = 0; t < n_threads; t++) { - for (int b = 0; b < buffer_size; b++) { - SearchResult &sr = sr_v[t][b]; + // --- Producer Stage --- + // The main thread reads the file and pushes work to the queue + do { + SearchResult *sr = new SearchResult(); + istringstream line_stream(read_id_line); + line_stream >> sr->read_id; + sr->read_id.erase(0, 1); + + if (!is_fastq) { // fasta (account for multi-lines per read) + string buffer_string; + while (getline(*in, buffer_string) && buffer_string[0] != '>') + sr->line += buffer_string; + read_id_line = buffer_string; + } else { // fastq (get quality scores) + getline(*in, sr->line); + getline(*in, line); // skip '+' line + getline(*in, sr->qual_scores); + getline(*in, read_id_line); + } + work_queue.push(sr); + } while (*in); - sr.line = line; - string read_id; + // Push sentinels to the work queue to signal completion + for (int i = 0; i < n_threads; ++i) { + work_queue.push(nullptr); + } - istringstream line_stream(read_id_line); - line_stream >> sr.read_id; - sr.read_id.erase(0, 1); + // --- Consumer/Writer Stage --- + // The main thread now processes results + int finished_workers = 0; + while(finished_workers < n_threads) { + SearchResult* sr_ptr; + results_queue.pop(sr_ptr); - if (!is_fastq) { // fasta (account for multi-lines per read) - string buffer_string; - while (getline(*in, buffer_string) && buffer_string[0] != '>') - sr.line += buffer_string; - read_id_line = buffer_string; - } else { // fastq (get quality scores) - for (int s = 0; s < 2; s++) { - getline(*in, sr.qual_scores); + if (sr_ptr == nullptr) { + finished_workers++; + continue; + } + + unique_ptr sr(sr_ptr); // Manage memory + + r_count++; + if (sr->count > 0) bc_count++; + if (sr->chimeric) multi_bc_count++; + + // Count barcodes from features map (first MATCHED segment) + for (const auto& bc : sr->vec_bc_for) { + for (const auto& s : search_pattern) { + if (s.type == MATCHED) { + auto it = bc.features.find(s.name); + if (it != bc.features.end()) { + barcode_counts[it->second]++; + break; // Only count first barcode for statistics + } + } } - getline(*in, read_id_line); - } - - r_count++; // progress counter - if (// display for every 10,000 reads first - (r_count < 100000 && r_count % 10000 == 0) || - // and then every 100,000 reads after - (r_count % 100000 == 0)) { - cerr << r_count / ((double)1000000) << " million reads processed.." - << endl; - } - - // this is quite ugly, must be a better way to do this.. - if (b == buffer_size - 1 && t == n_threads - 1) { - break; // if it's the last in the chunk don't getline as this happens - // in the while statement - } else if (!getline(*in, line)) { // case we are at the end of the - // reads. - sr_v[t].resize(b + 1); - threads[t] = std::thread(search_read, ref(sr_v[t]), - ref(known_barcodes), flank_edit_distance, - edit_distance, ref(search_pattern)); - for (int t2 = t + 1; t2 < n_threads; t2++) { - sr_v[t2].resize(0); + } + for (const auto& bc : sr->vec_bc_rev) { + for (const auto& s : search_pattern) { + if (s.type == MATCHED) { + auto it = bc.features.find(s.name); + if (it != bc.features.end()) { + barcode_counts[it->second]++; + break; // Only count first barcode for statistics + } + } } - - goto print_result; // advance the line - } } - // send reads to the thread - threads[t] = - std::thread(search_read, ref(sr_v[t]), ref(known_barcodes), - flank_edit_distance, edit_distance, ref(search_pattern)); - } - print_result: - // START print_result LABEL... + if (known_barcodes_map.size() != 0) { + print_stats(sr->read_id, sr->vec_bc_for, out_stat_file, search_pattern); + print_stats(sr->read_id, sr->vec_bc_rev, out_stat_file, search_pattern); - // loop over the threads and print out ther results - for (int t = 0; t < sr_v.size(); t++) { - if (sr_v[t].size() > 0) - threads[t].join(); // wait for the threads to finish before printing - - for (int r = 0; r < sr_v[t].size(); r++) { // loop over the reads - - for (int b = 0; b < sr_v[t][r].vec_bc_for.size(); b++) - barcode_counts[sr_v[t][r].vec_bc_for.at(b).barcode]++; - for (int b = 0; b < sr_v[t][r].vec_bc_rev.size(); b++) - barcode_counts[sr_v[t][r].vec_bc_rev.at(b).barcode]++; - - if (sr_v[t][r].count > 0) - bc_count++; - if (sr_v[t][r].chimeric) { - multi_bc_count++; - } + print_read(sr->read_id + "_+", sr->line, sr->qual_scores, sr->vec_bc_for, out_filename_prefix, + split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric, search_pattern); - if (known_barcodes.size() != - 0) { // if we are just looking for all possible barcodes don't - // output reads etc. - - print_stats(sr_v[t][r].read_id, sr_v[t][r].vec_bc_for, out_stat_file); - print_stats(sr_v[t][r].read_id, sr_v[t][r].vec_bc_rev, out_stat_file); - - print_read( - sr_v[t][r].read_id + "_+", - sr_v[t][r].line, - sr_v[t][r].qual_scores, - sr_v[t][r].vec_bc_for, - out_filename_prefix, - split_file_by_barcode, - found_barcodes, - remove_barcodes, - print_chimeric && sr_v[t][r].chimeric, // include chimeric information if requested - is_fastq - ); - - // case we just want to print read once if multiple bc found. - if (remove_barcodes || sr_v[t][r].vec_bc_for.size() == 0) { - // for a chimeric read, we first need to reverse the quality scores - reverse(sr_v[t][r].qual_scores.begin(), sr_v[t][r].qual_scores.end()); - - print_read( - sr_v[t][r].read_id + "_-", - sr_v[t][r].rev_line, - sr_v[t][r].qual_scores, - sr_v[t][r].vec_bc_rev, - out_filename_prefix, - split_file_by_barcode, - found_barcodes, - remove_barcodes, - print_chimeric && sr_v[t][r].chimeric, // include chimeric information if requested - is_fastq - ); + if (remove_barcodes || sr->vec_bc_for.empty()) { + reverse(sr->qual_scores.begin(), sr->qual_scores.end()); + print_read(sr->read_id + "_-", sr->rev_line, sr->qual_scores, sr->vec_bc_rev, out_filename_prefix, + split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric, search_pattern); } - } } - } + + if ((r_count < 100000 && r_count % 10000 == 0) || (r_count % 100000 == 0)) { + cerr << r_count / 1000000.0 << " million reads processed.." << endl; + } + } - // END print_result LABEL + // Join all worker threads + for (auto& t : workers) { + t.join(); } reads_ifs.close(); + if (known_barcodes_map.size() > 0) { + out_stat_file.close(); + } + // Close all the split files + for (auto& pair : barcode_file_streams) { + if (pair.second.is_open()) { + pair.second.close(); + } + } // print summary statistics cerr << "Number of reads processed: " << r_count << "\n"; @@ -1008,8 +1232,7 @@ int main(int argc, char **argv) { cerr << "All done!" << endl; cerr << "If you like Flexiplex, please cite us! https://doi.org/10.1093/bioinformatics/btae102" << endl; - if (known_barcodes.size() > 0) { - out_stat_file.close(); + if (known_barcodes_map.size() > 0) { return (0); } @@ -1028,21 +1251,23 @@ int main(int argc, char **argv) { return l.first < r.first; }); - vector hist(bc_vec[0].second); - ofstream out_bc_file; - - out_bc_file.open(out_bc_filename); + if (!bc_vec.empty()) { + vector hist(bc_vec[0].second); + ofstream out_bc_file; - for (auto const &bc_pair : bc_vec) { - out_bc_file << bc_pair.first << "\t" << bc_pair.second << "\n"; - hist[bc_pair.second - 1]++; - } + out_bc_file.open(out_bc_filename); - out_bc_file.close(); + for (auto const &bc_pair : bc_vec) { + out_bc_file << bc_pair.first << "\t" << bc_pair.second << "\n"; + if (bc_pair.second > 0) { + hist[bc_pair.second - 1]++; + } + } + out_bc_file.close(); - cout << "Reads\tBarcodes" - << "\n"; - for (int i = hist.size() - 1; i >= 0; i--) - cout << i + 1 << "\t" << hist[i] << "\n"; + cout << "Reads\tBarcodes" << "\n"; + for (int i = hist.size() - 1; i >= 0; i--) + if(hist[i] > 0) + cout << i + 1 << "\t" << hist[i] << "\n"; + } } -