From e122cc1bd3a557baa8e510a88a4ad136e686b1a6 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Fri, 16 Jan 2026 14:14:33 +1100 Subject: [PATCH 1/3] change to thread safe queue --- flexiplex.c++ | 489 +++++++++++++++++++++++++------------------------- 1 file changed, 248 insertions(+), 241 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 9b661d7..58d62d6 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -17,9 +17,14 @@ #include #include #include -#include +#include +#include +#include +#include +#include #include "edlib.h" +#include using namespace std; @@ -43,7 +48,7 @@ static const map< string, PredefinedStruct> predefinedMap = { {"10x5v2",{"10x version 2 chemistry 5'", "-x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ?????????? -x TTTCTTATATGGG -f 8 -e 2"}}, {"grep",{"Simple grep-like search (edit distance up to 2)", - "-f 2 -k ? -b \'\' -u \'\' -i false"}} + "-f 2 -k ? -b '' -u '' -i false"}} }; // the help information which is printed when a user puts in the wrong @@ -142,6 +147,45 @@ struct SearchResult { bool chimeric; }; +// A thread-safe queue for our producer-consumer model +template +class ThreadSafeQueue { +private: + std::queue queue_; + mutable std::mutex mutex_; + std::condition_variable cond_; + +public: + void push(T value) { + std::lock_guard lock(mutex_); + queue_.push(std::move(value)); + cond_.notify_one(); + } + + // Pop an element from the queue. Returns true on success, false if the queue is empty and likely to remain so. + bool pop(T& value) { + std::unique_lock lock(mutex_); + // Wait until the queue is not empty. + cond_.wait(lock, [this] { return !queue_.empty(); }); + + // Although we waited, it's possible to be woken up spuriously. + // Or, another thread could have popped the element. + if (queue_.empty()) { + return false; + } + + value = std::move(queue_.front()); + queue_.pop(); + return true; + } + + void notify_all_finish() { + std::lock_guard lock(mutex_); + cond_.notify_all(); + } +}; + + // Code for fast edit distance calculation for short sequences modified from // https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++ // s2 is always assumned to be the shorter string (barcode) @@ -254,7 +298,7 @@ std::string get_umi(const std::string &seq, // UMI is not the last subpattern // use the start of the next subpattern as the end of UMI umi_start = read_to_subpatterns[umi_index]; - umi_length = min(read_to_subpatterns[umi_index + 1] - umi_start, umi_length); + umi_length = min((int)read_to_subpatterns[umi_index + 1] - umi_start, umi_length); umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); return seq.substr(umi_start, umi_length) + umi_pad; } else { @@ -272,7 +316,7 @@ std::string get_umi(const std::string &seq, // Sequence seearch is performed using edlib Barcode get_barcode(string & seq, - unordered_set *known_barcodes, + const unordered_set *known_barcodes, int flank_max_editd, int barcode_max_editd, const std::vector> &search_pattern) { @@ -313,7 +357,7 @@ Barcode get_barcode(string & seq, //search for the concatenated pattern EdlibAlignResult result = edlibAlign(search_string.c_str(), search_string.length(), seq.c_str(), seq.length(), edlibConf); - if(result.status != EDLIB_STATUS_OK | result.numLocations==0 ){ + if(result.status != EDLIB_STATUS_OK || result.numLocations==0 ){ edlibFreeAlignResult(result); return(barcode); // no match found - return } //fill in info about found primer and polyT location @@ -353,7 +397,7 @@ Barcode get_barcode(string & seq, if (value != 2) { i_pattern++; } - if (i_pattern >= subpattern_ends[i_subpattern]) { + if (i_subpattern < subpattern_ends.size() && i_pattern >= subpattern_ends[i_subpattern]) { read_to_subpatterns.emplace_back(i_read); i_subpattern++; } @@ -405,15 +449,18 @@ Barcode get_barcode(string & seq, // otherwise widen our search space and the look for matches with errors int left_bound = max( - read_to_subpatterns[bc_index] - OFFSET, // widen the search by using an OFFSET + (int)read_to_subpatterns[bc_index] - OFFSET, // widen the search by using an OFFSET 0 // set a maximum starting character index of 0 ); int max_length = search_pattern[bc_index].second.length() + 2 * OFFSET; + if (left_bound + max_length > seq.length()) { + max_length = seq.length() - left_bound; + } std::string barcode_seq = seq.substr(left_bound, max_length); //iterate over all the known barcodes, checking each sequentially - unordered_set::iterator known_barcodes_itr=known_barcodes->begin(); + auto known_barcodes_itr=known_barcodes->begin(); unsigned int editDistance, endDistance; for(; known_barcodes_itr != known_barcodes->end(); known_barcodes_itr++){ @@ -439,30 +486,29 @@ Barcode get_barcode(string & seq, } //search a read for one or more barcodes (parent function that calls get_barcode) -vector big_barcode_search(string & sequence, unordered_set & known_barcodes, int max_flank_editd, int max_editd, const std::vector> &search_pattern) { - - vector return_vec; //vector of all the barcodes found - - //search for barcode - Barcode result=get_barcode(sequence,&known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); - if(result.editd<=max_editd && result.unambiguous) //add to return vector if edit distance small enough - return_vec.push_back(result); - - //if a result was found, mask out the flanking sequence and search again in case there are more. - if(return_vec.size()>0){ - string masked_sequence = sequence; - for(int i=0; i masked_res; - masked_res=big_barcode_search(masked_sequence,known_barcodes,max_flank_editd,max_editd, search_pattern); //,ss); - return_vec.insert(return_vec.end(),masked_res.begin(),masked_res.end()); //add to result - } - return(return_vec); - +vector big_barcode_search(const string & sequence, const unordered_set & known_barcodes, int max_flank_editd, int max_editd, const std::vector> &search_pattern) { + vector return_vec; //vector of all the barcodes found + string masked_sequence = sequence; // Work on a copy + + while (true) { + //search for barcode + Barcode result=get_barcode(masked_sequence, &known_barcodes, max_flank_editd, max_editd, search_pattern); + if(result.editd <= max_editd && result.unambiguous) { //add to return vector if edit distance small enough + return_vec.push_back(result); + // Mask the found region to prevent re-finding it + int flank_length = result.flank_end - result.flank_start; + if (flank_length > 0) + masked_sequence.replace(result.flank_start, flank_length, string(flank_length, 'X')); + else // if flank_length is 0, we risk an infinite loop + break; + } else { + break; // No more barcodes found + } + } + return(return_vec); } + // utility function to check true/false input options bool get_bool_opt_arg(string value){ transform(value.begin(), value.end(), value.begin(), ::tolower); @@ -478,7 +524,7 @@ bool get_bool_opt_arg(string value){ } // print information about barcodes -void print_stats(string read_id, vector & vec_bc, ostream & out_stream){ +void print_stats(const string& read_id, const vector & vec_bc, ostream & out_stream){ for(int b=0; b & vec_bc, ostream & out_stream) } } -void print_line(string id, string read, string quals, bool is_fastq, ostream & out_stream){ +void print_line(const string& id, const string& read, const string& quals, ostream & out_stream){ + + //flag for read format + bool is_fastq=!(quals==""); //no quality scores passed = fasta //output to the new read file if(is_fastq) @@ -503,11 +552,12 @@ void print_line(string id, string read, string quals, bool is_fastq, ostream & o } //print fastq or fasta lines.. -void print_read(string read_id, string read, string qual, - vector & vec_bc, string prefix, - bool split, unordered_set & found_barcodes, +void print_read(const string& read_id, const string& read, string qual, + const vector & vec_bc, const string& prefix, + bool split, map & barcode_file_streams, + mutex & cout_mutex, mutex & file_mutex, bool trim_barcodes, - bool chimeric, bool is_fastq){ + bool chimeric){ auto vec_size = vec_bc.size(); @@ -524,7 +574,7 @@ void print_read(string read_id, string read, string qual, string barcode = vec_bc.at(b).barcode; // also add the proper FASTQ way: \tCB:Z:barcode\tUB:Z:umi string new_read_id = - barcode + "_" + vec_bc.at(b).umi + "#" + read_id + ss.str() + "\tCB:Z:" + barcode + "\tUB:Z:" + vec_bc[b].umi; + barcode + "_" + vec_bc.at(b).umi + "#" + read_id + ss.str() + "\tCB:Z:" + barcode + "\tUB:Z:" + vec_bc.at(b).umi; // work out the start and end base in case multiple barcodes // note: read_start+1, see issue #63 @@ -539,9 +589,10 @@ void print_read(string read_id, string read, string qual, string qual_new = ""; // don't trim the quality scores if it's a fasta file if (qual != "") { - if((read_start+read_length)>(qual.length())) { - cerr << "WARNING: sequence and quality lengths diff for read: " << read_id << ". Ignoring read." << endl; - return; + if((read_start+read_length)>(qual.length())){ + lock_guard lock(cout_mutex); + cerr << "WARNING: sequence and quality lengths diff for read: " << read_id << ". Ignoring read." << endl; + return; } qual_new = qual.substr(read_start, read_length); } @@ -559,66 +610,54 @@ void print_read(string read_id, string read, string qual, continue; } - if (split) { // to a file if splitting by barcode - string outname = prefix + "_" + barcode + "."; - if (qual == "") - outname += "fasta"; - else - outname += "fastq"; - ofstream outstream; - if (found_barcodes.insert(barcode).second) - outstream.open(outname); // override file if this is the first read - // for the barcode - else - outstream.open(outname, ofstream::app); // remove file if this is the - // first read for the barcode - print_line(new_read_id, read_new, qual_new, is_fastq, outstream); - outstream.close(); + if (split) { // to a file if spliting by barcode + lock_guard lock(file_mutex); + // Check if the file stream is already open + if (barcode_file_streams.find(barcode) == barcode_file_streams.end()) { + // If not, open it for the first time and add it to the map + string outname = prefix + "_" + barcode + (qual.empty() ? ".fasta" : ".fastq"); + barcode_file_streams[barcode].open(outname); + } + // Write to the already open stream + print_line(new_read_id, read_new, qual_new, barcode_file_streams[barcode]); } else { - print_line(new_read_id, read_new, qual_new, is_fastq, std::cout); + lock_guard lock(cout_mutex); + print_line(new_read_id, read_new, qual_new, std::cout); } } } -// separated out from main so that this can be run with threads -void search_read(vector & reads, unordered_set & known_barcodes, - int flank_edit_distance, int edit_distance, - const std::vector> &search_pattern) { - - for (int r=0; r &work_queue, + ThreadSafeQueue &results_queue, + const unordered_set &known_barcodes, + int flank_edit_distance, + int edit_distance, + const std::vector> &search_pattern +) { + SearchResult* sr_ptr; + while (work_queue.pop(sr_ptr) && sr_ptr) { + //forward search + sr_ptr->vec_bc_for = big_barcode_search(sr_ptr->line, known_barcodes, flank_edit_distance, edit_distance, search_pattern); + + // get reverse complement + sr_ptr->rev_line = sr_ptr->line; + reverse_complement(sr_ptr->rev_line); + + //Check the reverse complement of the read + sr_ptr->vec_bc_rev = big_barcode_search(sr_ptr->rev_line, known_barcodes, flank_edit_distance, edit_distance, search_pattern); + + sr_ptr->count = sr_ptr->vec_bc_for.size() + sr_ptr->vec_bc_rev.size(); + sr_ptr->chimeric = !sr_ptr->vec_bc_for.empty() && !sr_ptr->vec_bc_rev.empty(); + + results_queue.push(sr_ptr); + } + // Push a sentinel to the results queue to signal this worker is done + results_queue.push(nullptr); } + // check if file already exists bool file_exists(const std::string &filename) { std::ifstream infile(filename); @@ -633,7 +672,6 @@ int main(int argc, char **argv) { // Variables to store user options // Set these to their defaults - int expected_cells = 0; //(d) int edit_distance = 2; //(e) int flank_edit_distance = 8; //(f) @@ -643,31 +681,31 @@ int main(int argc, char **argv) { string out_filename_prefix = "flexiplex"; //(n) bool split_file_by_barcode = false; //(s) - bool remove_barcodes = true; //(r) + bool remove_barcodes = true; //(i) bool print_chimeric = false; //(c) std::vector> search_pattern; // Set of known barcodes unordered_set known_barcodes; - unordered_set found_barcodes; // threads int n_threads = 1; - /*** Pass command line option *****/ + /*** Pass command line option *****/\ int c; int params = 1; ifstream file; string line; vector myArgs(argv, argv + argc); + vector allocated_args; + while ((c = getopt(myArgs.size(), myArgs.data(), "d:k:i:b:u:x:e:f:n:s:hp:c:")) != EOF) { switch (c) { - case 'd': { // d=predefined list of settings for various search/barcode - // schemes + case 'd': { // d=predefined list of settings for various search/barcode schemes if (predefinedMap.find(optarg) == predefinedMap.end()) { cerr << "Unknown argument, " << optarg << ", passed to option -d. See list below for options.\n"; @@ -680,14 +718,13 @@ int main(int argc, char **argv) { vector newArgv; while (settingsStream >> token) { // code created with the help of chatGPT token.erase(remove(token.begin(), token.end(), '\''), token.end()); - char *newArg = - new char[token.size() + - 1]; // +1 for null terminator //need to delete these later. + char *newArg = new char[token.size() + 1]; // +1 for null terminator strcpy(newArg, token.c_str()); newArgv.push_back(newArg); // Append the token to the new argv } params += 2; - myArgs.insert(myArgs.begin() += params, newArgv.begin(), newArgv.end()); + myArgs.insert(myArgs.begin() + params, newArgv.begin(), newArgv.end()); + allocated_args.insert(allocated_args.end(), newArgv.begin(), newArgv.end()); break; } case 'k': { // k=list of known barcodes @@ -782,7 +819,6 @@ int main(int argc, char **argv) { } case 'c': { print_chimeric = get_bool_opt_arg(optarg); - params += 2; break; } @@ -799,6 +835,11 @@ int main(int argc, char **argv) { } } + // free memory from -d option + for (auto arg : allocated_args) { + delete[] arg; + } + // default case when no x, u, b is speficied if (search_pattern.empty()) { cerr << "Using default search pattern: " << endl; @@ -806,7 +847,7 @@ int main(int argc, char **argv) { {"BC", std::string(16, '?')}, {"UMI", std::string(12, '?')}, {"polyA", std::string(9, 'T')}}; - for (auto i : search_pattern) + for (auto const& i : search_pattern) std::cerr << i.first << ": " << i.second << "\n"; } @@ -832,10 +873,9 @@ int main(int argc, char **argv) { } /********* FIND BARCODE IN READS ********/ - string sequence; - int bc_count = 0; - int r_count = 0; - int multi_bc_count = 0; + atomic r_count(0); + atomic bc_count(0); + atomic multi_bc_count(0); ofstream out_stat_file; out_stat_filename = out_filename_prefix + "_" + out_stat_filename; @@ -859,146 +899,112 @@ int main(int argc, char **argv) { if (getline(*in, read_id_line)) { // check the first line for file type if (read_id_line[0] == '>') { is_fastq = false; - } else if (read_id_line[0] == '@') { // fasta - ignore! - } else { + } else if (read_id_line[0] != '@') { cerr << "Unknown read format... exiting" << endl; exit(1); } + } else { // Empty file + return 0; } - while (getline(*in, line)) { - const int buffer_size = 2000; // number of reads to pass to one thread. - - vector> sr_v(n_threads); - for (int i = 0; i < n_threads; i++) { - sr_v[i] = vector(buffer_size); - } - - vector threads(n_threads); + // --- Threading and Pipeline Setup --- + ThreadSafeQueue work_queue; + ThreadSafeQueue results_queue; + vector workers; - // get n_threads*buffer number of reads.. - for (int t = 0; t < n_threads; t++) { - for (int b = 0; b < buffer_size; b++) { - SearchResult &sr = sr_v[t][b]; + // Mutexes for synchronized output + mutex cout_mutex; + mutex file_mutex; + map barcode_file_streams; - sr.line = line; - string read_id; + // Start worker threads + for (int i = 0; i < n_threads; ++i) { + workers.emplace_back(search_worker, ref(work_queue), ref(results_queue), ref(known_barcodes), + flank_edit_distance, edit_distance, ref(search_pattern)); + } - istringstream line_stream(read_id_line); - line_stream >> sr.read_id; - sr.read_id.erase(0, 1); + // --- Producer Stage --- + // The main thread reads the file and pushes work to the queue + do { + SearchResult *sr = new SearchResult(); + istringstream line_stream(read_id_line); + line_stream >> sr->read_id; + sr->read_id.erase(0, 1); + + if (!is_fastq) { // fasta (account for multi-lines per read) + string buffer_string; + while (getline(*in, buffer_string) && buffer_string[0] != '>') + sr->line += buffer_string; + read_id_line = buffer_string; + } else { // fastq (get quality scores) + getline(*in, sr->line); + getline(*in, line); // skip '+' line + getline(*in, sr->qual_scores); + getline(*in, read_id_line); + } + work_queue.push(sr); + } while (*in); - if (!is_fastq) { // fasta (account for multi-lines per read) - string buffer_string; - while (getline(*in, buffer_string) && buffer_string[0] != '>') - sr.line += buffer_string; - read_id_line = buffer_string; - } else { // fastq (get quality scores) - for (int s = 0; s < 2; s++) { - getline(*in, sr.qual_scores); - } - getline(*in, read_id_line); - } + // Push sentinels to the work queue to signal completion + for (int i = 0; i < n_threads; ++i) { + work_queue.push(nullptr); + } - r_count++; // progress counter - if (// display for every 10,000 reads first - (r_count < 100000 && r_count % 10000 == 0) || - // and then every 100,000 reads after - (r_count % 100000 == 0)) { - cerr << r_count / ((double)1000000) << " million reads processed.." - << endl; - } + // --- Consumer/Writer Stage --- + // The main thread now processes results + int finished_workers = 0; + while(finished_workers < n_threads) { + SearchResult* sr_ptr; + results_queue.pop(sr_ptr); - // this is quite ugly, must be a better way to do this.. - if (b == buffer_size - 1 && t == n_threads - 1) { - break; // if it's the last in the chunk don't getline as this happens - // in the while statement - } else if (!getline(*in, line)) { // case we are at the end of the - // reads. - sr_v[t].resize(b + 1); - threads[t] = std::thread(search_read, ref(sr_v[t]), - ref(known_barcodes), flank_edit_distance, - edit_distance, ref(search_pattern)); - for (int t2 = t + 1; t2 < n_threads; t2++) { - sr_v[t2].resize(0); - } - - goto print_result; // advance the line - } + if (sr_ptr == nullptr) { + finished_workers++; + continue; } - // send reads to the thread - threads[t] = - std::thread(search_read, ref(sr_v[t]), ref(known_barcodes), - flank_edit_distance, edit_distance, ref(search_pattern)); - } + + unique_ptr sr(sr_ptr); // Manage memory - print_result: - // START print_result LABEL... + r_count++; + if (sr->count > 0) bc_count++; + if (sr->chimeric) multi_bc_count++; - // loop over the threads and print out ther results - for (int t = 0; t < sr_v.size(); t++) { - if (sr_v[t].size() > 0) - threads[t].join(); // wait for the threads to finish before printing + for (const auto& bc : sr->vec_bc_for) barcode_counts[bc.barcode]++; + for (const auto& bc : sr->vec_bc_rev) barcode_counts[bc.barcode]++; - for (int r = 0; r < sr_v[t].size(); r++) { // loop over the reads + if (known_barcodes.size() != 0) { + print_stats(sr->read_id, sr->vec_bc_for, out_stat_file); + print_stats(sr->read_id, sr->vec_bc_rev, out_stat_file); - for (int b = 0; b < sr_v[t][r].vec_bc_for.size(); b++) - barcode_counts[sr_v[t][r].vec_bc_for.at(b).barcode]++; - for (int b = 0; b < sr_v[t][r].vec_bc_rev.size(); b++) - barcode_counts[sr_v[t][r].vec_bc_rev.at(b).barcode]++; + print_read(sr->read_id + "_+", sr->line, sr->qual_scores, sr->vec_bc_for, out_filename_prefix, + split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric); - if (sr_v[t][r].count > 0) - bc_count++; - if (sr_v[t][r].chimeric) { - multi_bc_count++; - } - - if (known_barcodes.size() != - 0) { // if we are just looking for all possible barcodes don't - // output reads etc. - - print_stats(sr_v[t][r].read_id, sr_v[t][r].vec_bc_for, out_stat_file); - print_stats(sr_v[t][r].read_id, sr_v[t][r].vec_bc_rev, out_stat_file); - - print_read( - sr_v[t][r].read_id + "_+", - sr_v[t][r].line, - sr_v[t][r].qual_scores, - sr_v[t][r].vec_bc_for, - out_filename_prefix, - split_file_by_barcode, - found_barcodes, - remove_barcodes, - print_chimeric && sr_v[t][r].chimeric, // include chimeric information if requested - is_fastq - ); - - // case we just want to print read once if multiple bc found. - if (remove_barcodes || sr_v[t][r].vec_bc_for.size() == 0) { - // for a chimeric read, we first need to reverse the quality scores - reverse(sr_v[t][r].qual_scores.begin(), sr_v[t][r].qual_scores.end()); - - print_read( - sr_v[t][r].read_id + "_-", - sr_v[t][r].rev_line, - sr_v[t][r].qual_scores, - sr_v[t][r].vec_bc_rev, - out_filename_prefix, - split_file_by_barcode, - found_barcodes, - remove_barcodes, - print_chimeric && sr_v[t][r].chimeric, // include chimeric information if requested - is_fastq - ); + if (remove_barcodes || sr->vec_bc_for.empty()) { + reverse(sr->qual_scores.begin(), sr->qual_scores.end()); + print_read(sr->read_id + "_-", sr->rev_line, sr->qual_scores, sr->vec_bc_rev, out_filename_prefix, + split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric); } - } } - } + + if ((r_count < 100000 && r_count % 10000 == 0) || (r_count % 100000 == 0)) { + cerr << r_count / 1000000.0 << " million reads processed.." << endl; + } + } - // END print_result LABEL + // Join all worker threads + for (auto& t : workers) { + t.join(); } reads_ifs.close(); + if (known_barcodes.size() > 0) { + out_stat_file.close(); + } + // Close all the split files + for (auto& pair : barcode_file_streams) { + if (pair.second.is_open()) { + pair.second.close(); + } + } // print summary statistics cerr << "Number of reads processed: " << r_count << "\n"; @@ -1009,7 +1015,6 @@ int main(int argc, char **argv) { cerr << "If you like Flexiplex, please cite us! https://doi.org/10.1093/bioinformatics/btae102" << endl; if (known_barcodes.size() > 0) { - out_stat_file.close(); return (0); } @@ -1028,21 +1033,23 @@ int main(int argc, char **argv) { return l.first < r.first; }); - vector hist(bc_vec[0].second); - ofstream out_bc_file; - - out_bc_file.open(out_bc_filename); + if (!bc_vec.empty()) { + vector hist(bc_vec[0].second); + ofstream out_bc_file; - for (auto const &bc_pair : bc_vec) { - out_bc_file << bc_pair.first << "\t" << bc_pair.second << "\n"; - hist[bc_pair.second - 1]++; - } + out_bc_file.open(out_bc_filename); - out_bc_file.close(); + for (auto const &bc_pair : bc_vec) { + out_bc_file << bc_pair.first << "\t" << bc_pair.second << "\n"; + if (bc_pair.second > 0) { + hist[bc_pair.second - 1]++; + } + } + out_bc_file.close(); - cout << "Reads\tBarcodes" - << "\n"; - for (int i = hist.size() - 1; i >= 0; i--) - cout << i + 1 << "\t" << hist[i] << "\n"; + cout << "Reads\tBarcodes" << "\n"; + for (int i = hist.size() - 1; i >= 0; i--) + if(hist[i] > 0) + cout << i + 1 << "\t" << hist[i] << "\n"; + } } - From f14b99d8297bf7d7d19fcc9fe8bf7d59595df7bf Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Fri, 16 Jan 2026 17:45:16 +1100 Subject: [PATCH 2/3] allow multiple barcode / umi --- flexiplex.c++ | 751 +++++++++++++++++++++++++++++++------------------- 1 file changed, 461 insertions(+), 290 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 58d62d6..75ea5e6 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -32,6 +32,17 @@ using namespace std; // e.g. 1.00.1 (dev) goes to 1.01 (release) const static string VERSION="1.02.5"; +enum SegmentType { FIXED, MATCHED, RANDOM }; + +struct Segment { + SegmentType type; + std::string pattern; + std::string name; // e.g., "BC1", "UMI", "Primer1" + std::string bc_list_name; // Only for MATCHED, refers to a key in known_barcodes_map + int buffer_size; // Only for MATCHED + int max_edit_distance; // Only for MATCHED +}; + struct PredefinedStruct { string description; string params; @@ -72,6 +83,9 @@ void print_usage(){ cerr << " @BC_UMI#READID_+1of2\n"; cerr << " is chimeric, it will become:\n"; cerr << " @BC_UMI#READID_+1of2_C\n"; + cerr << " Reads are considered chimeric if barcodes are found on both\n"; + cerr << " forward and reverse strands.\n"; + // what about when only one strand has multiple barcodes? cerr << " -n prefix Prefix for output filenames.\n"; cerr << " -e N Maximum edit distance to barcode (default 2).\n"; cerr << " -f N Maximum edit distance to primer+polyT (default 8).\n"; @@ -79,15 +93,19 @@ void print_usage(){ cerr << " Specifying adaptor / barcode structure : \n"; cerr << " -x sequence Append flanking sequence to search for\n"; - cerr << " -b sequence Append the barcode pattern to search for\n"; - cerr << " -u sequence Append the UMI pattern to search for\n"; - cerr << " Notes:\n"; - cerr << " The order of these options matters\n"; - cerr << " ? - can be used as a wildcard\n"; + cerr << " -b sequence Append the barcode pattern to search for (named CB by default)\n"; + cerr << " -u sequence[,name:] Append the UMI pattern to search for\n"; + cerr << " (named UB, UB2, UB3... by default if name not provided)\n"; + cerr << " -B sequence[,name:][,list:][,buffer:][,max_ed:]\n"; + cerr << " Append a barcode pattern with optional parameters\n"; + cerr << " - name: specify the name of the barcode segment (defaults to BC1, BC2...)\n"; + cerr << " - list: specify a file containing expected barcodes for this segment\n"; + cerr << " - buffer: number of bases to extend search region on either side (default 5)\n"; + cerr << " - max_ed: maximum edit distance for this segment (default is global -e value)\n"; cerr << " When no search pattern x,b,u option is provided, the following default pattern is used: \n"; cerr << " primer: CTACACGACGCTCTTCCGATCT\n"; - cerr << " barcode: ????????????????\n"; - cerr << " UMI: ????????????\n"; + cerr << " barcode (CB): ????????????????\n"; + cerr << " UMI (UB): ????????????\n"; cerr << " polyT: TTTTTTTTT\n"; cerr << " which is the same as providing: \n"; cerr << " -x CTACACGACGCTCTTCCGATCT -b ???????????????? -u ???????????? -x TTTTTTTTT\n\n"; @@ -127,13 +145,11 @@ void reverse_complement(string & seq){ //Holds the found barcode and associated information struct Barcode { - string barcode; - string umi; - int editd; + std::map features; // Map of segment name to extracted sequence int flank_editd; int flank_start; int flank_end; - bool unambiguous; + bool found_all_matched_segments; } ; struct SearchResult { @@ -246,86 +262,22 @@ unsigned int edit_distance(const std::string& s1, const std::string& s2, unsigne return best; // return edit distance } -// extract UMI from the read after barcode matching -std::string get_umi(const std::string &seq, - const std::vector> &search_pattern, - const std::vector &read_to_subpatterns, - const int umi_index, const int bc_index, - const bool sliding_window_match, // if true, use left_bound and endDistance - const int left_bound, - const int endDistance - ) { - - int umi_start, umi_length; - std::string umi_pad = ""; - umi_length = search_pattern[umi_index].second.length(); - - if (umi_index == -1) { - return ""; // protocol does not have UMI - - } else if (umi_index == bc_index + 1) { - // UMI right after BC - if (sliding_window_match) { - umi_start = left_bound + endDistance; - } else { - umi_start = read_to_subpatterns[bc_index] + search_pattern[bc_index].second.length(); - } - if (seq.length() < umi_start + umi_length) { - // read not long enough, pad N to the end - umi_length = seq.length() - umi_start; - umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); - } - return seq.substr(umi_start, umi_length) + umi_pad; - - } else if (umi_index == bc_index - 1) { - // UMI right before BC - int bc_start = sliding_window_match ? left_bound + endDistance : read_to_subpatterns[bc_index]; - // umi should start umi_offset bases before BC - int umi_offset = search_pattern[bc_index].second.length() + search_pattern[umi_index].second.length(); - if (bc_start < umi_offset) { - // not enough bases before BC - umi_pad = std::string(umi_offset - bc_start, 'N'); - umi_start = 0; - umi_length -= umi_offset - bc_start; - } else { - umi_start = bc_start - umi_offset; - } - return umi_pad + seq.substr(umi_start, umi_length); - } else { - // UMI not next to BC, no idea which side was truncated - if (read_to_subpatterns.size() > umi_index + 1) { - // UMI is not the last subpattern - // use the start of the next subpattern as the end of UMI - umi_start = read_to_subpatterns[umi_index]; - umi_length = min((int)read_to_subpatterns[umi_index + 1] - umi_start, umi_length); - umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); - return seq.substr(umi_start, umi_length) + umi_pad; - } else { - // UMI is the last subpattern - umi_start = read_to_subpatterns[umi_index]; - umi_length = min((int) seq.length() - umi_start, umi_length); - umi_pad = std::string(search_pattern[umi_index].second.length() - umi_length, 'N'); - return seq.substr(umi_start, umi_length) + umi_pad; - } - } -} // Given a string 'seq' search for substring with primer and polyT sequence followed by // a targeted search in the region for barcode // Sequence seearch is performed using edlib -Barcode get_barcode(string & seq, - const unordered_set *known_barcodes, - int flank_max_editd, - int barcode_max_editd, - const std::vector> &search_pattern) { - - const int OFFSET=5; //wiggle room in bases of the expected barcode start site to search. +Barcode extract_features(string & seq, + const std::vector &segments, + const std::unordered_map> *known_barcodes_map, + int global_flank_max_editd, + int global_barcode_edit_distance) { //initialise struct variables for return: - Barcode barcode; - barcode.editd=100; barcode.flank_editd=100; barcode.unambiguous = false; + Barcode barcode_result; + barcode_result.found_all_matched_segments = true; + barcode_result.flank_editd=100; //initialise edlib configuration // Use IUPAC codes @@ -342,170 +294,160 @@ Barcode get_barcode(string & seq, {'D', 'A'}, {'D', 'G'}, {'D', 'T'}, {'V', 'A'}, {'V', 'C'}, {'V', 'G'} }; - EdlibAlignConfig edlibConf = {flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 28}; + EdlibAlignConfig edlibConf = {global_flank_max_editd, EDLIB_MODE_HW, EDLIB_TASK_PATH, additionalEqualities, 28}; // concatenate patterns in search_pattern in insertion order - std::string search_string; - for (const auto &pair : search_pattern) { - search_string += pair.second; + std::string search_string_template; + std::vector segment_lengths; + for (const auto &segment : segments) { + search_string_template += segment.pattern; + segment_lengths.push_back(segment.pattern.length()); } // shorter than the pattern, skip search - if (seq.length() < search_string.length()) { - return barcode; + if (seq.length() < search_string_template.length()) { + return barcode_result; } //search for the concatenated pattern - EdlibAlignResult result = edlibAlign(search_string.c_str(), search_string.length(), seq.c_str(), seq.length(), edlibConf); - if(result.status != EDLIB_STATUS_OK || result.numLocations==0 ){ + EdlibAlignResult result = edlibAlign(search_string_template.c_str(), search_string_template.length(), seq.c_str(), seq.length(), edlibConf); + if(result.status != EDLIB_STATUS_OK || result.numLocations==0 || result.editDistance == -1 ){ edlibFreeAlignResult(result); - return(barcode); // no match found - return + return(barcode_result); // no match found - return } //fill in info about found primer and polyT location - barcode.flank_editd=result.editDistance; - barcode.flank_start=result.startLocations[0]; - barcode.flank_end=result.endLocations[0]; - - // Extract sub-patterns from aligment directly - std::vector subpattern_lengths; - for (const auto &pair : search_pattern) { - subpattern_lengths.push_back(pair.second.length()); - } + barcode_result.flank_editd=result.editDistance; + barcode_result.flank_start=result.startLocations[0]; + barcode_result.flank_end=result.endLocations[0]; - std::vector subpattern_ends; - subpattern_ends.resize(subpattern_lengths.size()); - std::partial_sum(subpattern_lengths.begin(), subpattern_lengths.end(), subpattern_ends.begin()); + std::vector segment_template_ends; + segment_template_ends.resize(segment_lengths.size()); + std::partial_sum(segment_lengths.begin(), segment_lengths.end(), segment_template_ends.begin()); - vector read_to_subpatterns; - read_to_subpatterns.reserve(subpattern_ends.size() + 1); - read_to_subpatterns.emplace_back(barcode.flank_start); + vector read_to_segment_starts; + read_to_segment_starts.reserve(segment_template_ends.size() + 1); + read_to_segment_starts.emplace_back(barcode_result.flank_start); // initialise pointers - int i_read = barcode.flank_start; + int i_read = barcode_result.flank_start; int i_pattern = 0; - int i_subpattern = 0; + int i_segment = 0; // walk through edlib aligment // 0 for match - // 1 for insertion to target - // 2 for insertion to query + // 1 for insertion to target (read) + // 2 for insertion to query (template) // 3 for mismatch std::vector alignment_vector(result.alignment, result.alignment + result.alignmentLength); for (const auto& value : alignment_vector) { - if (value != 1) { + if (value != EDLIB_EDOP_INSERT) { // If not an insertion in the read i_read++; } - if (value != 2) { + if (value != EDLIB_EDOP_DELETE) { // If not an insertion in the template i_pattern++; } - if (i_subpattern < subpattern_ends.size() && i_pattern >= subpattern_ends[i_subpattern]) { - read_to_subpatterns.emplace_back(i_read); - i_subpattern++; + if (i_segment < segment_template_ends.size() && i_pattern >= segment_template_ends[i_segment]) { + read_to_segment_starts.emplace_back(i_read); + i_segment++; } } - edlibFreeAlignResult(result); + // Extract and validate features for each segment + for (size_t i = 0; i < segments.size(); ++i) { + const Segment& s = segments[i]; + int segment_read_start = read_to_segment_starts[i]; + int segment_read_end = (i + 1 < read_to_segment_starts.size()) ? read_to_segment_starts[i+1] : barcode_result.flank_end; + int extracted_length = segment_read_end - segment_read_start; - // Work out the index of BC and UMI in the pattern - // TODO: Should this be done in main to be more efficient? - // TODO: Handle edge cases where BC / UMI is not speficied in the pattern - int bc_index = -1, umi_index = -1; - auto it_pattern = - std::find_if(search_pattern.begin(), search_pattern.end(), - [](const std::pair &pair) { - return pair.first == "BC"; - }); - if (it_pattern != search_pattern.end()) { - bc_index = std::distance(search_pattern.begin(), it_pattern); - } else { - // error - } - it_pattern = - std::find_if(search_pattern.begin(), search_pattern.end(), - [](const std::pair &pair) { - return pair.first == "UMI"; - }); - if (it_pattern != search_pattern.end()) { - umi_index = std::distance(search_pattern.begin(), it_pattern); - } else { - // error - } - - //if not checking against known list of barcodes, return sequence after the primer - //also check for a perfect match straight up as this will save computer later. - // - // read_to_subpatterns[subpattern_index] gives start of subpattern in the read - std::string exact_bc = seq.substr( - read_to_subpatterns[bc_index], - search_pattern[bc_index].second.length()); - if (known_barcodes->size()==0 || (known_barcodes->find(exact_bc) != known_barcodes->end())){ - barcode.barcode=exact_bc; - barcode.editd=0; - barcode.unambiguous=true; - barcode.umi = get_umi(seq, search_pattern, read_to_subpatterns, umi_index, bc_index, false, 0, 0); - return(barcode); - } - - // otherwise widen our search space and the look for matches with errors - - int left_bound = max( - (int)read_to_subpatterns[bc_index] - OFFSET, // widen the search by using an OFFSET - 0 // set a maximum starting character index of 0 - ); - int max_length = search_pattern[bc_index].second.length() + 2 * OFFSET; - if (left_bound + max_length > seq.length()) { - max_length = seq.length() - left_bound; - } - - std::string barcode_seq = seq.substr(left_bound, max_length); + if (extracted_length < 0) extracted_length = 0; // Should not happen with correct alignment - //iterate over all the known barcodes, checking each sequentially - auto known_barcodes_itr=known_barcodes->begin(); - unsigned int editDistance, endDistance; + std::string extracted_seq = seq.substr(segment_read_start, extracted_length); - for(; known_barcodes_itr != known_barcodes->end(); known_barcodes_itr++){ - search_string = *known_barcodes_itr; //known barcode to check against - editDistance = edit_distance(barcode_seq, search_string, endDistance, barcode_max_editd); + if (s.type == MATCHED) { + const unordered_set* current_bclist = nullptr; - if (editDistance == barcode.editd) { - barcode.unambiguous = false; - } else if (editDistance < barcode.editd && editDistance <= barcode_max_editd) { // if best so far, update - barcode.unambiguous = true; - barcode.editd = editDistance; - barcode.barcode = *known_barcodes_itr; - barcode.umi = get_umi(seq, search_pattern, read_to_subpatterns, umi_index, bc_index, true, left_bound, endDistance); + if (!s.bc_list_name.empty()) { + auto it = known_barcodes_map->find(s.bc_list_name); + if (it != known_barcodes_map->end()) { + current_bclist = &(it->second); + } + } else if (known_barcodes_map->count("global")) { + current_bclist = &(known_barcodes_map->at("global")); + } - //if perfect match is found we're done. - if (editDistance == 0) { - return(barcode); + if (current_bclist && !current_bclist->empty()) { + // Apply buffer for search + int search_start = max(0, segment_read_start - s.buffer_size); + int search_end = min((int)seq.length(), segment_read_end + s.buffer_size); + std::string search_region = seq.substr(search_start, search_end - search_start); + + unsigned int best_edit_distance = 100; // Higher than any reasonable max_ed + std::string best_barcode_match = ""; + bool current_segment_unambiguous = false; + + for (const auto& known_bc : *current_bclist) { + unsigned int editDistance, endDistance; + editDistance = edit_distance(search_region, known_bc, endDistance, s.max_edit_distance); + + if (editDistance == best_edit_distance) { + current_segment_unambiguous = false; + } else if (editDistance < best_edit_distance && editDistance <= s.max_edit_distance) { + current_segment_unambiguous = true; + best_edit_distance = editDistance; + best_barcode_match = known_bc; + } + } + if (best_edit_distance <= s.max_edit_distance && current_segment_unambiguous) { + barcode_result.features[s.name] = best_barcode_match; + } else { + barcode_result.found_all_matched_segments = false; + } + } else { // No allow list provided + cerr << "Error: No barcode list found for segment " << s.name << ".\n"; + exit(1); } + } else if (s.type == RANDOM) { + int extract_start = max(0, segment_read_start); + int extract_end = min((int)seq.length(), segment_read_end); + extracted_seq = seq.substr(extract_start, extract_end - extract_start); + barcode_result.features[s.name] = extracted_seq; + } else if (s.type == FIXED) { + // For fixed segments, we just note their presence or can store the extracted sequence if needed + // For now, we don't store fixed segments in features map unless explicitly asked. + // barcode_result.features[s.name] = extracted_seq; } - } - return(barcode); //return the best matched barcode and associated information + + return(barcode_result); //return the best matched barcode and associated information } -//search a read for one or more barcodes (parent function that calls get_barcode) -vector big_barcode_search(const string & sequence, const unordered_set & known_barcodes, int max_flank_editd, int max_editd, const std::vector> &search_pattern) { - vector return_vec; //vector of all the barcodes found +vector big_barcode_search(const string & sequence, + const std::unordered_map> *known_barcodes_map, + int global_flank_max_editd, + int global_barcode_edit_distance, + const std::vector &segments) { + vector return_vec; string masked_sequence = sequence; // Work on a copy while (true) { - //search for barcode - Barcode result=get_barcode(masked_sequence, &known_barcodes, max_flank_editd, max_editd, search_pattern); - if(result.editd <= max_editd && result.unambiguous) { //add to return vector if edit distance small enough - return_vec.push_back(result); - // Mask the found region to prevent re-finding it - int flank_length = result.flank_end - result.flank_start; - if (flank_length > 0) - masked_sequence.replace(result.flank_start, flank_length, string(flank_length, 'X')); - else // if flank_length is 0, we risk an infinite loop + Barcode result = extract_features(masked_sequence, segments, known_barcodes_map, global_flank_max_editd, global_barcode_edit_distance); + if (result.flank_editd <= global_flank_max_editd && result.found_all_matched_segments) { + return_vec.push_back(result); + + // Mask the found region to prevent re-finding it + // result.flank_end is inclusive, so length is end - start + 1 + int match_length = result.flank_end - result.flank_start + 1; + + if (match_length > 0) { + masked_sequence.replace(result.flank_start, match_length, string(match_length, 'X')); + } else { + break; // Should not happen for valid match, but prevents infinite loop + } + } else { break; - } else { - break; // No more barcodes found - } + } } - return(return_vec); + return return_vec; } @@ -524,13 +466,20 @@ bool get_bool_opt_arg(string value){ } // print information about barcodes -void print_stats(const string& read_id, const vector & vec_bc, ostream & out_stream){ +void print_stats(const string& read_id, const vector & vec_bc, ostream & out_stream, const std::vector &segments){ for(int b=0; bsecond; + } else { + out_stream << "\t"; + } + } + } + out_stream << "\t" << vec_bc.at(b).flank_editd << "\n"; } } @@ -557,37 +506,54 @@ void print_read(const string& read_id, const string& read, string qual, bool split, map & barcode_file_streams, mutex & cout_mutex, mutex & file_mutex, bool trim_barcodes, - bool chimeric){ + bool chimeric, + const std::vector &segments){ auto vec_size = vec_bc.size(); - //loop over the barcodes found... usually will just be one for (int b = 0; b < vec_size; b++) { - - // format the new read id. Using FLAMES format. - stringstream ss; - ss << (b + 1) << "of" << vec_size; + stringstream ss_id_suffix; + ss_id_suffix << (b + 1) << "of" << vec_size; if (chimeric) { - ss << "_" << "C"; + ss_id_suffix << "_" << "C"; + } + + stringstream new_read_id_ss; + string primary_barcode_for_filename = ""; + + for (const auto& s : segments) { + if (s.type != FIXED) { + auto it = vec_bc.at(b).features.find(s.name); + if (it != vec_bc.at(b).features.end()) { + new_read_id_ss << it->second << "_"; + if (s.type == MATCHED && primary_barcode_for_filename.empty()) { + primary_barcode_for_filename = it->second; + } + } else { + new_read_id_ss << "NA_"; + } + } + } + string new_read_id_prefix = new_read_id_ss.str(); + if (!new_read_id_prefix.empty() && new_read_id_prefix.back() == '_') { + new_read_id_prefix.pop_back(); } - string barcode = vec_bc.at(b).barcode; - // also add the proper FASTQ way: \tCB:Z:barcode\tUB:Z:umi - string new_read_id = - barcode + "_" + vec_bc.at(b).umi + "#" + read_id + ss.str() + "\tCB:Z:" + barcode + "\tUB:Z:" + vec_bc.at(b).umi; + string new_read_id = new_read_id_prefix + "#" + read_id + ss_id_suffix.str(); + + for (const auto& s : segments) { + if (s.type != FIXED) { + auto it = vec_bc.at(b).features.find(s.name); + if (it != vec_bc.at(b).features.end()) { + new_read_id += "\t" + s.name + ":Z:" + it->second; + } + } + } - // work out the start and end base in case multiple barcodes - // note: read_start+1, see issue #63 int read_start = vec_bc.at(b).flank_end + 1; int read_length = read.length() - read_start; - for (int f = 0; f < vec_size; f++) { - int temp_read_length = vec_bc.at(f).flank_start - read_start; - if (temp_read_length >= 0 && temp_read_length < read_length) - read_length = temp_read_length; - } - string qual_new = ""; // don't trim the quality scores if it's a fasta file - + string qual_new = ""; if (qual != "") { if((read_start+read_length)>(qual.length())){ lock_guard lock(cout_mutex); @@ -598,28 +564,24 @@ void print_read(const string& read_id, const string& read, string qual, } string read_new = read.substr(read_start, read_length); - if (b == 0 && !trim_barcodes) { // override if read shouldn't be cut + if (b == 0 && !trim_barcodes) { new_read_id = read_id; read_new = read; qual_new = qual; - b = vec_size; // force loop to exit after this iteration } - // Skip reads that become empty after trimming if (read_new.length() == 0) { continue; } - if (split) { // to a file if spliting by barcode + if (split) { lock_guard lock(file_mutex); - // Check if the file stream is already open - if (barcode_file_streams.find(barcode) == barcode_file_streams.end()) { - // If not, open it for the first time and add it to the map - string outname = prefix + "_" + barcode + (qual.empty() ? ".fasta" : ".fastq"); - barcode_file_streams[barcode].open(outname); + string barcode_for_split = primary_barcode_for_filename.empty() ? "UNKNOWN" : primary_barcode_for_filename; + if (barcode_file_streams.find(barcode_for_split) == barcode_file_streams.end()) { + string outname = prefix + "_" + barcode_for_split + (qual.empty() ? ".fasta" : ".fastq"); + barcode_file_streams[barcode_for_split].open(outname); } - // Write to the already open stream - print_line(new_read_id, read_new, qual_new, barcode_file_streams[barcode]); + print_line(new_read_id, read_new, qual_new, barcode_file_streams[barcode_for_split]); } else { lock_guard lock(cout_mutex); print_line(new_read_id, read_new, qual_new, std::cout); @@ -631,22 +593,22 @@ void print_read(const string& read_id, const string& read, string qual, void search_worker( ThreadSafeQueue &work_queue, ThreadSafeQueue &results_queue, - const unordered_set &known_barcodes, + const std::unordered_map> &known_barcodes_map, int flank_edit_distance, - int edit_distance, - const std::vector> &search_pattern + int global_barcode_edit_distance, + const std::vector &search_pattern ) { SearchResult* sr_ptr; while (work_queue.pop(sr_ptr) && sr_ptr) { //forward search - sr_ptr->vec_bc_for = big_barcode_search(sr_ptr->line, known_barcodes, flank_edit_distance, edit_distance, search_pattern); + sr_ptr->vec_bc_for = big_barcode_search(sr_ptr->line, &known_barcodes_map, flank_edit_distance, global_barcode_edit_distance, search_pattern); // get reverse complement sr_ptr->rev_line = sr_ptr->line; reverse_complement(sr_ptr->rev_line); //Check the reverse complement of the read - sr_ptr->vec_bc_rev = big_barcode_search(sr_ptr->rev_line, known_barcodes, flank_edit_distance, edit_distance, search_pattern); + sr_ptr->vec_bc_rev = big_barcode_search(sr_ptr->rev_line, &known_barcodes_map, flank_edit_distance, global_barcode_edit_distance, search_pattern); sr_ptr->count = sr_ptr->vec_bc_for.size() + sr_ptr->vec_bc_rev.size(); sr_ptr->chimeric = !sr_ptr->vec_bc_for.empty() && !sr_ptr->vec_bc_rev.empty(); @@ -684,10 +646,16 @@ int main(int argc, char **argv) { bool remove_barcodes = true; //(i) bool print_chimeric = false; //(c) - std::vector> search_pattern; + std::vector search_pattern; + + // Set of known barcodes (can be multiple, identified by name) + std::unordered_map> known_barcodes_map; + // Flag to track if any -B segment specified its own barcode list + bool individual_bclist_specified = false; - // Set of known barcodes - unordered_set known_barcodes; + // Counters for automatic naming + int barcode_count = 0; + int umi_count = 0; // threads int n_threads = 1; @@ -703,7 +671,7 @@ int main(int argc, char **argv) { while ((c = getopt(myArgs.size(), myArgs.data(), - "d:k:i:b:u:x:e:f:n:s:hp:c:")) != EOF) { + "d:k:i:b:u:x:e:f:n:s:hp:c:B:")) != EOF) { switch (c) { case 'd': { // d=predefined list of settings for various search/barcode schemes if (predefinedMap.find(optarg) == predefinedMap.end()) { @@ -727,31 +695,38 @@ int main(int argc, char **argv) { allocated_args.insert(allocated_args.end(), newArgv.begin(), newArgv.end()); break; } - case 'k': { // k=list of known barcodes + case 'k': { // k=list of known barcodes (global barcode list) + if (individual_bclist_specified) { + cerr << "Error: Cannot use -k when -B segments have specified their own barcode files.\n"; + print_usage(); + exit(1); + } string file_name(optarg); string bc; + unordered_set global_bclist; /**** READ BARCODE LIST FROM FILE ******/ file.open(file_name); if (!(file.good())) { // if the string given isn't a file cerr << "Reading known barcodes from string: " << file_name << "\n"; stringstream bc_list(file_name); while (getline(bc_list, bc, ',')) // tokenize - known_barcodes.insert(bc); + global_bclist.insert(bc); } else { // otherwise get the barcodes from the file.. cerr << "Setting known barcodes from " << file_name << "\n"; while (getline(file, line)) { istringstream line_stream(line); line_stream >> bc; - known_barcodes.insert(bc); + global_bclist.insert(bc); } file.close(); } - cerr << "Number of known barcodes: " << known_barcodes.size() << "\n"; - if (known_barcodes.size() == 0) { + cerr << "Number of known barcodes in global list: " << global_bclist.size() << "\n"; + if (global_bclist.empty()) { print_usage(); exit(1); // case barcode file is empty } + known_barcodes_map["global"] = global_bclist; params += 2; break; } @@ -763,7 +738,7 @@ int main(int argc, char **argv) { } case 'e': { edit_distance = atoi(optarg); - cerr << "Setting max barcode edit distance to " << edit_distance << "\n"; + cerr << "Setting max barcode edit distance to " << edit_distance << " (global default).\n"; params += 2; break; } @@ -774,22 +749,174 @@ int main(int argc, char **argv) { params += 2; break; } - // x, u, b arguments inserts subpatterns to search_pattern - case 'x': { - search_pattern.push_back(std::make_pair("Unnamed Seq", optarg)); - cerr << "Adding flank sequence to search for: " << optarg << "\n"; + case 'x': { // Fixed segment + Segment s; + s.type = FIXED; + s.pattern = optarg; + s.name = "Fixed_" + to_string(search_pattern.size()); + search_pattern.push_back(s); + cerr << "Adding fixed sequence to search for: " << optarg << "\n"; + params += 2; + break; + } + case 'u': { // Random (UMI) segment + Segment s; + s.type = RANDOM; + s.name = ""; // Will be set later if not provided + s.buffer_size = 0; // Default buffer for UMI + + string arg_str(optarg); + size_t current_pos = 0; + size_t next_comma = arg_str.find(','); + + // First part is always the pattern + s.pattern = arg_str.substr(current_pos, next_comma - current_pos); + cerr << "Setting UMI to search for: " << s.pattern; + + while (next_comma != string::npos) { + current_pos = next_comma + 1; + next_comma = arg_str.find(',', current_pos); + string param = arg_str.substr(current_pos, next_comma - current_pos); + + size_t colon_pos = param.find(':'); + if (colon_pos == string::npos) { + cerr << "Error: Malformed -u parameter: " << param << "\n"; + print_usage(); + exit(1); + } + string key = param.substr(0, colon_pos); + string value = param.substr(colon_pos + 1); + + if (key == "name") { + s.name = value; + cerr << " (name: " << s.name << ")"; + } else if (key == "buffer") { + s.buffer_size = stoi(value); + cerr << " (buffer: " << s.buffer_size << ")"; + } else { + cerr << "Error: Unknown -u parameter: " << key << "\n"; + print_usage(); + exit(1); + } + } + + // Assign default name if not provided + if (s.name.empty()) { + umi_count++; + if (umi_count == 1) { + s.name = "UB"; + } else { + s.name = "UB" + to_string(umi_count); + } + } else { + umi_count++; // Still increment counter for user-named UMIs + } + + search_pattern.push_back(s); + cerr << " -> named: " << s.name << "\n"; params += 2; break; } - case 'u': { - search_pattern.push_back(std::make_pair("UMI", optarg)); - cerr << "Setting UMI to search for: " << optarg << "\n"; + case 'b': { // Old-style Barcode segment + Segment s; + s.type = MATCHED; + s.pattern = optarg; + barcode_count++; + s.name = (barcode_count == 1) ? "CB" : "CB" + to_string(barcode_count); + s.buffer_size = 5; // Default buffer for barcode + s.max_edit_distance = edit_distance; // Will use global -e + s.bc_list_name = "global"; // Will use global -k + search_pattern.push_back(s); + cerr << "Setting old-style barcode to search for: " << optarg + << " -> named: " << s.name + << ". Will use global barcode list (-k) and edit distance (-e).\n"; params += 2; break; } - case 'b': { - search_pattern.push_back(std::make_pair("BC", optarg)); - cerr << "Setting barcode to search for: " << optarg << "\n"; + case 'B': { // New-style Barcode segment + Segment s; + s.type = MATCHED; + s.name = ""; // Will be set later if not provided + s.buffer_size = 5; // Default buffer + s.max_edit_distance = edit_distance; // Default to global -e + + string arg_str(optarg); + size_t current_pos = 0; + size_t next_comma = arg_str.find(','); + + // First part is always the pattern + s.pattern = arg_str.substr(current_pos, next_comma - current_pos); + cerr << "Adding new-style barcode segment: " << s.pattern; + + while (next_comma != string::npos) { + current_pos = next_comma + 1; + next_comma = arg_str.find(',', current_pos); + string param = arg_str.substr(current_pos, next_comma - current_pos); + + size_t colon_pos = param.find(':'); + if (colon_pos == string::npos) { + cerr << "Error: Malformed -B parameter: " << param << "\n"; + print_usage(); + exit(1); + } + string key = param.substr(0, colon_pos); + string value = param.substr(colon_pos + 1); + + if (key == "name") { + s.name = value; + cerr << " (name: " << s.name << ")"; + } else if (key == "list") { + s.bc_list_name = value; + individual_bclist_specified = true; + // Load barcode file + unordered_set current_bclist; + file.open(value); + if (!(file.good())) { + cerr << "Error: Unable to open barcode file: " << value << "\n"; + print_usage(); + exit(1); + } + while (getline(file, line)) { + istringstream line_stream(line); + string bc_entry; + line_stream >> bc_entry; + current_bclist.insert(bc_entry); + } + file.close(); + if (current_bclist.empty()) { + cerr << "Error: barcode file " << value << " is empty.\n"; + print_usage(); + exit(1); + } + known_barcodes_map[value] = current_bclist; + cerr << " (barcode list: " << value << ")"; + } else if (key == "buffer") { + s.buffer_size = stoi(value); + cerr << " (buffer: " << s.buffer_size << ")"; + } else if (key == "max_ed") { + s.max_edit_distance = stoi(value); + cerr << " (max_ed: " << s.max_edit_distance << ")"; + } else { + cerr << "Error: Unknown -B parameter: " << key << "\n"; + print_usage(); + exit(1); + } + } + + // Assign default name if not provided + if (s.name.empty()) { + barcode_count++; + if (barcode_count == 1) { + s.name = "CB"; + } else { + s.name = "CB" + to_string(barcode_count); + } + } else { + barcode_count++; // Still increment counter for user-named barcodes + } + + search_pattern.push_back(s); + cerr << " -> named: " << s.name << "\n"; params += 2; break; } @@ -808,12 +935,7 @@ int main(int argc, char **argv) { split_file_by_barcode = get_bool_opt_arg(optarg); cerr << "Split read output into separate files by barcode: " << split_file_by_barcode << "\n"; - int max_split_bc = 50; - if (known_barcodes.size() > max_split_bc) { - cerr << "Too many barcodes to split into separate files: " - << known_barcodes.size() << "> " << max_split_bc << "\n"; - split_file_by_barcode = false; - } + params += 2; break; } @@ -840,15 +962,37 @@ int main(int argc, char **argv) { delete[] arg; } - // default case when no x, u, b is speficied + // default case when no x, u, b, B is speficied if (search_pattern.empty()) { cerr << "Using default search pattern: " << endl; - search_pattern = {{"primer", "CTACACGACGCTCTTCCGATCT"}, - {"BC", std::string(16, '?')}, - {"UMI", std::string(12, '?')}, - {"polyA", std::string(9, 'T')}}; - for (auto const& i : search_pattern) - std::cerr << i.first << ": " << i.second << "\n"; + Segment s_primer, s_bc, s_umi, s_polya; + + s_primer.type = FIXED; + s_primer.pattern = "CTACACGACGCTCTTCCGATCT"; + s_primer.name = "primer"; + search_pattern.push_back(s_primer); + + s_bc.type = MATCHED; + s_bc.pattern = std::string(16, '?'); + s_bc.name = "CB"; + s_bc.buffer_size = 5; + s_bc.max_edit_distance = edit_distance; + s_bc.bc_list_name = "global"; + search_pattern.push_back(s_bc); + + s_umi.type = RANDOM; + s_umi.pattern = std::string(12, '?'); + s_umi.name = "UB"; + s_umi.buffer_size = 0; + search_pattern.push_back(s_umi); + + s_polya.type = FIXED; + s_polya.pattern = std::string(9, 'T'); + s_polya.name = "polyA"; + search_pattern.push_back(s_polya); + + for (auto const& s : search_pattern) + std::cerr << s.name << ": " << s.pattern << "\n"; } cerr << "For usage information type: flexiplex -h" << endl; @@ -881,13 +1025,19 @@ int main(int argc, char **argv) { out_stat_filename = out_filename_prefix + "_" + out_stat_filename; out_bc_filename = out_filename_prefix + "_" + out_bc_filename; - if (known_barcodes.size() > 0) { + if (!known_barcodes_map.empty()) { if (file_exists(out_stat_filename)) { cerr << "File " << out_stat_filename << " already exists, overwriting." << endl; } out_stat_file.open(out_stat_filename, std::ios::trunc); - out_stat_file << "Read\tCellBarcode\tFlankEditDist\tBarcodeEditDist\tUMI\n"; + out_stat_file << "Read"; + for (const auto& s : search_pattern) { + if (s.type != FIXED) { + out_stat_file << "\t" << s.name; + } + } + out_stat_file << "\tFlankEditDist\n"; } cerr << "Searching for barcodes...\n"; @@ -919,7 +1069,7 @@ int main(int argc, char **argv) { // Start worker threads for (int i = 0; i < n_threads; ++i) { - workers.emplace_back(search_worker, ref(work_queue), ref(results_queue), ref(known_barcodes), + workers.emplace_back(search_worker, ref(work_queue), ref(results_queue), ref(known_barcodes_map), flank_edit_distance, edit_distance, ref(search_pattern)); } @@ -968,20 +1118,41 @@ int main(int argc, char **argv) { if (sr->count > 0) bc_count++; if (sr->chimeric) multi_bc_count++; - for (const auto& bc : sr->vec_bc_for) barcode_counts[bc.barcode]++; - for (const auto& bc : sr->vec_bc_rev) barcode_counts[bc.barcode]++; + // Count barcodes from features map (first MATCHED segment) + for (const auto& bc : sr->vec_bc_for) { + for (const auto& s : search_pattern) { + if (s.type == MATCHED) { + auto it = bc.features.find(s.name); + if (it != bc.features.end()) { + barcode_counts[it->second]++; + break; // Only count first barcode for statistics + } + } + } + } + for (const auto& bc : sr->vec_bc_rev) { + for (const auto& s : search_pattern) { + if (s.type == MATCHED) { + auto it = bc.features.find(s.name); + if (it != bc.features.end()) { + barcode_counts[it->second]++; + break; // Only count first barcode for statistics + } + } + } + } - if (known_barcodes.size() != 0) { - print_stats(sr->read_id, sr->vec_bc_for, out_stat_file); - print_stats(sr->read_id, sr->vec_bc_rev, out_stat_file); + if (known_barcodes_map.size() != 0) { + print_stats(sr->read_id, sr->vec_bc_for, out_stat_file, search_pattern); + print_stats(sr->read_id, sr->vec_bc_rev, out_stat_file, search_pattern); print_read(sr->read_id + "_+", sr->line, sr->qual_scores, sr->vec_bc_for, out_filename_prefix, - split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric); + split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric, search_pattern); if (remove_barcodes || sr->vec_bc_for.empty()) { reverse(sr->qual_scores.begin(), sr->qual_scores.end()); print_read(sr->read_id + "_-", sr->rev_line, sr->qual_scores, sr->vec_bc_rev, out_filename_prefix, - split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric); + split_file_by_barcode, barcode_file_streams, cout_mutex, file_mutex, remove_barcodes, print_chimeric && sr->chimeric, search_pattern); } } @@ -996,7 +1167,7 @@ int main(int argc, char **argv) { } reads_ifs.close(); - if (known_barcodes.size() > 0) { + if (known_barcodes_map.size() > 0) { out_stat_file.close(); } // Close all the split files @@ -1014,7 +1185,7 @@ int main(int argc, char **argv) { cerr << "All done!" << endl; cerr << "If you like Flexiplex, please cite us! https://doi.org/10.1093/bioinformatics/btae102" << endl; - if (known_barcodes.size() > 0) { + if (known_barcodes_map.size() > 0) { return (0); } From a842fadc9642e0de3fccb1944a0b3f8e4b7fe009 Mon Sep 17 00:00:00 2001 From: Changqing Wang Date: Fri, 16 Jan 2026 20:18:53 +1100 Subject: [PATCH 3/3] trim off multi-matching bits; better UMI locations --- flexiplex.c++ | 155 ++++++++++++++++++++++++++++++++------------------ 1 file changed, 101 insertions(+), 54 deletions(-) diff --git a/flexiplex.c++ b/flexiplex.c++ index 75ea5e6..5ca81a2 100644 --- a/flexiplex.c++ +++ b/flexiplex.c++ @@ -311,7 +311,7 @@ Barcode extract_features(string & seq, //search for the concatenated pattern EdlibAlignResult result = edlibAlign(search_string_template.c_str(), search_string_template.length(), seq.c_str(), seq.length(), edlibConf); - if(result.status != EDLIB_STATUS_OK || result.numLocations==0 || result.editDistance == -1 ){ + if(result.status != EDLIB_STATUS_OK || result.numLocations==0 ){ edlibFreeAlignResult(result); return(barcode_result); // no match found - return } //fill in info about found primer and polyT location @@ -352,69 +352,105 @@ Barcode extract_features(string & seq, } edlibFreeAlignResult(result); - // Extract and validate features for each segment + // Storage for refined positions from matched segments + std::vector refined_segment_starts(segments.size(), -1); + std::vector refined_segment_ends(segments.size(), -1); + + // Pass 1: Process MATCHED segments for (size_t i = 0; i < segments.size(); ++i) { const Segment& s = segments[i]; + if (s.type != MATCHED) continue; + int segment_read_start = read_to_segment_starts[i]; int segment_read_end = (i + 1 < read_to_segment_starts.size()) ? read_to_segment_starts[i+1] : barcode_result.flank_end; - int extracted_length = segment_read_end - segment_read_start; - if (extracted_length < 0) extracted_length = 0; // Should not happen with correct alignment + const unordered_set* current_bclist = nullptr; - std::string extracted_seq = seq.substr(segment_read_start, extracted_length); - - if (s.type == MATCHED) { - const unordered_set* current_bclist = nullptr; + if (!s.bc_list_name.empty()) { + auto it = known_barcodes_map->find(s.bc_list_name); + if (it != known_barcodes_map->end()) { + current_bclist = &(it->second); + } + } else if (known_barcodes_map->count("global")) { + current_bclist = &(known_barcodes_map->at("global")); + } - if (!s.bc_list_name.empty()) { - auto it = known_barcodes_map->find(s.bc_list_name); - if (it != known_barcodes_map->end()) { - current_bclist = &(it->second); + if (current_bclist && !current_bclist->empty()) { + // Apply buffer for search + int search_start = max(0, segment_read_start - s.buffer_size); + int search_end = min((int)seq.length(), segment_read_end + s.buffer_size); + std::string search_region = seq.substr(search_start, search_end - search_start); + + unsigned int best_edit_distance = 100; // Higher than any reasonable max_ed + unsigned int best_end_distance = 0; + std::string best_barcode_match = ""; + bool current_segment_unambiguous = false; + + for (const auto& known_bc : *current_bclist) { + unsigned int editDistance, endDistance; + editDistance = edit_distance(search_region, known_bc, endDistance, s.max_edit_distance); + + if (editDistance == best_edit_distance) { + current_segment_unambiguous = false; + } else if (editDistance < best_edit_distance && editDistance <= s.max_edit_distance) { + current_segment_unambiguous = true; + best_edit_distance = editDistance; + best_barcode_match = known_bc; + best_end_distance = endDistance; + + // TODO: multiplex perfect match can exits due to the buffering around barcode segments + if (editDistance == 0) { + break; // perfect match found } - } else if (known_barcodes_map->count("global")) { - current_bclist = &(known_barcodes_map->at("global")); + } + } + if (best_edit_distance <= s.max_edit_distance && current_segment_unambiguous) { + barcode_result.features[s.name] = best_barcode_match; + + // Refined positions + refined_segment_ends[i] = search_start + best_end_distance - 1; + // Approximate start based on match length (assuming not many indels) + refined_segment_starts[i] = refined_segment_ends[i] - best_barcode_match.length() + 1; + } else { + barcode_result.found_all_matched_segments = false; } + } else { // No barcode list found + cerr << "Error: No barcode list found for segment " << s.name << ".\n"; + exit(1); + } + } - if (current_bclist && !current_bclist->empty()) { - // Apply buffer for search - int search_start = max(0, segment_read_start - s.buffer_size); - int search_end = min((int)seq.length(), segment_read_end + s.buffer_size); - std::string search_region = seq.substr(search_start, search_end - search_start); - - unsigned int best_edit_distance = 100; // Higher than any reasonable max_ed - std::string best_barcode_match = ""; - bool current_segment_unambiguous = false; - - for (const auto& known_bc : *current_bclist) { - unsigned int editDistance, endDistance; - editDistance = edit_distance(search_region, known_bc, endDistance, s.max_edit_distance); - - if (editDistance == best_edit_distance) { - current_segment_unambiguous = false; - } else if (editDistance < best_edit_distance && editDistance <= s.max_edit_distance) { - current_segment_unambiguous = true; - best_edit_distance = editDistance; - best_barcode_match = known_bc; - } - } - if (best_edit_distance <= s.max_edit_distance && current_segment_unambiguous) { - barcode_result.features[s.name] = best_barcode_match; - } else { - barcode_result.found_all_matched_segments = false; + if (!barcode_result.found_all_matched_segments) { + return(barcode_result); // return early if any matched segments failed + } + + // Pass 2: Process RANDOM and other segments + for (size_t i = 0; i < segments.size(); ++i) { + const Segment& s = segments[i]; + if (s.type == MATCHED) continue; + + int extract_start = read_to_segment_starts[i]; + int extract_end = (i + 1 < read_to_segment_starts.size()) ? read_to_segment_starts[i+1] : barcode_result.flank_end; + + if (s.type == RANDOM) { + // Check if there is a MATCHED segment immediately before + if (i > 0 && segments[i-1].type == MATCHED && refined_segment_ends[i-1] != -1) { + extract_start = refined_segment_ends[i-1] + 1; + extract_end = extract_start + s.pattern.length(); + } + // Otherwise check if there is one immediately after + else if (i + 1 < segments.size() && segments[i+1].type == MATCHED && refined_segment_starts[i+1] != -1) { + extract_end = refined_segment_starts[i+1]; + extract_start = extract_end - s.pattern.length(); } - } else { // No allow list provided - cerr << "Error: No barcode list found for segment " << s.name << ".\n"; - exit(1); - } - } else if (s.type == RANDOM) { - int extract_start = max(0, segment_read_start); - int extract_end = min((int)seq.length(), segment_read_end); - extracted_seq = seq.substr(extract_start, extract_end - extract_start); - barcode_result.features[s.name] = extracted_seq; - } else if (s.type == FIXED) { - // For fixed segments, we just note their presence or can store the extracted sequence if needed - // For now, we don't store fixed segments in features map unless explicitly asked. - // barcode_result.features[s.name] = extracted_seq; + + int actual_extract_start = max(0, extract_start); + int actual_extract_end = min((int)seq.length(), extract_end); + + if (actual_extract_end < actual_extract_start) actual_extract_end = actual_extract_start; + + std::string extracted_seq = seq.substr(actual_extract_start, actual_extract_end - actual_extract_start); + barcode_result.features[s.name] = extracted_seq; } } @@ -551,7 +587,18 @@ void print_read(const string& read_id, const string& read, string qual, } int read_start = vec_bc.at(b).flank_end + 1; - int read_length = read.length() - read_start; + // work out the start and end base in case multiple barcodes + int next_barcode_start = read.length(); + for (int k = 0; k < vec_size; k++) { + if (b == k) continue; + if (vec_bc.at(k).flank_start >= read_start) { + if (vec_bc.at(k).flank_start < next_barcode_start) { + next_barcode_start = vec_bc.at(k).flank_start; + } + } + } + + int read_length = next_barcode_start - read_start; string qual_new = ""; if (qual != "") {