diff --git a/HybridMSA/README.md b/HybridMSA/README.md new file mode 100644 index 00000000..b080baef --- /dev/null +++ b/HybridMSA/README.md @@ -0,0 +1,89 @@ +# HybridMSA + +HybridMSA is a versatile C++ tool for generating customized multiple sequence alignments (MSAs) for proteins — especially those produced via deep learning-driven motif scaffolding pipelines, though it supports a range of other use cases including fusion/chimeric proteins, tagged proteins, and more! + +## How HybridMSA works + +![HybridMSA](assets/HybridMSA.png) + +HybridMSA processes MSAs (e.g. from AlphaFold2 or other sources) and generates HybridMSAs that: + +- Retain evolutionary information where available + +- Insert masked tokens elsewhere + +- Support downstream applications (e.g., structure prediction using ColabFold's implementation of AlphaFold2) + +Importantly, this enables confident and accurate structure prediction for engineered proteins that would otherwise be unsupported or poorly modeled by AlphaFold2. + +## Why use HybridMSA? + +Tool's relying on MSAs (e.g. AlphaFold2) often fail or perform poorly when: + +- A protein includes *de novo* designed regions, + +- The protein is a chimera or fusion, + +- Tags (e.g., purification tags) are added, + +- Genetic searches return no meaningful hits. + +HybridMSA solves these issues by seperating genetic searching and MSA generation, enabling the combination of meaningful evolutionary content with masked regions such that the performance of AlphaFold2 increases tremendously. + +## Benchmarking HybridMSA + +![HybridMSA](assets/Benchmarking.png) + +In a benchmarking case of 1000 motif scaffolds generated via a deep learning-driven protein design pipeline, HybridMSA outperformed ColabFold's implementation of AlphaFold2 under default settings (CF) and single-sequence mode (CFss) with regard to accuracy and confidence of predicted models. We attribute this to HybridMSA’s ability to preserve evolutionary context while avoiding noisy or misleading genetic searches for artificial regions. In addition, HybridMSA enables the bypassing of time-consuming genetic and structural database searches, decreasing structure predicton time significantly compared to CF. + +## Installation +Follow the steps below to install dependencies and compile the source code. + +1. Download reformat.pl + + HybridMSA relies on HH-suite’s [reformat.pl](https://github.com/soedinglab/hh-suite/blob/master/scripts/reformat.pl) to standardize MSA formats. + +2. Compile the three C++ utility programs + + g++ -std=c++17 src/parse_msa_bfd_only.cpp -o parse_msa_bfd_only + + g++ -std=c++17 src/prepare_msa_A.cpp -o prepare_msa_A -lstdc++fs + + g++ -std=c++17 src/prepare_msa_B.cpp -o prepare_msa_B + +## Usage + +**parse_msa_bfd_only** processes the BFD MSA (in A3M format) from AlphaFold2 genetic searching to extract specific contiguous regions (contigs) from each aligned sequence; these contigs are specified by exact substring matches to the query (first) sequence, and the resulting sub-alignments are saved for downstream use. While it is optimized for BFD-style A3M inputs, the program is format-flexible and supports any input readable by reformat.pl, including: fas, a2m, a3m, sto, psi, clu. + +Usage: ./parse_msa_bfd_only /path/to/msas/directory contig1 [contig2] ... + +- : Path to the reformat.pl script from HH-suite + +- : Path to the directory containing bfd_uniclust_hits.a3m + +- : Substring(s) to extract from aligned sequences + +**prepare_msa_A** generates a HybridMSA for the first sequence in an input FASTA file by aligning each specified contig exactly to its corresponding position within the full input sequence and then filling all other (non-contig) positions with masked tokens to indicate missing or unaligned regions. This approach enables rapid construction of MSAs even when contigs occur in arbitrary positions or in different orders within the input sequences, by preserving positional accuracy and marking gaps clearly. Each sequence within the input FASTA file should be the same length and contigs should be in the same positions (e.g. output FASTA file from ProteinMPNN). + +Usage: ./prepare_msa_A input_file.fasta /path/to/output/of/run_divide_msa_bfd_only/directory contig1 [contig2] ... + +- : Path to the input FASTA file containing full-length sequences + +- : Directory containing sub-alignments for each contig (e.g., contig1.txt, contig2.txt) + +- : One or more substring(s) defining contigs to align and assemble into a HybridMSA + +**prepare_msa_B** generates a HybridMSA for each sequence in an input FASTA file with proper formatting for ColabFold input by prepending sequence-specific headers to the A3M file output from prepare_msa_A. It supports different cardinalities (e.g., "monomer" or "homotrimer") and formats output filenames accordingly. + +Usage: ./prepare_msa_B input_file.fasta fasta_file a3m_file cardinality + +- : Path to the input FASTA file containing full-length sequences + +- : Path to the output A3M file from prepare_msa_A corresponding to the input FASTA file + +- : Specifies assembly type, e.g., "monomer" or "homotrimer" + +## Citation +This method is described in a recent publication: Kriews and Agostino, et al. (2025) + +Please cite this work if you use HybridMSA in your research! diff --git a/HybridMSA/assets/Benchmarking.png b/HybridMSA/assets/Benchmarking.png new file mode 100755 index 00000000..f1e2dff1 Binary files /dev/null and b/HybridMSA/assets/Benchmarking.png differ diff --git a/HybridMSA/assets/HybridMSA.png b/HybridMSA/assets/HybridMSA.png new file mode 100755 index 00000000..6ef8e436 Binary files /dev/null and b/HybridMSA/assets/HybridMSA.png differ diff --git a/HybridMSA/src/parse_msa_bfd_only.cpp b/HybridMSA/src/parse_msa_bfd_only.cpp new file mode 100755 index 00000000..970ed1c3 --- /dev/null +++ b/HybridMSA/src/parse_msa_bfd_only.cpp @@ -0,0 +1,210 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +std::string exec(const char* cmd) { + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd, "r"), pclose); + if (!pipe) { + throw std::runtime_error("popen() failed!"); + } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + result += buffer.data(); + } + return result; +} + +std::vector split(const std::string& s, char delimiter) { + std::vector tokens; + std::string token; + std::istringstream tokenStream(s); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + +int main(int argc, char* argv[]) { + if (argc < 4) { // Minimum arguments: program name, MSA path, and at least one substring + std::cerr << "Usage: " << argv[0] << " [contig2] ..." << std::endl; + return 1; + } + + // Get MSA directory path + std::string reformat = argv[1]; + + // Get MSA directory path + std::string path = argv[2]; + + // Get contigs + std::vector substrings; + for (int i = 3; i < argc; ++i) { + substrings.push_back(argv[i]); + } + + // Print contigs + std::cout << "MSA directory: " << path << std::endl; + std::cout << "Contigs:" << std::endl; + for (const auto& substring : substrings) { + std::cout << " - " << substring << std::endl; + } + + // Reformat bfd_uniclust_hits.a3m + std::string cmd = reformat + " a3m a3m " + path + "/bfd_uniclust_hits.a3m bfd_uniclust.a3m"; + system(cmd.c_str()); + + // Reformat bfd_uniclust.a3m + cmd = "awk '/^>/ {if (seq) print seq; printf \"%s\\n\", $0; seq = \"\"; next} {gsub(/[ \\t\\r\\n]/,\"\"); seq = seq $0} END {if (seq) print seq}' bfd_uniclust.a3m > bfd_uniclust_reformatted.a3m"; + system(cmd.c_str()); + + // Concatenate reformatted files + system("cat bfd_uniclust_reformatted.a3m > msa.a3m"); + + std::string msa = "msa.a3m"; + std::string seq; + + // Read the second line of msa.a3m + std::ifstream msa_file(msa); + std::string line; + std::getline(msa_file, line); // Skip first line + std::getline(msa_file, seq); + + // Calculate start and end positions + std::vector starts, ends; + for (const auto& substring : substrings) { + std::vector positions; + size_t pos = seq.find(substring); + + while (pos != std::string::npos) { + positions.push_back(pos); + pos = seq.find(substring, pos + 1); // Look for the next occurrence + } + + if (positions.empty()) { + std::cout << "Warning: No positions found for substring '" << substring << "'" << std::endl; + } else if (positions.size() > 1) { + std::cout << "Warning: Multiple positions found for substring '" << substring << "'" << std::endl; + std::cout << "Positions: "; + for (const auto& p : positions) { + std::cout << p << " "; + } + std::cout << std::endl; + + starts.push_back(positions[0]); + ends.push_back(positions[0] + substring.length() - 1); + } else { + starts.push_back(positions[0]); + ends.push_back(positions[0] + substring.length() - 1); + } + } + + // Print starts and ends + for (int start : starts) std::cout << start << " "; + std::cout << std::endl; + for (int end : ends) std::cout << end << " "; + std::cout << std::endl; + + // Create output files + std::ifstream input_msa(msa); + std::ofstream target_names("target_names.txt"); + std::vector substring_files; + for (const auto& substring : substrings) { + substring_files.emplace_back(substring + ".txt"); + } + + // Populate output files + int index = 0; + std::getline(input_msa, line); + std::getline(input_msa, line); + while (std::getline(input_msa, line)) { + if (index % 2 == 0) { + target_names << line << std::endl; + } else { + int counter = 0; + int saved_position = 0; + int found = 0; + + for (int i = 0; i < line.length(); ++i) { + char c = line[i]; + if (std::find(starts.begin(), starts.end(), counter) != starts.end() && + (c == '-' || std::isupper(c))) { + saved_position = i; + int checkSum = i - std::count_if(line.begin(), line.begin() + i, + [](char ch) { return std::islower(ch); }); + if (starts[found] != checkSum) { + std::cout << "Element " << starts[found] << " is not equal to " << checkSum << std::endl; + } + } else if (std::find(ends.begin(), ends.end(), counter) != ends.end() && + (c == '-' || std::isupper(c))) { + substring_files[found] << line.substr(saved_position, i - saved_position + 1) << std::endl; + int checkSum = i - saved_position + 1 - + std::count_if(line.begin() + saved_position, line.begin() + i + 1, + [](char ch) { return std::islower(ch); }); + if (substrings[found].length() != checkSum) { + std::cout << "Element " << substrings[found].length() << " is not equal to " << checkSum << std::endl; + } + ++found; + } + if (c == '-' || std::isupper(c)) { + ++counter; + } + } + } + ++index; + } + + // Remove lines where all contigs are blank + std::vector lines_to_remove; + int line_count = 0; + std::ifstream count_file("target_names.txt"); + while (std::getline(count_file, line)) ++line_count; + + for (int i = 1; i <= line_count; ++i) { + bool non_dash = false; + for (const auto& substring : substrings) { + std::ifstream file(substring + ".txt"); + std::string line; + for (int j = 1; j < i; ++j) std::getline(file, line); + std::getline(file, line); + if (std::regex_search(line, std::regex("[A-WYZ]"))) { + non_dash = true; + break; + } + } + if (!non_dash) { + lines_to_remove.push_back(i); + std::cout << "rm " << i << std::endl; + } + } + + if (!lines_to_remove.empty()) { + std::string sed_command = "sed -i '"; + for (int line : lines_to_remove) { + sed_command += std::to_string(line) + "d;"; + } + sed_command.pop_back(); + sed_command += "' target_names.txt"; + system(sed_command.c_str()); + + for (const auto& substring : substrings) { + sed_command = "sed -i '"; + for (int line : lines_to_remove) { + sed_command += std::to_string(line) + "d;"; + } + sed_command.pop_back(); + sed_command += "' " + substring + ".txt"; + system(sed_command.c_str()); + } + } + + return 0; +} diff --git a/HybridMSA/src/prepare_msa_A.cpp b/HybridMSA/src/prepare_msa_A.cpp new file mode 100755 index 00000000..db4589a7 --- /dev/null +++ b/HybridMSA/src/prepare_msa_A.cpp @@ -0,0 +1,191 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +std::string exec(const char* cmd) { + std::array buffer; + std::string result; + std::unique_ptr pipe(popen(cmd, "r"), pclose); + if (!pipe) { + throw std::runtime_error("popen() failed!"); + } + while (fgets(buffer.data(), buffer.size(), pipe.get()) != nullptr) { + result += buffer.data(); + } + return result; +} + +std::string get_sequence_from_file(const std::string& filename) { + std::ifstream file(filename); + std::string line; + std::getline(file, line); // Skip first line + std::getline(file, line); // Get second line (sequence) + return line; +} + +std::vector split(const std::string& s, char delimiter) { + std::vector tokens; + std::string token; + std::istringstream tokenStream(s); + while (std::getline(tokenStream, token, delimiter)) { + tokens.push_back(token); + } + return tokens; +} + +int main(int argc, char* argv[]) { + if (argc < 3) { // Minimum arguments: program name, fasta file, MSA path, and at least one contig + std::cerr << "Usage: " << argv[0] << " [contig2] ..." << std::endl; + return 1; + } + + // Get sequence and sequence length + std::string fa = argv[1]; + std::string seq = get_sequence_from_file(fa); + int length = seq.length(); + + // Get MSA directory path + std::string msa_path = argv[2]; + + // Get contigs + std::vector substrings; + for (int i = 3; i < argc; ++i) { + substrings.push_back(argv[i]); + } + + // Print contigs for debugging + std::cout << "Fasta file: " << fa << std::endl; + std::cout << "MSA directory: " << msa_path << std::endl; + std::cout << "Contigs:" << std::endl; + for (const auto& substring : substrings) { + std::cout << " - " << substring << std::endl; + } + + // for each contig get start and end postitions + std::map start_positions; + std::vector starts, ends; + for (const auto& substring : substrings) { + std::vector positions; + size_t pos = seq.find(substring); + + while (pos != std::string::npos) { + positions.push_back(pos); + pos = seq.find(substring, pos + 1); // Look for the next occurrence + } + + if (positions.empty()) { + std::cout << "Warning: No positions found for substring '" << substring << "'" << std::endl; + } else if (positions.size() > 1) { + std::cout << "Warning: Multiple positions found for substring '" << substring << "'" << std::endl; + std::cout << "Positions: "; + for (const auto& p : positions) { + std::cout << p << " "; + } + std::cout << std::endl; + + starts.push_back(positions[0]); + start_positions[substring] = positions[0]; + ends.push_back(positions[0] + substring.length()); + } else { + starts.push_back(positions[0]); + start_positions[substring] = positions[0]; + ends.push_back(positions[0] + substring.length()); + } + } + + // Sort start and end postitions + std::sort(starts.begin(), starts.end()); + std::sort(ends.begin(), ends.end()); + + // Add N- and C-terminal linkers + std::string Nlinker_dash_string = starts[0] == 0 ? "" : std::string(starts[0], '-'); + std::string Clinker_dash_string = ends.back() == length ? "" : std::string(length - ends.back(), '-'); + + // Get linker lengths + std::vector linker_lengths; + for (size_t i = 1; i < starts.size(); ++i) { + linker_lengths.push_back(starts[i] - ends[i-1]); + } + + // Get linkers + std::vector linkers; + for (int len : linker_lengths) { + linkers.push_back(std::string(len, '-')); + } + + // Print linkers + std::cout << "Linkers:" << std::endl; + for (const auto& linker : linkers) { + std::cout << linker << std::endl; + } + + // Sort the substrings by starting position + std::vector sorted_substrings; + for (int pos : starts) { + for (const auto& pair : start_positions) { + if (pair.second == pos) { + sorted_substrings.push_back(pair.first); + break; + } + } + } + + // Read target names into arrays + std::vector target_names; + std::ifstream target_file(msa_path + "/target_names.txt"); + std::string line; + while (std::getline(target_file, line)) { + target_names.push_back(line); + } + + // Read all sorted contigs into nested array + std::vector> contigs; + for (const auto& substring : sorted_substrings) { + std::vector temp; + std::ifstream contig_file(msa_path + "/" + substring + ".txt"); + while (std::getline(contig_file, line)) { + temp.push_back(line); + } + contigs.push_back(temp); + } + + // Create output files + std::string base_name = fa.substr(0, fa.find_last_of('.')); + std::ofstream output_file(base_name + ".a3m"); + + // Populate output file + for (size_t i = 0; i < target_names.size(); ++i) { + std::string line = Nlinker_dash_string; + + for (size_t j = 0; j < sorted_substrings.size(); ++j) { + if (j < sorted_substrings.size() - 1) { + line += contigs[j][i] + linkers[j]; + } else { + line += contigs[j][i]; + } + } + + line += Clinker_dash_string; + + int checkSum = line.length() - std::count_if(line.begin(), line.end(), [](char c) { return std::islower(c); }); + + if (checkSum != length) { + std::cout << "Checksum failed" << std::endl; + } + + std::string target = split(target_names[i], '\t')[0]; + + output_file << target << "\t" << target.substr(1) << std::endl; + output_file << line << std::endl; + } + + return 0; +} \ No newline at end of file diff --git a/HybridMSA/src/prepare_msa_B.cpp b/HybridMSA/src/prepare_msa_B.cpp new file mode 100755 index 00000000..f99e95ac --- /dev/null +++ b/HybridMSA/src/prepare_msa_B.cpp @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +namespace fs = std::filesystem; + +std::vector read_fasta(const std::string& filename) { + std::vector sequences; + std::ifstream file(filename); + std::string line; + std::string current_seq; + + while (std::getline(file, line)) { + if (line[0] == '>') { + if (!current_seq.empty()) { + sequences.push_back(current_seq); + current_seq.clear(); + } + } else { + current_seq += line; + } + } + if (!current_seq.empty()) { + sequences.push_back(current_seq); + } + return sequences; +} + +std::string repeat_char(char c, int n) { + return std::string(n, c); +} + +int main(int argc, char* argv[]) { + if (argc < 4) { // Minimum arguments: program name, fasta file, MSA path, and cardinality + std::cerr << "Usage: " << argv[0] << " " << std::endl; + return 1; + } + + // Store command-line arguments and get sequence length + std::string fa = argv[1]; + std::string a3m = argv[2]; + std::string card = argv[3]; + std::vector sequences = read_fasta(fa); + int length = sequences[0].length(); + + // Generate a3m files + for (int i = 0; i < sequences.size(); ++i) { + std::stringstream ss; + ss << std::setw(2) << std::setfill('0') << i + 1; + std::string i_formatted = ss.str(); + + std::string output_filename = fa.substr(0, fa.length() - 3) + "_seq" + i_formatted + ".a3m"; + std::ofstream output_file(output_filename); + + std::string seq = sequences[i]; + + if (card == "monomer") { + output_file << "#" << length << "\t1" << std::endl; + } else if (card == "homotrimer") { + output_file << "#" << length << "\t3" << std::endl; + } + + output_file << ">101" << std::endl; + output_file << seq << std::endl; + output_file << ">101" << std::endl; + output_file << seq << std::endl; + + std::ifstream original_a3m(a3m); + output_file << original_a3m.rdbuf(); + + output_file.close(); + } + + return 0; +}