From 391f5854a8bd9f3ed3335cbce72d915f22395a7f Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Sun, 11 May 2025 16:45:20 +0800
Subject: [PATCH 01/11] Update Run_UniDoc_from_scratch_structure_afdb.py

---
 .../scripts/Run_UniDoc_from_scratch_structure_afdb.py           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
index d1bce3d..b832551 100644
--- a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
+++ b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
@@ -9,7 +9,7 @@
 
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 BINDIR = os.path.join(SCRIPT_DIR, 'bin')
-UNIDOC = os.path.join(BINDIR, 'UniDoc_structure')
+UNIDOC = os.path.join(BINDIR, 'UniDoc_struct')
 STRIDE = os.path.join(BINDIR, 'stride')
 pdb_to_fasta = "pdb_tofasta"
 pdb_selres= "pdb_selres"

From f9be1f7e3dcc1e3889aff4830cb44fec2309e5f4 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Fri, 16 May 2025 15:28:58 +0800
Subject: [PATCH 02/11] Update Run_UniDoc_from_scratch_structure_afdb.py

---
 .../Run_UniDoc_from_scratch_structure_afdb.py | 40 +++++++++++--------
 1 file changed, 23 insertions(+), 17 deletions(-)

diff --git a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
index b832551..4009f2a 100644
--- a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
+++ b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
@@ -74,29 +74,35 @@ def main():
                 if_exist = os.path.exists(pdb_path_chopped)
                 if if_exist and not is_empty:
                     pdb = pdb_path_chopped
-            else:
-                pdb_path_chopped = None
             
-            try:
-                # Run secondary structure calculation with STRIDE
-                subprocess.check_output(f"{STRIDE} {pdb} -r{args.chain} > {pdb_ss} 2> /dev/null", shell=True)
+                try:
+                    # Run secondary structure calculation with STRIDE
+                    subprocess.check_output(f"{STRIDE} {pdb} -r{args.chain} > {pdb_ss} 2> /dev/null", shell=True)
                 
-                # Run UniDoc
-                output = subprocess.check_output(f"{UNIDOC} {pdb} {args.chain} {pdb_ss}", shell=True)
+                    # Run UniDoc
+                    output = subprocess.check_output(f"{UNIDOC} {pdb} {args.chain} {pdb_ss}", shell=True)
                 
-                # Format the output
-                output = str(output, 'utf-8').replace('~','-').replace(',','_').replace('/',',').rstrip('\n')
+                    # Format the output
+                    output = str(output, 'utf-8').replace('~','-').replace(',','_').replace('/',',').rstrip('\n')
 
-                domains = output.split(',')
-                ndoms = len(domains)
-                chopping = ','.join(natsorted(domains))
+                    domains = output.split(',')
+                    ndoms = len(domains)
+                    chopping = ','.join(natsorted(domains))
                 
-                if chopping == '':
-                    chopping = "NULL"
-                    ndoms = 0
+                    if chopping == '':
+                        chopping = "NULL"
+                        ndoms = 0
                 
-            except:
-                chopping = 'NO_SS'
+                except:
+                    pdb_ss = None
+                    pdb_path_chopped = None
+                    chopping = 'NO_SS'
+                    ndoms = 0
+
+            else:
+                pdb_ss = None
+                pdb_path_chopped = None
+                chopping = 'NULL'
                 ndoms = 0
 
             # end_time = time.time() - start_time 

From 5143e37187a5019eeba91210e27c5d883888544f Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Fri, 16 May 2025 15:30:39 +0800
Subject: [PATCH 03/11] Update Run_UniDoc_from_scratch_structure_afdb.py

---
 .../scripts/Run_UniDoc_from_scratch_structure_afdb.py         | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
index 4009f2a..0ec7e6f 100644
--- a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
+++ b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
@@ -118,10 +118,10 @@ def main():
             ))
             
             # Cleanup
-            if os.path.exists(pdb_ss):
+            if pdb_ss is not None and os.path.exists(pdb_ss):
                 os.remove(pdb_ss)
 
-            if os.path.exists(pdb_path_chopped):
+            if pdb_path_chopped is not None and os.path.exists(pdb_path_chopped):
                 os.remove(pdb_path_chopped)
     
 if __name__ == "__main__":

From e530ae6c008ddcf9d0166575bd370c2755e45d31 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 20:39:27 +0800
Subject: [PATCH 04/11] Delete
 ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py

---
 .../Run_UniDoc_from_scratch_structure_afdb.py | 128 ------------------
 1 file changed, 128 deletions(-)
 delete mode 100644 ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py

diff --git a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
deleted file mode 100644
index 0ec7e6f..0000000
--- a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import os
-import argparse
-import subprocess
-import hashlib
-import re
-import time
-
-from natsort import natsorted
-
-SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
-BINDIR = os.path.join(SCRIPT_DIR, 'bin')
-UNIDOC = os.path.join(BINDIR, 'UniDoc_struct')
-STRIDE = os.path.join(BINDIR, 'stride')
-pdb_to_fasta = "pdb_tofasta"
-pdb_selres= "pdb_selres"
-
-resndict = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
-            'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
-            'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
-            'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
-            'PAD': 'X'}
-
-def main():
-    parser = argparse.ArgumentParser()
-    
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument("-i", "--input", type=str, nargs="+", required=False, help="Pass a single file or list of filenames as -i 1ubq.pdb or *.pdb.")
-    group.add_argument("-l", "--list", type=str, required=False, help="Pass a file containing paths to files.")
-    
-    parser.add_argument("-c",dest='chain', required=False, type=str, default='A', help="the chain of parsed protein")
-    parser.add_argument("--out",dest='outfile', required=True, type=str, help="output file to write results to")
-    parser.add_argument("--inherit_chopping", type=str, required=True, default=None, help="Pass a file containing choppings from Merizo or Chainsaw. Rows should match targets in --input.")
-    
-    args = parser.parse_args()
-    
-    if args.input is not None:
-        files = args.input
-
-    if args.list is not None:
-        with open(args.list, 'r') as fn:
-            files = [line.rstrip('\n') for line in fn]
-            
-    chopping_dict = {}
-    with open(args.inherit_chopping, 'r') as fn:
-        for line in fn:
-            line_split = line.rstrip('\n').split('\t')
-            chopping_dict[line_split[0]] = line_split[-2]
-
-    with open(args.outfile, 'w') as fn:
-        for pdb_path in files:
-            start_time = time.time()
-            bn, ext = os.path.splitext(os.path.basename(pdb_path))
-            chopping = chopping_dict[bn]
-
-            pdb = os.path.realpath(pdb_path)
-            pdb_bn, pdb_ext = os.path.splitext(pdb)
-            pdb_ss = pdb + '.ss'
-            
-            fasta = str(subprocess.check_output(f"{pdb_to_fasta} {pdb}", shell=True), 'utf-8').split('\n')[1:-1]
-            seq = ''.join(fasta)
-            
-            md5 = hashlib.md5(seq.encode('utf-8')).hexdigest()
-            nres = len(seq)
-            
-            # pdb_selres.py -1:10,20:30 pdb.pdb
-            # If chopping is provided, then extract all domain residues from PDB using pdb_tools
-            # save as new file and point to the new file
-            if chopping not in ['NULL','NO_SS']:
-                pdb_path_chopped = pdb_bn + '_chopped.pdb'
-                resrng = chopping.replace('-',':').replace('_',',') # convert domain chopping into segments
-                subprocess.check_output(f"{pdb_selres} -{resrng} {pdb_path} > {pdb_path_chopped} 2> /dev/null", shell=True)
-                
-                is_empty = os.stat(pdb_path_chopped).st_size == 0
-                if_exist = os.path.exists(pdb_path_chopped)
-                if if_exist and not is_empty:
-                    pdb = pdb_path_chopped
-            
-                try:
-                    # Run secondary structure calculation with STRIDE
-                    subprocess.check_output(f"{STRIDE} {pdb} -r{args.chain} > {pdb_ss} 2> /dev/null", shell=True)
-                
-                    # Run UniDoc
-                    output = subprocess.check_output(f"{UNIDOC} {pdb} {args.chain} {pdb_ss}", shell=True)
-                
-                    # Format the output
-                    output = str(output, 'utf-8').replace('~','-').replace(',','_').replace('/',',').rstrip('\n')
-
-                    domains = output.split(',')
-                    ndoms = len(domains)
-                    chopping = ','.join(natsorted(domains))
-                
-                    if chopping == '':
-                        chopping = "NULL"
-                        ndoms = 0
-                
-                except:
-                    pdb_ss = None
-                    pdb_path_chopped = None
-                    chopping = 'NO_SS'
-                    ndoms = 0
-
-            else:
-                pdb_ss = None
-                pdb_path_chopped = None
-                chopping = 'NULL'
-                ndoms = 0
-
-            # end_time = time.time() - start_time 
-            
-            fn.write("{}\t{}\t{}\t{}\t{}\t{:.5f}\n".format(
-                bn,
-                md5,
-                nres, 
-                ndoms, 
-                chopping,
-                1.,
-                # end_time,
-            ))
-            
-            # Cleanup
-            if pdb_ss is not None and os.path.exists(pdb_ss):
-                os.remove(pdb_ss)
-
-            if pdb_path_chopped is not None and os.path.exists(pdb_path_chopped):
-                os.remove(pdb_path_chopped)
-    
-if __name__ == "__main__":
-    main()

From eadc339c615875ecfc32f123352be7b449b737f7 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 20:40:10 +0800
Subject: [PATCH 05/11] update

---
 .../Run_UniDoc_from_scratch_structure_afdb.py | 151 ++++++++++++++++++
 1 file changed, 151 insertions(+)
 create mode 100644 ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py

diff --git a/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
new file mode 100644
index 0000000..9b52f5c
--- /dev/null
+++ b/ted_consensus_1.0/scripts/Run_UniDoc_from_scratch_structure_afdb.py
@@ -0,0 +1,151 @@
+import os
+import argparse
+import subprocess
+import hashlib
+import re
+import time
+
+from natsort import natsorted
+
+SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
+BINDIR = os.path.join(SCRIPT_DIR, 'bin')
+UNIDOC = os.path.join(BINDIR, 'UniDoc_struct')
+STRIDE = os.path.join(BINDIR, 'stride')
+pdb_to_fasta = "pdb_tofasta"
+pdb_selres= "pdb_selres"
+
+resndict = {'ALA': 'A', 'ARG': 'R', 'ASN': 'N', 'ASP': 'D', 'CYS': 'C',
+            'GLN': 'Q', 'GLU': 'E', 'GLY': 'G', 'HIS': 'H', 'ILE': 'I',
+            'LEU': 'L', 'LYS': 'K', 'MET': 'M', 'PHE': 'F', 'PRO': 'P',
+            'SER': 'S', 'THR': 'T', 'TRP': 'W', 'TYR': 'Y', 'VAL': 'V',
+            'PAD': 'X'}
+
+def main():
+    """Run UniDoc on the residues selected by an inherited chopping.
+
+    For each input PDB, writes one tab-separated row to --out: basename,
+    sequence md5, residue count, domain count, chopping and a constant 1.00000.
+    """
+    parser = argparse.ArgumentParser()
+    # One of -i/-l is mandatory; without either, `files` below was never bound.
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument("-i", "--input", type=str, nargs="+", required=False, help="Pass a single file or list of filenames as -i 1ubq.pdb or *.pdb.")
+    group.add_argument("-l", "--list", type=str, required=False, help="Pass a file containing paths to files.")
+
+    parser.add_argument("-c",dest='chain', required=False, type=str, default='A', help="the chain of parsed protein")
+    parser.add_argument("--out",dest='outfile', required=True, type=str, help="output file to write results to")
+    parser.add_argument("--inherit_chopping", type=str, required=True, default=None, help="Pass a file containing choppings from Merizo or Chainsaw. Rows should match targets in --input.")
+
+    args = parser.parse_args()
+
+    if args.input is not None:
+        files = args.input
+
+    if args.list is not None:
+        with open(args.list, 'r') as fn:
+            files = [line.rstrip('\n') for line in fn]
+
+    chopping_dict = {}
+    with open(args.inherit_chopping, 'r') as fn:
+        for line in fn:
+            line_split = line.rstrip('\n').split('\t')
+            chopping_dict[line_split[0]] = line_split[-2]
+
+    with open(args.outfile, 'w') as fn:
+        for pdb_path in files:
+            start_time = time.time()
+            bn, ext = os.path.splitext(os.path.basename(pdb_path))
+            chopping = chopping_dict[bn]
+
+            pdb = os.path.realpath(pdb_path)
+            pdb_bn, pdb_ext = os.path.splitext(pdb)
+            pdb_ss = pdb + '.ss'
+
+            fasta = str(subprocess.check_output(f"{pdb_to_fasta} {pdb}", shell=True), 'utf-8').split('\n')[1:-1]
+            seq = ''.join(fasta)
+
+            md5 = hashlib.md5(seq.encode('utf-8')).hexdigest()
+            nres = len(seq)
+
+            # pdb_selres.py -1:10,20:30 pdb.pdb
+            # If chopping is provided, then extract all domain residues from PDB using pdb_tools
+            # save as new file and point to the new file
+            if chopping not in ['NULL','NO_SS']:
+                print(f"Processing chopping for {bn}: {chopping}")
+                pdb_path_chopped = pdb_bn + '_chopped.pdb'
+                resrng = chopping.replace('-',':').replace('_',',') # convert domain chopping into segments
+                print(f"Extracting residues {resrng} from {pdb_path}")
+                subprocess.check_output(f"{pdb_selres} -{resrng} {pdb_path} > {pdb_path_chopped} 2> /dev/null", shell=True)
+
+                if_exist = os.path.exists(pdb_path_chopped)
+                is_empty = if_exist and os.stat(pdb_path_chopped).st_size == 0
+                if if_exist and not is_empty:
+                    print(f"Successfully created chopped PDB file: {pdb_path_chopped}")
+                    pdb = pdb_path_chopped
+                else:
+                    print(f"Warning: Chopped PDB file is empty or does not exist: {pdb_path_chopped}")
+
+                try:
+                    # Run secondary structure calculation with STRIDE
+                    print(f"Running STRIDE on {pdb} for chain {args.chain}")
+                    subprocess.check_output(f"{STRIDE} {pdb} -r{args.chain} > {pdb_ss} 2> /dev/null", shell=True)
+
+                    # Run UniDoc
+                    print(f"Running UniDoc on {pdb}")
+                    # UniDoc_struct looks for ./stride, so run it with BINDIR as the
+                    # child's working directory. cwd= keeps this process's own cwd
+                    # untouched even when UniDoc fails (the os.chdir() pair did not).
+                    output = subprocess.check_output(
+                        f"./{os.path.basename(UNIDOC)} {pdb} {args.chain} {pdb_ss}",
+                        shell=True, cwd=os.path.dirname(UNIDOC))
+                    # Format the output
+                    output = str(output, 'utf-8').replace('~','-').replace(',','_').replace('/',',').rstrip('\n')
+                    print(f"UniDoc output: {output}")
+
+                    domains = output.split(',')
+                    ndoms = len(domains)
+                    chopping = ','.join(natsorted(domains))
+                    print(f"Found {ndoms} domains: {chopping}")
+
+                    if chopping == '':
+                        print("No domains found, setting chopping to NULL")
+                        chopping = "NULL"
+                        ndoms = 0
+
+                except Exception as e:
+                    print(f"Error processing {pdb}: {str(e)}")
+                    chopping = 'NO_SS'
+                    ndoms = 0
+            else:
+                print(f"Skipping processing for {bn} - chopping is {chopping}")
+                pdb_path_chopped = None
+                ndoms = 0
+
+            # end_time = time.time() - start_time
+
+            fn.write("{}\t{}\t{}\t{}\t{}\t{:.5f}\n".format(
+                bn,
+                md5,
+                nres,
+                ndoms,
+                chopping,
+                1.,
+                # end_time,
+            ))
+
+            # Cleanup temporary files
+            try:
+                if os.path.exists(pdb_ss):
+                    os.remove(pdb_ss)
+                    print(f"Cleaned up temporary file: {pdb_ss}")
+
+                if pdb_path_chopped and os.path.exists(pdb_path_chopped):
+                    os.remove(pdb_path_chopped)
+                    print(f"Cleaned up temporary file: {pdb_path_chopped}")
+            except Exception as e:
+                print(f"Warning: Error during cleanup: {str(e)}")
+
+            print("--------------------------------------------------------------")
+    
+if __name__ == "__main__":
+    main()

From 2e1c83674745b609024893dccb66a1fcc4480c66 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 20:44:06 +0800
Subject: [PATCH 06/11] Delete ted_consensus_1.0/run_segmentation.sh

---
 ted_consensus_1.0/run_segmentation.sh | 104 --------------------------
 1 file changed, 104 deletions(-)
 delete mode 100755 ted_consensus_1.0/run_segmentation.sh

diff --git a/ted_consensus_1.0/run_segmentation.sh b/ted_consensus_1.0/run_segmentation.sh
deleted file mode 100755
index 5e76dab..0000000
--- a/ted_consensus_1.0/run_segmentation.sh
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/bin/bash
-
-# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file, 
-# please cite the following paper:
-
-# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.
-
-# Function to display usage message
-usage() {
-    echo "Usage: $0 -i <input_directory_with_pdb_files> -o <output_directory>"
-    exit 1
-}
-
-# Check that the environment exists and activate it 
-VENV_DIR="ted_consensus"
-if [ -d "$VENV_DIR" ]; then
-    source $VENV_DIR/bin/activate
-else
-    echo "Virtual environment 'ted_consensus' does not exist."
-    echo "Please run 'bash setup.sh' to create and set up the virtual environment."
-    exit 1
-fi
-
-# Parse command-line arguments
-while getopts "i:o:" opt; do
-    case $opt in
-        i) INPUT_DIR="$OPTARG" ;;
-        o) OUTPUT_DIR="$OPTARG" ;;
-        *) usage ;;
-    esac
-done
-
-# Check if both input and output directories are provided
-if [ -z "$INPUT_DIR" ] || [ -z "$OUTPUT_DIR" ]; then
-    usage
-fi
-
-# Check if the input directory exists
-if [ ! -d "$INPUT_DIR" ]; then
-    echo "Error: $INPUT_DIR is not a directory"
-    exit 1
-fi
-
-# Create the output directory if it doesn't exist
-if [ ! -d "$OUTPUT_DIR" ]; then
-    mkdir -p "$OUTPUT_DIR"
-fi
-
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-PY=$(which python)
-
-SEGMENT="${SCRIPT_DIR}/scripts/segment.sh"
-CONSENSUS="${SCRIPT_DIR}/scripts/get_consensus.py"
-FILTER_DOMAINS="${SCRIPT_DIR}/scripts/filter_domains_consensus.py"
-
-# Run Merizo on the input directory
-out_merizo="${OUTPUT_DIR}/chopping_merizo.txt"
-log_merizo="${OUTPUT_DIR}/chopping_merizo.log"
-bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1
-
-if test ! -f "${out_merizo}" || test ! -s "${out_merizo}"; then
-    echo "Expected to find chopping file for Merizo at ${out_merizo}!"
-    exit 1
-fi
-
-# Run UniDoc on the Merizo output
-out_unidoc="${OUTPUT_DIR}/chopping_unidoc.txt"
-log_unidoc="${OUTPUT_DIR}/chopping_unidoc.log"
-bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1
-
-if test ! -f "${out_unidoc}" || test ! -s "${out_unidoc}"; then
-    echo "Expected to find chopping file for UniDoc at ${out_unidoc}!"
-    exit 1
-fi
-
-# Run Chainsaw on the input directory
-out_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.txt"
-log_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.log"
-bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1
-
-if test ! -f "${out_chainsaw}" || test ! -s "${out_chainsaw}"; then
-    echo "Expected to find chopping file for Chainsaw at ${out_chainsaw}!"
-    exit 1
-fi
-
-echo "Calculating consensus domains from Merizo, UniDoc and Chainsaw outputs.. "
-
-# Calculate consensus from each of the outputs
-out_consensus="${OUTPUT_DIR}/consensus.tsv"
-log_consensus="${OUTPUT_DIR}/consensus.log"
-"${PY}" "${CONSENSUS}" -c "${out_merizo}" "${out_chainsaw}" "${out_unidoc}" -o "${out_consensus}" > "${log_consensus}" 2>&1
-
-if test -f "${out_consensus}"; then
-    "${PY}" "${FILTER_DOMAINS}" "${out_consensus}" -o "${out_consensus}.tmp"
-
-    if [ $? == 0 ]; then
-        mv "${out_consensus}.tmp" "${out_consensus}"
-    fi
-else
-    echo "Expected to find consensus domain file at ${out_consensus}"
-    exit 1
-fi
-
-echo "Consensus domain file saved at ${out_consensus}"
\ No newline at end of file

From 461ec09ce00d5dd3e68d84c501ef36fb15379777 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 20:44:56 +0800
Subject: [PATCH 07/11] Add files via upload

change
---
 ted_consensus_1.0/run_segmentation.sh | 147 ++++++++++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 ted_consensus_1.0/run_segmentation.sh

diff --git a/ted_consensus_1.0/run_segmentation.sh b/ted_consensus_1.0/run_segmentation.sh
new file mode 100644
index 0000000..ac28301
--- /dev/null
+++ b/ted_consensus_1.0/run_segmentation.sh
@@ -0,0 +1,147 @@
+#!/bin/bash
+
+# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file, 
+# please cite the following paper:
+
+# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.
+
+# Function to display usage message
+usage() {
+    echo "Usage: $0 -i <input_directory_with_pdb_files> -o <output_directory>"
+    exit 1
+}
+
+# Check that the environment exists next to this script and activate it
+BASE_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+VENV_DIR="${BASE_DIR}/ted_consensus"
+if [ -d "$VENV_DIR" ]; then
+    source "$VENV_DIR/bin/activate"
+else
+    echo "Virtual environment 'ted_consensus' does not exist."
+    echo "Please run 'bash setup.sh' to create and set up the virtual environment."
+    exit 1
+fi
+
+# Parse command-line arguments
+while getopts "i:o:" opt; do
+    case $opt in
+        i) INPUT_DIR="$OPTARG" ;;
+        o) OUTPUT_DIR="$OPTARG" ;;
+        *) usage ;;
+    esac
+done
+
+# Check if both input and output directories are provided
+if [ -z "$INPUT_DIR" ] || [ -z "$OUTPUT_DIR" ]; then
+    usage
+fi
+
+# Check if the input directory exists
+if [ ! -d "$INPUT_DIR" ]; then
+    echo "Error: $INPUT_DIR is not a directory"
+    exit 1
+fi
+
+# Create the output directory if it doesn't exist
+if [ ! -d "$OUTPUT_DIR" ]; then
+    mkdir -p "$OUTPUT_DIR"
+fi
+
+# Resolve helper scripts relative to this file so the checkout can live anywhere
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
+PY=$(which python)
+SEGMENT="${SCRIPT_DIR}/scripts/segment.sh"
+CONSENSUS="${SCRIPT_DIR}/scripts/get_consensus.py"
+FILTER_DOMAINS="${SCRIPT_DIR}/scripts/filter_domains_consensus.py"
+
+# Count the *.pdb files directly inside INPUT_DIR; each stage below compares this with its output's line count
+input_pdb_count=$(find "${INPUT_DIR}" -maxdepth 1 -name '*.pdb' | wc -l)
+
+# Run Merizo on the input directory
+out_merizo="${OUTPUT_DIR}/chopping_merizo.txt"
+log_merizo="${OUTPUT_DIR}/chopping_merizo.log"
+if test -f "${out_merizo}"; then
+    # Output file exists: compare its line count with the number of input structures
+    merizo_count=$(wc -l < "${out_merizo}")
+    # Counts match: keep the existing output and skip the run
+    if [ "${merizo_count}" -eq "${input_pdb_count}" ]; then
+        echo "${out_merizo} already exists"
+    # Counts differ: delete the output file and log file, then run again
+    else
+        rm -f "${out_merizo}" "${log_merizo}"
+        bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1
+    fi
+# Output file does not exist: run directly
+else
+    rm -f "${out_merizo}" "${log_merizo}"
+    bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1
+fi
+
+if test ! -f "${out_merizo}" || test ! -s "${out_merizo}"; then
+    echo "Expected to find chopping file for Merizo at ${out_merizo}!"
+    exit 1
+fi
+
+# Run UniDoc on the Merizo output (passed via -c), unless a complete result already exists
+out_unidoc="${OUTPUT_DIR}/chopping_unidoc.txt"
+log_unidoc="${OUTPUT_DIR}/chopping_unidoc.log"
+if test -f "${out_unidoc}"; then
+    # Output file exists: rerun only when its line count differs from the input count
+    unidoc_count=$(wc -l < "${out_unidoc}")
+    if [ "${unidoc_count}" -eq "${input_pdb_count}" ]; then
+        echo "${out_unidoc} already exists"
+    else
+        rm -f "${out_unidoc}" "${log_unidoc}"
+        bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1
+    fi
+else
+    rm -f "${out_unidoc}" "${log_unidoc}"
+    bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1
+fi
+
+if test ! -f "${out_unidoc}" || test ! -s "${out_unidoc}"; then
+    echo "Expected to find chopping file for UniDoc at ${out_unidoc}!"
+    exit 1
+fi
+
+# Run Chainsaw on the input directory, unless a complete result already exists
+out_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.txt"
+log_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.log"
+if test -f "${out_chainsaw}"; then
+    # Output file exists: rerun only when its line count differs from the input count
+    chainsaw_count=$(wc -l < "${out_chainsaw}")
+    if [ "${chainsaw_count}" -eq "${input_pdb_count}" ]; then
+        echo "${out_chainsaw} already exists"
+    else
+        rm -f "${out_chainsaw}" "${log_chainsaw}"
+        bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1
+    fi
+else
+    rm -f "${out_chainsaw}" "${log_chainsaw}"
+    bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1
+fi
+
+if test ! -f "${out_chainsaw}" || test ! -s "${out_chainsaw}"; then
+    echo "Expected to find chopping file for Chainsaw at ${out_chainsaw}!"
+    exit 1
+fi
+
+
+echo "Calculating consensus domains from Merizo, UniDoc and Chainsaw outputs.. "
+# Calculate consensus from each of the outputs
+out_consensus="${OUTPUT_DIR}/consensus.tsv"
+log_consensus="${OUTPUT_DIR}/consensus.log"
+"${PY}" "${CONSENSUS}" -c "${out_merizo}" "${out_chainsaw}" "${out_unidoc}" -o "${out_consensus}" > "${log_consensus}" 2>&1
+
+if test -f "${out_consensus}"; then
+    # Replace the consensus file only when filtering succeeded; otherwise say so instead of failing silently
+    if "${PY}" "${FILTER_DOMAINS}" "${out_consensus}" -o "${out_consensus}.tmp"; then
+        mv "${out_consensus}.tmp" "${out_consensus}"
+    else
+        echo "Warning: domain filtering failed; ${out_consensus} is left unfiltered." >&2
+    fi
+else
+    echo "Expected to find consensus domain file at ${out_consensus}"
+    exit 1
+fi
+echo "Consensus domain file saved at ${out_consensus}"

From fd0ca1b0920d6234413cf3a20164d747ba4db561 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 20:50:00 +0800
Subject: [PATCH 08/11] Delete ted_consensus_1.0/scripts/segment.sh

---
 ted_consensus_1.0/scripts/segment.sh | 101 ---------------------------
 1 file changed, 101 deletions(-)
 delete mode 100644 ted_consensus_1.0/scripts/segment.sh

diff --git a/ted_consensus_1.0/scripts/segment.sh b/ted_consensus_1.0/scripts/segment.sh
deleted file mode 100644
index 6cbfabd..0000000
--- a/ted_consensus_1.0/scripts/segment.sh
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/bin/bash
-
-# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file, 
-# please cite the following paper:
-
-# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.
-
-# Script for running segmentation methods given a directory of structures.
-
-# Usage: 
-# bash run_segment_afdb.sh -i <structure_directory> -m <merizo/unidoc/chainsaw>
-
-set -eu
-
-# Directories and paths
-SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
-PROG_DIR="${SCRIPT_DIR}/../programs"
-
-py=$(which python)
-custom_chopping=''
-
-FILTER_DOMAINS="${SCRIPT_DIR}/filter_domains.py"
-
-while getopts ":i:m:o:c:" opt; do
-  case $opt in
-    i) inputs=$(readlink -f "$OPTARG") ;;
-    m) method=$OPTARG ;;
-    o) output=$(readlink -f "$OPTARG") ;;
-    c) custom_chopping=${OPTARG} ;;
-    \?)
-      echo "Invalid option: -$OPTARG" >&2
-      exit 1
-      ;;
-    :)
-      echo "Option -$OPTARG requires an argument." >&2
-      exit 1
-      ;;
-  esac
-done
-
-# Check if both options are provided
-if [[ -z "${inputs}" || -z "${method}" || -z "${output}" ]]; then
-  echo "Usage: run_segment_afdb.sh -i <structure_directory> -m <merizo/unidoc/chainsaw> -o <output_directory> [-c <chopping>]"
-  exit 1
-fi
-
-case $method in
-  "merizo")
-    RUN_SCRIPT="${PROG_DIR}/merizo/predict_afdb.py"
-    OFFSET_RESI=0
-    ;;
-  "unidoc")
-    RUN_SCRIPT="${PROG_DIR}/unidoc/Run_UniDoc_from_scratch_structure_afdb.py"
-    OFFSET_RESI=0
-    ;;
-  "chainsaw")
-    RUN_SCRIPT="${PROG_DIR}/chainsaw/get_predictions.py"
-    OFFSET_RESI=1
-    ;;
-  *)
-    echo "Invalid method: ${method}. Allowed options are 'merizo', 'unidoc', or 'chainsaw'."
-    exit 1
-    ;;
-esac
-
-# Run method
-output_file="${output%/}/chopping_${method}.txt"
-
-echo "Running ${method} on targets in ${inputs}"
-
-# Each method will take the list containing the paths to the targets
-if [ "${method}" = "merizo" ] || [ "${method}" = "unidoc" ]; then
-    target_list="${output%/}targets.txt"
-    readlink -f "${inputs}/"*.pdb > "${target_list}"
-
-    if [[ ${custom_chopping} == '' ]]; then
-        ${py} "${RUN_SCRIPT}" -l "${target_list}" --out "${output_file}"
-    else
-        ${py} "${RUN_SCRIPT}" -l "${target_list}" --out "${output_file}" --inherit_chopping "${custom_chopping}"
-    fi
-
-    # Cleanup
-    if test -f "${target_list}"; then
-        rm "${target_list}"
-    fi
-
-elif [ "${method}" = "chainsaw" ]; then
-    ${py} "${RUN_SCRIPT}" --structure_directory "${inputs}" -o "${output_file}" --append
-fi
-
-# Filter choppings to remove small segments and single-residue domains
-if test -f "${output_file}"; then
-    "${py}" "${FILTER_DOMAINS}" "${output_file}" -o "${output_file}.tmp" --offset_resi "${OFFSET_RESI}"
-
-    if [ $? == 0 ]; then
-        mv "${output_file}.tmp" "${output_file}"
-    fi
-else
-    echo "Expected to find output file at ${output_file}"
-    exit 1
-fi

From c5a1a043cb82587f43eb956a2a0091574334cecc Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 20:50:24 +0800
Subject: [PATCH 09/11] Add files via upload

---
 ted_consensus_1.0/scripts/segment.sh | 101 +++++++++++++++++++++++++++
 1 file changed, 101 insertions(+)
 create mode 100644 ted_consensus_1.0/scripts/segment.sh

diff --git a/ted_consensus_1.0/scripts/segment.sh b/ted_consensus_1.0/scripts/segment.sh
new file mode 100644
index 0000000..dfc5302
--- /dev/null
+++ b/ted_consensus_1.0/scripts/segment.sh
@@ -0,0 +1,101 @@
+#!/bin/bash
+
+# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file, 
+# please cite the following paper:
+
+# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.
+
+# Script for running segmentation methods given a directory of structures.
+
+# Usage:
+# bash segment.sh -i <structure_directory> -m <merizo/unidoc/chainsaw> -o <output_directory> [-c <chopping>]
+
+set -eu  # abort on the first failing command (-e) and on any reference to an unset variable (-u)
+
+# Directories and paths
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )  # absolute directory holding this script
+PROG_DIR="${SCRIPT_DIR}/../programs"  # sibling "programs" directory that holds the per-method prediction scripts
+
+py=$(which python)  # first "python" found on PATH
+custom_chopping=''  # stays empty unless -c is given
+
+FILTER_DOMAINS="${SCRIPT_DIR}/filter_domains.py"  # filter script run on the chopping file at the end of this script
+
+while getopts ":i:m:o:c:" opt; do  # leading ":" = silent mode; errors are reported by the \? and : cases below
+  case $opt in
+    i) inputs=$(readlink -f "$OPTARG") ;;  # structure directory, canonicalised
+    m) method=$OPTARG ;;  # segmentation method name, validated further down
+    o) output=$(readlink -f "$OPTARG") ;;  # output directory, canonicalised
+    c) custom_chopping=${OPTARG} ;;  # optional chopping forwarded as --inherit_chopping
+    \?)  # unknown option letter
+      echo "Invalid option: -$OPTARG" >&2
+      exit 1
+      ;;
+    :)  # option given without its required argument
+      echo "Option -$OPTARG requires an argument." >&2
+      exit 1
+      ;;
+  esac
+done
+
+# Check that -i, -m and -o were all given; the ":-" default keeps "set -u" from aborting before the usage text prints
+if [[ -z "${inputs:-}" || -z "${method:-}" || -z "${output:-}" ]]; then
+  echo "Usage: $0 -i <structure_directory> -m <merizo/unidoc/chainsaw> -o <output_directory> [-c <chopping>]"
+  exit 1
+fi
+
+case $method in  # pick the prediction script and the --offset_resi value later passed to the domain filter
+  "merizo")
+    RUN_SCRIPT="${PROG_DIR}/merizo/predict_afdb.py"
+    OFFSET_RESI=0
+    ;;
+  "unidoc")
+    RUN_SCRIPT="${PROG_DIR}/unidoc/Run_UniDoc_from_scratch_structure_afdb.py"
+    OFFSET_RESI=0
+    ;;
+  "chainsaw")
+    RUN_SCRIPT="${PROG_DIR}/chainsaw/get_predictions.py"
+    OFFSET_RESI=1
+    ;;
+  *)  # any other method name is rejected
+    echo "Invalid method: ${method}. Allowed options are 'merizo', 'unidoc', or 'chainsaw'."
+    exit 1
+    ;;
+esac
+
+# Run method; the chopping is written to <output>/chopping_<method>.txt
+output_file="${output%/}/chopping_${method}.txt"
+
+echo "Running ${method} on targets in ${inputs}"
+
+# Merizo and UniDoc take a file listing the target paths; Chainsaw takes the structure directory itself
+if [ "${method}" = "merizo" ] || [ "${method}" = "unidoc" ]; then
+    target_list="${output%/}/targets.txt"  # temporary list, kept inside the output directory
+    find "${inputs}/" -type f -name "*.pdb" -printf "%p\n" > "${target_list}"
+
+    if [[ ${custom_chopping} == '' ]]; then
+        "${py}" "${RUN_SCRIPT}" -l "${target_list}" --out "${output_file}"
+    else
+        "${py}" "${RUN_SCRIPT}" -l "${target_list}" --out "${output_file}" --inherit_chopping "${custom_chopping}"
+    fi
+
+    # Cleanup
+    if test -f "${target_list}"; then
+        rm "${target_list}"
+    fi
+
+elif [ "${method}" = "chainsaw" ]; then
+    "${py}" "${RUN_SCRIPT}" --structure_directory "${inputs}" -o "${output_file}" --append
+fi
+
+# Filter choppings to remove small segments and single-residue domains
+if test -f "${output_file}"; then
+    "${py}" "${FILTER_DOMAINS}" "${output_file}" -o "${output_file}.tmp" --offset_resi "${OFFSET_RESI}"
+
+    if [ $? == 0 ]; then  # always true when reached: under "set -e" a failing filter run has already ended the script
+        mv "${output_file}.tmp" "${output_file}"  # replace the raw chopping file with the filtered one
+    fi
+else  # the selected method did not leave a chopping file behind
+    echo "Expected to find output file at ${output_file}"
+    exit 1
+fi

From 830225603cb621d832447c3a506abbe508bd7b21 Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 21:03:02 +0800
Subject: [PATCH 10/11] Delete ted_consensus_1.0/run_segmentation.sh

---
 ted_consensus_1.0/run_segmentation.sh | 147 --------------------------
 1 file changed, 147 deletions(-)
 delete mode 100644 ted_consensus_1.0/run_segmentation.sh

diff --git a/ted_consensus_1.0/run_segmentation.sh b/ted_consensus_1.0/run_segmentation.sh
deleted file mode 100644
index ac28301..0000000
--- a/ted_consensus_1.0/run_segmentation.sh
+++ /dev/null
@@ -1,147 +0,0 @@
-#!/bin/bash
-
-# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file, 
-# please cite the following paper:
-
-# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.
-
-# Function to display usage message
-usage() {
-    echo "Usage: $0 -i <input_directory_with_pdb_files> -o <output_directory>"
-    exit 1
-}
-
-# Check that the environment exists and activate it 
-BASE_DIR="/root/data/ted-tools-main/ted_consensus_1.0"
-VENV_DIR="${BASE_DIR}/ted_consensus"
-if [ -d "$VENV_DIR" ]; then
-    source $VENV_DIR/bin/activate
-else
-    echo "Virtual environment 'ted_consensus' does not exist."
-    echo "Please run 'bash setup.sh' to create and set up the virtual environment."
-    exit 1
-fi
-
-# Parse command-line arguments
-while getopts "i:o:" opt; do
-    case $opt in
-        i) INPUT_DIR="$OPTARG" ;;
-        o) OUTPUT_DIR="$OPTARG" ;;
-        *) usage ;;
-    esac
-done
-
-# Check if both input and output directories are provided
-if [ -z "$INPUT_DIR" ] || [ -z "$OUTPUT_DIR" ]; then
-    usage
-fi
-
-# Check if the input directory exists
-if [ ! -d "$INPUT_DIR" ]; then
-    echo "Error: $INPUT_DIR is not a directory"
-    exit 1
-fi
-
-# Create the output directory if it doesn't exist
-if [ ! -d "$OUTPUT_DIR" ]; then
-    mkdir -p "$OUTPUT_DIR"
-fi
-
-SCRIPT_DIR="/root/data/ted-tools-main/ted_consensus_1.0"
-PY=$(which python)
-
-SEGMENT="${SCRIPT_DIR}/scripts/segment.sh"
-CONSENSUS="${SCRIPT_DIR}/scripts/get_consensus.py"
-FILTER_DOMAINS="${SCRIPT_DIR}/scripts/filter_domains_consensus.py"
-
-# Calculate input data
-input_pdb_count=$(find "${INPUT_DIR}" -maxdepth 1 -name '*.pdb' | wc -l)
-
-# Run Merizo on the input directory
-out_merizo="${OUTPUT_DIR}/chopping_merizo.txt"
-log_merizo="${OUTPUT_DIR}/chopping_merizo.log"
-if test -f "${out_merizo}"; then
-    # Output file exists, check the line count
-    merizo_count=$(wc -l < "${out_merizo}")
-    # Result count is equal, skip execution
-    if [ "${merizo_count}" -eq "${input_pdb_count}" ]; then
-        echo "${out_merizo} already exists"
-    # Result count is not equal, delete the output file and log file, and execute again
-    else
-        rm -f "${out_merizo}" "${log_merizo}"
-        bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1
-    fi
-# Output file does not exist, execute directly
-else
-    rm -f "${out_merizo}" "${log_merizo}"
-    bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1
-fi
-
-if test ! -f "${out_merizo}" || test ! -s "${out_merizo}"; then
-    echo "Expected to find chopping file for Merizo at ${out_merizo}!"
-    exit 1
-fi
-
-# Run UniDoc on the Merizo output
-out_unidoc="${OUTPUT_DIR}/chopping_unidoc.txt"
-log_unidoc="${OUTPUT_DIR}/chopping_unidoc.log"
-if test -f "${out_unidoc}"; then
-    # Output file exists, check the line count
-    unidoc_count=$(wc -l < "${out_unidoc}")
-    if [ "${unidoc_count}" -eq "${input_pdb_count}" ]; then
-        echo "${out_unidoc} already exists"
-    else
-        rm -f "${out_unidoc}" "${log_unidoc}"
-	bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1
-    fi
-else
-    rm -f "${out_unidoc}" "${log_unidoc}"
-    bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1
-fi
-
-if test ! -f "${out_unidoc}" || test ! -s "${out_unidoc}"; then
-    echo "Expected to find chopping file for UniDoc at ${out_unidoc}!"
-    exit 1
-fi
-
-# Run Chainsaw on the input directory
-out_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.txt"
-log_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.log"
-if test -f "${out_chainsaw}"; then
-    # Output file exists, check the line count
-    chainsaw_count=$(wc -l < "${out_chainsaw}")
-    if [ "${chainsaw_count}" -eq "${input_pdb_count}" ]; then
-        echo "${out_chainsaw} already exists"
-    else
-        rm -f "${out_chainsaw}" "${log_chainsaw}"
-        bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1
-	fi
-else
-    rm -f "${out_chainsaw}" "${log_chainsaw}"
-    bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1
-fi
-
-if test ! -f "${out_chainsaw}" || test ! -s "${out_chainsaw}"; then
-    echo "Expected to find chopping file for Chainsaw at ${out_chainsaw}!"
-    exit 1
-fi
-
-
-echo "Calculating consensus domains from Merizo, UniDoc and Chainsaw outputs.. "
-# Calculate consensus from each of the outputs
-out_consensus="${OUTPUT_DIR}/consensus.tsv"
-log_consensus="${OUTPUT_DIR}/consensus.log"
-"${PY}" "${CONSENSUS}" -c "${out_merizo}" "${out_chainsaw}" "${out_unidoc}" -o "${out_consensus}" > "${log_consensus}" 2>&1
-
-if test -f "${out_consensus}"; then
-    "${PY}" "${FILTER_DOMAINS}" "${out_consensus}" -o "${out_consensus}.tmp"
-
-    if [ $? == 0 ]; then
-        mv "${out_consensus}.tmp" "${out_consensus}"
-    fi
-else
-    echo "Expected to find consensus domain file at ${out_consensus}"
-    exit 1
-fi
-
-echo "Consensus domain file saved at ${out_consensus}"

From dd26f36848b0d3bd2ce92f97eb1dbbd8b83f52df Mon Sep 17 00:00:00 2001
From: Weiyin Wu <59970244+TigerWindWood@users.noreply.github.com>
Date: Thu, 29 May 2025 21:04:31 +0800
Subject: [PATCH 11/11] Add files via upload

---
 ted_consensus_1.0/run_segmentation.sh | 104 ++++++++++++++++++++++++++
 1 file changed, 104 insertions(+)
 create mode 100644 ted_consensus_1.0/run_segmentation.sh

diff --git a/ted_consensus_1.0/run_segmentation.sh b/ted_consensus_1.0/run_segmentation.sh
new file mode 100644
index 0000000..5e76dab
--- /dev/null
+++ b/ted_consensus_1.0/run_segmentation.sh
@@ -0,0 +1,104 @@
+#!/bin/bash
+
+# This file is a part of TED: The Encyclopedia of Domains. If you utilize or reference any content from this file, 
+# please cite the following paper:
+
+# Lau et al., 2024. Exploring structural diversity across the protein universe with The Encyclopedia of Domains.
+
+# Print the command-line synopsis and terminate the script with exit status 1
+usage() {
+    echo "Usage: $0 -i <input_directory_with_pdb_files> -o <output_directory>"
+    exit 1
+}
+
+# Activate the 'ted_consensus' virtual environment: use ./ted_consensus if present, else the one next to this script
+VENV_DIR="ted_consensus"
+[ -d "$VENV_DIR" ] || VENV_DIR="$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )/ted_consensus"
+if [ ! -d "$VENV_DIR" ]; then
+    echo "Virtual environment 'ted_consensus' does not exist."
+    echo "Please run 'bash setup.sh' to create and set up the virtual environment."
+    exit 1
+fi
+source "$VENV_DIR/bin/activate"
+
+# Parse command-line arguments (-i input directory, -o output directory)
+while getopts "i:o:" opt; do
+    case $opt in
+        i) INPUT_DIR="$OPTARG" ;;
+        o) OUTPUT_DIR="$OPTARG" ;;
+        *) usage ;;  # any other option prints the usage text and exits
+    esac
+done
+
+# Both -i and -o are required; print the usage text and exit if either is missing
+if [ -z "$INPUT_DIR" ] || [ -z "$OUTPUT_DIR" ]; then
+    usage
+fi
+
+# The input path must be an existing directory
+if [ ! -d "$INPUT_DIR" ]; then
+    echo "Error: $INPUT_DIR is not a directory"
+    exit 1
+fi
+
+# Create the output directory (including missing parents) if it doesn't exist
+if [ ! -d "$OUTPUT_DIR" ]; then
+    mkdir -p "$OUTPUT_DIR"
+fi
+
+SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )  # absolute directory holding this script
+PY=$(which python)  # first "python" on PATH at this point, i.e. after the virtual environment was activated
+
+SEGMENT="${SCRIPT_DIR}/scripts/segment.sh"  # wrapper that runs one segmentation method
+CONSENSUS="${SCRIPT_DIR}/scripts/get_consensus.py"  # run below on the three chopping files
+FILTER_DOMAINS="${SCRIPT_DIR}/scripts/filter_domains_consensus.py"  # run below on the consensus file
+
+# Run Merizo on the input directory
+out_merizo="${OUTPUT_DIR}/chopping_merizo.txt"
+log_merizo="${OUTPUT_DIR}/chopping_merizo.log"
+bash "${SEGMENT}" -i "${INPUT_DIR}" -m merizo -o "${OUTPUT_DIR}" > "${log_merizo}" 2>&1  # stdout and stderr both go to the log
+
+if test ! -f "${out_merizo}" || test ! -s "${out_merizo}"; then  # missing or empty output counts as failure
+    echo "Expected to find chopping file for Merizo at ${out_merizo}!"
+    exit 1
+fi
+
+# Run UniDoc on the input directory, handing it the Merizo chopping file via -c
+out_unidoc="${OUTPUT_DIR}/chopping_unidoc.txt"
+log_unidoc="${OUTPUT_DIR}/chopping_unidoc.log"
+bash "${SEGMENT}" -i "${INPUT_DIR}" -m unidoc -o "${OUTPUT_DIR}" -c "${out_merizo}" > "${log_unidoc}" 2>&1
+
+if test ! -f "${out_unidoc}" || test ! -s "${out_unidoc}"; then  # missing or empty output counts as failure
+    echo "Expected to find chopping file for UniDoc at ${out_unidoc}!"
+    exit 1
+fi
+
+# Run Chainsaw on the input directory
+out_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.txt"
+log_chainsaw="${OUTPUT_DIR}/chopping_chainsaw.log"
+bash "${SEGMENT}" -i "${INPUT_DIR}" -m chainsaw -o "${OUTPUT_DIR}" > "${log_chainsaw}" 2>&1
+
+if test ! -f "${out_chainsaw}" || test ! -s "${out_chainsaw}"; then  # missing or empty output counts as failure
+    echo "Expected to find chopping file for Chainsaw at ${out_chainsaw}!"
+    exit 1
+fi
+
+echo "Calculating consensus domains from Merizo, UniDoc and Chainsaw outputs.. "
+
+# Calculate consensus from each of the outputs; stdout and stderr of the consensus step go to consensus.log
+out_consensus="${OUTPUT_DIR}/consensus.tsv"
+log_consensus="${OUTPUT_DIR}/consensus.log"
+"${PY}" "${CONSENSUS}" -c "${out_merizo}" "${out_chainsaw}" "${out_unidoc}" -o "${out_consensus}" > "${log_consensus}" 2>&1
+
+if test -f "${out_consensus}"; then
+    "${PY}" "${FILTER_DOMAINS}" "${out_consensus}" -o "${out_consensus}.tmp"
+
+    if [ $? == 0 ]; then  # only replace the consensus file when the filter succeeded; otherwise keep it unfiltered
+        mv "${out_consensus}.tmp" "${out_consensus}"
+    fi
+else
+    echo "Expected to find consensus domain file at ${out_consensus}"
+    exit 1
+fi
+
+echo "Consensus domain file saved at ${out_consensus}"