"""Figure S4: number of distinct Foldseek clusters vs. resampling frequency.

Compares "full" resampling (subsample every cluster member, PDB + synthetic)
against "partial" resampling (keep all PDB members fixed and subsample only
the synthetic BackboneRef members, whose names start with a 2024 date).
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import colorcet  # noqa: F401 -- imported for its registered colormaps

# Global plot styling shared across the figure scripts.
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams["axes.grid"] = False
plt.rc("axes", edgecolor="black")
plt.rc(
    "text.latex",
    preamble=r"\usepackage{newpxtext}\usepackage{newpxmath}\usepackage{commath}\usepackage{mathtools}",
)
plt.rc("font", family="serif", size=16.0, weight="medium")
plt.rc("savefig", dpi=500)
plt.rc("legend", loc="best", fontsize="medium", fancybox=True, framealpha=0.5)
plt.rc("lines", linewidth=2.5, markersize=10, markeredgewidth=2.5)
plt.rc("axes", titlepad=10)


# Foldseek cluster assignments: one row per structure ("member") together
# with the cluster representative it was assigned to.
both = pd.read_csv('pdb_plus_240k/scRMSD_best_240k_plus_pdbFirst_aln0_cluster.tsv', sep='\t',
                   names=['representative', 'member'])

RESAMPLE_FREQS = (0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
N_ITER = 5  # resampling repeats per frequency

# --- Full resampling: subsample all members (PDB + synthetic) together. ---
name = 'PDB + syn'  # FIX: `name` was referenced below but never assigned (NameError at runtime)
output_rows = []
for resample_freq in tqdm.tqdm(RESAMPLE_FREQS):
    for iter_n in range(N_ITER):
        resampled = both.sample(frac=resample_freq, random_state=iter_n, replace=False)
        n_uniq_clust = resampled['representative'].nunique()
        output_rows.append(dict(
            freq=resample_freq,
            iteration=iter_n,
            num_uniq_clust=n_uniq_clust,
            comparison=name,
        ))

# FIX: the freq=1 / freq=0 anchor points were appended inside the frequency
# loop (once per frequency, 10 duplicates each); they are single endpoints and
# belong outside it, mirroring the partial-resampling analysis below.
output_rows.append(dict(freq=1, iteration=0, comparison=name,
                        num_uniq_clust=both.representative.nunique()))
output_rows.append(dict(freq=0, iteration=0, comparison=name, num_uniq_clust=0))

df_full_resamp = pd.DataFrame(output_rows)

# --- Partial resampling: fix the PDB members, subsample only synthetic ones. ---
output_rows = []
for full_df, name in zip((both,), ('PDB + syn',)):
    # Names of synthetic samples start with their generation date ("2024...").
    # Hoisted out of the frequency loop: these splits do not depend on the
    # resampling frequency.
    rest_df = full_df[~full_df['member'].str.startswith("2024")]
    df = full_df[full_df['member'].str.startswith("2024")]
    rest_u = rest_df.representative.nunique()  # clusters contributed by PDB alone

    for resample_freq in tqdm.tqdm(RESAMPLE_FREQS):
        for iter_n in range(N_ITER):
            resampled = df.sample(frac=resample_freq, random_state=iter_n, replace=False)
            n_uniq_clust = resampled['representative'].nunique() + rest_u
            output_rows.append(dict(
                freq=resample_freq,
                iteration=iter_n,
                num_uniq_clust=n_uniq_clust,
                comparison=name,
            ))

    output_rows.append(dict(freq=1, iteration=0, comparison=name,
                            num_uniq_clust=full_df.representative.nunique()))
    output_rows.append(dict(freq=0, iteration=0, comparison=name, num_uniq_clust=0))

df_partial_resamp = pd.DataFrame(output_rows)

plt.figure()
sns.lineplot(data=df_partial_resamp,
             x='freq',
             y='num_uniq_clust',
             label='Partial resampling',
             color='#0D96C9',)

# Full-resampling curve in dark gray for contrast.
# (FIX: the old comment said "in blue"; the color is #404040, a dark gray.)
sns.lineplot(data=df_full_resamp,
             x='freq',
             y='num_uniq_clust',
             label='Full resampling',
             color='#404040',
             )

plt.xlabel("Proportion of data points \nsampled")
plt.ylabel("No. of clusters")
plt.xticks(np.arange(0, 1.1, 0.1), rotation=75)
plt.legend(title=None, loc='best', frameon=False)
plt.title("Number of distinct clusters by\n resampling frequency")

# Remove top and right spines.
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

plt.tight_layout()
#!/usr/bin/env python
# coding: utf-8
"""Figure S5: ProteinMPNN sampling-temperature grid search.

Plots the percentage of BackboneRef backbones that are designable
(per-backbone mean scRMSD <= 2 A over their designed sequences) at each
ProteinMPNN sampling temperature.
"""
# FIX: `pathlib` and `pandas` were each imported twice; duplicates removed.
import pathlib  # noqa: F401 -- notebook leftover, kept to preserve the import surface
import re  # noqa: F401 -- notebook leftover
import pandas as pd
import joblib  # noqa: F401 -- notebook leftover
import numpy as np  # noqa: F401 -- notebook leftover
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling.
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams['axes.grid'] = False
plt.rc('axes', edgecolor='black')

plt.rc("text", usetex=False)
# NOTE(review): this LaTeX preamble has no effect while usetex=False above --
# kept for parity with the sibling figure scripts; confirm before removing.
plt.rc(
    "text.latex",
    preamble=r"\usepackage{newpxtext}\usepackage{newpxmath}\usepackage{commath}\usepackage{mathtools}",
)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
# plt.rc("font", family="serif", size=16.0, weight="medium")
plt.rc("savefig", dpi=500)
plt.rc("legend", loc="best", fontsize="medium", fancybox=True, framealpha=0.5)
plt.rc("lines", linewidth=2.5, markersize=10, markeredgewidth=2.5)
plt.rc("axes", titlepad=10)

colors = ["#BBBBBB", "#33BBEE", "#EE3377", "#009988", "#CC3311", "#0077BB"]
colors = list(reversed(colors))
sns.set_palette(sns.color_palette(colors))
# set mpl palette
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)


# One row per (temperature, backbone, designed sequence) -- presumably; the
# groupby-mean below collapses the per-sequence rows. TODO confirm schema.
df = pd.read_parquet('temperature_scRMSD_gridsearch_results.parquet')


# Average metrics over the designed sequences of each backbone at each temperature.
averages = df.groupby(["temperature",
                       'backbone_pdb'])[
    ["scRMSD", "TM", "aa_length"]
].mean()
averages = averages.reset_index()

averages["pass"] = averages["scRMSD"] <= 2   # designability criterion
averages["pass_tm"] = averages["TM"] >= 0.5  # TM-based criterion (not plotted here)
# FIX: removed unused `bins = np.arange(0, 1100, 100)` leftover.


plt.figure()
# Boolean -> 0/100 so lineplot's mean over backbones is a percentage.
averages['pass_pct'] = averages['pass'] * 100
sns.lineplot(data=averages,
             x='temperature',
             y='pass_pct',
             marker='o',
             color='#0D96C9',
             errorbar=None)

plt.xlabel('Temperature')
plt.ylabel('% backbones with\n(scRMSD < 2Å)')


# Force x-axis tick labels to show at each sampled temperature value.
plt.gca().set_xticks(averages['temperature'].unique())
plt.gca().set_xticklabels(averages['temperature'].unique())

# Remove top and right spines.
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)

# FIX: set the title before tight_layout() so the layout accounts for it
# (previously tight_layout ran first and the title could be clipped).
plt.title('BackboneRef sample designability\nby temperature')
plt.tight_layout()
"""Figures 2b and 2d: novelty and designability of BackboneRef (BR) backbones.

2d: 2-D histogram of backbone length vs. best TM-score against AFDB/UniProt,
    annotated with the Pearson correlation.
2b: ECDF of the per-backbone average scRMSD, with the 2 A designability
    threshold marked.
"""
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Global plot styling.
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams['axes.grid'] = False
plt.rc('axes', edgecolor='black')

plt.rc("text", usetex=False)
plt.rc(
    "text.latex",
    preamble=r"\usepackage{newpxtext}\usepackage{newpxmath}\usepackage{commath}\usepackage{mathtools}",
)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
plt.rc("savefig", dpi=500)
plt.rc("legend", loc="best", fontsize="medium", fancybox=True, framealpha=0.5)
plt.rc("lines", linewidth=2.5, markersize=10, markeredgewidth=2.5)
plt.rc("axes", titlepad=10)
colors = ["#BBBBBB", "#33BBEE", "#EE3377", "#009988", "#CC3311", "#0077BB"]
colors = list(reversed(colors))
sns.set_palette(sns.color_palette(colors))
# set mpl palette
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)

# this should be on the zenodo
df = pd.read_parquet('backbone_novelty_quality_statistics.parquet')

# figure 2d: backbone length vs. max TM-score to any known structure
sns.histplot(df,
             x='length',
             y='max_search_tm')
plt.ylabel('TM-score (AFDB/UniProt)')
plt.xlabel('Backbone length (AA)')
plt.xlim(40, 512)
_ = plt.xticks([100, 200, 300, 400, 500, ])
plt.ylim(0, 1.0)
plt.title('Max. TM-score of BR structures')

# Annotate with the Pearson correlation between length and max TM-score.
# FIX: the old `corr_df.dropna(inplace=True)` mutated a slice of `df`
# (SettingWithCopyWarning; the drop is not guaranteed to take effect).
# Build the cleaned frame in a single expression instead.
corr_df = df[['length', 'max_search_tm']].dropna()
res = stats.pearsonr(corr_df['length'], corr_df['max_search_tm'])
# FIX: `statistic` is the canonical attribute of the pearsonr result object.
r = float(res.statistic)
_ = plt.text(350, 0.85, f'R = {r:.2f}')


# figure 2b: ECDF of per-backbone average scRMSD
plt.figure()
sns.ecdfplot(df['avg_scrmsd'], stat='proportion',
             complementary=False, linewidth=2, color='#0D96C9')
plt.xlabel('Average scRMSD')
plt.ylabel('Percentile')
plt.title('Designability of BR backbones')
# Dashed guide line at the scRMSD = 2 A designability threshold.
plt.axvline(x=2, linestyle='--', color='black', zorder=-1, alpha=0.5)
"""Figure 2c: composition of clusters after co-clustering PDB and BackboneRef (BBR).

For every Foldseek cluster, counts how many members come from the PDB
(member names prefixed "pdb") vs. from BBR, then plots total cluster size
against the number of BBR members, highlighting BBR-only clusters.
"""
import pathlib  # noqa: F401 -- notebook leftover, kept to preserve the import surface
import pandas as pd
import joblib  # noqa: F401 -- notebook leftover
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D  # moved up from mid-file

# Global plot styling.
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams["axes.grid"] = False
plt.rc("axes", edgecolor="black")
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']
plt.rc("savefig", dpi=500)
plt.rc("legend", loc="best", fontsize="medium", fancybox=True, framealpha=0.5)
plt.rc("lines", linewidth=2.5, markersize=10, markeredgewidth=2.5)
plt.rc("axes", titlepad=10)

colors = ["#BBBBBB", "#33BBEE", "#EE3377", "#009988", "#CC3311", "#0077BB"]
colors = list(reversed(colors))
sns.set_palette(sns.color_palette(colors))
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)

together_cluster = pd.read_csv(
    "pdb_plus_240k/scRMSD_best_240k_plus_pdbFirst_default__cluster.tsv",
    sep="\t",
    header=None,
    names=["representative", "member"],
)


# Per-cluster composition. PDB member names look like "pdbXXXX_chain", so
# unique PDB *entries* are counted by stripping the "_chain" suffix.
# FIX: removed the unused `member` accumulator and the dead f/xmin/xmax
# x-limit computation that was never applied to the axes; the parallel lists
# are replaced by a list of row dicts producing the same DataFrame.
rows = []
for representative, members in together_cluster.groupby("representative"):
    is_pdb = members["member"].str.startswith("pdb")

    pdb_names = members["member"][is_pdb].tolist()
    unique_pdb = {s.split("_")[0] for s in pdb_names}

    unique_non_pdb = set(members["member"][~is_pdb].tolist())

    # Unique entries with PDB chains collapsed to one entry each.
    effective_size = len(unique_pdb | unique_non_pdb)

    rows.append(dict(
        representative=representative,
        pdb_count=len(unique_pdb),
        percentage=len(unique_pdb) / effective_size,
        size=len(members),
        effective_size=effective_size,
    ))

cluster_pdb = pd.DataFrame(
    rows,
    columns=["representative", "pdb_count", "percentage", "size", "effective_size"],
)
cluster_pdb["pdb_log"] = np.log(cluster_pdb["pdb_count"] + 1)
cluster_pdb["size_log"] = np.log(cluster_pdb["size"])
# BBR member count = total members minus unique PDB entries.
cluster_pdb["size_syn"] = cluster_pdb["size"] - cluster_pdb["pdb_count"]

cluster_pdb['is_synthetic'] = cluster_pdb['size_syn'] > 0                  # has >= 1 BBR member
cluster_pdb['syn_only'] = cluster_pdb['size_syn'] == cluster_pdb['size']   # all members are BBR


fig, axs = plt.subplots(1, 1)
# Sort so BBR-only clusters are drawn last and sit on top of the others.
cluster_pdb_sorted = cluster_pdb.sort_values('syn_only', ascending=True)

g = sns.scatterplot(
    data=cluster_pdb_sorted,
    x="size",
    y="size_syn",
    alpha=1,
    ax=axs,
    s=20,
    hue='syn_only',
    palette=['#7f7f7f', '#0D96C9'],
    # FIX: the old comment said "Turn off automatic legend" while passing
    # True; the automatic legend is kept here and replaced with custom
    # handles below.
    legend=True,
)

# Log-log axes: cluster sizes span orders of magnitude.
axs.set_xscale("log")
axs.set_yscale('log')

axs.set_xlabel("Cluster size")
axs.set_ylabel("# BBR members")
axs.set_title("Cluster size by num. BBR members")

# Replace the automatic hue legend (True/False labels) with named handles.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#7f7f7f',
           label='PDB + BBRef samples', markersize=6),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#0D96C9',
           label='BBRef samples only', markersize=6),
]
axs.legend(handles=legend_elements, frameon=False, markerscale=2.5)

# Remove top and right spines.
axs.spines['top'].set_visible(False)
axs.spines['right'].set_visible(False)

plt.tight_layout()
BBR members") + +#Create custom legend handles +from matplotlib.lines import Line2D +legend_elements = [ + Line2D([0], [0], marker='o', color='w', markerfacecolor='#7f7f7f', + label='PDB + BBRef samples', markersize=6), + Line2D([0], [0], marker='o', color='w', markerfacecolor='#0D96C9', + label='BBRef samples only', markersize=6) +] +# PDB + BBRef samples" & "BBRef samples only" ? +# also please change "PDB + synthetic samples" to "PDB + BBRef samples" + +# Add custom legend +axs.legend(handles=legend_elements, frameon=False, markerscale=2.5) + +# Remove frame +axs.spines['top'].set_visible(False) +axs.spines['right'].set_visible(False) + +plt.tight_layout() + + + + From 7e45ebdc07eda4add35ba4e1a22c95026aa76c41 Mon Sep 17 00:00:00 2001 From: Alex Lee Date: Tue, 15 Jul 2025 21:51:43 -0700 Subject: [PATCH 2/3] add backboneref amulet files --- analysis/br/foldseek/clust_synalone.yaml | 32 ++ analysis/br/foldseek/clust_together.yaml | 32 ++ analysis/br/foldseek/cluster_pdb_alone.yaml | 34 ++ analysis/br/foldseek/parallell_search.yaml | 47 +++ analysis/br/omegafold/Dockerfile | 15 + analysis/br/omegafold/parallel.yaml | 40 +++ analysis/br/omegafold/single.yaml | 41 +++ analysis/br/pmpnn/Dockerfile | 8 + analysis/br/pmpnn/base_parallel.yaml | 44 +++ analysis/br/pydssp/Dockerfile | 4 + analysis/br/pydssp/pydssp_parallel.yaml | 46 +++ analysis/br/rfdiffusion/Dockerfile | 22 ++ .../br/rfdiffusion/amulet/Dockerfile.amulet | 30 ++ analysis/br/rfdiffusion/amulet/job.yaml | 36 ++ analysis/br/rfdiffusion/amulet/parallel.yaml | 55 +++ analysis/br/rfdiffusion/gen_rfdiff.py | 333 ++++++++++++++++++ 16 files changed, 819 insertions(+) create mode 100644 analysis/br/foldseek/clust_synalone.yaml create mode 100644 analysis/br/foldseek/clust_together.yaml create mode 100644 analysis/br/foldseek/cluster_pdb_alone.yaml create mode 100644 analysis/br/foldseek/parallell_search.yaml create mode 100644 analysis/br/omegafold/Dockerfile create mode 100644 analysis/br/omegafold/parallel.yaml 
create mode 100644 analysis/br/omegafold/single.yaml create mode 100644 analysis/br/pmpnn/Dockerfile create mode 100644 analysis/br/pmpnn/base_parallel.yaml create mode 100644 analysis/br/pydssp/Dockerfile create mode 100644 analysis/br/pydssp/pydssp_parallel.yaml create mode 100644 analysis/br/rfdiffusion/Dockerfile create mode 100644 analysis/br/rfdiffusion/amulet/Dockerfile.amulet create mode 100644 analysis/br/rfdiffusion/amulet/job.yaml create mode 100644 analysis/br/rfdiffusion/amulet/parallel.yaml create mode 100644 analysis/br/rfdiffusion/gen_rfdiff.py diff --git a/analysis/br/foldseek/clust_synalone.yaml b/analysis/br/foldseek/clust_synalone.yaml new file mode 100644 index 0000000..f084f68 --- /dev/null +++ b/analysis/br/foldseek/clust_synalone.yaml @@ -0,0 +1,32 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/fs:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + fseek: + storage_account_name: alexleecold + container_name: foldseek + mount_dir: /mnt/foldseekdb/ + +jobs: +- name: "cluster_together" + #sku: 8C60 # 32 gb 16 cores + #sku: 8C30 + sku: 8C60 + #sku: 10C3 + priority: high + process_count_per_node: 1 + command: + - /usr/local/bin/entrypoint easy-cluster /mnt/data/alexleecold/pdbs/foldseek_best_scRMSD /mnt/data/alexleecold/clustering_experiments/syn_alone_w_seqreplace/syn240k_alone_default_ /tmp diff --git a/analysis/br/foldseek/clust_together.yaml b/analysis/br/foldseek/clust_together.yaml new file mode 100644 index 0000000..d2a0d27 --- /dev/null +++ b/analysis/br/foldseek/clust_together.yaml @@ -0,0 +1,32 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + 
workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/fs:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + fseek: + storage_account_name: alexleecold + container_name: foldseek + mount_dir: /mnt/foldseekdb/ + +jobs: +- name: "cluster_pdb" + #sku: 8C60 # 32 gb 16 cores + #sku: 8C30 + sku: 8C60 + #sku: 10C3 + priority: high + process_count_per_node: 1 + command: + - /usr/local/bin/entrypoint easy-cluster /mnt/data/alexleecold/pdbs/scRMSD_best_240k_plus_pdbFirst /mnt/data/alexleecold/clustering_experiments/pdb_plus_240k/scRMSD_best_240k_plus_pdbFirst_default_ /tmp diff --git a/analysis/br/foldseek/cluster_pdb_alone.yaml b/analysis/br/foldseek/cluster_pdb_alone.yaml new file mode 100644 index 0000000..2777839 --- /dev/null +++ b/analysis/br/foldseek/cluster_pdb_alone.yaml @@ -0,0 +1,34 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/fs:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + image_setup: + - echo "Setup!" 
+ +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + fseek: + storage_account_name: alexleecold + container_name: foldseek + mount_dir: /mnt/foldseekdb/ + +jobs: +- name: "cluster_synthetic" + #sku: 8C60 # 32 gb 16 cores + #sku: 8C30 + sku: 8C60 + #sku: 10C3 + priority: high + process_count_per_node: 1 + command: + - /usr/local/bin/entrypoint easy-cluster /mnt/data/alexleecold/pdbs/pdb_flat_first /mnt/data/alexleecold/clustering_experiments/pdbalone/pdbAlone_flat_first_default_ /tmp diff --git a/analysis/br/foldseek/parallell_search.yaml b/analysis/br/foldseek/parallell_search.yaml new file mode 100644 index 0000000..b526636 --- /dev/null +++ b/analysis/br/foldseek/parallell_search.yaml @@ -0,0 +1,47 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/fs:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + image_setup: + - echo "Setup!" + - echo "Doing somehting!" + # - . 
setup.sh + +# code: +# # $CONFIG_DIR is expanded to the directory of this config file +# local_dir: /home/t-leea/project/ +# ignore: +# - prodata/scripts/tools/foldseek/* +# - plm +# - RFdiffusion + +# data: +# local_dir: /data/uniref50_202401 +# remote_dir: uniref50_202401 + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + fseek: + storage_account_name: alexleecold + container_name: foldseek + mount_dir: /mnt/foldseekdb/ + +jobs: +- name: foldseek + sku: 8C60 + priority: high + process_count_per_node: 1 + command: + #- /usr/local/bin/entrypoint easy-search /mnt/data/omegafold/best_syn_rmsd/0.1 /mnt/foldseekdb/afdb50/afdb50 /mnt/data/alexleecold/omegafold/best-fit-seqs_search.m8 /tmp --format-output "query,target,alntmscore,qtmscore,ttmscore,lddt,prob,evalue,bits" + - /usr/local/bin/entrypoint easy-search /mnt/data/alexleecold/pdbs/foldseek_140316_tosearch /mnt/foldseekdb/afdb50/afdb50 /mnt/data/alexleecold/omegafold/best-fit-seqs_search_140k.m8 /tmp --format-output "query,target,alntmscore,qtmscore,ttmscore,lddt,prob,evalue,bits" + - sleep 1m diff --git a/analysis/br/omegafold/Dockerfile b/analysis/br/omegafold/Dockerfile new file mode 100644 index 0000000..d8049ca --- /dev/null +++ b/analysis/br/omegafold/Dockerfile @@ -0,0 +1,15 @@ +ARG PYTORCH_TAG=2.3.1-cuda12.1-cudnn8-devel +FROM pytorch/pytorch:${PYTORCH_TAG} + +RUN apt-get update \ + && apt-get install --no-install-recommends -y \ + build-essential \ + git \ + wget \ + curl \ + && rm -rf /var/lib/apt/lists/* \ + && apt-get autoremove -y \ + && apt-get clean + +RUN pip install biopython && pip install --no-deps git+https://github.com/alexj-lee/OmegaFold.git && \ + mkdir -p /root/.cache/omegafold_ckpt && wget https://helixon.s3.amazonaws.com/release1.pt -O /root/.cache/omegafold_ckpt/model.pt diff --git a/analysis/br/omegafold/parallel.yaml b/analysis/br/omegafold/parallel.yaml new file mode 100644 index 0000000..789d448 --- /dev/null +++ 
b/analysis/br/omegafold/parallel.yaml @@ -0,0 +1,40 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/omegafold:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + +code: + # $CONFIG_DIR is expanded to the directory of this config file + local_dir: /home/t-leea/project/protprune/scripts + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + +search: + job_template: + name: "{experiment_name:s}_{auto:3s}" + sku: G1-A100 # 32 gb 16 cores + #sku: 10C3 + priority: high + process_count_per_node: 1 + command: + - mkdir /mnt/data/alexleecold/omegafold/0dot1/{dirname} + - omegafold --subbatch_size 448 /mnt/data/alexleecold/pmpnn_results/0dot1/{dirname}.fasta /mnt/data/alexleecold/omegafold/0dot1/{dirname} + submit_args: + env: + SHARED_MEMORY_PERCENT: 0.1 + type: grid + max_trials: 5000 + params: + - name: dirname + values: ['202407262019_xdNQ', '202407201930_ft4g', '202407201950_YEeQ'] diff --git a/analysis/br/omegafold/single.yaml b/analysis/br/omegafold/single.yaml new file mode 100644 index 0000000..a13d133 --- /dev/null +++ b/analysis/br/omegafold/single.yaml @@ -0,0 +1,41 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + + # name: msrresrchlab + # workspace_name: biomlinterns2024 + # resource_group: gcr-singularity-lab + + +environment: + image: alexjlee/ofold:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + image_setup: + - echo "bye" + +code: + # $CONFIG_DIR is expanded to the directory of this config file + local_dir: /home/t-leea/project/protprune/scripts + +storage: + data: + storage_account_name: alexleecold + 
container_name: amulet + mount_dir: /mnt/data/ + +job: + name: "{experiment_name:s}_{auto:3s}" + sku: G1-A100 # 32 gb 16 cores + #sku: 10C3 + priority: high + process_count_per_node: 1 + command: + #- echo {dirname} >> /mnt/data/alexleecold/foldseek_finished_backbones.txt + #- python pmpnn_cdesign.py --directory {dirname} --output_fasta /mnt/data/alexleecold/pmpnn_results/0dot1/{dirname}.fasta --temperature 0.1 --num_seqs 10 + #- touch /mnt/data/alexleecold/pmpnn_results/0dot1/done/{dirname} + - mkdir /mnt/data/alexleecold/omegafold/0dot1/{dirname} + - omegafold --subbatch_size 224 /mnt/data/alexleecold/pmpnn_results/0dot1/{dirname}.fasta /mnt/data/alexleecold/omegafold/0dot1/{dirname} diff --git a/analysis/br/pmpnn/Dockerfile b/analysis/br/pmpnn/Dockerfile new file mode 100644 index 0000000..bf3df7e --- /dev/null +++ b/analysis/br/pmpnn/Dockerfile @@ -0,0 +1,8 @@ +FROM nvcr.io/nvidia/pytorch:23.09-py3 + + +RUN pip install --upgrade pip && \ + pip install "jax[cuda]" \ + -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html +RUN pip install git+https://github.com/sokrypton/ColabDesign + diff --git a/analysis/br/pmpnn/base_parallel.yaml b/analysis/br/pmpnn/base_parallel.yaml new file mode 100644 index 0000000..7fd67e5 --- /dev/null +++ b/analysis/br/pmpnn/base_parallel.yaml @@ -0,0 +1,44 @@ +target: + service: sing + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/colabdesign_jaxgpu:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + +code: + # $CONFIG_DIR is expanded to the directory of this config file + local_dir: /home/t-leea/project/protprune/scripts + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + +search: + job_template: + name: "{experiment_name:s}_{auto:3s}" + sku: 8C7 + priority: high + process_count_per_node: 1 + command: + - mkdir -p 
/mnt/data/alexleecold/pmpnn_results_u50length/{temperature}/done + - python pmpnn_cdesign.py --directory /mnt/data/alexleecold/backbones_u50length/{dirname} --output_fasta /mnt/data/alexleecold/pmpnn_results_u50length/{temperature}/{dirname}.fasta --temperature {temperature} --num_seqs 10 + - touch /mnt/data/alexleecold/pmpnn_results_u50length/{temperature}/done/{dirname} + - sleep 30s + submit_args: + env: + SHARED_MEMORY_PERCENT: 0.1 + CUDA_VISIBLE_DEVICES: "" + JAX_PLATFORMS: "cpu" + type: grid + max_trials: 400 + params: + - name: dirname + values: ['20240810122_mcz2','202408101759_mj8X','202408101946_JehM','202408101946_YPnu','202408102045_dQqr',] + - name: temperature + values: [0.1, 0.2, 0.3, 0.4, 0.6, 0.8, 1.0] \ No newline at end of file diff --git a/analysis/br/pydssp/Dockerfile b/analysis/br/pydssp/Dockerfile new file mode 100644 index 0000000..894cb12 --- /dev/null +++ b/analysis/br/pydssp/Dockerfile @@ -0,0 +1,4 @@ +FROM singularitybase.azurecr.io/base/job/pytorch/acpt-2.2.1-py3.10-cuda12.1:20240312T225111416 as base + +RUN pip install git+https://github.com/alexj-lee/PyDSSP/ +RUN echo "rebuild" diff --git a/analysis/br/pydssp/pydssp_parallel.yaml b/analysis/br/pydssp/pydssp_parallel.yaml new file mode 100644 index 0000000..3567cf9 --- /dev/null +++ b/analysis/br/pydssp/pydssp_parallel.yaml @@ -0,0 +1,46 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/pdp:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + +# code: +# # $CONFIG_DIR is expanded to the directory of this config file +# local_dir: /home/t-leea/project/protprune/scripts + +# data: +# local_dir: /home/t-leea/checkpoints +# remote_dir: alexleecold/rfdiff/checkpoints +# data upload is not required for this example + +storage: + data: + storage_account_name: alexleecold + 
container_name: amulet + mount_dir: /mnt/data/ + +# list of jobs to run, we run 2 jobs in this example +search: + job_template: + name: "{experiment_name:s}_{auto:3s}" + sku: 10C3 + process_count_per_node: 1 + command: + - mkdir -p /mnt/data/alexleecold/pydssp_u50length/done + - pydssp /mnt/data/alexleecold/backbones_u50length/{dirname} -d cpu -o /mnt/data/alexleecold/pydssp_u50length/{dirname}.dssp + - touch /mnt/data/alexleecold/pydssp_u50length/done/{dirname} + - sleep 1m + type: grid + max_trials: 1000 + params: + - name: dirname + # + #values: ['202408101031_9Q6G', '202408101056_tNN7', '202408101241_Nawi', '20240810143_BSEv', '20240810153_3kv1', '20240810153_QVZd', '202408101710_4ysa', '202408101829_OFVA', '20240810182_NGjb', '202408101836_1iPO', '202408101844_WYFR', '202408101856_doJ3', '202408101857_aSfS', '202408101857_v2pP', '202408101912_tj9c', '20240810191_iS0X', '202408101923_5Nvb', '202408101931_Ah6S', '202408101944_aQyo', '202408101949_XhSS', '20240810199_Ui4O', '20240810200_Tx7Q', '20240810202_v0fL', '20240810204_N4p4', '20240810205_w4Gh', '20240810221_LOLh', '202408102319_kyNx', '202408102324_S4h2', '202408102331_iANp', '202408102346_CPZl', '2024081032_EyrB', '20240810341_NPeK', '20240810447_1Y2q', '20240810514_eUY6', '20240810521_kjKj', '20240810523_Gadq', '20240810554_FUtt', '20240810557_BNYP', '2024081055_uzrX', '2024081056_DkZm', '20240810610_J1nr', '2024081069_vj6U', '20240810820_0Lju', '20240811027_cAWF', '20240811030_P8fC', '20240811036_Wm5X', '2024081106_MIxD', '2024081108_oPis', '20240811110_5J8D', '20240811110_ihY4', '202408111110_FoRe', '202408111110_lqKy', '202408111111_DpiL', '202408111112_Zbdn', '202408111112_Znsz', '202408111113_GM8G', '202408111114_X5yj', '202408111115_y1LH', '202408111117_PaIP', '202408111118_j2A6', '202408111118_jns2', '202408111119_JXhw', '202408111119_UblG', '202408111119_Xz4E', '202408111121_W5F1', '202408111122_40Po', '202408111122_BlTy', '202408111123_sI0J', '202408111123_zql1', '202408111126_f93e', 
'20240811112_1wrP', '20240811112_ryBz', '202408111131_UWWa', '202408111135_G2JY', '202408111135_i2rg', '202408111139_snzV', '202408111139_x6PR', '20240811113_1B6d', '202408111146_Sdge', '202408111147_xclR', '20240811114_kCbq', '202408111150_PVeV', '202408111150_WQow', '202408111152_OJVt', '202408111154_9Hh3', '202408111156_8pYe', '202408111156_H5dM', '20240811116_ZzTt', '20240811116_aJEW', '20240811117_JVmW', '20240811119_KlKq', '20240811119_hWI8', '2024081111_31ef', '20240811120_BfQE', '202408111223_Lr1Y', '202408111330_dslw', '20240811149_YLV1', '202408111521_loB5', '202408111521_zkLX', '202408111538_SenI', '20240811178_LiiG', '2024081117_HXv1', '202408111846_MHEn', '202408111946_Iht7', '202408111946_yzRJ', '20240811211_MZCu', '20240811212_YOsI', '20240811212_hDsB', '2024081121_aaJK', '20240811223_e3GQ', '202408112245_FLs0', '20240811226_PyIf', '202408112318_Pgeh', '202408112318_m4le', '20240811235_Bm6S', '20240811235_sjoZ', '20240811236_Ilwu', '20240811236_tQXr', '20240811242_ju7z', '20240811248_8Wc6', '20240811254_R6kb', '20240811257_bxLd', '2024081125_ARnR', '2024081130_NKfI', '20240811326_Khe4', '2024081132_6kjE', '2024081132_Da8p', '20240811331_TyhP', '20240811339_pQWl', '20240811341_vKby', '20240811345_6pe2', '20240811348_9eO3', '2024081134_kazp', '20240811358_U0tA', '20240811359_WFBn', '2024081136_36Hj', '20240811418_9pYy', '20240811427_lcqQ', '20240811428_h5nH', '20240811431_IR50', '20240811436_zKnB', '20240811442_G15y', '20240811443_1uIT', '20240811445_fX4d', '2024081144_oeIR', '2024081145_TqZD', '2024081146_hc5l', '2024081150_h2pO', '2024081150_zmCi', '20240811526_itpZ', '20240811527_ISCX', '20240811531_uvpz', '20240811535_K5h4', '20240811536_g8UF', '20240811536_v0NR', '20240811536_wgcm', '20240811537_JEjV', '20240811537_v9Rk', '20240811539_TDNQ', '20240811540_H92P', '20240811548_Vrwv', '20240811548_bdpu', '20240811549_OLMG', '20240811549_uA2B', '20240811550_X8n2', '20240811551_85Rp', '20240811552_UrPH', '20240811554_yGn2', '20240811556_YooQ', 
'20240811556_k9PD', '2024081156_C4y2', '2024081156_kSzP', '20240811619_9fd8', '20240811619_ylP5', '20240811620_xGKh', '20240811621_25Ob', '20240811626_iLPl', '20240811628_Od4u', '20240811638_rbHY', '2024081164_A2sy', '2024081166_X9KV', '2024081167_b9SN', '2024081168_7jR4', '20240811719_g1rO', '20240811723_QTR8', '20240811728_FEa1', '20240811731_7GB0', '20240811739_mRWl', '20240811743_MrOf', '20240811744_4WMq', '2024081175_YTEi', '20240811843_b9Rb', '20240812042_SbAd', '20240812042_n90s', '202408121118_hcsa', '202408121228_rAvy', '202408121239_0NZH', '202408121239_JIhp', '20240812124_Yje4', '20240812124_yDdP', '202408121316_anfp', '202408121333_9XYJ', '202408121333_DaNc', '202408121414_UnCQ', '202408121422_wiZV', '202408121430_M70S', '202408121434_CPSn', '202408121438_8kK0', '20240812143_V6Ch', '20240812143_ZPOE', '202408121447_vz0N', '202408121513_Xrih', '202408121515_yLgM', '202408121516_8B4D', '202408121516_k062', '202408121516_lPk8', '202408121523_4QmA', '202408121529_BHLb', '202408121535_0HL7', '202408121535_HoBu', '202408121537_8vmZ', '20240812161_LHNd', '20240812163_UDD1', '202408121651_ueIY', '202408121653_bJ7B', '202408121654_44dv', '202408121655_Uylr', '202408121657_NmLF', '202408121659_AZbu', '202408121659_IW62', '202408121714_WJMX', '202408121717_Fn48', '202408121717_eSdx', '202408121744_1dgY', '202408121745_hc8E', '202408121750_kk20', '202408121817_KA7x', '202408121820_Uxq8', '202408121821_HhfA', '202408121830_KCI9', '202408121837_6j27', '202408121912_s9jO', '20240812191_Mz0p', '202408121953_1Dv7', '202408122015_r657', '202408122054_M0GV', '20240812207_ZmzQ', '202408122111_3BJx', '2024081221_U2sO', '2024081221_WLir', '202408122215_M79x', '202408122222_YVhq', '202408122231_j0nj', '202408122231_ndJv', '202408122235_tKkc', '202408122236_weVf', '202408122243_5S5m', '202408122243_9yez', '202408122246_c994', '202408122250_3wX4', '202408122259_VuLL', '202408122340_31el', '202408122345_UB46', '202408122358_36Lb', '202408122358_WgPe', '20240812236_7Cbh', 
'20240812236_ypDO', '20240812237_u9v8', '2024081250_h8SG', '20240812544_wTCr', '20240812556_613o', '20240812721_FPRE', '2024081273_wSCU', '20240813023_iwnW', '20240813024_45db', '20240813036_vLcO', '20240813037_3R8x', '20240813037_xmrj', '20240813043_r244', '202408131113_u3Mh', '202408131113_yeI4', '20240813130_sVhL', '20240813131_G4um', '20240813142_5ZNU', '20240813144_K72c', '202408131525_RXse', '202408131525_TeoX', '202408131548_IfS1', '202408131548_MJQw', '20240813155_Ov5O', '2024081316_Y5MF', '20240813228_80F1', '20240813237_J4i6', '20240813237_MMr1', '2024081323_03ka', '20240813240_MYwN', '20240813240_XLGL', '20240813247_ZqNn', '20240813347_Ba83', '20240813427_mzRM', '20240813436_Enp6', '2024089160_Vb7f', '20240891612_l5Om', '20240891612_o9Gh', '20240891612_t07X', '20240891614_K6ud', '20240891615_CneJ', '20240891616_rFV9', '20240891617_ibtr', '20240891618_cqM1', '20240891621_l0Z4', '20240891623_Dhyb', '20240891627_ANij', '20240891630_Nbke', '20240891635_d33L', '20240891635_kI7Q', '2024089163_pMQ0', '20240891649_6Zrz', '2024089164_Ias9', '2024089164_yaxt', '2024089164_ytiN', '20240891650_f6u4', '20240891651_li7V', '20240891654_hgcB', '2024089165_Pxck', '2024089165_kA8J', '2024089165_w9Pl', '2024089166_8rn3', '2024089166_hW1p', '2024089167_PV4f', '2024089167_rgtR', '2024089167_ywGT', '2024089168_KvwZ', '2024089168_Pq5y', '2024089169_JULm', '2024089169_LxUY', '20240891724_y1uo', '2024089173_pFpY', '20240891754_kU7e', '20240891920_SZQr', '20240891920_rOmz', '20240892034_dkVX', '20240892110_Thvf', '20240892110_uCpL', '20240892119_mZbf', '20240892121_T7X8', '20240892128_n5aU', '20240892133_OZ5F', '20240892136_Jwnh', '20240892141_en36', '20240892143_xuNC', '20240892145_pWJk', '20240892150_kuLx', '20240892158_4BvZ', '20240892159_cFBJ', '2024089217_GgiP', '2024089219_K2Pp', '20240892210_xXve', '20240892211_Qa82', '20240892221_KQqK', '20240892224_O4HK', '20240892224_sPsS', '20240892225_w36I', '2024089222_vgOB', '20240892230_2JGR', '20240892234_rmjM', 
'20240892236_0FxJ', '20240892240_OaL7', '20240892242_Q72R', '20240892245_bxuP', '20240892249_zOCd', '2024089224_hLed', '20240892251_Q0an', '20240892253_PHAX', '20240892255_WFrY', '20240892257_lTdl'] + values: ['202408102331_iANp', '20240891754_kU7e', '2024081175_YTEi', '20240813240_MYwN', '20240811535_K5h4', '20240811331_TyhP', '202408121745_hc8E', '20240813237_MMr1', '202408121655_Uylr'] + #values: ['20240810122_mcz2','202408101759_mj8X','202408101946_JehM','202408101946_YPnu','202408102045_dQqr','202408102342_2JXE','202408102342_tOOV','202408102344_RHup','202408102344_edQV','20240811149_0ubq','20240811149_4Bzu','2024089150_TU6H','2024089150_W2Cw','2024089150_cJ61','2024089150_vnan','20240891510_NhYO','20240891510_SbSn','20240891510_UzPY','20240891510_gsce','20240891510_sfJf','20240891511_bZRN','20240891511_uOeV','20240891512_Odvr','20240891512_UVIc','20240891512_Wlaa','20240891512_aldX','20240891513_TdEy','20240891514_jVks','20240891514_zt6V','20240891516_7qRe','20240891516_YRiU','20240891517_58ki','20240891517_JMDc','20240891517_UBIl','20240891517_fJ9I','20240891518_21cg','20240891518_8K2Z','20240891518_BAiB','20240891518_Yck6','20240891519_OKjQ','20240891519_rG0p','20240891519_zf6C','2024089151_CqmT','20240891521_3uyo','20240891521_sUVH','20240891523_JgVl','20240891523_Zew8','20240891523_fvMb','20240891523_opSm','20240891523_xeO3','20240891525_0E00','20240891525_Dy0W','20240891525_ZKFP','20240891526_rM4K','20240891527_3hS4','20240891527_NTpR','20240891527_RXNH','20240891527_mb1q','20240891527_qJLB','20240891528_MkF9','20240891529_MmSA','2024089152_UP80','2024089152_q9Lz','20240891531_BtMN','20240891533_3inS','20240891533_n5Tx','20240891533_qmEX','20240891534_les6','20240891534_tTM7','20240891535_izEP','20240891535_qYOD','20240891535_zvko','20240891537_2LpY','20240891538_8CsF','20240891538_kQgj','20240891538_n5Yd','20240891539_eEh1','2024089153_Qs5N','20240891540_a05b','20240891540_wVYT','20240891541_hNzW','20240891541_oDrt','20240891541_ouWK','20240891541_r1PK',
'20240891544_pFgf','20240891545_AyGl','20240891545_JJOI','20240891545_OEql','20240891545_wGOp','20240891546_b4JI','20240891546_mAXl','20240891546_rS5J','20240891547_6Cux','20240891547_MmA9','20240891547_clnD','20240891547_jtK7','20240891548_BgF4','20240891548_VDpa','20240891549_ChEi','20240891549_Swik','20240891549_qTlE','20240891549_qt1z','2024089154_JujZ','20240891550_JuTI','20240891550_NNSw','20240891550_n7Ed','20240891550_wBbW','20240891551_RKj7','20240891551_hRsB','20240891552_91gq','20240891552_Awnn','20240891552_CSqk','20240891552_pidW','20240891552_q2Rr','20240891552_yl42','20240891552_zP0L','20240891554_9MJg','20240891555_kDGE','20240891555_lnce','20240891556_pMny','20240891556_z87K','20240891557_V7hd','20240891559_w7i6','2024089155_9jwK','2024089155_PrrY','2024089155_TuJI','2024089155_hZnl','2024089156_BFyN','2024089156_GYAY','2024089156_JpjY','2024089156_N2X9','2024089156_XXa0','2024089157_9iyc','2024089157_9krm','2024089158_3nEd','2024089158_FIgQ','2024089159_Uc65','2024089159_ie9O','2024089160_Wr9r','2024089160_xBna','20240891638_Orhz','20240891649_zYpz','20240891651_Llz8','20240891654_ZTQh','20240891656_OIIP','20240891656_lqmd','20240891659_7xKI','20240891659_RPfq','20240892010_W0ZT','20240892215_szM9'] diff --git a/analysis/br/rfdiffusion/Dockerfile b/analysis/br/rfdiffusion/Dockerfile new file mode 100644 index 0000000..4682494 --- /dev/null +++ b/analysis/br/rfdiffusion/Dockerfile @@ -0,0 +1,22 @@ +FROM nvcr.io/nvidia/pytorch:23.09-py3 + +RUN apt-get update && \ + apt-get install -y \ + wget \ + git +RUN git clone https://github.com/sokrypton/RFdiffusion.git +RUN pip install jedi \ + omegaconf \ + hydra-core \ + icecream \ + pyrsistent +RUN pip install --no-dependencies dgl==2.0.0 -f https://data.dgl.ai/wheels/cu121/repo.html +RUN pip install --no-dependencies e3nn==0.3.3 +RUN pip install opt_einsum_fx +RUN cd RFdiffusion/env/SE3Transformer && \ + pip install . 
+RUN wget -qnc https://files.ipd.uw.edu/krypton/ananas && chmod +x ananas +RUN mv RFdiffusion/* /root +RUN pip install git+https://github.com/sokrypton/ColabDesign.git@v1.1.1 +RUN ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign +ENV DGLBACKEND="pytorch" diff --git a/analysis/br/rfdiffusion/amulet/Dockerfile.amulet b/analysis/br/rfdiffusion/amulet/Dockerfile.amulet new file mode 100644 index 0000000..8b48d99 --- /dev/null +++ b/analysis/br/rfdiffusion/amulet/Dockerfile.amulet @@ -0,0 +1,30 @@ +FROM singularitybase.azurecr.io/base/job/pytorch/acpt-2.2.1-py3.10-cuda12.1:20240312T225111416 as base +# FROM validations/base/singularity-tests as validator + +FROM base +RUN apt-get update && \ + apt install -y \ + wget \ + git && \ + apt-get clean + +RUN git clone https://github.com/alexj-lee/RFdiffusion +RUN pip install jedi \ + omegaconf \ + hydra-core \ + icecream \ + pyrsistent +RUN pip install --no-dependencies dgl==2.0.0 -f https://data.dgl.ai/wheels/cu121/repo.html && \ + pip install --no-dependencies e3nn==0.3.3 && \ + pip install opt_einsum_fx +RUN cd RFdiffusion/env/SE3Transformer && \ + pip install . 
+RUN wget -qnc https://files.ipd.uw.edu/krypton/ananas && chmod +x ananas +RUN mv RFdiffusion/* /root +RUN chmod -R 777 /root +RUN pip install git+https://github.com/sokrypton/ColabDesign.git@v1.1.1 +RUN ln -s /usr/local/lib/python3.*/dist-packages/colabdesign colabdesign +ENV DGLBACKEND="pytorch" +#COPY --from=validator /validations /opt/microsoft/_singularity/validations/ +#ENV SINGULARITY_IMAGE_ACCELERATORY="NVIDIA" +#RUN /opt/microsoft/_singularity/validations/validator.sh diff --git a/analysis/br/rfdiffusion/amulet/job.yaml b/analysis/br/rfdiffusion/amulet/job.yaml new file mode 100644 index 0000000..25c44b6 --- /dev/null +++ b/analysis/br/rfdiffusion/amulet/job.yaml @@ -0,0 +1,36 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + +environment: + image: alexjlee/rfdiff:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + +code: + # $CONFIG_DIR is expanded to the directory of this config file + local_dir: /home/t-leea/project/protprune/scripts + +# data: +# local_dir: /home/t-leea/checkpoints +# remote_dir: alexleecold/rfdiff/checkpoints +# data upload is not required for this example + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + +# list of jobs to run, we run 2 jobs in this example +jobs: +- name: rfdiffusion + sku: 32G1-V100 + process_count_per_node: 2 + command: + - mkdir -p /mnt/data/alexleecold/backbones + - find /mnt/data + - python gen_rfdiff.py --nb_config /mnt/data/alexleecold/rfdiff/checkpoints/negbin_uniref50.yaml --num 20 --gpu 0 --container '' --checkpoint /mnt/data/alexleecold/rfdiff/checkpoints/Base_ckpt.pt --num_threads 16 --output_directory /mnt/data/alexleecold/backbones/ diff --git a/analysis/br/rfdiffusion/amulet/parallel.yaml b/analysis/br/rfdiffusion/amulet/parallel.yaml new file mode 100644 index 
0000000..8a63e94 --- /dev/null +++ b/analysis/br/rfdiffusion/amulet/parallel.yaml @@ -0,0 +1,55 @@ +target: + service: sing + # run "amlt target list aml" to list the names of available AML targets + + # name: msrresrchlab + # workspace_name: biomlinterns2024 + # resource_group: gcr-singularity-lab + name: msrresrchvc + workspace_name: biomlinterns2024 + resource_group: gcr-singularity-resrch + + + # name: msroctovc + # workspace_name: biomlinterns2024 + # resource_group: gcr-singularity-octo + +environment: + image: alexjlee/rfdiff:latest + username: biomlinterns2024cr + registry: biomlinterns2024cr.azurecr.io + image_setup: + - echo "bye" + +code: + # $CONFIG_DIR is expanded to the directory of this config file + local_dir: /home/t-leea/project/protprune/scripts + ignore: + - tools/foldseek + - tools/genie2 + - tools/omegafold + - tools/pmpnn + - tools/pydssp + - training + - scrmsd + + +storage: + data: + storage_account_name: alexleecold + container_name: amulet + mount_dir: /mnt/data/ + +search: + job_template: + name: "{experiment_name:s}_{auto:3s}" + sku: 16G1-V100 + process_count_per_node: 2 + command: # placeholder variable literally does nothing + - python gen_rfdiff.py --nb_config /mnt/data/alexleecold/rfdiff/checkpoints/negbin_uniref50.yaml --placeholder {placeholder} --num 1000 --gpu 0 --container '' --checkpoint /mnt/data/alexleecold/rfdiff/checkpoints/Base_ckpt.pt --num_threads 8 --output_directory /mnt/data/alexleecold/backbones/ --cleanup + type: grid + max_trials: 384 + params: + - name: placeholder + values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 1, 2, 3, 4, 5, 6, 7, 
8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105] + diff --git a/analysis/br/rfdiffusion/gen_rfdiff.py b/analysis/br/rfdiffusion/gen_rfdiff.py new file mode 100644 index 0000000..101da88 --- /dev/null +++ b/analysis/br/rfdiffusion/gen_rfdiff.py @@ -0,0 +1,333 @@ +import yaml +import pathlib +import argparse +import random +import sys +import string +import shutil +from datetime import datetime + +from scipy import stats +import numpy as np +from hydra import initialize, compose + +try: + import docker +except ImportError: + pass + +""" +Last updated 2024-07-11. +TODO: I am realizing this is super suboptimal. There are two sources of +unnecessary overhead: + +1. 
starting the image +Possibly ameliorated by addn of something like below with substitution of +docker exec for docker run: + +import docker + +# Create a Docker client +client = docker.from_env() + +# Run a container in detached mode +container = client.containers.run('my_docker_image', detach=True, name='my_container') + +# Execute a command in the running container +exit_code, output = container.exec_run('command_to_run') + +# Print the output +print(output.decode()) + +# Stop and remove the container +container.stop() +container.remove() + +2. Loading weights and initializing the actual model itself: +Solved probably by editing the script they are using. + +""" + + +def get_timestamp_now(): + now = datetime.datetime.now() + year = now.year + month = str(now.month).zfill(2) + day = now.day + hour = now.hour + second = now.second + + return f"{year}{month}{day}{hour}{second}" + + +def nb_from_file(file: str): + """ + File to read params from a negative binomial parameter file in YAML format. If you want to bypass this, + the script is looking for `n`, `p`, and `loc`. 
+ """ + + fpathlib = pathlib.Path(file) + if not fpathlib.exists(): + raise FileNotFoundError(f"File {file} does not exist.") + + with open(file, "r") as f: + config = yaml.safe_load(f) + + return stats.nbinom( + config.get("n", None), config.get("p", None), config.get("loc", 0) + ) + + +def nb_from_params(n: int, p: float, loc: int): + return stats.nbinom(n, p, loc) + +def generate_random_string(length=6): + characters = string.ascii_letters + string.digits + return "".join(random.choice(characters) for _ in range(length)) + + +def setup_docker(gpu: int): + client = docker.from_env() + device = [docker.types.DeviceRequest(device_ids=[str(gpu)], capabilities=[["gpu"]])] + ulimits = [ # docker complains if these are not added + GPU is desired + docker.types.Ulimit(name="memlock", hard=-1, soft=-1), + docker.types.Ulimit(name="stack", hard=67108864, soft=67108864), + ] + volume = { + "/home": {"bind": "/home", "mode": "rw"}, + "/data": {"bind": "/data", "mode": "rw"}, + } + + return client, device, ulimits, volume + + +def parse_args(): + args = argparse.ArgumentParser() + sampler = args.add_mutually_exclusive_group(required=True) + + sampler.add_argument( + "--nb_config", + type=str, + help="Path to the negative binomial config file", + ) + sampler.add_argument( + "--lengths", + type=str, + help="Path to file of lengths in `.npy` file format.", + ) + + args.add_argument( + "--sequential", + action="store_true", + help="Run the inference sequentially, \ + as opposed to selecting n proteins of differing length from the length distribution and\ + generating those at the same time, meaning all proteins of length l are done in one docker command.", + required=False, + ) + args.add_argument( + "--num", + type=int, + default=5000, + help="Number of proteins to generate.", + required=True, + ) + args.add_argument( + "--gpu", + type=int, + default=0, + help="GPU to use. 
Will be provides a a `docker.types.DeviceRequest` with given ID.", + required=True, + ) + args.add_argument( + "--output_directory", + type=str, + help="Output prefix for the generated proteins. Backbones will then be generated at path: {output_directory}/{DATETIMENOW}{RFDIFFUSION_NORMAL_OUTPUT}\ + where RFDIFFUSION_NORMAL_OUTPUT is the normal string name; we do this in order to prevent collisions between simultaneous runs of this script.", + required=True, + ) + args.add_argument( + "--container", + type=str, + default="rfdiff", + help="Docker container to use.", + required=False, + ) + args.add_argument( + "--checkpoint", + type=str, + help="Path to the checkpoint file to use.", + required=True, + ) + args.add_argument( + "--num_threads", + type=str, + help="Number of threads to force one instance to use (CPU).", + required=False, + default=8, + ) + args.add_argument( + "--cleanup", + action="store_true", + required=False, + help="Whether to delete `traj` files after running. ", + ) + args.add_argument('--maxlen', type=int, default=1024, help='Maximum length of protein to generate.') + args.add_argument( + "--placeholder", + type=str, + help="Doesn't do anything. Just a placeholder to allow for multiple simultaneous amulet runs.", + ) + + args = parse_args() + return args + + +def main(): + args = parse_args() + + if pathlib.Path(args.checkpoint).exists() is False: + raise FileNotFoundError(f"Checkpoint file {args.checkpoint} does not exist.") + + if pathlib.Path(args.output_prefix.parent).exists() is False: + raise ValueError(f"The output prefix desired's path directories do not exist. 
Please fix; parent is: {args.output_prefix.parent}.\n\ + Fix by running `mkdir -p {args.output_prefix.parent}`") + + datetime_stamp = get_timestamp_now() + + container = args.container + if not ((container is None) or (container == "none") or (container == "")): + try: + import docker + + client, device, ulimits, volume = setup_docker(args.gpu) + except ImportError: + raise ImportError( + 'You passed a container name but do not have the docker library installed. \ + Please install the docker library or pass "" and then run this script using amulet.' + ) + else: + client = None + try: + # assume we are on amulet and an appropriate container has been uploaded + # in Sergey's RFDiffusion repo (which we are forking for this image+script) + # the requisite scripting files are moved to /root + # the file structure is roughly something like: + # /root + # ...py files + # /root/config + # /root/config/inference/ + # /root/config/inference/base.yaml + # /root/config/inference/symmetry.yaml + + sys.path.append("/root/") + import run_inference # type: ignore + # this is inside the container + + except ImportError: + raise ImportError( + 'You passed "" for the container meaning you wanted to run on amulet: for this \ + we need to be able to find the `run_inference.py` script in /root on the \ + container filesystem. \ + Please specify a container or install the script.' 
+ ) + + print(f"Client is: {client}") + + rand_str = generate_random_string(length=4) + prefix = pathlib.Path(args.output_directory) / f"{datetime_stamp}_{rand_str}" + prefix.mkdir(parents=True, exist_ok=True) + print("Completed making parent dir:", prefix) + + prefix = prefix.as_posix() # convert to string + + overrides = [ # baseline hydra opts + f"inference.ckpt_override_path={args.checkpoint}", + f"++num_threads={args.num_threads}", + + ] + + if args.nb_config: + sampler = nb_from_file( + args.nb_config + ) # will be stats.nbinom object; see documentation for scipy + random_lengths = sampler.rvs(size=args.num) + else: + random_lengths = np.load(args.lengths) + random_lengths = np.random.choice(random_lengths, size=args.num, replace=True) + + random_lengths = np.minimum(random_lengths, args.maxlen) + random_lengths = np.maximum(random_lengths, 40) + + unique, counts = np.unique(random_lengths, return_counts=True) + + stacked = np.vstack((unique, counts)).T # will be 2 x TOT_NUM + + if random.random() <= 1: # flip half the time + stacked = stacked[::-1] + + for idx, (rand_length, count) in enumerate(stacked): + tnow = datetime.now() + rand_length_zfill = str(int(rand_length)).zfill(5) + + print(f"{tnow} || Generating: {count} {rand_length}-long backbone(s).") + random_tag = generate_random_string(length=6) + + if client is not None: + # cmd = f"python /root/run_inference.py 'contigmap.contigs=[{rand_length}-{rand_length}]' \ + # inference.output_prefix={prefix}/{datetime_stamp}_{rand_length_zfill}AA \ + # inference.num_designs={count} inference.ckpt_override_path={args.checkpoint} ++num_threads={args.num_threads}" + cmd = "python /root/run_inference.py" + extra_cfgs = [ + f"inference.output_prefix={prefix}/{datetime_stamp}_{random_tag}_{rand_length_zfill}AA", + f"contigmap.contigs=[{rand_length}-{rand_length}]", + f"inference.num_designs={count}", + ] + for cfg in extra_cfgs + overrides: + cmd += f" {cfg}" + print(f"Running cmd: {cmd}") + + client.containers.run( 
+ container, + cmd, + device_requests=device, + volumes=volume, + ulimits=ulimits, + ) + + else: + from os import chdir + + chdir( + "/root/" + ) # base IPD & SOvchinnikov code gets copied to /root for some reason + with initialize( + version_base=None, config_path="../../root/config/inference" + ): # file struct is config/inference/{base.yaml, symmetry.yaml} + # see explanation in comment at start of this file to see more details about this + # this is equivalent roughly to: + # python /root/run_inference.py ...[settings]... (example inference.num_designs=3) + cfg = compose( + "base", + overrides=overrides + + [ + f"inference.output_prefix={prefix}/{datetime_stamp}_{random_tag}_{rand_length_zfill}AA", + f"contigmap.contigs=[{rand_length}-{rand_length}]", + f"inference.num_designs={count}", + ], + ) + + run_inference.main(cfg) + if args.cleanup is True: + print("args.cleanup is True: deleting `traj` files.") + pth_to_delete = pathlib.Path(prefix)/'traj' + if pth_to_delete.exists() is False: + print(f"Path to delete {pth_to_delete} does not exist. 
Continuing.") + else: + shutil.rmtree(pth_to_delete) + print("Finished deleting `traj` files.") + + return 0 + + +if __name__ == "__main__": + main() From 21baeb729d65efe2d60680efd66139efa4da87c7 Mon Sep 17 00:00:00 2001 From: Alex Lee Date: Wed, 16 Jul 2025 20:30:14 -0700 Subject: [PATCH 3/3] move br scripts to datasets dir --- {analysis => datasets}/br/foldseek/clust_synalone.yaml | 0 {analysis => datasets}/br/foldseek/clust_together.yaml | 0 {analysis => datasets}/br/foldseek/cluster_pdb_alone.yaml | 0 {analysis => datasets}/br/foldseek/parallell_search.yaml | 0 {analysis => datasets}/br/omegafold/Dockerfile | 0 {analysis => datasets}/br/omegafold/parallel.yaml | 0 {analysis => datasets}/br/omegafold/single.yaml | 0 {analysis => datasets}/br/pmpnn/Dockerfile | 0 {analysis => datasets}/br/pmpnn/base_parallel.yaml | 0 {analysis => datasets}/br/pydssp/Dockerfile | 0 {analysis => datasets}/br/pydssp/pydssp_parallel.yaml | 0 {analysis => datasets}/br/rfdiffusion/Dockerfile | 0 {analysis => datasets}/br/rfdiffusion/amulet/Dockerfile.amulet | 0 {analysis => datasets}/br/rfdiffusion/amulet/job.yaml | 0 {analysis => datasets}/br/rfdiffusion/amulet/parallel.yaml | 0 {analysis => datasets}/br/rfdiffusion/gen_rfdiff.py | 0 16 files changed, 0 insertions(+), 0 deletions(-) rename {analysis => datasets}/br/foldseek/clust_synalone.yaml (100%) rename {analysis => datasets}/br/foldseek/clust_together.yaml (100%) rename {analysis => datasets}/br/foldseek/cluster_pdb_alone.yaml (100%) rename {analysis => datasets}/br/foldseek/parallell_search.yaml (100%) rename {analysis => datasets}/br/omegafold/Dockerfile (100%) rename {analysis => datasets}/br/omegafold/parallel.yaml (100%) rename {analysis => datasets}/br/omegafold/single.yaml (100%) rename {analysis => datasets}/br/pmpnn/Dockerfile (100%) rename {analysis => datasets}/br/pmpnn/base_parallel.yaml (100%) rename {analysis => datasets}/br/pydssp/Dockerfile (100%) rename {analysis => datasets}/br/pydssp/pydssp_parallel.yaml (100%) 
rename {analysis => datasets}/br/rfdiffusion/Dockerfile (100%) rename {analysis => datasets}/br/rfdiffusion/amulet/Dockerfile.amulet (100%) rename {analysis => datasets}/br/rfdiffusion/amulet/job.yaml (100%) rename {analysis => datasets}/br/rfdiffusion/amulet/parallel.yaml (100%) rename {analysis => datasets}/br/rfdiffusion/gen_rfdiff.py (100%) diff --git a/analysis/br/foldseek/clust_synalone.yaml b/datasets/br/foldseek/clust_synalone.yaml similarity index 100% rename from analysis/br/foldseek/clust_synalone.yaml rename to datasets/br/foldseek/clust_synalone.yaml diff --git a/analysis/br/foldseek/clust_together.yaml b/datasets/br/foldseek/clust_together.yaml similarity index 100% rename from analysis/br/foldseek/clust_together.yaml rename to datasets/br/foldseek/clust_together.yaml diff --git a/analysis/br/foldseek/cluster_pdb_alone.yaml b/datasets/br/foldseek/cluster_pdb_alone.yaml similarity index 100% rename from analysis/br/foldseek/cluster_pdb_alone.yaml rename to datasets/br/foldseek/cluster_pdb_alone.yaml diff --git a/analysis/br/foldseek/parallell_search.yaml b/datasets/br/foldseek/parallell_search.yaml similarity index 100% rename from analysis/br/foldseek/parallell_search.yaml rename to datasets/br/foldseek/parallell_search.yaml diff --git a/analysis/br/omegafold/Dockerfile b/datasets/br/omegafold/Dockerfile similarity index 100% rename from analysis/br/omegafold/Dockerfile rename to datasets/br/omegafold/Dockerfile diff --git a/analysis/br/omegafold/parallel.yaml b/datasets/br/omegafold/parallel.yaml similarity index 100% rename from analysis/br/omegafold/parallel.yaml rename to datasets/br/omegafold/parallel.yaml diff --git a/analysis/br/omegafold/single.yaml b/datasets/br/omegafold/single.yaml similarity index 100% rename from analysis/br/omegafold/single.yaml rename to datasets/br/omegafold/single.yaml diff --git a/analysis/br/pmpnn/Dockerfile b/datasets/br/pmpnn/Dockerfile similarity index 100% rename from analysis/br/pmpnn/Dockerfile rename to 
datasets/br/pmpnn/Dockerfile diff --git a/analysis/br/pmpnn/base_parallel.yaml b/datasets/br/pmpnn/base_parallel.yaml similarity index 100% rename from analysis/br/pmpnn/base_parallel.yaml rename to datasets/br/pmpnn/base_parallel.yaml diff --git a/analysis/br/pydssp/Dockerfile b/datasets/br/pydssp/Dockerfile similarity index 100% rename from analysis/br/pydssp/Dockerfile rename to datasets/br/pydssp/Dockerfile diff --git a/analysis/br/pydssp/pydssp_parallel.yaml b/datasets/br/pydssp/pydssp_parallel.yaml similarity index 100% rename from analysis/br/pydssp/pydssp_parallel.yaml rename to datasets/br/pydssp/pydssp_parallel.yaml diff --git a/analysis/br/rfdiffusion/Dockerfile b/datasets/br/rfdiffusion/Dockerfile similarity index 100% rename from analysis/br/rfdiffusion/Dockerfile rename to datasets/br/rfdiffusion/Dockerfile diff --git a/analysis/br/rfdiffusion/amulet/Dockerfile.amulet b/datasets/br/rfdiffusion/amulet/Dockerfile.amulet similarity index 100% rename from analysis/br/rfdiffusion/amulet/Dockerfile.amulet rename to datasets/br/rfdiffusion/amulet/Dockerfile.amulet diff --git a/analysis/br/rfdiffusion/amulet/job.yaml b/datasets/br/rfdiffusion/amulet/job.yaml similarity index 100% rename from analysis/br/rfdiffusion/amulet/job.yaml rename to datasets/br/rfdiffusion/amulet/job.yaml diff --git a/analysis/br/rfdiffusion/amulet/parallel.yaml b/datasets/br/rfdiffusion/amulet/parallel.yaml similarity index 100% rename from analysis/br/rfdiffusion/amulet/parallel.yaml rename to datasets/br/rfdiffusion/amulet/parallel.yaml diff --git a/analysis/br/rfdiffusion/gen_rfdiff.py b/datasets/br/rfdiffusion/gen_rfdiff.py similarity index 100% rename from analysis/br/rfdiffusion/gen_rfdiff.py rename to datasets/br/rfdiffusion/gen_rfdiff.py