Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 94 additions & 0 deletions analysis/fig-s4.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
import colorcet

# Global plot styling: seaborn theme first, then matplotlib rc overrides
# applied in one batch (same keys and values as individual plt.rc calls).
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams.update(
    {
        "axes.grid": False,
        "axes.edgecolor": "black",
        "text.latex.preamble": r"\usepackage{newpxtext}\usepackage{newpxmath}\usepackage{commath}\usepackage{mathtools}",
        "font.family": "serif",
        "font.size": 16.0,
        "font.weight": "medium",
        "savefig.dpi": 500,
        "legend.loc": "best",
        "legend.fontsize": "medium",
        "legend.fancybox": True,
        "legend.framealpha": 0.5,
        "lines.linewidth": 2.5,
        "lines.markersize": 10,
        "lines.markeredgewidth": 2.5,
        "axes.titlepad": 10,
    }
)


# Cluster assignments for the combined PDB + synthetic set: a headerless,
# tab-separated file with one (representative, member) pair per row.
both = pd.read_csv(
    'pdb_plus_240k/scRMSD_best_240k_plus_pdbFirst_aln0_cluster.tsv',
    sep='\t',
    names=['representative', 'member'],
)

# Label written to the `comparison` column. It has to be bound before the
# loop below reads it; the value matches the label used by the partial
# resampling section of this script.
name = 'PDB + syn'

# Full resampling: draw a fraction of ALL members (without replacement) and
# count how many distinct clusters the draw touches. Five seeds per fraction.
output_rows = []
for resample_freq in tqdm.tqdm((0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    for iter_n in range(5):
        # random_state=iter_n keeps every draw reproducible across runs.
        resampled = both.sample(frac=resample_freq, random_state=iter_n, replace=False)
        n_uniq_clust = resampled['representative'].nunique()
        output_rows.append(dict(
            freq=resample_freq,
            iteration=iter_n,
            num_uniq_clust=n_uniq_clust,
            comparison=name,
        ))

# Deterministic end points of the curve: everything sampled / nothing sampled.
output_rows.append(dict(freq=1, iteration=0, comparison=name, num_uniq_clust=both.representative.nunique()))
output_rows.append(dict(freq=0, iteration=0, comparison=name, num_uniq_clust=0))

df_full_resamp = pd.DataFrame(output_rows)

# Partial resampling: every non-synthetic member is always kept and only the
# synthetic members are subsampled. The value recorded is the number of
# distinct clusters covered by (all non-synthetic members + the sample).
output_rows = []
for full_df, name in zip((both,), ('PDB + syn',)):
    # names of synthetic samples will start with date
    is_synthetic = full_df['member'].str.startswith("2024")
    # The split does not depend on the sampling fraction, so do it once.
    rest_df = full_df[~is_synthetic]
    df = full_df[is_synthetic]
    rest_reps = rest_df['representative'].unique()
    rest_u = rest_df.representative.nunique()

    for resample_freq in tqdm.tqdm((0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
        for iter_n in range(5):
            resampled = df.sample(frac=resample_freq, random_state=iter_n, replace=False)
            sampled_reps = resampled['representative']
            # Count only clusters that the always-kept members do not already
            # cover. Adding the two unique counts directly would count a
            # cluster twice when it has both synthetic and non-synthetic
            # members, and could exceed the freq=1 anchor below.
            n_new_clust = sampled_reps[~sampled_reps.isin(rest_reps)].nunique()
            n_uniq_clust = int(n_new_clust + rest_u)
            output_rows.append(dict(
                freq=resample_freq,
                iteration=iter_n,
                num_uniq_clust=n_uniq_clust,
                comparison=name,
            ))

    # Deterministic end points of the curve.
    output_rows.append(dict(freq=1, iteration=0, comparison=name, num_uniq_clust=full_df.representative.nunique()))
    # NOTE(review): with no synthetic members sampled this scheme still covers
    # `rest_u` clusters, so 0 may not be the intended anchor here - confirm.
    output_rows.append(dict(freq=0, iteration=0, comparison=name, num_uniq_clust=0))

df_partial_resamp = pd.DataFrame(output_rows)

plt.figure()

# One line per resampling scheme on shared axes:
# (data frame, legend label, line colour).
resampling_curves = (
    (df_partial_resamp, 'Partial resampling', '#0D96C9'),  # light blue
    (df_full_resamp, 'Full resampling', '#404040'),        # dark grey
)
for curve_df, curve_label, curve_color in resampling_curves:
    sns.lineplot(
        data=curve_df,
        x='freq',
        y='num_uniq_clust',
        label=curve_label,
        color=curve_color,
    )

plt.xlabel("Proportion of data points \nsampled")
plt.ylabel("No. of clusters")
plt.xticks(np.arange(0, 1.1, 0.1), rotation=75)
plt.legend(title=None, loc='best', frameon=False)
plt.title("Number of distinct clusters by\n resampling frequency")

# Hide the top and right borders of the axes.
current_axes = plt.gca()
for side in ('top', 'right'):
    current_axes.spines[side].set_visible(False)

plt.tight_layout()



89 changes: 89 additions & 0 deletions analysis/fig-s5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
#!/usr/bin/env python
# coding: utf-8

# In[1]:


import pathlib
import pandas as pd
import pathlib
import pandas as pd
import joblib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import re

# Global plot styling: seaborn theme first, then matplotlib rc overrides
# applied in one batch (same keys and values as individual plt.rc calls).
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams.update(
    {
        'axes.grid': False,
        'axes.edgecolor': 'black',
        'text.usetex': False,
        'text.latex.preamble': r"\usepackage{newpxtext}\usepackage{newpxmath}\usepackage{commath}\usepackage{mathtools}",
        'font.family': 'sans-serif',
        'font.sans-serif': ['DejaVu Sans'],
        'savefig.dpi': 500,
        'legend.loc': 'best',
        'legend.fontsize': 'medium',
        'legend.fancybox': True,
        'legend.framealpha': 0.5,
        'lines.linewidth': 2.5,
        'lines.markersize': 10,
        'lines.markeredgewidth': 2.5,
        'axes.titlepad': 10,
    }
)

# Colour cycle: the listed colours, used in reverse order, installed for both
# seaborn and plain matplotlib.
colors = ["#BBBBBB", "#33BBEE", "#EE3377", "#009988", "#CC3311", "#0077BB"][::-1]
sns.set_palette(sns.color_palette(colors))
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)


# In[2]:


# Raw grid-search results; the columns used below are temperature,
# backbone_pdb, scRMSD, TM and aa_length.
df = pd.read_parquet('temperature_scRMSD_gridsearch_results.parquet')


# In[3]:


# One row per (temperature, backbone): the mean of each metric over all rows
# that share that pair.
group_keys = ["temperature", 'backbone_pdb']
metric_columns = ["scRMSD", "TM", "aa_length"]
averages = df.groupby(group_keys)[metric_columns].mean().reset_index()

# Boolean pass flags on the averaged metrics.
averages["pass"] = averages["scRMSD"] <= 2
averages["pass_tm"] = averages["TM"] >= 0.5
# Edges 0, 100, ..., 1000.
bins = np.arange(0, 1100, 100)


# In[4]:


plt.figure()
# Scale the boolean pass flag to 0/100 so that the per-temperature aggregate
# drawn by lineplot (the mean, by seaborn's default estimator) reads directly
# as a percentage.
averages['pass_pct'] = averages['pass'] * 100
sns.lineplot(
    data=averages,
    x='temperature',
    y='pass_pct',
    marker='o',
    color='#0D96C9',
    errorbar=None,
)

plt.xlabel('Temperature')
plt.ylabel('% backbones with\n(scRMSD < 2Å)')
plt.title('BackboneRef sample designability\nby temperature')

ax = plt.gca()

# Put one tick, with a visible label, at every temperature that was sampled.
temperatures = averages['temperature'].unique()
ax.set_xticks(temperatures)
ax.set_xticklabels(temperatures)

# Remove top and right spines
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)

# tight_layout only makes room for artists that already exist, so it has to
# run after the two-line title has been added.
plt.tight_layout()

60 changes: 60 additions & 0 deletions analysis/figure2b+d.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

# Global plot styling: seaborn theme first, then matplotlib rc overrides
# applied in one batch (same keys and values as individual plt.rc calls).
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams.update(
    {
        'axes.grid': False,
        'axes.edgecolor': 'black',
        'text.usetex': False,
        'text.latex.preamble': r"\usepackage{newpxtext}\usepackage{newpxmath}\usepackage{commath}\usepackage{mathtools}",
        'font.family': 'sans-serif',
        'font.sans-serif': ['DejaVu Sans'],
        'savefig.dpi': 500,
        'legend.loc': 'best',
        'legend.fontsize': 'medium',
        'legend.fancybox': True,
        'legend.framealpha': 0.5,
        'lines.linewidth': 2.5,
        'lines.markersize': 10,
        'lines.markeredgewidth': 2.5,
        'axes.titlepad': 10,
    }
)
# Colour cycle: the listed colours, used in reverse order, installed for both
# seaborn and plain matplotlib.
colors = ["#BBBBBB", "#33BBEE", "#EE3377", "#009988", "#CC3311", "#0077BB"][::-1]
sns.set_palette(sns.color_palette(colors))
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)

# this should be on the zenodo
df = pd.read_parquet('backbone_novelty_quality_statistics.parquet')

# figure 2d: 2-D histogram of backbone length against max_search_tm.
sns.histplot(
    df,
    x='length',
    y='max_search_tm',
)
plt.ylabel('TM-score (AFDB/UniProt)')
plt.xlabel('Backbone length (AA)')
plt.xlim(40, 512)
_ = plt.xticks([100, 200, 300, 400, 500])
plt.ylim(0, 1.0)
plt.title('Max. TM-score of BR structures')

# Pearson correlation annotation. dropna() returns a new frame, so `df` is
# never touched and no in-place edit is attempted on a column slice of it.
corr_df = df[['length', 'max_search_tm']].dropna()
res = stats.pearsonr(corr_df['length'], corr_df['max_search_tm'])
# The coefficient is the first element of the result; indexing plus float()
# works for both NumPy and built-in float values.
r = float(res[0])
_ = plt.text(350, 0.85, f'R = {r:.2f}')


# figure 2b: empirical CDF of the per-backbone average scRMSD.
plt.figure()
ecdf_kwargs = dict(
    stat='proportion',
    complementary=False,
    linewidth=2,
    color='#0D96C9',
)
sns.ecdfplot(df['avg_scrmsd'], **ecdf_kwargs)
plt.xlabel('Average scRMSD')
plt.ylabel('Percentile')
plt.title('Designability of BR backbones')
# Dashed vertical guide at scRMSD = 2, placed behind the curve via zorder.
guide_kwargs = dict(linestyle='--', color='black', zorder=-1, alpha=0.5)
plt.axvline(x=2, **guide_kwargs)




129 changes: 129 additions & 0 deletions analysis/figure2c.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
import pathlib
import pandas as pd
import joblib
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Global plot styling: seaborn theme first, then matplotlib rc overrides
# applied in one batch (same keys and values as individual plt.rc calls).
sns.set(font_scale=1.7)
sns.set_style("whitegrid")
plt.rcParams.update(
    {
        "axes.grid": False,
        "axes.edgecolor": "black",
        "font.family": "sans-serif",
        "font.sans-serif": ["DejaVu Sans"],
        "savefig.dpi": 500,
        "legend.loc": "best",
        "legend.fontsize": "medium",
        "legend.fancybox": True,
        "legend.framealpha": 0.5,
        "lines.linewidth": 2.5,
        "lines.markersize": 10,
        "lines.markeredgewidth": 2.5,
        "axes.titlepad": 10,
    }
)

# Colour cycle: the listed colours, used in reverse order, installed for both
# seaborn and plain matplotlib.
colors = ["#BBBBBB", "#33BBEE", "#EE3377", "#009988", "#CC3311", "#0077BB"][::-1]
sns.set_palette(sns.color_palette(colors))
plt.rcParams["axes.prop_cycle"] = plt.cycler(color=colors)

# Cluster assignments for the combined PDB + BBRef set: a headerless,
# tab-separated file with one (representative, member) pair per row.
together_cluster = pd.read_csv(
    "pdb_plus_240k/scRMSD_best_240k_plus_pdbFirst_default__cluster.tsv",
    sep="\t",
    header=None,
    names=["representative", "member"],
)

# Per-cluster summary. Members whose name starts with "pdb" are treated as PDB
# entries; every other member is treated as a synthetic (BBRef) sample.
cluster_rows = []
syn_member_counts = []
for representative, members in together_cluster.groupby("representative")["member"]:
    is_pdb = members.str.startswith("pdb")

    # PDB members are de-duplicated on the text before the first "_".
    unique_pdb = {member_name.split("_")[0] for member_name in members[is_pdb]}
    unique_non_pdb = set(members[~is_pdb])
    effective_size = len(unique_pdb | unique_non_pdb)

    cluster_rows.append(
        {
            "representative": representative,
            "pdb_count": len(unique_pdb),
            # Share of PDB entries among the de-duplicated members.
            "percentage": len(unique_pdb) / effective_size,
            # Raw number of member rows in the cluster.
            "size": len(members),
            "effective_size": effective_size,
        }
    )
    syn_member_counts.append(int((~is_pdb).sum()))

cluster_pdb = pd.DataFrame(
    cluster_rows,
    columns=["representative", "pdb_count", "percentage", "size", "effective_size"],
)
cluster_pdb["pdb_log"] = np.log(cluster_pdb["pdb_count"] + 1)
cluster_pdb["size_log"] = np.log(cluster_pdb["size"])
# Number of non-PDB member rows, counted directly. Deriving it as
# `size - pdb_count` mixes a raw row count with a de-duplicated PDB count and
# over-counts whenever several members share one PDB prefix.
cluster_pdb["size_syn"] = syn_member_counts

cluster_pdb['is_synthetic'] = cluster_pdb['size_syn'] > 0
cluster_pdb['syn_only'] = cluster_pdb['size_syn'] == cluster_pdb['size']



from matplotlib.lines import Line2D

fig, axs = plt.subplots(1, 1)
# Rows with syn_only == False come first; presumably so that the BBRef-only
# points are drawn last and end up on top.
cluster_pdb_sorted = cluster_pdb.sort_values('syn_only', ascending=True)

sns.scatterplot(
    data=cluster_pdb_sorted,
    x="size",
    y="size_syn",
    alpha=1,
    ax=axs,
    s=20,
    hue='syn_only',
    palette=['#7f7f7f', '#0D96C9'],
    # No automatic legend: the hand-built legend below is the only one shown.
    legend=False,
)

# Log scale on both axes. Clusters with zero BBR members have no position on
# a log y-axis and therefore do not appear.
axs.set_xscale("log")
axs.set_yscale('log')

# Set labels and title
axs.set_xlabel("Cluster size")
axs.set_ylabel("# BBR members")
axs.set_title("Cluster size by num. BBR members")

# Legend handles built by hand so the entries carry descriptive labels rather
# than the boolean values of the hue column; colours match the palette above.
legend_elements = [
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#7f7f7f',
           label='PDB + BBRef samples', markersize=6),
    Line2D([0], [0], marker='o', color='w', markerfacecolor='#0D96C9',
           label='BBRef samples only', markersize=6),
]
axs.legend(handles=legend_elements, frameon=False, markerscale=2.5)

# Remove frame
for side in ('top', 'right'):
    axs.spines[side].set_visible(False)

plt.tight_layout()




Loading