diff --git a/Dockerfile.cuchem b/Dockerfile.cuchem index dd210672..08e15c49 100644 --- a/Dockerfile.cuchem +++ b/Dockerfile.cuchem @@ -22,6 +22,8 @@ RUN cd /opt/nvidia/cheminfomatics/common; \ RUN cd /opt/nvidia/cheminfomatics/cuchem; \ pip install -r requirements.txt +RUN pip install torch==1.7.0+cu110 -f https://download.pytorch.org/whl/torch_stable.html + ENV UCX_LOG_LEVEL error ENV PYTHONPATH ./common/generated:./common:./cuchem: diff --git a/README.md b/README.md index fa1f86de..fa785866 100644 --- a/README.md +++ b/README.md @@ -50,11 +50,6 @@ Build your container: ./launch.sh build ``` -Download the ChEMBL database (version 27): -``` -./launch.sh dbSetup -``` - Launch the interactive ChEMBL exploration tool: ``` ./launch.sh start diff --git a/common/cuchemcommon/data/__init__.py b/common/cuchemcommon/data/__init__.py index 3a07d30c..403dc91c 100644 --- a/common/cuchemcommon/data/__init__.py +++ b/common/cuchemcommon/data/__init__.py @@ -12,16 +12,16 @@ def meta_df(self): """ return NotImplemented - def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None): + def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None, radius = 2, nBits = 512): """ Fetch molecular properties from database/cache into a dask array. """ return NotImplemented - def fetch_molecular_embedding_by_id(self, molecule_id: List): + def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512): """ Fetch molecular properties from database for the given id. Id depends on - the backend databse. For chemble DB it should be molregid. + the backend databse. For chembl DB it should be molregid. """ return NotImplemented @@ -29,7 +29,7 @@ def fetch_id_from_smile(self, new_molecules: List): """ Fetch molecular details for a list of molecules. The values in the list of molecules depends on database/service used. For e.g. it could be - ChemblId or molreg_id for Chemble database. + ChemblId or molreg_id for Chembl database. 
""" return NotImplemented @@ -40,6 +40,6 @@ def fetch_id_from_chembl(self, id: List): """ Fetch molecular details for a list of molecules. The values in the list of molecules depends on database/service used. For e.g. it could be - ChemblId or molreg_id for Chemble database. + ChemblId or molreg_id for Chembl database. """ return NotImplemented diff --git a/common/cuchemcommon/data/cluster_wf.py b/common/cuchemcommon/data/cluster_wf.py index 6462d5fb..ca3f50bc 100644 --- a/common/cuchemcommon/data/cluster_wf.py +++ b/common/cuchemcommon/data/cluster_wf.py @@ -6,6 +6,7 @@ import cudf import dask import dask_cudf +import sys from cuchemcommon.context import Context from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData from cuchemcommon.utils.singleton import Singleton @@ -19,8 +20,11 @@ class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton): - def __init__(self, fp_type): + def __init__(self, fp_type, radius=2, nBits=512): + logger.info(f'ChemblClusterWfDao({fp_type})') self.chem_data = ChEmblData(fp_type) + self.radius = radius + self.nBits = nBits def meta_df(self): chem_data = ChEmblData() @@ -28,29 +32,59 @@ def meta_df(self): def fetch_molecular_embedding(self, n_molecules: int, - cache_directory: str = None): + cache_directory: str = None, + radius=2, + nBits=512): + # Since we allow the user to change the fingerprint radius and length (nBits), + # the fingerprints need to be cached in separate subdirectories. 
+ # Note: the precomputed ones are not presumed to be of a specific radius or length context = Context() if cache_directory: - hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES) + cache_subdir = f'{cache_directory}/fp_r{radius}_n{nBits}' + hdf_path = os.path.join(cache_subdir, FINGER_PRINT_FILES) + else: + cache_subdir = None + hdf_path = None + if cache_directory and os.path.isdir(cache_subdir): # and (self.radius == radius) and (self.nBits == nBits): logger.info('Reading %d rows from %s...', n_molecules, hdf_path) mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints') - + if len(mol_df) == 0: + logger.info(f'Zero molecules found in {hdf_path}! Caching error?') if n_molecules > 0: npartitions = math.ceil(n_molecules / BATCH_SIZE) mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions) else: - logger.info('Reading molecules from database...') - mol_df = self.chem_data.fetch_mol_embedding(num_recs=n_molecules, - batch_size=context.batch_size) - + self.radius = radius + self.nBits = nBits + logger.info(f'Reading molecules from database and computing fingerprints (radius={self.radius}, nBits={self.nBits})...') + sys.stdout.flush() + mol_df = self.chem_data.fetch_mol_embedding( + num_recs=n_molecules, + batch_size=context.batch_size, + radius=radius, + nBits=nBits + ) + if cache_directory: + os.mkdir(cache_subdir) + logger.info(f'Caching mol_df fingerprints to {hdf_path}') + mol_df.to_hdf(hdf_path, 'fingerprints') + else: + logging.info(f'cache_directory={cache_directory}, not caching!') + sys.stdout.flush() return mol_df - def fetch_molecular_embedding_by_id(self, molecule_id: List): + def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512): context = Context() - meta = self.chem_data._meta_df() - fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id, - batch_size=context.batch_size) \ - .astype(meta.dtypes) + meta = self.chem_data._meta_df() + if (self.radius != radius) or (self.nBits != nBits): + logger.info(f'fetch_molecular_embedding_by_id({molecule_id}): MISMATCH!!! '
+ f'radius: {radius} != {self.radius}, nBits: {nBits} != {self.nBits}') + fp_df = self.chem_data._fetch_mol_embedding( + molregnos=molecule_id, + batch_size=context.batch_size, + radius=radius, + nBits=nBits + ).astype(meta.dtypes) fp_df = cudf.from_pandas(fp_df) fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index() diff --git a/common/cuchemcommon/data/helper/chembldata.py b/common/cuchemcommon/data/helper/chembldata.py index 763d2ee6..7ccd7c58 100644 --- a/common/cuchemcommon/data/helper/chembldata.py +++ b/common/cuchemcommon/data/helper/chembldata.py @@ -3,10 +3,9 @@ import pandas import sqlite3 import logging - +import sys from typing import List -from dask import delayed, dataframe - +import dask from contextlib import closing from cuchemcommon.utils.singleton import Singleton from cuchemcommon.context import Context @@ -70,7 +69,7 @@ def fetch_props_by_molregno(self, molregnos): cols = list(map(lambda x: x[0], cur.description)) return cols, cur.fetchall() - def fetch_props_by_chemble(self, chemble_ids): + def fetch_props_by_chembl(self, chembl_ids): """ Returns compound properties and structure filtered by ChEMBL IDs along with a list of columns. 
@@ -84,7 +83,7 @@ def fetch_props_by_chemble(self, chemble_ids): """ with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \ closing(con.cursor()) as cur: - select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chemble_ids]) + select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chembl_ids]) cur.execute(select_stmt) cols = list(map(lambda x: x[0], cur.description)) @@ -148,13 +147,18 @@ def fetch_molecule_cnt(self): return cur.fetchone()[0] - def _meta_df(self, **transformation_kwargs): + def _meta_df(self, columns=[], **transformation_kwargs): transformation = self.fp_type(**transformation_kwargs) prop_meta = {'id': pandas.Series([], dtype='int64')} prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD, IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE))) - prop_meta.update({i: pandas.Series([], dtype='float32') for i in range(len(transformation))}) + prop_meta.update( + {i: pandas.Series([], dtype='float32') for i in range(len(transformation))}) + # New columns containing the fingerprint as uint64s: + for column in columns: + if isinstance(column, str) and column.startswith('fp'): + prop_meta.update({column: pandas.Series([], dtype='uint64')}) return pandas.DataFrame(prop_meta) @@ -167,7 +171,7 @@ def _fetch_mol_embedding(self, Returns compound properties and structure for the first N number of records in a dataframe. """ - + # TODO: loading compounds from the database and computing fingerprints need to be separated logger.debug('Fetching %d records starting %d...' % (batch_size, start)) imp_cols = ['cp.' 
+ col for col in IMP_PROPS] @@ -194,32 +198,38 @@ def _fetch_mol_embedding(self, LIMIT %d, %d ''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size) - df = pandas.read_sql(select_stmt, - sqlite3.connect(self.chembl_db, uri=True)) + df = pandas.read_sql( + select_stmt, + sqlite3.connect(self.chembl_db, uri=True)) # Smiles -> Smiles transformation and filtering # TODO: Discuss internally to find use or refactor this code to remove # model specific filtering df['transformed_smiles'] = df['canonical_smiles'] - # if smiles_transforms is not None: - # if len(smiles_transforms) > 0: - # for xf in smiles_transforms: - # df['transformed_smiles'] = df['transformed_smiles'].map(xf.transform) - # df.dropna(subset=['transformed_smiles'], axis=0, inplace=True) # Conversion to fingerprints or embeddings - # transformed_smiles = df['transformed_smiles'] transformation = self.fp_type(**transformation_kwargs) - cache_data = transformation.transform(df) - return_df = pandas.DataFrame(cache_data) + # This is where the int64 fingerprint columns are computed: + cache_data, raw_fp_list = transformation.transform( + df, + return_fp=True + ) + return_df = pandas.DataFrame(cache_data) return_df = pandas.DataFrame( return_df, columns=pandas.RangeIndex(start=0, stop=len(transformation))).astype('float32') return_df = df.merge(return_df, left_index=True, right_index=True) + # TODO: expect to run into the issue that the fingerprint cannot be a cudf column + # TODO: compute here so that chemvisualize does not have to + # The computed fingerprint columns are inserted into the df with the 'fp' prefix (to + # distinguish from PCA columns that are also numeric) + for i, fp_col in enumerate(raw_fp_list): + return_df[f'fp{i}'] = fp_col return_df.rename(columns={'molregno': 'id'}, inplace=True) + return return_df def fetch_mol_embedding(self, @@ -231,8 +241,6 @@ def fetch_mol_embedding(self, Returns compound properties and structure for the first N number of records in a 
dataframe. """ - logger.debug('Fetching properties for all molecules...') - if num_recs is None or num_recs < 0: num_recs = self.fetch_molecule_cnt() @@ -242,24 +250,28 @@ dls = [] for start in range(0, num_recs, batch_size): bsize = min(num_recs - start, batch_size) - dl_data = delayed(self._fetch_mol_embedding)(start=start, - batch_size=bsize, - molregnos=molregnos, - **transformation_kwargs) + dl_data = dask.delayed(self._fetch_mol_embedding)( + start=start, + batch_size=bsize, + molregnos=molregnos, + **transformation_kwargs + ) dls.append(dl_data) + meta_df = self._meta_df( + columns=dls[0].columns.compute(), **transformation_kwargs) - return dataframe.from_delayed(dls, meta=meta_df) + return dask.dataframe.from_delayed(dls, meta=meta_df) def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None, batch_size=5000): """ Generates fingerprints for all ChEMBL ID's in the database """ - logger.debug('Fetching molecules from database for fingerprints...') mol_df = self.fetch_mol_embedding(num_recs=num_recs, batch_size=batch_size) + logger.info(f'save_fingerprints writing {type(mol_df)} to {hdf_path}') mol_df.to_hdf(hdf_path, 'fingerprints') - def is_valid_chemble_smiles(self, smiles, con=None): + def is_valid_chembl_smiles(self, smiles, con=None): if con is None: diff --git a/common/cuchemcommon/fingerprint.py b/common/cuchemcommon/fingerprint.py index 860e7021..9b4799a1 100644 --- a/common/cuchemcommon/fingerprint.py +++ b/common/cuchemcommon/fingerprint.py @@ -6,7 +6,9 @@ import numpy as np from rdkit import Chem from rdkit.Chem import AllChem +from math import ceil +INTEGER_NBITS = 64 # Maximum number of bits in an integer column in a cudf Series logger = logging.getLogger(__name__) @@ -25,7 +27,7 @@ def __init__(self, **kwargs): self.kwargs = None self.func = None - def transform(self, data): + def transform(self, data, smiles_column = 'transformed_smiles'): return NotImplemented def transform_many(self, data): @@ -56,14 +58,69 @@ def transform_single(self, smiles): fp =
cupy.asarray(fp) return fp - def transform(self, data, col_name='transformed_smiles'): + def transform_new(self, data, col_name='transformed_smiles', return_fp=False, raw=False): """Single threaded processing of list""" data = data[col_name] fp_array = [] + self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS) + if raw: + raw_fp_array = [] + else: + raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)] for smiles in data: fp = self.transform_single(smiles) fp_array.append(fp) + fp_bs = fp.ToBitString() + if return_fp: + if raw: + raw_fp_array.append(fp) + else: + for i in range(0, self.kwargs['nBits'], INTEGER_NBITS): + raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2)) + fp_array = cupy.stack(fp_array) + if return_fp: + if raw: + return fp_array, raw_fp_array + else: + return fp_array, np.asarray(raw_fp_array, dtype=np.uint64) + return fp_array + + def transform( + self, + data, + smiles_column = 'transformed_smiles', + return_fp = False, # When set to True, an additional value is returned determined by the raw parameter + raw = False # The RDKit fingerprint object is returned when raw = True, and the int64 fingerprint columns are returned when raw = False + ): + data = data[smiles_column] + fp_array = [] + self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS) + if raw: + raw_fp_array = [] + else: + raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)] + for mol_smiles in data: + m = Chem.MolFromSmiles(mol_smiles) + if not m: + fp = None + fp_bs = '0' * self.kwargs['nBits'] + else: + fp = self.func(m, **self.kwargs) + fp_bs = fp.ToBitString() + fp_array.append(cupy.asarray(np.frombuffer(fp_bs.encode(), 'u1') - ord('0'))) + if return_fp: + if raw: + raw_fp_array.append(fp) + else: + for i in range(0, self.kwargs['nBits'], INTEGER_NBITS): + raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2)) fp_array = cupy.stack(fp_array) + # TODO: return value parameter 
names should be self-explanatory + if return_fp: + if raw: + return fp_array, raw_fp_array + else: + return fp_array, np.asarray(raw_fp_array, dtype=np.uint64) return fp_array def __len__(self): diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py index 065c3632..5a15fec0 100644 --- a/common/cuchemcommon/workflow.py +++ b/common/cuchemcommon/workflow.py @@ -1,3 +1,6 @@ +from functools import singledispatch +import pandas as pd +import numpy as np import logging from typing import List @@ -6,7 +9,6 @@ logger = logging.getLogger(__name__) - class BaseGenerativeWorkflow(BaseTransformation): def __init__(self, dao: GenerativeWfDao = None) -> None: @@ -62,7 +64,7 @@ def _compute_radius(self, scaled_radius): def interpolate_by_id(self, ids: List, - id_type: str = 'chembleid', + id_type: str = 'chemblid', num_points=10, force_unique=False, scaled_radius: int = 1, @@ -72,40 +74,84 @@ def interpolate_by_id(self, if not self.min_jitter_radius: raise Exception('Property `radius_scale` must be defined in model class.') - if id_type.lower() == 'chembleid': + if id_type.lower() == 'chemblid': smiles = [row[2] for row in self.dao.fetch_id_from_chembl(ids)] if len(smiles) != len(ids): raise Exception('One of the ids is invalid %s', ids) else: raise Exception('id type %s not supported' % id_type) - return self.interpolate_smiles(smiles, - num_points=num_points, - scaled_radius=scaled_radius, - force_unique=force_unique, - sanitize=sanitize) + return self.interpolate_smiles( + smiles, + compound_ids=ids, + num_points=num_points, + scaled_radius=scaled_radius, + force_unique=force_unique, + sanitize=sanitize + ) + + def extrapolate_from_cluster(self, + compounds_df, + compound_property: str, + cluster_id: int = 0, + n_compounds_to_transform=10, + num_points: int = 10, + step_size: float = 0.01, + force_unique = False, + scaled_radius: int = 1): + """ + The embedding vector is calculated for the specified cluster_id and applied over it. 
+ TO DO: We should have a table of direction vectors in embedded space listed, just like the list of compoun d IDs. + The user should choose one to be applied to the selected compounds, or to a cluster number. + """ + smiles_list = None + + if not self.radius_scale: + raise Exception('Property `radius_scale` must be defined in model class.') + else: + radius = float(scaled_radius * self.radius_scale) + # TO DO: User must be able to extrapolate directly from smiles in the table; + # these may themselves be generated compounds without any chemblid. + df_cluster = compounds_df[ compounds_df['cluster'] == cluster_id ].dropna().reset_index(drop=True).compute () + return self.extrapolate_from_smiles(df_cluster['transformed_smiles'].to_array(), + compound_property_vals=df_cluster[compound_property].to_array(), + num_points=num_points, + n_compounds_to_transform=n_compounds_to_transform, + step_size=step_size, + radius=scaled_radius, + force_unique=force_unique) + def find_similars_smiles_by_id(self, - chemble_id: str, - id_type: str = 'chembleid', + chembl_ids: List[str], # actually a list of strings + id_type: str = 'chemblid', num_requested=10, force_unique=False, scaled_radius: int = 1, sanitize=True): - smiles = None - + smiles_list = [] + if not self.min_jitter_radius: raise Exception('Property `radius_scale` must be defined in model class.') - if id_type.lower() == 'chembleid': - smiles = [row[2] for row in self.dao.fetch_id_from_chembl(chemble_id)] - if len(smiles) != len(chemble_id): - raise Exception('One of the ids is invalid %s' + chemble_id) + if id_type.lower() == 'chemblid': + smiles_list = [row[2] for row in self.dao.fetch_id_from_chembl(chembl_ids)] + if len(smiles_list) != len(chembl_ids): + raise Exception('One of the ids is invalid %s' + chembl_ids) else: raise Exception('id type %s not supported' % id_type) - return self.find_similars_smiles(smiles[0], - num_requested=num_requested, - scaled_radius=scaled_radius, - force_unique=force_unique, - 
sanitize=sanitize) + ret_vals = [ + self.find_similars_smiles( + smiles, + num_requested=num_requested, + scaled_radius=scaled_radius, + force_unique=force_unique, + compound_id=str(chembl_id), + sanitize=sanitize + ) + for smiles, chembl_id in zip(smiles_list, chembl_ids) + ] + if len(ret_vals) == 1: + return ret_vals[0] + return pd.concat(ret_vals, ignore_index=True, copy=False) diff --git a/cuchem/cuchem/decorator/__init__.py b/cuchem/cuchem/decorator/__init__.py index 37137aae..25f4354d 100644 --- a/cuchem/cuchem/decorator/__init__.py +++ b/cuchem/cuchem/decorator/__init__.py @@ -8,7 +8,7 @@ class BaseMolPropertyDecorator(object): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_cols: int = 0): NotImplemented diff --git a/cuchem/cuchem/decorator/lipinski.py b/cuchem/cuchem/decorator/lipinski.py index 67407fc8..6500e41c 100644 --- a/cuchem/cuchem/decorator/lipinski.py +++ b/cuchem/cuchem/decorator/lipinski.py @@ -21,7 +21,7 @@ class LipinskiRuleOfFiveDecorator(BaseMolPropertyDecorator): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_cols: int = 0): mol_wt = [] mol_logp = [] @@ -29,13 +29,16 @@ def decorate(self, hacceptors = [] rotatable_bonds = [] qeds = [] + invalid = [] for idx in range(df.shape[0]): - smiles = df.iat[idx, smile_cols] + smiles = df.iat[idx, smiles_cols] m = Chem.MolFromSmiles(smiles) if m is None: + logger.info(f'{idx}: Could not make a Mol from {smiles}') + invalid.append(True) mol_logp.append({'value': '-', 'level': 'info'}) mol_wt.append({'value': '-', 'level': 'info'}) hdonors.append({'value': '-', 'level': 'info'}) @@ -43,7 +46,8 @@ def decorate(self, rotatable_bonds.append({'value': '-', 'level': 'info'}) qeds.append({'value': '-', 'level': 'info'}) continue - + else: + invalid.append(False) try: logp = Descriptors.MolLogP(m) mol_logp.append({'value': round(logp, 2), @@ -100,5 +104,7 @@ def decorate(self, df['H-Bond Acceptors'] = hacceptors 
df['Rotatable Bonds'] = rotatable_bonds df['QED'] = qeds + # TODO: this may be redundant as chemvisualize seems to be handling such invalid molecules + df['invalid'] = invalid return df diff --git a/cuchem/cuchem/decorator/mol_structure.py b/cuchem/cuchem/decorator/mol_structure.py index 1720d1e7..d96bde34 100644 --- a/cuchem/cuchem/decorator/mol_structure.py +++ b/cuchem/cuchem/decorator/mol_structure.py @@ -17,12 +17,12 @@ class MolecularStructureDecorator(BaseMolPropertyDecorator): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_col: int = 0): mol_struct = [] for idx in range(df.shape[0]): - smiles = df.iat[idx, smile_cols] + smiles = df.iat[idx, smiles_col] try: m = Chem.MolFromSmiles(smiles) drawer = Draw.rdMolDraw2D.MolDraw2DCairo(500, 125) diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py index 439b87c6..81c2e81f 100644 --- a/cuchem/cuchem/interactive/chemvisualize.py +++ b/cuchem/cuchem/interactive/chemvisualize.py @@ -1,6 +1,9 @@ # Copyright 2020 NVIDIA Corporation # SPDX-License-Identifier: Apache-2.0 +# TODO: separate loading of compounds from clustering of compounds; currently, loading is triggered by a call to clustering. +# TODO: separate fingerprinting from clustering; currently fingerprinting is triggered by a call to clustering. 
+ import base64 import json import logging @@ -9,6 +12,7 @@ import cupy import dash +import cuml import dash_bootstrap_components as dbc import dash_core_components as dcc import dash_html_components as html @@ -22,6 +26,20 @@ from cuchem.utils import generate_colors, report_ui_error from rdkit import Chem from rdkit.Chem import Draw, PandasTools +from numba.cuda.libdevice import popcll + +# Check if all of these are needed: +from cuchemcommon.fingerprint import MorganFingerprint, INTEGER_NBITS +import sys +import numpy as np +import pandas as pd +import cudf +import dask_cudf +from dask.distributed import wait +from rdkit import DataStructs, Chem +from rdkit.Chem.rdmolfiles import MolFromSmiles +from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles +import time logger = logging.getLogger(__name__) @@ -76,12 +94,12 @@ def download_sdf(): valid_idx = [] col_list = ['SMILES', 'Molecular Weight', 'LogP', 'H-Bond Donors', 'H-Bond Acceptors', 'Rotatable Bonds'] - for row, data in vis.genreated_df.iterrows(): + for row, data in vis.generated_df.iterrows(): mol = Chem.MolFromSmiles(data['SMILES']) if (mol is not None): valid_idx.append(row) - valid_df = vis.genreated_df.iloc[valid_idx] + valid_df = vis.generated_df.iloc[valid_idx] valid_df = valid_df[col_list] PandasTools.AddMoleculeColumnToFrame(valid_df, 'SMILES') @@ -95,18 +113,33 @@ def download_sdf(): headers={"Content-disposition": "attachment; filename=download.sdf"}) +def popcll_wrapper(ip_col, op_col): + for i, n in enumerate(ip_col): + op_col[i] = popcll(n) + +def popcll_wrapper_dask(df, ip_col, op_col): + df = df.apply_rows(popcll_wrapper, incols = {ip_col: 'ip_col'}, outcols = {op_col: int}, kwargs = {}) + return df[op_col] + +def intersection_wrapper(fp_int_col, op_col, query_fp_int): + for i, fp_int in enumerate(fp_int_col): + op_col[i] = popcll(fp_int & query_fp_int) class ChemVisualization(metaclass=Singleton): - def __init__(self, cluster_wf): + def __init__(self, cluster_wf, 
fingerprint_radius=2, fingerprint_nBits=512): self.app = app self.cluster_wf = cluster_wf self.n_clusters = cluster_wf.n_clusters self.chem_data = ChEmblData() - self.genreated_df = None + self.generated_df = None self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid' self.generative_wf_cls = 'cuchem.wf.generative.MegatronMolBART' + self.fp_df = None # all fingerprints of all ChemBl compounds and their IDs as a pandas dataframe for use in compound similarity search on the CPU + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits + # Store colors to avoid plots changes colors on events such as # molecule selection, etc. self.cluster_colors = generate_colors(self.n_clusters) @@ -136,7 +169,9 @@ def __init__(self, cluster_wf): Input('bt_north_star', 'n_clicks'), Input('sl_prop_gradient', 'value'), Input('sl_nclusters', 'value'), - Input('refresh_main_fig', 'children')], + Input('refresh_main_fig', 'children'), + Input('fingerprint_radius', 'value'), + Input('fingerprint_nBits', 'value')], [State("selected_clusters", "value"), State("main-figure", "selectedData"), State('north_star', 'value'), @@ -180,14 +215,23 @@ def __init__(self, cluster_wf): Input('bt_close_err', 'n_clicks')])(self.handle_error) self.app.callback( - Output('genration_candidates', 'children'), + Output('generation_candidates', 'children'), [Input({'role': 'bt_add_candidate', 'chemblId': ALL, 'molregno': ALL}, 'n_clicks'), Input('bt_reset_candidates', 'n_clicks'), ], - State('genration_candidates', 'children'))(self.handle_add_candidate) + State('generation_candidates', 'children'))(self.handle_add_candidate) + + self.app.callback( + Output('analoguing_candidates', 'children'), + [Input({'role': 'bt_analoguing_candidate', 'chemblId': ALL, 'molregno': ALL}, 'n_clicks')], + State('analoguing_candidates', 'children'))(self.handle_analoguing_candidate) self.app.callback( Output('ckl_candidate_mol_id', 'options'), - Input('genration_candidates', 
'children'))(self.handle_construct_candidates) + Input('generation_candidates', 'children'))(self.handle_construct_candidates) + + self.app.callback( + Output('ckl_analoguing_mol_id', 'options'), + Input('analoguing_candidates', 'children'))(self.handle_construct_candidates2) self.app.callback( [Output('ckl_candidate_mol_id', 'value'), @@ -196,27 +240,62 @@ def __init__(self, cluster_wf): Input('rd_generation_type', 'value')])(self.handle_ckl_selection) self.app.callback( - [Output('table_generated_molecules', 'children'), + [Output('ckl_analoguing_mol_id', 'value')], + [Input('ckl_analoguing_mol_id', 'value')])(self.handle_analoguing_ckl_selection) + + self.app.callback( + [Output('section_fitting', 'style'), + Output('fitting_figure', 'figure')], + [Input("bt_fit", "n_clicks"),], + [State('sl_featurizing_wf', 'value'), + State('fit_nn_compound_property', 'value'), + State('fit_nn_train_cluster_number', 'value'), + State('fit_nn_test_cluster_number', 'value'), + State('fit_nn_hidden_layer_sizes', 'value'), + State('fit_nn_activation_fn', 'value'), + State('fit_nn_final_activation_fn', 'value'), + State('fit_nn_max_epochs', 'value'), + State('fit_nn_learning_rate', 'value'), + State('fit_nn_weight_decay', 'value'), + State('fit_nn_batch_size', 'value')])(self.handle_fitting) + + self.app.callback( + [Output('section_analoguing', 'style'), + Output('tb_analoguing', 'children')], + [Input("bt_analoguing", "n_clicks"),], + [State('ckl_analoguing_mol_id', 'value'), + State('analoguing_n_analogues', 'value'), + State('analoguing_threshold', 'value'), + State('analoguing_type', 'value')])(self.handle_analoguing) + + self.app.callback( + [Output('section_generated_molecules', 'style'), + Output('section_selected_molecules', 'style'), ], + [Input('show_generated_mol', 'children'), + Input('show_selected_mol', 'children')])(self.handle_property_tables) + + self.app.callback( + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + 
Output('table_generated_molecules', 'children'), Output('show_generated_mol', 'children'), Output('msg_generated_molecules', 'children'), Output('interpolation_error', 'children')], - [Input("bt_generate", "n_clicks"), ], + [Input("bt_generate", "n_clicks")], [State('sl_generative_wf', 'value'), State('ckl_candidate_mol_id', 'value'), State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), State('scaled_radius', 'value'), State('rd_generation_type', 'value'), State('show_generated_mol', 'children')])(self.handle_generation) - self.app.callback( - [Output('section_generated_molecules', 'style'), - Output('section_selected_molecules', 'style'), ], - [Input('show_generated_mol', 'children'), - Input('show_selected_mol', 'children')])(self.handle_property_tables) - def handle_add_candidate(self, bt_add_candidate, bt_reset_candidates, - genration_candidates): + generation_candidates): comp_id, event_type = self._fetch_event_data() if comp_id == 'bt_reset_candidates' and event_type == 'n_clicks': @@ -227,8 +306,8 @@ def handle_add_candidate(self, bt_add_candidate, selected_candidates = [] - if genration_candidates: - selected_candidates = genration_candidates.split(",") + if generation_candidates: + selected_candidates = generation_candidates.split(",") comp_detail = json.loads(comp_id) selected_chembl_id = comp_detail['chemblId'] @@ -238,6 +317,24 @@ def handle_add_candidate(self, bt_add_candidate, return ','.join(selected_candidates) + + def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candidates): + comp_id, event_type = self._fetch_event_data() + if event_type != 'n_clicks' or dash.callback_context.triggered[0]['value'] == 0: + raise dash.exceptions.PreventUpdate + + selected_candidates = [] + + if analoguing_candidates: + selected_candidates = analoguing_candidates.split(",") + + comp_detail = 
json.loads(comp_id) + selected_chembl_id = comp_detail['chemblId'] + + if selected_chembl_id not in selected_candidates: + selected_candidates.append(selected_chembl_id) + return ','.join(selected_candidates) + def _fetch_event_data(self): if not dash.callback_context.triggered: raise dash.exceptions.PreventUpdate @@ -254,17 +351,37 @@ def handle_property_tables(self, show_generated_mol, show_selected_mol): return {'display': 'block', 'width': '100%'}, {'display': 'none'} return dash.no_update, dash.no_update - @report_ui_error(4) - def handle_generation(self, bt_generate, - sl_generative_wf, ckl_candidate_mol_id, - n2generate, scaled_radius, rd_generation_type, show_generated_mol): + @report_ui_error(3) + def handle_generation( + self, bt_generate, sl_generative_wf, ckl_candidate_mol_id, n2generate, + extrap_compound_property, extrap_cluster_number, extrap_n_compounds, extrap_step_size, + scaled_radius, rd_generation_type, show_generated_mol + ): + """ + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + Output('table_generated_molecules', 'children'), + Output('show_generated_mol', 'children'), + Output('msg_generated_molecules', 'children'), + Output('interpolation_error', 'children')], + [Input("bt_generate", "n_clicks")], + [State('sl_generative_wf', 'value'), + State('ckl_candidate_mol_id', 'value'), + State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), + State('scaled_radius', 'value'), + State('rd_generation_type', 'value'), + State('show_generated_mol', 'children')])(self.handle_generation) + """ comp_id, event_type = self._fetch_event_data() - - chemble_ids = [] + chembl_ids = [] if comp_id == 'bt_generate' and event_type == 'n_clicks': - chemble_ids = ckl_candidate_mol_id + chembl_ids = ckl_candidate_mol_id else: - return dash.no_update, dash.no_update + return 
dash.no_update, dash.no_update, dash.no_update, dash.no_update, dash.no_update self.generative_wf_cls = sl_generative_wf wf_class = locate(self.generative_wf_cls) @@ -273,17 +390,26 @@ def handle_generation(self, bt_generate, scaled_radius = float(scaled_radius) if rd_generation_type == 'SAMPLE': - if chemble_ids == None or len(chemble_ids) == 0: + if chembl_ids == None or len(chembl_ids) == 0: raise ValueError('Please select at-least one molecule for Sampling.') - self.genreated_df = generative_wf.find_similars_smiles_by_id(chemble_ids, + self.generated_df = generative_wf.find_similars_smiles_by_id(chembl_ids, num_requested=n2generate, scaled_radius=scaled_radius, force_unique=True, sanitize=True) + elif rd_generation_type == 'EXTRAPOLATE': + self.generated_df = generative_wf.extrapolate_from_cluster(self.cluster_wf.df_embedding, + compound_property=extrap_compound_property, + cluster_id=extrap_cluster_number, + n_compounds_to_transform=extrap_n_compounds, + num_points=n2generate, + step_size=extrap_step_size, + scaled_radius=scaled_radius, + force_unique=False)#True) else: - if chemble_ids == None or len(chemble_ids) < 2: + if chembl_ids == None or len(chembl_ids) < 2: raise ValueError('Please select at-least two molecules for Interpolation.') - self.genreated_df = generative_wf.interpolate_by_id(chemble_ids, + self.generated_df = generative_wf.interpolate_by_id(chembl_ids, num_points=n2generate, scaled_radius=scaled_radius, force_unique=True, @@ -292,40 +418,81 @@ def handle_generation(self, bt_generate, if show_generated_mol is None: show_generated_mol = 0 show_generated_mol += 1 - # Add other useful attributes to be added for rendering - self.genreated_df = MolecularStructureDecorator().decorate(self.genreated_df) - self.genreated_df = LipinskiRuleOfFiveDecorator().decorate(self.genreated_df) + self.generated_df = MolecularStructureDecorator().decorate(self.generated_df) + self.generated_df = LipinskiRuleOfFiveDecorator().decorate(self.generated_df) + 
self.generated_df = self.generated_df[ ~self.generated_df['invalid'] ].reset_index(drop=True).drop(columns=['invalid']) + if len(self.generated_df) == 0: + logger.info("None of the generated smiles yielded valid molecules!") + return dash.no_update, dash.no_update + # Note: we are not allowing fingerprint specification to change here because we want to see the results on the same PCA / UMAP as the original figure + # TODO: make this clear in the UI + + # The number of generated molecules is not expected to be large, so regular cudf dataframes should suffice. + # However, we have to handle both cudf and dask_cudf, perhaps more elegantly. + fps = MorganFingerprint( + radius=self.fingerprint_radius, nBits=self.fingerprint_nBits + ).transform(self.generated_df, smiles_column='SMILES') + df_fp = pd.DataFrame(fps, dtype='float32') + self.generated_df = pd.concat([self.generated_df, df_fp], axis=1) + df_fp=cudf.from_pandas(df_fp) + df_fp['id'] = list(map(str, self.generated_df['id'])) + df_fp['cluster'] = list(map(int, self.generated_df['Generated'])) # This controls the color + n_generated = self.generated_df['Generated'].sum() + if n_generated < len(self.generated_df) / 2: + # Highlight the generated compounds + north_stars = ','.join(list(df_fp[ self.generated_df['Generated'] ]['id'].values_host)) + else: + # Highlight the source compound(s) + north_stars = ','.join(list(df_fp[ ~self.generated_df['Generated'] ]['id'].values_host)) + df_embedding = df_fp + cluster_col = df_embedding['cluster'] + df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding) + prop_series['cluster'] = cluster_col + if isinstance(self.cluster_wf.pca, cuml.dask.decomposition.PCA) and not isinstance(df_embedding, dask_cudf.DataFrame): + df_embedding = dask_cudf.from_cudf(df_embedding, npartitions=1) + df_embedding = self.cluster_wf.pca.transform(df_embedding) + Xt = self.cluster_wf.umap.transform(df_embedding) + if hasattr(Xt, 'persist'): + Xt = Xt.compute() + if 
hasattr(df_embedding, 'persist'): + logging.info(f'BEFORE df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') + # Note: When converting a dask_cudf to cudf, the indices within each chunk are retained, so we need to reset_index. + df_embedding = df_embedding.compute().reset_index(drop=True) + df_embedding['x'] = Xt[0] + df_embedding['y'] = Xt[1] + logging.info(f'df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') + for col in prop_series.keys(): + df_embedding[col] = prop_series[col] + + fig, northstar_cluster = self.create_graph(df_embedding, north_stars=north_stars) # Create Table header table_headers = [] - columns = self.genreated_df.columns.to_list() - ignore_columns = ['embeddings', 'embeddings_dim'] - for column in columns: - if column in ignore_columns: - continue + all_columns = self.generated_df.columns.to_list() + columns_in_table = [ + col_name + for col_name in self.generated_df.columns.to_list() + if (not isinstance(col_name, int)) and (not col_name.startswith('fp')) and not ('embeddings' in col_name) + ] + # TODO: factor this into a separate function: build table from dataframe + for column in columns_in_table: table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) - prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] invalid_mol_cnt = 0 - for row_idx in range(self.genreated_df.shape[0]): + for row_idx in range(self.generated_df.shape[0]): td = [] - try: - col_pos = columns.index('Chemical Structure') - col_data = self.genreated_df.iat[row_idx, col_pos] - - if 'value' in col_data and col_data['value'] == MolecularStructureDecorator.ERROR_VALUE: + col_pos = all_columns.index('Chemical Structure') + col_data = self.generated_df.iat[row_idx, col_pos] + if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit': invalid_mol_cnt += 1 continue except ValueError: pass - - 
for col_id in range(len(columns)): - col_data = self.genreated_df.iat[row_idx, col_id] - if columns[col_id] in ignore_columns: - continue - + for col_name in columns_in_table: + col_id = all_columns.index(col_name) + col_data = self.generated_df.iat[row_idx, col_id] col_level = 'info' if isinstance(col_data, dict): col_value = col_data['value'] @@ -333,7 +500,6 @@ def handle_generation(self, bt_generate, col_level = col_data['level'] else: col_value = col_data - if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): td.append(html.Td(html.Img(src=col_value))) else: @@ -343,19 +509,246 @@ def handle_generation(self, bt_generate, 'text-align': 'center', 'color': LEVEL_TO_STYLE[col_level]['color'] } - )) - + )) prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) msg_generated_molecules = '' if invalid_mol_cnt > 0: msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' - return html.Table(prop_recs, style={'width': '100%', - 'border': '1px solid lightgray'}), \ - show_generated_mol, \ - msg_generated_molecules, \ - dash.no_update + return {'display': 'inline'}, fig, html.Table( + prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'} + ), show_generated_mol, msg_generated_molecules, dash.no_update + + + @report_ui_error(3) + def handle_fitting( + self, bt_fit, sl_featurizing_wf, + fit_nn_compound_property, fit_nn_train_cluster_number, fit_nn_test_cluster_number, fit_nn_hidden_layer_sizes, fit_nn_activation_fn, fit_nn_final_activation_fn, + fit_nn_max_epochs, fit_nn_learning_rate, fit_nn_weight_decay, fit_nn_batch_size + ): + comp_id, event_type = self._fetch_event_data() + sys.stdout.flush() + if (comp_id != 'bt_fit') or (event_type != 'n_clicks'): + return dash.no_update, dash.no_update + self.featurizing_wf_cls = sl_featurizing_wf + wf_class = locate(self.featurizing_wf_cls) + featurizing_wf = wf_class() + + df = featurizing_wf.fit_nn( + 
self.cluster_wf.df_embedding, + compound_property=fit_nn_compound_property, + cluster_id_train=fit_nn_train_cluster_number, + cluster_id_test=fit_nn_test_cluster_number, + hidden_layer_sizes=list(map(int, fit_nn_hidden_layer_sizes.split(','))) if fit_nn_hidden_layer_sizes != '' else [], + activation_fn=fit_nn_activation_fn, + final_activation_fn=fit_nn_final_activation_fn, + max_epochs=int(fit_nn_max_epochs), + learning_rate=float(fit_nn_learning_rate), + weight_decay=float(fit_nn_weight_decay), + batch_size=int(fit_nn_batch_size) + ) + sys.stdout.flush() + fig = self.create_plot(df, fit_nn_compound_property) + return {'display': 'inline'}, fig + + @report_ui_error(3) + def handle_analoguing( + self, bt_analoguing, analoguing_mol_id, analoguing_n_analogues, analoguing_threshold, analoguing_type, + + ): + comp_id, event_type = self._fetch_event_data() + sys.stdout.flush() + if (comp_id != 'bt_analoguing') or (event_type != 'n_clicks'): + return dash.no_update, dash.no_update + + # Compute fingerprints once for all input database compounds (already done when input data would have been clustered) + if 'canonical_smiles' in self.cluster_wf.df_embedding: + smiles_column = 'canonical_smiles' + else: + smiles_columns = 'SMILES' + + if self.fp_df is None: + # Note: CPU-based workflow for fingerprint similarity is no longer needed, can be removed + logger.info(f'CPU-based similarity search: self.fp_df not set') + # First move the smiles to the CPU: + if isinstance(self.cluster_wf.df_embedding, dask_cudf.DataFrame): + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].map_partitions(cudf.DataFrame.to_pandas) + elif isinstance(self.cluster_wf.df_embedding, cudf.DataFrame): + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].to_pandas() + else: + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']] + # Then compute fingerprints on the CPU using RDKit: + if 'fp' not in self.cluster_wf.df_embedding.columns: + logger.info(f'Computing 
fingerprints with radius={self.fingerprint_radius}, nBits={self.fingerprint_nBits}...') + _, v = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( + smiles_df, smiles_column=smiles_column, return_fp=True, raw=True) + else: + if hasattr(self.cluster_wf.df_embedding, 'compute'): + v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas()) + else: + v = list(self.cluster_wf.df_embedding['fp']) + # This pandas dataframe has the fingerprints in the fp column: + self.fp_df = pd.DataFrame({ + 'fp': v, + smiles_column: smiles_df[smiles_column], #list(self.cluster_wf.df_embedding[smiles_column].compute().to_pandas()), #smiles_df[smiles_column], + 'id': smiles_df['id'], #list(self.cluster_wf.df_embedding['id'].compute().to_pandas()) + }) + + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) + + if 'pc' not in self.cluster_wf.df_embedding.columns: + # Pre-computing the popcounts for all compounds in the database for use in GPU-based similarity search: + t0 = time.time() + self.cluster_wf.df_embedding['op_col'] = 0 + self.cluster_wf.df_embedding['pc'] = 0 + n_fp_cols = 0 + for col in self.cluster_wf.df_embedding.columns: + if (type(col) == str) and col.startswith('fp') and (len(col) > 2): + n_fp_cols += 1 + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( + popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {}) + self.cluster_wf.df_embedding['pc'] += self.cluster_wf.df_embedding['op_col'] + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) + t1 = time.time() + #logger.info(f'Time to compute partial popcounts ({n_fp_cols} fp columns): {t1 - t0}:\n{self.cluster_wf.df_embedding["pc"].head()}') + + # Prepare the query compound: + 
logger.info(f'analoguing_mol_id={analoguing_mol_id}') + molregno = self.chem_data.fetch_molregno_by_chemblId( + [analoguing_mol_id])[0][0] + props, selected_molecules = self.chem_data.fetch_props_by_molregno([molregno]) + query_smiles = selected_molecules[0][props.index('canonical_smiles')] + query_fp = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( + pd.DataFrame({'smiles': [query_smiles]}), smiles_column='smiles', return_fp=True, raw=True)[1][0] + query_fps = query_fp.ToBitString() + query_fp_ints = [int(query_fps[i: i + INTEGER_NBITS], 2) for i in range(0, self.fingerprint_nBits, INTEGER_NBITS)] + query_pc = sum(bin(x).count('1') for x in query_fp_ints) + + # GPU-based workflow for similarity computation + # Tanimoto = popcount(intersection) / ( popcount(query) + popcount(compound) - popcount(intersection) ) + # Since the fingerprint is stored as a list of int64s in separate columns, the intersection popcount is accumulated column by column below + if 'op_col' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['op_col']) + if 'n_intersection' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['n_intersection']) + #self.cluster_wf.df_embedding['op_col'] = 0 + self.cluster_wf.df_embedding['n_intersection'] = 0 + t4 = time.time() + for i in range(0, self.fingerprint_nBits, INTEGER_NBITS): + fp_num = i // INTEGER_NBITS + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( + intersection_wrapper, incols={f'fp{fp_num}': 'fp_int_col'}, + outcols={'op_col': int}, kwargs={'query_fp_int': query_fp_ints[fp_num]}) + #self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + #wait(self.cluster_wf.df_embedding) + self.cluster_wf.df_embedding['n_intersection'] += self.cluster_wf.df_embedding['op_col'] + + self.cluster_wf.df_embedding['n_union'] = self.cluster_wf.df_embedding['pc'] - self.cluster_wf.df_embedding['n_intersection'] + query_pc + 
self.cluster_wf.df_embedding['similarity'] = self.cluster_wf.df_embedding['n_intersection'] / self.cluster_wf.df_embedding['n_union'] + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) + t5 = time.time() + t0 = time.time() + self.fp_df['similarity_cpu'] = self.fp_df['fp'].apply(lambda x: DataStructs.FingerprintSimilarity(query_fp, x)) + + if 'similarity_cpu' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['similarity_cpu']) + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.merge( + dask_cudf.from_cudf( + cudf.from_pandas(self.fp_df[['id', 'similarity_cpu']]), + npartitions = self.cluster_wf.df_embedding.npartitions + ), + on='id', + how='left' + ).reset_index(drop=True) + + t1 = time.time() + #logger.info(f'Fingerprint length={self.fingerprint_nBits}: GPU-Method: {t5 - t4}, CPU-Method: {t1 - t0}') + + #self.analoguing_df = self.fp_df[ self.fp_df['similarity_cpu'] >= float(analoguing_threshold) ] + self.analoguing_df = self.cluster_wf.df_embedding[ self.cluster_wf.df_embedding['similarity'] >= float(analoguing_threshold) ] + drop_columns = [ + col + for col in self.analoguing_df.columns + if (type(col) == int) or col.startswith('fp') or (col in ['x', 'y', 'cluster', 'op_col', 'pc', 'n_intersection', 'n_union', 'transformed_smiles']) + ] + self.analoguing_df = self.analoguing_df.drop(columns=drop_columns).compute().to_pandas() # dask_cudf --> cudf --> pandas (CPU!) 
+ if analoguing_type in ['scaffold', 'superstructure']: + if analoguing_type == 'scaffold': + # Only include compounds that have the same murcko scaffold as the query compound + query_scaffold_mol = MolFromSmiles(MurckoScaffoldSmilesFromSmiles(query_smiles)) + else: #analoguing_type == 'superstructure': + # Only include compounds that are superstructures of the query compound + query_scaffold_mol = MolFromSmiles(query_smiles) + self.analoguing_df['mol'] = self.analoguing_df[smiles_column].apply(MolFromSmiles) + self.analoguing_df.dropna(subset=['mol'], inplace=True) + self.analoguing_df = self.analoguing_df[ self.analoguing_df['mol'].apply(lambda x: x.HasSubstructMatch(query_scaffold_mol)) ] + self.analoguing_df.drop(columns=['mol'], inplace=True) + self.analoguing_df = self.analoguing_df.nlargest(int(analoguing_n_analogues), 'similarity') + self.analoguing_df.reset_index(drop=True, inplace=True) + #self.analoguing_df = dask_cudf.from_cudf(self.analoguing_df, npartitions=self.cluster_wf.df_embedding.npartitions) # going back to dask for a reason? + # TODO: we are presuming the IDs are the same but there is no guarantee since we added code to generate dummy IDs based on indices elsewhere. 
+ + # Needed only for CPU-based workflow + #self.analoguing_df = self.analoguing_df.merge(self.cluster_wf.df_embedding, on='id').compute().reset_index(drop=True).to_pandas() + # Add other useful attributes to be added for rendering + smiles_idx = self.analoguing_df.columns.to_list().index(smiles_column) + self.analoguing_df = MolecularStructureDecorator().decorate(self.analoguing_df, smiles_col=smiles_idx) + #self.analoguing_df = LipinskiRuleOfFiveDecorator().decorate(self.analoguing_df, smiles_col=smiles_idx) + self.analoguing_df = self.analoguing_df.sort_values('similarity', ascending=False) + # Create Table header + table_headers = [] + all_columns = self.analoguing_df.columns.to_list() + columns_in_table = [ + col_name + for col_name in self.analoguing_df.columns.to_list() + if (not isinstance(col_name, int)) and (not col_name.startswith('fp')) + ] + # TODO: factor this into a separate function: build table from dataframe + for column in columns_in_table: + table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) + prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] + for row_idx in range(self.analoguing_df.shape[0]): + td = [] + try: + col_pos = all_columns.index('Chemical Structure') + col_data = self.analoguing_df.iat[row_idx, col_pos] + if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit': + continue + except ValueError: + pass + for col_name in columns_in_table: + col_id = all_columns.index(col_name) + col_data = self.analoguing_df.iat[row_idx, col_id] + col_level = 'info' + if isinstance(col_data, dict): + col_value = col_data['value'] + if 'level' in col_data: + col_level = col_data['level'] + else: + col_value = col_data + if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): + td.append(html.Td(html.Img(src=col_value))) + else: + td.append(html.Td(str(col_value), style=LEVEL_TO_STYLE[col_level].update({'maxWidth': '100px', 
'wordWrap':'break-word'}))) + + prop_recs.append(html.Tr(td)) + + + # venkat: + return {'display': 'inline'}, html.Table(prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'}) + # dev: + #return html.Table(prop_recs, style={'width': '100%', + # 'border': '1px solid lightgray'}), \ + # show_generated_mol, \ + # msg_generated_molecules, \ + # dash.no_update def handle_ckl_selection(self, ckl_candidate_mol_id, rd_generation_type): selection_msg = '**Please Select Two Molecules**' @@ -364,12 +757,22 @@ def handle_ckl_selection(self, ckl_candidate_mol_id, rd_generation_type): if rd_generation_type == 'SAMPLE': selection_msg = '**Please Select One Molecule**' selection_cnt = 1 - + elif rd_generation_type == 'EXTRAPOLATE': + # TO DO: one cluster and one property have to be provided + selection_msg = '**Please Select Zero Molecules (specify cluster above, instead)**' + selection_cnt = 0 if ckl_candidate_mol_id and len(ckl_candidate_mol_id) > selection_cnt: ckl_candidate_mol_id = ckl_candidate_mol_id[selection_cnt * -1:] return ckl_candidate_mol_id, selection_msg + def handle_analoguing_ckl_selection(self, ckl_analoguing_mol_id): + if ckl_analoguing_mol_id and len(ckl_analoguing_mol_id) > 1: + # Allow only one compound to be chosen for analoguing + ckl_analoguing_mol_id = ckl_analoguing_mol_id[-1:] + + return ckl_analoguing_mol_id + def handle_construct_candidates(self, north_star): if not north_star: return [] @@ -377,6 +780,13 @@ def handle_construct_candidates(self, north_star): options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')] return options + def handle_construct_candidates2(self, north_star): + if not north_star: + return [] + + options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')] + return options + def handle_reset(self, bt_reset, bt_apply_wf, refresh_main_fig, sl_wf): comp_id, event_type = self._fetch_event_data() @@ -399,25 +809,41 @@ def handle_reset(self, bt_reset, 
bt_apply_wf, refresh_main_fig, sl_wf): def recluster(self, filter_values=None, filter_column=None, reload_data=False): self.cluster_wf.n_clusters = self.n_clusters if reload_data: - return self.cluster_wf.cluster() + return self.cluster_wf.cluster( + fingerprint_radius=self.fingerprint_radius, fingerprint_nBits=self.fingerprint_nBits + ) else: - return self.cluster_wf.recluster(filter_column, filter_values, - n_clusters=self.n_clusters) - - def recluster_selection(self, - filter_value=None, - filter_column=None, - gradient_prop=None, - north_stars=None, - reload_data=False, - recluster_data=True, - color_col='cluster'): + return self.cluster_wf.recluster( + filter_column, + filter_values, + n_clusters=self.n_clusters, + fingerprint_radius=self.fingerprint_radius, + fingerprint_nBits=self.fingerprint_nBits + ) + + def recluster_selection( + self, + filter_value=None, + filter_column=None, + gradient_prop=None, + north_stars=None, + reload_data=False, + recluster_data=True, + color_col='cluster', + fingerprint_radius=2, + fingerprint_nBits=512 + ): if recluster_data or self.cluster_wf.df_embedding is None: - df_embedding = self.recluster(filter_values=filter_value, - filter_column=filter_column, - reload_data=reload_data) + self.fingerprint_nBits = fingerprint_nBits + self.fingerprint_radius = fingerprint_radius + df_embedding = self.recluster( + filter_values=filter_value, + filter_column=filter_column, + reload_data=reload_data + ) else: + # Can use previous embedding only if fingerprint has not changed df_embedding = self.cluster_wf.df_embedding return self.create_graph(df_embedding, @@ -426,6 +852,7 @@ def recluster_selection(self, north_stars=north_stars) def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop=None): + sys.stdout.flush() fig = go.Figure(layout={'colorscale': {}}) # Filter out relevant columns in this method. 
@@ -441,7 +868,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop moi_molregno = [] if north_stars: - moi_molregno = list(map(int, north_stars.split(","))) + moi_molregno = north_stars.split(",") #list(map(int, north_stars.split(","))) moi_filter = ldf['id'].isin(moi_molregno) @@ -461,6 +888,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop cluster = ldf['cluster'] customdata = ldf['id'] grad_prop = ldf[gradient_prop] + textdata = cupy.asarray([ + f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ]) if self.cluster_wf.is_gpu_enabled(): x_data = x_data.to_array() @@ -486,6 +915,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop 'showscale': True, 'cmin': cmin, 'cmax': cmax, + 'name': customdata } })) else: @@ -512,6 +942,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop y_data = cdf['y'] cluster = cdf['cluster'] customdata = cdf['id'] + textdata = [ f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ] + sys.stdout.flush() if self.cluster_wf.is_gpu_enabled(): x_data = x_data.to_array() @@ -524,14 +956,16 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop scatter_trace = go.Scattergl({ 'x': x_data, 'y': y_data, - 'text': cluster, + 'text': textdata, + #'text': cluster, 'customdata': customdata, 'name': 'Cluster ' + str(cluster_id), 'mode': 'markers', 'marker': { 'size': df_size, 'symbol': df_shape, - 'color': self.cluster_colors[int(cluster_id) % len(self.cluster_colors)], + 'color': self.cluster_colors[ + int(cluster_id) % len(self.cluster_colors)], }, }) if moi_present: @@ -558,6 +992,37 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop del ldf return fig, northstar_cluster + def create_plot(self, df, compound_property): + """ + Expects df to have x, y, cluster and train_set columns + """ + 
fig = go.Figure(layout={'colorscale': {}}) + scatter_trace = go.Scattergl({ + 'x': df['x'], + 'y': df['y'], + 'text': [ f'C-{c}_ID-{cid}' for c, cid in zip(df['cluster'], df['id']) ], + 'customdata': df['id'], + 'mode': 'markers', + 'marker': { + 'size': DOT_SIZE, + 'symbol': df['train_set'].apply(lambda x: 0 if x else 1), + 'color': df['cluster'].apply(lambda x: self.cluster_colors[x % len(self.cluster_colors)]), + }, + }) + fig.add_trace(scatter_trace) + # Change the title to indicate type of H/W in use + f_color = 'green' if self.cluster_wf.is_gpu_enabled() else 'blue' + fig.update_layout( + showlegend=True, clickmode='event', height=main_fig_height, + title=f'{PROP_DISP_NAME[compound_property]} Prediction', dragmode='select', + title_font_color=f_color, + annotations=[ + dict(x=0.5, y=-0.07, showarrow=False, text='Actual', + xref="paper", yref="paper"), + dict(x=-0.05, y=0.5, showarrow=False, text="Predicted", + textangle=-90, xref="paper", yref="paper")]) + return fig + def start(self, host=None, port=5000): return self.app.run_server( debug=False, use_reloader=False, host=host, port=port) @@ -587,7 +1052,7 @@ def construct_molecule_detail(self, selected_points, display_properties, prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] if chembl_ids: - props, selected_molecules = self.chem_data.fetch_props_by_chemble(chembl_ids) + props, selected_molecules = self.chem_data.fetch_props_by_chembl(chembl_ids) elif selected_points: selected_molregno = [] for point in selected_points['points'][((page - 1) * pageSize): page * pageSize]: @@ -631,7 +1096,7 @@ def construct_molecule_detail(self, selected_points, display_properties, td.append(html.Td(selected_chembl_id)) else: td.append(html.Td( - dbc.Button('Add as MoI', + dbc.Button('Highlight', id={'role': 'bt_star_candidate', 'chemblId': selected_chembl_id, 'molregno': str(molregno) @@ -640,7 +1105,7 @@ def construct_molecule_detail(self, selected_points, display_properties, )) td.append(html.Td( - 
dbc.Button('Add for Interpolation', + dbc.Button('Add', id={'role': 'bt_add_candidate', 'chemblId': selected_chembl_id, 'molregno': str(molregno) @@ -649,6 +1114,16 @@ def construct_molecule_detail(self, selected_points, display_properties, n_clicks=0) )) + td.append(html.Td( + dbc.Button('Analogue', + id={'role': 'bt_analoguing_candidate', + 'chemblId': selected_chembl_id, + 'molregno': str(molregno), + #'smiles': smiles + }, + n_clicks=0) + )) + prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) return html.Table(prop_recs, style={'width': '100%', 'border': '1px solid lightgray'}), all_props @@ -665,6 +1140,8 @@ def constuct_layout(self): html.Div([ dcc.Markdown("""**Molecule(s) of Interest**"""), + dcc.Markdown(children="""Click *Highlight* to populate this list""", + style={'marginTop': 18}), dcc.Markdown("Please enter ChEMBL ID(s) separated by commas."), html.Div(className='row', children=[ @@ -674,6 +1151,19 @@ def constuct_layout(self): className='three columns'), ], style={'marginLeft': 0, 'marginBottom': 18, }), + dcc.Markdown("For fingerprint changes to take effect, first *Apply* the *GPU KMeans-UMAP* Workflow, then *Recluster*"), + html.Div(className='row', children=[ + dcc.Markdown("Fingerprint Radius", style={'marginTop': 12,}), + dcc.Input(id='fingerprint_radius', value=2), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + + html.Div(className='row', children=[ + dcc.Markdown("Fingerprint Size", style={'marginTop': 12,}), + dcc.Input(id='fingerprint_nBits', value=512), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Tabs([ dcc.Tab(label='Cluster Molecules', children=[ dcc.Markdown("""**Select Workflow**""", style={'marginTop': 18, }), @@ -737,8 +1227,9 @@ def constuct_layout(self): dcc.RadioItems( id='rd_generation_type', options=[ - {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, {'label': 'Sample around one molecule', 'value': 'SAMPLE'}, + {'label': 'Fit cluster to property and extrapolate', 'value': 
'EXTRAPOLATE'}, + {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, ], value='INTERPOLATE', style={'marginTop': 18}, @@ -747,11 +1238,35 @@ def constuct_layout(self): ), html.Div(className='row', children=[ - dcc.Markdown("Number of molecules to generate", + dcc.Markdown("Number to be generated from each compound", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), dcc.Input(id='n2generate', value=10), ], style={'marginLeft': 0}), + html.Div(className='row', children=[ + html.Label([ + "Select molecular property for fitting and extrapolation", + dcc.Dropdown(id='extrap_compound_property', multi=False, clearable=False, + options=[{"label": PROP_DISP_NAME[p], "value": p} for p in IMP_PROPS], + value=IMP_PROPS[0]), + ], style={'marginTop': 18, 'marginLeft': 18})], + ), + + html.Div(className='row', children=[ + dcc.Markdown("Cluster number for fitting property and extrapolation", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_cluster_number', value=0), + ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ + dcc.Markdown("Step-size for extrapolation", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_step_size', value=0.1), + ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ + dcc.Markdown("Number of compounds to extrapolate", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_n_compounds', value=10), + ], style={'marginLeft': 0}), + html.Div(className='row', children=[ dcc.Markdown("Scaled sampling radius (int, start with 1)", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), @@ -761,6 +1276,8 @@ def constuct_layout(self): dcc.Markdown(children="""**Please Select Two**""", id="mk_selection_msg", style={'marginTop': 18}), + dcc.Markdown(children="""Click *Add* to populate this list""", + style={'marginTop': 18}), dcc.Checklist( id='ckl_candidate_mol_id', options=[], @@ -772,7 
+1289,129 @@ def constuct_layout(self): dbc.Button('Generate', id='bt_generate', n_clicks=0, style={'marginRight': 12}), dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0), ], style={'marginLeft': 0}), + + ]), + + dcc.Tab(label='Predict Properties', children=[ + + dcc.Markdown("**Select Featurizing Model**", style={'marginTop': 18,}), + html.Div(children=[ + dcc.Dropdown(id='sl_featurizing_wf', multi=False, + options=[{'label': 'CDDD Model', + 'value': 'cuchem.wf.generative.Cddd'}, + {'label': 'MolBART Model', + 'value': 'cuchem.wf.generative.MolBART'}, + {'label': 'MegatronMolBART Model', + 'value': 'cuchem.wf.generative.MegatronMolBART'}, + ], + value=self.generative_wf_cls, + clearable=False), + ]), + html.Div(className='row', children=[ + html.Label([ + "Select molecular property for fitting and prediction", + dcc.Dropdown(id='fit_nn_compound_property', multi=False, clearable=False, + options=[{"label": PROP_DISP_NAME[p], "value": p} for p in IMP_PROPS], + value=IMP_PROPS[0]), + ], style={'marginTop': 18, 'marginLeft': 18})], + ), + html.Div(className='row', children=[ + dcc.Markdown("Train cluster", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_train_cluster_number', value=0), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Test cluster", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_test_cluster_number', value=1), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Markdown(children="**Neural Network Parameters**", + id="nn_params_msg", + style={'marginTop': 18} + ), + html.Div(className='row', children=[ + dcc.Markdown("Hidden layer sizes", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_hidden_layer_sizes', value=''), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Activation Function", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_activation_fn', value='LeakyReLU'), + ], style={'marginLeft': 0, 'marginTop': 
'6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Final Activation Function", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_final_activation_fn', value='LeakyReLU'), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Number of training epochs", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_max_epochs', value=10), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Learning Rate", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_learning_rate', value=0.001), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Weight Decay (Adam)", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_weight_decay', value=0.0001), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Batch size", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_batch_size', value=1), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dbc.Button('Fit', id='bt_fit', n_clicks=0, style={'marginRight': 12}), + ], style={'marginLeft': 0} + ), ]), + + dcc.Tab(label='Find Analogues', children=[ + + html.Div(className='row', children=[ + dcc.Markdown("Maxinum Number of Analogues", style={'marginTop': 12,}), + dcc.Input(id='analoguing_n_analogues', value=10), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Similarity Threshold", style={'marginTop': 12,}), + dcc.Input(id='analoguing_threshold', value=0.33), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(children=[ + dcc.Dropdown(id='analoguing_type', multi=False, + options=[{'label': 'Similar compounds', + 'value': 'similar'}, + {'label': 'Compounds with the same scaffold', + 'value': 'scaffold'}, + {'label': 'Compounds that are superstructures', + 'value': 'superstructure'}, + ], + 
value='similar', + clearable=False), + ]), + dcc.Markdown(children="Choose a compound", + id="analoguing_msg", + style={'marginTop': 18} + ), + dcc.Markdown(children="Click *Analogue* to populate this list", + style={'marginTop': 18}), + dcc.Checklist( + id='ckl_analoguing_mol_id', + options=[], + value=[], + #inputStyle={'display': 'inline-block', 'marginLeft': 6, 'marginRight': 6}, + #labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6} + ), + html.Div(className='row', children=[ + dbc.Button('Search', id='bt_analoguing', n_clicks=0, style={'marginRight': 12}), + ], style={'marginLeft': 0}), + ]) + ]), html.Div(className='row', children=[ @@ -785,51 +1424,81 @@ def constuct_layout(self): ], className='three columns', style={'marginLeft': 18, 'marginTop': 90, 'verticalAlign': 'text-top', }), ]), - html.Div(className='row', children=[ - html.Div(id='section_generated_molecules', children=[ - html.Div(className='row', children=[ - html.A('Export to SDF', - id='download-link', - download="rawdata.sdf", - href="/cheminfo/downloadSDF", - target="_blank", - n_clicks=0, - style={'fontSize': '150%'} - ), - html.Div(id='msg_generated_molecules', children=[], - style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}), - ], style={'marginLeft': 0, 'marginBottom': 18, }), - html.Div(id='table_generated_molecules', children=[], style={'width': '100%'}) - ], style={'display': 'none', 'width': '100%'}), - - html.Div(id='section_selected_molecules', children=[ + #html.Div(className='row', children=[ + html.Div(id='section_generated_molecules', children=[ html.Div(className='row', children=[ - html.Div(id='section_display_properties', children=[ - html.Label([ - "Select Molecular Properties", - dcc.Dropdown(id='sl_mol_props', multi=True, - options=[ - {'label': 'alogp', 'value': 'alogp'}], - value=['alogp']), - ])], - className='nine columns'), - html.Div(children=[ - dbc.Button("<", id="bt_page_prev", - style={"height": "25px"}), - 
html.Span(children=1, id='current_page', - style={"paddingLeft": "6px"}), - html.Span(children=' of 1', id='total_page', - style={"paddingRight": "6px"}), - dbc.Button(">", id="bt_page_next", - style={"height": "25px"}) - ], - className='three columns', - style={'verticalAlign': 'text-bottom', 'text-align': 'right'} - ), - ], style={'margin': 12}), - html.Div(id='tb_selected_molecules', children=[], style={'width': '100%'}) - ], style={'display': 'none', 'width': '100%'}), - ], style={'margin': 12}), + html.A('Export to SDF', + id='download-link', + download="rawdata.sdf", + href="/cheminfo/downloadSDF", + target="_blank", + n_clicks=0, + style={'fontSize': '150%'} + ), + html.Div(id='msg_generated_molecules', children=[], + style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}), + ], style={'marginLeft': 0, 'marginBottom': 18, }), + html.Div(id='table_generated_molecules', children=[], style={'width': '100%'}) + ], style={'display': 'none', 'width': '100%'}), + + + html.Div(id='section_generated_molecules_clustered', children=[ + dcc.Graph(id='gen_figure', figure=fig, + #className='nine columns', + #style={'verticalAlign': 'text-top'} + ), + ], style={'display': 'none'}), + + html.Div(id='section_fitting', children=[ + dcc.Graph(id='fitting_figure', figure=fig, + #className='nine columns', + #style={'verticalAlign': 'text-top'} + ), + ], style={'display': 'none'}), + + html.Div(id='section_selected_molecules', children=[ + html.Div(className='row', children=[ + html.Div(id='section_display_properties', children=[ + html.Label([ + "Select Molecular Properties", + dcc.Dropdown(id='sl_mol_props', multi=True, + options=[ + {'label': 'alogp', 'value': 'alogp'}], + value=['alogp']), + ])], + className='nine columns'), + html.Div(children=[ + dbc.Button("<", id="bt_page_prev", + style={"height": "25px"}), + html.Span(children=1, id='current_page', + style={"paddingLeft": "6px"}), + html.Span(children=' of 1', id='total_page', + 
style={"paddingRight": "6px"}), + dbc.Button(">", id="bt_page_next", + style={"height": "25px"}) + ], + className='three columns', + style={'verticalAlign': 'text-bottom', 'text-align': 'right'} + ), + ], style={'margin': 12}), + + html.Div( + id='tb_selected_molecules', + children=[], + style={'width': '100%'} + ) + ], style={'display': 'none', 'width': '100%'}), + + html.Div(id='section_analoguing', children=[ + html.Div(children=[ + html.Div(id='tb_analoguing', children=[], + style={'verticalAlign': 'text-top'} + ), + ]) + ], style={'display': 'none'}), + + #], style={'margin': 12}), + html.Div(id='refresh_main_fig', style={'display': 'none'}), html.Div(id='northstar_cluster', style={'display': 'none'}), @@ -837,9 +1506,10 @@ def constuct_layout(self): html.Div(id='mol_selection_error', style={'display': 'none'}), html.Div(id='show_selected_mol', style={'display': 'none'}), html.Div(id='show_generated_mol', style={'display': 'none'}), - html.Div(id='genration_candidates', style={'display': 'none'}), + html.Div(id='generation_candidates', style={'display': 'none'}), html.Div(id='refresh_moi_prop_table', style={'display': 'none'}), html.Div(id='interpolation_error', style={'display': 'none'}), + html.Div(id='analoguing_candidates', style={'display': 'none'}), # Not displayed but used to keep track of compounds added to checklist of compounds to be analogued html.Div(className='row', children=[ dbc.Modal([ @@ -986,9 +1656,12 @@ def handle_mark_north_star(self, bt_north_star_click, north_star): return ','.join(selected_north_star) @report_ui_error(4) - def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_clicks, - sl_prop_gradient, sl_nclusters, refresh_main_fig, - selected_clusters, selected_points, north_star, refresh_moi_prop_table): + def handle_re_cluster( + self, bt_cluster_clicks, bt_point_clicks, bt_north_star_clicks, + sl_prop_gradient, sl_nclusters, refresh_main_fig, + fingerprint_radius, fingerprint_nBits, + selected_clusters, 
selected_points, north_star, refresh_moi_prop_table + ): comp_id, event_type = self._fetch_event_data() if comp_id == 'sl_nclusters': @@ -998,6 +1671,9 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl raise dash.exceptions.PreventUpdate + if comp_id in ['fingerprint_radius', 'fingerprint_nBits']: + raise dash.exceptions.PreventUpdate + filter_values = None filter_column = None reload_data = False @@ -1018,10 +1694,11 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl elif comp_id == 'bt_north_star' and event_type == 'n_clicks': if north_star: north_star = north_star.split(',') - missing_mols, molregnos, _ = self.cluster_wf.add_molecules(north_star) + missing_mols, molregnos, _ = self.cluster_wf.add_molecules( + north_star, radius=int(fingerprint_radius), nBits=int(fingerprint_nBits)) recluster_data = len(missing_mols) > 0 logger.info("%d missing molecules added...", len(missing_mols)) - logger.debug("Missing molecules werew %s", missing_mols) + logger.debug("Missing molecules were %s", missing_mols) moi_molregno = " ,".join(list(map(str, molregnos))) if refresh_moi_prop_table is None: @@ -1050,6 +1727,9 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl north_stars=moi_molregno, color_col='cluster', reload_data=reload_data, - recluster_data=recluster_data) + recluster_data=recluster_data, + fingerprint_radius=int(fingerprint_radius), + fingerprint_nBits=int(fingerprint_nBits) + ) return figure, ','.join(northstar_cluster), _refresh_moi_prop_table, dash.no_update diff --git a/cuchem/cuchem/wf/cluster/__init__.py b/cuchem/cuchem/wf/cluster/__init__.py index 5c4ff371..f8e37966 100644 --- a/cuchem/cuchem/wf/cluster/__init__.py +++ b/cuchem/cuchem/wf/cluster/__init__.py @@ -24,16 +24,14 @@ def _remove_ui_columns(self, embedding): def _remove_non_numerics(self, embedding): embedding = self._remove_ui_columns(embedding) - - other_props = ['id'] + IMP_PROPS + 
ADDITIONAL_FEILD - # Tempraryly store columns not required during processesing + # Fingerprint columns have the names 0, 1, 2,... + non_numeric_col_names = [col for col in embedding.columns if type(col) != int] + # Temporarily store columns not required during processesing prop_series = {} - for col in other_props: - if col in embedding.columns: - prop_series[col] = embedding[col] + for col in non_numeric_col_names: + prop_series[col] = embedding[col] if len(prop_series) > 0: - embedding = embedding.drop(other_props, axis=1) - + embedding = embedding.drop(non_numeric_col_names, axis=1) return embedding, prop_series def _random_sample_from_arrays(self, *input_array_list, n_samples=None, index=None): @@ -99,7 +97,7 @@ def recluster(self, def add_molecules(self, chemblids: List): """ - ChembleId's accepted as argument to the existing database. Duplicates + ChemblId's accepted as argument to the existing database. Duplicates must be ignored. """ raise NotImplementedError diff --git a/cuchem/cuchem/wf/cluster/gpukmeansumap.py b/cuchem/cuchem/wf/cluster/gpukmeansumap.py index 413360ee..0942ca42 100644 --- a/cuchem/cuchem/wf/cluster/gpukmeansumap.py +++ b/cuchem/cuchem/wf/cluster/gpukmeansumap.py @@ -22,6 +22,8 @@ import cuml import dask import dask_cudf +import sys +from dask.distributed import wait from cuchemcommon.context import Context from cuchemcommon.data import ClusterWfDAO from cuchemcommon.data.cluster_wf import ChemblClusterWfDao @@ -42,43 +44,51 @@ @singledispatch -def _gpu_cluster_wrapper(embedding, n_pca, self): +def _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self): return NotImplemented @_gpu_cluster_wrapper.register(dask.dataframe.core.DataFrame) -def _(embedding, n_pca, self): +def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = dask_cudf.from_dask_dataframe(embedding) - return _gpu_cluster_wrapper(embedding, n_pca, self) + return _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self) 
@_gpu_cluster_wrapper.register(cudf.DataFrame) -def _(embedding, n_pca, self): +def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = dask_cudf.from_cudf(embedding, - chunksize=int(embedding.shape[0] * 0.1)) - return _gpu_cluster_wrapper(embedding, n_pca, self) + chunksize=max(10, int(embedding.shape[0] * 0.1))) + return _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self) @_gpu_cluster_wrapper.register(dask_cudf.core.DataFrame) -def _(embedding, n_pca, self): +def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = embedding.persist() - return self._cluster(embedding, n_pca) + return self._cluster(embedding, n_pca, reuse_umap, reuse_pca) class GpuKmeansUmap(BaseClusterWorkflow, metaclass=Singleton): - + # TODO: support changing fingerprint radius and nBits in other kmeans workflows as well (hybrid, random projection) def __init__(self, n_molecules: int = None, dao: ClusterWfDAO = ChemblClusterWfDao(MorganFingerprint), pca_comps=64, n_clusters=7, - seed=0): + seed=0, + fingerprint_radius=2, + fingerprint_nBits=512 + ): super().__init__() - self.dao = dao + self.dao = dao if dao is not None else ChemblClusterWfDao( + MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits + ) + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits self.n_molecules = n_molecules self.pca_comps = pca_comps self.pca = None + self.umap = None self.n_clusters = n_clusters self.df_embedding = None @@ -87,12 +97,12 @@ def __init__(self, self.n_spearman = 5000 self.n_silhouette = 500000 - def _cluster(self, embedding, n_pca): + def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=True): """ Generates UMAP transformation on Kmeans labels generated from molecular fingerprints. 
""" - + reuse_umap = reuse_umap and reuse_pca dask_client = self.context.dask_client embedding = embedding.reset_index() @@ -106,15 +116,17 @@ def _cluster(self, embedding, n_pca): if n_pca and n_obs > n_pca: with MetricsLogger('pca', self.n_molecules) as ml: - if self.pca is None: + if (self.pca is None) or not reuse_pca: self.pca = cuDaskPCA(client=dask_client, n_components=n_pca) self.pca.fit(embedding) + else: + logger.info(f'Using available pca') embedding = self.pca.transform(embedding) embedding = embedding.persist() with MetricsLogger('kmeans', self.n_molecules) as ml: - if self.n_molecules < MIN_RECLUSTER_SIZE: - raise Exception('Reclustering less than %d molecules is not supported.' % MIN_RECLUSTER_SIZE) + if self.n_molecules < self.n_clusters: # < MIN_RECLUSTER_SIZE: + raise Exception('Reclustering {self.n_molecules} molecules into {self.n_clusters} clusters not supported.')# % MIN_RECLUSTER_SIZE) kmeans_cuml = cuDaskKMeans(client=dask_client, n_clusters=self.n_clusters) @@ -137,13 +149,18 @@ def _cluster(self, embedding, n_pca): local_model = cuUMAP() local_model.fit(X_train) - umap_model = cuDaskUMAP(local_model, - n_neighbors=100, - a=1.0, - b=1.0, - learning_rate=1.0, - client=dask_client) - Xt = umap_model.transform(embedding) + if not (reuse_umap and self.umap): + self.umap = cuDaskUMAP( + local_model, + n_neighbors=100, + a=1.0, + b=1.0, + learning_rate=1.0, + client=dask_client + ) + else: + logger.info(f'reusing {self.umap}') + Xt = self.umap.transform(embedding) ml.metric_name = 'spearman_rho' ml.metric_func = self._compute_spearman_rho @@ -165,30 +182,55 @@ def _cluster(self, embedding, n_pca): return embedding - def cluster(self, df_mol_embedding=None): - - logger.info("Executing GPU workflow...") - - if df_mol_embedding is None: + def cluster( + self, + df_mol_embedding=None, + reuse_umap=False, + reuse_pca=True, + fingerprint_radius=2, + fingerprint_nBits=512 + ): + + logger.info(f"GpuKmeansUmap.cluster(radius={fingerprint_radius}, 
nBits={fingerprint_nBits}), df_mol_embedding={df_mol_embedding}") + sys.stdout.flush() + if (df_mol_embedding is None) or (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): self.n_molecules = self.context.n_molecule - + self.dao = ChemblClusterWfDao( + MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits) + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits + logger.info(f'dao={self.dao}, getting df_mol_embedding...') df_mol_embedding = self.dao.fetch_molecular_embedding( self.n_molecules, cache_directory=self.context.cache_directory, + radius=fingerprint_radius, + nBits=fingerprint_nBits ) - df_mol_embedding = df_mol_embedding.persist() - - self.df_embedding = _gpu_cluster_wrapper(df_mol_embedding, - self.pca_comps, - self) + wait(df_mol_embedding) + + self.df_embedding = _gpu_cluster_wrapper( + df_mol_embedding, + self.pca_comps, + reuse_umap, + reuse_pca, + self + ) return self.df_embedding def recluster(self, filter_column=None, filter_values=None, - n_clusters=None): + n_clusters=None, + fingerprint_radius=2, + fingerprint_nBits=512 + ): + + # The user may have changed the fingerprint specification, in which case, we cannot reuse the embeddings + if (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): + return self.cluster(df_mol_embedding=None, reuse_umap=False, reuse_pca=False, fingerprint_radius=fingerprint_radius, fingerprint_nBits=fingerprint_nBits) + logger.info(f"recluster(radius={fingerprint_radius}, nBits={fingerprint_nBits}): reusing embedding") df_embedding = self.df_embedding if filter_values is not None: filter = df_embedding[filter_column].isin(filter_values) @@ -199,11 +241,11 @@ def recluster(self, if n_clusters is not None: self.n_clusters = n_clusters - self.df_embedding = _gpu_cluster_wrapper(df_embedding, None, self) + self.df_embedding = _gpu_cluster_wrapper(df_embedding, None, False, True, 
self) return self.df_embedding - def add_molecules(self, chemblids: List): + def add_molecules(self, chemblids: List, radius=2, nBits=512): chemblids = [x.strip().upper() for x in chemblids] chem_mol_map = {row[0]: row[1] for row in self.dao.fetch_id_from_chembl(chemblids)} @@ -245,8 +287,6 @@ def add_molecules(self, chemblids: List): if hasattr(self.df_embedding, 'compute'): self.df_embedding = self.df_embedding.compute() - logger.info(self.df_embedding.shape) - return chem_mol_map, molregnos, self.df_embedding @@ -264,7 +304,7 @@ def __init__(self, n_clusters=n_clusters, seed=seed) - def _cluster(self, embedding, n_pca): + def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=False): """ Generates UMAP transformation on Kmeans labels generated from molecular fingerprints. @@ -284,7 +324,7 @@ def _cluster(self, embedding, n_pca): if n_pca and n_obs > n_pca: with MetricsLogger('pca', self.n_molecules) as ml: - if self.pca == None: + if (self.pca == None) or not reuse_pca: self.pca = cuml.PCA(n_components=n_pca) self.pca.fit(embedding) embedding = self.pca.transform(embedding) @@ -308,8 +348,8 @@ def _cluster(self, embedding, n_pca): ml.metric_func_args = (embedding_sample, kmeans_labels_sample) with MetricsLogger('umap', self.n_molecules) as ml: - umap = cuml.manifold.UMAP() - Xt = umap.fit_transform(embedding) + self.umap = cuml.manifold.UMAP() + Xt = self.umap.fit_transform(embedding) ml.metric_name = 'spearman_rho' ml.metric_func = self._compute_spearman_rho diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index 1437ce97..831313c0 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -1,4 +1,5 @@ import logging +import os import grpc import pandas as pd @@ -12,7 +13,25 @@ from cuchemcommon.utils.singleton import Singleton from cuchemcommon.workflow import BaseGenerativeWorkflow +# Check if all these are needed: +from 
cuchemcommon.fingerprint import MorganFingerprint +import torch +import torch.nn +from torch.utils.data import Dataset, DataLoader +import cupy as cp +import pickle +from pathlib import Path +import numpy as np +from functools import partial +from rdkit import Chem +from rdkit.Chem import MolFromSmiles, RDKFingerprint +from rdkit.DataStructs import FingerprintSimilarity +from cuml import Lasso, Ridge #LinearRegression +from cuml.metrics import mean_squared_error +from math import sqrt + logger = logging.getLogger(__name__) +PAD_TOKEN = 0 # TODO: use tokenizer.pad_token instead class MegatronMolBART(BaseGenerativeWorkflow): @@ -20,7 +39,10 @@ class MegatronMolBART(BaseGenerativeWorkflow): def __init__(self, dao: GenerativeWfDao = ChemblGenerativeWfDao(None)) -> None: super().__init__(dao) - + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' self.min_jitter_radius = 1 self.channel = grpc.insecure_channel('megamolbart:50051') self.stub = GenerativeSamplerStub(self.channel) @@ -64,7 +86,13 @@ def find_similars_smiles(self, num_requested: int = 10, scaled_radius=None, force_unique=False, - sanitize=True): + sanitize=True, + compound_id=None): + if isinstance(compound_id, list): + # Sometimes calling routine may send a list of length one containing the compound ID + if len(compound_id) > 1: + logger.info(f'find_similars_smiles received {compound_id}, generating neighbors only for first compound!') + compound_id = compound_id[0] spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -78,21 +106,32 @@ def find_similars_smiles(self, for embedding in result.embeddings: embeddings.append(list(embedding.embedding)) dims.append(embedding.dim) - - generated_df = pd.DataFrame({'SMILES': generatedSmiles, - 'embeddings': embeddings, - 'embeddings_dim': dims, - 'Generated': [True for i in range(len(generatedSmiles))]}) - generated_df['Generated'].iat[0] = False - + if not compound_id: + compound_id = 
'source' + generated_df = pd.DataFrame({ + 'SMILES': generatedSmiles, + 'embeddings': embeddings, + 'embeddings_dim': dims, + 'Generated': [False] + [True] * (len(generatedSmiles) - 1), + 'id': [ + str(compound_id)] + [f'{compound_id}-g{i + 1}' + for i in range(len(generatedSmiles) - 1) + ], + }) return generated_df - def interpolate_smiles(self, - smiles: List, - num_points: int = 10, - scaled_radius=None, - force_unique=False, - sanitize=True): + def interpolate_smiles( + self, + smiles: List, + num_points: int = 10, + scaled_radius=None, + force_unique=False, + compound_ids=[], + sanitize=True + ): + if len(compound_ids) == 0: + compound_ids = [f'source{i}' for i in range(len(smiles))] + spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -102,9 +141,405 @@ def interpolate_smiles(self, result = self.stub.Interpolate(spec) result = list(result.generatedSmiles) - - generated_df = pd.DataFrame({'SMILES': result, - 'Generated': [True for i in range(len(result))]}) - generated_df.iat[0, 1] = False - generated_df.iat[-1, 1] = False + n_pairs = len(compound_ids) - 1 + n_generated = num_points + 2 + n_generated_total = n_generated * n_pairs + assert len(result) == n_generated_total, f"Expected generator to return {n_generated} compounds between each of the {n_pairs} compound-pairs but got {len(result)}" + generated_df = pd.DataFrame({ + 'SMILES': result, + 'Generated': [ + i % n_generated not in [0, n_generated - 1] + for i in range(n_generated_total) + ], + 'id': [ + str(compound_ids[i // n_generated]) if i % n_generated == 0 + else str(compound_ids[1 + i // n_generated]) if i % n_generated == n_generated - 1 + else f'{compound_ids[i // n_generated]}-{compound_ids[1 + i // n_generated]}_i{i % n_generated}' + for i in range(n_generated_total) + ], + }) + #generated_df = pd.DataFrame({'SMILES': result, + # 'Generated': [True for i in range(len(result))]}) + #generated_df.iat[0, 1] = False + #generated_df.iat[-1, 1] = False 
return generated_df + + + def extrapolate_from_cluster(self, + compounds_df, + compound_property: str, + cluster_id: int = 0, + n_compounds_to_transform=10, + num_points: int = 10, + step_size: float = 0.01, + force_unique = False, + scaled_radius: int = 1): + """ + The embedding vector is calculated for the specified cluster_id and applied over it. + TO DO: We should have a table of direction vectors in embedded space listed, just like the list of compound IDs. + The user should choose one to be applied to the selected compounds, or to a cluster number. + """ + smiles_list = None + radius = self._compute_radius(scaled_radius) + # TO DO: User must be able to extrapolate directly from smiles in the table; + # these may themselves be generated compounds without any chemblid. + df_cluster = compounds_df[ compounds_df['cluster'] == int(cluster_id) ].dropna().reset_index(drop=True) + if hasattr(df_cluster, 'compute'): + df_cluster = df_cluster.compute() + if 'transformed_smiles' in df_cluster: + smiles_col = 'transformed_smiles' + elif 'SMILES' in df_cluster: + smiles_col = 'SMILES' + elif 'smiles' in df_cluster: + smiles_col = 'smiles' + else: + raise RuntimeError(f'No smiles column in df_cluster: {list(df_cluster.columns)}\n{df_cluster.head()}') + # NOTE(review): removed erroneous 'smiles_col = None' here — it clobbered the column name detected above + smiles_list = df_cluster[smiles_col].to_array() + return self.extrapolate_from_smiles(smiles_list, + compound_property_vals=df_cluster[compound_property].to_gpu_array(), #to_array(), #[:n_compounds_to_transform].to_array(), + num_points=num_points, + n_compounds_to_transform=n_compounds_to_transform, + step_size=step_size, + scaled_radius=radius, + force_unique=force_unique, + id_list=df_cluster['id'].to_array()) + + def _get_embedding_direction(self, + embedding_list, + compound_property_vals, + ): + """ + Get the embedding of all compounds in the specified cluster.
+ Then perform a linear regression against the compound_property to find the direction in + embedded space along which the compound_property tends to increase. + Using the minimum and maximum values of the compound_property in the cluster to define the range, + compute the step size along the direction that is expected to increase the compound_property value by step_percentage. + """ + + logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)},'\ + f' prop range: {max(compound_property_vals)} - {min(compound_property_vals)}.\nFitting Lasso regression...') + n_data = compound_property_vals.shape[0] + n_dimensions = embedding_list[0].shape[0] + reg = Lasso() + reg = reg.fit(embedding_list, compound_property_vals) + n_zero_coefs = len([x for x in reg.coef_ if x == 0.0]) + zero_coef_indices = [i for i, x in enumerate(reg.coef_) if x != 0.0] + logger.info(f'coef: {n_zero_coefs} / {len(reg.coef_)} coefficients are zero (in some positions between {min(zero_coef_indices)} and {max(zero_coef_indices)});'\ + f' range: {reg.coef_.argmin()}: {min(reg.coef_)} to {reg.coef_.argmax()}: {max(reg.coef_)}') + + y_pred = reg.predict(embedding_list) + rmse = sqrt(mean_squared_error(compound_property_vals, y_pred.astype('float64'))) + pearson_rho = cp.corrcoef(compound_property_vals, y_pred) + emb_std = np.std(embedding_list, axis=0) + logger.info(f'_get_embedding_direction: n={len(compound_property_vals)}, rho={pearson_rho}, rmse={rmse}, embedding_list.std: {emb_std}') + + emb_max = embedding_list[ np.argmax(compound_property_vals) ] + emb_min = embedding_list[ np.argmin(compound_property_vals) ] + diff_size = np.linalg.norm(emb_max - emb_min) / sqrt(n_dimensions) + #logger.info(f'compound_property_vals: [{np.argmin(compound_property_vals)}]={np.amin(compound_property_vals)}, [{np.argmax(compound_property_vals)}]={np.amax(compound_property_vals)}, diff_size={diff_size}') + return
reg.coef_, emb_std, diff_size + + + def extrapolate_from_smiles(self, + smiles_list, + compound_property_vals, + num_points: int, + step_size: float, + scaled_radius=None, + force_unique=False, + n_compounds_to_transform=10, + id_list=[], + debug=False): + """ + Given a list of smiles strings, convert each to its embedding. + Then take num_points steps in the specified direction (in the embedded space) of size step_size. + Convert these points on the embedded space back to smiles strings and return as a dataframe. + Modify duplicates if force_unique is True by adding a jitter of magnitude radius to the embedding. + """ + # TODO: generated compounds are the same no matter what the step-size is, check code!!!! + # TODO: generated compounds are yielding different Tanimotos even though they are identical. Bug or jitter??? + step_size = float(step_size) + n_compounds_to_transform = int(n_compounds_to_transform) + if len(id_list) == 0: + id_list = list(map(str, range(len(smiles_list)))) + logger.info(f'molbart: extrapolate_from_smiles: {len(smiles_list)} smiles ({type(smiles_list)}), {num_points} extrapolations each with step_size {step_size}') + data = pd.DataFrame({'transformed_smiles': smiles_list}) + full_mask = None + emb_shape = None + n_recovered = 0 + avg_tani = 0 + embeddings = [] + for i, smiles in enumerate(smiles_list): + spec = GenerativeSpec( + model=GenerativeModel.MegaMolBART, + smiles=smiles, + ) + result = self.stub.SmilesToEmbedding(spec) + emb = result.embedding + mask = result.pad_mask + emb_shape = result.dim + if debug: + spec = EmbeddingList( + embedding=emb, + dim=emb_shape, + pad_mask=mask + ) + generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles + if len(generated_mols) > 0: + n_recovered += 1 + tani = FingerprintSimilarity(RDKFingerprint(MolFromSmiles(smiles)), RDKFingerprint(MolFromSmiles(generated_mols[0]))) + logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{mask.shape} -->
{generated_mols} (tani={tani:.2f})') + avg_tani += tani + logger.info(f'emb: {type(emb)}, dim={emb_shape}, mask={len(mask)}, emb={len(emb)}') + embeddings.append(torch.tensor(emb)) #.detach().reshape(-1)) #torch tensor + if full_mask is None: + logger.info(f'First mask = {mask}') + full_mask = mask + emb_shape = emb_shape # n_tokens x 1 x 256 + else: + full_mask = [a and b for a, b in zip(full_mask, mask)] # not used any more + if debug: + logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if n_recovered > 0 else 0}') + + embeddings = torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True, padding_value=PAD_TOKEN) # n_smiles x embedding_length + n_embedding_tokens = int(embeddings.shape[1] / (emb_shape[1] * emb_shape[2])) + emb_shape = [n_embedding_tokens, emb_shape[1], emb_shape[2]] + embeddings = cp.asarray(embeddings) + full_mask = [False] * n_embedding_tokens + + # Use the entire cluster to infer the direction: + direction, emb_std, diff_size = self._get_embedding_direction(embeddings, compound_property_vals) + if diff_size == 0.0: + logger.info(f'Increasing diff_size from 0.0 to 1e-6') + diff_size = 1e-6 + + # But apply the transform to no more than n_compounds_to_transform, chosen at random + if n_compounds_to_transform < len(smiles_list): + indices = np.random.choice(list(range(len(smiles_list))), size=n_compounds_to_transform, replace=False) + smiles_list = [smiles_list[i] for i in indices] + embeddings = cp.asarray([embeddings[i,:] for i in indices]) + id_list = [id_list[i] for i in indices] + + result_df_list = [ pd.DataFrame({'SMILES': smiles_list, 'Generated': False, 'id': id_list}) ] + + for step_num in range(1, 1 + num_points): + direction_sampled = cp.random.normal(loc=direction, scale=emb_std, size=emb_std.shape) #direction + noise + step = float(step_num * diff_size * step_size) * direction_sampled + extrap_embeddings = embeddings + step # TODO: print and check 
output + #logger.info(f'step ({step_num} * {diff_size} * {step_size} * direction_sampled): {type(step)}, {step.shape}, {step}\n:extrap_embeddings: {type(extrap_embeddings)}, {extrap_embeddings.shape}, extrap_embeddings[0]={extrap_embeddings[0]}') + smiles_gen_list = [] + ids_interp_list = [] + for i in range(len(extrap_embeddings)): + extrap_embedding = list(extrap_embeddings[i,:]) + spec = EmbeddingList( + embedding=extrap_embedding, + dim=emb_shape, + pad_mask=full_mask + ) + smiles_gen = self.stub.EmbeddingToSmiles(spec).generatedSmiles[0] + logger.info(f'{i}: {smiles_gen}') + smiles_gen_list.append(smiles_gen) + ids_interp_list.append(f'{id_list[i]}-s{step_num}') + extrap_df = pd.DataFrame({ + 'SMILES': smiles_gen_list, + 'Generated': True, + 'id': ids_interp_list + }) + if force_unique: + inv_transform_funct = partial(self.inverse_transform, + mem_pad_mask=full_mask) + extrap_df = self.compute_unique_smiles(extrap_df, + smiles_gen, + inv_transform_funct, + radius=radius) + logger.info(f'step_num={step_num} yielded {len(extrap_df)} compounds') + result_df_list.append(extrap_df) + results_df = pd.concat(result_df_list, ignore_index=True) + results_df['id'] = results_df['id'].apply(str) + results_df.sort_values('id', inplace=True) + results_df.reset_index(drop=True, inplace=True) + return results_df + + def fit_nn( + self, + compounds_df, + compound_property, + cluster_id_train, + cluster_id_test, + hidden_layer_sizes, + activation_fn, + final_activation_fn, + max_epochs, + batch_size=32, + learning_rate=0.001, + weight_decay=0.0001, + debug=False, + #scaled_radius=None + ): + """ + Convert compound SMILES to embeddings, then train a neural network with n_layers hidden layers with the specified activation function (activation_fn) + to predict the specified compound_property of cluster_id_train. Evaluate the model on cluster_id_test. Return actual and predicted values for both + the train and test set. 
+ """ + logger.info(f'cluster_id_train={cluster_id_train}, cluster_id_test={cluster_id_test}, compound_property={compound_property}, compounds_df: {len(compounds_df)}, {type(compounds_df)}') + df_train = compounds_df[ compounds_df['cluster'] == int(cluster_id_train) ].dropna().reset_index(drop=True)#.compute() + if hasattr(df_train, 'compute'): + df_train = df_train.compute() + df_test = compounds_df[ compounds_df['cluster'] == int(cluster_id_test) ].dropna().reset_index(drop=True)#.compute() + if hasattr(df_test, 'compute'): + df_test = df_test.compute() + n_train = len(df_train) + n_test = len(df_test) + + smiles_list = np.concatenate((df_train['transformed_smiles'].to_array(), df_test['transformed_smiles'].to_array()), axis=0) + logger.info(f'smiles_list: {smiles_list.shape}') + pad_length = max(map(len, smiles_list)) + 2 # add 2 for start / stop + embeddings = [] + emb_shape = None + n_recovered = 0 + avg_tani = 0 + + for i, smiles in enumerate(smiles_list): + spec = GenerativeSpec( + model=GenerativeModel.MegaMolBART, + smiles=smiles, + #radius=radius + ) + result = self.stub.SmilesToEmbedding(spec) + emb = result.embedding + mask = result.pad_mask + dim = result.dim + emb_shape = result.dim + if debug: + spec = EmbeddingList( + embedding=emb, + dim=emb_shape, + pad_mask=mask + ) + generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles + if len(generated_mols) > 0: + m = MolFromSmiles(generated_mols[0]) + if m is not None: + n_recovered += 1 + tani = FingerprintSimilarity(RDKFingerprint(MolFromSmiles(smiles)), RDKFingerprint(m)) + logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{len(mask)} --> {generated_mols} (tani={tani:.2f})') + avg_tani += tani + embeddings.append(torch.tensor(emb, device=self.device)) #emb.detach().reshape(-1)) #torch tensor + + if debug: + logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if 
n_recovered > 0 else 0}') + + embeddings = torch.nn.utils.rnn.pad_sequence( + embeddings, batch_first=True, padding_value=PAD_TOKEN) + embeddings_train = embeddings[:n_train,:] + embeddings_test = embeddings[n_train:,:] + compound_property_vals_train = torch.tensor(df_train[compound_property], device=self.device, dtype=torch.float32) + compound_property_vals_test = torch.tensor(df_test[compound_property], device=self.device, dtype=torch.float32) + train_pred, test_pred = self._build_and_train_nn( + embeddings_train, + compound_property_vals_train, + embeddings_test, + compound_property_vals_test, + hidden_layer_sizes = hidden_layer_sizes, + activation_fn=activation_fn, + final_activation_fn=final_activation_fn, + max_epochs=max_epochs, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size + ) + df = pd.DataFrame({ + 'x': torch.cat((compound_property_vals_train, compound_property_vals_test), axis=0).to('cpu').numpy(), + 'y': torch.cat((train_pred.detach(), test_pred.detach()), axis=0).to('cpu').flatten().numpy(), + 'cluster': np.concatenate((df_train['cluster'].to_array(), df_test['cluster'].to_array()), axis=0), + 'id': np.concatenate((df_train['id'].to_array(), df_test['id'].to_array()), axis=0), + 'train_set': [True] * n_train + [False] * n_test + }) + return df + + def _build_and_train_nn(self, + embedding_list_train, + compound_property_vals_train, + embedding_list_test, + compound_property_vals_test, + hidden_layer_sizes = [], + activation_fn='LeakyReLU', + final_activation_fn='LeakyReLU', + max_epochs=10, + batch_size=32, + learning_rate=0.001, + weight_decay=0.0001 + ): + """ + Construct a neural network with the specified number of layers, using the specified activation function. + Then train it on the training set and evaluate on the test set. Return results. 
+ """ + + logger.info(f'_build_and_train_nn: emb_train:{embedding_list_train.shape}, {type(embedding_list_train)}, embedding_list_train[0]:{len(embedding_list_train[0])},'\ + f' prop:{compound_property_vals_train.shape}, {type(compound_property_vals_train)},'\ + f' prop_train: {min(compound_property_vals_train)} - {max(compound_property_vals_train)}') + n_data_train = compound_property_vals_train.shape[0] + n_dimensions = embedding_list_train[0].shape[0] + comp_net = CompoundNet(n_dimensions, hidden_layer_sizes, activation_fn, final_activation_fn).to(self.device) + logger.info(comp_net) + loss_fn = torch.nn.SmoothL1Loss() + opt = torch.optim.Adam(comp_net.parameters(), lr=learning_rate, weight_decay=weight_decay) + train_set = CompoundData(embedding_list_train, compound_property_vals_train) + loader = DataLoader(train_set, batch_size=batch_size, shuffle=True) + + i = 0 + for epoch in range(max_epochs): + total_loss = 0.0 + for compounds, properties in loader: + opt.zero_grad() + predictions = comp_net(compounds) + loss = loss_fn(predictions, properties) + loss.backward() + opt.step() + total_loss += loss.item() + i += 1 + logger.info(f'epoch {epoch+1}, {i} batches: {total_loss / (n_data_train * (epoch+1))}') + + comp_net.eval() + train_pred = comp_net(embedding_list_train) + test_pred = comp_net(embedding_list_test) + + return train_pred, test_pred + + +class CompoundData(Dataset): + + def __init__(self, compounds, properties): + self.compounds = compounds + self.properties = properties + + def __len__(self): + return len(self.compounds) + + def __getitem__(self, compound_index): + return self.compounds[compound_index,:], self.properties[compound_index] + + +class CompoundNet(torch.nn.Module): + + def __init__(self, n_input_features, hidden_layer_sizes, activation_fn, last_activation_fn=None): + super(CompoundNet, self).__init__() + hidden_layer_sizes.append(1) # output layer size is appended to hidden layer sizes + layers = [torch.nn.Linear(n_input_features, 
hidden_layer_sizes[0])] + try: + activation = getattr(torch.nn, activation_fn) + if last_activation_fn: + last_activation = getattr(torch.nn, last_activation_fn) + except Exception as e: + raise UserError(f'Activation function name {activation_fn} / {last_activation_fn} not recognized') + for i, hidden_layer_size in enumerate(hidden_layer_sizes[:-1]): + layers.append(activation()) + layers.append(torch.nn.Linear(hidden_layer_size, hidden_layer_sizes[i + 1])) + if last_activation_fn: + # Having a non-linear function right before the output may not be needed for some properties being predicted + layers.append(last_activation()) + self.layers = torch.nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) diff --git a/cuchem/requirements.txt b/cuchem/requirements.txt index 5cdeaa6d..fc6cce1e 100644 --- a/cuchem/requirements.txt +++ b/cuchem/requirements.txt @@ -24,4 +24,4 @@ plotly==4.9.0 pytest==6.2.2 umap-learn==0.5.1 grpcio -git+https://github.com/jrwnter/cddd.git@1.0 \ No newline at end of file +git+https://github.com/jrwnter/cddd.git@1.0 diff --git a/cuchem/startdash.py b/cuchem/startdash.py index e2ca5355..9a92167c 100755 --- a/cuchem/startdash.py +++ b/cuchem/startdash.py @@ -168,9 +168,19 @@ def cache(self): from cuchemcommon.fingerprint import Embeddings prepocess_type = Embeddings + # TODO: when loading precomputed fingerprints, the radius and size should be specified + # For now, we are hard-coding this information: + nBits = 512 + radius = 2 chem_data = ChEmblData(fp_type=prepocess_type) + subdir = f'{args.cache_directory}/fp_r{radius}_n{nBits}' + if not os.path.isdir(subdir): + os.mkdir(subdir) + + logging.info(f'client: saving fingerprints') + # This will trigger a reread if fingerprints are not found in the cache directory! 
         chem_data.save_fingerprints(
-            os.path.join(args.cache_directory, FINGER_PRINT_FILES), num_recs = args.n_mol,
+            os.path.join(subdir, FINGER_PRINT_FILES), num_recs = args.n_mol,
             batch_size=args.batch_size)
 
         logger.info('Fingerprint generated in (hh:mm:ss.ms) {}'.format(
diff --git a/megamolbart/megamolbart/inference.py b/megamolbart/megamolbart/inference.py
index 7dab48e5..80c727da 100644
--- a/megamolbart/megamolbart/inference.py
+++ b/megamolbart/megamolbart/inference.py
@@ -236,6 +236,14 @@ def find_similars_smiles(self,
             dims.append(tuple(neighboring_embedding.shape))
             embeddings.append(neighboring_embedding.flatten().tolist())
 
+        # Rest of the applications and libraries use RAPIDS and cuPY libraries.
+        # For interoperability, we need to convert the embeddings to cupy.
+        embeddings = []
+        dims = []
+        for neighboring_embedding in neighboring_embeddings:
+            dims.append(tuple(neighboring_embedding.shape))
+            embeddings.append(neighboring_embedding.flatten().tolist())
+
         generated_df = pd.DataFrame({'SMILES': generated_mols,
                                      'embeddings': embeddings,
                                      'embeddings_dim': dims,
@@ -296,9 +304,9 @@ def interpolate_smiles(self,
             result_df.append(interp_df)
 
         result_df = pd.concat(result_df)
-        smile_list = list(result_df['SMILES'])
+        smiles_list = list(result_df['SMILES'])
 
-        return result_df, smile_list
+        return result_df, smiles_list
 
     def compute_unique_smiles(self,
                               interp_df,
diff --git a/megamolbart/megamolbart/service.py b/megamolbart/megamolbart/service.py
index eb16d786..db17fed9 100644
--- a/megamolbart/megamolbart/service.py
+++ b/megamolbart/megamolbart/service.py
@@ -24,9 +24,9 @@ def __init__(self, *args, **kwargs):
 
     # TODO how to handle length overrun for batch processing --> see also MegaMolBART.load_model in inference.py
     def SmilesToEmbedding(self, spec, context):
-        smile_str = ''.join(spec.smiles)
+        smiles_str = ''.join(spec.smiles)
 
-        embedding, pad_mask = self.megamolbart.smiles2embedding(smile_str,
+        embedding, pad_mask = self.megamolbart.smiles2embedding(smiles_str,
                                                                 pad_length=spec.padding)
         dim = embedding.shape
         embedding = embedding.flatten().tolist()
@@ -51,10 +51,10 @@ def EmbeddingToSmiles(self, embedding_spec, context):
 
     def FindSimilars(self, spec, context):
-        smile_str = ''.join(spec.smiles)
+        smiles_str = ''.join(spec.smiles)
 
         generated_df = self.megamolbart.find_similars_smiles(
-            smile_str,
+            smiles_str,
             num_requested=spec.numRequested,
             scaled_radius=spec.radius,
             sanitize=spec.sanitize,
diff --git a/megamolbart/scripts/download_model.sh b/megamolbart/scripts/download_model.sh
new file mode 100755
index 00000000..c77446f6
--- /dev/null
+++ b/megamolbart/scripts/download_model.sh
@@ -0,0 +1,14 @@
+MEGAMOLBART_MODEL_VERSION=0.1
+MEGAMOLBART_MODEL_PATH=/models/megamolbart
+
+DOWNLOAD_URL="https://api.ngc.nvidia.com/v2/models/nvidia/clara/megamolbart/versions/${MEGAMOLBART_MODEL_VERSION}/zip"
+echo -e "${YELLOW}Downloading model megamolbart to ${MEGAMOLBART_MODEL_PATH}...${RESET}"
+
+mkdir -p ${MEGAMOLBART_MODEL_PATH}
+set -x
+wget -q --show-progress \
+    --content-disposition ${DOWNLOAD_URL} \
+    -O ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip
+mkdir -p ${MEGAMOLBART_MODEL_PATH}
+unzip -q ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip \
+    -d ${MEGAMOLBART_MODEL_PATH}
diff --git a/megamolbart/tests/test_grpc.py b/megamolbart/tests/test_grpc.py
index 4c58e44a..9c7f71d8 100644
--- a/megamolbart/tests/test_grpc.py
+++ b/megamolbart/tests/test_grpc.py
@@ -10,6 +10,7 @@
 from megamolbart.service import GenerativeSampler
 import generativesampler_pb2
 import generativesampler_pb2_grpc
+from util import (DEFAULT_NUM_LAYERS, DEFAULT_D_MODEL, DEFAULT_NUM_HEADS, CHECKPOINTS_DIR)
 
 logger = logging.getLogger(__name__)
 
@@ -29,6 +30,14 @@ def similarity(add_server_method, service_cls, stub_cls):
     finally:
         server.stop(None)
 
+def test_fetch_iterations():
+    sys.argv = [sys.argv[0]]
+    with similarity(generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server,
+                    GenerativeSampler,
generativesampler_pb2_grpc.GenerativeSamplerStub) as stub: + + result = stub.GetIteration(generativesampler_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + def test_dataframe_similar(): sys.argv = [sys.argv[0]] diff --git a/setup/ansible-nvidia-driver.yml b/setup/ansible-nvidia-driver.yml index b4aba4a3..6ea2074f 100644 --- a/setup/ansible-nvidia-driver.yml +++ b/setup/ansible-nvidia-driver.yml @@ -5,12 +5,12 @@ tasks: - name: Add CUDA apt-key apt_key: - url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub + url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub state: present - name: Add CUDA apt repository apt_repository: - repo: 'deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /' + repo: 'deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /' state: present filename: nvidia update_cache: yes @@ -27,9 +27,9 @@ state: present update_cache: yes with_items: - - 'deb https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/amd64/ /' - - 'deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu18.04/amd64/ /' - - 'deb https://nvidia.github.io/nvidia-docker/ubuntu18.04/amd64/ /' + - 'deb https://nvidia.github.io/libnvidia-container/stable/ubuntu20.04/amd64/ /' + - 'deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu20.04/amd64/ /' + - 'deb https://nvidia.github.io/nvidia-docker/ubuntu20.04/amd64/ /' register: nvidia_container_runtime_apt_repo - name: Install Nvidia Driver diff --git a/setup/env.sh b/setup/env.sh index a4bda685..aa45ff0d 100644 --- a/setup/env.sh +++ b/setup/env.sh @@ -212,4 +212,4 @@ download_model() { ln -nsf ${MODEL_PATH}/megamolbart_v${MEGAMOLBART_MODEL_VERSION} ${MODEL_PATH}/megamolbart set +e -} +} \ No newline at end of file diff --git a/setup/launch b/setup/launch index 2da4998b..c6a89650 100755 --- a/setup/launch +++ b/setup/launch @@ -79,7 +79,6 @@ start() { validate_docker 
# run a container and start dash inside container. echo "${CUCHEM_CONT} ${MEGAMOLBART_CONT}" - export ADDITIONAL_PARAM="$@" download_model @@ -102,7 +101,7 @@ start() { stop() { - docker-compose --env-file .env \ + ./docker-compose --env-file .env \ -f docker_compose.yml \ --project-directory . \ down