Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
0d524e4
Changes to improve runtime performance of benchmark tests.
Aug 27, 2021
92e4625
Bug fix after cherry-picking commit meant to upgrade to RAPIDS 21.08...
Sep 4, 2021
4996a46
Change default container version.
Sep 7, 2021
98f53f1
Update README.
Sep 7, 2021
e7592db
Update README
Sep 7, 2021
87d5c5f
Fix to allow fraction for radius scale from UI.
rilango Oct 7, 2021
6fa7a8f
Remove the need for NGC cli.
rilango Oct 15, 2021
67db0ad
Remove the need to login to ngc to download containers.
rilango Oct 19, 2021
3297c54
Upgrade dask and distributed python module to address security issue …
rilango Oct 28, 2021
d5f26ee
Remove docker-compose from pre-req.
rilango Nov 11, 2021
ca0918c
Get megamolbart url from env vars
dorukozturk Jan 5, 2022
eb6f922
Fix vocab file issue when starting container standalone.
Jan 26, 2022
84b58cc
Change to download model when setup is not executed
Jan 26, 2022
baab028
changes to UI and backend support
Jan 30, 2022
fd10005
most features working
Feb 3, 2022
55619b2
ready for merge
Feb 4, 2022
3349b03
merging with dev
Feb 11, 2022
3e32715
minor
Feb 24, 2022
e997abe
Merge branch 'dev' into mpvenkatesh/az
Feb 24, 2022
d167e8a
minor
Feb 24, 2022
408cc13
debugging
Feb 25, 2022
dd6a7b4
fixing
Feb 25, 2022
6a2da06
fixing
Feb 26, 2022
a6b7656
debugging
Feb 26, 2022
881c9a9
fixed
Feb 27, 2022
b318c45
cleaned up
Feb 28, 2022
a785c00
removed test
Feb 28, 2022
d32e949
minor
Feb 28, 2022
dffc131
merged with dev
Mar 23, 2022
13e247e
merged with dev
Mar 24, 2022
2955cd6
merged with dev
Mar 24, 2022
6ddb02b
merged with dev
Mar 24, 2022
7161687
ready for merge with dev
Mar 24, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions Dockerfile.cuchem
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ RUN cd /opt/nvidia/cheminfomatics/common; \
RUN cd /opt/nvidia/cheminfomatics/cuchem; \
pip install -r requirements.txt

RUN pip install torch==1.7.0+cu110 -f https://download.pytorch.org/whl/torch_stable.html

ENV UCX_LOG_LEVEL error
ENV PYTHONPATH ./common/generated:./common:./cuchem:

Expand Down
5 changes: 0 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,6 @@ Build your container:
./launch.sh build
```

Download the ChEMBL database (version 27):
```
./launch.sh dbSetup
```

Launch the interactive ChEMBL exploration tool:
```
./launch.sh start
Expand Down
10 changes: 5 additions & 5 deletions common/cuchemcommon/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,24 @@ def meta_df(self):
"""
return NotImplemented

def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None, radius=2, nBits=512):
    """
    Fetch molecular properties from database/cache into a dask array.

    Interface placeholder: this base version does no work and returns
    ``NotImplemented``; concrete DAOs are expected to override it.

    :param n_molecules: number of molecules requested.
    :param cache_directory: optional directory holding cached data.
    :param radius: fingerprint radius (default 2).
    :param nBits: fingerprint length in bits (default 512).
    :return: ``NotImplemented``.
    """
    return NotImplemented

def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512):
    """
    Fetch molecular properties from database for the given id. Id depends on
    the backend database. For chembl DB it should be molregid.

    Interface placeholder: this base version does no work and returns
    ``NotImplemented``; concrete DAOs are expected to override it.

    :param molecule_id: list of backend-specific molecule ids.
    :param radius: fingerprint radius (default 2).
    :param nBits: fingerprint length in bits (default 512).
    :return: ``NotImplemented``.
    """
    return NotImplemented

def fetch_id_from_smile(self, new_molecules: List):
    """
    Fetch molecular details for a list of molecules. The values in the list
    of molecules depends on database/service used. For e.g. it could be
    ChemblId or molreg_id for Chembl database.

    Interface placeholder: this base version does no work and returns
    ``NotImplemented``; concrete DAOs are expected to override it.

    :param new_molecules: list of molecule identifiers to look up.
    :return: ``NotImplemented``.
    """
    return NotImplemented

Expand All @@ -40,6 +40,6 @@ def fetch_id_from_chembl(self, id: List):
"""
Fetch molecular details for a list of molecules. The values in the list
of molecules depends on database/service used. For e.g. it could be
ChemblId or molreg_id for Chemble database.
ChemblId or molreg_id for Chembl database.
"""
return NotImplemented
60 changes: 47 additions & 13 deletions common/cuchemcommon/data/cluster_wf.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import cudf
import dask
import dask_cudf
import sys
from cuchemcommon.context import Context
from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData
from cuchemcommon.utils.singleton import Singleton
Expand All @@ -19,38 +20,71 @@

class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton):

def __init__(self, fp_type, radius=2, nBits=512):
    """
    Create the ChEMBL-backed DAO.

    :param fp_type: fingerprint/transformation type, forwarded to ``ChEmblData``.
    :param radius: fingerprint radius remembered on the instance (default 2).
    :param nBits: fingerprint length in bits remembered on the instance
        (default 512).
    """
    # NOTE(review): the class is declared with metaclass=Singleton, so later
    # constructions with different radius/nBits presumably do not re-run this
    # initializer -- confirm against the Singleton implementation.
    logger.info(f'ChemblClusterWfDao({fp_type})')
    self.chem_data = ChEmblData(fp_type)
    self.radius = radius
    self.nBits = nBits

def meta_df(self):
    """
    Return the metadata dataframe produced by ``ChEmblData._meta_df()``.

    A ``ChEmblData`` object is obtained by calling the class with no
    arguments; ``self.chem_data`` is not used here.
    """
    return ChEmblData()._meta_df()

def fetch_molecular_embedding(self,
                              n_molecules: int,
                              cache_directory: str = None,
                              radius=2,
                              nBits=512):
    """
    Return a dask dataframe of molecule fingerprints, using an on-disk cache
    when one exists for the requested (radius, nBits) combination.

    :param n_molecules: number of molecules wanted. When reading from cache,
        a value <= 0 returns everything that was cached.
    :param cache_directory: root cache directory, or None/empty to disable
        caching entirely.
    :param radius: fingerprint radius (default 2).
    :param nBits: fingerprint length in bits (default 512).
    :return: dask dataframe read from the cache or freshly computed.
    """
    # Since we allow the user to change the fingerprint radius and length (nBits),
    # the fingerprints need to be cached in separate subdirectories.
    # Note: the precomputed ones are not presumed to be of a specific radius or length
    context = Context()
    if cache_directory:
        # Fixed: the original referenced an undefined name `cache_dir` here.
        cache_subdir = os.path.join(cache_directory, f'fp_r{radius}_n{nBits}')
        hdf_path = os.path.join(cache_subdir, FINGER_PRINT_FILES)
    else:
        cache_subdir = None
        hdf_path = None

    if cache_subdir is not None and os.path.isdir(cache_subdir):
        # Cache hit: a subdirectory for this (radius, nBits) pair already exists.
        # NOTE(review): self.radius / self.nBits are not refreshed on this
        # path -- confirm whether they should be.
        logger.info('Reading %d rows from %s...', n_molecules, hdf_path)
        mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints')

        if len(mol_df) == 0:
            logger.info(f'Zero molecules found in {hdf_path}! Caching error?')
        if n_molecules > 0:
            npartitions = math.ceil(n_molecules / BATCH_SIZE)
            mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions)
    else:
        # Cache miss (or caching disabled): compute from the database.
        self.radius = radius
        self.nBits = nBits
        logger.info(f'Reading molecules from database and computing fingerprints (radius={self.radius}, nBits={self.nBits})...')
        sys.stdout.flush()
        mol_df = self.chem_data.fetch_mol_embedding(
            num_recs=n_molecules,
            batch_size=context.batch_size,
            radius=radius,
            nBits=nBits
        )
        if cache_directory:
            # makedirs also creates a missing parent cache directory and does
            # not fail if another process created the subdirectory first.
            os.makedirs(cache_subdir, exist_ok=True)
            logger.info(f'Caching mol_df fingerprints to {hdf_path}')
            mol_df.to_hdf(hdf_path, 'fingerprints')
        else:
            # Fixed: use the module logger rather than the root `logging` module.
            logger.info(f'cache_directory={cache_directory}, not caching!')
        sys.stdout.flush()
    return mol_df

def fetch_molecular_embedding_by_id(self, molecule_id: List):
def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512):
context = Context()
meta = self.chem_data._meta_df()
fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id,
batch_size=context.batch_size) \
.astype(meta.dtypes)
meta = self.chem_data._meta_df(
f'fetch_molecular_embedding_by_id({molecule_id}): MISMATCH!!! radius: {radius} != {self.radius}, nBits: {nBits} != {self.nBits}')
if (self.radius != radius) or (self.nBits != nBits):
logger.info('Something broken?')
fp_df = self.chem_data._fetch_mol_embedding(
molregnos=molecule_id,
batch_size=context.batch_size,
radius=radius,
nBits=nBits
).astype(meta.dtypes)

fp_df = cudf.from_pandas(fp_df)
fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index()
Expand Down
66 changes: 38 additions & 28 deletions common/cuchemcommon/data/helper/chembldata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@
import pandas
import sqlite3
import logging

import sys
from typing import List
from dask import delayed, dataframe

import dask
from contextlib import closing
from cuchemcommon.utils.singleton import Singleton
from cuchemcommon.context import Context
Expand Down Expand Up @@ -70,7 +69,7 @@ def fetch_props_by_molregno(self, molregnos):
cols = list(map(lambda x: x[0], cur.description))
return cols, cur.fetchall()

def fetch_props_by_chemble(self, chemble_ids):
def fetch_props_by_chembl(self, chembl_ids):
"""
Returns compound properties and structure filtered by ChEMBL IDs along
with a list of columns.
Expand All @@ -84,7 +83,7 @@ def fetch_props_by_chemble(self, chemble_ids):
"""
with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \
closing(con.cursor()) as cur:
select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chemble_ids])
select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chembl_ids])
cur.execute(select_stmt)

cols = list(map(lambda x: x[0], cur.description))
Expand Down Expand Up @@ -148,13 +147,18 @@ def fetch_molecule_cnt(self):

return cur.fetchone()[0]

def _meta_df(self, columns=None, **transformation_kwargs):
    """
    Build an empty pandas DataFrame describing the columns/dtypes of the
    embedding dataframe (used as ``meta`` for dask).

    Columns, in order of insertion:
      * ``id`` (int64)
      * the names in ``IMP_PROPS + ADDITIONAL_FEILD`` paired with
        ``IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE``
      * integer-named float32 columns ``0 .. len(transformation) - 1``
      * one uint64 column for every string in ``columns`` starting with 'fp'

    :param columns: optional iterable of column names; only string entries
        with the 'fp' prefix are used. ``None`` (default) adds none.
    :param transformation_kwargs: forwarded to ``self.fp_type(...)``.
    :return: empty ``pandas.DataFrame`` with the columns above.
    """
    # None sentinel instead of a mutable `[]` default; an explicit `is None`
    # test is used because callers may pass a pandas Index, whose truth
    # value is ambiguous.
    if columns is None:
        columns = ()

    transformation = self.fp_type(**transformation_kwargs)

    prop_meta = {'id': pandas.Series([], dtype='int64')}
    prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD,
                              IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE)))
    prop_meta.update(
        {i: pandas.Series([], dtype='float32') for i in range(len(transformation))})
    # New columns containing the fingerprint as uint64s:
    for column in columns:
        if isinstance(column, str) and column.startswith('fp'):
            prop_meta[column] = pandas.Series([], dtype='uint64')

    return pandas.DataFrame(prop_meta)

Expand All @@ -167,7 +171,7 @@ def _fetch_mol_embedding(self,
Returns compound properties and structure for the first N number of
records in a dataframe.
"""

# TODO: loading compounds from the database and computing fingerprints need to be separated
logger.debug('Fetching %d records starting %d...' % (batch_size, start))

imp_cols = ['cp.' + col for col in IMP_PROPS]
Expand All @@ -194,32 +198,38 @@ def _fetch_mol_embedding(self,
LIMIT %d, %d
''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size)

df = pandas.read_sql(select_stmt,
sqlite3.connect(self.chembl_db, uri=True))
df = pandas.read_sql(
select_stmt,
sqlite3.connect(self.chembl_db, uri=True))

# Smiles -> Smiles transformation and filtering
# TODO: Discuss internally to find use or refactor this code to remove
# model specific filtering
df['transformed_smiles'] = df['canonical_smiles']
# if smiles_transforms is not None:
# if len(smiles_transforms) > 0:
# for xf in smiles_transforms:
# df['transformed_smiles'] = df['transformed_smiles'].map(xf.transform)
# df.dropna(subset=['transformed_smiles'], axis=0, inplace=True)

# Conversion to fingerprints or embeddings
# transformed_smiles = df['transformed_smiles']
transformation = self.fp_type(**transformation_kwargs)
cache_data = transformation.transform(df)
return_df = pandas.DataFrame(cache_data)

# This is where the int64 fingerprint columns are computed:
cache_data, raw_fp_list = transformation.transform(
df,
return_fp=True
)
return_df = pandas.DataFrame(cache_data)
return_df = pandas.DataFrame(
return_df,
columns=pandas.RangeIndex(start=0,
stop=len(transformation))).astype('float32')

return_df = df.merge(return_df, left_index=True, right_index=True)
# TODO: expect to run into the issue that the fingerprint cannot be a cudf column
# TODO: compute here so that chemvisualize does not have to
# The computed fingerprint columns are inserted into the df with the 'fp' prefix (to
# distinguish from PCA columns that are also numeric)
for i, fp_col in enumerate(raw_fp_list):
return_df[f'fp{i}'] = fp_col
return_df.rename(columns={'molregno': 'id'}, inplace=True)

return return_df

def fetch_mol_embedding(self,
Expand All @@ -231,8 +241,6 @@ def fetch_mol_embedding(self,
Returns compound properties and structure for the first N number of
records in a dataframe.
"""
logger.debug('Fetching properties for all molecules...')

if num_recs is None or num_recs < 0:
num_recs = self.fetch_molecule_cnt()

Expand All @@ -242,24 +250,26 @@ def fetch_mol_embedding(self,
dls = []
for start in range(0, num_recs, batch_size):
bsize = min(num_recs - start, batch_size)
dl_data = delayed(self._fetch_mol_embedding)(start=start,
batch_size=bsize,
molregnos=molregnos,
**transformation_kwargs)
dl_data = dask.delayed(self._fetch_mol_embedding)(
start=start,
batch_size=bsize,
molregnos=molregnos,
**transformation_kwargs
)
dls.append(dl_data)
meta_df = self._meta_df(
columns=dls[0].columns.compute(), **transformation_kwargs)

return dataframe.from_delayed(dls, meta=meta_df)
return dask.dataframe.from_delayed(dls, meta=meta_df)

def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None, batch_size=5000):
    """
    Generate fingerprints for ChEMBL molecules and store them in HDF.

    :param hdf_path: destination passed to ``to_hdf``.
    :param num_recs: number of records, forwarded to ``fetch_mol_embedding``.
    :param batch_size: batch size, forwarded to ``fetch_mol_embedding``.
    """
    logger.debug('Fetching molecules from database for fingerprints...')

    fingerprint_df = self.fetch_mol_embedding(num_recs=num_recs,
                                              batch_size=batch_size)
    logger.info(f'save_fingerprints writing {type(fingerprint_df)} to {hdf_path}')
    # Stored under the 'fingerprints' key.
    fingerprint_df.to_hdf(hdf_path, 'fingerprints')


def is_valid_chemble_smiles(self, smiles, con=None):

if con is None:
Expand Down
61 changes: 59 additions & 2 deletions common/cuchemcommon/fingerprint.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from math import ceil

INTEGER_NBITS = 64 # Maximum number of bits in an integer column in a cudf Series

logger = logging.getLogger(__name__)

Expand All @@ -25,7 +27,7 @@ def __init__(self, **kwargs):
self.kwargs = None
self.func = None

def transform(self, data, smiles_column='transformed_smiles'):
    """
    Placeholder for transforming ``data``; subclasses are expected to override.

    :param data: input container holding the SMILES strings.
    :param smiles_column: name of the column in ``data`` that holds the
        SMILES strings (default 'transformed_smiles').
    :return: ``NotImplemented`` in this base version.
    """
    return NotImplemented

def transform_many(self, data):
Expand Down Expand Up @@ -56,14 +58,69 @@ def transform_single(self, smiles):
fp = cupy.asarray(fp)
return fp

def transform_new(self, data, col_name='transformed_smiles', return_fp=False, raw=False):
    """
    Single threaded processing of list.

    Kept for backward compatibility; delegates to ``transform``. The earlier
    body called ``.ToBitString()`` on the value returned by
    ``transform_single``, which is a cupy array rather than an RDKit bit
    vector, so it could not work.

    :param data: container indexed by ``col_name`` to obtain SMILES strings.
    :param col_name: column holding the SMILES strings.
    :param return_fp: when True, a second value is returned (see ``raw``).
    :param raw: selects the form of the second value; forwarded unchanged.
    :return: whatever ``self.transform`` returns for the same arguments.
    """
    return self.transform(data,
                          smiles_column=col_name,
                          return_fp=return_fp,
                          raw=raw)

def transform(
    self,
    data,
    smiles_column = 'transformed_smiles',
    return_fp = False,
    raw = False
):
    """
    Compute fingerprints for every SMILES string in ``data[smiles_column]``.

    A SMILES string that RDKit cannot parse gets an all-zero bit string
    (and ``None`` as its raw fingerprint) instead of raising.

    :param data: container indexed by ``smiles_column``.
    :param smiles_column: key/column holding the SMILES strings.
    :param return_fp: when True, a second value is returned; its form is
        selected by ``raw``.
    :param raw: with ``return_fp``: True returns the list of fingerprint
        objects produced by ``self.func`` (``None`` for unparsable SMILES);
        False returns a uint64 numpy array with one row per
        ``INTEGER_NBITS``-wide chunk and one entry per molecule.
    :return: stacked cupy array of per-molecule bit arrays, optionally
        followed by the second value described above.
    """
    smiles_values = data[smiles_column]
    n_bits = self.kwargs['nBits']
    chunk_starts = range(0, n_bits, INTEGER_NBITS)
    self.n_fp_integers = ceil(n_bits / INTEGER_NBITS)

    # Either a flat list of raw fingerprints, or one list per integer chunk.
    extra = [] if raw else [[] for _ in chunk_starts]
    bit_rows = []

    for smiles in smiles_values:
        mol = Chem.MolFromSmiles(smiles)
        if mol:
            fingerprint = self.func(mol, **self.kwargs)
            bit_string = fingerprint.ToBitString()
        else:
            fingerprint = None
            bit_string = '0' * n_bits
        # '0'/'1' characters -> 0/1 bytes.
        bit_rows.append(
            cupy.asarray(np.frombuffer(bit_string.encode(), 'u1') - ord('0')))

        if not return_fp:
            continue
        if raw:
            extra.append(fingerprint)
            continue
        # Pack each INTEGER_NBITS-wide slice of the bit string into one int.
        for chunk_index, begin in enumerate(chunk_starts):
            extra[chunk_index].append(
                int(bit_string[begin: begin + INTEGER_NBITS], 2))

    stacked = cupy.stack(bit_rows)
    # TODO: return value parameter names should be self-explanatory
    if not return_fp:
        return stacked
    if raw:
        return stacked, extra
    return stacked, np.asarray(extra, dtype=np.uint64)

def __len__(self):
Expand Down
Loading