NVIDIA · mpvenkatesh · Aug 27, 2021 · Sep 4, 2021 · Sep 7, 2021 · Sep 7, 2021
diff --git a/Dockerfile.cuchem b/Dockerfile.cuchem
@@ -22,6 +22,8 @@ RUN cd /opt/nvidia/cheminfomatics/common; \
 RUN cd /opt/nvidia/cheminfomatics/cuchem; \
     pip install -r requirements.txt
 
+RUN pip install torch==1.7.0+cu110 -f https://download.pytorch.org/whl/torch_stable.html
+
 ENV UCX_LOG_LEVEL error
 ENV PYTHONPATH ./common/generated:./common:./cuchem:
 

diff --git a/README.md b/README.md
@@ -50,11 +50,6 @@ Build your container:
 ./launch.sh build
 ```
 
-Download the ChEMBL database (version 27):
-```
-./launch.sh dbSetup
-```
-
 Launch the interactive ChEMBL exploration tool:
 ```
 ./launch.sh start

diff --git a/common/cuchemcommon/data/__init__.py b/common/cuchemcommon/data/__init__.py
@@ -12,24 +12,24 @@ def meta_df(self):
         """
         return NotImplemented
 
-    def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None):
+    def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None, radius=2, nBits=512):
         """
         Fetch molecular properties from database/cache into a dask array.
         """
         return NotImplemented
 
-    def fetch_molecular_embedding_by_id(self, molecule_id: List):
+    def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512):
         """
         Fetch molecular properties from database for the given id. Id depends on
-        the backend databse. For chemble DB it should be molregid.
+        the backend database. For chembl DB it should be molregid.
         """
         return NotImplemented
 
     def fetch_id_from_smile(self, new_molecules: List):
         """
         Fetch molecular details for a list of molecules. The values in the list
         of molecules depends on database/service used. For e.g. it could be
-        ChemblId or molreg_id for Chemble database.
+        ChemblId or molreg_id for Chembl database.
         """
         return NotImplemented
 
@@ -40,6 +40,6 @@ def fetch_id_from_chembl(self, id: List):
         """
         Fetch molecular details for a list of molecules. The values in the list
         of molecules depends on database/service used. For e.g. it could be
-        ChemblId or molreg_id for Chemble database.
+        ChemblId or molreg_id for Chembl database.
         """
         return NotImplemented
diff --git a/common/cuchemcommon/data/cluster_wf.py b/common/cuchemcommon/data/cluster_wf.py
@@ -6,6 +6,7 @@
 import cudf
 import dask
 import dask_cudf
+import sys
 from cuchemcommon.context import Context
 from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData
 from cuchemcommon.utils.singleton import Singleton
@@ -19,38 +20,71 @@
 
 class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton):
 
-    def __init__(self, fp_type):
+    def __init__(self, fp_type, radius=2, nBits=512):
+        logger.info(f'ChemblClusterWfDao({fp_type})')
         self.chem_data = ChEmblData(fp_type)
+        self.radius = radius
+        self.nBits = nBits
 
     def meta_df(self):
         chem_data = ChEmblData()
         return chem_data._meta_df()
 
     def fetch_molecular_embedding(self,
                                   n_molecules: int,
-                                  cache_directory: str = None):
+                                  cache_directory: str = None,
+                                  radius=2,
+                                  nBits=512):
+        # The fingerprint radius and length (nBits) are caller-selectable, so each
+        # (radius, nBits) combination is cached in its own subdirectory of cache_directory.
+        # Note: the precomputed ones are not presumed to be of a specific radius or length
         context = Context()
         if cache_directory:
-            hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES)
+            cache_subdir = os.path.join(cache_directory, f'fp_r{radius}_n{nBits}')
+            hdf_path = os.path.join(cache_subdir, FINGER_PRINT_FILES)
+        else:
+            cache_subdir = None
+            hdf_path = None
+        if cache_directory and os.path.isdir(cache_subdir):  # cache hit: the subdirectory already exists
             logger.info('Reading %d rows from %s...', n_molecules, hdf_path)
             mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints')
-
+            if len(mol_df) == 0:
+                logger.info(f'Zero molecules found in {hdf_path}! Caching error?')
             if n_molecules > 0:
                 npartitions = math.ceil(n_molecules / BATCH_SIZE)
                 mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions)
         else:
-            logger.info('Reading molecules from database...')
-            mol_df = self.chem_data.fetch_mol_embedding(num_recs=n_molecules,
-                                                        batch_size=context.batch_size)
-
+            self.radius = radius
+            self.nBits = nBits
+            logger.info(f'Reading molecules from database and computing fingerprints (radius={self.radius}, nBits={self.nBits})...')
+            sys.stdout.flush()
+            mol_df = self.chem_data.fetch_mol_embedding(
+                num_recs=n_molecules,
+                batch_size=context.batch_size,
+                radius=radius,
+                nBits=nBits
+            )
+            if cache_directory:
+                os.makedirs(cache_subdir, exist_ok=True)  # also creates a missing cache_directory
+                logger.info(f'Caching mol_df fingerprints to {hdf_path}')
+                mol_df.to_hdf(hdf_path, 'fingerprints')
+            else:
+                logger.info(f'cache_directory={cache_directory}, not caching!')
+        sys.stdout.flush()
         return mol_df
 
-    def fetch_molecular_embedding_by_id(self, molecule_id: List):
+    def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512):
         context = Context()
-        meta = self.chem_data._meta_df()
-        fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id,
-                                                    batch_size=context.batch_size) \
-            .astype(meta.dtypes)
+        meta = self.chem_data._meta_df()
+        if (self.radius != radius) or (self.nBits != nBits):
+            logger.warning(  # requested settings differ from the radius/nBits stored on this DAO
+                f'fetch_molecular_embedding_by_id({molecule_id}): MISMATCH!!! radius: {radius} != {self.radius}, nBits: {nBits} != {self.nBits}')
+        fp_df = self.chem_data._fetch_mol_embedding(
+            molregnos=molecule_id,
+            batch_size=context.batch_size,
+            radius=radius,
+            nBits=nBits
+        ).astype(meta.dtypes)
 
         fp_df = cudf.from_pandas(fp_df)
         fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index()

diff --git a/common/cuchemcommon/data/helper/chembldata.py b/common/cuchemcommon/data/helper/chembldata.py
@@ -3,10 +3,9 @@
 import pandas
 import sqlite3
 import logging
-
+import sys
 from typing import List
-from dask import delayed, dataframe
-
+import dask
 from contextlib import closing
 from cuchemcommon.utils.singleton import Singleton
 from cuchemcommon.context import Context
@@ -70,7 +69,7 @@ def fetch_props_by_molregno(self, molregnos):
             cols = list(map(lambda x: x[0], cur.description))
             return cols, cur.fetchall()
 
-    def fetch_props_by_chemble(self, chemble_ids):
+    def fetch_props_by_chembl(self, chembl_ids):
         """
         Returns compound properties and structure filtered by ChEMBL IDs along
         with a list of columns.
@@ -84,7 +83,7 @@ def fetch_props_by_chemble(self, chemble_ids):
             """
         with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \
                 closing(con.cursor()) as cur:
-            select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chemble_ids])
+            select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chembl_ids])
             cur.execute(select_stmt)
 
             cols = list(map(lambda x: x[0], cur.description))
@@ -148,13 +147,18 @@ def fetch_molecule_cnt(self):
 
             return cur.fetchone()[0]
 
-    def _meta_df(self, **transformation_kwargs):
+    def _meta_df(self, columns=(), **transformation_kwargs):
         transformation = self.fp_type(**transformation_kwargs)
 
         prop_meta = {'id': pandas.Series([], dtype='int64')}
         prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD,
                                   IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE)))
-        prop_meta.update({i: pandas.Series([], dtype='float32') for i in range(len(transformation))})
+        prop_meta.update(
+            {i: pandas.Series([], dtype='float32') for i in range(len(transformation))})
+        # Each entry of `columns` that is a string starting with 'fp' becomes an empty uint64 column:
+        for column in columns:
+            if isinstance(column, str) and column.startswith('fp'):
+                prop_meta[column] = pandas.Series([], dtype='uint64')
 
         return pandas.DataFrame(prop_meta)
 
@@ -167,7 +171,7 @@ def _fetch_mol_embedding(self,
         Returns compound properties and structure for the first N number of
         records in a dataframe.
         """
-
+        # TODO: loading compounds from the database and computing fingerprints need to be separated
         logger.debug('Fetching %d records starting %d...' % (batch_size, start))
 
         imp_cols = ['cp.' + col for col in IMP_PROPS]
@@ -194,32 +198,38 @@ def _fetch_mol_embedding(self,
                 LIMIT %d, %d
             ''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size)
 
-        df = pandas.read_sql(select_stmt,
-                             sqlite3.connect(self.chembl_db, uri=True))
+        df = pandas.read_sql(
+            select_stmt,
+            sqlite3.connect(self.chembl_db, uri=True))
 
         # Smiles -> Smiles transformation and filtering
         # TODO: Discuss internally to find use or refactor this code to remove
         # model specific filtering
         df['transformed_smiles'] = df['canonical_smiles']
-        # if smiles_transforms is not None:
-        #     if len(smiles_transforms) > 0:
-        #         for xf in smiles_transforms:
-        #             df['transformed_smiles'] = df['transformed_smiles'].map(xf.transform)
-        #             df.dropna(subset=['transformed_smiles'], axis=0, inplace=True)
 
         # Conversion to fingerprints or embeddings
-        # transformed_smiles = df['transformed_smiles']
         transformation = self.fp_type(**transformation_kwargs)
-        cache_data = transformation.transform(df)
-        return_df = pandas.DataFrame(cache_data)
 
+        # This is where the int64 fingerprint columns are computed:
+        cache_data, raw_fp_list = transformation.transform(
+            df, 
+            return_fp=True
+        )
+        return_df = pandas.DataFrame(cache_data)
         return_df = pandas.DataFrame(
             return_df,
             columns=pandas.RangeIndex(start=0,
                                       stop=len(transformation))).astype('float32')
 
         return_df = df.merge(return_df, left_index=True, right_index=True)
+        # TODO: expect to run into the issue that the fingerprint cannot be a cudf column
+        # TODO: compute here so that chemvisualize does not have to
+        # The computed fingerprint columns are inserted into the df with the 'fp' prefix (to
+        # distinguish from PCA columns that are also numeric)
+        for i, fp_col in enumerate(raw_fp_list):
+            return_df[f'fp{i}'] = fp_col
         return_df.rename(columns={'molregno': 'id'}, inplace=True)
+
         return return_df
 
     def fetch_mol_embedding(self,
@@ -231,8 +241,6 @@ def fetch_mol_embedding(self,
         Returns compound properties and structure for the first N number of
         records in a dataframe.
         """
-        logger.debug('Fetching properties for all molecules...')
-
         if num_recs is None or num_recs < 0:
             num_recs = self.fetch_molecule_cnt()
 
@@ -242,24 +250,26 @@ def fetch_mol_embedding(self,
         dls = []
         for start in range(0, num_recs, batch_size):
             bsize = min(num_recs - start, batch_size)
-            dl_data = delayed(self._fetch_mol_embedding)(start=start,
-                                                         batch_size=bsize,
-                                                         molregnos=molregnos,
-                                                         **transformation_kwargs)
+            dl_data = dask.delayed(self._fetch_mol_embedding)(
+                start=start,
+                batch_size=bsize,
+                molregnos=molregnos,
+                **transformation_kwargs
+            )
             dls.append(dl_data)
+        meta_df = self._meta_df(
+            columns=dls[0].columns.compute(), **transformation_kwargs)
 
-        return dataframe.from_delayed(dls, meta=meta_df)
+        return dask.dataframe.from_delayed(dls, meta=meta_df)
 
     def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None, batch_size=5000):
         """
         Generates fingerprints for all ChEMBL ID's in the database
         """
-        logger.debug('Fetching molecules from database for fingerprints...')
-
         mol_df = self.fetch_mol_embedding(num_recs=num_recs, batch_size=batch_size)
+        logger.info(f'save_fingerprints writing {type(mol_df)} to {hdf_path}')
         mol_df.to_hdf(hdf_path, 'fingerprints')
 
-
     def is_valid_chemble_smiles(self, smiles, con=None):
 
         if con is None:

diff --git a/common/cuchemcommon/fingerprint.py b/common/cuchemcommon/fingerprint.py
@@ -6,7 +6,9 @@
 import numpy as np
 from rdkit import Chem
 from rdkit.Chem import AllChem
+from math import ceil
 
+INTEGER_NBITS = 64 # Maximum number of bits in an integer column in a cudf Series
 
 logger = logging.getLogger(__name__)
 
@@ -25,7 +27,7 @@ def __init__(self, **kwargs):
         self.kwargs = None
         self.func = None
 
-    def transform(self, data):
+    def transform(self, data, smiles_column = 'transformed_smiles'):
         return NotImplemented
 
     def transform_many(self, data):
@@ -56,14 +58,69 @@ def transform_single(self, smiles):
         fp = cupy.asarray(fp)
         return fp
 
-    def transform(self, data, col_name='transformed_smiles'):
+    def transform_new(self, data, col_name='transformed_smiles', return_fp=False, raw=False):
         """Single threaded processing of list"""
         data = data[col_name]
         fp_array = []
+        self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS)
+        if raw:
+            raw_fp_array = []
+        else:
+            raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)]
         for smiles in data:
             fp = self.transform_single(smiles)
             fp_array.append(fp)
+            if return_fp:
+                if raw:
+                    raw_fp_array.append(fp)
+                else:
+                    fp_bs = ''.join(str(int(bit)) for bit in fp.tolist())  # transform_single returns a cupy array (no ToBitString()); NOTE(review): assumes a 1-D 0/1 bit vector - confirm
+                    for i in range(0, self.kwargs['nBits'], INTEGER_NBITS):
+                        raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2))
+        fp_array = cupy.stack(fp_array)
+        if return_fp:
+            if raw:
+                return fp_array, raw_fp_array
+            else:
+                return fp_array, np.asarray(raw_fp_array, dtype=np.uint64)
+        return fp_array
+
+    def transform(
+        self,
+        data,
+        smiles_column = 'transformed_smiles',
+        return_fp = False, # When True, a second value is returned; its form is selected by the raw parameter
+        raw = False # raw=True: list of per-molecule fingerprint objects (None where the SMILES did not parse); raw=False: uint64 fingerprint columns
+    ):
+        data = data[smiles_column]
+        fp_array = []
+        self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS)  # number of INTEGER_NBITS-wide integers needed for nBits
+        if raw:
+            raw_fp_array = []
+        else:
+            raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)]  # one list per INTEGER_NBITS-wide slice
+        for mol_smiles in data:
+            m = Chem.MolFromSmiles(mol_smiles)
+            if not m:
+                fp = None
+                fp_bs = '0' * self.kwargs['nBits']  # all-zero bit string stands in for a molecule that could not be built
+            else:
+                fp = self.func(m, **self.kwargs)
+                fp_bs = fp.ToBitString()
+            fp_array.append(cupy.asarray(np.frombuffer(fp_bs.encode(), 'u1') - ord('0')))  # '0'/'1' characters -> array of 0/1 bytes
+            if return_fp:
+                if raw:
+                    raw_fp_array.append(fp)
+                else:
+                    for i in range(0, self.kwargs['nBits'], INTEGER_NBITS):
+                        raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2))  # parse each slice as a base-2 integer
         fp_array = cupy.stack(fp_array)
+        # TODO: return value parameter names should be self-explanatory
+        if return_fp:
+            if raw:
+                return fp_array, raw_fp_array
+            else:
+                return fp_array, np.asarray(raw_fp_array, dtype=np.uint64)
         return fp_array
 
     def __len__(self):