diff --git a/Dockerfile.cuchem b/Dockerfile.cuchem index dd210672..08e15c49 100644 --- a/Dockerfile.cuchem +++ b/Dockerfile.cuchem @@ -22,6 +22,8 @@ RUN cd /opt/nvidia/cheminfomatics/common; \ RUN cd /opt/nvidia/cheminfomatics/cuchem; \ pip install -r requirements.txt +RUN pip install torch==1.7.0+cu110 -f https://download.pytorch.org/whl/torch_stable.html + ENV UCX_LOG_LEVEL error ENV PYTHONPATH ./common/generated:./common:./cuchem: diff --git a/README.md b/README.md index fa1f86de..fa785866 100644 --- a/README.md +++ b/README.md @@ -50,11 +50,6 @@ Build your container: ./launch.sh build ``` -Download the ChEMBL database (version 27): -``` -./launch.sh dbSetup -``` - Launch the interactive ChEMBL exploration tool: ``` ./launch.sh start diff --git a/common/cuchemcommon/data/__init__.py b/common/cuchemcommon/data/__init__.py index 3a07d30c..403dc91c 100644 --- a/common/cuchemcommon/data/__init__.py +++ b/common/cuchemcommon/data/__init__.py @@ -12,16 +12,16 @@ def meta_df(self): """ return NotImplemented - def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None): + def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None, radius = 2, nBits = 512): """ Fetch molecular properties from database/cache into a dask array. """ return NotImplemented - def fetch_molecular_embedding_by_id(self, molecule_id: List): + def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512): """ Fetch molecular properties from database for the given id. Id depends on - the backend databse. For chemble DB it should be molregid. + the backend databse. For chembl DB it should be molregid. """ return NotImplemented @@ -29,7 +29,7 @@ def fetch_id_from_smile(self, new_molecules: List): """ Fetch molecular details for a list of molecules. The values in the list of molecules depends on database/service used. For e.g. it could be - ChemblId or molreg_id for Chemble database. + ChemblId or molreg_id for Chembl database. 
""" return NotImplemented @@ -40,6 +40,6 @@ def fetch_id_from_chembl(self, id: List): """ Fetch molecular details for a list of molecules. The values in the list of molecules depends on database/service used. For e.g. it could be - ChemblId or molreg_id for Chemble database. + ChemblId or molreg_id for Chembl database. """ return NotImplemented diff --git a/common/cuchemcommon/data/cluster_wf.py b/common/cuchemcommon/data/cluster_wf.py index 6462d5fb..ca3f50bc 100644 --- a/common/cuchemcommon/data/cluster_wf.py +++ b/common/cuchemcommon/data/cluster_wf.py @@ -6,6 +6,7 @@ import cudf import dask import dask_cudf +import sys from cuchemcommon.context import Context from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData from cuchemcommon.utils.singleton import Singleton @@ -19,8 +20,11 @@ class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton): - def __init__(self, fp_type): + def __init__(self, fp_type, radius=2, nBits=512): + logger.info(f'ChemblClusterWfDao({fp_type})') self.chem_data = ChEmblData(fp_type) + self.radius = radius + self.nBits = nBits def meta_df(self): chem_data = ChEmblData() @@ -28,29 +32,59 @@ def meta_df(self): def fetch_molecular_embedding(self, n_molecules: int, - cache_directory: str = None): + cache_directory: str = None, + radius=2, + nBits=512): + # Since we allow the user to change the fingerprint radius and length (nBits), + # the fingerprints need to be cached in separate subdirectories. 
+ # Note: the precomputed ones are not presumed to be of a specific radius or length context = Context() if cache_directory: - hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES) + cache_subdir = f'{cache_directory}/fp_r{radius}_n{nBits}' + hdf_path = os.path.join(cache_subdir, FINGER_PRINT_FILES) + else: + cache_subdir = None + hdf_path = None + if cache_directory and os.path.isdir(cache_subdir): # and (self.radius == radius) and (self.nBits == nBits): logger.info('Reading %d rows from %s...', n_molecules, hdf_path) mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints') - + if len(mol_df) == 0: + logger.info(f'Zero molecules found in {hdf_path}! Caching error?') if n_molecules > 0: npartitions = math.ceil(n_molecules / BATCH_SIZE) mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions) else: - logger.info('Reading molecules from database...') - mol_df = self.chem_data.fetch_mol_embedding(num_recs=n_molecules, - batch_size=context.batch_size) - + self.radius = radius + self.nBits = nBits + logger.info(f'Reading molecules from database and computing fingerprints (radius={self.radius}, nBits={self.nBits})...') + sys.stdout.flush() + mol_df = self.chem_data.fetch_mol_embedding( + num_recs=n_molecules, + batch_size=context.batch_size, + radius=radius, + nBits=nBits + ) + if cache_directory: + os.mkdir(cache_subdir) + logger.info(f'Caching mol_df fingerprints to {hdf_path}') + mol_df.to_hdf(hdf_path, 'fingerprints') + else: + logging.info(f'cache_directory={cache_directory}, not caching!') + sys.stdout.flush() return mol_df - def fetch_molecular_embedding_by_id(self, molecule_id: List): + def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512): context = Context() - meta = self.chem_data._meta_df() - fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id, - batch_size=context.batch_size) \ - .astype(meta.dtypes) + meta = self.chem_data._meta_df() + if (self.radius != radius) or (self.nBits != nBits): + logger.info(f'fetch_molecular_embedding_by_id({molecule_id}): MISMATCH!!! '
+ f'radius: {radius} != {self.radius}, nBits: {nBits} != {self.nBits}') + fp_df = self.chem_data._fetch_mol_embedding( + molregnos=molecule_id, + batch_size=context.batch_size, + radius=radius, + nBits=nBits + ).astype(meta.dtypes) fp_df = cudf.from_pandas(fp_df) fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index() diff --git a/common/cuchemcommon/data/helper/chembldata.py b/common/cuchemcommon/data/helper/chembldata.py index 763d2ee6..7ccd7c58 100644 --- a/common/cuchemcommon/data/helper/chembldata.py +++ b/common/cuchemcommon/data/helper/chembldata.py @@ -3,10 +3,9 @@ import pandas import sqlite3 import logging - +import sys from typing import List -from dask import delayed, dataframe - +import dask from contextlib import closing from cuchemcommon.utils.singleton import Singleton from cuchemcommon.context import Context @@ -70,7 +69,7 @@ def fetch_props_by_molregno(self, molregnos): cols = list(map(lambda x: x[0], cur.description)) return cols, cur.fetchall() - def fetch_props_by_chemble(self, chemble_ids): + def fetch_props_by_chembl(self, chembl_ids): """ Returns compound properties and structure filtered by ChEMBL IDs along with a list of columns. 
@@ -84,7 +83,7 @@ def fetch_props_by_chemble(self, chemble_ids): """ with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \ closing(con.cursor()) as cur: - select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chemble_ids]) + select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chembl_ids]) cur.execute(select_stmt) cols = list(map(lambda x: x[0], cur.description)) @@ -148,13 +147,18 @@ def fetch_molecule_cnt(self): return cur.fetchone()[0] - def _meta_df(self, **transformation_kwargs): + def _meta_df(self, columns=[], **transformation_kwargs): transformation = self.fp_type(**transformation_kwargs) prop_meta = {'id': pandas.Series([], dtype='int64')} prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD, IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE))) - prop_meta.update({i: pandas.Series([], dtype='float32') for i in range(len(transformation))}) + prop_meta.update( + {i: pandas.Series([], dtype='float32') for i in range(len(transformation))}) + # New columns containing the fingerprint as uint64s: + for column in columns: + if isinstance(column, str) and column.startswith('fp'): + prop_meta.update({column: pandas.Series([], dtype='uint64')}) return pandas.DataFrame(prop_meta) @@ -167,7 +171,7 @@ def _fetch_mol_embedding(self, Returns compound properties and structure for the first N number of records in a dataframe. """ - + # TODO: loading compounds from the database and computing fingerprints need to be separated logger.debug('Fetching %d records starting %d...' % (batch_size, start)) imp_cols = ['cp.' 
+ col for col in IMP_PROPS] @@ -194,32 +198,38 @@ def _fetch_mol_embedding(self, LIMIT %d, %d ''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size) - df = pandas.read_sql(select_stmt, - sqlite3.connect(self.chembl_db, uri=True)) + df = pandas.read_sql( + select_stmt, + sqlite3.connect(self.chembl_db, uri=True)) # Smiles -> Smiles transformation and filtering # TODO: Discuss internally to find use or refactor this code to remove # model specific filtering df['transformed_smiles'] = df['canonical_smiles'] - # if smiles_transforms is not None: - # if len(smiles_transforms) > 0: - # for xf in smiles_transforms: - # df['transformed_smiles'] = df['transformed_smiles'].map(xf.transform) - # df.dropna(subset=['transformed_smiles'], axis=0, inplace=True) # Conversion to fingerprints or embeddings - # transformed_smiles = df['transformed_smiles'] transformation = self.fp_type(**transformation_kwargs) - cache_data = transformation.transform(df) - return_df = pandas.DataFrame(cache_data) + # This is where the int64 fingerprint columns are computed: + cache_data, raw_fp_list = transformation.transform( + df, + return_fp=True + ) + return_df = pandas.DataFrame(cache_data) return_df = pandas.DataFrame( return_df, columns=pandas.RangeIndex(start=0, stop=len(transformation))).astype('float32') return_df = df.merge(return_df, left_index=True, right_index=True) + # TODO: expect to run into the issue that the fingerprint cannot be a cudf column + # TODO: compute here so that chemvisualize does not have to + # The computed fingerprint columns are inserted into the df with the 'fp' prefix (to + # distinguish from PCA columns that are also numeric) + for i, fp_col in enumerate(raw_fp_list): + return_df[f'fp{i}'] = fp_col return_df.rename(columns={'molregno': 'id'}, inplace=True) + return return_df def fetch_mol_embedding(self, @@ -231,8 +241,6 @@ def fetch_mol_embedding(self, Returns compound properties and structure for the first N number of records in a 
dataframe. """ - logger.debug('Fetching properties for all molecules...') - if num_recs is None or num_recs < 0: num_recs = self.fetch_molecule_cnt() @@ -242,24 +250,28 @@ dls = [] for start in range(0, num_recs, batch_size): bsize = min(num_recs - start, batch_size) - dl_data = delayed(self._fetch_mol_embedding)(start=start, - batch_size=bsize, - molregnos=molregnos, - **transformation_kwargs) + dl_data = dask.delayed(self._fetch_mol_embedding)( + start=start, + batch_size=bsize, + molregnos=molregnos, + **transformation_kwargs + ) dls.append(dl_data) + meta_df = self._meta_df( + columns=dls[0].columns.compute(), **transformation_kwargs) - return dataframe.from_delayed(dls, meta=meta_df) + return dask.dataframe.from_delayed(dls, meta=meta_df) def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None, batch_size=5000): """ Generates fingerprints for all ChEMBL ID's in the database """ - logger.debug('Fetching molecules from database for fingerprints...') mol_df = self.fetch_mol_embedding(num_recs=num_recs, batch_size=batch_size) + logger.info(f'save_fingerprints writing {type(mol_df)} to {hdf_path}') mol_df.to_hdf(hdf_path, 'fingerprints') - def is_valid_chemble_smiles(self, smiles, con=None): + def is_valid_chembl_smiles(self, smiles, con=None): if con is None: diff --git a/common/cuchemcommon/fingerprint.py b/common/cuchemcommon/fingerprint.py index 860e7021..9b4799a1 100644 --- a/common/cuchemcommon/fingerprint.py +++ b/common/cuchemcommon/fingerprint.py @@ -6,7 +6,9 @@ import numpy as np from rdkit import Chem from rdkit.Chem import AllChem +from math import ceil +INTEGER_NBITS = 64 # Maximum number of bits in an integer column in a cudf Series logger = logging.getLogger(__name__) @@ -25,7 +27,7 @@ def __init__(self, **kwargs): self.kwargs = None self.func = None - def transform(self, data): + def transform(self, data, smiles_column = 'transformed_smiles'): return NotImplemented def transform_many(self, data): @@ -56,14 +58,69 @@ def transform_single(self, smiles): fp =
cupy.asarray(fp) return fp - def transform(self, data, col_name='transformed_smiles'): + def transform_new(self, data, col_name='transformed_smiles', return_fp=False, raw=False): """Single threaded processing of list""" data = data[col_name] fp_array = [] + self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS) + if raw: + raw_fp_array = [] + else: + raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)] for smiles in data: fp = self.transform_single(smiles) fp_array.append(fp) + fp_bs = fp.ToBitString() + if return_fp: + if raw: + raw_fp_array.append(fp) + else: + for i in range(0, self.kwargs['nBits'], INTEGER_NBITS): + raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2)) + fp_array = cupy.stack(fp_array) + if return_fp: + if raw: + return fp_array, raw_fp_array + else: + return fp_array, np.asarray(raw_fp_array, dtype=np.uint64) + return fp_array + + def transform( + self, + data, + smiles_column = 'transformed_smiles', + return_fp = False, # When set to True, an additional value is returned determined by the raw parameter + raw = False # The RDKit fingerprint object is returned when raw = True, and the int64 fingerprint columns are returned when raw = False + ): + data = data[smiles_column] + fp_array = [] + self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS) + if raw: + raw_fp_array = [] + else: + raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)] + for mol_smiles in data: + m = Chem.MolFromSmiles(mol_smiles) + if not m: + fp = None + fp_bs = '0' * self.kwargs['nBits'] + else: + fp = self.func(m, **self.kwargs) + fp_bs = fp.ToBitString() + fp_array.append(cupy.asarray(np.frombuffer(fp_bs.encode(), 'u1') - ord('0'))) + if return_fp: + if raw: + raw_fp_array.append(fp) + else: + for i in range(0, self.kwargs['nBits'], INTEGER_NBITS): + raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2)) fp_array = cupy.stack(fp_array) + # TODO: return value parameter 
names should be self-explanatory + if return_fp: + if raw: + return fp_array, raw_fp_array + else: + return fp_array, np.asarray(raw_fp_array, dtype=np.uint64) return fp_array def __len__(self): diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py index 065c3632..5a15fec0 100644 --- a/common/cuchemcommon/workflow.py +++ b/common/cuchemcommon/workflow.py @@ -1,3 +1,6 @@ +from functools import singledispatch +import pandas as pd +import numpy as np import logging from typing import List @@ -6,7 +9,6 @@ logger = logging.getLogger(__name__) - class BaseGenerativeWorkflow(BaseTransformation): def __init__(self, dao: GenerativeWfDao = None) -> None: @@ -62,7 +64,7 @@ def _compute_radius(self, scaled_radius): def interpolate_by_id(self, ids: List, - id_type: str = 'chembleid', + id_type: str = 'chemblid', num_points=10, force_unique=False, scaled_radius: int = 1, @@ -72,40 +74,84 @@ def interpolate_by_id(self, if not self.min_jitter_radius: raise Exception('Property `radius_scale` must be defined in model class.') - if id_type.lower() == 'chembleid': + if id_type.lower() == 'chemblid': smiles = [row[2] for row in self.dao.fetch_id_from_chembl(ids)] if len(smiles) != len(ids): raise Exception('One of the ids is invalid %s', ids) else: raise Exception('id type %s not supported' % id_type) - return self.interpolate_smiles(smiles, - num_points=num_points, - scaled_radius=scaled_radius, - force_unique=force_unique, - sanitize=sanitize) + return self.interpolate_smiles( + smiles, + compound_ids=ids, + num_points=num_points, + scaled_radius=scaled_radius, + force_unique=force_unique, + sanitize=sanitize + ) + + def extrapolate_from_cluster(self, + compounds_df, + compound_property: str, + cluster_id: int = 0, + n_compounds_to_transform=10, + num_points: int = 10, + step_size: float = 0.01, + force_unique = False, + scaled_radius: int = 1): + """ + The embedding vector is calculated for the specified cluster_id and applied over it. 
+ TO DO: We should have a table of direction vectors in embedded space listed, just like the list of compoun d IDs. + The user should choose one to be applied to the selected compounds, or to a cluster number. + """ + smiles_list = None + + if not self.radius_scale: + raise Exception('Property `radius_scale` must be defined in model class.') + else: + radius = float(scaled_radius * self.radius_scale) + # TO DO: User must be able to extrapolate directly from smiles in the table; + # these may themselves be generated compounds without any chemblid. + df_cluster = compounds_df[ compounds_df['cluster'] == cluster_id ].dropna().reset_index(drop=True).compute () + return self.extrapolate_from_smiles(df_cluster['transformed_smiles'].to_array(), + compound_property_vals=df_cluster[compound_property].to_array(), + num_points=num_points, + n_compounds_to_transform=n_compounds_to_transform, + step_size=step_size, + radius=scaled_radius, + force_unique=force_unique) + def find_similars_smiles_by_id(self, - chemble_id: str, - id_type: str = 'chembleid', + chembl_ids: List[str], # actually a list of strings + id_type: str = 'chemblid', num_requested=10, force_unique=False, scaled_radius: int = 1, sanitize=True): - smiles = None - + smiles_list = [] + if not self.min_jitter_radius: raise Exception('Property `radius_scale` must be defined in model class.') - if id_type.lower() == 'chembleid': - smiles = [row[2] for row in self.dao.fetch_id_from_chembl(chemble_id)] - if len(smiles) != len(chemble_id): - raise Exception('One of the ids is invalid %s' + chemble_id) + if id_type.lower() == 'chemblid': + smiles_list = [row[2] for row in self.dao.fetch_id_from_chembl(chembl_ids)] + if len(smiles_list) != len(chembl_ids): + raise Exception('One of the ids is invalid %s' + chembl_ids) else: raise Exception('id type %s not supported' % id_type) - return self.find_similars_smiles(smiles[0], - num_requested=num_requested, - scaled_radius=scaled_radius, - force_unique=force_unique, - 
sanitize=sanitize) + ret_vals = [ + self.find_similars_smiles( + smiles, + num_requested=num_requested, + scaled_radius=scaled_radius, + force_unique=force_unique, + compound_id=str(chembl_id), + sanitize=sanitize + ) + for smiles, chembl_id in zip(smiles_list, chembl_ids) + ] + if len(ret_vals) == 1: + return ret_vals[0] + return pd.concat(ret_vals, ignore_index=True, copy=False) diff --git a/cuchem/cuchem/decorator/__init__.py b/cuchem/cuchem/decorator/__init__.py index 37137aae..25f4354d 100644 --- a/cuchem/cuchem/decorator/__init__.py +++ b/cuchem/cuchem/decorator/__init__.py @@ -8,7 +8,7 @@ class BaseMolPropertyDecorator(object): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_cols: int = 0): NotImplemented diff --git a/cuchem/cuchem/decorator/lipinski.py b/cuchem/cuchem/decorator/lipinski.py index 67407fc8..6500e41c 100644 --- a/cuchem/cuchem/decorator/lipinski.py +++ b/cuchem/cuchem/decorator/lipinski.py @@ -21,7 +21,7 @@ class LipinskiRuleOfFiveDecorator(BaseMolPropertyDecorator): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_cols: int = 0): mol_wt = [] mol_logp = [] @@ -29,13 +29,16 @@ def decorate(self, hacceptors = [] rotatable_bonds = [] qeds = [] + invalid = [] for idx in range(df.shape[0]): - smiles = df.iat[idx, smile_cols] + smiles = df.iat[idx, smiles_cols] m = Chem.MolFromSmiles(smiles) if m is None: + logger.info(f'{idx}: Could not make a Mol from {smiles}') + invalid.append(True) mol_logp.append({'value': '-', 'level': 'info'}) mol_wt.append({'value': '-', 'level': 'info'}) hdonors.append({'value': '-', 'level': 'info'}) @@ -43,7 +46,8 @@ def decorate(self, rotatable_bonds.append({'value': '-', 'level': 'info'}) qeds.append({'value': '-', 'level': 'info'}) continue - + else: + invalid.append(False) try: logp = Descriptors.MolLogP(m) mol_logp.append({'value': round(logp, 2), @@ -100,5 +104,7 @@ def decorate(self, df['H-Bond Acceptors'] = hacceptors 
df['Rotatable Bonds'] = rotatable_bonds df['QED'] = qeds + # TODO: this may be redundant as chemvisualize seems to be handling such invalid molecules + df['invalid'] = invalid return df diff --git a/cuchem/cuchem/decorator/mol_structure.py b/cuchem/cuchem/decorator/mol_structure.py index 1720d1e7..d96bde34 100644 --- a/cuchem/cuchem/decorator/mol_structure.py +++ b/cuchem/cuchem/decorator/mol_structure.py @@ -17,12 +17,12 @@ class MolecularStructureDecorator(BaseMolPropertyDecorator): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_col: int = 0): mol_struct = [] for idx in range(df.shape[0]): - smiles = df.iat[idx, smile_cols] + smiles = df.iat[idx, smiles_col] try: m = Chem.MolFromSmiles(smiles) drawer = Draw.rdMolDraw2D.MolDraw2DCairo(500, 125) diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py index 439b87c6..81c2e81f 100644 --- a/cuchem/cuchem/interactive/chemvisualize.py +++ b/cuchem/cuchem/interactive/chemvisualize.py @@ -1,6 +1,9 @@ # Copyright 2020 NVIDIA Corporation # SPDX-License-Identifier: Apache-2.0 +# TODO: separate loading of compounds from clustering of compounds; currently, loading is triggered by a call to clustering. +# TODO: separate fingerprinting from clustering; currently fingerprinting is triggered by a call to clustering. 
+ import base64 import json import logging @@ -9,6 +12,7 @@ import cupy import dash +import cuml import dash_bootstrap_components as dbc import dash_core_components as dcc import dash_html_components as html @@ -22,6 +26,20 @@ from cuchem.utils import generate_colors, report_ui_error from rdkit import Chem from rdkit.Chem import Draw, PandasTools +from numba.cuda.libdevice import popcll + +# Check if all of these are needed: +from cuchemcommon.fingerprint import MorganFingerprint, INTEGER_NBITS +import sys +import numpy as np +import pandas as pd +import cudf +import dask_cudf +from dask.distributed import wait +from rdkit import DataStructs, Chem +from rdkit.Chem.rdmolfiles import MolFromSmiles +from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles +import time logger = logging.getLogger(__name__) @@ -76,12 +94,12 @@ def download_sdf(): valid_idx = [] col_list = ['SMILES', 'Molecular Weight', 'LogP', 'H-Bond Donors', 'H-Bond Acceptors', 'Rotatable Bonds'] - for row, data in vis.genreated_df.iterrows(): + for row, data in vis.generated_df.iterrows(): mol = Chem.MolFromSmiles(data['SMILES']) if (mol is not None): valid_idx.append(row) - valid_df = vis.genreated_df.iloc[valid_idx] + valid_df = vis.generated_df.iloc[valid_idx] valid_df = valid_df[col_list] PandasTools.AddMoleculeColumnToFrame(valid_df, 'SMILES') @@ -95,18 +113,33 @@ def download_sdf(): headers={"Content-disposition": "attachment; filename=download.sdf"}) +def popcll_wrapper(ip_col, op_col): + for i, n in enumerate(ip_col): + op_col[i] = popcll(n) + +def popcll_wrapper_dask(df, ip_col, op_col): + df = df.apply_rows(popcll_wrapper, incols = {ip_col: 'ip_col'}, outcols = {op_col: int}, kwargs = {}) + return df[op_col] + +def intersection_wrapper(fp_int_col, op_col, query_fp_int): + for i, fp_int in enumerate(fp_int_col): + op_col[i] = popcll(fp_int & query_fp_int) class ChemVisualization(metaclass=Singleton): - def __init__(self, cluster_wf): + def __init__(self, cluster_wf, 
fingerprint_radius=2, fingerprint_nBits=512): self.app = app self.cluster_wf = cluster_wf self.n_clusters = cluster_wf.n_clusters self.chem_data = ChEmblData() - self.genreated_df = None + self.generated_df = None self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid' self.generative_wf_cls = 'cuchem.wf.generative.MegatronMolBART' + self.fp_df = None # all fingerprints of all ChemBl compounds and their IDs as a pandas dataframe for use in compound similarity search on the CPU + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits + # Store colors to avoid plots changes colors on events such as # molecule selection, etc. self.cluster_colors = generate_colors(self.n_clusters) @@ -136,7 +169,9 @@ def __init__(self, cluster_wf): Input('bt_north_star', 'n_clicks'), Input('sl_prop_gradient', 'value'), Input('sl_nclusters', 'value'), - Input('refresh_main_fig', 'children')], + Input('refresh_main_fig', 'children'), + Input('fingerprint_radius', 'value'), + Input('fingerprint_nBits', 'value')], [State("selected_clusters", "value"), State("main-figure", "selectedData"), State('north_star', 'value'), @@ -180,14 +215,23 @@ def __init__(self, cluster_wf): Input('bt_close_err', 'n_clicks')])(self.handle_error) self.app.callback( - Output('genration_candidates', 'children'), + Output('generation_candidates', 'children'), [Input({'role': 'bt_add_candidate', 'chemblId': ALL, 'molregno': ALL}, 'n_clicks'), Input('bt_reset_candidates', 'n_clicks'), ], - State('genration_candidates', 'children'))(self.handle_add_candidate) + State('generation_candidates', 'children'))(self.handle_add_candidate) + + self.app.callback( + Output('analoguing_candidates', 'children'), + [Input({'role': 'bt_analoguing_candidate', 'chemblId': ALL, 'molregno': ALL}, 'n_clicks')], + State('analoguing_candidates', 'children'))(self.handle_analoguing_candidate) self.app.callback( Output('ckl_candidate_mol_id', 'options'), - Input('genration_candidates', 
'children'))(self.handle_construct_candidates) + Input('generation_candidates', 'children'))(self.handle_construct_candidates) + + self.app.callback( + Output('ckl_analoguing_mol_id', 'options'), + Input('analoguing_candidates', 'children'))(self.handle_construct_candidates2) self.app.callback( [Output('ckl_candidate_mol_id', 'value'), @@ -196,27 +240,62 @@ def __init__(self, cluster_wf): Input('rd_generation_type', 'value')])(self.handle_ckl_selection) self.app.callback( - [Output('table_generated_molecules', 'children'), + [Output('ckl_analoguing_mol_id', 'value')], + [Input('ckl_analoguing_mol_id', 'value')])(self.handle_analoguing_ckl_selection) + + self.app.callback( + [Output('section_fitting', 'style'), + Output('fitting_figure', 'figure')], + [Input("bt_fit", "n_clicks"),], + [State('sl_featurizing_wf', 'value'), + State('fit_nn_compound_property', 'value'), + State('fit_nn_train_cluster_number', 'value'), + State('fit_nn_test_cluster_number', 'value'), + State('fit_nn_hidden_layer_sizes', 'value'), + State('fit_nn_activation_fn', 'value'), + State('fit_nn_final_activation_fn', 'value'), + State('fit_nn_max_epochs', 'value'), + State('fit_nn_learning_rate', 'value'), + State('fit_nn_weight_decay', 'value'), + State('fit_nn_batch_size', 'value')])(self.handle_fitting) + + self.app.callback( + [Output('section_analoguing', 'style'), + Output('tb_analoguing', 'children')], + [Input("bt_analoguing", "n_clicks"),], + [State('ckl_analoguing_mol_id', 'value'), + State('analoguing_n_analogues', 'value'), + State('analoguing_threshold', 'value'), + State('analoguing_type', 'value')])(self.handle_analoguing) + + self.app.callback( + [Output('section_generated_molecules', 'style'), + Output('section_selected_molecules', 'style'), ], + [Input('show_generated_mol', 'children'), + Input('show_selected_mol', 'children')])(self.handle_property_tables) + + self.app.callback( + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + 
Output('table_generated_molecules', 'children'), Output('show_generated_mol', 'children'), Output('msg_generated_molecules', 'children'), Output('interpolation_error', 'children')], - [Input("bt_generate", "n_clicks"), ], + [Input("bt_generate", "n_clicks")], [State('sl_generative_wf', 'value'), State('ckl_candidate_mol_id', 'value'), State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), State('scaled_radius', 'value'), State('rd_generation_type', 'value'), State('show_generated_mol', 'children')])(self.handle_generation) - self.app.callback( - [Output('section_generated_molecules', 'style'), - Output('section_selected_molecules', 'style'), ], - [Input('show_generated_mol', 'children'), - Input('show_selected_mol', 'children')])(self.handle_property_tables) - def handle_add_candidate(self, bt_add_candidate, bt_reset_candidates, - genration_candidates): + generation_candidates): comp_id, event_type = self._fetch_event_data() if comp_id == 'bt_reset_candidates' and event_type == 'n_clicks': @@ -227,8 +306,8 @@ def handle_add_candidate(self, bt_add_candidate, selected_candidates = [] - if genration_candidates: - selected_candidates = genration_candidates.split(",") + if generation_candidates: + selected_candidates = generation_candidates.split(",") comp_detail = json.loads(comp_id) selected_chembl_id = comp_detail['chemblId'] @@ -238,6 +317,24 @@ def handle_add_candidate(self, bt_add_candidate, return ','.join(selected_candidates) + + def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candidates): + comp_id, event_type = self._fetch_event_data() + if event_type != 'n_clicks' or dash.callback_context.triggered[0]['value'] == 0: + raise dash.exceptions.PreventUpdate + + selected_candidates = [] + + if analoguing_candidates: + selected_candidates = analoguing_candidates.split(",") + + comp_detail = 
json.loads(comp_id) + selected_chembl_id = comp_detail['chemblId'] + + if selected_chembl_id not in selected_candidates: + selected_candidates.append(selected_chembl_id) + return ','.join(selected_candidates) + def _fetch_event_data(self): if not dash.callback_context.triggered: raise dash.exceptions.PreventUpdate @@ -254,17 +351,37 @@ def handle_property_tables(self, show_generated_mol, show_selected_mol): return {'display': 'block', 'width': '100%'}, {'display': 'none'} return dash.no_update, dash.no_update - @report_ui_error(4) - def handle_generation(self, bt_generate, - sl_generative_wf, ckl_candidate_mol_id, - n2generate, scaled_radius, rd_generation_type, show_generated_mol): + @report_ui_error(3) + def handle_generation( + self, bt_generate, sl_generative_wf, ckl_candidate_mol_id, n2generate, + extrap_compound_property, extrap_cluster_number, extrap_n_compounds, extrap_step_size, + scaled_radius, rd_generation_type, show_generated_mol + ): + """ + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + Output('table_generated_molecules', 'children'), + Output('show_generated_mol', 'children'), + Output('msg_generated_molecules', 'children'), + Output('interpolation_error', 'children')], + [Input("bt_generate", "n_clicks")], + [State('sl_generative_wf', 'value'), + State('ckl_candidate_mol_id', 'value'), + State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), + State('scaled_radius', 'value'), + State('rd_generation_type', 'value'), + State('show_generated_mol', 'children')])(self.handle_generation) + """ comp_id, event_type = self._fetch_event_data() - - chemble_ids = [] + chembl_ids = [] if comp_id == 'bt_generate' and event_type == 'n_clicks': - chemble_ids = ckl_candidate_mol_id + chembl_ids = ckl_candidate_mol_id else: - return dash.no_update, dash.no_update + return 
dash.no_update, dash.no_update, dash.no_update, dash.no_update, dash.no_update self.generative_wf_cls = sl_generative_wf wf_class = locate(self.generative_wf_cls) @@ -273,17 +390,26 @@ def handle_generation(self, bt_generate, scaled_radius = float(scaled_radius) if rd_generation_type == 'SAMPLE': - if chemble_ids == None or len(chemble_ids) == 0: + if chembl_ids == None or len(chembl_ids) == 0: raise ValueError('Please select at-least one molecule for Sampling.') - self.genreated_df = generative_wf.find_similars_smiles_by_id(chemble_ids, + self.generated_df = generative_wf.find_similars_smiles_by_id(chembl_ids, num_requested=n2generate, scaled_radius=scaled_radius, force_unique=True, sanitize=True) + elif rd_generation_type == 'EXTRAPOLATE': + self.generated_df = generative_wf.extrapolate_from_cluster(self.cluster_wf.df_embedding, + compound_property=extrap_compound_property, + cluster_id=extrap_cluster_number, + n_compounds_to_transform=extrap_n_compounds, + num_points=n2generate, + step_size=extrap_step_size, + scaled_radius=scaled_radius, + force_unique=False)#True) else: - if chemble_ids == None or len(chemble_ids) < 2: + if chembl_ids == None or len(chembl_ids) < 2: raise ValueError('Please select at-least two molecules for Interpolation.') - self.genreated_df = generative_wf.interpolate_by_id(chemble_ids, + self.generated_df = generative_wf.interpolate_by_id(chembl_ids, num_points=n2generate, scaled_radius=scaled_radius, force_unique=True, @@ -292,40 +418,81 @@ def handle_generation(self, bt_generate, if show_generated_mol is None: show_generated_mol = 0 show_generated_mol += 1 - # Add other useful attributes to be added for rendering - self.genreated_df = MolecularStructureDecorator().decorate(self.genreated_df) - self.genreated_df = LipinskiRuleOfFiveDecorator().decorate(self.genreated_df) + self.generated_df = MolecularStructureDecorator().decorate(self.generated_df) + self.generated_df = LipinskiRuleOfFiveDecorator().decorate(self.generated_df) + 
self.generated_df = self.generated_df[ ~self.generated_df['invalid'] ].reset_index(drop=True).drop(columns=['invalid']) + if len(self.generated_df) == 0: + logger.info("None of the generated smiles yielded valid molecules!") + return dash.no_update, dash.no_update + # Note: we are not allowing fingerprint specification to change here because we want to see the results on the same PCA / UMAP as the original figure + # TODO: make this clear in the UI + + # The number of generated molecules is not expected to be large, so regular cudf dataframes should suffice. + # However, we have to handle both cudf and dask_cudf, perhaps more elegantly. + fps = MorganFingerprint( + radius=self.fingerprint_radius, nBits=self.fingerprint_nBits + ).transform(self.generated_df, smiles_column='SMILES') + df_fp = pd.DataFrame(fps, dtype='float32') + self.generated_df = pd.concat([self.generated_df, df_fp], axis=1) + df_fp=cudf.from_pandas(df_fp) + df_fp['id'] = list(map(str, self.generated_df['id'])) + df_fp['cluster'] = list(map(int, self.generated_df['Generated'])) # This controls the color + n_generated = self.generated_df['Generated'].sum() + if n_generated < len(self.generated_df) / 2: + # Highlight the generated compounds + north_stars = ','.join(list(df_fp[ self.generated_df['Generated'] ]['id'].values_host)) + else: + # Highlight the source compound(s) + north_stars = ','.join(list(df_fp[ ~self.generated_df['Generated'] ]['id'].values_host)) + df_embedding = df_fp + cluster_col = df_embedding['cluster'] + df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding) + prop_series['cluster'] = cluster_col + if isinstance(self.cluster_wf.pca, cuml.dask.decomposition.PCA) and not isinstance(df_embedding, dask_cudf.DataFrame): + df_embedding = dask_cudf.from_cudf(df_embedding, npartitions=1) + df_embedding = self.cluster_wf.pca.transform(df_embedding) + Xt = self.cluster_wf.umap.transform(df_embedding) + if hasattr(Xt, 'persist'): + Xt = Xt.compute() + if 
hasattr(df_embedding, 'persist'): + logging.info(f'BEFORE df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') + # Note: When converting a dask_cudf to cudf, the indices within each chunk are retained, so we need to reset_index. + df_embedding = df_embedding.compute().reset_index(drop=True) + df_embedding['x'] = Xt[0] + df_embedding['y'] = Xt[1] + logging.info(f'df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') + for col in prop_series.keys(): + df_embedding[col] = prop_series[col] + + fig, northstar_cluster = self.create_graph(df_embedding, north_stars=north_stars) # Create Table header table_headers = [] - columns = self.genreated_df.columns.to_list() - ignore_columns = ['embeddings', 'embeddings_dim'] - for column in columns: - if column in ignore_columns: - continue + all_columns = self.generated_df.columns.to_list() + columns_in_table = [ + col_name + for col_name in self.generated_df.columns.to_list() + if (not isinstance(col_name, int)) and (not col_name.startswith('fp')) and not ('embeddings' in col_name) + ] + # TODO: factor this into a separate function: build table from dataframe + for column in columns_in_table: table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) - prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] invalid_mol_cnt = 0 - for row_idx in range(self.genreated_df.shape[0]): + for row_idx in range(self.generated_df.shape[0]): td = [] - try: - col_pos = columns.index('Chemical Structure') - col_data = self.genreated_df.iat[row_idx, col_pos] - - if 'value' in col_data and col_data['value'] == MolecularStructureDecorator.ERROR_VALUE: + col_pos = all_columns.index('Chemical Structure') + col_data = self.generated_df.iat[row_idx, col_pos] + if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit': invalid_mol_cnt += 1 continue except ValueError: pass - - 
for col_id in range(len(columns)): - col_data = self.genreated_df.iat[row_idx, col_id] - if columns[col_id] in ignore_columns: - continue - + for col_name in columns_in_table: + col_id = all_columns.index(col_name) + col_data = self.generated_df.iat[row_idx, col_id] col_level = 'info' if isinstance(col_data, dict): col_value = col_data['value'] @@ -333,7 +500,6 @@ def handle_generation(self, bt_generate, col_level = col_data['level'] else: col_value = col_data - if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): td.append(html.Td(html.Img(src=col_value))) else: @@ -343,19 +509,246 @@ def handle_generation(self, bt_generate, 'text-align': 'center', 'color': LEVEL_TO_STYLE[col_level]['color'] } - )) - + )) prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) msg_generated_molecules = '' if invalid_mol_cnt > 0: msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' - return html.Table(prop_recs, style={'width': '100%', - 'border': '1px solid lightgray'}), \ - show_generated_mol, \ - msg_generated_molecules, \ - dash.no_update + return {'display': 'inline'}, fig, html.Table( + prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'} + ), show_generated_mol, msg_generated_molecules, dash.no_update + + + @report_ui_error(3) + def handle_fitting( + self, bt_fit, sl_featurizing_wf, + fit_nn_compound_property, fit_nn_train_cluster_number, fit_nn_test_cluster_number, fit_nn_hidden_layer_sizes, fit_nn_activation_fn, fit_nn_final_activation_fn, + fit_nn_max_epochs, fit_nn_learning_rate, fit_nn_weight_decay, fit_nn_batch_size + ): + comp_id, event_type = self._fetch_event_data() + sys.stdout.flush() + if (comp_id != 'bt_fit') or (event_type != 'n_clicks'): + return dash.no_update, dash.no_update + self.featurizing_wf_cls = sl_featurizing_wf + wf_class = locate(self.featurizing_wf_cls) + featurizing_wf = wf_class() + + df = featurizing_wf.fit_nn( + 
self.cluster_wf.df_embedding, + compound_property=fit_nn_compound_property, + cluster_id_train=fit_nn_train_cluster_number, + cluster_id_test=fit_nn_test_cluster_number, + hidden_layer_sizes=list(map(int, fit_nn_hidden_layer_sizes.split(','))) if fit_nn_hidden_layer_sizes != '' else [], + activation_fn=fit_nn_activation_fn, + final_activation_fn=fit_nn_final_activation_fn, + max_epochs=int(fit_nn_max_epochs), + learning_rate=float(fit_nn_learning_rate), + weight_decay=float(fit_nn_weight_decay), + batch_size=int(fit_nn_batch_size) + ) + sys.stdout.flush() + fig = self.create_plot(df, fit_nn_compound_property) + return {'display': 'inline'}, fig + + @report_ui_error(3) + def handle_analoguing( + self, bt_analoguing, analoguing_mol_id, analoguing_n_analogues, analoguing_threshold, analoguing_type, + + ): + comp_id, event_type = self._fetch_event_data() + sys.stdout.flush() + if (comp_id != 'bt_analoguing') or (event_type != 'n_clicks'): + return dash.no_update, dash.no_update + + # Compute fingerprints once for all input database compounds (already done when input data would have been clustered) + if 'canonical_smiles' in self.cluster_wf.df_embedding: + smiles_column = 'canonical_smiles' + else: + smiles_columns = 'SMILES' + + if self.fp_df is None: + # Note: CPU-based workflow for fingerprint similarity is no longer needed, can be removed + logger.info(f'CPU-based similarity search: self.fp_df not set') + # First move the smiles to the CPU: + if isinstance(self.cluster_wf.df_embedding, dask_cudf.DataFrame): + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].map_partitions(cudf.DataFrame.to_pandas) + elif isinstance(self.cluster_wf.df_embedding, cudf.DataFrame): + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].to_pandas() + else: + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']] + # Then compute fingerprints on the CPU using RDKit: + if 'fp' not in self.cluster_wf.df_embedding.columns: + logger.info(f'Computing 
fingerprints with radius={self.fingerprint_radius}, nBits={self.fingerprint_nBits}...') + _, v = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( + smiles_df, smiles_column=smiles_column, return_fp=True, raw=True) + else: + if hasattr(self.cluster_wf.df_embedding, 'compute'): + v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas()) + else: + v = list(self.cluster_wf.df_embedding['fp']) + # This pandas dataframe has the fingerprints in the fp column: + self.fp_df = pd.DataFrame({ + 'fp': v, + smiles_column: smiles_df[smiles_column], #list(self.cluster_wf.df_embedding[smiles_column].compute().to_pandas()), #smiles_df[smiles_column], + 'id': smiles_df['id'], #list(self.cluster_wf.df_embedding['id'].compute().to_pandas()) + }) + + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) + + if 'pc' not in self.cluster_wf.df_embedding.columns: + # Pre-computing the popcounts for all compounds in the database for use in GPU-based similarity search: + t0 = time.time() + self.cluster_wf.df_embedding['op_col'] = 0 + self.cluster_wf.df_embedding['pc'] = 0 + n_fp_cols = 0 + for col in self.cluster_wf.df_embedding.columns: + if (type(col) == str) and col.startswith('fp') and (len(col) > 2): + n_fp_cols += 1 + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( + popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {}) + self.cluster_wf.df_embedding['pc'] += self.cluster_wf.df_embedding['op_col'] + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) + t1 = time.time() + #logger.info(f'Time to compute partial popcounts ({n_fp_cols} fp columns): {t1 - t0}:\n{self.cluster_wf.df_embedding["pc"].head()}') + + # Prepare the query compound: + 
logger.info(f'analoguing_mol_id={analoguing_mol_id}') + molregno = self.chem_data.fetch_molregno_by_chemblId( + [analoguing_mol_id])[0][0] + props, selected_molecules = self.chem_data.fetch_props_by_molregno([molregno]) + query_smiles = selected_molecules[0][props.index('canonical_smiles')] + query_fp = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( + pd.DataFrame({'smiles': [query_smiles]}), smiles_column='smiles', return_fp=True, raw=True)[1][0] + query_fps = query_fp.ToBitString() + query_fp_ints = [int(query_fps[i: i + INTEGER_NBITS], 2) for i in range(0, self.fingerprint_nBits, INTEGER_NBITS)] + query_pc = sum(bin(x).count('1') for x in query_fp_ints) + + # GPU-based workflow for similarity computation + # Tanimoto = popcount(intersection) / ( popcount(query) + popcount(compound) - popcount(intersection) ) + # Since the fingerprint is stored as a list of int64s in separate columns, the intersection popcount is accumulated column by column below + if 'op_col' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['op_col']) + if 'n_intersection' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['n_intersection']) + #self.cluster_wf.df_embedding['op_col'] = 0 + self.cluster_wf.df_embedding['n_intersection'] = 0 + t4 = time.time() + for i in range(0, self.fingerprint_nBits, INTEGER_NBITS): + fp_num = i // INTEGER_NBITS + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( + intersection_wrapper, incols={f'fp{fp_num}': 'fp_int_col'}, + outcols={'op_col': int}, kwargs={'query_fp_int': query_fp_ints[fp_num]}) + #self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + #wait(self.cluster_wf.df_embedding) + self.cluster_wf.df_embedding['n_intersection'] += self.cluster_wf.df_embedding['op_col'] + + self.cluster_wf.df_embedding['n_union'] = self.cluster_wf.df_embedding['pc'] - self.cluster_wf.df_embedding['n_intersection'] + query_pc + 
self.cluster_wf.df_embedding['similarity'] = self.cluster_wf.df_embedding['n_intersection'] / self.cluster_wf.df_embedding['n_union'] + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) + t5 = time.time() + t0 = time.time() + self.fp_df['similarity_cpu'] = self.fp_df['fp'].apply(lambda x: DataStructs.FingerprintSimilarity(query_fp, x)) + + if 'similarity_cpu' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['similarity_cpu']) + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.merge( + dask_cudf.from_cudf( + cudf.from_pandas(self.fp_df[['id', 'similarity_cpu']]), + npartitions = self.cluster_wf.df_embedding.npartitions + ), + on='id', + how='left' + ).reset_index(drop=True) + + t1 = time.time() + #logger.info(f'Fingerprint length={self.fingerprint_nBits}: GPU-Method: {t5 - t4}, CPU-Method: {t1 - t0}') + + #self.analoguing_df = self.fp_df[ self.fp_df['similarity_cpu'] >= float(analoguing_threshold) ] + self.analoguing_df = self.cluster_wf.df_embedding[ self.cluster_wf.df_embedding['similarity'] >= float(analoguing_threshold) ] + drop_columns = [ + col + for col in self.analoguing_df.columns + if (type(col) == int) or col.startswith('fp') or (col in ['x', 'y', 'cluster', 'op_col', 'pc', 'n_intersection', 'n_union', 'transformed_smiles']) + ] + self.analoguing_df = self.analoguing_df.drop(columns=drop_columns).compute().to_pandas() # dask_cudf --> cudf --> pandas (CPU!) 
+ if analoguing_type in ['scaffold', 'superstructure']: + if analoguing_type == 'scaffold': + # Only include compounds that have the same murcko scaffold as the query compound + query_scaffold_mol = MolFromSmiles(MurckoScaffoldSmilesFromSmiles(query_smiles)) + else: #analoguing_type == 'superstructure': + # Only include compounds that are superstructures of the query compound + query_scaffold_mol = MolFromSmiles(query_smiles) + self.analoguing_df['mol'] = self.analoguing_df[smiles_column].apply(MolFromSmiles) + self.analoguing_df.dropna(subset=['mol'], inplace=True) + self.analoguing_df = self.analoguing_df[ self.analoguing_df['mol'].apply(lambda x: x.HasSubstructMatch(query_scaffold_mol)) ] + self.analoguing_df.drop(columns=['mol'], inplace=True) + self.analoguing_df = self.analoguing_df.nlargest(int(analoguing_n_analogues), 'similarity') + self.analoguing_df.reset_index(drop=True, inplace=True) + #self.analoguing_df = dask_cudf.from_cudf(self.analoguing_df, npartitions=self.cluster_wf.df_embedding.npartitions) # going back to dask for a reason? + # TODO: we are presuming the IDs are the same but there is no guarantee since we added code to generate dummy IDs based on indices elsewhere. 
+ + # Needed only for CPU-based workflow + #self.analoguing_df = self.analoguing_df.merge(self.cluster_wf.df_embedding, on='id').compute().reset_index(drop=True).to_pandas() + # Add other useful attributes to be added for rendering + smiles_idx = self.analoguing_df.columns.to_list().index(smiles_column) + self.analoguing_df = MolecularStructureDecorator().decorate(self.analoguing_df, smiles_col=smiles_idx) + #self.analoguing_df = LipinskiRuleOfFiveDecorator().decorate(self.analoguing_df, smiles_col=smiles_idx) + self.analoguing_df = self.analoguing_df.sort_values('similarity', ascending=False) + # Create Table header + table_headers = [] + all_columns = self.analoguing_df.columns.to_list() + columns_in_table = [ + col_name + for col_name in self.analoguing_df.columns.to_list() + if (not isinstance(col_name, int)) and (not col_name.startswith('fp')) + ] + # TODO: factor this into a separate function: build table from dataframe + for column in columns_in_table: + table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) + prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] + for row_idx in range(self.analoguing_df.shape[0]): + td = [] + try: + col_pos = all_columns.index('Chemical Structure') + col_data = self.analoguing_df.iat[row_idx, col_pos] + if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit': + continue + except ValueError: + pass + for col_name in columns_in_table: + col_id = all_columns.index(col_name) + col_data = self.analoguing_df.iat[row_idx, col_id] + col_level = 'info' + if isinstance(col_data, dict): + col_value = col_data['value'] + if 'level' in col_data: + col_level = col_data['level'] + else: + col_value = col_data + if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): + td.append(html.Td(html.Img(src=col_value))) + else: + td.append(html.Td(str(col_value), style=LEVEL_TO_STYLE[col_level].update({'maxWidth': '100px', 
'wordWrap':'break-word'}))) + + prop_recs.append(html.Tr(td)) + + + # venkat: + return {'display': 'inline'}, html.Table(prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'}) + # dev: + #return html.Table(prop_recs, style={'width': '100%', + # 'border': '1px solid lightgray'}), \ + # show_generated_mol, \ + # msg_generated_molecules, \ + # dash.no_update def handle_ckl_selection(self, ckl_candidate_mol_id, rd_generation_type): selection_msg = '**Please Select Two Molecules**' @@ -364,12 +757,22 @@ def handle_ckl_selection(self, ckl_candidate_mol_id, rd_generation_type): if rd_generation_type == 'SAMPLE': selection_msg = '**Please Select One Molecule**' selection_cnt = 1 - + elif rd_generation_type == 'EXTRAPOLATE': + # TO DO: one cluster and one property have to be provided + selection_msg = '**Please Select Zero Molecules (specify cluster above, instead)**' + selection_cnt = 0 if ckl_candidate_mol_id and len(ckl_candidate_mol_id) > selection_cnt: ckl_candidate_mol_id = ckl_candidate_mol_id[selection_cnt * -1:] return ckl_candidate_mol_id, selection_msg + def handle_analoguing_ckl_selection(self, ckl_analoguing_mol_id): + if ckl_analoguing_mol_id and len(ckl_analoguing_mol_id) > 1: + # Allow only one compound to be chosen for analoguing + ckl_analoguing_mol_id = ckl_analoguing_mol_id[-1:] + + return ckl_analoguing_mol_id + def handle_construct_candidates(self, north_star): if not north_star: return [] @@ -377,6 +780,13 @@ def handle_construct_candidates(self, north_star): options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')] return options + def handle_construct_candidates2(self, north_star): + if not north_star: + return [] + + options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')] + return options + def handle_reset(self, bt_reset, bt_apply_wf, refresh_main_fig, sl_wf): comp_id, event_type = self._fetch_event_data() @@ -399,25 +809,41 @@ def handle_reset(self, bt_reset, 
bt_apply_wf, refresh_main_fig, sl_wf): def recluster(self, filter_values=None, filter_column=None, reload_data=False): self.cluster_wf.n_clusters = self.n_clusters if reload_data: - return self.cluster_wf.cluster() + return self.cluster_wf.cluster( + fingerprint_radius=self.fingerprint_radius, fingerprint_nBits=self.fingerprint_nBits + ) else: - return self.cluster_wf.recluster(filter_column, filter_values, - n_clusters=self.n_clusters) - - def recluster_selection(self, - filter_value=None, - filter_column=None, - gradient_prop=None, - north_stars=None, - reload_data=False, - recluster_data=True, - color_col='cluster'): + return self.cluster_wf.recluster( + filter_column, + filter_values, + n_clusters=self.n_clusters, + fingerprint_radius=self.fingerprint_radius, + fingerprint_nBits=self.fingerprint_nBits + ) + + def recluster_selection( + self, + filter_value=None, + filter_column=None, + gradient_prop=None, + north_stars=None, + reload_data=False, + recluster_data=True, + color_col='cluster', + fingerprint_radius=2, + fingerprint_nBits=512 + ): if recluster_data or self.cluster_wf.df_embedding is None: - df_embedding = self.recluster(filter_values=filter_value, - filter_column=filter_column, - reload_data=reload_data) + self.fingerprint_nBits = fingerprint_nBits + self.fingerprint_radius = fingerprint_radius + df_embedding = self.recluster( + filter_values=filter_value, + filter_column=filter_column, + reload_data=reload_data + ) else: + # Can use previous embedding only if fingerprint has not changed df_embedding = self.cluster_wf.df_embedding return self.create_graph(df_embedding, @@ -426,6 +852,7 @@ def recluster_selection(self, north_stars=north_stars) def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop=None): + sys.stdout.flush() fig = go.Figure(layout={'colorscale': {}}) # Filter out relevant columns in this method. 
@@ -441,7 +868,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop moi_molregno = [] if north_stars: - moi_molregno = list(map(int, north_stars.split(","))) + moi_molregno = north_stars.split(",") #list(map(int, north_stars.split(","))) moi_filter = ldf['id'].isin(moi_molregno) @@ -461,6 +888,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop cluster = ldf['cluster'] customdata = ldf['id'] grad_prop = ldf[gradient_prop] + textdata = cupy.asarray([ + f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ]) if self.cluster_wf.is_gpu_enabled(): x_data = x_data.to_array() @@ -486,6 +915,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop 'showscale': True, 'cmin': cmin, 'cmax': cmax, + 'name': customdata } })) else: @@ -512,6 +942,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop y_data = cdf['y'] cluster = cdf['cluster'] customdata = cdf['id'] + textdata = [ f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ] + sys.stdout.flush() if self.cluster_wf.is_gpu_enabled(): x_data = x_data.to_array() @@ -524,14 +956,16 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop scatter_trace = go.Scattergl({ 'x': x_data, 'y': y_data, - 'text': cluster, + 'text': textdata, + #'text': cluster, 'customdata': customdata, 'name': 'Cluster ' + str(cluster_id), 'mode': 'markers', 'marker': { 'size': df_size, 'symbol': df_shape, - 'color': self.cluster_colors[int(cluster_id) % len(self.cluster_colors)], + 'color': self.cluster_colors[ + int(cluster_id) % len(self.cluster_colors)], }, }) if moi_present: @@ -558,6 +992,37 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop del ldf return fig, northstar_cluster + def create_plot(self, df, compound_property): + """ + Expects df to have x, y, cluster and train_set columns + """ + 
fig = go.Figure(layout={'colorscale': {}}) + scatter_trace = go.Scattergl({ + 'x': df['x'], + 'y': df['y'], + 'text': [ f'C-{c}_ID-{cid}' for c, cid in zip(df['cluster'], df['id']) ], + 'customdata': df['id'], + 'mode': 'markers', + 'marker': { + 'size': DOT_SIZE, + 'symbol': df['train_set'].apply(lambda x: 0 if x else 1), + 'color': df['cluster'].apply(lambda x: self.cluster_colors[x % len(self.cluster_colors)]), + }, + }) + fig.add_trace(scatter_trace) + # Change the title to indicate type of H/W in use + f_color = 'green' if self.cluster_wf.is_gpu_enabled() else 'blue' + fig.update_layout( + showlegend=True, clickmode='event', height=main_fig_height, + title=f'{PROP_DISP_NAME[compound_property]} Prediction', dragmode='select', + title_font_color=f_color, + annotations=[ + dict(x=0.5, y=-0.07, showarrow=False, text='Actual', + xref="paper", yref="paper"), + dict(x=-0.05, y=0.5, showarrow=False, text="Predicted", + textangle=-90, xref="paper", yref="paper")]) + return fig + def start(self, host=None, port=5000): return self.app.run_server( debug=False, use_reloader=False, host=host, port=port) @@ -587,7 +1052,7 @@ def construct_molecule_detail(self, selected_points, display_properties, prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] if chembl_ids: - props, selected_molecules = self.chem_data.fetch_props_by_chemble(chembl_ids) + props, selected_molecules = self.chem_data.fetch_props_by_chembl(chembl_ids) elif selected_points: selected_molregno = [] for point in selected_points['points'][((page - 1) * pageSize): page * pageSize]: @@ -631,7 +1096,7 @@ def construct_molecule_detail(self, selected_points, display_properties, td.append(html.Td(selected_chembl_id)) else: td.append(html.Td( - dbc.Button('Add as MoI', + dbc.Button('Highlight', id={'role': 'bt_star_candidate', 'chemblId': selected_chembl_id, 'molregno': str(molregno) @@ -640,7 +1105,7 @@ def construct_molecule_detail(self, selected_points, display_properties, )) td.append(html.Td( - 
dbc.Button('Add for Interpolation', + dbc.Button('Add', id={'role': 'bt_add_candidate', 'chemblId': selected_chembl_id, 'molregno': str(molregno) @@ -649,6 +1114,16 @@ def construct_molecule_detail(self, selected_points, display_properties, n_clicks=0) )) + td.append(html.Td( + dbc.Button('Analogue', + id={'role': 'bt_analoguing_candidate', + 'chemblId': selected_chembl_id, + 'molregno': str(molregno), + #'smiles': smiles + }, + n_clicks=0) + )) + prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) return html.Table(prop_recs, style={'width': '100%', 'border': '1px solid lightgray'}), all_props @@ -665,6 +1140,8 @@ def constuct_layout(self): html.Div([ dcc.Markdown("""**Molecule(s) of Interest**"""), + dcc.Markdown(children="""Click *Highlight* to populate this list""", + style={'marginTop': 18}), dcc.Markdown("Please enter ChEMBL ID(s) separated by commas."), html.Div(className='row', children=[ @@ -674,6 +1151,19 @@ def constuct_layout(self): className='three columns'), ], style={'marginLeft': 0, 'marginBottom': 18, }), + dcc.Markdown("For fingerprint changes to take effect, first *Apply* the *GPU KMeans-UMAP* Workflow, then *Recluster*"), + html.Div(className='row', children=[ + dcc.Markdown("Fingerprint Radius", style={'marginTop': 12,}), + dcc.Input(id='fingerprint_radius', value=2), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + + html.Div(className='row', children=[ + dcc.Markdown("Fingerprint Size", style={'marginTop': 12,}), + dcc.Input(id='fingerprint_nBits', value=512), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Tabs([ dcc.Tab(label='Cluster Molecules', children=[ dcc.Markdown("""**Select Workflow**""", style={'marginTop': 18, }), @@ -737,8 +1227,9 @@ def constuct_layout(self): dcc.RadioItems( id='rd_generation_type', options=[ - {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, {'label': 'Sample around one molecule', 'value': 'SAMPLE'}, + {'label': 'Fit cluster to property and extrapolate', 'value': 
'EXTRAPOLATE'}, + {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, ], value='INTERPOLATE', style={'marginTop': 18}, @@ -747,11 +1238,35 @@ def constuct_layout(self): ), html.Div(className='row', children=[ - dcc.Markdown("Number of molecules to generate", + dcc.Markdown("Number to be generated from each compound", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), dcc.Input(id='n2generate', value=10), ], style={'marginLeft': 0}), + html.Div(className='row', children=[ + html.Label([ + "Select molecular property for fitting and extrapolation", + dcc.Dropdown(id='extrap_compound_property', multi=False, clearable=False, + options=[{"label": PROP_DISP_NAME[p], "value": p} for p in IMP_PROPS], + value=IMP_PROPS[0]), + ], style={'marginTop': 18, 'marginLeft': 18})], + ), + + html.Div(className='row', children=[ + dcc.Markdown("Cluster number for fitting property and extrapolation", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_cluster_number', value=0), + ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ + dcc.Markdown("Step-size for extrapolation", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_step_size', value=0.1), + ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ + dcc.Markdown("Number of compounds to extrapolate", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_n_compounds', value=10), + ], style={'marginLeft': 0}), + html.Div(className='row', children=[ dcc.Markdown("Scaled sampling radius (int, start with 1)", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), @@ -761,6 +1276,8 @@ def constuct_layout(self): dcc.Markdown(children="""**Please Select Two**""", id="mk_selection_msg", style={'marginTop': 18}), + dcc.Markdown(children="""Click *Add* to populate this list""", + style={'marginTop': 18}), dcc.Checklist( id='ckl_candidate_mol_id', options=[], @@ -772,7 
+1289,129 @@ def constuct_layout(self): dbc.Button('Generate', id='bt_generate', n_clicks=0, style={'marginRight': 12}), dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0), ], style={'marginLeft': 0}), + + ]), + + dcc.Tab(label='Predict Properties', children=[ + + dcc.Markdown("**Select Featurizing Model**", style={'marginTop': 18,}), + html.Div(children=[ + dcc.Dropdown(id='sl_featurizing_wf', multi=False, + options=[{'label': 'CDDD Model', + 'value': 'cuchem.wf.generative.Cddd'}, + {'label': 'MolBART Model', + 'value': 'cuchem.wf.generative.MolBART'}, + {'label': 'MegatronMolBART Model', + 'value': 'cuchem.wf.generative.MegatronMolBART'}, + ], + value=self.generative_wf_cls, + clearable=False), + ]), + html.Div(className='row', children=[ + html.Label([ + "Select molecular property for fitting and prediction", + dcc.Dropdown(id='fit_nn_compound_property', multi=False, clearable=False, + options=[{"label": PROP_DISP_NAME[p], "value": p} for p in IMP_PROPS], + value=IMP_PROPS[0]), + ], style={'marginTop': 18, 'marginLeft': 18})], + ), + html.Div(className='row', children=[ + dcc.Markdown("Train cluster", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_train_cluster_number', value=0), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Test cluster", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_test_cluster_number', value=1), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Markdown(children="**Neural Network Parameters**", + id="nn_params_msg", + style={'marginTop': 18} + ), + html.Div(className='row', children=[ + dcc.Markdown("Hidden layer sizes", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_hidden_layer_sizes', value=''), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Activation Function", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_activation_fn', value='LeakyReLU'), + ], style={'marginLeft': 0, 'marginTop': 
'6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Final Activation Function", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_final_activation_fn', value='LeakyReLU'), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Number of training epochs", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_max_epochs', value=10), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Learning Rate", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_learning_rate', value=0.001), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Weight Decay (Adam)", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_weight_decay', value=0.0001), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Batch size", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_batch_size', value=1), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dbc.Button('Fit', id='bt_fit', n_clicks=0, style={'marginRight': 12}), + ], style={'marginLeft': 0} + ), ]), + + dcc.Tab(label='Find Analogues', children=[ + + html.Div(className='row', children=[ + dcc.Markdown("Maxinum Number of Analogues", style={'marginTop': 12,}), + dcc.Input(id='analoguing_n_analogues', value=10), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Similarity Threshold", style={'marginTop': 12,}), + dcc.Input(id='analoguing_threshold', value=0.33), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(children=[ + dcc.Dropdown(id='analoguing_type', multi=False, + options=[{'label': 'Similar compounds', + 'value': 'similar'}, + {'label': 'Compounds with the same scaffold', + 'value': 'scaffold'}, + {'label': 'Compounds that are superstructures', + 'value': 'superstructure'}, + ], + 
value='similar', + clearable=False), + ]), + dcc.Markdown(children="Choose a compound", + id="analoguing_msg", + style={'marginTop': 18} + ), + dcc.Markdown(children="Click *Analogue* to populate this list", + style={'marginTop': 18}), + dcc.Checklist( + id='ckl_analoguing_mol_id', + options=[], + value=[], + #inputStyle={'display': 'inline-block', 'marginLeft': 6, 'marginRight': 6}, + #labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6} + ), + html.Div(className='row', children=[ + dbc.Button('Search', id='bt_analoguing', n_clicks=0, style={'marginRight': 12}), + ], style={'marginLeft': 0}), + ]) + ]), html.Div(className='row', children=[ @@ -785,51 +1424,81 @@ def constuct_layout(self): ], className='three columns', style={'marginLeft': 18, 'marginTop': 90, 'verticalAlign': 'text-top', }), ]), - html.Div(className='row', children=[ - html.Div(id='section_generated_molecules', children=[ - html.Div(className='row', children=[ - html.A('Export to SDF', - id='download-link', - download="rawdata.sdf", - href="/cheminfo/downloadSDF", - target="_blank", - n_clicks=0, - style={'fontSize': '150%'} - ), - html.Div(id='msg_generated_molecules', children=[], - style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}), - ], style={'marginLeft': 0, 'marginBottom': 18, }), - html.Div(id='table_generated_molecules', children=[], style={'width': '100%'}) - ], style={'display': 'none', 'width': '100%'}), - - html.Div(id='section_selected_molecules', children=[ + #html.Div(className='row', children=[ + html.Div(id='section_generated_molecules', children=[ html.Div(className='row', children=[ - html.Div(id='section_display_properties', children=[ - html.Label([ - "Select Molecular Properties", - dcc.Dropdown(id='sl_mol_props', multi=True, - options=[ - {'label': 'alogp', 'value': 'alogp'}], - value=['alogp']), - ])], - className='nine columns'), - html.Div(children=[ - dbc.Button("<", id="bt_page_prev", - style={"height": "25px"}), - 
html.Span(children=1, id='current_page', - style={"paddingLeft": "6px"}), - html.Span(children=' of 1', id='total_page', - style={"paddingRight": "6px"}), - dbc.Button(">", id="bt_page_next", - style={"height": "25px"}) - ], - className='three columns', - style={'verticalAlign': 'text-bottom', 'text-align': 'right'} - ), - ], style={'margin': 12}), - html.Div(id='tb_selected_molecules', children=[], style={'width': '100%'}) - ], style={'display': 'none', 'width': '100%'}), - ], style={'margin': 12}), + html.A('Export to SDF', + id='download-link', + download="rawdata.sdf", + href="/cheminfo/downloadSDF", + target="_blank", + n_clicks=0, + style={'fontSize': '150%'} + ), + html.Div(id='msg_generated_molecules', children=[], + style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}), + ], style={'marginLeft': 0, 'marginBottom': 18, }), + html.Div(id='table_generated_molecules', children=[], style={'width': '100%'}) + ], style={'display': 'none', 'width': '100%'}), + + + html.Div(id='section_generated_molecules_clustered', children=[ + dcc.Graph(id='gen_figure', figure=fig, + #className='nine columns', + #style={'verticalAlign': 'text-top'} + ), + ], style={'display': 'none'}), + + html.Div(id='section_fitting', children=[ + dcc.Graph(id='fitting_figure', figure=fig, + #className='nine columns', + #style={'verticalAlign': 'text-top'} + ), + ], style={'display': 'none'}), + + html.Div(id='section_selected_molecules', children=[ + html.Div(className='row', children=[ + html.Div(id='section_display_properties', children=[ + html.Label([ + "Select Molecular Properties", + dcc.Dropdown(id='sl_mol_props', multi=True, + options=[ + {'label': 'alogp', 'value': 'alogp'}], + value=['alogp']), + ])], + className='nine columns'), + html.Div(children=[ + dbc.Button("<", id="bt_page_prev", + style={"height": "25px"}), + html.Span(children=1, id='current_page', + style={"paddingLeft": "6px"}), + html.Span(children=' of 1', id='total_page', + 
style={"paddingRight": "6px"}), + dbc.Button(">", id="bt_page_next", + style={"height": "25px"}) + ], + className='three columns', + style={'verticalAlign': 'text-bottom', 'text-align': 'right'} + ), + ], style={'margin': 12}), + + html.Div( + id='tb_selected_molecules', + children=[], + style={'width': '100%'} + ) + ], style={'display': 'none', 'width': '100%'}), + + html.Div(id='section_analoguing', children=[ + html.Div(children=[ + html.Div(id='tb_analoguing', children=[], + style={'verticalAlign': 'text-top'} + ), + ]) + ], style={'display': 'none'}), + + #], style={'margin': 12}), + html.Div(id='refresh_main_fig', style={'display': 'none'}), html.Div(id='northstar_cluster', style={'display': 'none'}), @@ -837,9 +1506,10 @@ def constuct_layout(self): html.Div(id='mol_selection_error', style={'display': 'none'}), html.Div(id='show_selected_mol', style={'display': 'none'}), html.Div(id='show_generated_mol', style={'display': 'none'}), - html.Div(id='genration_candidates', style={'display': 'none'}), + html.Div(id='generation_candidates', style={'display': 'none'}), html.Div(id='refresh_moi_prop_table', style={'display': 'none'}), html.Div(id='interpolation_error', style={'display': 'none'}), + html.Div(id='analoguing_candidates', style={'display': 'none'}), # Not displayed but used to keep track of compounds added to checklist of compounds to be analogued html.Div(className='row', children=[ dbc.Modal([ @@ -986,9 +1656,12 @@ def handle_mark_north_star(self, bt_north_star_click, north_star): return ','.join(selected_north_star) @report_ui_error(4) - def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_clicks, - sl_prop_gradient, sl_nclusters, refresh_main_fig, - selected_clusters, selected_points, north_star, refresh_moi_prop_table): + def handle_re_cluster( + self, bt_cluster_clicks, bt_point_clicks, bt_north_star_clicks, + sl_prop_gradient, sl_nclusters, refresh_main_fig, + fingerprint_radius, fingerprint_nBits, + selected_clusters, 
selected_points, north_star, refresh_moi_prop_table + ): comp_id, event_type = self._fetch_event_data() if comp_id == 'sl_nclusters': @@ -998,6 +1671,9 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl raise dash.exceptions.PreventUpdate + if comp_id in ['fingerprint_radius', 'fingerprint_nBits']: + raise dash.exceptions.PreventUpdate + filter_values = None filter_column = None reload_data = False @@ -1018,10 +1694,11 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl elif comp_id == 'bt_north_star' and event_type == 'n_clicks': if north_star: north_star = north_star.split(',') - missing_mols, molregnos, _ = self.cluster_wf.add_molecules(north_star) + missing_mols, molregnos, _ = self.cluster_wf.add_molecules( + north_star, radius=int(fingerprint_radius), nBits=int(fingerprint_nBits)) recluster_data = len(missing_mols) > 0 logger.info("%d missing molecules added...", len(missing_mols)) - logger.debug("Missing molecules werew %s", missing_mols) + logger.debug("Missing molecules were %s", missing_mols) moi_molregno = " ,".join(list(map(str, molregnos))) if refresh_moi_prop_table is None: @@ -1050,6 +1727,9 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl north_stars=moi_molregno, color_col='cluster', reload_data=reload_data, - recluster_data=recluster_data) + recluster_data=recluster_data, + fingerprint_radius=int(fingerprint_radius), + fingerprint_nBits=int(fingerprint_nBits) + ) return figure, ','.join(northstar_cluster), _refresh_moi_prop_table, dash.no_update diff --git a/cuchem/cuchem/wf/cluster/__init__.py b/cuchem/cuchem/wf/cluster/__init__.py index 5c4ff371..f8e37966 100644 --- a/cuchem/cuchem/wf/cluster/__init__.py +++ b/cuchem/cuchem/wf/cluster/__init__.py @@ -24,16 +24,14 @@ def _remove_ui_columns(self, embedding): def _remove_non_numerics(self, embedding): embedding = self._remove_ui_columns(embedding) - - other_props = ['id'] + IMP_PROPS + 
ADDITIONAL_FEILD - # Tempraryly store columns not required during processesing + # Fingerprint columns have the names 0, 1, 2,... + non_numeric_col_names = [col for col in embedding.columns if type(col) != int] + # Temporarily store columns not required during processesing prop_series = {} - for col in other_props: - if col in embedding.columns: - prop_series[col] = embedding[col] + for col in non_numeric_col_names: + prop_series[col] = embedding[col] if len(prop_series) > 0: - embedding = embedding.drop(other_props, axis=1) - + embedding = embedding.drop(non_numeric_col_names, axis=1) return embedding, prop_series def _random_sample_from_arrays(self, *input_array_list, n_samples=None, index=None): @@ -99,7 +97,7 @@ def recluster(self, def add_molecules(self, chemblids: List): """ - ChembleId's accepted as argument to the existing database. Duplicates + ChemblId's accepted as argument to the existing database. Duplicates must be ignored. """ raise NotImplementedError diff --git a/cuchem/cuchem/wf/cluster/gpukmeansumap.py b/cuchem/cuchem/wf/cluster/gpukmeansumap.py index 413360ee..0942ca42 100644 --- a/cuchem/cuchem/wf/cluster/gpukmeansumap.py +++ b/cuchem/cuchem/wf/cluster/gpukmeansumap.py @@ -22,6 +22,8 @@ import cuml import dask import dask_cudf +import sys +from dask.distributed import wait from cuchemcommon.context import Context from cuchemcommon.data import ClusterWfDAO from cuchemcommon.data.cluster_wf import ChemblClusterWfDao @@ -42,43 +44,51 @@ @singledispatch -def _gpu_cluster_wrapper(embedding, n_pca, self): +def _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self): return NotImplemented @_gpu_cluster_wrapper.register(dask.dataframe.core.DataFrame) -def _(embedding, n_pca, self): +def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = dask_cudf.from_dask_dataframe(embedding) - return _gpu_cluster_wrapper(embedding, n_pca, self) + return _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self) 
@_gpu_cluster_wrapper.register(cudf.DataFrame) -def _(embedding, n_pca, self): +def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = dask_cudf.from_cudf(embedding, - chunksize=int(embedding.shape[0] * 0.1)) - return _gpu_cluster_wrapper(embedding, n_pca, self) + chunksize=max(10, int(embedding.shape[0] * 0.1))) + return _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self) @_gpu_cluster_wrapper.register(dask_cudf.core.DataFrame) -def _(embedding, n_pca, self): +def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = embedding.persist() - return self._cluster(embedding, n_pca) + return self._cluster(embedding, n_pca, reuse_umap, reuse_pca) class GpuKmeansUmap(BaseClusterWorkflow, metaclass=Singleton): - + # TODO: support changing fingerprint radius and nBits in other kmeans workflows as well (hybrid, random projection) def __init__(self, n_molecules: int = None, dao: ClusterWfDAO = ChemblClusterWfDao(MorganFingerprint), pca_comps=64, n_clusters=7, - seed=0): + seed=0, + fingerprint_radius=2, + fingerprint_nBits=512 + ): super().__init__() - self.dao = dao + self.dao = dao if dao is not None else ChemblClusterWfDao( + MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits + ) + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits self.n_molecules = n_molecules self.pca_comps = pca_comps self.pca = None + self.umap = None self.n_clusters = n_clusters self.df_embedding = None @@ -87,12 +97,12 @@ def __init__(self, self.n_spearman = 5000 self.n_silhouette = 500000 - def _cluster(self, embedding, n_pca): + def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=True): """ Generates UMAP transformation on Kmeans labels generated from molecular fingerprints. 
""" - + reuse_umap = reuse_umap and reuse_pca dask_client = self.context.dask_client embedding = embedding.reset_index() @@ -106,15 +116,17 @@ def _cluster(self, embedding, n_pca): if n_pca and n_obs > n_pca: with MetricsLogger('pca', self.n_molecules) as ml: - if self.pca is None: + if (self.pca is None) or not reuse_pca: self.pca = cuDaskPCA(client=dask_client, n_components=n_pca) self.pca.fit(embedding) + else: + logger.info(f'Using available pca') embedding = self.pca.transform(embedding) embedding = embedding.persist() with MetricsLogger('kmeans', self.n_molecules) as ml: - if self.n_molecules < MIN_RECLUSTER_SIZE: - raise Exception('Reclustering less than %d molecules is not supported.' % MIN_RECLUSTER_SIZE) + if self.n_molecules < self.n_clusters: # < MIN_RECLUSTER_SIZE: + raise Exception('Reclustering {self.n_molecules} molecules into {self.n_clusters} clusters not supported.')# % MIN_RECLUSTER_SIZE) kmeans_cuml = cuDaskKMeans(client=dask_client, n_clusters=self.n_clusters) @@ -137,13 +149,18 @@ def _cluster(self, embedding, n_pca): local_model = cuUMAP() local_model.fit(X_train) - umap_model = cuDaskUMAP(local_model, - n_neighbors=100, - a=1.0, - b=1.0, - learning_rate=1.0, - client=dask_client) - Xt = umap_model.transform(embedding) + if not (reuse_umap and self.umap): + self.umap = cuDaskUMAP( + local_model, + n_neighbors=100, + a=1.0, + b=1.0, + learning_rate=1.0, + client=dask_client + ) + else: + logger.info(f'reusing {self.umap}') + Xt = self.umap.transform(embedding) ml.metric_name = 'spearman_rho' ml.metric_func = self._compute_spearman_rho @@ -165,30 +182,55 @@ def _cluster(self, embedding, n_pca): return embedding - def cluster(self, df_mol_embedding=None): - - logger.info("Executing GPU workflow...") - - if df_mol_embedding is None: + def cluster( + self, + df_mol_embedding=None, + reuse_umap=False, + reuse_pca=True, + fingerprint_radius=2, + fingerprint_nBits=512 + ): + + logger.info(f"GpuKmeansUmap.cluster(radius={fingerprint_radius}, 
nBits={fingerprint_nBits}), df_mol_embedding={df_mol_embedding}") + sys.stdout.flush() + if (df_mol_embedding is None) or (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): self.n_molecules = self.context.n_molecule - + self.dao = ChemblClusterWfDao( + MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits) + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits + logger.info(f'dao={self.dao}, getting df_mol_embedding...') df_mol_embedding = self.dao.fetch_molecular_embedding( self.n_molecules, cache_directory=self.context.cache_directory, + radius=fingerprint_radius, + nBits=fingerprint_nBits ) - df_mol_embedding = df_mol_embedding.persist() - - self.df_embedding = _gpu_cluster_wrapper(df_mol_embedding, - self.pca_comps, - self) + wait(df_mol_embedding) + + self.df_embedding = _gpu_cluster_wrapper( + df_mol_embedding, + self.pca_comps, + reuse_umap, + reuse_pca, + self + ) return self.df_embedding def recluster(self, filter_column=None, filter_values=None, - n_clusters=None): + n_clusters=None, + fingerprint_radius=2, + fingerprint_nBits=512 + ): + + # The user may have changed the fingerprint specification, in which case, we cannot reuse the embeddings + if (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): + return self.cluster(df_mol_embedding=None, reuse_umap=False, reuse_pca=False, fingerprint_radius=fingerprint_radius, fingerprint_nBits=fingerprint_nBits) + logger.info(f"recluster(radius={fingerprint_radius}, nBits={fingerprint_nBits}): reusing embedding") df_embedding = self.df_embedding if filter_values is not None: filter = df_embedding[filter_column].isin(filter_values) @@ -199,11 +241,11 @@ def recluster(self, if n_clusters is not None: self.n_clusters = n_clusters - self.df_embedding = _gpu_cluster_wrapper(df_embedding, None, self) + self.df_embedding = _gpu_cluster_wrapper(df_embedding, None, False, True, 
self) return self.df_embedding - def add_molecules(self, chemblids: List): + def add_molecules(self, chemblids: List, radius=2, nBits=512): chemblids = [x.strip().upper() for x in chemblids] chem_mol_map = {row[0]: row[1] for row in self.dao.fetch_id_from_chembl(chemblids)} @@ -245,8 +287,6 @@ def add_molecules(self, chemblids: List): if hasattr(self.df_embedding, 'compute'): self.df_embedding = self.df_embedding.compute() - logger.info(self.df_embedding.shape) - return chem_mol_map, molregnos, self.df_embedding @@ -264,7 +304,7 @@ def __init__(self, n_clusters=n_clusters, seed=seed) - def _cluster(self, embedding, n_pca): + def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=False): """ Generates UMAP transformation on Kmeans labels generated from molecular fingerprints. @@ -284,7 +324,7 @@ def _cluster(self, embedding, n_pca): if n_pca and n_obs > n_pca: with MetricsLogger('pca', self.n_molecules) as ml: - if self.pca == None: + if (self.pca == None) or not reuse_pca: self.pca = cuml.PCA(n_components=n_pca) self.pca.fit(embedding) embedding = self.pca.transform(embedding) @@ -308,8 +348,8 @@ def _cluster(self, embedding, n_pca): ml.metric_func_args = (embedding_sample, kmeans_labels_sample) with MetricsLogger('umap', self.n_molecules) as ml: - umap = cuml.manifold.UMAP() - Xt = umap.fit_transform(embedding) + self.umap = cuml.manifold.UMAP() + Xt = self.umap.fit_transform(embedding) ml.metric_name = 'spearman_rho' ml.metric_func = self._compute_spearman_rho diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index 1437ce97..831313c0 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -1,4 +1,5 @@ import logging +import os import grpc import pandas as pd @@ -12,7 +13,25 @@ from cuchemcommon.utils.singleton import Singleton from cuchemcommon.workflow import BaseGenerativeWorkflow +# Check if all these are needed: +from 
cuchemcommon.fingerprint import MorganFingerprint +import torch +import torch.nn +from torch.utils.data import Dataset, DataLoader +import cupy as cp +import pickle +from pathlib import Path +import numpy as np +from functools import partial +from rdkit import Chem +from rdkit.Chem import MolFromSmiles, RDKFingerprint +from rdkit.DataStructs import FingerprintSimilarity +from cuml import Lasso, Ridge #LinearRegression +from cuml.metrics import mean_squared_error +from math import sqrt + logger = logging.getLogger(__name__) +PAD_TOKEN = 0 # TODO: use tokenizer.pad_token instead class MegatronMolBART(BaseGenerativeWorkflow): @@ -20,7 +39,10 @@ class MegatronMolBART(BaseGenerativeWorkflow): def __init__(self, dao: GenerativeWfDao = ChemblGenerativeWfDao(None)) -> None: super().__init__(dao) - + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' self.min_jitter_radius = 1 self.channel = grpc.insecure_channel('megamolbart:50051') self.stub = GenerativeSamplerStub(self.channel) @@ -64,7 +86,13 @@ def find_similars_smiles(self, num_requested: int = 10, scaled_radius=None, force_unique=False, - sanitize=True): + sanitize=True, + compound_id=None): + if isinstance(compound_id, list): + # Sometimes calling routine may send a list of length one containing the compound ID + if len(compound_id) > 1: + logger.info(f'find_similars_smiles received {compound_id}, generating neighbors only for first compound!') + compound_id = compound_id[0] spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -78,21 +106,32 @@ def find_similars_smiles(self, for embedding in result.embeddings: embeddings.append(list(embedding.embedding)) dims.append(embedding.dim) - - generated_df = pd.DataFrame({'SMILES': generatedSmiles, - 'embeddings': embeddings, - 'embeddings_dim': dims, - 'Generated': [True for i in range(len(generatedSmiles))]}) - generated_df['Generated'].iat[0] = False - + if not compound_id: + compound_id = 
'source' + generated_df = pd.DataFrame({ + 'SMILES': generatedSmiles, + 'embeddings': embeddings, + 'embeddings_dim': dims, + 'Generated': [False] + [True] * (len(generatedSmiles) - 1), + 'id': [ + str(compound_id)] + [f'{compound_id}-g{i + 1}' + for i in range(len(generatedSmiles) - 1) + ], + }) return generated_df - def interpolate_smiles(self, - smiles: List, - num_points: int = 10, - scaled_radius=None, - force_unique=False, - sanitize=True): + def interpolate_smiles( + self, + smiles: List, + num_points: int = 10, + scaled_radius=None, + force_unique=False, + compound_ids=[], + sanitize=True + ): + if len(compound_ids) == 0: + compound_ids = [f'source{i}' for i in range(len(smiles))] + spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -102,9 +141,405 @@ def interpolate_smiles(self, result = self.stub.Interpolate(spec) result = list(result.generatedSmiles) - - generated_df = pd.DataFrame({'SMILES': result, - 'Generated': [True for i in range(len(result))]}) - generated_df.iat[0, 1] = False - generated_df.iat[-1, 1] = False + n_pairs = len(compound_ids) - 1 + n_generated = num_points + 2 + n_generated_total = n_generated * n_pairs + assert len(result) == n_generated_total, f"Expected generator to return {n_generated} compounds between each of the {n_pairs} compound-pairs but got {len(result)}" + generated_df = pd.DataFrame({ + 'SMILES': result, + 'Generated': [ + i % n_generated not in [0, n_generated - 1] + for i in range(n_generated_total) + ], + 'id': [ + str(compound_ids[i // n_generated]) if i % n_generated == 0 + else str(compound_ids[1 + i // n_generated]) if i % n_generated == n_generated - 1 + else f'{compound_ids[i // n_generated]}-{compound_ids[1 + i // n_generated]}_i{i % n_generated}' + for i in range(n_generated_total) + ], + }) + #generated_df = pd.DataFrame({'SMILES': result, + # 'Generated': [True for i in range(len(result))]}) + #generated_df.iat[0, 1] = False + #generated_df.iat[-1, 1] = False 
return generated_df + + + def extrapolate_from_cluster(self, + compounds_df, + compound_property: str, + cluster_id: int = 0, + n_compounds_to_transform=10, + num_points: int = 10, + step_size: float = 0.01, + force_unique = False, + scaled_radius: int = 1): + """ + The embedding vector is calculated for the specified cluster_id and applied over it. + TO DO: We should have a table of direction vectors in embedded space listed, just like the list of compound IDs. + The user should choose one to be applied to the selected compounds, or to a cluster number. + """ + smiles_list = None + radius = self._compute_radius(scaled_radius) + # TO DO: User must be able to extrapolate directly from smiles in the table; + # these may themselves be generated compounds without any chemblid. + df_cluster = compounds_df[ compounds_df['cluster'] == int(cluster_id) ].dropna().reset_index(drop=True) + if hasattr(df_cluster, 'compute'): + df_cluster = df_cluster.compute() + if 'transformed_smiles' in df_cluster: + smiles_col = 'transformed_smiles' + elif 'SMILES' in df_cluster: + smiles_col = 'SMILES' + elif 'smiles' in df_cluster: + smiles_col = 'smiles' + else: + raise RuntimeError(f'No smiles column in df_cluster: {list(df_cluster.columns)}\n{df_cluster.head()}') + # NOTE(review): removed erroneous 'smiles_col = None' here — it clobbered the column name detected above + smiles_list = df_cluster[smiles_col].to_array() + return self.extrapolate_from_smiles(smiles_list, + compound_property_vals=df_cluster[compound_property].to_gpu_array(), #to_array(), #[:n_compounds_to_transform].to_array(), + num_points=num_points, + n_compounds_to_transform=n_compounds_to_transform, + step_size=step_size, + scaled_radius=radius, + force_unique=force_unique, + id_list=df_cluster['id'].to_array()) + + def _get_embedding_direction(self, + embedding_list, + compound_property_vals, + ): + """ + Get the embedding of all compounds in the specified cluster.
+ Then perform a linear regression against the compound_property to find the direction in + embedded space along which the compound_property tends to increase. + Using the minimum and maximum values of the compound_property in the cluster to define the range, + compute the step size along the direction that is expected to increase the compound_property value by step_percentage. + """ + + logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)},'\ + f' prop range: {max(compound_property_vals)} - {min(compound_property_vals)}.\nFitting Lasso regression...') + n_data = compound_property_vals.shape[0] + n_dimensions = embedding_list[0].shape[0] + reg = Lasso() + reg = reg.fit(embedding_list, compound_property_vals) + n_zero_coefs = len([x for x in reg.coef_ if x == 0.0]) + zero_coef_indices = [i for i, x in enumerate(reg.coef_) if x != 0.0] + logger.info(f'coef: {n_zero_coefs} / {len(reg.coef_)} coefficients are zero (in some positions between {min(zero_coef_indices)} and {max(zero_coef_indices)});'\ + f' range: {reg.coef_.argmin()}: {min(reg.coef_)} to {reg.coef_.argmax()}: {max(reg.coef_)}') + + y_pred = reg.predict(embedding_list) + rmse = sqrt(mean_squared_error(compound_property_vals, y_pred.astype('float64'))) + pearson_rho = cp.corrcoef(compound_property_vals, y_pred) + emb_std = np.std(embedding_list, axis=0) + logger.info(f'_get_embedding_direction: n={len(compound_property_vals)}, rho={pearson_rho}, rmse={rmse}, embedding_list.std: {emb_std}') + + emb_max = embedding_list[ np.argmax(compound_property_vals) ] + emb_min = embedding_list[ np.argmin(compound_property_vals) ] + diff_size = np.linalg.norm(emb_max - emb_min) / sqrt(n_dimensions) + #logger.info(f'compound_property_vals: [{np.argmin(compound_property_vals)}]={np.amin(compound_property_vals)}, [{np.argmax(compound_property_vals)}]={np.amax(compound_property_vals)}, diff_size={diff_size}') + return
reg.coef_, emb_std, diff_size + + + def extrapolate_from_smiles(self, + smiles_list, + compound_property_vals, + num_points: int, + step_size: float, + scaled_radius=None, + force_unique=False, + n_compounds_to_transform=10, + id_list=[], + debug=False): + """ + Given a list of smiles strings, convert each to its embedding. + Then take num_points steps in the specified direction (in the embedded space) of size step_size. + Convert these points on the embedded space back to smiles strings and return as a dataframe. + Modify duplicates if force_unique is True by adding a jitter of magnitude radius to the embedding. + """ + # TODO: generated compounds are the same no matter what the step-size is, check code!!!! + # TODO: generated compounds are yielding different Tanimotos even though they are identical. Bug or jitter??? + step_size = float(step_size) + n_compounds_to_transform = int(n_compounds_to_transform) + if len(id_list) == 0: + id_list = list(map(str, range(len(smiles_list)))) + logger.info(f'molbart: extrapolate_from_smiles: {len(smiles_list)} smiles ({type(smiles_list)}), {num_points} extrapolations each with step_size {step_size}') + data = pd.DataFrame({'transformed_smiles': smiles_list}) + full_mask = None + emb_shape = None + n_recovered = 0 + avg_tani = 0 + embeddings = [] + for i, smiles in enumerate(smiles_list): + spec = GenerativeSpec( + model=GenerativeModel.MegaMolBART, + smiles=smiles, + ) + result = self.stub.SmilesToEmbedding(spec) + emb = result.embedding + mask = result.pad_mask + emb_shape = result.dim + if debug: + spec = EmbeddingList( + embedding=emb, + dim=emb_shape, + pad_mask=mask + ) + generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles + if len(generated_mols) > 0: + n_recovered += 1 + tani = FingerprintSimilarity(RDKFingerprint(MolFromSmiles(smiles)), RDKFingerprint(MolFromSmiles(generated_mols[0]))) + logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{mask.shape} -->
{generated_mols} (tani={tani:.2f})') + avg_tani += tani + logger.info(f'emb: {type(emb)}, dim={emb_shape}, mask={len(mask)}, emb={len(emb)}') + embeddings.append(torch.tensor(emb)) #.detach().reshape(-1)) #torch tensor + if full_mask is None: + logger.info(f'First mask = {mask}') + full_mask = mask + emb_shape = emb_shape # n_tokens x 1 x 256 + else: + full_mask = [a and b for a, b in zip(full_mask, mask)] # not used any more + if debug: + logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if n_recovered > 0 else 0}') + + embeddings = torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True, padding_value=PAD_TOKEN) # n_smiles x embedding_length + n_embedding_tokens = int(embeddings.shape[1] / (emb_shape[1] * emb_shape[2])) + emb_shape = [n_embedding_tokens, emb_shape[1], emb_shape[2]] + embeddings = cp.asarray(embeddings) + full_mask = [False] * n_embedding_tokens + + # Use the entire cluster to infer the direction: + direction, emb_std, diff_size = self._get_embedding_direction(embeddings, compound_property_vals) + if diff_size == 0.0: + logger.info(f'Increasing diff_size from 0.0 to 1e-6') + diff_size = 1e-6 + + # But apply the transform to no more than n_compounds_to_transform, chosen at random + if n_compounds_to_transform < len(smiles_list): + indices = np.random.choice(list(range(len(smiles_list))), size=n_compounds_to_transform, replace=False) + smiles_list = [smiles_list[i] for i in indices] + embeddings = cp.asarray([embeddings[i,:] for i in indices]) + id_list = [id_list[i] for i in indices] + + result_df_list = [ pd.DataFrame({'SMILES': smiles_list, 'Generated': False, 'id': id_list}) ] + + for step_num in range(1, 1 + num_points): + direction_sampled = cp.random.normal(loc=direction, scale=emb_std, size=emb_std.shape) #direction + noise + step = float(step_num * diff_size * step_size) * direction_sampled + extrap_embeddings = embeddings + step # TODO: print and check 
output + #logger.info(f'step ({step_num} * {diff_size} * {step_size} * direction_sampled): {type(step)}, {step.shape}, {step}\n:extrap_embeddings: {type(extrap_embeddings)}, {extrap_embeddings.shape}, extrap_embeddings[0]={extrap_embeddings[0]}') + smiles_gen_list = [] + ids_interp_list = [] + for i in range(len(extrap_embeddings)): + extrap_embedding = list(extrap_embeddings[i,:]) + spec = EmbeddingList( + embedding=extrap_embedding, + dim=emb_shape, + pad_mask=full_mask + ) + smiles_gen = self.stub.EmbeddingToSmiles(spec).generatedSmiles[0] + logger.info(f'{i}: {smiles_gen}') + smiles_gen_list.append(smiles_gen) + ids_interp_list.append(f'{id_list[i]}-s{step_num}') + extrap_df = pd.DataFrame({ + 'SMILES': smiles_gen_list, + 'Generated': True, + 'id': ids_interp_list + }) + if force_unique: + inv_transform_funct = partial(self.inverse_transform, + mem_pad_mask=full_mask) + extrap_df = self.compute_unique_smiles(extrap_df, + smiles_gen, + inv_transform_funct, + radius=radius) + logger.info(f'step_num={step_num} yielded {len(extrap_df)} compounds') + result_df_list.append(extrap_df) + results_df = pd.concat(result_df_list, ignore_index=True) + results_df['id'] = results_df['id'].apply(str) + results_df.sort_values('id', inplace=True) + results_df.reset_index(drop=True, inplace=True) + return results_df + + def fit_nn( + self, + compounds_df, + compound_property, + cluster_id_train, + cluster_id_test, + hidden_layer_sizes, + activation_fn, + final_activation_fn, + max_epochs, + batch_size=32, + learning_rate=0.001, + weight_decay=0.0001, + debug=False, + #scaled_radius=None + ): + """ + Convert compound SMILES to embeddings, then train a neural network with n_layers hidden layers with the specified activation function (activation_fn) + to predict the specified compound_property of cluster_id_train. Evaluate the model on cluster_id_test. Return actual and predicted values for both + the train and test set. 
+ """ + logger.info(f'cluster_id_train={cluster_id_train}, cluster_id_test={cluster_id_test}, compound_property={compound_property}, compounds_df: {len(compounds_df)}, {type(compounds_df)}') + df_train = compounds_df[ compounds_df['cluster'] == int(cluster_id_train) ].dropna().reset_index(drop=True)#.compute() + if hasattr(df_train, 'compute'): + df_train = df_train.compute() + df_test = compounds_df[ compounds_df['cluster'] == int(cluster_id_test) ].dropna().reset_index(drop=True)#.compute() + if hasattr(df_test, 'compute'): + df_test = df_test.compute() + n_train = len(df_train) + n_test = len(df_test) + + smiles_list = np.concatenate((df_train['transformed_smiles'].to_array(), df_test['transformed_smiles'].to_array()), axis=0) + logger.info(f'smiles_list: {smiles_list.shape}') + pad_length = max(map(len, smiles_list)) + 2 # add 2 for start / stop + embeddings = [] + emb_shape = None + n_recovered = 0 + avg_tani = 0 + + for i, smiles in enumerate(smiles_list): + spec = GenerativeSpec( + model=GenerativeModel.MegaMolBART, + smiles=smiles, + #radius=radius + ) + result = self.stub.SmilesToEmbedding(spec) + emb = result.embedding + mask = result.pad_mask + dim = result.dim + emb_shape = result.dim + if debug: + spec = EmbeddingList( + embedding=emb, + dim=emb_shape, + pad_mask=mask + ) + generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles + if len(generated_mols) > 0: + m = MolFromSmiles(generated_mols[0]) + if m is not None: + n_recovered += 1 + tani = FingerprintSimilarity(RDKFingerprint(MolFromSmiles(smiles)), RDKFingerprint(m)) + logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{len(mask)} --> {generated_mols} (tani={tani:.2f})') + avg_tani += tani + embeddings.append(torch.tensor(emb, device=self.device)) #emb.detach().reshape(-1)) #torch tensor + + if debug: + logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if 
n_recovered > 0 else 0}') + + embeddings = torch.nn.utils.rnn.pad_sequence( + embeddings, batch_first=True, padding_value=PAD_TOKEN) + embeddings_train = embeddings[:n_train,:] + embeddings_test = embeddings[n_train:,:] + compound_property_vals_train = torch.tensor(df_train[compound_property], device=self.device, dtype=torch.float32) + compound_property_vals_test = torch.tensor(df_test[compound_property], device=self.device, dtype=torch.float32) + train_pred, test_pred = self._build_and_train_nn( + embeddings_train, + compound_property_vals_train, + embeddings_test, + compound_property_vals_test, + hidden_layer_sizes = hidden_layer_sizes, + activation_fn=activation_fn, + final_activation_fn=final_activation_fn, + max_epochs=max_epochs, + learning_rate=learning_rate, + weight_decay=weight_decay, + batch_size=batch_size + ) + df = pd.DataFrame({ + 'x': torch.cat((compound_property_vals_train, compound_property_vals_test), axis=0).to('cpu').numpy(), + 'y': torch.cat((train_pred.detach(), test_pred.detach()), axis=0).to('cpu').flatten().numpy(), + 'cluster': np.concatenate((df_train['cluster'].to_array(), df_test['cluster'].to_array()), axis=0), + 'id': np.concatenate((df_train['id'].to_array(), df_test['id'].to_array()), axis=0), + 'train_set': [True] * n_train + [False] * n_test + }) + return df + + def _build_and_train_nn(self, + embedding_list_train, + compound_property_vals_train, + embedding_list_test, + compound_property_vals_test, + hidden_layer_sizes = [], + activation_fn='LeakyReLU', + final_activation_fn='LeakyReLU', + max_epochs=10, + batch_size=32, + learning_rate=0.001, + weight_decay=0.0001 + ): + """ + Construct a neural network with the specified number of layers, using the specified activation function. + Then train it on the training set and evaluate on the test set. Return results. 
+ """ + + logger.info(f'_build_and_train_nn: emb_train:{embedding_list_train.shape}, {type(embedding_list_train)}, embedding_list_train[0]:{len(embedding_list_train[0])},'\ + f' prop:{compound_property_vals_train.shape}, {type(compound_property_vals_train)},'\ + f' prop_train: {min(compound_property_vals_train)} - {max(compound_property_vals_train)}') + n_data_train = compound_property_vals_train.shape[0] + n_dimensions = embedding_list_train[0].shape[0] + comp_net = CompoundNet(n_dimensions, hidden_layer_sizes, activation_fn, final_activation_fn).to(self.device) + logger.info(comp_net) + loss_fn = torch.nn.SmoothL1Loss() + opt = torch.optim.Adam(comp_net.parameters(), lr=learning_rate, weight_decay=weight_decay) + train_set = CompoundData(embedding_list_train, compound_property_vals_train) + loader = DataLoader(train_set, batch_size=batch_size, shuffle=True) + + i = 0 + for epoch in range(max_epochs): + total_loss = 0.0 + for compounds, properties in loader: + opt.zero_grad() + predictions = comp_net(compounds) + loss = loss_fn(predictions, properties) + loss.backward() + opt.step() + total_loss += loss.item() + i += 1 + logger.info(f'epoch {epoch+1}, {i} batches: {total_loss / (n_data_train * (epoch+1))}') + + comp_net.eval() + train_pred = comp_net(embedding_list_train) + test_pred = comp_net(embedding_list_test) + + return train_pred, test_pred + + +class CompoundData(Dataset): + + def __init__(self, compounds, properties): + self.compounds = compounds + self.properties = properties + + def __len__(self): + return len(self.compounds) + + def __getitem__(self, compound_index): + return self.compounds[compound_index,:], self.properties[compound_index] + + +class CompoundNet(torch.nn.Module): + + def __init__(self, n_input_features, hidden_layer_sizes, activation_fn, last_activation_fn=None): + super(CompoundNet, self).__init__() + hidden_layer_sizes.append(1) # output layer size is appended to hidden layer sizes + layers = [torch.nn.Linear(n_input_features, 
hidden_layer_sizes[0])] + try: + activation = getattr(torch.nn, activation_fn) + if last_activation_fn: + last_activation = getattr(torch.nn, last_activation_fn) + except Exception as e: + raise UserError(f'Activation function name {activation_fn} / {last_activation_fn} not recognized') + for i, hidden_layer_size in enumerate(hidden_layer_sizes[:-1]): + layers.append(activation()) + layers.append(torch.nn.Linear(hidden_layer_size, hidden_layer_sizes[i + 1])) + if last_activation_fn: + # Having a non-linear function right before the output may not be needed for some properties being predicted + layers.append(last_activation()) + self.layers = torch.nn.Sequential(*layers) + + def forward(self, x): + return self.layers(x) diff --git a/cuchem/requirements.txt b/cuchem/requirements.txt index 5cdeaa6d..fc6cce1e 100644 --- a/cuchem/requirements.txt +++ b/cuchem/requirements.txt @@ -24,4 +24,4 @@ plotly==4.9.0 pytest==6.2.2 umap-learn==0.5.1 grpcio -git+https://github.com/jrwnter/cddd.git@1.0 \ No newline at end of file +git+https://github.com/jrwnter/cddd.git@1.0 diff --git a/cuchem/startdash.py b/cuchem/startdash.py index e2ca5355..9a92167c 100755 --- a/cuchem/startdash.py +++ b/cuchem/startdash.py @@ -168,9 +168,19 @@ def cache(self): from cuchemcommon.fingerprint import Embeddings prepocess_type = Embeddings + # TODO: when loading precomputed fingerprints, the radius and size should be specified + # For now, we are hard-coding this information: + nBits = 512 + radius = 2 chem_data = ChEmblData(fp_type=prepocess_type) + subdir = f'{args.cache_directory}/fp_r{radius}_n{nBits}' + if not os.path.isdir(subdir): + os.mkdir(subdir) + + logging.info(f'client: saving fingerprints') + # This will trigger a reread if fingerprints are not found in the cache directory! 
         chem_data.save_fingerprints(
-            os.path.join(args.cache_directory, FINGER_PRINT_FILES), num_recs = args.n_mol,
+            os.path.join(subdir, FINGER_PRINT_FILES), num_recs = args.n_mol,
             batch_size=args.batch_size)
 
         logger.info('Fingerprint generated in (hh:mm:ss.ms) {}'.format(
diff --git a/megamolbart/megamolbart/inference.py b/megamolbart/megamolbart/inference.py
index 7dab48e5..80c727da 100644
--- a/megamolbart/megamolbart/inference.py
+++ b/megamolbart/megamolbart/inference.py
@@ -236,6 +236,14 @@ def find_similars_smiles(self,
             dims.append(tuple(neighboring_embedding.shape))
             embeddings.append(neighboring_embedding.flatten().tolist())
 
+        # Rest of the applications and libraries use RAPIDS and cuPY libraries.
+        # For interoperability, we need to convert the embeddings to cupy.
+        embeddings = []
+        dims = []
+        for neighboring_embedding in neighboring_embeddings:
+            dims.append(tuple(neighboring_embedding.shape))
+            embeddings.append(neighboring_embedding.flatten().tolist())
+
         generated_df = pd.DataFrame({'SMILES': generated_mols,
                                      'embeddings': embeddings,
                                      'embeddings_dim': dims,
@@ -296,9 +304,9 @@ def interpolate_smiles(self,
             result_df.append(interp_df)
 
         result_df = pd.concat(result_df)
-        smile_list = list(result_df['SMILES'])
+        smiles_list = list(result_df['SMILES'])
 
-        return result_df, smile_list
+        return result_df, smiles_list
 
     def compute_unique_smiles(self,
                               interp_df,
diff --git a/megamolbart/megamolbart/service.py b/megamolbart/megamolbart/service.py
index eb16d786..db17fed9 100644
--- a/megamolbart/megamolbart/service.py
+++ b/megamolbart/megamolbart/service.py
@@ -24,9 +24,9 @@ def __init__(self, *args, **kwargs):
 
     # TODO how to handle length overrun for batch processing --> see also MegaMolBART.load_model in inference.py
     def SmilesToEmbedding(self, spec, context):
-        smile_str = ''.join(spec.smiles)
+        smiles_str = ''.join(spec.smiles)
 
-        embedding, pad_mask = self.megamolbart.smiles2embedding(smile_str,
+        embedding, pad_mask = self.megamolbart.smiles2embedding(smiles_str,
                                                                 pad_length=spec.padding)
         dim = embedding.shape
         embedding = embedding.flatten().tolist()
@@ -51,10 +51,10 @@ def EmbeddingToSmiles(self, embedding_spec, context):
 
     def FindSimilars(self, spec, context):
-        smile_str = ''.join(spec.smiles)
+        smiles_str = ''.join(spec.smiles)
 
         generated_df = self.megamolbart.find_similars_smiles(
-            smile_str,
+            smiles_str,
             num_requested=spec.numRequested,
             scaled_radius=spec.radius,
             sanitize=spec.sanitize,
diff --git a/megamolbart/scripts/download_model.sh b/megamolbart/scripts/download_model.sh
new file mode 100755
index 00000000..c77446f6
--- /dev/null
+++ b/megamolbart/scripts/download_model.sh
@@ -0,0 +1,14 @@
+MEGAMOLBART_MODEL_VERSION=0.1
+MEGAMOLBART_MODEL_PATH=/models/megamolbart
+
+DOWNLOAD_URL="https://api.ngc.nvidia.com/v2/models/nvidia/clara/megamolbart/versions/${MEGAMOLBART_MODEL_VERSION}/zip"
+echo -e "${YELLOW}Downloading model megamolbart to ${MEGAMOLBART_MODEL_PATH}...${RESET}"
+
+mkdir -p ${MEGAMOLBART_MODEL_PATH}
+set -x
+wget -q --show-progress \
+    --content-disposition ${DOWNLOAD_URL} \
+    -O ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip
+mkdir -p ${MEGAMOLBART_MODEL_PATH}
+unzip -q ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip \
+    -d ${MEGAMOLBART_MODEL_PATH}
diff --git a/megamolbart/tests/test_grpc.py b/megamolbart/tests/test_grpc.py
index 4c58e44a..9c7f71d8 100644
--- a/megamolbart/tests/test_grpc.py
+++ b/megamolbart/tests/test_grpc.py
@@ -10,6 +10,7 @@
 from megamolbart.service import GenerativeSampler
 import generativesampler_pb2
 import generativesampler_pb2_grpc
+from util import (DEFAULT_NUM_LAYERS, DEFAULT_D_MODEL, DEFAULT_NUM_HEADS, CHECKPOINTS_DIR)
 
 logger = logging.getLogger(__name__)
 
@@ -29,6 +30,14 @@ def similarity(add_server_method, service_cls, stub_cls):
     finally:
         server.stop(None)
 
+def test_fetch_iterations():
+    sys.argv = [sys.argv[0]]
+    with similarity(generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server,
+                    GenerativeSampler,
generativesampler_pb2_grpc.GenerativeSamplerStub) as stub: + + result = stub.GetIteration(generativesampler_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + def test_dataframe_similar(): sys.argv = [sys.argv[0]] diff --git a/setup/ansible-nvidia-driver.yml b/setup/ansible-nvidia-driver.yml index b4aba4a3..6ea2074f 100644 --- a/setup/ansible-nvidia-driver.yml +++ b/setup/ansible-nvidia-driver.yml @@ -5,12 +5,12 @@ tasks: - name: Add CUDA apt-key apt_key: - url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub + url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub state: present - name: Add CUDA apt repository apt_repository: - repo: 'deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /' + repo: 'deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /' state: present filename: nvidia update_cache: yes @@ -27,9 +27,9 @@ state: present update_cache: yes with_items: - - 'deb https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/amd64/ /' - - 'deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu18.04/amd64/ /' - - 'deb https://nvidia.github.io/nvidia-docker/ubuntu18.04/amd64/ /' + - 'deb https://nvidia.github.io/libnvidia-container/stable/ubuntu20.04/amd64/ /' + - 'deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu20.04/amd64/ /' + - 'deb https://nvidia.github.io/nvidia-docker/ubuntu20.04/amd64/ /' register: nvidia_container_runtime_apt_repo - name: Install Nvidia Driver diff --git a/setup/env.sh b/setup/env.sh index a4bda685..aa45ff0d 100644 --- a/setup/env.sh +++ b/setup/env.sh @@ -212,4 +212,4 @@ download_model() { ln -nsf ${MODEL_PATH}/megamolbart_v${MEGAMOLBART_MODEL_VERSION} ${MODEL_PATH}/megamolbart set +e -} +} \ No newline at end of file diff --git a/setup/launch b/setup/launch index 2da4998b..c6a89650 100755 --- a/setup/launch +++ b/setup/launch @@ -79,7 +79,6 @@ start() { validate_docker 
# run a container and start dash inside container. echo "${CUCHEM_CONT} ${MEGAMOLBART_CONT}" - export ADDITIONAL_PARAM="$@" download_model @@ -102,7 +101,7 @@ start() { stop() { - docker-compose --env-file .env \ + ./docker-compose --env-file .env \ -f docker_compose.yml \ --project-directory . \ down