From 0d524e41df92d707fee4e5cb9c8fbb5579ed48eb Mon Sep 17 00:00:00 2001
From: Rajesh Ilango
Date: Fri, 27 Aug 2021 10:49:12 -0700
Subject: [PATCH 01/27] Changes to improve runtime performance of benchmark
 tests.

All results from the generative models are now stored in a SQLite database,
which cuts the total number of requests to the MegaMolBART gRPC service by
three quarters. Additionally, the training dataset is loaded into a SQLite
database and used while computing the novelty metric, so a novelty check now
takes about 2 ms.

Other changes include:
- Upgrade to RAPIDS 21.08
- Clean up the Dockerfile to remove all workarounds
- Remove the need for a conda cuchem env inside the container
- Ability to select the docker image to build using the launch script
- Add smiles_to_embedding and embedding_to_smiles to cddd
- Use hydra for benchmark configuration
---
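The core of this patch is a fetch-or-generate cache around the sampler: each
(smiles, num_samples, scaled_radius, force_unique, sanitize) key is looked up
in SQLite first, and the MegaMolBART gRPC service is called only on a miss.
A minimal sketch of the pattern against the schema in
cuchem/benchmark/scripts/benchmark.sql -- the `generate` callable is a
hypothetical stand-in for the gRPC client, and the force_unique/sanitize key
columns are elided for brevity:

    import pickle
    import sqlite3

    def sample_cached(conn, generate, smiles, num_samples, scaled_radius):
        """Return generated SMILES for a key, calling the service only on a miss."""
        cur = conn.cursor()
        cur.execute('SELECT id FROM smiles WHERE smiles=? AND num_samples=? AND scaled_radius=?',
                    [smiles, num_samples, scaled_radius])
        row = cur.fetchone()
        if row:
            # Cache hit: no gRPC round trip at all.
            cur.execute('SELECT smiles FROM smiles_samples WHERE input_id=?', [row[0]])
            return [r[0] for r in cur.fetchall()]
        # Cache miss: one gRPC request, then persist SMILES plus pickled embeddings.
        samples = generate(smiles, num_samples, scaled_radius)  # [(smiles, embedding, dim), ...]
        input_id = cur.execute(
            'INSERT INTO smiles(smiles, num_samples, scaled_radius) VALUES(?,?,?)',
            [smiles, num_samples, scaled_radius]).lastrowid
        cur.executemany(
            'INSERT INTO smiles_samples(input_id, smiles, embedding, embedding_dim) VALUES(?,?,?,?)',
            [(input_id, s, sqlite3.Binary(pickle.dumps(e)), sqlite3.Binary(pickle.dumps(d)))
             for s, e, d in samples])
        conn.commit()
        return [s for s, _, _ in samples]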
 .dockerignore                                 |   3 +
 .gitignore                                    |   3 +
 Dockerfile.cuchem                             |  39 +-
 Dockerfile.megamolbart                        |   3 +-
 common/cuchemcommon/utils/__init__.py         |   1 +
 common/cuchemcommon/workflow.py               |  78 ++--
 common/generated/generativesampler_pb2.py     |  70 +++-
 .../generated/generativesampler_pb2_grpc.py   |   7 +-
 common/grpc/generativesampler.proto           |  19 +-
 common/requirements.txt                       |   2 +-
 cuchem/benchmark/scripts/benchmark.sql        |  19 +
 cuchem/benchmark/scripts/benchmark.yaml       |  38 ++
 cuchem/benchmark/scripts/megamolbart.sh       |  73 ++++
 cuchem/conda/env.yml                          |  41 --
 cuchem/cuchem/benchmark/data.py               | 171 ++++++++
 cuchem/cuchem/benchmark/megamolbart.py        | 254 ++++++------
 .../create_ZINC15_trie_multiprocessing.py     |  10 +-
 cuchem/cuchem/datasets/loaders.py             |   2 +-
 cuchem/cuchem/metrics/model.py                | 377 +++++++++++-------
 cuchem/cuchem/utils/dataset.py                |   2 +-
 cuchem/cuchem/wf/generative/cddd.py           |  65 ++-
 .../cuchem/wf/generative/megatronmolbart.py   |  86 ++--
 cuchem/requirements.txt                       |   9 +
 cuchem/tests/test_benchmark_data.py           |  20 +
 launch.sh                                     |  47 ++-
 megamolbart/launch.py                         |  10 +-
 megamolbart/megamolbart/inference.py          | 136 ++++---
 megamolbart/megamolbart/service.py            |  57 ++-
 megamolbart/tests/pytest.ini                  |   2 +
 megamolbart/tests/test_grpc.py                |  22 +-
 megamolbart/tests/test_megamolbart.py         |   8 +-
 misc/triton/molbart/model.py                  |  13 +-
 setup/docker_compose.yml                      |   2 +-
 33 files changed, 1142 insertions(+), 547 deletions(-)
 create mode 100644 cuchem/benchmark/scripts/benchmark.sql
 create mode 100644 cuchem/benchmark/scripts/benchmark.yaml
 create mode 100755 cuchem/benchmark/scripts/megamolbart.sh
 delete mode 100644 cuchem/conda/env.yml
 create mode 100644 cuchem/cuchem/benchmark/data.py
 create mode 100644 cuchem/tests/test_benchmark_data.py
 create mode 100644 megamolbart/tests/pytest.ini

diff --git a/.dockerignore b/.dockerignore
index 0bd5be1a..784cf497 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -26,3 +26,6 @@ dask-worker-space
 .cache_dir
 portal
 megamolbart/models
+
+outputs/
+chemportal/frontend/node_modules/
\ No newline at end of file

diff --git a/.gitignore b/.gitignore
index ada38339..d657ae0d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -31,3 +31,6 @@ dask-worker-space
 .cache_dir
 benchmark.csv
 .env
+chemportal/frontend/node_modules/
+chemportal/outputs/
+outputs/

diff --git a/Dockerfile.cuchem b/Dockerfile.cuchem
index c971310c..b04771c9 100644
--- a/Dockerfile.cuchem
+++ b/Dockerfile.cuchem
@@ -1,36 +1,25 @@
 # Copyright 2020 NVIDIA Corporation
-FROM nvidia/cuda:11.0-base
+FROM rapidsai/rapidsai:21.08-cuda11.2-runtime-ubuntu20.04-py3.7

 RUN apt-get update \
     && apt-get upgrade -y \
     && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git unzip tmux \
     && rm -rf /var/lib/apt/lists/*

-SHELL ["/bin/bash", "-c"]
-RUN wget --quiet -O /tmp/miniconda.sh \
-        https://repo.anaconda.com/miniconda/Miniconda3-py37_4.9.2-Linux-x86_64.sh \
-    && /bin/bash /tmp/miniconda.sh -b -p /opt/conda \
-    && rm /tmp/miniconda.sh \
-    && ln -s /opt/conda/etc/profile.d/conda.sh /etc/profile.d/conda.sh
-
-ENV PATH /opt/conda/bin:$PATH
-
-# Copy conda env spec.
-COPY cuchem/conda/env.yml /tmp
-
-RUN conda env create --name cuchem -f /tmp/env.yml \
-    && rm /tmp/env.yml\
-    && conda clean -afy
-ENV PATH /opt/conda/envs/cuchem/bin:$PATH
-
-RUN source activate cuchem && python3 -m ipykernel install --user --name=cuchem
-RUN echo "source activate cuchem" > /etc/bash.bashrc
+SHELL ["conda", "run", "-n", "rapids", "/bin/bash", "-c"]
+RUN conda install -y -c conda-forge -n rapids rdkit==2020.09.1.0

+# ADD "https://www.random.org/cgi-bin/randbyte?nbytes=10&format=h" skipcache
+# Any line that needs to be executed without referring to the cache should be below this line.
 COPY ./ /opt/nvidia/cheminfomatics
-RUN cd /opt/nvidia/cheminfomatics/common; pip install .
-RUN cd /opt/nvidia/cheminfomatics/cuchem; pip install -r requirements.txt
-
-ENV PYTHONPATH /opt/nvidia/cheminfomatics/common/generated:$PYTHONPATH
+RUN cd /opt/nvidia/cheminfomatics/common; \
+    pip install -r requirements.txt
+RUN cd /opt/nvidia/cheminfomatics/cuchem; \
+    pip install -r requirements.txt
+RUN cd /opt/nvidia/cheminfomatics/chemportal; \
+    pip install -r requirements.txt

 ENV UCX_LOG_LEVEL error
-CMD cd /opt/nvidia/cheminfomatics; ./launch.sh start
\ No newline at end of file
+ENV PYTHONPATH ./common/generated:./common:./cuchem:./chemportal
+
+CMD cd /opt/nvidia/cheminfomatics; ./launch.sh start

diff --git a/Dockerfile.megamolbart b/Dockerfile.megamolbart
index 935f40a4..04dab768 100644
--- a/Dockerfile.megamolbart
+++ b/Dockerfile.megamolbart
@@ -39,7 +39,8 @@ RUN cd /tmp/common; pip install .

 RUN mkdir -p /opt/nvidia/cuchem/grpc
 COPY common/generated /opt/nvidia/cuchem/grpc
-ENV PYTHONPATH /opt/nvidia/cuchem/grpc:$PYTHONPATH
+
+ENV PYTHONPATH /opt/nvidia/cuchem/grpc:/opt/nvidia/cheminfomatics/common:$PYTHONPATH

 COPY megamolbart/ /opt/nvidia/megamolbart/
 CMD cd /opt/nvidia/megamolbart && python3 launch.py

diff --git a/common/cuchemcommon/utils/__init__.py b/common/cuchemcommon/utils/__init__.py
index e69de29b..0de2d947 100644
--- a/common/cuchemcommon/utils/__init__.py
+++ b/common/cuchemcommon/utils/__init__.py
@@ -0,0 +1 @@
+from cuchemcommon.utils.singleton import Singleton
\ No newline at end of file
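The workflow.py diff that follows reworks add_jitter: the numpy overload now
draws cnt independent noise samples and returns a Python list of jittered
embeddings instead of one stacked array, and a new shape argument is threaded
through for callers that track per-sample dimensions. A small self-contained
sketch of the new return contract (sizes are illustrative):

    import numpy as np

    embedding = np.zeros((3, 4), dtype=np.float32)  # e.g. (seq_len, hidden)
    radius, cnt = 0.1, 5

    # Equivalent to the patched add_jitter for np.ndarray inputs.
    jittered = [embedding + np.random.normal(0, radius, embedding.shape)
                for _ in range(cnt)]

    assert len(jittered) == cnt
    assert all(j.shape == embedding.shape for j in jittered)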
diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py
index 1ca1d252..7df63d41 100644
--- a/common/cuchemcommon/workflow.py
+++ b/common/cuchemcommon/workflow.py
@@ -11,15 +11,20 @@

 @singledispatch
-def add_jitter(embedding, radius, cnt):
+def add_jitter(embedding, radius, cnt, shape):
     return NotImplemented

 @add_jitter.register(np.ndarray)
-def _(embedding, radius, cnt):
-    noise = np.random.normal(0, radius, (cnt,) + embedding.shape)
+def _(embedding, radius, cnt, shape):

-    return noise + embedding
+    distorteds = []
+    for i in range(cnt):
+        noise = np.random.normal(0, radius, embedding.shape)
+        distorted = noise + embedding
+        distorteds.append(distorted)
+
+    return distorteds

 class BaseGenerativeWorkflow:
@@ -28,11 +33,25 @@ def __init__(self, dao: GenerativeWfDao = None) -> None:
         self.dao = dao
         self.min_jitter_radius = None

-    def interpolate_from_smiles(self,
-                                smiles: List,
-                                num_points: int = 10,
-                                scaled_radius=None,
-                                force_unique=False):
+    def get_iteration(self):
+        NotImplemented
+
+    def smiles_to_embedding(self,
+                            smiles: str,
+                            padding: int):
+        NotImplemented
+
+    def embedding_to_smiles(self,
+                            embedding: float,
+                            dim: int,
+                            pad_mask):
+        NotImplemented
+
+    def interpolate_smiles(self,
+                           smiles: List,
+                           num_points: int = 10,
+                           scaled_radius=None,
+                           force_unique=False):
         NotImplemented

     def find_similars_smiles_list(self,
@@ -58,15 +77,15 @@ def _compute_radius(self, scaled_radius):
     def addjitter(self,
                   embedding,
                   radius=None,
-                  cnt=1):
+                  cnt=1,
+                  shape=None):
         radius = radius if radius else self.radius_scale
-        return add_jitter(embedding, radius, cnt)
+        return add_jitter(embedding, radius, cnt, shape)

     def compute_unique_smiles(self,
                               interp_df,
-                              embeddings,
                               embedding_funct,
-                              radius=None):
+                              scaled_radius=None):
         """
         Identify duplicate SMILES and distort their embeddings. The input df
         must have columns 'SMILES' and 'Generated' in the 0th and 1st positions.
@@ -77,8 +96,9 @@
         Instead it simply orders the df by SMILES to identify
         the duplicates.
""" - radius = radius if radius else self.min_jitter_radius - + distance = self._compute_radius(scaled_radius) + embeddings = interp_df['embeddings'] + embeddings_dim = interp_df['embeddings_dim'] for index, row in interp_df.iterrows(): smile_string = row['SMILES'] try: @@ -99,11 +119,15 @@ def compute_unique_smiles(self, if len(duplicates) > 0: for dup_idx in duplicates: - if interp_df.iat[dup_idx, 1]: + if interp_df.iat[dup_idx, 3]: # add jitter to generated molecules only - embeddings[dup_idx] = self.addjitter( - embeddings[dup_idx], radius, 1) - interp_df['SMILES'] = embedding_funct(embeddings) + distored = self.addjitter(embeddings[dup_idx], + distance, + cnt=1, + shape=embeddings_dim[dup_idx]) + embeddings[dup_idx] = distored[0] + interp_df['SMILES'] = embedding_funct(embeddings.to_list()) + interp_df['embeddings'] = embeddings else: break @@ -116,9 +140,11 @@ def compute_unique_smiles(self, invalid_index = invalid_mol_df.index.to_list() for idx in invalid_index: embeddings[idx] = self.addjitter(embeddings[idx], - radius, - cnt=1) - interp_df['SMILES'] = embedding_funct(embeddings) + distance, + cnt=1, + shape=embeddings_dim[idx])[0] + interp_df['SMILES'] = embedding_funct(embeddings.to_list()) + interp_df['embeddings'] = embeddings else: break @@ -146,10 +172,10 @@ def interpolate_by_id(self, else: raise Exception('id type %s not supported' % id_type) - return self.interpolate_from_smiles(smiles, - num_points=num_points, - scaled_radius=scaled_radius, - force_unique=force_unique) + return self.interpolate_smiles(smiles, + num_points=num_points, + scaled_radius=scaled_radius, + force_unique=force_unique) def find_similars_smiles_by_id(self, chemble_id: str, diff --git a/common/generated/generativesampler_pb2.py b/common/generated/generativesampler_pb2.py index 1da013ee..50316af9 100644 --- a/common/generated/generativesampler_pb2.py +++ b/common/generated/generativesampler_pb2.py @@ -12,6 +12,7 @@ _sym_db = _symbol_database.Default() +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 DESCRIPTOR = _descriptor.FileDescriptor( @@ -20,8 +21,9 @@ syntax='proto3', serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_pb=b'\n\x17generativesampler.proto\x12\x1bnvidia.cheminformatics.grpc\"\xcb\x01\n\x0eGenerativeSpec\x12;\n\x05model\x18\x01 \x01(\x0e\x32,.nvidia.cheminformatics.grpc.GenerativeModel\x12\x0e\n\x06smiles\x18\x02 \x03(\t\x12\x13\n\x06radius\x18\x03 \x01(\x02H\x00\x88\x01\x01\x12\x19\n\x0cnumRequested\x18\x04 \x01(\x05H\x01\x88\x01\x01\x12\x14\n\x07padding\x18\x05 \x01(\x05H\x02\x88\x01\x01\x42\t\n\x07_radiusB\x0f\n\r_numRequestedB\n\n\x08_padding\"%\n\nSmilesList\x12\x17\n\x0fgeneratedSmiles\x18\x01 \x03(\t\"\"\n\rEmbeddingList\x12\x11\n\tembedding\x18\x01 \x03(\x02\"!\n\x0cIterationVal\x12\x11\n\titeration\x18\x01 \x01(\x05*9\n\x0fGenerativeModel\x12\x08\n\x04\x43\x44\x44\x44\x10\x00\x12\x0b\n\x07MolBART\x10\x01\x12\x0f\n\x0bMegaMolBART\x10\x02\x32\xbc\x03\n\x11GenerativeSampler\x12n\n\x11SmilesToEmbedding\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a*.nvidia.cheminformatics.grpc.EmbeddingList\"\x00\x12\x66\n\x0c\x46indSimilars\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a\'.nvidia.cheminformatics.grpc.SmilesList\"\x00\x12\x65\n\x0bInterpolate\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a\'.nvidia.cheminformatics.grpc.SmilesList\"\x00\x12h\n\x0cGetIteration\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a).nvidia.cheminformatics.grpc.IterationVal\"\x00\x62\x06proto3' -) + 
serialized_pb=b'\n\x17generativesampler.proto\x12\x1bnvidia.cheminformatics.grpc\x1a\x1bgoogle/protobuf/empty.proto\"\x99\x02\n\x0eGenerativeSpec\x12;\n\x05model\x18\x01 \x01(\x0e\x32,.nvidia.cheminformatics.grpc.GenerativeModel\x12\x0e\n\x06smiles\x18\x02 \x03(\t\x12\x13\n\x06radius\x18\x03 \x01(\x02H\x00\x88\x01\x01\x12\x19\n\x0cnumRequested\x18\x04 \x01(\x05H\x01\x88\x01\x01\x12\x14\n\x07padding\x18\x05 \x01(\x05H\x02\x88\x01\x01\x12\x18\n\x0b\x66orceUnique\x18\x06 \x01(\x08H\x03\x88\x01\x01\x12\x15\n\x08sanitize\x18\x07 \x01(\x08H\x04\x88\x01\x01\x42\t\n\x07_radiusB\x0f\n\r_numRequestedB\n\n\x08_paddingB\x0e\n\x0c_forceUniqueB\x0b\n\t_sanitize\"e\n\nSmilesList\x12\x17\n\x0fgeneratedSmiles\x18\x01 \x03(\t\x12>\n\nembeddings\x18\x02 \x03(\x0b\x32*.nvidia.cheminformatics.grpc.EmbeddingList\"A\n\rEmbeddingList\x12\x11\n\tembedding\x18\x01 \x03(\x02\x12\x0b\n\x03\x64im\x18\x02 \x03(\x05\x12\x10\n\x08pad_mask\x18\x03 \x03(\x08\"!\n\x0cIterationVal\x12\x11\n\titeration\x18\x01 \x01(\x05*:\n\x0fGenerativeModel\x12\x08\n\x04\x43\x44\x44\x44\x10\x00\x12\x0f\n\x0bMegaMolBART\x10\x01\x12\x0c\n\x07MolBART\x10\x90N2\x93\x04\n\x11GenerativeSampler\x12n\n\x11SmilesToEmbedding\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a*.nvidia.cheminformatics.grpc.EmbeddingList\"\x00\x12j\n\x11\x45mbeddingToSmiles\x12*.nvidia.cheminformatics.grpc.EmbeddingList\x1a\'.nvidia.cheminformatics.grpc.SmilesList\"\x00\x12\x66\n\x0c\x46indSimilars\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a\'.nvidia.cheminformatics.grpc.SmilesList\"\x00\x12\x65\n\x0bInterpolate\x12+.nvidia.cheminformatics.grpc.GenerativeSpec\x1a\'.nvidia.cheminformatics.grpc.SmilesList\"\x00\x12S\n\x0cGetIteration\x12\x16.google.protobuf.Empty\x1a).nvidia.cheminformatics.grpc.IterationVal\"\x00\x62\x06proto3' + , + dependencies=[google_dot_protobuf_dot_empty__pb2.DESCRIPTOR,]) _GENERATIVEMODEL = _descriptor.EnumDescriptor( name='GenerativeModel', @@ -48,8 +50,8 @@ ], containing_type=None, serialized_options=None, - serialized_start=372, - serialized_end=429, + serialized_start=574, + serialized_end=632, ) _sym_db.RegisterEnumDescriptor(_GENERATIVEMODEL) @@ -103,6 +105,20 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='forceUnique', full_name='nvidia.cheminformatics.grpc.GenerativeSpec.forceUnique', index=5, + number=6, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='sanitize', full_name='nvidia.cheminformatics.grpc.GenerativeSpec.sanitize', index=6, + number=7, type=8, cpp_type=7, label=1, + has_default_value=False, default_value=False, + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -129,9 +145,19 @@ index=2, containing_type=None, create_key=_descriptor._internal_create_key, fields=[]), + _descriptor.OneofDescriptor( + name='_forceUnique', full_name='nvidia.cheminformatics.grpc.GenerativeSpec._forceUnique', + index=3, containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[]), + _descriptor.OneofDescriptor( + 
name='_sanitize', full_name='nvidia.cheminformatics.grpc.GenerativeSpec._sanitize', + index=4, containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[]), ], - serialized_start=57, - serialized_end=260, + serialized_start=86, + serialized_end=367, ) @@ -150,6 +176,13 @@ message_type=None, enum_type=None, containing_type=None, is_extension=False, extension_scope=None, serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), + _descriptor.FieldDescriptor( + name='embeddings', full_name='nvidia.cheminformatics.grpc.SmilesList.embeddings', index=1, + number=2, type=11, cpp_type=10, label=3, + has_default_value=False, default_value=[], + message_type=None, enum_type=None, containing_type=None, + is_extension=False, extension_scope=None, + serialized_options=None, file=DESCRIPTOR, create_key=_descriptor._internal_create_key), ], extensions=[ ], @@ -162,8 +195,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=262, - serialized_end=299, + serialized_start=369, + serialized_end=470, ) @@ -194,8 +227,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=301, - serialized_end=335, + serialized_start=472, + serialized_end=537, ) @@ -226,8 +259,8 @@ extension_ranges=[], oneofs=[ ], - serialized_start=337, - serialized_end=370, + serialized_start=539, + serialized_end=572, ) _GENERATIVESPEC.fields_by_name['model'].enum_type = _GENERATIVEMODEL @@ -240,6 +273,13 @@ _GENERATIVESPEC.oneofs_by_name['_padding'].fields.append( _GENERATIVESPEC.fields_by_name['padding']) _GENERATIVESPEC.fields_by_name['padding'].containing_oneof = _GENERATIVESPEC.oneofs_by_name['_padding'] +_GENERATIVESPEC.oneofs_by_name['_forceUnique'].fields.append( + _GENERATIVESPEC.fields_by_name['forceUnique']) +_GENERATIVESPEC.fields_by_name['forceUnique'].containing_oneof = _GENERATIVESPEC.oneofs_by_name['_forceUnique'] +_GENERATIVESPEC.oneofs_by_name['_sanitize'].fields.append( + _GENERATIVESPEC.fields_by_name['sanitize']) +_GENERATIVESPEC.fields_by_name['sanitize'].containing_oneof = _GENERATIVESPEC.oneofs_by_name['_sanitize'] +_SMILESLIST.fields_by_name['embeddings'].message_type = _EMBEDDINGLIST DESCRIPTOR.message_types_by_name['GenerativeSpec'] = _GENERATIVESPEC DESCRIPTOR.message_types_by_name['SmilesList'] = _SMILESLIST DESCRIPTOR.message_types_by_name['EmbeddingList'] = _EMBEDDINGLIST @@ -284,8 +324,8 @@ index=0, serialized_options=None, create_key=_descriptor._internal_create_key, - serialized_start=432, - serialized_end=876, + serialized_start=635, + serialized_end=1166, methods=[ _descriptor.MethodDescriptor( name='SmilesToEmbedding', @@ -322,7 +362,7 @@ full_name='nvidia.cheminformatics.grpc.GenerativeSampler.GetIteration', index=3, containing_service=None, - input_type=_GENERATIVESPEC, + input_type=google_dot_protobuf_dot_empty__pb2._EMPTY, output_type=_ITERATIONVAL, serialized_options=None, create_key=_descriptor._internal_create_key, diff --git a/common/generated/generativesampler_pb2_grpc.py b/common/generated/generativesampler_pb2_grpc.py index 55c2d989..1629666b 100644 --- a/common/generated/generativesampler_pb2_grpc.py +++ b/common/generated/generativesampler_pb2_grpc.py @@ -3,6 +3,7 @@ import grpc import generativesampler_pb2 as generativesampler__pb2 +from google.protobuf import empty_pb2 as google_dot_protobuf_dot_empty__pb2 class GenerativeSamplerStub(object): @@ -31,7 +32,7 @@ def __init__(self, channel): ) self.GetIteration = channel.unary_unary( '/nvidia.cheminformatics.grpc.GenerativeSampler/GetIteration', - 
request_serializer=generativesampler__pb2.GenerativeSpec.SerializeToString, + request_serializer=google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, response_deserializer=generativesampler__pb2.IterationVal.FromString, ) @@ -83,7 +84,7 @@ def add_GenerativeSamplerServicer_to_server(servicer, server): ), 'GetIteration': grpc.unary_unary_rpc_method_handler( servicer.GetIteration, - request_deserializer=generativesampler__pb2.GenerativeSpec.FromString, + request_deserializer=google_dot_protobuf_dot_empty__pb2.Empty.FromString, response_serializer=generativesampler__pb2.IterationVal.SerializeToString, ), } @@ -159,7 +160,7 @@ def GetIteration(request, timeout=None, metadata=None): return grpc.experimental.unary_unary(request, target, '/nvidia.cheminformatics.grpc.GenerativeSampler/GetIteration', - generativesampler__pb2.GenerativeSpec.SerializeToString, + google_dot_protobuf_dot_empty__pb2.Empty.SerializeToString, generativesampler__pb2.IterationVal.FromString, options, channel_credentials, insecure, call_credentials, compression, wait_for_ready, timeout, metadata) diff --git a/common/grpc/generativesampler.proto b/common/grpc/generativesampler.proto index af0cae5c..466058cb 100644 --- a/common/grpc/generativesampler.proto +++ b/common/grpc/generativesampler.proto @@ -2,11 +2,15 @@ syntax = "proto3"; package nvidia.cheminformatics.grpc; -// python -m grpc_tools.protoc -I./grpc/ \ -// --python_out=generated \ -// --experimental_allow_proto3_optional \ -// --grpc_python_out=generated \ -// ./grpc/generativesampler.proto +import "google/protobuf/empty.proto"; + +//python -m pip install grpcio +//python -m pip install grpcio-tools +//python -m grpc_tools.protoc -I./grpc/ \ +// --python_out=generated \ +// --experimental_allow_proto3_optional \ +// --grpc_python_out=generated \ +// ./grpc/generativesampler.proto enum GenerativeModel { @@ -20,7 +24,7 @@ service GenerativeSampler { rpc SmilesToEmbedding(GenerativeSpec) returns (EmbeddingList) {}; rpc FindSimilars(GenerativeSpec) returns (SmilesList) {}; rpc Interpolate(GenerativeSpec) returns (SmilesList) {}; - rpc GetIteration(GenerativeSpec) returns (IterationVal) {}; + rpc GetIteration(google.protobuf.Empty) returns (IterationVal) {}; } @@ -30,11 +34,14 @@ message GenerativeSpec { optional float radius = 3; optional int32 numRequested = 4; optional int32 padding = 5; + optional bool forceUnique = 6; + optional bool sanitize = 7; } message SmilesList { repeated string generatedSmiles = 1; + repeated EmbeddingList embeddings = 2; } message EmbeddingList{ diff --git a/common/requirements.txt b/common/requirements.txt index 1b86e305..b6fdab03 100644 --- a/common/requirements.txt +++ b/common/requirements.txt @@ -1,2 +1,2 @@ -dask[complete]==2021.03.0 +dask[complete]==2021.8.1 sqlalchemy==1.3.20 diff --git a/cuchem/benchmark/scripts/benchmark.sql b/cuchem/benchmark/scripts/benchmark.sql new file mode 100644 index 00000000..2a1966cf --- /dev/null +++ b/cuchem/benchmark/scripts/benchmark.sql @@ -0,0 +1,19 @@ +CREATE TABLE IF NOT EXISTS smiles ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + smiles TEXT NULL, + num_samples INTEGER DEFAULT 10, + scaled_radius REAL, + force_unique INTEGER, + sanitize INTEGER, + UNIQUE(smiles, num_samples, scaled_radius, force_unique, sanitize) +); + +CREATE INDEX IF NOT EXISTS smiles_index ON smiles (smiles, num_samples, scaled_radius, force_unique, sanitize); + +CREATE TABLE IF NOT EXISTS smiles_samples ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + input_id INTEGER NOT NULL, + smiles TEXT NOT NULL, + embedding TEXT NOT 
NULL, + embedding_dim TEXT NOT NULL +); diff --git a/cuchem/benchmark/scripts/benchmark.yaml b/cuchem/benchmark/scripts/benchmark.yaml new file mode 100644 index 00000000..27310c5d --- /dev/null +++ b/cuchem/benchmark/scripts/benchmark.yaml @@ -0,0 +1,38 @@ +model: + name: MegaMolBART + params: + url: localhost:50051 + +metric: + validity: + enabled: false + radius_list: + - 1 + # - 2 + # - 5 + + unique: + enabled: false + radius_list: ${metric.validity.radius_list} + + novelty: + enabled: false + radius_list: ${metric.validity.radius_list} + + nearestNeighborCorrelation: + enabled: false + top_k_list: + - 50 + - 100 + - 500 + + modelability: + enabled: true + +samplingSpec: + input_size: 1000 + seq_len: 512 + sample_size: 10 + +output: + path: ./benchmark_output/ diff --git a/cuchem/benchmark/scripts/megamolbart.sh b/cuchem/benchmark/scripts/megamolbart.sh new file mode 100755 index 00000000..8168a389 --- /dev/null +++ b/cuchem/benchmark/scripts/megamolbart.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash + +SCRIPT_LOC=$(dirname "$0") + +ID=100 +ACTION="up" +GPU_ID="0" +CHECKPOINT_DIR="/models/megamolbart/checkpoints" +SIZE='' +NUM_LAYERS=4 +HIDDEN_SIZE=256 +NUM_ATTENTION_HEADS=8 + +while [[ $# -gt 0 ]]; do + key="$1" + case $key in + --id) + ID=$2 + shift + shift + ;; + --gpu) + GPU_ID=$2 + shift + shift + ;; + --stop) + ACTION=stop + shift + shift + ;; + --ckp) + CHECKPOINT_DIR=$2 + shift + shift + ;; + --size) + SIZE=$2 + shift + shift + ;; + *) + shift + ;; + esac +done +export RUN_ID="_${ID}" +export PLOTLY_PORT="5${ID}" + +export SUBNET=192.${ID}.100.0/16 +export IP_CUCHEM_UI=192.${ID}.100.1 +export IP_MEGAMOLBART=192.${ID}.100.2 + +export CUCHEM_UI_START_CMD="python3 ./cuchem/cuchem/benchmark/megamolbart.py --config-dir /workspace/cuchem/benchmark/scripts" +# export CUCHEM_UI_START_CMD="which python3" + +export MEGAMOLBART_CMD="bash -c 'CUDA_VISIBLE_DEVICES=${GPU_ID} \ + python3 launch.py -c ${CHECKPOINT_DIR} \ + --num_layers=${NUM_LAYERS} \ + --hidden_size=${HIDDEN_SIZE} \ + --num_attention_heads=${NUM_ATTENTION_HEADS}' +" + +export CUCHEM_PATH=/workspace +export MEGAMOLBART_PATH=/workspace/megamolbart +export WORKSPACE_DIR="$(pwd)" + +docker-compose \ + --env-file ${SCRIPT_LOC}/../../../.env \ + -f ${SCRIPT_LOC}/../../../setup/docker_compose.yml \ + --project-directory ${SCRIPT_LOC}/../../../ \ + --project-name "megamolbart${RUN_ID}" \ + ${ACTION} diff --git a/cuchem/conda/env.yml b/cuchem/conda/env.yml deleted file mode 100644 index 6e201143..00000000 --- a/cuchem/conda/env.yml +++ /dev/null @@ -1,41 +0,0 @@ -channels: - - rapidsai - - nvidia - - conda-forge - - rdkit - - anaconda - - plotly - - default -dependencies: - - rapids=0.17 - - python=3.7 - - cudatoolkit=11.0 # Ensure version matches with base image in Dockerfile - - rdkit==2020.09.1 - - dask==2021.03.0 - - distributed==2021.4.0 - - plotly==4.9.0 - - pytest==6.2.2 - - umap-learn==0.5.1 - - grpcio - - pip - - pip: - - tensorflow-gpu==1.15.5 - - dash==1.19.0 - - jupyter-dash==0.4.0 - - dash_bootstrap_components==0.11.1 - - dash_core_components==1.15.0 - - dash_html_components==1.1.2 - - progressbar2==3.53.1 - - tables==3.6.1 - - sqlalchemy==1.3.20 - - openpyxl==3.0.6 - - tabulate==0.8.7 - - autopep8==1.5.4 - - protobuf==3.14.0 - - chembl_webresource_client==0.10.2 - - dask_ml==1.8.0 - - matplotlib==3.3.4 - - waitress==1.4.3 - - flask-restplus==0.13.0 - - locust==1.4.3 - - git+https://github.com/jrwnter/cddd.git@1.0 \ No newline at end of file diff --git a/cuchem/cuchem/benchmark/data.py b/cuchem/cuchem/benchmark/data.py new file 
mode 100644 index 00000000..e11f2c9f --- /dev/null +++ b/cuchem/cuchem/benchmark/data.py @@ -0,0 +1,171 @@ +import os +import pickle +import sqlite3 +import logging + +from typing import List + +from cuchemcommon.utils.singleton import Singleton +from cuchemcommon.context import Context + +logger = logging.getLogger(__name__) + + +class TrainingData(object, metaclass=Singleton): + + def __init__(self): + + context = Context() + db_file = context.get_config('data_mount_path', default='/data') + db_file = os.path.join(db_file, 'db/zinc_train.sqlite3') + + logger.info(f'Benchmark database {db_file}...') + self.conn = sqlite3.connect(db_file) + + def is_known_smiles(self, smiles: str) -> bool: + """ + Checks if the given SMILES is known. + :param data: + :return: + """ + cursor = self.conn.cursor() + cursor.execute( + ''' + SELECT smiles FROM train_data + WHERE smiles=? + ''', + [smiles]) + id = cursor.fetchone() + cursor.close() + return True if id else False + + +class BenchmarkData(object, metaclass=Singleton): + + def __init__(self): + + context = Context() + db_file = context.get_config('data_mount_path', default='/data') + db_file = os.path.join(db_file, 'db/benchmark.sqlite3') + + logger.info(f'Benchmark database {db_file}...') + self.conn = sqlite3.connect(db_file) + + cursor = self.conn.cursor() + + sql_file = open("/workspace/cuchem/benchmark/scripts/benchmark.sql") + sql_as_string = sql_file.read() + cursor.executescript(sql_as_string) + + + def insert_sampling_data(self, + smiles, + num_samples, + scaled_radius, + force_unique, + sanitize, + generated_smiles: List[str], + embeddings: List, + embeddings_dim: List): + """ + Inserts a list of dicts into the benchmark data table. + :param data: + :return: + """ + logger.debug('Inserting benchmark data...') + cursor = self.conn.cursor() + id = cursor.execute( + ''' + INSERT INTO smiles(smiles, num_samples, scaled_radius, + force_unique, sanitize) + VALUES(?,?,?,?,?) + ''', + [smiles, num_samples, scaled_radius, force_unique, sanitize]).lastrowid + + for i in range(len(generated_smiles)): + gsmiles = generated_smiles[i] + embedding = list(embeddings[i]) + embedding_dim = list(embeddings_dim[i]) + + embedding = pickle.dumps(embedding) + embedding_dim = pickle.dumps(embedding_dim) + cursor.execute( + ''' + INSERT INTO smiles_samples(input_id, smiles, embedding, embedding_dim) + VALUES(?, ?, ?, ?) + ''', [id, gsmiles, sqlite3.Binary(embedding), sqlite3.Binary(embedding_dim)]) + self.conn.commit() + + + def fetch_sampling_data(self, + smiles, + num_samples, + scaled_radius, + force_unique, + sanitize): + """ + Fetch the benchmark data for a given set of parameters. + :param data: + :return: + """ + logger.debug('Fetching benchmark data...') + cursor = self.conn.cursor() + cursor.execute( + ''' + SELECT id FROM smiles + WHERE smiles=? + AND num_samples=? + AND scaled_radius=? + AND force_unique=? + AND sanitize=? + ''', + [smiles, num_samples, scaled_radius, force_unique, sanitize]) + id = cursor.fetchone() + + if not id: + return None + + cursor.execute('SELECT smiles FROM smiles_samples WHERE input_id=?', + [id[0]]) + generated_smiles = cursor.fetchall() + generated_smiles = [x[0] for x in generated_smiles] + return generated_smiles + + def fetch_n_sampling_data(self, + smiles, + num_samples, + scaled_radius, + force_unique, + sanitize): + """ + Fetch the benchmark data for a given set of parameters. 
+ :param data: + :return: + """ + logger.debug('Fetching benchmark data...') + cursor = self.conn.cursor() + cursor.execute( + ''' + SELECT id FROM smiles + WHERE smiles=? + AND scaled_radius=? + AND force_unique=? + AND sanitize=? + ''', + [smiles, scaled_radius, force_unique, sanitize]) + id = cursor.fetchone() + + if not id: + return None + + cursor.execute( + ''' + SELECT smiles, embedding, embedding_dim + FROM smiles_samples WHERE input_id=? + LIMIT ? + ''', + [id[0], num_samples]) + generated_smiles = cursor.fetchall() + # generated_smiles = [x for x in generated_smiles] + + return generated_smiles diff --git a/cuchem/cuchem/benchmark/megamolbart.py b/cuchem/cuchem/benchmark/megamolbart.py index 3e7135f0..fda424cb 100644 --- a/cuchem/cuchem/benchmark/megamolbart.py +++ b/cuchem/cuchem/benchmark/megamolbart.py @@ -1,16 +1,9 @@ -#!/usr/bin/env python3 - -import logging import os -import sys -from datetime import datetime - -import generativesampler_pb2_grpc -import grpc +import time +import logging +import hydra import pandas as pd -import torch -import grpc -import generativesampler_pb2_grpc + from datetime import datetime import os import logging @@ -19,130 +12,145 @@ from cuml import LinearRegression, ElasticNet from cuml.svm import SVR -logger = logging.getLogger(__name__) - -# TODO add this path to the PYTHONPATH variable in the Dockerfile -sys.path.insert(0, '/workspace/cuchem') +from cuchem.wf.generative.megatronmolbart import MegatronMolBART from cuchem.datasets.loaders import ZINC15_TestSplit_20K_Samples, ZINC15_TestSplit_20K_Fingerprints -from cuchem.metrics.model import Validity, Unique, Novelty, NearestNeighborCorrelation, Modelability, get_model_iteration - - -def parse_args(): - parser = argparse.ArgumentParser(description='Model metrics') - - parser.add_argument('-o', '--output_dir', - dest='output_dir', - default='/workspace/megamolbart/benchmark', - type=str, - help='Output directory for CSV files') +from cuchem.metrics.model import Validity, Unique, Novelty, NearestNeighborCorrelation, Modelability - parser.add_argument('-l', '--max_seq_len', - dest='max_seq_len', - type=int, - default=512, - help='Default maximum sequence length') - - args = parser.parse_args(sys.argv[1:]) - - return args - - -args = parse_args() -OUTPUT_DIR = args.output_dir -DEFAULT_MAX_SEQ_LEN = args.max_seq_len # TODO: Import from MegaMolBART codebase? 
- -num_samples = 10 -radius_list = [0.01, 0.1, 0.5] # TODO calculate radius and automate this -top_k_list = [None, 50, 100, 500] # TODO decide on top k value - -# Metrics -validity = Validity() -unique = Unique() -novelty = Novelty() -nn = NearestNeighborCorrelation() -modelability = Modelability() -metric_list = [nn, modelability, validity, unique, novelty] - -# ML models -rf_estimator = RandomForestRegressor(accuracy_metric='mse', random_state=0) -rf_param_dict = {'n_estimators': [10, 50]} +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) -sv_estimator = SVR(kernel='rbf') -sv_param_dict = {'C': [0.01, 0.1, 1.0, 10], 'degree': [3,5,7,9]} +def get_model(): + rf_estimator = RandomForestRegressor(accuracy_metric='mse', random_state=0) + rf_param_dict = {'n_estimators': [10, 50]} -lr_estimator = LinearRegression(normalize=True) -lr_param_dict = {'normalize': [True]} + sv_estimator = SVR(kernel='rbf') + sv_param_dict = {'C': [0.01, 0.1, 1.0, 10], 'degree': [3,5,7,9]} -en_estimator = ElasticNet(normalize=True) -en_param_dict = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100], 'l1_ratio': [0.1, 0.5, 1.0, 10.0]} + lr_estimator = LinearRegression(normalize=True) + lr_param_dict = {'normalize': [True]} -model_dict = {'random forest': [rf_estimator, rf_param_dict], - 'support vector machine': [sv_estimator, sv_param_dict], - 'linear regression': [lr_estimator, lr_param_dict], - 'elastic net': [en_estimator, en_param_dict]} + en_estimator = ElasticNet(normalize=True) + en_param_dict = {'alpha': [0.001, 0.01, 0.1, 1.0, 10.0, 100], + 'l1_ratio': [0.1, 0.5, 1.0, 10.0]} -# Datasets -smiles_dataset = ZINC15_TestSplit_20K_Samples(max_len=DEFAULT_MAX_SEQ_LEN) -fingerprint_dataset = ZINC15_TestSplit_20K_Fingerprints() -smiles_dataset.load() -fingerprint_dataset.load(smiles_dataset.data.index) -n_data = len(smiles_dataset.data) -assert (fingerprint_dataset.data.index == smiles_dataset.data.index) + return {'random forest': [rf_estimator, rf_param_dict], + 'support vector machine': [sv_estimator, sv_param_dict], + 'linear regression': [lr_estimator, lr_param_dict], + 'elastic net': [en_estimator, en_param_dict]} -# DEBUG -n_data = 10000 -smiles_dataset.data = smiles_dataset.data.iloc[:n_data] -smiles_dataset.properties = smiles_dataset.properties.iloc[:n_data] -fingerprint_dataset.data = fingerprint_dataset.data.iloc[:n_data] -def save_metric_results(metric_list): +def save_metric_results(metric_list, output_dir): metric_df = pd.concat(metric_list, axis=1).T + logger.info(metric_df) metric = metric_df['name'].iloc[0].replace(' ', '_') iteration = metric_df['iteration'].iloc[0] - metric_df.to_csv(os.path.join(OUTPUT_DIR, f'{metric}_{iteration}.csv'), index=False) + metric_df.to_csv(os.path.join(output_dir, f'{metric}_{iteration}.csv'), index=False) + + +@hydra.main(config_path=".", config_name="benchmark") +def main(cfg): + logger.info(cfg) + os.makedirs(cfg.output.path, exist_ok=True) + + output_dir = cfg.output.path + seq_len = int(cfg.samplingSpec.seq_len) # Import from MegaMolBART codebase? 
+    sample_size = int(cfg.samplingSpec.sample_size)
+
+    # radius_list = [1, 2, 5] # TODO calculate radius and automate this
+    # top_k_list = [None, 50, 100, 500] # TODO decide on top k value
+
+    inferrer = MegatronMolBART()
+
+    # Metrics
+    metric_list = []
+    if cfg.metric.validity.enabled:
+        metric_list.append(Validity(inferrer))
+
+    if cfg.metric.unique.enabled:
+        metric_list.append(Unique(inferrer))
+
+    if cfg.metric.novelty.enabled:
+        metric_list.append(Novelty(inferrer))
+
+    if cfg.metric.nearestNeighborCorrelation.enabled:
+        metric_list.append(NearestNeighborCorrelation(inferrer))
+
+    if cfg.metric.modelability.enabled:
+        metric_list.append(Modelability(inferrer))
+
+    # ML models
+    model_dict = get_model()
+
+    # Create datasets of size input_size. Initially load 20% more, then reduce
+    # to input_size after cleaning and preprocessing.
+
+    smiles_dataset = ZINC15_TestSplit_20K_Samples(max_len=seq_len)
+    fingerprint_dataset = ZINC15_TestSplit_20K_Fingerprints()
+    smiles_dataset.load()
+    fingerprint_dataset.load(smiles_dataset.data.index)
+    n_data = cfg.samplingSpec.input_size
+    if n_data <= 0:
+        n_data = len(smiles_dataset.data)
+    # assert fingerprint_dataset.data.index == smiles_dataset.data.index
+
+    # DEBUG
+    smiles_dataset.data = smiles_dataset.data.iloc[:n_data]
+    smiles_dataset.properties = smiles_dataset.properties.iloc[:n_data]
+    fingerprint_dataset.data = fingerprint_dataset.data.iloc[:n_data]
+
+    # DEBUG
+    n_data = cfg.samplingSpec.input_size
+
+    convert_runtime = lambda x: x.seconds + (x.microseconds / 1.0e6)
+
+    iteration = None
+    retry_count = 0
+    while retry_count < 30:
+        try:
+            # Wait for up to 5 min for the server to be up
+            iteration = inferrer.get_iteration()
+            break
+        except Exception as e:
+            logger.warning(f'Service not available. Retrying {retry_count}...')
+            time.sleep(10)
+            retry_count += 1
+            continue
+    logger.info(f'Service found after {retry_count} retries.')
+
+    for metric in metric_list:
+        logger.info(f'METRIC: {metric.name}')
+        result_list = []
+
+        iter_list = metric.variations(cfg, model_dict=model_dict)
+
+        for iter_val in iter_list:
+            start_time = datetime.now()
+
+            try:
+                iter_val = int(iter_val)
+            except ValueError:
+                pass
+
+            estimator, param_dict = None, None
+            if iter_val in model_dict:
+                estimator, param_dict = model_dict[iter_val]
+
+            result = metric.calculate(smiles_dataset=smiles_dataset,
+                                      fingerprint_dataset=fingerprint_dataset,
+                                      top_k=iter_val,
+                                      properties=smiles_dataset.properties,
+                                      estimator=estimator,
+                                      param_dict=param_dict,
+                                      num_samples=sample_size,
+                                      radius=iter_val)
+
+            run_time = convert_runtime(datetime.now() - start_time)
+            result['iteration'] = iteration
+            result['run_time'] = run_time
+            result['data_size'] = n_data
+            result_list.append(result)
+            save_metric_results(result_list, output_dir)

-convert_runtime = lambda x: x.seconds + (x.microseconds / 1.0e6)
-
 if __name__ == '__main__':
-
-    with torch.no_grad():
-        with grpc.insecure_channel('localhost:50051') as channel:
-            stub = generativesampler_pb2_grpc.GenerativeSamplerStub(channel)
-            func = stub.FindSimilars
-            iteration = get_model_iteration(stub)
-
-            for metric in metric_list:
-                logger.info(f'METRIC: {metric}')
-                result_list = []
-
-                if metric.name == nn.name:
-                    iter_list = top_k_list
-                elif metric.name == modelability.name:
-                    iter_list = list(model_dict.keys())
-                else:
-                    iter_list = radius_list
-
-                for iter_val in iter_list:
-                    start_time = datetime.now()
-
-                    if metric.name == nn.name:
-                        result = metric.calculate(smiles_dataset, fingerprint_dataset, stub, top_k=iter_val)
-                    elif metric.name == modelability.name:
-                        estimator, param_dict = model_dict[iter_val]
-                        result = metric.calculate(smiles_dataset,
-                                                  fingerprint_dataset,
-                                                  smiles_dataset.properties,
-                                                  stub,
-                                                  estimator,
-                                                  param_dict)
-                        result['model'] = iter_val
-                    else:
-                        result = metric.calculate(smiles_dataset, num_samples, func, radius=iter_val)
-
-                    run_time = convert_runtime(datetime.now() - start_time)
-                    result['iteration'] = iteration
-                    result['run_time'] = run_time
-                    result['data_size'] = n_data
-                    result_list.append(result)
-                save_metric_results(result_list)
+    main()
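The benchmark driver above is now configured entirely through hydra:
@hydra.main loads benchmark.yaml from the directory passed as --config-dir
(see cuchem/benchmark/scripts/megamolbart.sh), and any key can be overridden
on the command line with standard hydra key=value syntax, e.g.
metric.validity.enabled=true samplingSpec.input_size=1000. One detail worth
calling out is the ${...} interpolation in benchmark.yaml, which lets the
unique and novelty metrics reuse the validity radius list; a small
illustration using omegaconf, the library hydra builds on:

    from omegaconf import OmegaConf

    # Mirrors the pattern in cuchem/benchmark/scripts/benchmark.yaml:
    # unique.radius_list reuses validity.radius_list via interpolation.
    cfg = OmegaConf.create("""
    metric:
      validity:
        enabled: true
        radius_list: [1]
      unique:
        enabled: true
        radius_list: ${metric.validity.radius_list}
    """)

    assert cfg.metric.unique.radius_list == [1]  # resolved at access time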
diff --git a/cuchem/cuchem/datasets/create_ZINC15_trie_multiprocessing.py b/cuchem/cuchem/datasets/create_ZINC15_trie_multiprocessing.py
index b4d771ac..e0353fcf 100644
--- a/cuchem/cuchem/datasets/create_ZINC15_trie_multiprocessing.py
+++ b/cuchem/cuchem/datasets/create_ZINC15_trie_multiprocessing.py
@@ -23,7 +23,11 @@
 from pathlib import Path

 import pandas as pd
-from cuchem.utils.dataset import ZINC_CSV_DIR, ZINC_TRIE_DIR, generate_trie_filename
+from cuchem.utils.dataset import ZINC_CSV_DIR, \
+    ZINC_TRIE_DIR, \
+    generate_trie_filename, \
+    TRIE_FILENAME_LENGTH, \
+    SHORT_TRIE_FILENAME

 ### SETTINGS ###
 LOG_PATH = os.path.join(ZINC_TRIE_DIR, 'processing.log')
@@ -145,6 +149,10 @@ def worker(queue, lock, output_dir):

     # Gather list of filenames
     filelist = sorted(glob.glob(os.path.join(ZINC_CSV_DIR, '*.csv')))
+    logger.info(ZINC_CSV_DIR)
+    logger.info(filelist)
+    import sys
+    sys.exit(0)

     n_files = len(filelist)
     logger.info(f'Identified {n_files} files')

diff --git a/cuchem/cuchem/datasets/loaders.py b/cuchem/cuchem/datasets/loaders.py
index 45c9d982..6f19bc6f 100644
--- a/cuchem/cuchem/datasets/loaders.py
+++ b/cuchem/cuchem/datasets/loaders.py
@@ -66,7 +66,7 @@ def load(self, index=None):
         if 
self.index_col: data = data.set_index(self.index_col).sort_index() - if index: + if index is not None: data = data.loc[index] self.data = data return diff --git a/cuchem/cuchem/metrics/model.py b/cuchem/cuchem/metrics/model.py index c2d5fdb9..727ffc06 100644 --- a/cuchem/cuchem/metrics/model.py +++ b/cuchem/cuchem/metrics/model.py @@ -1,212 +1,268 @@ #!/usr/bin/env python3 import logging -import os +import pickle import cupy -import generativesampler_pb2 import numpy as np import pandas as pd -from rdkit import Chem from cuml.metrics import pairwise_distances from sklearn.model_selection import ParameterGrid, KFold from cuml.metrics.regression import mean_squared_error from cuchem.utils.metrics import spearmanr from cuchem.utils.distance import tanimoto_calculate -from cuchem.utils.dataset import ZINC_TRIE_DIR, generate_trie_filename -from functools import lru_cache +from cuchem.benchmark.data import BenchmarkData, TrainingData logger = logging.getLogger(__name__) -def sanitized_smiles(smiles): - """Ensure SMILES are valid and sanitized, otherwise fill with NaN.""" - mol = Chem.MolFromSmiles(smiles, sanitize=True) - if mol: - sanitized_smiles = Chem.MolToSmiles(mol) - return sanitized_smiles - else: - return np.NaN - - -def get_model_iteration(stub): - """Get Model iteration""" - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, - smiles="CCC", # all are dummy vars for this calculation - radius=0.0001, - numRequested=1, - padding=0) - - result = stub.GetIteration(spec) - return result.iteration - - class BaseSampleMetric(): + name = None + """Base class for metrics based on sampling for a single SMILES string""" - def __init__(self): - self.name = None + def __init__(self, inferrer): + self.inferrer = inferrer + self.benchmark_data = BenchmarkData() + self.training_data = TrainingData() + + def _find_similars_smiles(self, + smiles, + num_samples, + scaled_radius, + force_unique, + sanitize): + # Check db for results from a previous run + generated_smiles = self.benchmark_data.fetch_sampling_data(smiles, + num_samples, + scaled_radius, + force_unique, + sanitize) + if not generated_smiles: + # Generate new samples and update the database + result = self.inferrer.find_similars_smiles(smiles, + num_samples, + scaled_radius=scaled_radius, + force_unique=force_unique, + sanitize=sanitize) + # Result from sampler includes the input SMILES. Removing it. + # result = result[result.Generated == True] + generated_smiles = result['SMILES'].to_list() + + embeddings = result['embeddings'].to_list() + embeddings_dim = result['embeddings_dim'].to_list() + + # insert generated smiles into a database for use later. 
+ self.benchmark_data.insert_sampling_data(smiles, + num_samples, + scaled_radius, + force_unique, + sanitize, + generated_smiles, + embeddings, + embeddings_dim) + return generated_smiles + + + def _calculate_metric(self, metric_array, num_samples): + total_samples = len(metric_array) * num_samples + return np.nansum(metric_array) / float(total_samples) - def sample(self): + def variations(self, cfg, model_dict=None): return NotImplemented - def calculate_metric(self, metric_array, num_samples): - total_samples = len(metric_array) * num_samples - return np.nansum(metric_array) / float(total_samples) + def sample(self): + return NotImplemented - def sample_many(self, smiles_dataset, num_samples, func, radius): + def sample_many(self, smiles_dataset, num_samples, radius): metric_result = list() + for index in range(len(smiles_dataset.data)): smiles = smiles_dataset.data.iloc[index] - logger.info(f'SMILES: {smiles}') - result = self.sample(smiles, num_samples, func, radius) + logger.debug(f'Sampling around {smiles}...') + result = self.sample(smiles, num_samples, radius) metric_result.append(result) + return np.array(metric_result) - def calculate(self, smiles, num_samples, func, radius): - metric_array = self.sample_many(smiles, num_samples, func, radius) - metric = self.calculate_metric(metric_array, num_samples) - return pd.Series({'name': self.name, 'value': metric, 'radius': radius, 'num_samples': num_samples}) + def calculate(self, **kwargs): + smiles_dataset = kwargs['smiles_dataset'] + num_samples = kwargs['num_samples'] + radius = kwargs['radius'] + + metric_array = self.sample_many(smiles_dataset, num_samples, radius) + metric = self._calculate_metric(metric_array, num_samples) + + return pd.Series({'name': self.__class__.name, + 'value': metric, + 'radius': radius, + 'num_samples': num_samples}) class BaseEmbeddingMetric(): + name = None + """Base class for metrics based on embedding datasets""" - def __init__(self): - self.name = None + def __init__(self, inferrer): + self.inferrer = inferrer + self.benchmark_data = BenchmarkData() + + def variations(self, cfg): + return NotImplemented + + def _find_embedding(self, + smiles, + scaled_radius, + force_unique, + sanitize, + max_len): + num_samples = 1 + + # Check db for results from a previous run + generated_smiles = self.benchmark_data.fetch_n_sampling_data(smiles, + num_samples, + scaled_radius, + force_unique, + sanitize) + if not generated_smiles: + # Generate new samples and update the database + generated_smiles = self.inferrer.smiles_to_embedding(smiles, + max_len, + scaled_radius=scaled_radius, + num_samples=num_samples) + else: + temp = generated_smiles[0] + embedding = pickle.loads(temp[1]) + + generated_smiles = [] + generated_smiles.append(temp[0]) + generated_smiles.append(embedding) + generated_smiles.append(pickle.loads(temp[2])) + + return generated_smiles[0], generated_smiles[1], generated_smiles[2] - def sample(self, smiles, max_len, stub, zero_padded_vals, average_tokens): + def sample(self, smiles, max_len, zero_padded_vals, average_tokens): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, - smiles=smiles, - radius=0.0001, # dummy var for this calculation - numRequested=1, # dummy var for this calculation - padding=max_len) + smiles, embedding, dim = self._find_embedding(smiles, 1, False, True, max_len) - result = stub.SmilesToEmbedding(spec) - shape = [int(x) for x in result.embedding[:2]] - assert shape[0] == max_len - embedding = 
cupy.array(result.embedding[2:]) + embedding = cupy.array(embedding) + embedding = embedding.reshape(dim) - embedding = embedding.reshape(shape) if zero_padded_vals: embedding[len(smiles):, :] = 0.0 if average_tokens: embedding = embedding[:len(smiles)].mean(axis=0).squeeze() - assert embedding.shape[0] == shape[-1] + assert embedding.shape[0] == dim[-1] else: embedding = embedding.flatten() + return embedding - def calculate_metric(self): + def _calculate_metric(self): raise NotImplementedError - @lru_cache(maxsize=None) - def sample_many(self, smiles_dataset, stub, zero_padded_vals=True, average_tokens=False): + def sample_many(self, smiles_dataset, zero_padded_vals=True, average_tokens=False): # Calculate pairwise distances for embeddings embeddings = [] + max_len = 0 for smiles in smiles_dataset.data.to_pandas(): - embedding = self.sample(smiles, smiles_dataset.max_len, stub, zero_padded_vals, average_tokens) - embeddings.append(embedding) + embedding = self.sample(smiles, smiles_dataset.max_len, zero_padded_vals, average_tokens) + max_len = max(max_len, embedding.shape[0]) + embeddings.append(cupy.array(embedding)) + + if max_len > 0: + embeddings_resized = [] + for embedding in embeddings: + n_pad = max_len - embedding.shape[0] + if n_pad <= 0: + embeddings_resized.append(embedding) + continue + embedding = cupy.resize(embedding, max_len) + embeddings_resized.append(embedding) + embeddings = embeddings_resized return cupy.asarray(embeddings) - def calculate(self): + def calculate(self, **kwargs): raise NotImplementedError class Validity(BaseSampleMetric): - def __init__(self): - self.name = 'validity' - - def sample(self, smiles, num_samples, func, radius): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, - smiles=smiles, - radius=radius, - numRequested=num_samples) - - result = func(spec) - result = result.generatedSmiles[1:] - - if isinstance(smiles, list): - result = result[:-1] - assert len(result) == num_samples - result = len(pd.Series([sanitized_smiles(x) for x in result]).dropna()) - return result + name = 'validity' + + def __init__(self, inferrer): + super().__init__(inferrer) + + def variations(self, cfg, model_dict=None): + return cfg.metric.validity.radius_list + + def sample(self, smiles, num_samples, radius): + generated_smiles = self._find_similars_smiles(smiles, + num_samples, + scaled_radius=radius, + force_unique=False, + sanitize=True) + return len(generated_smiles) class Unique(BaseSampleMetric): - def __init__(self): - self.name = 'uniqueness' - - def sample(self, smiles, num_samples, func, radius): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, - smiles=smiles, - radius=radius, - numRequested=num_samples) - - result = func(spec) - result = result.generatedSmiles[1:] - - if isinstance(smiles, list): - result = result[:-1] - assert len(result) == num_samples - result = len(pd.Series([sanitized_smiles(x) for x in result]).dropna().unique()) - return result + name = 'uniqueness' + def __init__(self, inferrer): + super().__init__(inferrer) -class Novelty(BaseSampleMetric): - def __init__(self): - self.name = 'novelty' + def variations(self, cfg, model_dict=None): + return cfg.metric.unique.radius_list - def smiles_in_train(self, smiles): - """Determine if smiles was in training dataset""" - in_train = False - - filename = generate_trie_filename(smiles) - trie_path = os.path.join(ZINC_TRIE_DIR, 'train', filename) - if os.path.exists(trie_path): - with 
open(trie_path, 'r') as fh:
-                smiles_list = fh.readlines()
-                smiles_list = [x.strip() for x in smiles_list]
-                in_train = smiles in smiles_list
-        else:
-            logger.warn(f'Trie file {filename} not found.')
-            in_train = False
+    def sample(self, smiles, num_samples, radius):
+        generated_smiles = self._find_similars_smiles(smiles,
+                                                      num_samples,
+                                                      scaled_radius=radius,
+                                                      force_unique=False,
+                                                      sanitize=True)
+        # Keep only the unique ones
+        generated_smiles = set(generated_smiles)
+        return len(generated_smiles)

-        return in_train

-    def sample(self, smiles, num_samples, func, radius):
-        spec = generativesampler_pb2.GenerativeSpec(
-            model=generativesampler_pb2.GenerativeModel.MegaMolBART,
-            smiles=smiles,
-            radius=radius,
-            numRequested=num_samples)
+class Novelty(BaseSampleMetric):
+    name = 'novelty'
+
+    def __init__(self, inferrer):
+        super().__init__(inferrer)

-        result = func(spec)
-        result = result.generatedSmiles[1:]
+    def variations(self, cfg, model_dict=None):
+        return cfg.metric.novelty.radius_list

-        if isinstance(smiles, list):
-            result = result[:-1]
-        assert len(result) == num_samples
+    def smiles_in_train(self, smiles):
+        in_train = self.training_data.is_known_smiles(smiles)
+        return in_train
+
+    def sample(self, smiles, num_samples, radius):
+        generated_smiles = self._find_similars_smiles(smiles,
+                                                      num_samples,
+                                                      scaled_radius=radius,
+                                                      force_unique=False,
+                                                      sanitize=True)

-        result = pd.Series([sanitized_smiles(x) for x in result]).dropna()
-        result = sum([self.smiles_in_train(x) for x in result])
+        result = sum([self.smiles_in_train(x) for x in generated_smiles])
         return result

 class NearestNeighborCorrelation(BaseEmbeddingMetric):
     """Spearman's Rho for correlation of pairwise Tanimoto distances vs Euclidean distance from embeddings"""
-    def __init__(self):
-        self.name = 'nearest neighbor correlation'
+    name = 'nearest neighbor correlation'

-    def calculate_metric(self, embeddings, fingerprints, top_k=None):
+    def __init__(self, inferrer):
+        super().__init__(inferrer)
+
+    def variations(self, cfg, model_dict=None):
+        return cfg.metric.nearestNeighborCorrelation.top_k_list
+
+    def _calculate_metric(self, embeddings, fingerprints, top_k=None):
         embeddings_dist = pairwise_distances(embeddings)
         del embeddings

@@ -216,14 +272,20 @@ def calculate_metric(self, embeddings, fingerprints, top_k=None):
         corr = spearmanr(fingerprints_dist, embeddings_dist, top_k)
         return corr

-    def calculate(self, smiles_dataset, fingerprint_dataset, stub, top_k=None):
-        embeddings = self.sample_many(smiles_dataset, stub, zero_padded_vals=True, average_tokens=False)
+    def calculate(self, **kwargs):
+        smiles_dataset = kwargs['smiles_dataset']
+        fingerprint_dataset = kwargs['fingerprint_dataset']
+        top_k = kwargs['top_k']
+
+        embeddings = self.sample_many(smiles_dataset,
+                                      zero_padded_vals=True,
+                                      average_tokens=False)

         # Calculate pairwise distances for fingerprints
         fingerprints = cupy.fromDlpack(fingerprint_dataset.data.to_dlpack())
         fingerprints = cupy.asarray(fingerprints, order='C')

-        metric = self.calculate_metric(embeddings, fingerprints, top_k)
+        metric = self._calculate_metric(embeddings, fingerprints, top_k)
         metric = cupy.nanmean(metric)
         top_k = embeddings.shape[0] - 1 if not top_k else top_k
         return pd.Series({'name': self.name, 'value': metric, 'top_k':top_k})
@@ -231,14 +293,18 @@ def calculate(self, smiles_dataset, fingerprint_dataset, stub, top_k=None):

 class Modelability(BaseEmbeddingMetric):
     """Ability to model molecular properties from embeddings vs Morgan Fingerprints"""
+    name = 'modelability'

-    def 
__init__(self): - self.name = 'modelability' + def __init__(self, inferrer): + super().__init__(inferrer) self.embeddings = None + def variations(self, cfg, model_dict=None): + return model_dict.keys() + def gpu_gridsearch_cv(self, estimator, param_dict, xdata, ydata, n_splits=5): """Perform grid search with cross validation and return score""" - + best_score = np.inf for param in ParameterGrid(param_dict): estimator.set_params(**param) @@ -257,25 +323,48 @@ def gpu_gridsearch_cv(self, estimator, param_dict, xdata, ydata, n_splits=5): best_score = min(metric, best_score) return best_score - def calculate_metric(self, embeddings, fingerprints, properties, estimator, param_dict): + def _calculate_metric(self, embeddings, fingerprints, properties, estimator, param_dict): """Perform grid search for each metric and calculate ratio""" metric_array = [] + embedding_errors = [] + fingerprint_errors = [] for col in properties.columns: props = properties[col].astype(cupy.float32).to_array() embedding_error = self.gpu_gridsearch_cv(estimator, param_dict, embeddings, props) fingerprint_error = self.gpu_gridsearch_cv(estimator, param_dict, fingerprints, props) ratio = fingerprint_error / embedding_error # If ratio > 1.0 --> embedding error is smaller --> embedding model is better metric_array.append(ratio) - return cupy.array(metric_array) + embedding_errors.append(embedding_error) + fingerprint_errors.append(fingerprint_error) + + return cupy.array(metric_array), cupy.array(fingerprint_errors), cupy.array(embedding_errors) - def calculate(self, smiles_dataset, fingerprint_dataset, properties, stub, estimator, param_dict): - embeddings = self.sample_many(smiles_dataset, stub, zero_padded_vals=False, average_tokens=True) + + def calculate(self, **kwargs): + smiles_dataset = kwargs['smiles_dataset'] + fingerprint_dataset = kwargs['fingerprint_dataset'] + properties = kwargs['properties'] + estimator = kwargs['estimator'] + param_dict = kwargs['param_dict'] + + embeddings = self.sample_many(smiles_dataset, zero_padded_vals=False, average_tokens=True) embeddings = cupy.asarray(embeddings, dtype=cupy.float32) fingerprints = cupy.fromDlpack(fingerprint_dataset.data.to_dlpack()) fingerprints = cupy.asarray(fingerprints, order='C', dtype=cupy.float32) - metric = self.calculate_metric(embeddings, fingerprints, properties, estimator, param_dict) + metric, fingerprint_errors, embedding_errors = self._calculate_metric(embeddings, + fingerprints, + properties, + estimator, + param_dict) + logger.info(f'{type(metric)} {type(fingerprint_errors)} {type(embedding_errors)}') metric = cupy.nanmean(metric) - return pd.Series({'name': self.name, 'value': metric}) + fingerprint_errors = cupy.nanmean(fingerprint_errors) + embedding_errors = cupy.nanmean(embedding_errors) + + return pd.Series({'name': self.name, + 'value': metric, + 'fingerprint_error': fingerprint_errors, + 'embedding_error': embedding_errors}) diff --git a/cuchem/cuchem/utils/dataset.py b/cuchem/cuchem/utils/dataset.py index 1adddac9..8dc8c4f2 100644 --- a/cuchem/cuchem/utils/dataset.py +++ b/cuchem/cuchem/utils/dataset.py @@ -16,7 +16,7 @@ import re # ZINC dataset parameters -ZINC_CSV_DIR = '/data/zinc_csv' +ZINC_CSV_DIR = '/data/zinc_csv/train' # ZINC trie parameters ZINC_TRIE_DIR = '/data/zinc_trie' diff --git a/cuchem/cuchem/wf/generative/cddd.py b/cuchem/cuchem/wf/generative/cddd.py index e6c64ef1..c7a81408 100644 --- a/cuchem/cuchem/wf/generative/cddd.py +++ b/cuchem/cuchem/wf/generative/cddd.py @@ -22,6 +22,16 @@ def __init__(self, dao: 
GenerativeWfDao = ChemblGenerativeWfDao(None)) -> None:
         self.cddd_embeddings = Embeddings(model_dir=self.default_model_loc)
         self.min_jitter_radius = 0.5
 
+    def smiles_to_embedding(self, smiles: str, padding: int):
+        embedding = self.cddd_embeddings.func.seq_to_emb(smiles).squeeze()
+        return embedding
+
+    def embedding_to_smiles(self,
+                            embedding,
+                            dim: int,
+                            pad_mask):
+        return self.cddd_embeddings.inverse_transform(embedding)
+
     def find_similars_smiles_list(self,
                                   smiles: str,
                                   num_requested: int = 10,
@@ -30,11 +40,12 @@ def find_similars_smiles_list(self,
         radius = self._compute_radius(scaled_radius)
         embedding = self.cddd_embeddings.func.seq_to_emb(smiles).squeeze()
 
-        neighboring_embeddings = self.addjitter(embedding, radius, cnt=num_requested)
+        embeddings = self.addjitter(embedding, radius, cnt=num_requested)
 
         neighboring_embeddings = np.concatenate([embedding.reshape(1, embedding.shape[0]),
-                                                 neighboring_embeddings])
-        return self.cddd_embeddings.inverse_transform(neighboring_embeddings), neighboring_embeddings
+                                                 embeddings])
+        embeddings = [embedding] + embeddings
+        return self.cddd_embeddings.inverse_transform(neighboring_embeddings), embeddings
 
     def find_similars_smiles(self,
                              smiles: str,
@@ -45,26 +56,28 @@ def find_similars_smiles(self,
                                           num_requested=num_requested,
                                           scaled_radius=scaled_radius,
                                           force_unique=force_unique)
+        dims = []
+        for neighboring_embedding in neighboring_embeddings:
+            dims.append(neighboring_embedding.shape)
 
         generated_df = pd.DataFrame({'SMILES': generated_mols,
+                                     'embeddings': neighboring_embeddings,
+                                     'embeddings_dim': dims,
                                      'Generated': [True for i in range(len(generated_mols))]})
-        generated_df.iat[0, 1] = False
+        generated_df.iat[0, 2] = False
 
         if force_unique:
-            radius = self._compute_radius(scaled_radius)
             generated_df = self.compute_unique_smiles(generated_df,
-                                                      neighboring_embeddings,
                                                       self.cddd_embeddings.inverse_transform,
-                                                      radius=radius)
+                                                      scaled_radius=scaled_radius)
         return generated_df
 
-    def interpolate_from_smiles(self,
-                                smiles: List,
-                                num_points: int = 10,
-                                scaled_radius=None,
-                                force_unique=False):
+    def interpolate_smiles(self,
+                           smiles: List,
+                           num_points: int = 10,
+                           scaled_radius=None,
+                           force_unique=False):
 
-        radius = self._compute_radius(scaled_radius)
         num_points = int(num_points) + 2
         if len(smiles) < 2:
             raise Exception('At-least two or more smiles are expected')
@@ -75,25 +88,35 @@ def linear_interpolate_points(embedding, num_points):
         result_df = []
         for idx in range(len(smiles) - 1):
             data = pd.DataFrame({'transformed_smiles': [smiles[idx], smiles[idx + 1]]})
-            embeddings = np.asarray(self.cddd_embeddings.transform(data))
+            input_embeddings = np.asarray(self.cddd_embeddings.transform(data))
 
             interp_embeddings = np.apply_along_axis(linear_interpolate_points,
                                                     axis=0,
-                                                    arr=embeddings,
+                                                    arr=input_embeddings,
                                                     num_points=num_points)
-
-            interp_df = pd.DataFrame({'SMILES': self.cddd_embeddings.inverse_transform(interp_embeddings),
+            generated_mols = self.cddd_embeddings.inverse_transform(interp_embeddings)
+            interp_embeddings = interp_embeddings.tolist()
+
+            dims = []
+            embeddings = []
+            for interp_embedding in interp_embeddings:
+                dims.append(input_embeddings.shape)
+                interp_embedding = np.asarray(interp_embedding)
+                embeddings.append(interp_embedding)
+
+            interp_df = pd.DataFrame({'SMILES': generated_mols,
+                                      'embeddings': embeddings,
+                                      'embeddings_dim': dims,
                                       'Generated': [True for i in range(num_points)]})
 
             # Mark the source and destinations as not generated
-            interp_df.iat[0, 1] = False
-            interp_df.iat[-1, 1] = False
+            interp_df.iat[0, 2] = 
False + interp_df.iat[-1, 2] = False if force_unique: interp_df = self.compute_unique_smiles(interp_df, - interp_embeddings, self.cddd_embeddings.inverse_transform, - radius=radius) + scaled_radius=scaled_radius) result_df.append(interp_df) diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index b32a4628..066b1e2a 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -1,10 +1,12 @@ import logging -from typing import List - -import generativesampler_pb2 -import generativesampler_pb2_grpc import grpc import pandas as pd + +from typing import List + +from generativesampler_pb2_grpc import GenerativeSamplerStub +from generativesampler_pb2 import GenerativeSpec, EmbeddingList, GenerativeModel, google_dot_protobuf_dot_empty__pb2 + from cuchemcommon.data import GenerativeWfDao from cuchemcommon.data.generative_wf import ChemblGenerativeWfDao from cuchemcommon.utils.singleton import Singleton @@ -20,38 +22,74 @@ def __init__(self, dao: GenerativeWfDao = ChemblGenerativeWfDao(None)) -> None: self.min_jitter_radius = 1 channel = grpc.insecure_channel('megamolbart:50051') - self.stub = generativesampler_pb2_grpc.GenerativeSamplerStub(channel) + self.stub = GenerativeSamplerStub(channel) + + def get_iteration(self): + result = self.stub.GetIteration(google_dot_protobuf_dot_empty__pb2.Empty()) + return result.iteration + + def smiles_to_embedding(self, + smiles: str, + padding: int, + scaled_radius=None, + num_requested: int = 10): + spec = GenerativeSpec(smiles=[smiles], + padding=padding, + radius=scaled_radius, + numRequested=num_requested) + + result = self.stub.SmilesToEmbedding(spec) + return result + + def embedding_to_smiles(self, + embedding, + dim: int, + pad_mask): + spec = EmbeddingList(embedding=embedding, + dim=dim, + pad_mask=pad_mask) + + return self.stub.EmbeddingToSmiles(spec) def find_similars_smiles(self, smiles: str, num_requested: int = 10, scaled_radius=None, - force_unique=False): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, - smiles=smiles, - radius=scaled_radius, - numRequested=num_requested) + force_unique=False, + sanitize=True): + spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, + smiles=smiles, + radius=scaled_radius, + numRequested=num_requested, + forceUnique=force_unique, + sanitize=sanitize) result = self.stub.FindSimilars(spec) - result = result.generatedSmiles + generatedSmiles = result.generatedSmiles + embeddings = [] + dims = [] + for embedding in result.embeddings: + embeddings.append(list(embedding.embedding)) + dims.append(embedding.dim) - generated_df = pd.DataFrame({'SMILES': result, - 'Generated': [True for i in range(len(result))]}) + generated_df = pd.DataFrame({'SMILES': generatedSmiles, + 'embeddings': embeddings, + 'embeddings_dim': dims, + 'Generated': [True for i in range(len(generatedSmiles))]}) generated_df['Generated'].iat[0] = False return generated_df - def interpolate_from_smiles(self, - smiles: List, - num_points: int = 10, - scaled_radius=None, - force_unique=False): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, - smiles=smiles, - radius=scaled_radius, - numRequested=num_points) + def interpolate_smiles(self, + smiles: List, + num_points: int = 10, + scaled_radius=None, + force_unique=False): + spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, + smiles=smiles, + radius=scaled_radius, + 
numRequested=num_points, + forceUnique=force_unique) result = self.stub.Interpolate(spec) result = result.generatedSmiles diff --git a/cuchem/requirements.txt b/cuchem/requirements.txt index 76c9f5fb..7ceea23b 100644 --- a/cuchem/requirements.txt +++ b/cuchem/requirements.txt @@ -16,4 +16,13 @@ matplotlib==3.3.4 waitress==1.4.3 flask-restplus==0.13.0 locust==1.4.3 +hydra-core==1.1.1 +dask_ml==1.8.0 +locust==1.4.3 +dask==2021.8.1 +distributed==2021.8.1 +plotly==4.9.0 +pytest==6.2.2 +umap-learn==0.5.1 +grpcio git+https://github.com/jrwnter/cddd.git@1.0 \ No newline at end of file diff --git a/cuchem/tests/test_benchmark_data.py b/cuchem/tests/test_benchmark_data.py new file mode 100644 index 00000000..e1ed614f --- /dev/null +++ b/cuchem/tests/test_benchmark_data.py @@ -0,0 +1,20 @@ +import logging +from cuchem.benchmark.data import TrainingData + +logger = logging.getLogger(__name__) + + +def test_training_data(): + training_data = TrainingData() + + cursor = training_data.conn.cursor() + cursor.execute('SELECT smiles FROM train_data limit 10') + smiles_strs = cursor.fetchall() + + for smiles in smiles_strs: + logger.info(f'Looking for {smiles} in known smiles database...') + assert training_data.is_known_smiles(smiles[0]) == True + + smiles = 'adasdadsasdasd' + logger.info(f'Looking for {smiles} in known smiles database...') + assert training_data.is_known_smiles(smiles) == False \ No newline at end of file diff --git a/launch.sh b/launch.sh index b8bb9835..f71dd786 100755 --- a/launch.sh +++ b/launch.sh @@ -89,24 +89,28 @@ else fi build() { + local IMG_OPTION=$1 set -e DATE=$(date +%y%m%d) - IFS=':' read -ra CUCHEM_CONT_BASENAME <<< ${CUCHEM_CONT} - echo "Building ${CUCHEM_CONT_BASENAME}..." - docker build --network host \ - -t ${CUCHEM_CONT_BASENAME}:latest \ - -t ${CUCHEM_CONT_BASENAME}:${DATE} \ - -f Dockerfile.cuchem . - - IFS=':' read -ra MEGAMOLBART_CONT_BASENAME <<< ${MEGAMOLBART_CONT} - echo "Building ${MEGAMOLBART_CONT_BASENAME}..." - docker build --no-cache --network host \ - -t ${MEGAMOLBART_CONT_BASENAME}:latest \ - -t ${MEGAMOLBART_CONT_BASENAME}:${DATE} \ - --build-arg SOURCE_CONTAINER=${MEGAMOLBART_TRAINING_CONT} \ - -f Dockerfile.megamolbart \ - . + if [[ -z "${IMG_OPTION}" || "${IMG_OPTION}" == "1" ]]; then + IFS=':' read -ra CUCHEM_CONT_BASENAME <<< ${CUCHEM_CONT} + echo "Building ${CUCHEM_CONT_BASENAME}..." + docker build --network host \ + -t ${CUCHEM_CONT_BASENAME}:latest \ + -t ${CUCHEM_CONT} \ + -f Dockerfile.cuchem . + fi + + if [[ -z "${IMG_OPTION}" || "${IMG_OPTION}" == "2" ]]; then + IFS=':' read -ra MEGAMOLBART_CONT_BASENAME <<< ${MEGAMOLBART_CONT} + echo "Building ${MEGAMOLBART_CONT_BASENAME}..." + docker build --no-cache --network host \ + -t ${MEGAMOLBART_CONT_BASENAME}:latest \ + -t ${MEGAMOLBART_CONT} \ + --build-arg SOURCE_CONTAINER=${MEGAMOLBART_TRAINING_CONT} \ + -f Dockerfile.megamolbart . + fi set +e exit @@ -150,7 +154,14 @@ dev() { DOCKER_CMD="${DOCKER_CMD} -w /workspace/megamolbart/" CONT=${MEGAMOLBART_CONT} else +<<<<<<< HEAD DOCKER_CMD="${DOCKER_CMD} -e PYTHONPATH=${DEV_PYTHONPATH}" +======= + DOCKER_CMD="${DOCKER_CMD} --privileged" + DOCKER_CMD="${DOCKER_CMD} -v ${PROJECT_PATH}/chemportal/config:/etc/nvidia/cuChem/" + DOCKER_CMD="${DOCKER_CMD} -v /var/run/docker.sock:/var/run/docker.sock" + DOCKER_CMD="${DOCKER_CMD} -e PYTHONPATH=${DEV_PYTHONPATH}:" +>>>>>>> a3cae7e... Changes to improve runtime performance of benchmark tests. 
DOCKER_CMD="${DOCKER_CMD} -w /workspace/cuchem/" fi @@ -162,10 +173,7 @@ dev() { start() { if [[ -d "/opt/nvidia/cheminfomatics" ]]; then - # Executed within container or a managed env. - if [[ -d "/workspace/common/generated" ]]; then - PYTHONPATH="/workspace/cuchem:/workspace/common:/workspace/common/generated/" - fi + PYTHONPATH=/opt/nvidia/cheminfomatics/common/generated:/opt/nvidia/cheminfomatics/common:/opt/nvidia/cheminfomatics/cuchem:/opt/nvidia/cheminfomatics/chemportal dbSetup "${DATA_MOUNT_PATH}" cd ${CUCHEM_LOC}; python3 ${CUCHEM_LOC}/startdash.py analyze $@ else @@ -176,6 +184,7 @@ start() { export ADDITIONAL_PARAM="$@" export CUCHEM_PATH=/workspace export MEGAMOLBART_PATH=/workspace/megamolbart + export WORKSPACE_DIR='.' docker-compose --env-file .env \ -f setup/docker_compose.yml \ --project-directory . \ diff --git a/megamolbart/launch.py b/megamolbart/launch.py index 492c3c81..a950a380 100755 --- a/megamolbart/launch.py +++ b/megamolbart/launch.py @@ -29,13 +29,13 @@ import generativesampler_pb2_grpc from concurrent import futures from megamolbart.service import GenerativeSampler -from util import DEFAULT_MAX_SEQ_LEN logging.basicConfig(level=logging.INFO) - logger = logging.getLogger('megamolbart') -formatter = logging.Formatter( - '%(asctime)s %(name)s [%(levelname)s]: %(message)s') +formatter = logging.Formatter('%(asctime)s %(name)s [%(levelname)s]: %(message)s') + +from util import (DEFAULT_MAX_SEQ_LEN, DEFAULT_VOCAB_PATH, CHECKPOINTS_DIR, + DEFAULT_NUM_LAYERS, DEFAULT_D_MODEL, DEFAULT_NUM_HEADS) class Launcher(object): @@ -68,7 +68,7 @@ def __init__(self): logger.setLevel(logging.DEBUG) logger.info(f'Maximum decoded sequence length is set to {args.max_decode_length}') - + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server(GenerativeSampler(decoder_max_seq_len=args.max_decode_length), server) server.add_insecure_port(f'[::]:{args.port}') diff --git a/megamolbart/megamolbart/inference.py b/megamolbart/megamolbart/inference.py index 8a86df8c..90c7d09f 100644 --- a/megamolbart/megamolbart/inference.py +++ b/megamolbart/megamolbart/inference.py @@ -6,8 +6,8 @@ from typing import List from rdkit import Chem -import pandas as pd import torch +import pandas as pd from checkpointing import load_checkpoint from cuchemcommon.workflow import BaseGenerativeWorkflow, add_jitter from decoder import DecodeSampler @@ -23,27 +23,18 @@ @add_jitter.register(torch.Tensor) -def _(embedding, radius, cnt): +def _(embedding, radius, cnt, shape): + if shape is not None: + embedding = torch.reshape(embedding, (1, shape[0], shape[1])).to(embedding.device) permuted_emb = embedding.permute(1, 0, 2) - noise = torch.normal(0, radius, (cnt,) + permuted_emb.shape[1:]).to(embedding.device) - - return (noise + permuted_emb).permute(1, 0, 2) - -def clean_smiles_list(smiles_list, standardize=True): - """Ensure SMILES are valid and unique. 
Optionally standardize them.""" + distorteds = [] + for i in range(cnt): + noise = torch.normal(0, radius, permuted_emb.shape).to(embedding.device) + distorted = (noise + permuted_emb).permute(1, 0, 2) + distorteds.append(distorted) - smiles_clean_list = [] - for smiles in smiles_list: - mol = Chem.MolFromSmiles(smiles, sanitize=standardize) - if mol: - sanitized_smiles = Chem.MolToSmiles(mol) - if sanitized_smiles not in smiles_clean_list: - smiles_clean_list.append(sanitized_smiles) - - if len(smiles_clean_list) == 0: - smiles_clean_list = [np.NaN] - return smiles_clean_list + return distorteds class MegaMolBART(BaseGenerativeWorkflow): @@ -180,27 +171,31 @@ def inverse_transform(self, embeddings, model, mem_pad_mask, k=1, sanitize=True) batch_size = 1 # TODO: parallelize this loop as a batch with torch.no_grad(): - for memory in embeddings.permute(1, 0, 2): + for memory in embeddings: + + if isinstance(memory, list): + memory = torch.FloatTensor(memory).cuda() decode_fn = partial(model._decode_fn, mem_pad_mask=mem_pad_mask.type(torch.LongTensor).cuda(), memory=memory) - mol_strs, log_lhs = model.sampler.beam_decode(decode_fn, - batch_size=batch_size, - device='cuda', - k=k) + mol_strs, _ = self.model.sampler.beam_decode(decode_fn, + batch_size=batch_size, + device='cuda', + k=k) mol_strs = sum(mol_strs, []) # flatten list # TODO: add back sanitization and validity checking once model is trained logger.warn('WARNING: MOLECULE VALIDATION AND SANITIZATION CURRENTLY DISABLED') for smiles in mol_strs: - # mol = Chem.MolFromSmiles(smiles, sanitize=sanitize) - # if mol: - # sanitized_smiles = Chem.MolToSmiles(mol) - # if sanitized_smiles not in smiles_interp_list: - # smiles_interp_list.append(sanitized_smiles) - # break + if sanitize: + mol = Chem.MolFromSmiles(smiles, sanitize=sanitize) + if mol: + sanitized_smiles = Chem.MolToSmiles(mol) + smiles_interp_list.append(sanitized_smiles) + logger.debug(f'Sanitized SMILES {sanitized_smiles} added...') + break smiles_interp_list.append(smiles) return smiles_interp_list @@ -233,8 +228,20 @@ def interpolate_molecules(self, smiles1, smiles2, num_interp, tokenizer, k=1): interpolated_emb = torch.lerp(embedding1, embedding2, scale).cuda() # dims: batch, tokens, embedding combined_mask = (pad_mask1 & pad_mask2).bool().cuda() - return self.inverse_transform(interpolated_emb, self.model, k=k, mem_pad_mask=combined_mask, - sanitize=True), combined_mask + embeddings = [] + dims = [] + for emb in interpolated_emb.permute(1, 0, 2): + dims.append(emb.shape) + embeddings.append(emb) + + generated_mols = self.inverse_transform(embeddings, + combined_mask, + k=k, + sanitize=True) + generated_mols = [smiles1] + generated_mols + [smiles2] + embeddings = [embedding1] + embeddings + [embedding2] + dims = [embedding1.shape] + dims + [embedding2.shape] + return generated_mols, embeddings, combined_mask, dims def find_similars_smiles_list(self, smiles: str, @@ -248,10 +255,14 @@ def find_similars_smiles_list(self, neighboring_embeddings = self.addjitter(embedding, distance, cnt=num_requested) - generated_mols = self.inverse_transform(embeddings=neighboring_embeddings, model=self.model, - k=1, mem_pad_mask=pad_mask.bool().cuda(), sanitize=True) + generated_mols = self.inverse_transform(neighboring_embeddings, + pad_mask.bool().cuda(), + k=1, sanitize=True) + if force_unique: + generated_mols = list(set(generated_mols)) generated_mols = [smiles] + generated_mols + neighboring_embeddings = [embedding] + neighboring_embeddings return generated_mols, neighboring_embeddings, 
pad_mask def find_similars_smiles(self, @@ -259,39 +270,39 @@ def find_similars_smiles(self, num_requested: int = 10, scaled_radius=None, force_unique=False): - distance = self._compute_radius(scaled_radius) - logger.info(f'Computing with distance {distance}...') - generated_mols, neighboring_embeddings, pad_mask = \ self.find_similars_smiles_list(smiles, num_requested=num_requested, scaled_radius=scaled_radius, force_unique=force_unique) + # Rest of the applications and libraries use RAPIDS and cuPY libraries. + # For interoperability, we need to convert the embeddings to cupy. + embeddings = [] + dims = [] + for neighboring_embedding in neighboring_embeddings: + dims.append(neighboring_embedding.shape) + embeddings.append(neighboring_embedding.flatten().tolist()) + generated_df = pd.DataFrame({'SMILES': generated_mols, + 'embeddings': embeddings, + 'embeddings_dim': dims, 'Generated': [True for i in range(len(generated_mols))]}) - generated_df.iat[0, 1] = False + generated_df.iat[0, 3] = False if force_unique: inv_transform_funct = partial(self.inverse_transform, mem_pad_mask=pad_mask) generated_df = self.compute_unique_smiles(generated_df, - neighboring_embeddings, inv_transform_funct, - radius=distance) - - smile_list = list(generated_df['SMILES']) + scaled_radius=scaled_radius) + return generated_df - return generated_df, smile_list - - def interpolate_from_smiles(self, + def interpolate_smiles(self, smiles: List, num_points: int = 10, scaled_radius=None, force_unique=False): - distance = self._compute_radius(scaled_radius) - logger.info(f'Computing with distance {distance}...') - num_points = int(num_points) if len(smiles) < 2: raise Exception('At-least two or more smiles are expected') @@ -299,29 +310,34 @@ def interpolate_from_smiles(self, k = 1 result_df = [] for idx in range(len(smiles) - 1): - interpolated_mol = [smiles[idx]] - interpolated, combined_mask = self.interpolate_molecules(smiles[idx], - smiles[idx + 1], - num_points, - self.tokenizer, - k=k) - interpolated_mol += interpolated - interpolated_mol.append(smiles[idx + 1]) + interpolated_mol, interpolated_embeddings, combined_mask, dims = \ + self.interpolate_molecules(smiles[idx], + smiles[idx + 1], + num_points, + self.tokenizer, + k=k) + + # Rest of the applications and libraries use RAPIDS and cuPY libraries. + # For interoperability, we need to convert the embeddings to cupy. 
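+            # Each interpolated tensor is moved to host memory below; its
+            # original shape is kept alongside in 'embeddings_dim' so the
+            # caller can restore it.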
+            embeddings = []
+            for interpolated_embedding in interpolated_embeddings:
+                embeddings.append(interpolated_embedding.cpu())
 
             interp_df = pd.DataFrame({'SMILES': interpolated_mol,
+                                      'embeddings': embeddings,
+                                      'embeddings_dim': dims,
                                       'Generated': [True for i in range(len(interpolated_mol))]})
 
             inv_transform_funct = partial(self.inverse_transform,
                                           mem_pad_mask=combined_mask)
 
             # Mark the source and destinations as not generated
-            interp_df.iat[0, 1] = False
-            interp_df.iat[-1, 1] = False
+            interp_df.iat[0, 3] = False
+            interp_df.iat[-1, 3] = False
 
             if force_unique:
                 interp_df = self.compute_unique_smiles(interp_df,
-                                                       interpolated_mol,
                                                        inv_transform_funct,
-                                                       radius=distance)
+                                                       scaled_radius=scaled_radius)
 
             result_df.append(interp_df)
 
diff --git a/megamolbart/megamolbart/service.py b/megamolbart/megamolbart/service.py
index 5db80330..a4b160d9 100644
--- a/megamolbart/megamolbart/service.py
+++ b/megamolbart/megamolbart/service.py
@@ -1,13 +1,16 @@
 import logging
 
-import generativesampler_pb2
+import torch
+from generativesampler_pb2 import EmbeddingList, SmilesList, IterationVal
 import generativesampler_pb2_grpc
 from megamolbart.inference import MegaMolBART
 
+from cuchemcommon.utils import Singleton
+
 logger = logging.getLogger(__name__)
 
 
-class GenerativeSampler(generativesampler_pb2_grpc.GenerativeSampler):
+class GenerativeSampler(generativesampler_pb2_grpc.GenerativeSampler, metaclass=Singleton):
 
     def __init__(self, *args, **kwargs):
         decoder_max_seq_len = kwargs['decoder_max_seq_len'] if 'decoder_max_seq_len' in kwargs else None
@@ -28,31 +31,54 @@ def SmilesToEmbedding(self, spec, context):
 
         embedding, pad_mask = self.megamolbart.smiles2embedding(smile_str,
                                                                 pad_length=spec.padding)
 
-        embedding = embedding.squeeze()
-        shape = list(embedding.shape)
-        assert len(shape) == 2
-
-        embedding = shape + embedding.flatten().tolist()
-        return generativesampler_pb2.EmbeddingList(embedding=embedding)
+        dim = embedding.shape
+        embedding = embedding.flatten().tolist()
+        return EmbeddingList(embedding=embedding,
+                             dim=dim,
+                             pad_mask=pad_mask)
+
+    def EmbeddingToSmiles(self, embedding_spec, context):
+        '''
+        Converts input embedding to SMILES.
+        @param embedding_spec: Input spec with embedding and mask.
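+        @return: SmilesList with the decoded molecules.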
+ ''' + embedding = torch.FloatTensor(list(embedding_spec.embedding)) + pad_mask = torch.BoolTensor(list(embedding_spec.pad_mask)) + dim = tuple(embedding_spec.dim) + + embedding = torch.reshape(embedding, dim).cuda() + pad_mask = torch.reshape(pad_mask, (dim[0], 1)).cuda() + + generated_mols = self.megamolbart.inverse_transform(embedding, pad_mask) + return SmilesList(generatedSmiles=generated_mols) def FindSimilars(self, spec, context): smile_str = ''.join(spec.smiles) - _, generated_smiles = \ - self.megamolbart.find_similars_smiles( + generated_df = self.megamolbart.find_similars_smiles( smile_str, num_requested=spec.numRequested, - scaled_radius=spec.radius) - return generativesampler_pb2.SmilesList(generatedSmiles=generated_smiles) + scaled_radius=spec.radius, + force_unique=False) + + embeddings = [] + + for _, row in generated_df.iterrows(): + embeddings.append(EmbeddingList(embedding=row.embeddings, + dim=row.embeddings_dim)) + + return SmilesList(generatedSmiles=generated_df['SMILES'], + embeddings=embeddings) def Interpolate(self, spec, context): - _, generated_smiles = self.megamolbart.interpolate_from_smiles( + _, generated_smiles = self.megamolbart.interpolate_smiles( spec.smiles, num_points=spec.numRequested, - scaled_radius=spec.radius) - return generativesampler_pb2.SmilesList(generatedSmiles=generated_smiles) + scaled_radius=spec.radius, + force_unique=False) + return SmilesList(generatedSmiles=generated_smiles) def GetIteration(self, spec, context): - return generativesampler_pb2.IterationVal(iteration=self.iteration) + return IterationVal(iteration=self.iteration) diff --git a/megamolbart/tests/pytest.ini b/megamolbart/tests/pytest.ini new file mode 100644 index 00000000..7d448c5c --- /dev/null +++ b/megamolbart/tests/pytest.ini @@ -0,0 +1,2 @@ +[pytest] +filterwarnings = ignore::DeprecationWarning \ No newline at end of file diff --git a/megamolbart/tests/test_grpc.py b/megamolbart/tests/test_grpc.py index 8a2b1dbe..ce5c3a45 100644 --- a/megamolbart/tests/test_grpc.py +++ b/megamolbart/tests/test_grpc.py @@ -5,11 +5,10 @@ from concurrent import futures from contextlib import contextmanager -sys.path.insert(0, "generated") - from megamolbart.service import GenerativeSampler import generativesampler_pb2 import generativesampler_pb2_grpc +from util import (DEFAULT_NUM_LAYERS, DEFAULT_D_MODEL, DEFAULT_NUM_HEADS, CHECKPOINTS_DIR) logger = logging.getLogger(__name__) @@ -17,16 +16,29 @@ @contextmanager def similarity(add_server_method, service_cls, stub_cls): server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - add_server_method(service_cls(), server) + + add_server_method(service_cls(num_layers=DEFAULT_NUM_LAYERS, + hidden_size=DEFAULT_D_MODEL, + num_attention_heads=DEFAULT_NUM_HEADS, + checkpoints_dir=CHECKPOINTS_DIR, + vocab_path='/models/megamolbart/bart_vocab.txt',), + server) port = server.add_insecure_port('[::]:0') server.start() - try: with grpc.insecure_channel('localhost:%d' % port) as channel: yield stub_cls(channel) finally: server.stop(None) +def test_fetch_iterations(): + sys.argv = [sys.argv[0]] + with similarity(generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server, + GenerativeSampler, + generativesampler_pb2_grpc.GenerativeSamplerStub) as stub: + + result = stub.GetIteration(generativesampler_pb2.google_dot_protobuf_dot_empty__pb2.Empty()) + def test_dataframe_similar(): sys.argv = [sys.argv[0]] @@ -52,6 +64,6 @@ def test_dataframe_interpolate(): model=generativesampler_pb2.GenerativeModel.MegaMolBART, 
smiles=['CC(=O)Nc1ccc(O)cc1', 'CC(=O)Nc1ccc(O)'],
             radius=0.0005,
-            numPoints=10)
+            numRequested=10)
 
         result = stub.Interpolate(spec)
diff --git a/megamolbart/tests/test_megamolbart.py b/megamolbart/tests/test_megamolbart.py
index cf12deeb..bfabcad2 100644
--- a/megamolbart/tests/test_megamolbart.py
+++ b/megamolbart/tests/test_megamolbart.py
@@ -3,9 +3,7 @@
 import pandas as pd
 import torch
 
-import sys
-sys.path.insert(0, "/workspace/megamolbart_generative")
-from megamolbart_generative import MegaMolBART
+from megamolbart.inference import MegaMolBART
 
 if __name__ == '__main__':
@@ -33,8 +31,8 @@
     assert isinstance(mols_df_1, pd.DataFrame)
     assert isinstance(mols_df_1.loc[1, 'SMILES'], str)
 
-    mols_df_2 = wf.interpolate_from_smiles([smiles1, smiles2], num_interp)
+    mols_df_2 = wf.interpolate_smiles([smiles1, smiles2], num_interp)
     assert len(mols_df_2) == num_interp + 2
     assert isinstance(mols_df_2, pd.DataFrame)
     assert isinstance(mols_df_2.loc[1, 'SMILES'], str)
-
+
diff --git a/misc/triton/molbart/model.py b/misc/triton/molbart/model.py
index e55af9e4..c82c2e08 100644
--- a/misc/triton/molbart/model.py
+++ b/misc/triton/molbart/model.py
@@ -281,11 +281,17 @@ def addjitter(self,
                   cnt=1):
         return add_jitter(embedding, radius, cnt)
 
+    def _compute_radius(self, scaled_radius):
+        if scaled_radius:
+            return float(scaled_radius * self.min_jitter_radius)
+        else:
+            return self.min_jitter_radius
+
     def compute_unique_smiles(self,
                               interp_df,
                               embeddings,
                               embedding_funct,
-                              radius=0.5):
+                              scaled_radius=0.5):
         """
         Identify duplicate SMILES and distort their embeddings.
         The input df must have columns 'SMILES' and 'Generated' at 0th and 1st position.
@@ -295,6 +301,7 @@ def compute_unique_smiles(self,
         This function does not make any assumptions about order of embeddings.
         Instead it simply orders the df by SMILES to identify the duplicates.
         """
+        distance = self._compute_radius(scaled_radius)
 
         for i in range(5):
             smiles = interp_df['SMILES'].sort_values()
@@ -309,7 +316,7 @@ def compute_unique_smiles(self,
                     if interp_df.iat[dup_idx, 1]:
                         # add jitter to generated molecules only
                         embeddings[dup_idx] = self.addjitter(
-                            embeddings[dup_idx], radius, 1)
+                            embeddings[dup_idx], distance, 1)
                 smiles = embedding_funct(embeddings)
             else:
                 break
@@ -323,7 +330,7 @@ def compute_unique_smiles(self,
                 invalid_index = invalid_mol_df.index.to_list()
                 for idx in invalid_index:
                     embeddings[idx] = self.addjitter(embeddings[idx],
-                                                     radius,
+                                                     distance,
                                                      cnt=1)
                 smiles = embedding_funct(embeddings)
             else:
diff --git a/setup/docker_compose.yml b/setup/docker_compose.yml
index 78b459c3..0c1b74f8 100644
--- a/setup/docker_compose.yml
+++ b/setup/docker_compose.yml
@@ -17,7 +17,7 @@ services:
     ports:
       - "5000:5000"
     volumes:
-      - './:/workspace'
+      - "${WORKSPACE_DIR}/:/workspace"
       - "${CONTENT_PATH}/data:/data"
     user: "${UID}:${GID}"
     working_dir: ${CUCHEM_PATH}

From 92e4625eddba02ea2478b14cd0e703a194d82ffc Mon Sep 17 00:00:00 2001
From: Rajesh Ilango
Date: Fri, 3 Sep 2021 19:25:36 -0700
Subject: [PATCH 02/27] Bug fix after cherry-picking commit meant to upgrade
 to RAPIDS 21.08...

RAPIDS 21.08 includes the fix for a bug filed against PCA on multiple GPUs.

Other changes include:
- 'GPU KMeans-UMAP - Single and Multiple GPUs' is now the default clustering
  workflow. It performs better than 'GPU KMeans-UMAP' while reclustering
  (see the sketch below).
- 'MegatronMolBART' is now the default generative workflow
- Remove Apex installation during docker image creation.
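For context, the UI resolves both workflow classes from their dotted paths at
runtime, which is what makes swapping the defaults a one-line change. A minimal
sketch of that pattern (assuming `locate` is `pydoc.locate`, as used in
chemvisualize.py; the constructor arguments mirror startdash.py and the values
here are illustrative):

    from pydoc import locate

    # GpuKmeansUmapHybrid is now the default clustering workflow.
    wf_class = locate('cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid')
    workflow = wf_class(n_molecules=10000, pca_comps=64, n_clusters=7)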
--- Dockerfile.cuchem | 4 +-- Dockerfile.megamolbart | 16 +++-------- common/requirements.txt | 2 +- cuchem/cuchem/interactive/chemvisualize.py | 31 ++++++++++++++-------- cuchem/cuchem/wf/cluster/gpukmeansumap.py | 7 +++-- cuchem/startdash.py | 8 +++--- launch.sh | 4 --- megamolbart/megamolbart/inference.py | 4 +-- 8 files changed, 37 insertions(+), 39 deletions(-) diff --git a/Dockerfile.cuchem b/Dockerfile.cuchem index b04771c9..f5310562 100644 --- a/Dockerfile.cuchem +++ b/Dockerfile.cuchem @@ -16,10 +16,8 @@ RUN cd /opt/nvidia/cheminfomatics/common; \ pip install -r requirements.txt RUN cd /opt/nvidia/cheminfomatics/cuchem; \ pip install -r requirements.txt -RUN cd /opt/nvidia/cheminfomatics/chemportal; \ - pip install -r requirements.txt ENV UCX_LOG_LEVEL error -ENV PYTHONPATH ./common/generated:./common:./cuchem:/./chemportal +ENV PYTHONPATH ./common/generated:./common:./cuchem: CMD cd /opt/nvidia/cheminfomatics; ./launch.sh start diff --git a/Dockerfile.megamolbart b/Dockerfile.megamolbart index 04dab768..aeee8fd4 100644 --- a/Dockerfile.megamolbart +++ b/Dockerfile.megamolbart @@ -1,9 +1,9 @@ -# Copyright 2020 NVIDIA Corporation +# Copyright 2021 NVIDIA Corporation FROM nvcr.io/nvidia/pytorch:20.11-py3 RUN apt-get update \ && apt-get upgrade -y \ - && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git unzip tmux vim libxrender1 \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y wget git unzip tmux libxrender1 \ && rm -rf /var/lib/apt/lists/* ENV UCX_LOG_LEVEL error @@ -16,13 +16,6 @@ RUN echo "source activate base" > /etc/bash.bashrc COPY megamolbart/conda/env.yml /tmp/. RUN conda env update --name base -f /tmp/env.yml && conda clean -afy -## Apex -RUN cd /tmp && git clone https://github.com/NVIDIA/apex -RUN cd /tmp/apex/ \ - && /opt/conda/bin/python3 -m pip install -v \ - --disable-pip-version-check --no-cache-dir \ - --global-option="--cpp_ext" --global-option="--cuda_ext" ./ - ## PySMILES -- requirements handled by conda environment RUN git clone https://github.com/MolecularAI/pysmilesutils.git --branch master /opt/pysmilesutils \ && cd /opt/pysmilesutils; pip install . @@ -39,8 +32,7 @@ RUN cd /tmp/common; pip install . 
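 # Stage the generated gRPC stubs and expose them on PYTHONPATH.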
RUN mkdir -p /opt/nvidia/cuchem/grpc
 COPY common/generated /opt/nvidia/cuchem/grpc
-
-ENV PYTHONPATH /opt/nvidia/cuchem/grpc:/opt/nvidia/cheminfomatics/common:$PYTHONPATH
+ENV PYTHONPATH /opt/nvidia/cuchem/grpc:$PYTHONPATH
 
 COPY megamolbart/ /opt/nvidia/megamolbart/
 
-CMD cd /opt/nvidia/megamolbart && python3 launch.py
+CMD cd /opt/nvidia/megamolbart && python3 launch.py
\ No newline at end of file
diff --git a/common/requirements.txt b/common/requirements.txt
index b6fdab03..f538d5e2 100644
--- a/common/requirements.txt
+++ b/common/requirements.txt
@@ -1,2 +1,2 @@
-dask[complete]==2021.8.1
+dask[complete]
 sqlalchemy==1.3.20
diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index 9b3ef2c6..db103a7d 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -104,8 +104,8 @@ def __init__(self, cluster_wf):
         self.n_clusters = cluster_wf.n_clusters
         self.chem_data = ChEmblData()
         self.genreated_df = None
-        self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmap'
-        self.generative_wf_cls = 'cuchem.wf.generative.Cddd'
+        self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid'
+        self.generative_wf_cls = 'cuchem.wf.generative.MegatronMolBART'
 
         # Store colors to avoid plot color changes on events such as
         # molecule selection, etc.
@@ -198,8 +198,8 @@ def __init__(self, cluster_wf):
         self.app.callback(
             [Output('table_generated_molecules', 'children'),
              Output('show_generated_mol', 'children'),
-             Output('interpolation_error', 'children'),
-             Output('msg_generated_molecules', 'children')],
+             Output('msg_generated_molecules', 'children'),
+             Output('interpolation_error', 'children')],
            [Input("bt_generate", "n_clicks"), ],
            [State('sl_generative_wf', 'value'),
             State('ckl_candidate_mol_id', 'value'),
@@ -254,7 +254,7 @@ def handle_property_tables(self, show_generated_mol, show_selected_mol):
             return {'display': 'block', 'width': '100%'}, {'display': 'none'}
         return dash.no_update, dash.no_update
 
-    @report_ui_error(3)
+    @report_ui_error(4)
     def handle_generation(self, bt_generate, sl_generative_wf, ckl_candidate_mol_id,
                           n2generate, scaled_radius, rd_generation_type, show_generated_mol):
@@ -273,11 +273,15 @@ def handle_generation(self, bt_generate,
         scaled_radius = int(scaled_radius)
 
         if rd_generation_type == 'SAMPLE':
+            if chemble_ids is None or len(chemble_ids) == 0:
+                raise ValueError('Please select at least one molecule for Sampling.')
             self.genreated_df = generative_wf.find_similars_smiles_by_id(chemble_ids,
                                                                          num_requested=n2generate,
                                                                          scaled_radius=scaled_radius,
                                                                          force_unique=True)
         else:
+            if chemble_ids is None or len(chemble_ids) < 2:
+                raise ValueError('Please select at least two molecules for Interpolation.')
             self.genreated_df = generative_wf.interpolate_by_id(chemble_ids,
                                                                 num_points=n2generate,
                                                                 scaled_radius=scaled_radius,
@@ -294,7 +298,10 @@ def handle_generation(self, bt_generate,
         # Create Table header
         table_headers = []
         columns = self.genreated_df.columns.to_list()
+        ignore_columns = ['embeddings', 'embeddings_dim']
         for column in columns:
+            if column in ignore_columns:
+                continue
             table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'}))
 
         prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})]
@@ -314,6 +321,8 @@ def handle_generation(self, bt_generate,
             for col_id in range(len(columns)):
                 col_data = self.genreated_df.iat[row_idx, col_id]
 
+                if columns[col_id] in ignore_columns:
+                    continue
                 col_level = 'info'
                 if isinstance(col_data, 
dict): @@ -343,8 +352,8 @@ def handle_generation(self, bt_generate, return html.Table(prop_recs, style={'width': '100%', 'border': '1px solid lightgray'}), \ show_generated_mol, \ - dash.no_update, \ - msg_generated_molecules + msg_generated_molecules, \ + dash.no_update def handle_ckl_selection(self, ckl_candidate_mol_id, rd_generation_type): selection_msg = '**Please Select Two Molecules**' @@ -430,7 +439,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop moi_molregno = [] if north_stars: - moi_molregno = north_stars.split(",") + moi_molregno = list(map(int, north_stars.split(","))) moi_filter = ldf['id'].isin(moi_molregno) @@ -671,10 +680,10 @@ def constuct_layout(self): html.Div(children=[ dcc.Dropdown(id='sl_wf', multi=False, - options=[{'label': 'GPU KMeans-UMAP', - 'value': 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmap'}, - {'label': 'GPU KMeans-UMAP - Single and Multiple GPUs', + options=[{'label': 'GPU KMeans-UMAP - Single and Multiple GPUs', 'value': 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid'}, + {'label': 'GPU KMeans-UMAP', + 'value': 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmap'}, {'label': 'GPU KMeans-Random Projection - Single GPU', 'value': 'cuchem.wf.cluster.gpurandomprojection.GpuWorkflowRandomProjection'}, {'label': 'CPU KMeans-UMAP', diff --git a/cuchem/cuchem/wf/cluster/gpukmeansumap.py b/cuchem/cuchem/wf/cluster/gpukmeansumap.py index 8c0c7e8a..58aa739f 100644 --- a/cuchem/cuchem/wf/cluster/gpukmeansumap.py +++ b/cuchem/cuchem/wf/cluster/gpukmeansumap.py @@ -237,10 +237,13 @@ def add_molecules(self, chemblids: List): new_fingerprints[col] = prop_ser self.df_embedding = self._remove_ui_columns(self.df_embedding) - - # TODO: Should we maintain the original PCA result for use here self.df_embedding = self.df_embedding.append(new_fingerprints) + if hasattr(self.df_embedding, 'compute'): + self.df_embedding = self.df_embedding.compute() + + logger.info(self.df_embedding.shape) + return chem_mol_map, molregnos, self.df_embedding diff --git a/cuchem/startdash.py b/cuchem/startdash.py index c408cc5e..5cc5f5c8 100755 --- a/cuchem/startdash.py +++ b/cuchem/startdash.py @@ -330,10 +330,10 @@ def analyze(self): n_molecules = args.n_mol if not args.cpu: - from cuchem.wf.cluster.gpukmeansumap import GpuKmeansUmap - workflow = GpuKmeansUmap(n_molecules=n_molecules, - pca_comps=args.pca_comps, - n_clusters=args.num_clusters) + from cuchem.wf.cluster.gpukmeansumap import GpuKmeansUmapHybrid + workflow = GpuKmeansUmapHybrid(n_molecules=n_molecules, + pca_comps=args.pca_comps, + n_clusters=args.num_clusters) else: from cuchem.wf.cluster.cpukmeansumap import CpuKmeansUmap workflow = CpuKmeansUmap(n_molecules=n_molecules, diff --git a/launch.sh b/launch.sh index f71dd786..ccd0dbe2 100755 --- a/launch.sh +++ b/launch.sh @@ -154,14 +154,10 @@ dev() { DOCKER_CMD="${DOCKER_CMD} -w /workspace/megamolbart/" CONT=${MEGAMOLBART_CONT} else -<<<<<<< HEAD - DOCKER_CMD="${DOCKER_CMD} -e PYTHONPATH=${DEV_PYTHONPATH}" -======= DOCKER_CMD="${DOCKER_CMD} --privileged" DOCKER_CMD="${DOCKER_CMD} -v ${PROJECT_PATH}/chemportal/config:/etc/nvidia/cuChem/" DOCKER_CMD="${DOCKER_CMD} -v /var/run/docker.sock:/var/run/docker.sock" DOCKER_CMD="${DOCKER_CMD} -e PYTHONPATH=${DEV_PYTHONPATH}:" ->>>>>>> a3cae7e... Changes to improve runtime performance of benchmark tests. 
DOCKER_CMD="${DOCKER_CMD} -w /workspace/cuchem/"
     fi
 
diff --git a/megamolbart/megamolbart/inference.py b/megamolbart/megamolbart/inference.py
index 90c7d09f..4ea51b73 100644
--- a/megamolbart/megamolbart/inference.py
+++ b/megamolbart/megamolbart/inference.py
@@ -165,7 +165,7 @@ def smiles2embedding(self, smiles, pad_length=None):
         torch.cuda.empty_cache()
         return embedding, pad_mask
 
-    def inverse_transform(self, embeddings, model, mem_pad_mask, k=1, sanitize=True):
+    def inverse_transform(self, embeddings, mem_pad_mask, k=1, sanitize=True):
         mem_pad_mask = mem_pad_mask.clone()
         smiles_interp_list = []
 
@@ -176,7 +176,7 @@ def inverse_transform(self, embeddings, model, mem_pad_mask, k=1, sanitize=True)
             if isinstance(memory, list):
                 memory = torch.FloatTensor(memory).cuda()
 
-            decode_fn = partial(model._decode_fn,
+            decode_fn = partial(self.model._decode_fn,
                                 mem_pad_mask=mem_pad_mask.type(torch.LongTensor).cuda(),
                                 memory=memory)
 

From 4996a467dd597a1fa5b0a7ec0820e8333eb40d5b Mon Sep 17 00:00:00 2001
From: Rajesh Ilango
Date: Tue, 7 Sep 2021 11:22:31 -0700
Subject: [PATCH 03/27] Change default container version.

---
 setup/env.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/setup/env.sh b/setup/env.sh
index 76117258..be76bc68 100644
--- a/setup/env.sh
+++ b/setup/env.sh
@@ -42,8 +42,8 @@ else
     write_env=1
 fi
 
-CUCHEM_CONT=${CUCHEM_CONT:=nvcr.io/nvidia/clara/cheminformatics_demo:0.1.1}
-MEGAMOLBART_CONT=${MEGAMOLBART_CONT:=nvcr.io/nvidia/clara/megamolbart:0.1.1}
+CUCHEM_CONT=${CUCHEM_CONT:=nvcr.io/nvidia/clara/cheminformatics_demo:0.1.2}
+MEGAMOLBART_CONT=${MEGAMOLBART_CONT:=nvcr.io/nvidia/clara/megamolbart:0.1.2}
 MEGAMOLBART_MODEL=${MEGAMOLBART_MODEL:=nvidia/clara/megamolbart:0.1}
 PROJECT_PATH=${PROJECT_PATH:=$(pwd)}
 CONTENT_PATH=${CONTENT_PATH:=$(pwd)}

From 98f53f1b336d7628eb4a506897746efd4c9cd1a6 Mon Sep 17 00:00:00 2001
From: Rajesh Ilango
Date: Tue, 7 Sep 2021 13:45:32 -0700
Subject: [PATCH 04/27] Update README.

---
 README.md | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 1fd03e24..7f7a1ba3 100644
--- a/README.md
+++ b/README.md
@@ -24,16 +24,26 @@ To generate a template for `.env`, just run `./launch.sh` with no arguments.
 If `.env` does not exist, then a template will be written for you.
 
 ```
-CONT=nvcr.io/nvidia/clara/cheminformatics_demo:0.0.1
-JUPYTER_PORT=8888
+CUCHEM_CONT=nvcr.io/nvstaging/clara/cheminformatics_demo:latest
+MEGAMOLBART_CONT=nvcr.io/nvstaging/clara/megamolbart:latest
+MEGAMOLBART_MODEL=nvstaging/clara/megamolbart:0.1
+PROJECT_PATH=/home/rilango/Projects/github/cheminformatics
+CONTENT_PATH=/clara/testData/chemInformatics
+DATA_MOUNT_PATH=/data
 PLOTLY_PORT=5000
 DASK_PORT=9001
-PROJECT_PATH=/path/to/local/repo/dir
-DATA_PATH=/path/to/scratch/space
-DATA_MOUNT_PATH=/data
+SUBNET=192.177.100.0/16
+IP_CUCHEM_UI=192.177.100.1
+IP_MEGAMOLBART=192.177.100.2
+REGISTRY=nvcr.io
+REGISTRY_USER='$oauthtoken'
+REGISTRY_ACCESS_TOKEN=<>
+
 ```
 
 ### Getting Started
+Please install the NGC CLI from https://ngc.nvidia.com/setup/installers/cli and obtain an NGC API key from https://ngc.nvidia.com/setup/api-key.
+
 Once your environment is set up, the following commands should be all you need.
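For example, a typical first run boils down to the following (a sketch; `build`
and `start` are the targets defined in `launch.sh`):

```
./launch.sh build   # build the cuchem and megamolbart images
./launch.sh start   # fetch the data and model on first run, then start the UI
```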
Build your container:

From e7592dba264f054cfd6fea38a425aac3ccffe112 Mon Sep 17 00:00:00 2001
From: Rajesh Ilango
Date: Tue, 7 Sep 2021 14:01:20 -0700
Subject: [PATCH 05/27] Update README

Change directory names
---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 7f7a1ba3..8e92f69a 100644
--- a/README.md
+++ b/README.md
@@ -27,8 +27,8 @@ If `.env` does not exist, then a template will be written for you.
 CUCHEM_CONT=nvcr.io/nvstaging/clara/cheminformatics_demo:latest
 MEGAMOLBART_CONT=nvcr.io/nvstaging/clara/megamolbart:latest
 MEGAMOLBART_MODEL=nvstaging/clara/megamolbart:0.1
-PROJECT_PATH=/home/rilango/Projects/github/cheminformatics
-CONTENT_PATH=/clara/testData/chemInformatics
+PROJECT_PATH=/home/user_home/code/cheminformatics
+CONTENT_PATH=/home/user_home/data
 DATA_MOUNT_PATH=/data
 PLOTLY_PORT=5000

From 87d5c5fc90b7f6e77630f21db30facdbcdad63e9 Mon Sep 17 00:00:00 2001
From: Rajesh K Ilango
Date: Thu, 7 Oct 2021 08:40:00 -0700
Subject: [PATCH 06/27] Fix to allow a fractional radius scale from the UI.

---
 .gitignore                                 | 1 +
 cuchem/cuchem/interactive/chemvisualize.py | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index d657ae0d..52844387 100644
--- a/.gitignore
+++ b/.gitignore
@@ -34,3 +34,4 @@ benchmark.csv
 chemportal/frontend/node_modules/
 chemportal/outputs/
 outputs/
+**/*.onnx
\ No newline at end of file
diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index db103a7d..8a71897f 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -270,7 +270,7 @@ def handle_generation(self, bt_generate,
         wf_class = locate(self.generative_wf_cls)
         generative_wf = wf_class()
         n2generate = int(n2generate)
-        scaled_radius = int(scaled_radius)
+        scaled_radius = float(scaled_radius)
 
         if rd_generation_type == 'SAMPLE':

From 6fa7a8f764e7042cfd8a34c640025b4d37a44013 Mon Sep 17 00:00:00 2001
From: Rajesh K Ilango
Date: Thu, 14 Oct 2021 20:24:14 -0700
Subject: [PATCH 07/27] Remove the need for the NGC CLI.

---
 setup/env.sh | 42 +++++++++++++++++++++++++++++---------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/setup/env.sh b/setup/env.sh
index be76bc68..5097c356 100644
--- a/setup/env.sh
+++ b/setup/env.sh
@@ -132,7 +132,7 @@ dbSetup() {
         if [[ ! -e "${DATA_DIR}/chembl_27_sqlite.tar.gz" ]]; then
             wget -q --show-progress \
                 -O ${DATA_DIR}/chembl_27_sqlite.tar.gz \
-                ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_27/chembl_27_sqlite.tar.gz
+                https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_27/chembl_27_sqlite.tar.gz
             return_code=$?
             if [[ $return_code -ne 0 ]]; then
                 echo -e "${RED}${BOLD}ChEMBL database download failed. Please check network settings and disk space (25GB).${RESET}"
@@ -143,7 +143,7 @@ dbSetup() {
 
         wget -q --show-progress \
             -O ${DATA_DIR}/checksums.txt \
-            ftp://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_27/checksums.txt
+            https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_27/checksums.txt
 
         echo "Unzipping chembl db to ${DATA_DIR}..."
         CURR_DIR=$PWD;
@@ -169,12 +169,36 @@ dbSetup() {
 
 download_model() {
     set -e
-    if [[ ! 
-e "${MODEL_PATH}" ]]; then - mkdir -p ${MODEL_PATH} - echo -e "${YELLOW}Downloading model ${MEGAMOLBART_MODEL} to ${MODEL_PATH}...${RESET}" - ngc registry model download-version \ - --dest ${MODEL_PATH} \ - "${MEGAMOLBART_MODEL}" + local MEGAMOLBART_MODEL_PATH=${MODEL_PATH} + local MEGAMOLBART_MODEL_VERSION=$(echo ${MEGAMOLBART_MODEL} | cut -d ":" -f2) + + if [ -n "${ALT_MEGAMOLBART_MODEL}" ]; then + # This is an alternate path for developers to download from an + # alternate/pre-release location. Please add 'ALT_MEGAMOLBART_MODEL' + # to .env with the alternate path. ALT_MEGAMOLBART_MODEL can only be + # an NGC model and will require NGC installed and configured. + local MEGAMOLBART_MODEL_VERSION=$(echo ${ALT_MEGAMOLBART_MODEL} | cut -d ":" -f2) + + if [[ ! -e "${MEGAMOLBART_MODEL_PATH}/megamolbart_v${MEGAMOLBART_MODEL_VERSION}" ]]; then + local DOWNLOAD_URL=${MEGAMOLBART_MODEL_URL} + mkdir -p ${MEGAMOLBART_MODEL_PATH} + ngc registry model download-version \ + --dest ${MEGAMOLBART_MODEL_PATH} \ + "${ALT_MEGAMOLBART_MODEL}" + fi + elif [[ ! -e "${MEGAMOLBART_MODEL_PATH}/megamolbart_v${MEGAMOLBART_MODEL_VERSION}" ]]; then + local DOWNLOAD_URL="https://api.ngc.nvidia.com/v2/models/nvidia/clara/megamolbart/versions/${MEGAMOLBART_MODEL_VERSION}/zip" + echo -e "${YELLOW}Downloading model ${MEGAMOLBART_MODEL} to ${MEGAMOLBART_MODEL_PATH}...${RESET}" + + mkdir -p ${MEGAMOLBART_MODEL_PATH} + + wget -q --show-progress \ + --content-disposition ${DOWNLOAD_URL} \ + -O ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip + mkdir ${MEGAMOLBART_MODEL_PATH}/megamolbart_v${MEGAMOLBART_MODEL_VERSION} + unzip -q ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip \ + -d ${MEGAMOLBART_MODEL_PATH}/megamolbart_v${MEGAMOLBART_MODEL_VERSION} fi + set +e -} +} \ No newline at end of file From 67db0ad2e7e2bbfd99a12dcc243493f368ea4693 Mon Sep 17 00:00:00 2001 From: Rajesh K Ilango Date: Mon, 18 Oct 2021 23:20:27 -0700 Subject: [PATCH 08/27] Remove the need to login to ngc to download containers. --- setup/launch | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/setup/launch b/setup/launch index 744348c3..010b5816 100755 --- a/setup/launch +++ b/setup/launch @@ -82,20 +82,6 @@ start() { echo "${CUCHEM_CONT} ${MEGAMOLBART_CONT}" export ADDITIONAL_PARAM="$@" - if [ -z ${REGISTRY_ACCESS_TOKEN} ]; then - echo "${RED}Please ensure 'REGISTRY_ACCESS_TOKEN' in $LOCAL_ENV is correct and rerun this script. Please set NGC API key to REGISTRY_ACCESS_TOKEN.${RESET}" - exit - else - echo "${YELLOW}Attempting docker login to ${REGISTRY}.${RESET}" - fi - - docker login ${REGISTRY} -u ${REGISTRY_USER} -p ${REGISTRY_ACCESS_TOKEN} - if [[ $? -ne 0 ]]; then - echo "${RED}Docker login failed. Please setup ngc('ngc config set'). 
" - echo "Please also check network settings and ensure 'REGISTRY_ACCESS_TOKEN' is $LOCAL_ENV is correct.${RESET}" - exit 1 - fi - download_model dbSetup ${DATA_PATH} From 3297c5421912f3528fbbc733c45690275b6438e8 Mon Sep 17 00:00:00 2001 From: Rajesh K Ilango Date: Thu, 28 Oct 2021 14:50:50 -0700 Subject: [PATCH 09/27] Upgrade dask and distributed python module to address security issue flagged by github --- .gitignore | 6 +++++- cuchem/requirements.txt | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 52844387..194b7a57 100644 --- a/.gitignore +++ b/.gitignore @@ -34,4 +34,8 @@ benchmark.csv chemportal/frontend/node_modules/ chemportal/outputs/ outputs/ -**/*.onnx \ No newline at end of file +**/*.onnx + +data/ +models/ + diff --git a/cuchem/requirements.txt b/cuchem/requirements.txt index 7ceea23b..3ac133b2 100644 --- a/cuchem/requirements.txt +++ b/cuchem/requirements.txt @@ -19,8 +19,8 @@ locust==1.4.3 hydra-core==1.1.1 dask_ml==1.8.0 locust==1.4.3 -dask==2021.8.1 -distributed==2021.8.1 +dask==2021.10.0 +distributed==2021.10.0 plotly==4.9.0 pytest==6.2.2 umap-learn==0.5.1 From d5f26ee6d22eca97179955e2a79c525b81f2e82e Mon Sep 17 00:00:00 2001 From: Rajesh K Ilango Date: Thu, 11 Nov 2021 08:04:17 -0800 Subject: [PATCH 10/27] Remove docker-compose from pre-req. Now docker-compose will be downloaded the first time the application is launched. Another changes is upgrade ansible script to install on ubuntu 20.04. --- setup/ansible-nvidia-driver.yml | 35 ++++++++++++++++----------------- setup/env.sh | 13 +++--------- setup/launch | 9 ++------- 3 files changed, 22 insertions(+), 35 deletions(-) diff --git a/setup/ansible-nvidia-driver.yml b/setup/ansible-nvidia-driver.yml index 7526ded5..6ea2074f 100644 --- a/setup/ansible-nvidia-driver.yml +++ b/setup/ansible-nvidia-driver.yml @@ -5,12 +5,12 @@ tasks: - name: Add CUDA apt-key apt_key: - url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub + url: https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub state: present - name: Add CUDA apt repository apt_repository: - repo: 'deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /' + repo: 'deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/ /' state: present filename: nvidia update_cache: yes @@ -27,19 +27,13 @@ state: present update_cache: yes with_items: - - 'deb https://nvidia.github.io/libnvidia-container/stable/ubuntu18.04/amd64/ /' - - 'deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu18.04/amd64/ /' - - 'deb https://nvidia.github.io/nvidia-docker/ubuntu18.04/amd64/ /' + - 'deb https://nvidia.github.io/libnvidia-container/stable/ubuntu20.04/amd64/ /' + - 'deb https://nvidia.github.io/nvidia-container-runtime/stable/ubuntu20.04/amd64/ /' + - 'deb https://nvidia.github.io/nvidia-docker/ubuntu20.04/amd64/ /' register: nvidia_container_runtime_apt_repo - - name: Remove "nvidia-*" packages - apt: pkg=nvidia-* state=absent purge=yes - - - name: Remove "cuda-*" packages - apt: pkg=cuda-* state=absent purge=yes - - name: Install Nvidia Driver - apt: pkg=nvidia-driver-460 state=present + apt: pkg=nvidia-driver-470 state=present - name: Install CUDA Toolkit apt: pkg=nvidia-container-toolkit update_cache=yes state=present @@ -139,13 +133,18 @@ - name: Rebooting machine reboot: -- name: Start App +- name: Cheminfo Setup hosts: all become: yes tasks: - - name: Start Cheminfo container - command: docker run --name 
cheminfo --gpus all -d -v /tmp/:/data -p 5000:5000 nvcr.io/nvidia/clara/cheminformatics_demo:0.0.1
-
-    - name: Progress
-      command: docker logs -f cheminfo
\ No newline at end of file
+    - name: Checkout cheminformatics github repo
+      ansible.builtin.git:
+        repo: https://github.com/NVIDIA/cheminformatics
+        dest: /opt/cheminfomatics
+        version: dev
+
+    - name: Execute Setup
+      command: /opt/cheminfomatics/launch.sh setup
+      args:
+        chdir: /opt/cheminfomatics/
diff --git a/setup/env.sh b/setup/env.sh
index 5097c356..94a206b7 100644
--- a/setup/env.sh
+++ b/setup/env.sh
@@ -89,16 +89,9 @@ else
 fi
 
 if [[ ! -d "/opt/nvidia/cheminfomatics" ]]; then
-    DOCKER_COMPOSE_SUPPORTED="1.29.1"
-    if [ -x "$(command -v docker-compose)" ]; then
-        DOCKER_COMPOSE_VERSION=$(docker-compose version --short)
-        if [ "$(version "$DOCKER_COMPOSE_SUPPORTED")" -gt "$(version "$DOCKER_COMPOSE_VERSION")" ]; then
-            echo "${RED}${BOLD}Please upgrade docker-compose to ${DOCKER_COMPOSE_SUPPORTED} from https://docs.docker.com/compose/install/.${RESET}"
-            exit 1
-        fi
-    else
-        echo -e "${RED}${BOLD}Please install docker-compose. Version ${DOCKER_COMPOSE_SUPPORTED} or better. https://docs.docker.com/compose/install/${RESET}"
-        exit 1
+    if [[ ! -e "docker-compose" ]]; then
+        curl -L "https://github.com/docker/compose/releases/download/1.29.2/docker-compose-$(uname -s)-$(uname -m)" -o docker-compose
+        chmod +x docker-compose
     fi
 fi

diff --git a/setup/launch b/setup/launch
index 010b5816..77bed191 100755
--- a/setup/launch
+++ b/setup/launch
@@ -68,11 +68,6 @@ EOF
     exit
 }
 
-if [ ! -x "$(command -v ngc)" ]; then
-    echo "${RED}Please install NGC CLI. https://docs.nvidia.com/dgx/ngc-registry-cli-user-guide/index.html${RESET}"
-    exit 1
-fi
-
 source env.sh
 
 
@@ -88,7 +83,7 @@ start() {
     export ADDITIONAL_PARAM="$@"
     export CUCHEM_PATH=/opt/nvidia/cheminfomatics
     export MEGAMOLBART_PATH=/opt/nvidia/megamolbart
-    docker-compose --env-file .env \
+    ./docker-compose --env-file .env \
        -f docker_compose.yml \
        --project-directory . \
        up
}


stop() {
-    docker-compose --env-file .env \
+    ./docker-compose --env-file .env \
        -f docker_compose.yml \
        --project-directory . \
        down

From ca0918c26a1c9cfe79c449aee725983c4ebccafa Mon Sep 17 00:00:00 2001
From: Doruk Ozturk
Date: Wed, 5 Jan 2022 11:26:23 -0500
Subject: [PATCH 11/27] Get the MegaMolBART URL from env vars

---
 cuchem/cuchem/wf/generative/megatronmolbart.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py
index 066b1e2a..e13f09a5 100644
--- a/cuchem/cuchem/wf/generative/megatronmolbart.py
+++ b/cuchem/cuchem/wf/generative/megatronmolbart.py
@@ -1,4 +1,5 @@
 import logging
+import os
 
 import grpc
 import pandas as pd
@@ -21,7 +22,7 @@ def __init__(self, dao: GenerativeWfDao = ChemblGenerativeWfDao(None)) -> None:
         super().__init__(dao)
         self.min_jitter_radius = 1
-        channel = grpc.insecure_channel('megamolbart:50051')
+        channel = grpc.insecure_channel(os.getenv('Megamolbart', 'megamolbart:50051'))
         self.stub = GenerativeSamplerStub(channel)

From eb6f92275b2536b8ec0247f05c43b5ab4b8f2078 Mon Sep 17 00:00:00 2001
From: Rajesh Ilango
Date: Tue, 25 Jan 2022 18:01:08 -0800
Subject: [PATCH 12/27] Fix vocab file issue when starting the container
 standalone. Also remove the clear-cache option when building the docker
 image.
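In short, when no model volume is mounted, the service now seeds the vocab
path the inference wrapper expects from the copy that ships inside the
container; the logic added to megamolbart/launch.py is roughly:

    import os
    import shutil

    from util import DEFAULT_VOCAB_PATH

    if not os.path.exists(DEFAULT_VOCAB_PATH):
        os.makedirs(os.path.dirname(DEFAULT_VOCAB_PATH), exist_ok=True)
        shutil.copy('/opt/MolBART/bart_vocab.txt', DEFAULT_VOCAB_PATH)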
--- launch.sh | 2 +- megamolbart/launch.py | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/launch.sh b/launch.sh index ccd0dbe2..a77b46d4 100755 --- a/launch.sh +++ b/launch.sh @@ -105,7 +105,7 @@ build() { if [[ -z "${IMG_OPTION}" || "${IMG_OPTION}" == "2" ]]; then IFS=':' read -ra MEGAMOLBART_CONT_BASENAME <<< ${MEGAMOLBART_CONT} echo "Building ${MEGAMOLBART_CONT_BASENAME}..." - docker build --no-cache --network host \ + docker build --network host \ -t ${MEGAMOLBART_CONT_BASENAME}:latest \ -t ${MEGAMOLBART_CONT} \ --build-arg SOURCE_CONTAINER=${MEGAMOLBART_TRAINING_CONT} \ diff --git a/megamolbart/launch.py b/megamolbart/launch.py index a950a380..fd752a1a 100755 --- a/megamolbart/launch.py +++ b/megamolbart/launch.py @@ -18,6 +18,7 @@ import sys import atexit import logging +import shutil import logging import warnings @@ -28,7 +29,7 @@ import grpc import generativesampler_pb2_grpc from concurrent import futures -from megamolbart.service import GenerativeSampler +from util import DEFAULT_VOCAB_PATH logging.basicConfig(level=logging.INFO) logger = logging.getLogger('megamolbart') @@ -69,6 +70,11 @@ def __init__(self): logger.info(f'Maximum decoded sequence length is set to {args.max_decode_length}') + if not os.path.exists(DEFAULT_VOCAB_PATH): + os.makedirs(os.path.dirname(DEFAULT_VOCAB_PATH), exist_ok=True) + shutil.copy('/opt/MolBART/bart_vocab.txt', DEFAULT_VOCAB_PATH) + from megamolbart.service import GenerativeSampler + server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server(GenerativeSampler(decoder_max_seq_len=args.max_decode_length), server) server.add_insecure_port(f'[::]:{args.port}') From 84b58cc2b11f24de38f3bb3e309f8a0545b3e391 Mon Sep 17 00:00:00 2001 From: Rajesh Ilango Date: Wed, 26 Jan 2022 09:55:49 -0800 Subject: [PATCH 13/27] Change to download model when setup is not executed --- megamolbart/launch.py | 26 +++++++++++++++++--------- megamolbart/scripts/download_model.sh | 14 ++++++++++++++ 2 files changed, 31 insertions(+), 9 deletions(-) create mode 100755 megamolbart/scripts/download_model.sh diff --git a/megamolbart/launch.py b/megamolbart/launch.py index fd752a1a..a41621be 100755 --- a/megamolbart/launch.py +++ b/megamolbart/launch.py @@ -16,15 +16,12 @@ import os import sys -import atexit import logging import shutil import logging -import warnings import argparse - -from datetime import datetime +from subprocess import run import grpc import generativesampler_pb2_grpc @@ -70,18 +67,29 @@ def __init__(self): logger.info(f'Maximum decoded sequence length is set to {args.max_decode_length}') - if not os.path.exists(DEFAULT_VOCAB_PATH): - os.makedirs(os.path.dirname(DEFAULT_VOCAB_PATH), exist_ok=True) - shutil.copy('/opt/MolBART/bart_vocab.txt', DEFAULT_VOCAB_PATH) + if not os.path.exists('/models/megamolbart/checkpoints/'): + self.download_megamolbart_model() from megamolbart.service import GenerativeSampler server = grpc.server(futures.ThreadPoolExecutor(max_workers=10)) - generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server(GenerativeSampler(decoder_max_seq_len=args.max_decode_length), server) + generativesampler_pb2_grpc.add_GenerativeSamplerServicer_to_server( + GenerativeSampler(decoder_max_seq_len=args.max_decode_length), server) server.add_insecure_port(f'[::]:{args.port}') server.start() server.wait_for_termination() - + def download_megamolbart_model(self): + """ + Downloads MegaMolBART model from NGC. 
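+        Delegates to scripts/download_model.sh and raises an Exception
+        if the script exits with a non-zero status.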
+        """
+        download_script = '/opt/nvidia/megamolbart/scripts/download_model.sh'
+        if os.path.exists(download_script):
+            logger.info('Triggering model download...')
+            # Capture the script's output so the log statements below have
+            # something to report; without capture_output these fields are None.
+            result = run(['bash', download_script], capture_output=True, text=True)
+            logger.info(f'Model download stdout: {result.stdout}')
+            logger.info(f'Model download stderr: {result.stderr}')
+            if result.returncode != 0:
+                raise Exception('Error downloading model')

 def main():
     Launcher()
diff --git a/megamolbart/scripts/download_model.sh b/megamolbart/scripts/download_model.sh
new file mode 100755
index 00000000..c77446f6
--- /dev/null
+++ b/megamolbart/scripts/download_model.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+MEGAMOLBART_MODEL_VERSION=0.1
+MEGAMOLBART_MODEL_PATH=/models/megamolbart
+
+DOWNLOAD_URL="https://api.ngc.nvidia.com/v2/models/nvidia/clara/megamolbart/versions/${MEGAMOLBART_MODEL_VERSION}/zip"
+echo -e "Downloading model megamolbart to ${MEGAMOLBART_MODEL_PATH}..."
+
+mkdir -p ${MEGAMOLBART_MODEL_PATH}
+set -x
+wget -q --show-progress \
+    --content-disposition ${DOWNLOAD_URL} \
+    -O ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip
+unzip -q ${MEGAMOLBART_MODEL_PATH}/megamolbart_${MEGAMOLBART_MODEL_VERSION}.zip \
+    -d ${MEGAMOLBART_MODEL_PATH}

From baab028405bdb42a5aa4bfdcc3364f8637820347 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Sun, 30 Jan 2022 02:44:58 -0800
Subject: [PATCH 14/27] Changes to UI and backend support

---
 cuchem/cuchem/interactive/chemvisualize.py    | 812 ++++++++++++++++--
 .../cuchem/wf/generative/megatronmolbart.py   | 434 ++++++++++
 2 files changed, 1166 insertions(+), 80 deletions(-)

diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index 8a71897f..b1dba90d 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -23,6 +23,18 @@
 from rdkit import Chem
 from rdkit.Chem import Draw, PandasTools

+# Check if all of these are needed:
+from cuchemcommon.fingerprint import MorganFingerprint, INTEGER_NBITS
+import sys
+import numpy as np
+import pandas as pd
+import dask_cudf
+from dask.distributed import wait
+from rdkit import DataStructs, Chem
+from rdkit.Chem.rdmolfiles import MolFromSmiles
+from rdkit.Chem.Scaffolds.MurckoScaffold import MurckoScaffoldSmilesFromSmiles
+import time
+
 logger = logging.getLogger(__name__)

 main_fig_height = 700
@@ -95,6 +107,17 @@ def download_sdf():
                     headers={"Content-disposition": "attachment; filename=download.sdf"})


+# NOTE: popcll (a 64-bit popcount usable inside cudf apply_rows kernels) is
+# assumed to be provided elsewhere; it is not imported in this hunk.
+def popcll_wrapper(ip_col, op_col):
+    for i, n in enumerate(ip_col):
+        op_col[i] = popcll(n)
+
+def popcll_wrapper_dask(df, ip_col, op_col):
+    df = df.apply_rows(popcll_wrapper, incols = {ip_col: 'ip_col'}, outcols = {op_col: int}, kwargs = {})
+    return df[op_col]
+
+def intersection_wrapper(fp_int_col, op_col, query_fp_int):
+    for i, fp_int in enumerate(fp_int_col):
+        op_col[i] = popcll(fp_int & query_fp_int)

 class ChemVisualization(metaclass=Singleton):

@@ -107,6 +130,10 @@ def __init__(self, cluster_wf):
         self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid'
         self.generative_wf_cls = 'cuchem.wf.generative.MegatronMolBART'

+        self.fp_df = None # all fingerprints of all ChemBl compounds and their IDs as a pandas dataframe
+        self.fingerprint_radius = 2    # default Morgan radius, matching the UI input default
+        self.fingerprint_nBits = 512   # default fingerprint size, matching the UI input default
+
         # Store colors to avoid plots changes colors on events such as
         # molecule selection, etc.
self.cluster_colors = generate_colors(self.n_clusters) @@ -136,7 +163,9 @@ def __init__(self, cluster_wf): Input('bt_north_star', 'n_clicks'), Input('sl_prop_gradient', 'value'), Input('sl_nclusters', 'value'), - Input('refresh_main_fig', 'children')], + Input('refresh_main_fig', 'children'), + Input('fingerprint_radius', 'value'), + Input('fingerprint_nBits', 'value')], [State("selected_clusters", "value"), State("main-figure", "selectedData"), State('north_star', 'value'), @@ -185,10 +214,19 @@ def __init__(self, cluster_wf): Input('bt_reset_candidates', 'n_clicks'), ], State('genration_candidates', 'children'))(self.handle_add_candidate) + self.app.callback( + Output('analoguing_candidates', 'children'), + [Input({'role': 'bt_analoguing_candidate', 'chemblId': ALL, 'molregno': ALL}, 'n_clicks')], + State('analoguing_candidates', 'children'))(self.handle_analoguing_candidate) + self.app.callback( Output('ckl_candidate_mol_id', 'options'), Input('genration_candidates', 'children'))(self.handle_construct_candidates) + self.app.callback( + Output('ckl_analoguing_mol_id', 'options'), + Input('analoguing_candidates', 'children'))(self.handle_construct_candidates2) + self.app.callback( [Output('ckl_candidate_mol_id', 'value'), Output('mk_selection_msg', 'children')], @@ -196,6 +234,10 @@ def __init__(self, cluster_wf): Input('rd_generation_type', 'value')])(self.handle_ckl_selection) self.app.callback( + [Output('ckl_analoguing_mol_id', 'value')], + [Input('ckl_analoguing_mol_id', 'value')])(self.handle_analoguing_ckl_selection) + +""" self.app.callback( [Output('table_generated_molecules', 'children'), Output('show_generated_mol', 'children'), Output('msg_generated_molecules', 'children'), @@ -207,6 +249,50 @@ def __init__(self, cluster_wf): State('scaled_radius', 'value'), State('rd_generation_type', 'value'), State('show_generated_mol', 'children')])(self.handle_generation) +""" + + self.app.callback( + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + Output('table_generated_molecules', 'children'), + Output('show_generated_mol', 'children'), + Output('interpolation_error', 'children'), ], + [Input("bt_generate", "n_clicks"), ], + [State('sl_generative_wf', 'value'), + State('ckl_candidate_mol_id', 'value'), + State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), + State('scaled_radius', 'value'), + State('rd_generation_type', 'value'), + State('show_generated_mol', 'children')])(self.handle_generation) + + self.app.callback( + [Output('section_fitting', 'style'), + Output('fitting_figure', 'figure')], + [Input("bt_fit", "n_clicks"),], + [State('sl_featurizing_wf', 'value'), + State('fit_nn_compound_property', 'value'), + State('fit_nn_train_cluster_number', 'value'), + State('fit_nn_test_cluster_number', 'value'), + State('fit_nn_hidden_layer_sizes', 'value'), + State('fit_nn_activation_fn', 'value'), + State('fit_nn_final_activation_fn', 'value'), + State('fit_nn_max_epochs', 'value'), + State('fit_nn_learning_rate', 'value'), + State('fit_nn_weight_decay', 'value'), + State('fit_nn_batch_size', 'value')])(self.handle_fitting) + + self.app.callback( + [Output('section_analoguing', 'style'), + Output('tb_analoguing', 'children')], + [Input("bt_analoguing", "n_clicks"),], + [State('ckl_analoguing_mol_id', 'value'), + State('analoguing_n_analogues', 'value'), + State('analoguing_threshold', 'value'), + 
State('analoguing_type', 'value')])(self.handle_analoguing) self.app.callback( [Output('section_generated_molecules', 'style'), @@ -238,6 +324,26 @@ def handle_add_candidate(self, bt_add_candidate, return ','.join(selected_candidates) + + def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candidates): + comp_id, event_type = self._fetch_event_data() + #logger.info(f'handle_analoguing_candidate({bt_analoguing_candidate}, {analoguing_candidates}): cid={comp_id}, et={event_type}, dash.callback_context.triggered[0]["value"]={ dash.callback_context.triggered[0]["value"]}') + if event_type != 'n_clicks' or dash.callback_context.triggered[0]['value'] == 0: + raise dash.exceptions.PreventUpdate + + selected_candidates = [] + + if analoguing_candidates: + selected_candidates = analoguing_candidates.split(",") + + comp_detail = json.loads(comp_id) + selected_chembl_id = comp_detail['chemblId'] + + if selected_chembl_id not in selected_candidates: + selected_candidates.append(selected_chembl_id) + #logger.info(f'comp_detail={comp_detail}, selected_candidates={selected_candidates}') + return ','.join(selected_candidates) + def _fetch_event_data(self): if not dash.callback_context.triggered: raise dash.exceptions.PreventUpdate @@ -279,6 +385,15 @@ def handle_generation(self, bt_generate, num_requested=n2generate, scaled_radius=scaled_radius, force_unique=True) + elif rd_generation_type == 'EXTRAPOLATE': + self.generated_df = generative_wf.extrapolate_from_cluster(self.cluster_wf.df_embedding, + compound_property=extrap_compound_property, + cluster_id=extrap_cluster_number, + n_compounds_to_transform=extrap_n_compounds, + num_points=n2generate, + step_size=extrap_step_size, + scaled_radius=scaled_radius, + force_unique=False)#True) else: if chemble_ids == None or len(chemble_ids) < 2: raise ValueError('Please select at-least two molecules for Interpolation.') @@ -294,14 +409,61 @@ def handle_generation(self, bt_generate, # Add other useful attributes to be added for rendering self.genreated_df = MolecularStructureDecorator().decorate(self.genreated_df) self.genreated_df = LipinskiRuleOfFiveDecorator().decorate(self.genreated_df) + self.generated_df = self.generated_df[ ~self.generated_df['invalid'] ].reset_index(drop=True).drop(columns=['invalid']) + if len(self.generated_df) == 0: + logger.info("None of the generated smiles yielded valid molecules!!!") + return dash.no_update, dash.no_update + # Note: we are not allowing fingerprint specification to change here because we want to see the results on the same PCA / UMAP as the original figure + # TODO: make this clear in the UI + fps = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform(self.generated_df, smiles_column='SMILES') + df_fp = pd.DataFrame(fps, dtype='float32') + self.generated_df = pd.concat([self.generated_df, df_fp], axis=1) + df_fp=cudf.from_pandas(df_fp) + df_fp['id'] = list(map(str, self.generated_df['id'])) + df_fp['cluster'] = list(map(int, self.generated_df['Generated'])) + n_generated = self.generated_df['Generated'].sum() + if n_generated < len(self.generated_df) / 2: + # Highlight the generated compounds + north_stars = ','.join(list(df_fp[ self.generated_df['Generated'] ]['id'].values_host)) + else: + # Highlight the source compound(s) + north_stars = ','.join(list(df_fp[ ~self.generated_df['Generated'] ]['id'].values_host)) + + # TODO: check if all these lines are necessary! 
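+        # The generated molecules are re-projected through the PCA and UMAP
+        # models already fitted by the cluster workflow (roughly:
+        # fingerprints -> pca.transform -> umap_model.transform -> x, y),
+        # so they land on axes consistent with the main cluster figure.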
+ chunksize=max(10, int(df_fp.shape[0] * 0.1)) + df_embedding = dask_cudf.from_cudf(df_fp, chunksize=chunksize) + df_embedding = df_embedding.reset_index() + cluster_col = df_embedding['cluster'] + df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding) + prop_series['cluster'] = cluster_col + n_molecules, n_obs = df_embedding.compute().shape # needed? + df_embedding = self.cluster_wf.pca.transform(df_embedding) + df_embedding = df_embedding.persist() # TODO: wait after this? + X_train = df_embedding.compute() # needed? + Xt = self.cluster_wf.umap_model.transform(df_embedding) + df_embedding['x'] = Xt[0] + df_embedding['y'] = Xt[1] + + for col in prop_series.keys(): + #logger.info(f'col={col}') + sys.stdout.flush() + df_embedding[col] = prop_series[col]#.compute() + + fig, northstar_cluster = self.create_graph(df_embedding, north_stars=north_stars) + # Create Table header table_headers = [] - columns = self.genreated_df.columns.to_list() - ignore_columns = ['embeddings', 'embeddings_dim'] + columns = [ + col_name + for col_name in self.generated_df.columns.to_list() + if not isinstance(col_name, int) + ] + #columns = self.genreated_df.columns.to_list() + #ignore_columns = ['embeddings', 'embeddings_dim'] for column in columns: - if column in ignore_columns: - continue + #if column in ignore_columns: + # continue table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] @@ -321,8 +483,8 @@ def handle_generation(self, bt_generate, for col_id in range(len(columns)): col_data = self.genreated_df.iat[row_idx, col_id] - if columns[col_id] in ignore_columns: - continue + #if columns[col_id] in ignore_columns: + # continue col_level = 'info' if isinstance(col_data, dict): @@ -343,17 +505,229 @@ def handle_generation(self, bt_generate, } )) - prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) - + #prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) + prop_recs.append(html.Tr(td)) + msg_generated_molecules = '' if invalid_mol_cnt > 0: msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' 
-        return html.Table(prop_recs, style={'width': '100%',
-                                            'border': '1px solid lightgray'}), \
-               show_generated_mol, \
-               msg_generated_molecules, \
-               dash.no_update
+        #return html.Table(prop_recs, style={'width': '100%',
+        #                                    'border': '1px solid lightgray'}), \
+        #       show_generated_mol, \
+        #       msg_generated_molecules, \
+        #       dash.no_update
+        return {'display': 'inline'}, fig, html.Table(prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'}), show_generated_mol, dash.no_update
+
+    @report_ui_error(3)
+    def handle_fitting(
+        self, bt_fit, sl_featurizing_wf,
+        fit_nn_compound_property, fit_nn_train_cluster_number, fit_nn_test_cluster_number,
+        fit_nn_hidden_layer_sizes, fit_nn_activation_fn, fit_nn_final_activation_fn,
+        fit_nn_max_epochs, fit_nn_learning_rate, fit_nn_weight_decay, fit_nn_batch_size
+    ):
+        comp_id, event_type = self._fetch_event_data()
+        #logger.info(f'handle_fitting: comp_id={comp_id}, event_type={event_type}')
+        sys.stdout.flush()
+        if (comp_id != 'bt_fit') or (event_type != 'n_clicks'):
+            return dash.no_update, dash.no_update
+        self.featurizing_wf_cls = sl_featurizing_wf
+        wf_class = locate(self.featurizing_wf_cls)
+        featurizing_wf = wf_class()
+
+        df = featurizing_wf.fit_nn(
+            self.cluster_wf.df_embedding,
+            compound_property=fit_nn_compound_property,
+            cluster_id_train=fit_nn_train_cluster_number,
+            cluster_id_test=fit_nn_test_cluster_number,
+            hidden_layer_sizes=list(map(int, fit_nn_hidden_layer_sizes.split(','))) if fit_nn_hidden_layer_sizes != '' else [],
+            activation_fn=fit_nn_activation_fn,
+            final_activation_fn=fit_nn_final_activation_fn,
+            max_epochs=int(fit_nn_max_epochs),
+            learning_rate=float(fit_nn_learning_rate),
+            weight_decay=float(fit_nn_weight_decay),
+            batch_size=int(fit_nn_batch_size)
+        )
+        #logger.info(df.head())
+        sys.stdout.flush()
+        fig = self.create_plot(df, fit_nn_compound_property)
+        return {'display': 'inline'}, fig
+
+    @report_ui_error(3)
+    def handle_analoguing(
+        self, bt_analoguing, analoguing_mol_id, analoguing_n_analogues,
+        analoguing_threshold, analoguing_type
+    ):
+        comp_id, event_type = self._fetch_event_data()
+        #logger.info(f'handle_analoguing: mol={analoguing_mol_id}, n={analoguing_n_analogues}, th={analoguing_threshold}, type={analoguing_type}')
+        sys.stdout.flush()
+        if (comp_id != 'bt_analoguing') or (event_type != 'n_clicks'):
+            return dash.no_update, dash.no_update
+
+        # Compute fingerprints once for all input database compounds
+        # (already available if the input data has been clustered).
+        if 'canonical_smiles' in self.cluster_wf.df_embedding:
+            smiles_column = 'canonical_smiles'
+        else:
+            smiles_column = 'SMILES'
+        if self.fp_df is None:  # CPU-based workflow, to be deprecated
+            smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].map_partitions(cudf.DataFrame.to_pandas)
+            if 'fp' not in self.cluster_wf.df_embedding.columns:
+                logger.info(f'Computing fingerprints...')
+                _, v = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform(
+                    smiles_df, smiles_column=smiles_column, return_fp=True, raw=True)
+            else:
+                logger.info(f'Fingerprints already available')
+                v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas())
+            self.fp_df = pd.DataFrame({
+                'fp': v,
+                smiles_column: smiles_df[smiles_column],
+                'id': smiles_df['id'],
+            })
+
+        self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist()
+        wait(self.cluster_wf.df_embedding)
+
+        if 'pc' not in self.cluster_wf.df_embedding.columns:
+            # Pre-computing the popcounts for all compounds in the database,
+            # one int64 fingerprint column at a time:
+            t0 = time.time()
+            self.cluster_wf.df_embedding['op_col'] = 0
+            self.cluster_wf.df_embedding['pc'] = 0
+
+            for col in self.cluster_wf.df_embedding.columns:
+                if (type(col) == str) and col.startswith('fp') and (len(col) > 2):
+                    logger.info(f'{col}: {self.cluster_wf.df_embedding[col]}')
+                    self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows(
+                        popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {})
+                    # More complex syntax was not necessary:
+                    #self.cluster_wf.df_embedding['op_col'] = self.cluster_wf.df_embedding.map_partitions(popcll_wrapper_dask, col, 'op_col')
+                    self.cluster_wf.df_embedding['pc'] += self.cluster_wf.df_embedding['op_col']
+            self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist()
+            wait(self.cluster_wf.df_embedding)
+            t1 = time.time()
+            logger.info(f'Time to compute partial popcounts: {t1 - t0}')
+
+        # Prepare the query compound:
+        molregno = self.chem_data.fetch_molregno_by_chemblId([analoguing_mol_id])[0][0]
+        props, selected_molecules = self.chem_data.fetch_props_by_molregno([molregno])
+        query_smiles = selected_molecules[0][props.index('canonical_smiles')]
+        query_fp = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform(
+            pd.DataFrame({'smiles': [query_smiles]}), smiles_column='smiles', return_fp=True, raw=True)[1][0]
+        query_fps = query_fp.ToBitString()
+        query_fp_ints = [int(query_fps[i: i + INTEGER_NBITS], 2) for i in range(0, self.fingerprint_nBits, INTEGER_NBITS)]
+        query_pc = sum(bin(x).count('1') for x in query_fp_ints)

+        # GPU-based workflow for similarity computation:
+        # Tanimoto = popcount(intersection) / ( popcount(query) + popcount(compound) - popcount(intersection) )
+        # Since the fingerprint is stored as a list of int64s in separate columns,
+        # the intersection popcount is accumulated column by column.
+        if 'op_col' in self.cluster_wf.df_embedding:
+            self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['op_col'])
+        if 'n_intersection' in self.cluster_wf.df_embedding:
+            self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['n_intersection'])
+        #self.cluster_wf.df_embedding['op_col'] = 0
+        self.cluster_wf.df_embedding['n_intersection'] = 0
+        t4 = time.time()
+        for i in range(0, self.fingerprint_nBits, INTEGER_NBITS):
+            fp_num = i // INTEGER_NBITS
+            self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows(
+                intersection_wrapper, incols={f'fp{fp_num}': 'fp_int_col'}, outcols={'op_col': int}, kwargs={'query_fp_int': query_fp_ints[fp_num]})
+            #logging.info(f'{i}:\n{self.cluster_wf.df_embedding.head()}')
+            #self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist()
+            #wait(self.cluster_wf.df_embedding)
+            self.cluster_wf.df_embedding['n_intersection'] += self.cluster_wf.df_embedding['op_col']
+
+        self.cluster_wf.df_embedding['n_union'] = self.cluster_wf.df_embedding['pc'] - self.cluster_wf.df_embedding['n_intersection'] + query_pc
+        self.cluster_wf.df_embedding['similarity'] = self.cluster_wf.df_embedding['n_intersection'] / self.cluster_wf.df_embedding['n_union']
+        self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist()
+        wait(self.cluster_wf.df_embedding)
+        t5 = time.time()
+        # CPU reference path (RDKit FingerprintSimilarity), kept to sanity-check
+        # the GPU Tanimoto values and to compare timings:
+        t0 = time.time()
+        
self.fp_df['similarity_cpu'] = self.fp_df['fp'].apply(lambda x: DataStructs.FingerprintSimilarity(query_fp, x)) + + if 'similarity_cpu' in self.cluster_wf.df_embedding: + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.drop(columns=['similarity_cpu']) + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.merge( + dask_cudf.from_cudf( + cudf.from_pandas(self.fp_df[['id', 'similarity_cpu']]), + npartitions = self.cluster_wf.df_embedding.npartitions + ), + on='id', + how='left' + ).reset_index(drop=True) + + t1 = time.time() + logger.info(f'Fingerprint length={self.fingerprint_nBits}: GPU-Method: {t5 - t4}, CPU-Method: {t1 - t0}') + + #self.analoguing_df = self.fp_df[ self.fp_df['similarity_cpu'] >= float(analoguing_threshold) ] + self.analoguing_df = self.cluster_wf.df_embedding[ self.cluster_wf.df_embedding['similarity'] >= float(analoguing_threshold) ] + drop_columns = [ + col + for col in self.analoguing_df.columns + if (type(col) == int) or col.startswith('fp') or (col in ['x', 'y', 'cluster', 'op_col', 'pc', 'n_intersection', 'n_union', 'transformed_smiles']) + ] + self.analoguing_df = self.analoguing_df.drop(columns=drop_columns).compute().to_pandas() # dask_cudf --> cudf --> pandas (CPU!) + if analoguing_type in ['scaffold', 'superstructure']: + if analoguing_type == 'scaffold': + # Only include compounds that have the same murcko scaffold as the query compound + query_scaffold_mol = MolFromSmiles(MurckoScaffoldSmilesFromSmiles(query_smiles)) + else: #analoguing_type == 'superstructure': + # Only include compounds that are superstructures of the query compound + query_scaffold_mol = MolFromSmiles(query_smiles) + self.analoguing_df['mol'] = self.analoguing_df[smiles_column].apply(MolFromSmiles) + self.analoguing_df.dropna(subset=['mol'], inplace=True) + self.analoguing_df = self.analoguing_df[ self.analoguing_df['mol'].apply(lambda x: x.HasSubstructMatch(query_scaffold_mol)) ] + self.analoguing_df.drop(columns=['mol'], inplace=True) + self.analoguing_df = self.analoguing_df.nlargest(int(analoguing_n_analogues), 'similarity') + self.analoguing_df.reset_index(drop=True, inplace=True) + #self.analoguing_df = dask_cudf.from_cudf(self.analoguing_df, npartitions=self.cluster_wf.df_embedding.npartitions) # going back to dask for a reason? + # TODO: we are presuming the IDs are the same but there is no guarantee since we added code to generate dummy IDs based on indices elsewhere. 
+
+        # Needed only for CPU-based workflow
+        #self.analoguing_df = self.analoguing_df.merge(self.cluster_wf.df_embedding, on='id').compute().reset_index(drop=True).to_pandas()
+        # Add other useful attributes for rendering
+        smiles_idx = self.analoguing_df.columns.to_list().index(smiles_column)
+        self.analoguing_df = MolecularStructureDecorator().decorate(self.analoguing_df, smiles_col=smiles_idx)
+        #self.analoguing_df = LipinskiRuleOfFiveDecorator().decorate(self.analoguing_df, smiles_col=smiles_idx)
+        self.analoguing_df = self.analoguing_df.sort_values('similarity', ascending=False)
+        # Create Table header
+        table_headers = []
+        all_columns = self.analoguing_df.columns.to_list()
+        columns_in_table = [
+            col_name
+            for col_name in self.analoguing_df.columns.to_list()
+            if (not isinstance(col_name, int)) and (not col_name.startswith('fp'))
+        ]
+        # TODO: factor this into a separate function: build table from dataframe
+        for column in columns_in_table:
+            table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'}))
+        prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})]
+        for row_idx in range(self.analoguing_df.shape[0]):
+            td = []
+            try:
+                col_pos = all_columns.index('Chemical Structure')
+                col_data = self.analoguing_df.iat[row_idx, col_pos]
+                if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit':
+                    continue
+            except ValueError:
+                pass
+            for col_name in columns_in_table:
+                col_id = all_columns.index(col_name)
+                col_data = self.analoguing_df.iat[row_idx, col_id]
+                col_level = 'info'
+                if isinstance(col_data, dict):
+                    col_value = col_data['value']
+                    if 'level' in col_data:
+                        col_level = col_data['level']
+                else:
+                    col_value = col_data
+                if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'):
+                    td.append(html.Td(html.Img(src=col_value)))
+                else:
+                    # dict.update() returns None (and would mutate the shared level
+                    # style), so build a merged copy of the style instead.
+                    td.append(html.Td(str(col_value), style={**LEVEL_TO_STYLE[col_level], 'maxWidth': '100px', 'wordWrap': 'break-word'}))

+            prop_recs.append(html.Tr(td))
+
+        return {'display': 'inline'}, html.Table(prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'})

     def handle_ckl_selection(self, ckl_candidate_mol_id, rd_generation_type):
         selection_msg = '**Please Select Two Molecules**'
         selection_cnt = 2

         if rd_generation_type == 'SAMPLE':
             selection_msg = '**Please Select One Molecule**'
             selection_cnt = 1
-
+        elif rd_generation_type == 'EXTRAPOLATE':
+            # TO DO: one cluster and one property have to be provided
+            selection_msg = '**Please Select Zero Molecules (specify cluster above, instead)**'
+            selection_cnt = 0
         if ckl_candidate_mol_id and len(ckl_candidate_mol_id) > selection_cnt:
             ckl_candidate_mol_id = ckl_candidate_mol_id[selection_cnt * -1:]

         return ckl_candidate_mol_id, selection_msg

+    def handle_analoguing_ckl_selection(self, ckl_analoguing_mol_id):
+        if ckl_analoguing_mol_id and len(ckl_analoguing_mol_id) > 1:
+            # Allow only one compound to be chosen for analoguing
+            ckl_analoguing_mol_id = ckl_analoguing_mol_id[-1:]
+
+        return ckl_analoguing_mol_id
+
     def handle_construct_candidates(self, north_star):
         if not north_star:
             return []

         options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')]
         return options

+    def handle_construct_candidates2(self, north_star):
+        if not north_star:
+            return []
+
+        options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')]
+        return options
+
     def handle_reset(self, bt_reset, bt_apply_wf,
                      refresh_main_fig, sl_wf):
         comp_id, event_type = self._fetch_event_data()

@@ -402,19 +790,26 @@ def recluster(self, filter_values=None, filter_column=None, reload_data=False):
         return self.cluster_wf.recluster(filter_column, filter_values,
                                          n_clusters=self.n_clusters)

-    def recluster_selection(self,
-                            filter_value=None,
-                            filter_column=None,
-                            gradient_prop=None,
-                            north_stars=None,
-                            reload_data=False,
-                            recluster_data=True,
-                            color_col='cluster'):
+    def recluster_selection(
+        self,
+        filter_value=None,
+        filter_column=None,
+        gradient_prop=None,
+        north_stars=None,
+        reload_data=False,
+        recluster_data=True,
+        color_col='cluster',
+        fingerprint_radius=2,
+        fingerprint_nBits=512
+    ):

         if recluster_data or self.cluster_wf.df_embedding is None:
-            df_embedding = self.recluster(filter_values=filter_value,
-                                          filter_column=filter_column,
-                                          reload_data=reload_data)
+            self.fingerprint_nBits = fingerprint_nBits
+            self.fingerprint_radius = fingerprint_radius
+            df_embedding = self.recluster(
+                filter_values=filter_value,
+                filter_column=filter_column,
+                reload_data=reload_data
+            )
         else:
             df_embedding = self.cluster_wf.df_embedding

@@ -424,6 +819,7 @@ def recluster_selection(self,
                                north_stars=north_stars)

     def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop=None):
+        sys.stdout.flush()
         fig = go.Figure(layout={'colorscale': {}})

         # Filter out relevant columns in this method.
@@ -459,6 +855,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop
             cluster = ldf['cluster']
             customdata = ldf['id']
             grad_prop = ldf[gradient_prop]
+            # Per-point hover labels built from the local cluster/id series
+            textdata = [ f'C-{c}_ID-{cid}' for c, cid in zip(cluster.to_array(), customdata.to_array()) ]

             if self.cluster_wf.is_gpu_enabled():
                 x_data = x_data.to_array()
@@ -484,6 +881,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop
                     'showscale': True,
                     'cmin': cmin,
                     'cmax': cmax,
+                    #'name': customdata,  # 'name' is not a valid marker property; left disabled
                 }
             }))
         else:
@@ -504,12 +902,14 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop

             # Compute size of northstar and normal points
             df_shape = df_size.copy()
-            df_size = (df_size * 18) + DOT_SIZE
+            df_size = (df_size * 2) + DOT_SIZE
             df_shape = df_shape * 2
             x_data = cdf['x']
             y_data = cdf['y']
             cluster = cdf['cluster']
             customdata = cdf['id']
+            textdata = [ f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ]
+            sys.stdout.flush()

             if self.cluster_wf.is_gpu_enabled():
                 x_data = x_data.to_array()
@@ -522,7 +922,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop
             scatter_trace = go.Scattergl({
                 'x': x_data,
                 'y': y_data,
-                'text': cluster,
+                'text': textdata,
+                #'text': cluster,
                 'customdata': customdata,
                 'name': 'Cluster ' + str(cluster_id),
                 'mode': 'markers',
@@ -556,6 +957,37 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop
         del ldf
         return fig, northstar_cluster

+    def create_plot(self, df, compound_property):
+        """
+        Expects df to have x, y, cluster and train_set columns
+        """
+        fig = go.Figure(layout={'colorscale': {}})
+        scatter_trace = go.Scattergl({
+            'x': df['x'],
+            'y': df['y'],
+            'text': [ f'C-{c}_ID-{cid}' for c, cid in zip(df['cluster'], df['id']) ],
+            'customdata': df['id'],
+            'mode': 'markers',
+            'marker': {
+                'size': DOT_SIZE,
+                'symbol': df['train_set'].apply(lambda x: 0 if x else 1),
+                'color': df['cluster'].apply(lambda x: self.cluster_colors[x % len(self.cluster_colors)]),
+            },
+        })
+        fig.add_trace(scatter_trace)
+        # Change the title to indicate type of H/W in use
+        f_color = 
'green' if self.cluster_wf.is_gpu_enabled() else 'blue' + fig.update_layout( + showlegend=True, clickmode='event', height=main_fig_height, + title=f'{PROP_DISP_NAME[compound_property]} Prediction', dragmode='select', + title_font_color=f_color, + annotations=[ + dict(x=0.5, y=-0.07, showarrow=False, text='Actual', + xref="paper", yref="paper"), + dict(x=-0.05, y=0.5, showarrow=False, text="Predicted", + textangle=-90, xref="paper", yref="paper")]) + return fig + def start(self, host=None, port=5000): return self.app.run_server( debug=False, use_reloader=False, host=host, port=port) @@ -629,7 +1061,7 @@ def construct_molecule_detail(self, selected_points, display_properties, td.append(html.Td(selected_chembl_id)) else: td.append(html.Td( - dbc.Button('Add as MoI', + dbc.Button('Highlight', id={'role': 'bt_star_candidate', 'chemblId': selected_chembl_id, 'molregno': str(molregno) @@ -638,7 +1070,7 @@ def construct_molecule_detail(self, selected_points, display_properties, )) td.append(html.Td( - dbc.Button('Add for Interpolation', + dbc.Button('Add', id={'role': 'bt_add_candidate', 'chemblId': selected_chembl_id, 'molregno': str(molregno) @@ -647,6 +1079,16 @@ def construct_molecule_detail(self, selected_points, display_properties, n_clicks=0) )) + td.append(html.Td( + dbc.Button('Analogue', + id={'role': 'bt_analoguing_candidate', + 'chemblId': selected_chembl_id, + 'molregno': str(molregno), + #'smiles': smiles + }, + n_clicks=0) + )) + prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) return html.Table(prop_recs, style={'width': '100%', 'border': '1px solid lightgray'}), all_props @@ -672,6 +1114,18 @@ def constuct_layout(self): className='three columns'), ], style={'marginLeft': 0, 'marginBottom': 18, }), + html.Div(className='row', children=[ + dcc.Markdown("Fingerprint Radius", style={'marginTop': 12,}), + dcc.Input(id='fingerprint_radius', value=2), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + + html.Div(className='row', children=[ + dcc.Markdown("Fingerprint Size", style={'marginTop': 12,}), + dcc.Input(id='fingerprint_nBits', value=512), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Tabs([ dcc.Tab(label='Cluster Molecules', children=[ dcc.Markdown("""**Select Workflow**""", style={'marginTop': 18, }), @@ -736,6 +1190,7 @@ def constuct_layout(self): id='rd_generation_type', options=[ {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, + {'label': 'Fit cluster to property and extrapolate', 'value': 'EXTRAPOLATE'}, {'label': 'Sample around one molecule', 'value': 'SAMPLE'}, ], value='INTERPOLATE', @@ -745,11 +1200,37 @@ def constuct_layout(self): ), html.Div(className='row', children=[ - dcc.Markdown("Number of molecules to generate", + dcc.Markdown("Number to be generated from each compound", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), dcc.Input(id='n2generate', value=10), ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ + html.Label([ + "Select molecular property for fitting and extrapolation", + dcc.Dropdown(id='extrap_compound_property', multi=False, clearable=False, + options=[{"label": PROP_DISP_NAME[p], "value": p} for p in IMP_PROPS], + value=IMP_PROPS[0]), + ], style={'marginTop': 18, 'marginLeft': 18})], + ), + + html.Div(className='row', children=[ + dcc.Markdown("Cluster number for fitting property and extrapolation", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_cluster_number', value=0), + ], style={'marginLeft': 0}), + + 
html.Div(className='row', children=[ + dcc.Markdown("Step-size for extrapolation", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_step_size', value=0.1), + ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ + dcc.Markdown("Number of compounds to extrapolate", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), + dcc.Input(id='extrap_n_compounds', value=10), + ], style={'marginLeft': 0}), + + html.Div(className='row', children=[ dcc.Markdown("Scaled sampling radius (int, start with 1)", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), @@ -771,6 +1252,124 @@ def constuct_layout(self): dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0), ], style={'marginLeft': 0}), ]), + + dcc.Tab(label='Predict Properties', children=[ + + dcc.Markdown("""**Select Featurizing Model**""", style={'marginTop': 18,}), + html.Div(children=[ + dcc.Dropdown(id='sl_featurizing_wf', multi=False, + options=[{'label': 'CDDD Model', + 'value': 'cuchem.wf.generative.Cddd'}, + {'label': 'MolBART Model', + 'value': 'cuchem.wf.generative.MolBART'}, + {'label': 'MegatronMolBART Model', + 'value': 'cuchem.wf.generative.MegatronMolBART'}, + ], + value=self.generative_wf_cls, + clearable=False), + ]), + html.Div(className='row', children=[ + html.Label([ + "Select molecular property for fitting and prediction", + dcc.Dropdown(id='fit_nn_compound_property', multi=False, clearable=False, + options=[{"label": PROP_DISP_NAME[p], "value": p} for p in IMP_PROPS], + value=IMP_PROPS[0]), + ], style={'marginTop': 18, 'marginLeft': 18})], + ), + html.Div(className='row', children=[ + dcc.Markdown("Train cluster", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_train_cluster_number', value=0), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Test cluster", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_test_cluster_number', value=1), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Markdown(children="""**Neural Network Parameters**""", + id="nn_params_msg", + style={'marginTop': 18} + ), + html.Div(className='row', children=[ + dcc.Markdown("Hidden layer sizes", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_hidden_layer_sizes', value=''), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Activation Function", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_activation_fn', value='LeakyReLU'), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Final Activation Function", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_final_activation_fn', value='LeakyReLU'), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Number of training epochs", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_max_epochs', value=10), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Learning Rate", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_learning_rate', value=0.001), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Weight Decay (Adam)", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_weight_decay', value=0.0001), + ], style={'marginLeft': 0, 'marginTop': '6px'} + ), + html.Div(className='row', children=[ + dcc.Markdown("Batch size", style={'marginTop': 12,}), + dcc.Input(id='fit_nn_batch_size', 
value=1),
+            ], style={'marginLeft': 0, 'marginTop': '6px'}
+            ),
+            html.Div(className='row', children=[
+                dbc.Button('Fit', id='bt_fit', n_clicks=0, style={'marginRight': 12}),
+            ], style={'marginLeft': 0}
+            ),
+        ]),
+
+        dcc.Tab(label='Find Analogues', children=[
+            dcc.Markdown(children="""Choose a compound""",
+                         id="analoguing_msg",
+                         style={'marginTop': 18}
+                         ),
+            dcc.Checklist(
+                id='ckl_analoguing_mol_id',
+                options=[],
+                value=[],
+                inputStyle={'display': 'inline-block', 'marginLeft': 6, 'marginRight': 6},
+                labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6}
+            ),
+            html.Div(className='row', children=[
+                dcc.Markdown("Maximum Number of Analogues", style={'marginTop': 12,}),
+                dcc.Input(id='analoguing_n_analogues', value=10),
+            ], style={'marginLeft': 0, 'marginTop': '6px'}
+            ),
+            html.Div(className='row', children=[
+                dcc.Markdown("Similarity Threshold", style={'marginTop': 12,}),
+                dcc.Input(id='analoguing_threshold', value=0.33),
+            ], style={'marginLeft': 0, 'marginTop': '6px'}
+            ),
+            html.Div(children=[
+                dcc.Dropdown(id='analoguing_type', multi=False,
+                             options=[{'label': 'Similar compounds',
+                                       'value': 'similar'},
+                                      {'label': 'Compounds with the same scaffold',
+                                       'value': 'scaffold'},
+                                      {'label': 'Compounds that are superstructures',
+                                       'value': 'superstructure'},
+                                      ],
+                             value='similar',
+                             clearable=False),
+            ]),
+            html.Div(className='row', children=[
+                dbc.Button('Search', id='bt_analoguing', n_clicks=0, style={'marginRight': 12}),
+            ], style={'marginLeft': 0}),
+        ])
+
     ]),

     html.Div(className='row', children=[
@@ -783,51 +1382,94 @@ def constuct_layout(self):
             ], className='three columns',
                style={'marginLeft': 18, 'marginTop': 90, 'verticalAlign': 'text-top', }),
         ]),

-        html.Div(className='row', children=[
-            html.Div(id='section_generated_molecules', children=[
-                html.Div(className='row', children=[
-                    html.A('Export to SDF',
-                           id='download-link',
-                           download="rawdata.sdf",
-                           href="/cheminfo/downloadSDF",
-                           target="_blank",
-                           n_clicks=0,
-                           style={'fontSize': '150%'}
-                           ),
-                    html.Div(id='msg_generated_molecules', children=[],
-                             style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}),
-                ], style={'marginLeft': 0, 'marginBottom': 18, }),
-                html.Div(id='table_generated_molecules', children=[], style={'width': '100%'})
-            ], style={'display': 'none', 'width': '100%'}),
-
-        html.Div(id='section_selected_molecules', children=[
-            html.Div(className='row', children=[
-                html.Div(id='section_display_properties', children=[
-                    html.Label([
-                        "Select Molecular Properties",
-                        dcc.Dropdown(id='sl_mol_props', multi=True,
-                                     options=[
-                                         {'label': 'alogp', 'value': 'alogp'}],
-                                     value=['alogp']),
-                    ])],
-                    className='nine columns'),
-                html.Div(children=[
-                    dbc.Button("<", id="bt_page_prev",
-                               style={"height": "25px"}),
-                    html.Span(children=1, id='current_page',
-                              style={"paddingLeft": "6px"}),
-                    html.Span(children=' of 1', id='total_page',
-                              style={"paddingRight": "6px"}),
-                    dbc.Button(">", id="bt_page_next",
-                               style={"height": "25px"})
-                ],
-                    className='three columns',
-                    style={'verticalAlign': 'text-bottom', 'text-align': 'right'}
-                ),
-            ], style={'margin': 12}),
-            html.Div(id='tb_selected_molecules', children=[], style={'width': '100%'})
-        ], style={'display': 'none', 'width': '100%'}),
-    ], style={'margin': 12}),
+        html.Div(
+            id='section_generated_molecules',
+            children=[
+                html.A(
+                    'Export to SDF',
+                    id='download-link',
+                    download="rawdata.sdf",
+                    href="/cheminfo/downloadSDF",
+                    target="_blank",
+                    n_clicks=0,
+                    style={'marginLeft': 10, 'fontSize': '150%'}
+                ), 
+ html.Div(id='table_generated_molecules', children=[]), + ], + style={'display': 'none'} + ), + #html.Div(className='row', children=[ + # html.Div(id='section_generated_molecules', children=[ + # html.Div(className='row', children=[ + # html.A('Export to SDF', + # id='download-link', + # download="rawdata.sdf", + # href="/cheminfo/downloadSDF", + # target="_blank", + # n_clicks=0, + # style={'fontSize': '150%'} + # ), + # html.Div(id='msg_generated_molecules', children=[], + # style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}), + # ], style={'marginLeft': 0, 'marginBottom': 18, }), + # html.Div(id='table_generated_molecules', children=[], style={'width': '100%'}) + # ], style={'display': 'none', 'width': '100%'}), + + html.Div(id='section_generated_molecules_clustered', children=[ + dcc.Graph(id='gen_figure', figure=fig, + #className='nine columns', + #style={'verticalAlign': 'text-top'} + ), + ], style={'display': 'none'}), + + html.Div(id='section_fitting', children=[ + dcc.Graph(id='fitting_figure', figure=fig, + #className='nine columns', + #style={'verticalAlign': 'text-top'} + ), + ], style={'display': 'none'}), + + html.Div(id='section_selected_molecules', children=[ + html.Div(className='row', children=[ + html.Div(id='section_display_properties', children=[ + html.Label([ + "Select Molecular Properties", + dcc.Dropdown(id='sl_mol_props', multi=True, + options=[ + {'label': 'alogp', 'value': 'alogp'}], + value=['alogp']), + ])], + className='nine columns'), + html.Div(children=[ + dbc.Button("<", id="bt_page_prev", + style={"height": "25px"}), + html.Span(children=1, id='current_page', + style={"paddingLeft": "6px"}), + html.Span(children=' of 1', id='total_page', + style={"paddingRight": "6px"}), + dbc.Button(">", id="bt_page_next", + style={"height": "25px"}) + ], + className='three columns', + style={'verticalAlign': 'text-bottom', 'text-align': 'right'} + ), + ], style={'margin': 12}), + + html.Div( + id='tb_selected_molecules', + children=[], + style={'width': '100%'} + ) + ], style={'display': 'none', 'width': '100%'}), + #], style={'margin': 12}), + + html.Div(id='section_analoguing', children=[ + html.Div(children=[ + html.Div(id='tb_analoguing', children=[], + style={'verticalAlign': 'text-top'} + ), + ]) + ], style={'display': 'none'}), html.Div(id='refresh_main_fig', style={'display': 'none'}), html.Div(id='northstar_cluster', style={'display': 'none'}), @@ -838,6 +1480,7 @@ def constuct_layout(self): html.Div(id='genration_candidates', style={'display': 'none'}), html.Div(id='refresh_moi_prop_table', style={'display': 'none'}), html.Div(id='interpolation_error', style={'display': 'none'}), + html.Div(id='analoguing_candidates', style={'display': 'none'}), # Not displayed but used to keep track of compounds added to checklist of compounds to be analogued html.Div(className='row', children=[ dbc.Modal([ @@ -984,9 +1627,12 @@ def handle_mark_north_star(self, bt_north_star_click, north_star): return ','.join(selected_north_star) @report_ui_error(4) - def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_clicks, - sl_prop_gradient, sl_nclusters, refresh_main_fig, - selected_clusters, selected_points, north_star, refresh_moi_prop_table): + def handle_re_cluster( + self, bt_cluster_clicks, bt_point_clicks, bt_north_star_clicks, + sl_prop_gradient, sl_nclusters, refresh_main_fig, + fingerprint_radius, fingerprint_nBits, + selected_clusters, selected_points, north_star, refresh_moi_prop_table + ): comp_id, event_type = 
self._fetch_event_data() if comp_id == 'sl_nclusters': @@ -996,6 +1642,9 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl raise dash.exceptions.PreventUpdate + if comp_id in ['fingerprint_radius', 'fingerprint_nBits']: + raise dash.exceptions.PreventUpdate + filter_values = None filter_column = None reload_data = False @@ -1048,6 +1697,9 @@ def handle_re_cluster(self, bt_cluster_clicks, bt_point_clicks, bt_north_star_cl north_stars=moi_molregno, color_col='cluster', reload_data=reload_data, - recluster_data=recluster_data) + recluster_data=recluster_data, + fingerprint_radius=int(fingerprint_radius), + fingerprint_nBits=int(fingerprint_nBits) + ) return figure, ','.join(northstar_cluster), _refresh_moi_prop_table, dash.no_update diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index e13f09a5..a57a989b 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -13,6 +13,23 @@ from cuchemcommon.utils.singleton import Singleton from cuchemcommon.workflow import BaseGenerativeWorkflow +# Check if all these are needed: +from cuchemcommon.fingerprint import MorganFingerprint +import torch +import torch.nn +from torch.utils.data import Dataset, DataLoader +import cupy as cp +import pickle +from pathlib import Path +import numpy as np +from functools import partial +from rdkit import Chem +from rdkit.Chem import MolFromSmiles, RDKFingerprint +from rdkit.DataStructs import FingerprintSimilarity +from cuml import Lasso, Ridge #LinearRegression +from cuml.metrics import mean_squared_error +from math import sqrt + logger = logging.getLogger(__name__) @@ -100,3 +117,420 @@ def interpolate_smiles(self, generated_df.iat[0, 1] = False generated_df.iat[-1, 1] = False return generated_df + + + def extrapolate_from_cluster(self, + compounds_df, + compound_property: str, + cluster_id: int = 0, + n_compounds_to_transform=10, + num_points: int = 10, + step_size: float = 0.01, + force_unique = False, + scaled_radius: int = 1): + """ + The embedding vector is calculated for the specified cluster_id and applied over it. + TO DO: We should have a table of direction vectors in embedded space listed, just like the list of compound IDs. + The user should choose one to be applied to the selected compounds, or to a cluster number. + """ + smiles_list = None + radius = self._compute_radius(scaled_radius) + # TO DO: User must be able to extrapolate directly from smiles in the table; + # these may themselves be generated compounds without any chemblid. 
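+        # Select the requested cluster, locate its SMILES column, and delegate
+        # the actual latent-space walk to extrapolate_from_smiles() below.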
+        logger.info(f'cluster_id={cluster_id}, compound_property={compound_property}, compounds_df: {len(compounds_df)}, {type(compounds_df)}')
+        logger.info(compounds_df.head())
+        logger.info(f'{list(compounds_df.columns)}, {list(compounds_df.dtypes)}')
+        df_cluster = compounds_df[ compounds_df['cluster'] == int(cluster_id) ].dropna().reset_index(drop=True).compute()
+        logger.info(f'df_cluster: {len(df_cluster)}\n{df_cluster.head()}')
+        if 'transformed_smiles' in df_cluster:
+            smiles_col = 'transformed_smiles'
+        elif 'SMILES' in df_cluster:
+            smiles_col = 'SMILES'
+        elif 'smiles' in df_cluster:
+            smiles_col = 'smiles'
+        else:
+            logger.info(list(df_cluster.columns))
+            logger.info(df_cluster.head())
+            raise ValueError('No SMILES column in cluster dataframe')
+        smiles_list = df_cluster[smiles_col].to_array()
+        return self.extrapolate_from_smiles(smiles_list,
+                                            compound_property_vals=df_cluster[compound_property].to_gpu_array(), #to_array(), #[:n_compounds_to_transform].to_array(),
+                                            num_points=num_points,
+                                            n_compounds_to_transform=n_compounds_to_transform,
+                                            step_size=step_size,
+                                            scaled_radius=radius,
+                                            force_unique=force_unique,
+                                            id_list=df_cluster['id'].to_array())
+
+    def _get_embedding_direction(self,
+                                 embedding_list,
+                                 compound_property_vals,
+                                 ):
+        """
+        Get the embedding of all compounds in the specified cluster.
+        Then perform a linear regression against the compound_property to find the direction in
+        embedded space along which the compound_property tends to increase.
+        Using the minimum and maximum values of the compound_property in the cluster to define the range,
+        compute the step size along the direction that is expected to increase the compound_property value by step_percentage.
+        """

+        logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)}, prop: {min(compound_property_vals)} - {max(compound_property_vals)}')
+        n_data = compound_property_vals.shape[0]
+        n_dimensions = embedding_list[0].shape[0]
+        try:
+            reg = Lasso()#alpha=1.0/n_dimensions)#, tol=1.0/n_dimensions)
+            #reg = Ridge()#alpha=1.0/n_dimensions, solver='cd') # default is 'eig'
+            reg = reg.fit(embedding_list, compound_property_vals)
+        except Exception as e:
+            logger.info(f'Regression encountered {e}, retrying with Lasso')
+            reg = Lasso()#alpha=1.0/n_dimensions)
+            reg = reg.fit(embedding_list, compound_property_vals)
+        n_zero_coefs = len([x for x in reg.coef_ if x == 0.0])
+        nonzero_coef_indices = [i for i, x in enumerate(reg.coef_) if x != 0.0]
+        logger.info(f'coef: {n_zero_coefs} / {len(reg.coef_)} coefficients are zero (nonzero positions lie between {min(nonzero_coef_indices)} and {max(nonzero_coef_indices)});'\
+                    f' range: {reg.coef_.argmin()}: {min(reg.coef_)} to {reg.coef_.argmax()}: {max(reg.coef_)}')
+
+        y_pred = reg.predict(embedding_list)
+        rmse = sqrt(mean_squared_error(compound_property_vals, y_pred.astype('float64')))
+        pearson_rho = cp.corrcoef(compound_property_vals, y_pred)
+        logger.info(f'_get_embedding_direction: n={len(compound_property_vals)}, rho={pearson_rho}, rmse={rmse}') #:.2f}')
+        emb_std = np.std(embedding_list, axis=0)
+        logger.info(f'embedding_list.std: {emb_std}')
+
+        emb_max = embedding_list[ np.argmax(compound_property_vals) ]
+        emb_min = embedding_list[ np.argmin(compound_property_vals) ]
+        diff_size = np.linalg.norm(emb_max - emb_min) / sqrt(n_dimensions)
+        # TODO: project on to embedding direction!!!
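+        # One possible (untested) sketch for the projection TODO above: span the
+        # property range along the fitted direction itself, e.g.
+        #   unit_dir = reg.coef_ / np.linalg.norm(reg.coef_)
+        #   proj = embedding_list @ unit_dir
+        #   diff_size = float(proj.max() - proj.min()) / sqrt(n_dimensions)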
+ logger.info(f'compound_property_vals: [{np.argmin(compound_property_vals)}]={np.amin(compound_property_vals)}, [{np.argmax(compound_property_vals)}]={np.amax(compound_property_vals)}, diff_size={diff_size}') + return reg.coef_, emb_std, diff_size + + + def extrapolate_from_smiles(self, + smiles_list, + compound_property_vals, + num_points: int, + step_size: float, + scaled_radius=None, + force_unique=False, + n_compounds_to_transform=10, + id_list=[], + debug=False): + """ + Given a list of smiles strings, convert each to its embedding. + Then taken num_points steps in the specified direction (in the embedded space) of size step_size. + Convert these points on the embedded space back to smiles strings and return as a dataframe. + Modify duplicates if force_unique is True by adding a jitter of magnitude radius to the embedding. + """ + # TODO: generated compounds are the same no matter what the step-size is, check code!!!! + # TODO: generated compounds are yielding different Tanimotos even though their are identical. Bug or jitter??? + step_size = float(step_size) + n_compounds_to_transform = int(n_compounds_to_transform) + if len(id_list) == 0: + id_list = list(map(str, range(len(smiles_list)))) + logger.info(f'molbart: extrapolate_from_smiles: {len(smiles_list)} smiles ({type(smiles_list)}), {num_points} extrapolations each with step_size {step_size}') + data = pd.DataFrame({'transformed_smiles': smiles_list}) + logger.info(data.head()) + #pad_length = max(map(len, smiles_list)) + 2 # add 2 for start / stop + # TODO: check reversibility / recovery + full_mask = None + emb_shape = None + n_recovered = 0 + avg_tani = 0 + embeddings = [] + for i, smiles in enumerate(smiles_list): + spec = generativesampler_pb2.GenerativeSpec( + model=generativesampler_pb2.GenerativeModel.MegaMolBART, + smiles=smiles, + ) + result = self.stub.SmilesToEmbedding(spec) + emb = result.embedding + mask = result.pad_mask + emb_shape = result.dim + if debug: + spec = generativesampler_pb2.EmbeddingList( + embedding=emb, + dim=emb_shape, + pad_mask=mask + ) + generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles + if len(generated_mols) > 0: + n_recovered += 1 + tani = FingerprintSimilarity(RDKFingerprint(MolFromSmiles(smiles)), RDKFingerprint(MolFromSmiles(generated_mols[0]))) + logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{mask.shape} --> {generated_mols} (tani={tani:.2f})') + avg_tani += tani + logger.info(f'emb: {type(emb)}, dim={emb_shape}, mask={len(mask)}, emb={len(emb)}') + embeddings.append(torch.tensor(emb)) #.detach().reshape(-1)) #torch tensor + if full_mask is None: + logger.info(f'First mask = {mask}') + full_mask = mask + emb_shape = emb_shape # n_tokens x 1 x 256 + else: + full_mask = [a and b for a, b in zip(full_mask, mask)] # not used any more + if debug: + logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if n_recovered > 0 else 0}') + + embeddings = torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True, padding_value=PAD_TOKEN) # n_smiles x embedding_length + n_embedding_tokens = int(embeddings.shape[1] / (emb_shape[1] * emb_shape[2])) + emb_shape = [n_embedding_tokens, emb_shape[1], emb_shape[2]] + embeddings = cp.asarray(embeddings) + full_mask = [False] * n_embedding_tokens + logger.info(f'emb type: {type(embeddings)} of {type(embeddings[0])}') + logger.info(f'embeddings.shape:{embeddings.shape}, emb_shape={emb_shape}, 
embeddings[0]={embeddings[0]}') + + # Use the entire cluster to infer the direction: + direction, emb_std, diff_size = self._get_embedding_direction(embeddings, compound_property_vals) + if diff_size == 0.0: + logger.info(f'Increasing diff_size from 0.0 to 1e-6') + diff_size = 1e-6 + + # But apply the transform to no more than n_compounds_to_transform, chosen at random + if n_compounds_to_transform < len(smiles_list): + indices = np.random.choice(list(range(len(smiles_list))), size=n_compounds_to_transform, replace=False) + smiles_list = [smiles_list[i] for i in indices] + embeddings = cp.asarray([embeddings[i,:] for i in indices]) + id_list = [id_list[i] for i in indices] + + result_df_list = [ pd.DataFrame({'SMILES': smiles_list, 'Generated': False, 'id': id_list}) ] + logger.info(f'direction: {type(direction)}, shape={direction.shape}, {direction}\n, embeddings: {type(embeddings)}, shape: {embeddings.shape}, embeddings[0]={embeddings[0]}') + + for step_num in range(1, 1 + num_points): + #noise = cp.random.normal(loc=0.0, scale=emb_std, size=emb_std.shape) + #logger.info(f'noise: {type(noise)}, {noise.shape}; dir: {type(direction)}, {direction.shape}') + direction_sampled = cp.random.normal(loc=direction, scale=emb_std, size=emb_std.shape) #direction + noise + logger.info(f'step ({type(step_num)} * {type(diff_size)} * {type(step_size)} * {type(direction_sampled)}') + step = float(step_num * diff_size * step_size) * direction_sampled + logger.info(step) + extrap_embeddings = embeddings + step # TODO: print and check output + logger.info(f'step ({step_num} * {diff_size} * {step_size} * direction_sampled): {type(step)}, {step.shape}, {step}\n:extrap_embeddings: {type(extrap_embeddings)}, {extrap_embeddings.shape}, extrap_embeddings[0]={extrap_embeddings[0]}') + smiles_gen_list = [] + ids_interp_list = [] + for i in range(len(extrap_embeddings)): + #diff = extrap_embeddings[i] - embeddings[i] + #logger.info(f'{i}: diff: {diff.argmin()}: {min(diff)} to {diff.argmax()}: {max(diff)}') + extrap_embedding = list(extrap_embeddings[i,:]) + logger.info(f'embedding: {type(extrap_embedding)}, {len(extrap_embeddings)};'\ + f' dim: {type(emb_shape)}, {len(emb_shape)}; pad_mask={type(full_mask)}, {len(full_mask)}') + spec = generativesampler_pb2.EmbeddingList( + embedding=extrap_embedding, + dim=emb_shape, + pad_mask=full_mask + ) + smiles_gen = self.stub.EmbeddingToSmiles(spec).generatedSmiles[0] + logger.info(f'{i}: {smiles_gen}') + smiles_gen_list.append(smiles_gen) + ids_interp_list.append(f'{id_list[i]}-s{step_num}') + extrap_df = pd.DataFrame({ + 'SMILES': smiles_gen_list, + 'Generated': True, + 'id': ids_interp_list + }) + logger.info(extrap_df.head()) + if force_unique: + inv_transform_funct = partial(self.inverse_transform, + mem_pad_mask=full_mask) + extrap_df = self.compute_unique_smiles(extrap_df, + smiles_gen, + inv_transform_funct, + radius=radius) + logger.info(f'step_num={step_num} yielded {len(extrap_df)} compounds:\n{extrap_df.head()}') + result_df_list.append(extrap_df) + results_df = pd.concat(result_df_list, ignore_index=True) + results_df['id'] = results_df['id'].apply(str) + results_df.sort_values('id', inplace=True) + results_df.reset_index(drop=True, inplace=True) + return results_df + + def fit_nn( + self, + compounds_df, + compound_property, + cluster_id_train, + cluster_id_test, + hidden_layer_sizes, + activation_fn, + final_activation_fn, + max_epochs, + batch_size=32, + learning_rate=0.001, + weight_decay=0.0001, + debug=False, #82 / 88 compounds yielded something after 
embedding, with avg tani = 0.8287583649661866 + #scaled_radius=None + ): + """ + Convert compound SMILES to embeddings, then train a neural network with n_layers hidden layers with the specified activation function (activation_fn) + to predict the specified compound_property of cluster_id_train. Evaluate the model on cluster_id_test. Return actual and predicted values for both + the train and test set. + """ + logger.info(f'cluster_id_train={cluster_id_train}, cluster_id_test={cluster_id_test}, compound_property={compound_property}, compounds_df: {len(compounds_df)}, {type(compounds_df)}') + df_train = compounds_df[ compounds_df['cluster'] == int(cluster_id_train) ].dropna().reset_index(drop=True).compute() + df_test = compounds_df[ compounds_df['cluster'] == int(cluster_id_test) ].dropna().reset_index(drop=True).compute() + n_train = len(df_train) + n_test = len(df_test) + + logger.info(f'df_train: {len(df_train)}\n{df_train.head()}') + logger.info(f"type(df_train['transformed_smiles'])={type(df_train['transformed_smiles'])}") + + smiles_list = np.concatenate((df_train['transformed_smiles'].to_array(), df_test['transformed_smiles'].to_array()), axis=0) + logger.info(f'smiles_list: {smiles_list.shape}') + pad_length = max(map(len, smiles_list)) + 2 # add 2 for start / stop + embeddings = [] + #full_mask = None + emb_shape = None + n_recovered = 0 + avg_tani = 0 + #radius = self._compute_radius(scaled_radius) + + for i, smiles in enumerate(smiles_list): + spec = generativesampler_pb2.GenerativeSpec( + model=generativesampler_pb2.GenerativeModel.MegaMolBART, + smiles=smiles, + #radius=radius + ) + result = self.stub.SmilesToEmbedding(spec) + emb = result.embedding + mask = result.pad_mask + dim = result.dim + logger.info(f'{i}: smiles={smiles}, emd: {len(emb)}, {emb[:5]}; dim={dim}, mask: {len(mask)}') + emb_shape = result.dim #emb[:2] + #emb = emb[2:] + + if debug: + spec = generativesampler_pb2.EmbeddingList( + embedding=emb, + dim=emb_shape, + pad_mask=mask + ) + generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles + #generated_mols = self.inverse_transform([emb.reshape(emb_shape)], k=1, mem_pad_mask=mask.bool().cuda()) + if len(generated_mols) > 0: + m = MolFromSmiles(generated_mols[0]) + if m is not None: + n_recovered += 1 + tani = FingerprintSimilarity(RDKFingerprint(MolFromSmiles(smiles)), RDKFingerprint(m)) + logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{len(mask)} --> {generated_mols} (tani={tani:.2f})') + avg_tani += tani + embeddings.append(torch.tensor(emb, device=self.device)) #emb.detach().reshape(-1)) #torch tensor + #if full_mask is None: + # full_mask = mask + # emb_shape = emb.shape + #else: + # full_mask &= mask + if debug: + logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if n_recovered > 0 else 0}') + + #full_mask = full_mask.bool().cuda() + embeddings = torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True, padding_value=PAD_TOKEN) + embeddings_train = embeddings[:n_train,:] + embeddings_test = embeddings[n_train:,:] + logger.info(f'emb train: {type(embeddings_train)} of {type(embeddings_train[0])}, {embeddings_train.shape}') + compound_property_vals_train = torch.tensor(df_train[compound_property], device=self.device, dtype=torch.float32)#.to_gpu_array() # need to move to GPU array?? 
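+        # NOTE: self.device is assumed to be defined elsewhere on this class;
+        # it does not appear to be set in any of the hunks shown in this patch.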
+        compound_property_vals_test = torch.tensor(df_test[compound_property], device=self.device, dtype=torch.float32)#.to_gpu_array() # need to move to GPU array??
+        logger.info(f'type(df_train[{compound_property}])={type(df_train[compound_property])}, type(compound_property_vals_train)={type(compound_property_vals_train)}')
+        train_pred, test_pred = self._build_and_train_nn(
+            embeddings_train,
+            compound_property_vals_train,
+            embeddings_test,
+            compound_property_vals_test,
+            hidden_layer_sizes = hidden_layer_sizes,
+            activation_fn=activation_fn,
+            final_activation_fn=final_activation_fn,
+            max_epochs=max_epochs,
+            learning_rate=learning_rate,
+            weight_decay=weight_decay,
+            batch_size=batch_size
+        )
+        df = pd.DataFrame({
+            'x': torch.cat((compound_property_vals_train, compound_property_vals_test), axis=0).to('cpu').numpy(),
+            'y': torch.cat((train_pred.detach(), test_pred.detach()), axis=0).to('cpu').flatten().numpy(),
+            'cluster': np.concatenate((df_train['cluster'].to_array(), df_test['cluster'].to_array()), axis=0),
+            'id': np.concatenate((df_train['id'].to_array(), df_test['id'].to_array()), axis=0),
+            'train_set': [True] * n_train + [False] * n_test
+        })
+        return df
+
+    def _build_and_train_nn(self,
+                            embedding_list_train,
+                            compound_property_vals_train,
+                            embedding_list_test,
+                            compound_property_vals_test,
+                            hidden_layer_sizes = [],
+                            activation_fn='LeakyReLU',
+                            final_activation_fn='LeakyReLU',
+                            max_epochs=10,
+                            batch_size=32,
+                            learning_rate=0.001,
+                            weight_decay=0.0001
+                            ):
+        """
+        Construct a neural network with the specified number of layers, using the specified activation function.
+        Then train it on the training set and evaluate on the test set. Return results.
+        """
+
+        logger.info(f'_build_and_train_nn: emb_train:{embedding_list_train.shape}, {type(embedding_list_train)}, embedding_list_train[0]:{len(embedding_list_train[0])},'\
+                    f' prop:{compound_property_vals_train.shape}, {type(compound_property_vals_train)},'\
+                    f' prop_train: {min(compound_property_vals_train)} - {max(compound_property_vals_train)}')
+        n_data_train = compound_property_vals_train.shape[0]
+        n_dimensions = embedding_list_train[0].shape[0]
+        comp_net = CompoundNet(n_dimensions, hidden_layer_sizes, activation_fn, final_activation_fn).to(self.device)
+        logger.info(comp_net)
+        loss_fn = torch.nn.SmoothL1Loss()
+        opt = torch.optim.Adam(comp_net.parameters(), lr=learning_rate, weight_decay=weight_decay)
+        train_set = CompoundData(embedding_list_train, compound_property_vals_train)
+        loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
+
+        for epoch in range(max_epochs):
+            total_loss = 0.0
+            n_batches = 0
+            for compounds, properties in loader:
+                opt.zero_grad()
+                predictions = comp_net(compounds)
+                loss = loss_fn(predictions, properties)
+                loss.backward()
+                opt.step()
+                total_loss += loss.item()
+                n_batches += 1
+            # total_loss is reset every epoch, so average over this epoch's batches only
+            logger.info(f'epoch {epoch+1}, {n_batches} batches: mean batch loss = {total_loss / n_batches}')
+
+        comp_net.eval()
+        train_pred = comp_net(embedding_list_train)
+        test_pred = comp_net(embedding_list_test)
+
+        return train_pred, test_pred
+
+
+class CompoundData(Dataset):
+
+    def __init__(self, compounds, properties):
+        self.compounds = compounds
+        self.properties = properties
+
+    def __len__(self):
+        return len(self.compounds)
+
+    def __getitem__(self, compound_index):
+        return self.compounds[compound_index,:], self.properties[compound_index]
+
+
+class CompoundNet(torch.nn.Module):
+
+    def __init__(self, n_input_features, hidden_layer_sizes, activation_fn, last_activation_fn=None):
+        super(CompoundNet,
self).__init__()
+        hidden_layer_sizes.append(1) # output layer size is appended to hidden layer sizes
+        layers = [torch.nn.Linear(n_input_features, hidden_layer_sizes[0])]
+        try:
+            activation = getattr(torch.nn, activation_fn)
+            if last_activation_fn:
+                last_activation = getattr(torch.nn, last_activation_fn)
+        except AttributeError:
+            raise ValueError(f'Activation function name {activation_fn} / {last_activation_fn} not recognized')
+        for i, hidden_layer_size in enumerate(hidden_layer_sizes[:-1]):
+            layers.append(activation())
+            layers.append(torch.nn.Linear(hidden_layer_size, hidden_layer_sizes[i + 1]))
+        if last_activation_fn:
+            # Having a non-linear function right before the output may not be needed for some properties being predicted
+            layers.append(last_activation())
+        self.layers = torch.nn.Sequential(*layers)
+
+    def forward(self, x):
+        return self.layers(x)

From fd100058e9fa57882782c9b9c9be79cf04adedb1 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Thu, 3 Feb 2022 01:27:45 -0800
Subject: [PATCH 15/27] most features working

---
 Dockerfile.cuchem                             |   2 +
 common/cuchemcommon/data/__init__.py          |  10 +-
 common/cuchemcommon/data/cluster_wf.py        |  53 +++-
 common/cuchemcommon/data/helper/chembldata.py |  32 ++-
 common/cuchemcommon/fingerprint.py            |  35 ++-
 common/cuchemcommon/workflow.py               |  70 ++++--
 cuchem/cuchem/decorator/lipinski.py           |   8 +-
 cuchem/cuchem/interactive/chemvisualize.py    | 226 ++++++++++--------
 cuchem/cuchem/wf/cluster/__init__.py          |  16 +-
 cuchem/cuchem/wf/cluster/gpukmeansumap.py     | 114 ++++++---
 .../cuchem/wf/generative/megatronmolbart.py   |  95 +++++---
 cuchem/requirements.txt                       |   2 +-
 cuchem/startdash.py                           |   1 +
 cuchem/tests/test_generative_wf.py            |  10 +-
 14 files changed, 439 insertions(+), 235 deletions(-)

diff --git a/Dockerfile.cuchem b/Dockerfile.cuchem
index f5310562..e372f2f1 100644
--- a/Dockerfile.cuchem
+++ b/Dockerfile.cuchem
@@ -17,6 +17,8 @@ RUN cd /opt/nvidia/cheminfomatics/common; \
 RUN cd /opt/nvidia/cheminfomatics/cuchem; \
     pip install -r requirements.txt
 
+RUN pip install torch==1.7.0+cu110 -f https://download.pytorch.org/whl/torch_stable.html
+
 ENV UCX_LOG_LEVEL error
 ENV PYTHONPATH ./common/generated:./common:./cuchem:
 
diff --git a/common/cuchemcommon/data/__init__.py b/common/cuchemcommon/data/__init__.py
index 3a07d30c..403dc91c 100644
--- a/common/cuchemcommon/data/__init__.py
+++ b/common/cuchemcommon/data/__init__.py
@@ -12,16 +12,16 @@ def meta_df(self):
         """
         return NotImplemented
 
-    def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None):
+    def fetch_molecular_embedding(self, n_molecules: int, cache_directory: str = None, radius = 2, nBits = 512):
         """
         Fetch molecular properties from database/cache into a dask array.
         """
         return NotImplemented
 
-    def fetch_molecular_embedding_by_id(self, molecule_id: List):
+    def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512):
         """
         Fetch molecular properties from database for the given id. Id depends on
-        the backend databse. For chemble DB it should be molregid.
+        the backend database. For chembl DB it should be molregno.
         """
         return NotImplemented
 
@@ -29,7 +29,7 @@ def fetch_id_from_smile(self, new_molecules: List):
         """
         Fetch molecular details for a list of molecules. The values in the list
         of molecules depends on database/service used. For e.g. it could be
-        ChemblId or molreg_id for Chemble database.
+        ChemblId or molreg_id for Chembl database.
""" return NotImplemented @@ -40,6 +40,6 @@ def fetch_id_from_chembl(self, id: List): """ Fetch molecular details for a list of molecules. The values in the list of molecules depends on database/service used. For e.g. it could be - ChemblId or molreg_id for Chemble database. + ChemblId or molreg_id for Chembl database. """ return NotImplemented diff --git a/common/cuchemcommon/data/cluster_wf.py b/common/cuchemcommon/data/cluster_wf.py index 6462d5fb..0f1cd993 100644 --- a/common/cuchemcommon/data/cluster_wf.py +++ b/common/cuchemcommon/data/cluster_wf.py @@ -19,8 +19,11 @@ class ChemblClusterWfDao(ClusterWfDAO, metaclass=Singleton): - def __init__(self, fp_type): + def __init__(self, fp_type, radius=2, nBits=512): + logger.info(f'ChemblClusterWfDao({fp_type})') self.chem_data = ChEmblData(fp_type) + self.radius = radius + self.nBits = nBits def meta_df(self): chem_data = ChEmblData() @@ -28,29 +31,55 @@ def meta_df(self): def fetch_molecular_embedding(self, n_molecules: int, - cache_directory: str = None): + cache_directory: str = None, + radius=2, + nBits=512): + # Since we allow the user to change the fingerprint radius and length (nBits), + # the fingerprints need to be cached in separate subdirectories. + # Note: the precomputed ones are not presumed to be of a specific radius or length context = Context() if cache_directory: - hdf_path = os.path.join(cache_directory, FINGER_PRINT_FILES) + cache_subdir = f'{cache_dir}/fp_r{radius}_n{nBits}' + hdf_path = os.path.join(cache_subdir, FINGER_PRINT_FILES) + else: + cache_subdir = None + if cache_directory and os.path.isdir(cache_subdir): # and (self.radius == radius) and (self.nBits == nBits): logger.info('Reading %d rows from %s...', n_molecules, hdf_path) mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints') - + if len(mol_df) == 0: + logger.info(f'Zero molecules found in {hdf_path}! Caching error?') if n_molecules > 0: npartitions = math.ceil(n_molecules / BATCH_SIZE) mol_df = mol_df.head(n_molecules, compute=False, npartitions=npartitions) else: - logger.info('Reading molecules from database...') - mol_df = self.chem_data.fetch_mol_embedding(num_recs=n_molecules, - batch_size=context.batch_size) + self.radius = radius + self.nBits = nBits + logger.info(f'Reading molecules from database and computing fingerprints (radius={self.radius}, nBits={self.nBits})...') + mol_df = self.chem_data.fetch_mol_embedding( + num_recs=n_molecules, + batch_size=context.batch_size, + radius=radius, + nBits=nBits + ) + if cache_directory: + os.mkdir(cache_subdir) + mol_df.to_hdf(hdf_path, 'fingerprints') + logger.info(f'mol_df: {list(mol_df.columns)}')#\n{mol_df.head().compute()}') return mol_df - def fetch_molecular_embedding_by_id(self, molecule_id: List): + def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512): context = Context() - meta = self.chem_data._meta_df() - fp_df = self.chem_data._fetch_mol_embedding(molregnos=molecule_id, - batch_size=context.batch_size) \ - .astype(meta.dtypes) + meta = self.chem_data._meta_df( + f'fetch_molecular_embedding_by_id({molecule_id}): MISMATCH!!! 
radius: {radius} != {self.radius}, nBits: {nBits} != {self.nBits}') + if (self.radius != radius) or (self.nBits != nBits): + logger.info('Something broken?') + fp_df = self.chem_data._fetch_mol_embedding( + molregnos=molecule_id, + batch_size=context.batch_size, + radius=radius, + nBits=nBits + ).astype(meta.dtypes) fp_df = cudf.from_pandas(fp_df) fp_df = dask_cudf.from_cudf(fp_df, npartitions=1).reset_index() diff --git a/common/cuchemcommon/data/helper/chembldata.py b/common/cuchemcommon/data/helper/chembldata.py index 7b0d2728..d7ff5124 100644 --- a/common/cuchemcommon/data/helper/chembldata.py +++ b/common/cuchemcommon/data/helper/chembldata.py @@ -72,7 +72,7 @@ def fetch_props_by_molregno(self, molregnos): cols = list(map(lambda x: x[0], cur.description)) return cols, cur.fetchall() - def fetch_props_by_chemble(self, chemble_ids): + def fetch_props_by_chembl(self, chembl_ids): """ Returns compound properties and structure filtered by ChEMBL IDs along with a list of columns. @@ -88,7 +88,7 @@ def fetch_props_by_chemble(self, chemble_ids): """ with closing(sqlite3.connect(self.chembl_db, uri=True)) as con, con, \ closing(con.cursor()) as cur: - select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chemble_ids]) + select_stmt = sql_stml % "'%s'" % "','".join([x.strip().upper() for x in chembl_ids]) cur.execute(select_stmt) cols = list(map(lambda x: x[0], cur.description)) @@ -207,13 +207,18 @@ def fetch_molecule_cnt(self): return cur.fetchone()[0] - def _meta_df(self, **transformation_kwargs): + def _meta_df(self, columns=[], **transformation_kwargs): transformation = self.fp_type(**transformation_kwargs) prop_meta = {'id': pandas.Series([], dtype='int64')} prop_meta.update(dict(zip(IMP_PROPS + ADDITIONAL_FEILD, IMP_PROPS_TYPE + ADDITIONAL_FEILD_TYPE))) - prop_meta.update({i: pandas.Series([], dtype='float32') for i in range(len(transformation))}) + prop_meta.update( + {i: pandas.Series([], dtype='float32') for i in range(len(transformation))}) + # New columns containing the fingerprint as uint64s: + for column in columns: + if isinstance(column, str) and column.startswith('fp'): + prop_meta.update({column: pandas.Series([], dtype='uint64')}) return pandas.DataFrame(prop_meta) @@ -226,7 +231,9 @@ def _fetch_mol_embedding(self, Returns compound properties and structure for the first N number of records in a dataframe. """ - + # TODO: loading compounds from the database and computing fingerprints need to be separated + # We may need to recompute fingerprints but not reload compounds. + # TODO: user must be able to load compounds by specifying start and batch_size logger.info('Fetching %d records starting %d...' % (batch_size, start)) imp_cols = ['cp.' 
+ col for col in IMP_PROPS] @@ -253,8 +260,9 @@ def _fetch_mol_embedding(self, LIMIT %d, %d ''' % (', '.join(imp_cols), " ,".join(list(map(str, molregnos))), start, batch_size) - df = pandas.read_sql(select_stmt, - sqlite3.connect(self.chembl_db, uri=True)) + df = pandas.read_sql( + select_stmt, + sqlite3.connect(self.chembl_db, uri=True)) # Smiles -> Smiles transformation and filtering # TODO: Discuss internally to find use or refactor this code to remove @@ -269,16 +277,22 @@ def _fetch_mol_embedding(self, # Conversion to fingerprints or embeddings # transformed_smiles = df['transformed_smiles'] transformation = self.fp_type(**transformation_kwargs) - cache_data = transformation.transform(df) + cache_data, raw_fp_list = transformation.transform(df, return_fp=True) + #cache_data = transformation.transform(df) return_df = pandas.DataFrame(cache_data) - return_df = pandas.DataFrame( return_df, columns=pandas.RangeIndex(start=0, stop=len(transformation))).astype('float32') return_df = df.merge(return_df, left_index=True, right_index=True) + # TODO: expect to run into the issue that the fingerprint cannot be a cudf column + # TODO: compute here so that chemvisualize does not have to + #return_df['fp'] = raw_fp_list + for i, fp_col in enumerate(raw_fp_list): + return_df[f'fp{i}'] = fp_col return_df.rename(columns={'molregno': 'id'}, inplace=True) + logger.info(f'_fetch_mol_embedding returning: {list(return_df.columns)}\n{return_df.head()}') return return_df def fetch_mol_embedding(self, diff --git a/common/cuchemcommon/fingerprint.py b/common/cuchemcommon/fingerprint.py index 55f24717..f0524179 100644 --- a/common/cuchemcommon/fingerprint.py +++ b/common/cuchemcommon/fingerprint.py @@ -9,12 +9,14 @@ from cuchem.utils.data_peddler import download_cddd_models from rdkit import Chem from rdkit.Chem import AllChem +from math import ceil os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' logger = logging.getLogger(__name__) +INTEGER_NBITS = 64 # Maximum number of bits in an integer column in a cudf Series -def calc_morgan_fingerprints(dataframe, smiles_col='canonical_smiles'): +def calc_morgan_fingerprints(dataframe, smiles_column='canonical_smiles'): """Calculate Morgan fingerprints on SMILES strings Args: @@ -24,7 +26,7 @@ def calc_morgan_fingerprints(dataframe, smiles_col='canonical_smiles'): pd.DataFrame: new dataframe containing fingerprints """ mf = MorganFingerprint() - fp = mf.transform(dataframe, col_name=smiles_col) + fp = mf.transform(dataframe, smiles_column=smiles_column) fp = pd.DataFrame(fp) fp.index = dataframe.index return fp @@ -41,7 +43,7 @@ def __init__(self, **kwargs): self.kwargs = None self.func = None - def transform(self, data): + def transform(self, data, smiles_column = 'transformed_smiles'): return NotImplemented def transform_many(self, data): @@ -59,14 +61,31 @@ def __init__(self, **kwargs): self.kwargs.update(kwargs) self.func = AllChem.GetMorganFingerprintAsBitVect - def transform(self, data, col_name='transformed_smiles'): - data = data[col_name] + def transform(self, data, smiles_column='transformed_smiles', return_fp=False, raw=False): + data = data[smiles_column] fp_array = [] - for mol in data: - m = Chem.MolFromSmiles(mol) + self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS) + if raw: + raw_fp_array = [] + else: + raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)] + for mol_smiles in data: + m = Chem.MolFromSmiles(mol_smiles) fp = self.func(m, **self.kwargs) - fp_array.append(list(fp.ToBitString())) + fp_bs = fp.ToBitString() + 
fp_array.append(list(fp_bs))
+            if return_fp:
+                if raw:
+                    raw_fp_array.append(fp)
+                else:
+                    for i in range(0, self.kwargs['nBits'], INTEGER_NBITS):
+                        raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2))
         fp_array = np.asarray(fp_array)
+        if return_fp:
+            if raw:
+                return fp_array, raw_fp_array
+            else:
+                return fp_array, np.asarray(raw_fp_array, dtype=np.uint64)
         return fp_array
 
     def __len__(self):
diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py
index 7df63d41..8219ff8c 100644
--- a/common/cuchemcommon/workflow.py
+++ b/common/cuchemcommon/workflow.py
@@ -156,7 +156,7 @@ def compute_unique_smiles(self,
 
     def interpolate_by_id(self,
                           ids: List,
-                          id_type: str = 'chembleid',
+                          id_type: str = 'chemblid',
                           num_points=10,
                           force_unique=False,
                           scaled_radius: int = 1):
@@ -165,21 +165,56 @@ def interpolate_by_id(self,
         if not self.min_jitter_radius:
             raise Exception('Property `radius_scale` must be defined in model class.')
 
-        if id_type.lower() == 'chembleid':
+        if id_type.lower() == 'chemblid':
             smiles = [row[2] for row in self.dao.fetch_id_from_chembl(ids)]
             if len(smiles) != len(ids):
                 raise Exception('One of the ids is invalid %s', ids)
         else:
             raise Exception('id type %s not supported' % id_type)
 
-        return self.interpolate_smiles(smiles,
-                                       num_points=num_points,
-                                       scaled_radius=scaled_radius,
-                                       force_unique=force_unique)
+        return self.interpolate_smiles(
+            smiles,
+            compound_ids=ids,
+            num_points=num_points,
+            scaled_radius=scaled_radius,
+            force_unique=force_unique
+        )
+
+    def extrapolate_from_cluster(self,
+                                 compounds_df,
+                                 compound_property: str,
+                                 cluster_id: int = 0,
+                                 n_compounds_to_transform=10,
+                                 num_points: int = 10,
+                                 step_size: float = 0.01,
+                                 force_unique = False,
+                                 scaled_radius: int = 1):
+        """
+        A direction vector in embedding space is estimated from the compounds of the specified
+        cluster_id and their compound_property values, and applied to those compounds.
+        TODO: We should have a table of direction vectors in embedded space listed, just like the list of compound IDs.
+        The user should choose one to be applied to the selected compounds, or to a cluster number.
+        """
+        smiles_list = None
+
+        if not self.min_jitter_radius:
+            raise Exception('Property `min_jitter_radius` must be defined in model class.')
+        else:
+            radius = float(scaled_radius * self.min_jitter_radius)
+        # TODO: User must be able to extrapolate directly from smiles in the table;
+        # these may themselves be generated compounds without any chemblid.
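+        # Sketch of the flow delegated to extrapolate_from_smiles in the model class
+        # (the direction estimate itself lives in _get_embedding_direction, not shown here):
+        # 1. embed the cluster's compounds and estimate, from their property values, a
+        #    direction in embedding space along which the property increases,
+        # 2. step up to n_compounds_to_transform embeddings along that direction
+        #    (num_points steps scaled by step_size), and
+        # 3. decode each stepped embedding back to SMILES.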
+        df_cluster = compounds_df[ compounds_df['cluster'] == cluster_id ].dropna().reset_index(drop=True).compute()
+        return self.extrapolate_from_smiles(df_cluster['transformed_smiles'].to_array(),
+                                            compound_property_vals=df_cluster[compound_property].to_array(),
+                                            num_points=num_points,
+                                            n_compounds_to_transform=n_compounds_to_transform,
+                                            step_size=step_size,
+                                            radius=scaled_radius,
+                                            force_unique=force_unique)
+
     def find_similars_smiles_by_id(self,
-                                   chemble_id: str,
-                                   id_type: str = 'chembleid',
+                                   chembl_id: str,
+                                   id_type: str = 'chemblid',
                                    num_requested=10,
                                    force_unique=False,
                                    scaled_radius: int = 1):
@@ -188,14 +223,17 @@ def find_similars_smiles_by_id(self,
         if not self.min_jitter_radius:
             raise Exception('Property `radius_scale` must be defined in model class.')
 
-        if id_type.lower() == 'chembleid':
-            smiles = [row[2] for row in self.dao.fetch_id_from_chembl(chemble_id)]
-            if len(smiles) != len(chemble_id):
-                raise Exception('One of the ids is invalid %s' + chemble_id)
+        if id_type.lower() == 'chemblid':
+            smiles = [row[2] for row in self.dao.fetch_id_from_chembl(chembl_id)]
+            if len(smiles) != len(chembl_id):
+                raise Exception('One of the ids is invalid %s' % chembl_id)
         else:
             raise Exception('id type %s not supported' % id_type)
 
-        return self.find_similars_smiles(smiles[0],
-                                         num_requested=num_requested,
-                                         scaled_radius=scaled_radius,
-                                         force_unique=force_unique)
+        return self.find_similars_smiles(
+            smiles[0],
+            num_requested=num_requested,
+            scaled_radius=scaled_radius,
+            force_unique=force_unique,
+            compound_id=str(chembl_id)
+        )
diff --git a/cuchem/cuchem/decorator/lipinski.py b/cuchem/cuchem/decorator/lipinski.py
index 67407fc8..3588df1c 100644
--- a/cuchem/cuchem/decorator/lipinski.py
+++ b/cuchem/cuchem/decorator/lipinski.py
@@ -29,6 +29,7 @@ def decorate(self,
         hacceptors = []
         rotatable_bonds = []
         qeds = []
+        invalid = []
 
         for idx in range(df.shape[0]):
 
@@ -36,6 +37,8 @@ def decorate(self,
             m = Chem.MolFromSmiles(smiles)
 
             if m is None:
+                logger.info(f'{idx}: Could not make a Mol from {smiles}')
+                invalid.append(True)
                 mol_logp.append({'value': '-', 'level': 'info'})
                 mol_wt.append({'value': '-', 'level': 'info'})
                 hdonors.append({'value': '-', 'level': 'info'})
@@ -43,7 +46,8 @@ def decorate(self,
                 rotatable_bonds.append({'value': '-', 'level': 'info'})
                 qeds.append({'value': '-', 'level': 'info'})
                 continue
-
+            else:
+                invalid.append(False)
             try:
                 logp = Descriptors.MolLogP(m)
                 mol_logp.append({'value': round(logp, 2),
@@ -100,5 +104,7 @@ def decorate(self,
         df['H-Bond Acceptors'] = hacceptors
         df['Rotatable Bonds'] = rotatable_bonds
         df['QED'] = qeds
+        # TODO: this may be redundant as chemvisualize seems to be handling such invalid molecules
+        df['invalid'] = invalid
 
         return df
diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index b1dba90d..442012fa 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -1,6 +1,9 @@
 # Copyright 2020 NVIDIA Corporation
 # SPDX-License-Identifier: Apache-2.0
 
+# TODO: separate loading of compounds from clustering of compounds; currently, loading is triggered by a call to clustering.
+# TODO: separate fingerprinting from clustering; currently fingerprinting is triggered by a call to clustering.
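+# A possible decomposition (sketch only; load_compounds / compute_fingerprints / cluster are
+# hypothetical names, not functions in this module):
+#   df = load_compounds(n_molecules)                   # DB read only
+#   df_fp = compute_fingerprints(df, radius, nBits)    # recomputed when the fingerprint spec changes
+#   df_embedding = cluster(df_fp)                      # PCA / KMeans / UMAP only
+# so that changing radius/nBits reuses the loaded compounds instead of reloading them.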
+ import base64 import json import logging @@ -9,6 +12,7 @@ import cupy import dash +import cuml import dash_bootstrap_components as dbc import dash_core_components as dcc import dash_html_components as html @@ -22,12 +26,14 @@ from cuchem.utils import generate_colors, report_ui_error from rdkit import Chem from rdkit.Chem import Draw, PandasTools +from numba.cuda.libdevice import popcll # Check if all of these are needed: from cuchemcommon.fingerprint import MorganFingerprint, INTEGER_NBITS import sys import numpy as np import pandas as pd +import cudf import dask_cudf from dask.distributed import wait from rdkit import DataStructs, Chem @@ -88,12 +94,12 @@ def download_sdf(): valid_idx = [] col_list = ['SMILES', 'Molecular Weight', 'LogP', 'H-Bond Donors', 'H-Bond Acceptors', 'Rotatable Bonds'] - for row, data in vis.genreated_df.iterrows(): + for row, data in vis.generated_df.iterrows(): mol = Chem.MolFromSmiles(data['SMILES']) if (mol is not None): valid_idx.append(row) - valid_df = vis.genreated_df.iloc[valid_idx] + valid_df = vis.generated_df.iloc[valid_idx] valid_df = valid_df[col_list] PandasTools.AddMoleculeColumnToFrame(valid_df, 'SMILES') @@ -121,12 +127,12 @@ def intersection_wrapper(fp_int_col, op_col, query_fp_int): class ChemVisualization(metaclass=Singleton): - def __init__(self, cluster_wf): + def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512): self.app = app self.cluster_wf = cluster_wf self.n_clusters = cluster_wf.n_clusters self.chem_data = ChEmblData() - self.genreated_df = None + self.generated_df = None self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid' self.generative_wf_cls = 'cuchem.wf.generative.MegatronMolBART' @@ -209,10 +215,10 @@ def __init__(self, cluster_wf): Input('bt_close_err', 'n_clicks')])(self.handle_error) self.app.callback( - Output('genration_candidates', 'children'), + Output('generation_candidates', 'children'), [Input({'role': 'bt_add_candidate', 'chemblId': ALL, 'molregno': ALL}, 'n_clicks'), Input('bt_reset_candidates', 'n_clicks'), ], - State('genration_candidates', 'children'))(self.handle_add_candidate) + State('generation_candidates', 'children'))(self.handle_add_candidate) self.app.callback( Output('analoguing_candidates', 'children'), @@ -221,7 +227,7 @@ def __init__(self, cluster_wf): self.app.callback( Output('ckl_candidate_mol_id', 'options'), - Input('genration_candidates', 'children'))(self.handle_construct_candidates) + Input('generation_candidates', 'children'))(self.handle_construct_candidates) self.app.callback( Output('ckl_analoguing_mol_id', 'options'), @@ -237,20 +243,6 @@ def __init__(self, cluster_wf): [Output('ckl_analoguing_mol_id', 'value')], [Input('ckl_analoguing_mol_id', 'value')])(self.handle_analoguing_ckl_selection) -""" self.app.callback( - [Output('table_generated_molecules', 'children'), - Output('show_generated_mol', 'children'), - Output('msg_generated_molecules', 'children'), - Output('interpolation_error', 'children')], - [Input("bt_generate", "n_clicks"), ], - [State('sl_generative_wf', 'value'), - State('ckl_candidate_mol_id', 'value'), - State('n2generate', 'value'), - State('scaled_radius', 'value'), - State('rd_generation_type', 'value'), - State('show_generated_mol', 'children')])(self.handle_generation) -""" - self.app.callback( [Output('section_generated_molecules_clustered', 'style'), Output('gen_figure', 'figure'), @@ -302,7 +294,7 @@ def __init__(self, cluster_wf): def handle_add_candidate(self, bt_add_candidate, bt_reset_candidates, - 
genration_candidates): + generation_candidates): comp_id, event_type = self._fetch_event_data() if comp_id == 'bt_reset_candidates' and event_type == 'n_clicks': @@ -313,8 +305,8 @@ def handle_add_candidate(self, bt_add_candidate, selected_candidates = [] - if genration_candidates: - selected_candidates = genration_candidates.split(",") + if generation_candidates: + selected_candidates = generation_candidates.split(",") comp_detail = json.loads(comp_id) selected_chembl_id = comp_detail['chemblId'] @@ -327,7 +319,7 @@ def handle_add_candidate(self, bt_add_candidate, def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candidates): comp_id, event_type = self._fetch_event_data() - #logger.info(f'handle_analoguing_candidate({bt_analoguing_candidate}, {analoguing_candidates}): cid={comp_id}, et={event_type}, dash.callback_context.triggered[0]["value"]={ dash.callback_context.triggered[0]["value"]}') + logger.info(f'handle_analoguing_candidate({bt_analoguing_candidate}, {analoguing_candidates}): cid={comp_id}, et={event_type}, dash.callback_context.triggered[0]["value"]={ dash.callback_context.triggered[0]["value"]}') if event_type != 'n_clicks' or dash.callback_context.triggered[0]['value'] == 0: raise dash.exceptions.PreventUpdate @@ -341,7 +333,7 @@ def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candid if selected_chembl_id not in selected_candidates: selected_candidates.append(selected_chembl_id) - #logger.info(f'comp_detail={comp_detail}, selected_candidates={selected_candidates}') + logger.info(f'comp_detail={comp_detail}, selected_candidates={selected_candidates}') return ','.join(selected_candidates) def _fetch_event_data(self): @@ -361,27 +353,31 @@ def handle_property_tables(self, show_generated_mol, show_selected_mol): return dash.no_update, dash.no_update @report_ui_error(4) - def handle_generation(self, bt_generate, - sl_generative_wf, ckl_candidate_mol_id, - n2generate, scaled_radius, rd_generation_type, show_generated_mol): + def handle_generation( + self, bt_generate, sl_generative_wf, ckl_candidate_mol_id, n2generate, + extrap_compound_property, extrap_cluster_number, extrap_n_compounds, extrap_step_size, + scaled_radius, rd_generation_type, show_generated_mol + ): comp_id, event_type = self._fetch_event_data() - chemble_ids = [] + chembl_ids = [] if comp_id == 'bt_generate' and event_type == 'n_clicks': - chemble_ids = ckl_candidate_mol_id + chembl_ids = ckl_candidate_mol_id else: return dash.no_update, dash.no_update self.generative_wf_cls = sl_generative_wf wf_class = locate(self.generative_wf_cls) generative_wf = wf_class() + logger.info(f'locate({self.generative_wf_cls}) = {wf_class}, rd_generation_type={rd_generation_type}') + sys.stdout.flush() n2generate = int(n2generate) scaled_radius = float(scaled_radius) if rd_generation_type == 'SAMPLE': - if chemble_ids == None or len(chemble_ids) == 0: + if chembl_ids == None or len(chembl_ids) == 0: raise ValueError('Please select at-least one molecule for Sampling.') - self.genreated_df = generative_wf.find_similars_smiles_by_id(chemble_ids, + self.generated_df = generative_wf.find_similars_smiles_by_id(chembl_ids, num_requested=n2generate, scaled_radius=scaled_radius, force_unique=True) @@ -395,9 +391,9 @@ def handle_generation(self, bt_generate, scaled_radius=scaled_radius, force_unique=False)#True) else: - if chemble_ids == None or len(chemble_ids) < 2: + if chembl_ids == None or len(chembl_ids) < 2: raise ValueError('Please select at-least two molecules for Interpolation.') - 
self.genreated_df = generative_wf.interpolate_by_id(chemble_ids, + self.generated_df = generative_wf.interpolate_by_id(chembl_ids, num_points=n2generate, scaled_radius=scaled_radius, force_unique=True) @@ -407,8 +403,8 @@ def handle_generation(self, bt_generate, show_generated_mol += 1 # Add other useful attributes to be added for rendering - self.genreated_df = MolecularStructureDecorator().decorate(self.genreated_df) - self.genreated_df = LipinskiRuleOfFiveDecorator().decorate(self.genreated_df) + self.generated_df = MolecularStructureDecorator().decorate(self.generated_df) + self.generated_df = LipinskiRuleOfFiveDecorator().decorate(self.generated_df) self.generated_df = self.generated_df[ ~self.generated_df['invalid'] ].reset_index(drop=True).drop(columns=['invalid']) if len(self.generated_df) == 0: logger.info("None of the generated smiles yielded valid molecules!!!") @@ -416,7 +412,9 @@ def handle_generation(self, bt_generate, # Note: we are not allowing fingerprint specification to change here because we want to see the results on the same PCA / UMAP as the original figure # TODO: make this clear in the UI - fps = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform(self.generated_df, smiles_column='SMILES') + fps = MorganFingerprint( + radius=self.fingerprint_radius, nBits=self.fingerprint_nBits + ).transform(self.generated_df, smiles_column='SMILES') df_fp = pd.DataFrame(fps, dtype='float32') self.generated_df = pd.concat([self.generated_df, df_fp], axis=1) df_fp=cudf.from_pandas(df_fp) @@ -438,10 +436,21 @@ def handle_generation(self, bt_generate, df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding) prop_series['cluster'] = cluster_col n_molecules, n_obs = df_embedding.compute().shape # needed? + logger.info( + f'cluster_wf: {self.cluster_wf}, self.cluster_wf.pca={self.cluster_wf.pca}, isinstance(self.cluster_wf.pca, cuml.PCA)={isinstance(self.cluster_wf.pca, cuml.PCA)}'\ + f'\ndf_embedding: {type(df_embedding)}, isinstance(df_embedding, dask_cudf.DataFrame)={isinstance(df_embedding, dask_cudf.DataFrame)}\n{df_embedding.head()}') + sys.stdout.flush() + #if hasattr(df_embedding, 'compute'): + # df_embedding = df_embedding.compute() + # logger.info(f'df_embedding after compute(): {type(df_embedding)}\n{df_embedding.head()}') + # sys.stdout.flush() + + if isinstance(self.cluster_wf.pca, cuml.PCA) and isinstance(df_embedding, dask_cudf.DataFrame): + df_embedding = df_embedding.compute() df_embedding = self.cluster_wf.pca.transform(df_embedding) df_embedding = df_embedding.persist() # TODO: wait after this? - X_train = df_embedding.compute() # needed? - Xt = self.cluster_wf.umap_model.transform(df_embedding) + #X_train = df_embedding.compute() # needed? 
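+        # Project the generated molecules with the PCA/UMAP models fitted on the original
+        # dataset (rather than refitting them) so that, as noted above, the new points land
+        # on the same 2-D map as the original compounds and the coordinates stay comparable.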
+ Xt = self.cluster_wf.umap.transform(df_embedding) df_embedding['x'] = Xt[0] df_embedding['y'] = Xt[1] @@ -459,21 +468,21 @@ def handle_generation(self, bt_generate, for col_name in self.generated_df.columns.to_list() if not isinstance(col_name, int) ] - #columns = self.genreated_df.columns.to_list() - #ignore_columns = ['embeddings', 'embeddings_dim'] + #columns = self.generated_df.columns.to_list() + ignore_columns = ['embeddings', 'embeddings_dim'] for column in columns: - #if column in ignore_columns: - # continue + if column in ignore_columns: + continue table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] invalid_mol_cnt = 0 - for row_idx in range(self.genreated_df.shape[0]): + for row_idx in range(self.generated_df.shape[0]): td = [] try: col_pos = columns.index('Chemical Structure') - col_data = self.genreated_df.iat[row_idx, col_pos] + col_data = self.generated_df.iat[row_idx, col_pos] if 'value' in col_data and col_data['value'] == MolecularStructureDecorator.ERROR_VALUE: invalid_mol_cnt += 1 @@ -482,7 +491,7 @@ def handle_generation(self, bt_generate, pass for col_id in range(len(columns)): - col_data = self.genreated_df.iat[row_idx, col_id] + col_data = self.generated_df.iat[row_idx, col_id] #if columns[col_id] in ignore_columns: # continue @@ -512,12 +521,9 @@ def handle_generation(self, bt_generate, if invalid_mol_cnt > 0: msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' - #return html.Table(prop_recs, style={'width': '100%', - # 'border': '1px solid lightgray'}), \ - # show_generated_mol, \ - # msg_generated_molecules, \ - # dash.no_update - return {'display': 'inline'}, fig, html.Table(prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'}), show_generated_mol, dash.no_update + return {'display': 'inline'}, fig, html.Table( + prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'} + ), show_generated_mol, dash.no_update @report_ui_error(3) @@ -570,23 +576,36 @@ def handle_analoguing( smiles_column = 'canonical_smiles' else: smiles_columns = 'SMILES' + logger.info(f'self.cluster_wf.df_embedding: {self.cluster_wf.df_embedding}\n{self.cluster_wf.df_embedding.head()}') if self.fp_df is None: # CPU-based workflow, to be deprecated - smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].map_partitions(cudf.DataFrame.to_pandas) + logger.info(f'self.fp_df not set, computing on CPU') + if isinstance(self.cluster_wf.df_embedding, dask_cudf.DataFrame): + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].map_partitions(cudf.DataFrame.to_pandas) + elif isinstance(self.cluster_wf.df_embedding, cudf.DataFrame): + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].to_pandas() + else: + smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']] + if 'fp' not in self.cluster_wf.df_embedding.columns: logger.info(f'Computing fingerprints...') _, v = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( smiles_df, smiles_column=smiles_column, return_fp=True, raw=True) else: logger.info(f'Fingerprints already available') - v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas()) + if hasattr(self.cluster_wf.df_embedding, 'compute'): + v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas()) + else: + v = list(self.cluster_wf.df_embedding['fp']) + self.fp_df = 
pd.DataFrame({ 'fp': v, smiles_column: smiles_df[smiles_column], #list(self.cluster_wf.df_embedding[smiles_column].compute().to_pandas()), #smiles_df[smiles_column], 'id': smiles_df['id'], #list(self.cluster_wf.df_embedding['id'].compute().to_pandas()) }) - self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() - wait(self.cluster_wf.df_embedding) + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) if 'pc' not in self.cluster_wf.df_embedding.columns: # Pre-computing the popcounts for all compounds in the database: @@ -602,13 +621,16 @@ def handle_analoguing( # More complex syntax was not necessary: #self.cluster_wf.df_embedding['op_col'] = self.cluster_wf.df_embedding.map_partitions(popcll_wrapper_dask, col, 'op_col') #lambda df: df = df.apply_rows(popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {})) self.cluster_wf.df_embedding['pc'] += self.cluster_wf.df_embedding['op_col'] - self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() - wait(self.cluster_wf.df_embedding) + if hasattr(self.cluster_wf.df_embedding, 'persist'): + self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() + wait(self.cluster_wf.df_embedding) t1 = time.time() logger.info(f'Time to compute partial popcounts: {t1 - t0}') # Prepare the query compound: - molregno = self.chem_data.fetch_molregno_by_chemblId([analoguing_mol_id])[0][0] + logger.info(f'analoguing_mol_id={analoguing_mol_id}') + molregno = self.chem_data.fetch_molregno_by_chemblId( + [analoguing_mol_id])[0][0] props, selected_molecules = self.chem_data.fetch_props_by_molregno([molregno]) query_smiles = selected_molecules[0][props.index('canonical_smiles')] query_fp = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( @@ -627,10 +649,14 @@ def handle_analoguing( #self.cluster_wf.df_embedding['op_col'] = 0 self.cluster_wf.df_embedding['n_intersection'] = 0 t4 = time.time() + logger.info(f'self.cluster_wf.df_embedding: {type(self.cluster_wf.df_embedding)}, {list(self.cluster_wf.df_embedding.columns)}\n{self.cluster_wf.df_embedding.head()}') + sys.stdout.flush() for i in range(0, self.fingerprint_nBits, INTEGER_NBITS): fp_num = i // INTEGER_NBITS + logger.info(f'i={i}, fp_num={fp_num}') self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( - intersection_wrapper, incols={f'fp{fp_num}': 'fp_int_col'}, outcols={'op_col': int}, kwargs={'query_fp_int': query_fp_ints[fp_num]}) + intersection_wrapper, incols={f'fp{fp_num}': 'fp_int_col'}, + outcols={'op_col': int}, kwargs={'query_fp_int': query_fp_ints[fp_num]}) #logging.info(f'{i}:\n{self.cluster_wf.df_embedding.head()}') #self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() #wait(self.cluster_wf.df_embedding) @@ -763,6 +789,9 @@ def handle_construct_candidates2(self, north_star): if not north_star: return [] + options = [{'label': i.strip(), 'value': i.strip()} for i in north_star.split(',')] + return options + def handle_reset(self, bt_reset, bt_apply_wf, refresh_main_fig, sl_wf): comp_id, event_type = self._fetch_event_data() @@ -785,10 +814,17 @@ def handle_reset(self, bt_reset, bt_apply_wf, refresh_main_fig, sl_wf): def recluster(self, filter_values=None, filter_column=None, reload_data=False): self.cluster_wf.n_clusters = self.n_clusters if reload_data: - return self.cluster_wf.cluster() + return self.cluster_wf.cluster( + 
fingerprint_radius=self.fingerprint_radius, fingerprint_nBits=self.fingerprint_nBits + ) else: - return self.cluster_wf.recluster(filter_column, filter_values, - n_clusters=self.n_clusters) + return self.cluster_wf.recluster( + filter_column, + filter_values, + n_clusters=self.n_clusters, + fingerprint_radius=self.fingerprint_radius, + fingerprint_nBits=self.fingerprint_nBits + ) def recluster_selection( self, @@ -799,6 +835,7 @@ def recluster_selection( reload_data=False, recluster_data=True, color_col='cluster', + fingerprint_radius=2, fingerprint_nBits=512 ): @@ -811,6 +848,7 @@ def recluster_selection( reload_data=reload_data ) else: + # Can use previous embedding only if fingerprint has not changed df_embedding = self.cluster_wf.df_embedding return self.create_graph(df_embedding, @@ -835,7 +873,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop moi_molregno = [] if north_stars: - moi_molregno = list(map(int, north_stars.split(","))) + moi_molregno = north_stars.split(",") #list(map(int, north_stars.split(","))) moi_filter = ldf['id'].isin(moi_molregno) @@ -855,7 +893,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop cluster = ldf['cluster'] customdata = ldf['id'] grad_prop = ldf[gradient_prop] - textdata = cupy.asarray([ f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ]) + textdata = cupy.asarray([ + f'C-{c}_ID-{cid}' for c, cid in zip(cdf['cluster'].to_array(), cdf['id'].to_array()) ]) if self.cluster_wf.is_gpu_enabled(): x_data = x_data.to_array() @@ -930,7 +969,8 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop 'marker': { 'size': df_size, 'symbol': df_shape, - 'color': self.cluster_colors[int(cluster_id) % len(self.cluster_colors)], + 'color': self.cluster_colors[ + int(cluster_id) % len(self.cluster_colors)], }, }) if moi_present: @@ -1017,7 +1057,7 @@ def construct_molecule_detail(self, selected_points, display_properties, prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] if chembl_ids: - props, selected_molecules = self.chem_data.fetch_props_by_chemble(chembl_ids) + props, selected_molecules = self.chem_data.fetch_props_by_chembl(chembl_ids) elif selected_points: selected_molregno = [] for point in selected_points['points'][((page - 1) * pageSize): page * pageSize]: @@ -1205,7 +1245,6 @@ def constuct_layout(self): dcc.Input(id='n2generate', value=10), ], style={'marginLeft': 0}), - html.Div(className='row', children=[ html.Label([ "Select molecular property for fitting and extrapolation", @@ -1230,7 +1269,6 @@ def constuct_layout(self): dcc.Input(id='extrap_n_compounds', value=10), ], style={'marginLeft': 0}), - html.Div(className='row', children=[ dcc.Markdown("Scaled sampling radius (int, start with 1)", style={'marginLeft': 10, 'marginTop': 12, 'width': '250px'}), @@ -1332,17 +1370,7 @@ def constuct_layout(self): ]), dcc.Tab(label='Find Analogues', children=[ - dcc.Markdown(children="""Choose a compound""", - id="analoguing_msg", - style={'marginTop': 18} - ), - dcc.Checklist( - id='ckl_analoguing_mol_id', - options=[], - value=[], - inputStyle={'display': 'inline-block', 'marginLeft': 6, 'marginRight': 6}, - labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6} - ), + html.Div(className='row', children=[ dcc.Markdown("Maxinum Number of Analogues", style={'marginTop': 12,}), dcc.Input(id='analoguing_n_analogues', value=10), @@ -1365,6 +1393,17 @@ def constuct_layout(self): value='similar', 
clearable=False), ]), + dcc.Markdown(children="""Choose a compound""", + id="analoguing_msg", + style={'marginTop': 18} + ), + dcc.Checklist( + id='ckl_analoguing_mol_id', + options=[], + value=[], + #inputStyle={'display': 'inline-block', 'marginLeft': 6, 'marginRight': 6}, + #labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6} + ), html.Div(className='row', children=[ dbc.Button('Search', id='bt_analoguing', n_clicks=0, style={'marginRight': 12}), ], style={'marginLeft': 0}), @@ -1382,8 +1421,7 @@ def constuct_layout(self): ], className='three columns', style={'marginLeft': 18, 'marginTop': 90, 'verticalAlign': 'text-top', }), ]), - html.Div( - id='section_generated_molecules', + html.Div(id='section_generated_molecules', children=[ html.A( 'Export to SDF', @@ -1398,22 +1436,6 @@ def constuct_layout(self): ], style={'display': 'none'} ), - #html.Div(className='row', children=[ - # html.Div(id='section_generated_molecules', children=[ - # html.Div(className='row', children=[ - # html.A('Export to SDF', - # id='download-link', - # download="rawdata.sdf", - # href="/cheminfo/downloadSDF", - # target="_blank", - # n_clicks=0, - # style={'fontSize': '150%'} - # ), - # html.Div(id='msg_generated_molecules', children=[], - # style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}), - # ], style={'marginLeft': 0, 'marginBottom': 18, }), - # html.Div(id='table_generated_molecules', children=[], style={'width': '100%'}) - # ], style={'display': 'none', 'width': '100%'}), html.Div(id='section_generated_molecules_clustered', children=[ dcc.Graph(id='gen_figure', figure=fig, @@ -1461,7 +1483,6 @@ def constuct_layout(self): style={'width': '100%'} ) ], style={'display': 'none', 'width': '100%'}), - #], style={'margin': 12}), html.Div(id='section_analoguing', children=[ html.Div(children=[ @@ -1477,7 +1498,7 @@ def constuct_layout(self): html.Div(id='mol_selection_error', style={'display': 'none'}), html.Div(id='show_selected_mol', style={'display': 'none'}), html.Div(id='show_generated_mol', style={'display': 'none'}), - html.Div(id='genration_candidates', style={'display': 'none'}), + html.Div(id='generation_candidates', style={'display': 'none'}), html.Div(id='refresh_moi_prop_table', style={'display': 'none'}), html.Div(id='interpolation_error', style={'display': 'none'}), html.Div(id='analoguing_candidates', style={'display': 'none'}), # Not displayed but used to keep track of compounds added to checklist of compounds to be analogued @@ -1665,7 +1686,8 @@ def handle_re_cluster( elif comp_id == 'bt_north_star' and event_type == 'n_clicks': if north_star: north_star = north_star.split(',') - missing_mols, molregnos, _ = self.cluster_wf.add_molecules(north_star) + missing_mols, molregnos, _ = self.cluster_wf.add_molecules( + north_star, radius=int(fingerprint_radius), nBits=int(fingerprint_nBits)) recluster_data = len(missing_mols) > 0 logger.info("%d missing molecules added...", len(missing_mols)) logger.debug("Missing molecules werew %s", missing_mols) diff --git a/cuchem/cuchem/wf/cluster/__init__.py b/cuchem/cuchem/wf/cluster/__init__.py index bff3dcd4..ab9f36ae 100644 --- a/cuchem/cuchem/wf/cluster/__init__.py +++ b/cuchem/cuchem/wf/cluster/__init__.py @@ -22,16 +22,14 @@ def _remove_ui_columns(self, embedding): def _remove_non_numerics(self, embedding): embedding = self._remove_ui_columns(embedding) - - other_props = ['id'] + IMP_PROPS + ADDITIONAL_FEILD - # Tempraryly store columns not required during processesing + # Fingerprint columns have the names 
0, 1, 2,...
+        non_numeric_col_names = [col for col in embedding.columns if type(col) != int]
+        # Temporarily store columns not required during processing
         prop_series = {}
-        for col in other_props:
-            if col in embedding.columns:
-                prop_series[col] = embedding[col]
+        for col in non_numeric_col_names:
+            prop_series[col] = embedding[col]
         if len(prop_series) > 0:
-            embedding = embedding.drop(other_props, axis=1)
-
+            embedding = embedding.drop(non_numeric_col_names, axis=1)
         return embedding, prop_series
 
     def _random_sample_from_arrays(self, *input_array_list, n_samples=None, index=None):
@@ -97,7 +95,7 @@ def recluster(self,
 
     def add_molecules(self, chemblids: List):
         """
-        ChembleId's accepted as argument to the existing database. Duplicates
+        ChEMBL IDs are accepted as arguments to the existing database. Duplicates
         must be ignored.
         """
         raise NotImplementedError
diff --git a/cuchem/cuchem/wf/cluster/gpukmeansumap.py b/cuchem/cuchem/wf/cluster/gpukmeansumap.py
index 58aa739f..395dd419 100644
--- a/cuchem/cuchem/wf/cluster/gpukmeansumap.py
+++ b/cuchem/cuchem/wf/cluster/gpukmeansumap.py
@@ -42,43 +42,52 @@
 
 @singledispatch
-def _gpu_cluster_wrapper(embedding, n_pca, self):
+def _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self):
     return NotImplemented
 
 @_gpu_cluster_wrapper.register(dask.dataframe.core.DataFrame)
-def _(embedding, n_pca, self):
+def _(embedding, n_pca, reuse_umap, reuse_pca, self):
     embedding = dask_cudf.from_dask_dataframe(embedding)
-    return _gpu_cluster_wrapper(embedding, n_pca, self)
+    return _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self)
 
 @_gpu_cluster_wrapper.register(cudf.DataFrame)
-def _(embedding, n_pca, self):
+def _(embedding, n_pca, reuse_umap, reuse_pca, self):
     embedding = dask_cudf.from_cudf(embedding,
-                                    chunksize=int(embedding.shape[0] * 0.1))
-    return _gpu_cluster_wrapper(embedding, n_pca, self)
+                                    chunksize=max(10, int(embedding.shape[0] * 0.1)))
+    return _gpu_cluster_wrapper(embedding, n_pca, reuse_umap, reuse_pca, self)
 
 @_gpu_cluster_wrapper.register(dask_cudf.core.DataFrame)
-def _(embedding, n_pca, self):
+def _(embedding, n_pca, reuse_umap, reuse_pca, self):
     embedding = embedding.persist()
-    return self._cluster(embedding, n_pca)
+    logger.info(f'_gpu_cluster_wrapper: self={self}, {type(embedding)}, {n_pca}, {reuse_umap}, {reuse_pca}')
+    return self._cluster(embedding, n_pca, reuse_umap, reuse_pca)
 
 class GpuKmeansUmap(BaseClusterWorkflow, metaclass=Singleton):
-
+    # TODO: support changing fingerprint radius and nBits in other kmeans workflows as well (hybrid, random projection)
     def __init__(self,
                  n_molecules: int = None,
                  dao: ClusterWfDAO = ChemblClusterWfDao(MorganFingerprint),
                  pca_comps=64,
                  n_clusters=7,
-                 seed=0):
+                 seed=0,
+                 fingerprint_radius=2,
+                 fingerprint_nBits=512
+                 ):
         super().__init__()
-        self.dao = dao
+        self.dao = dao if dao is not None else ChemblClusterWfDao(
+            MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits
+        )
+        self.fingerprint_radius = fingerprint_radius
+        self.fingerprint_nBits = fingerprint_nBits
         self.n_molecules = n_molecules
         self.pca_comps = pca_comps
         self.pca = None
+        self.umap = None
         self.n_clusters = n_clusters
         self.df_embedding = None
@@ -87,12 +96,12 @@ def __init__(self,
         self.n_spearman = 5000
         self.n_silhouette = 500000
 
-    def _cluster(self, embedding, n_pca):
+    def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=True):
         """
         Generates UMAP transformation on Kmeans labels generated from
         molecular fingerprints.
""" - + reuse_umap = reuse_umap and reuse_pca dask_client = self.context.dask_client embedding = embedding.reset_index() @@ -106,15 +115,18 @@ def _cluster(self, embedding, n_pca): if n_pca and n_obs > n_pca: with MetricsLogger('pca', self.n_molecules) as ml: - if self.pca is None: + if (self.pca is None) or not reuse_pca: self.pca = cuDaskPCA(client=dask_client, n_components=n_pca) self.pca.fit(embedding) + else: + logger.info(f'Using available pca') embedding = self.pca.transform(embedding) embedding = embedding.persist() with MetricsLogger('kmeans', self.n_molecules) as ml: - if self.n_molecules < MIN_RECLUSTER_SIZE: - raise Exception('Reclustering less than %d molecules is not supported.' % MIN_RECLUSTER_SIZE) + if self.n_molecules < self.n_clusters: # < MIN_RECLUSTER_SIZE: + raise Exception('Reclustering {self.n_molecules} molecules into {self.n_clusters} clusters not supported.')# % MIN_RECLUSTER_SIZE) + #raise Exception('Reclustering less than %d molecules is not supported.' % MIN_RECLUSTER_SIZE) kmeans_cuml = cuDaskKMeans(client=dask_client, n_clusters=self.n_clusters) @@ -137,13 +149,18 @@ def _cluster(self, embedding, n_pca): local_model = cuUMAP() local_model.fit(X_train) - umap_model = cuDaskUMAP(local_model, - n_neighbors=100, - a=1.0, - b=1.0, - learning_rate=1.0, - client=dask_client) - Xt = umap_model.transform(embedding) + if not (reuse_umap and self.umap): + self.umap = cuDaskUMAP( + local_model, + n_neighbors=100, + a=1.0, + b=1.0, + learning_rate=1.0, + client=dask_client + ) + else: + logger.info(f'reusing {self.umap}') + Xt = self.umap.transform(embedding) ml.metric_name = 'spearman_rho' ml.metric_func = self._compute_spearman_rho @@ -165,30 +182,48 @@ def _cluster(self, embedding, n_pca): return embedding - def cluster(self, df_mol_embedding=None): + def cluster(self, df_mol_embedding=None, reuse_umap=False, reuse_pca=True, fingerprint_radius=2, fingerprint_nBits=512): - logger.info("Executing GPU workflow...") + #logger.info("Executing GPU workflow...") + logger.info(f"GpuKmeansUmap.cluster(radius={fingerprint_radius}, nBits={fingerprint_nBits}), df_mol_embedding={df_mol_embedding}") - if df_mol_embedding is None: + if (df_mol_embedding is None) or (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): self.n_molecules = self.context.n_molecule - + self.dao = ChemblClusterWfDao( + MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits) + self.fingerprint_radius = fingerprint_radius + self.fingerprint_nBits = fingerprint_nBits + logger.info(f'dao={self.dao}') df_mol_embedding = self.dao.fetch_molecular_embedding( self.n_molecules, cache_directory=self.context.cache_directory, - ) + radius=fingerprint_radius, + nBits=fingerprint_nBits + ) df_mol_embedding = df_mol_embedding.persist() - - self.df_embedding = _gpu_cluster_wrapper(df_mol_embedding, - self.pca_comps, - self) + self.df_embedding = _gpu_cluster_wrapper( + df_mol_embedding, + self.pca_comps, + reuse_umap, + reuse_pca, + self + ) return self.df_embedding def recluster(self, filter_column=None, filter_values=None, - n_clusters=None): + n_clusters=None, + fingerprint_radius=2, + fingerprint_nBits=512 + ): + + # The user may have changed the fingerprint specification, in which case, we cannot reuse the embeddings + if (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): + return self.cluster(df_mol_embedding=None, reuse_umap=False, reuse_pca=False, fingerprint_radius=fingerprint_radius, 
fingerprint_nBits=fingerprint_nBits) + logger.info(f"recluster(radius={fingerprint_radius}, nBits={fingerprint_nBits}): reusing embedding") df_embedding = self.df_embedding if filter_values is not None: filter = df_embedding[filter_column].isin(filter_values) @@ -199,11 +234,11 @@ def recluster(self, if n_clusters is not None: self.n_clusters = n_clusters - self.df_embedding = _gpu_cluster_wrapper(df_embedding, None, self) + self.df_embedding = _gpu_cluster_wrapper(df_embedding, None, False, True, self) return self.df_embedding - def add_molecules(self, chemblids: List): + def add_molecules(self, chemblids: List, radius=2, nBits=512): chemblids = [x.strip().upper() for x in chemblids] chem_mol_map = {row[0]: row[1] for row in self.dao.fetch_id_from_chembl(chemblids)} @@ -239,6 +274,7 @@ def add_molecules(self, chemblids: List): self.df_embedding = self._remove_ui_columns(self.df_embedding) self.df_embedding = self.df_embedding.append(new_fingerprints) + # TODO: does caller expect cudf or dask_cudf? if hasattr(self.df_embedding, 'compute'): self.df_embedding = self.df_embedding.compute() @@ -261,7 +297,7 @@ def __init__(self, n_clusters=n_clusters, seed=seed) - def _cluster(self, embedding, n_pca): + def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=False): """ Generates UMAP transformation on Kmeans labels generated from molecular fingerprints. @@ -281,7 +317,7 @@ def _cluster(self, embedding, n_pca): if n_pca and n_obs > n_pca: with MetricsLogger('pca', self.n_molecules) as ml: - if self.pca == None: + if (self.pca == None) or not reuse_pca: self.pca = cuml.PCA(n_components=n_pca) self.pca.fit(embedding) embedding = self.pca.transform(embedding) @@ -305,8 +341,8 @@ def _cluster(self, embedding, n_pca): ml.metric_func_args = (embedding_sample, kmeans_labels_sample) with MetricsLogger('umap', self.n_molecules) as ml: - umap = cuml.manifold.UMAP() - Xt = umap.fit_transform(embedding) + self.umap = cuml.manifold.UMAP() + Xt = self.umap.fit_transform(embedding) ml.metric_name = 'spearman_rho' ml.metric_func = self._compute_spearman_rho diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index a57a989b..075295de 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -31,13 +31,17 @@ from math import sqrt logger = logging.getLogger(__name__) +PAD_TOKEN = 0 # TODO: use tokenizer.pad_token instead class MegatronMolBART(BaseGenerativeWorkflow, metaclass=Singleton): def __init__(self, dao: GenerativeWfDao = ChemblGenerativeWfDao(None)) -> None: super().__init__(dao) - + if torch.cuda.is_available(): + self.device = 'cuda' + else: + self.device = 'cpu' self.min_jitter_radius = 1 channel = grpc.insecure_channel(os.getenv('Megamolbart', 'megamolbart:50051')) self.stub = GenerativeSamplerStub(channel) @@ -74,7 +78,8 @@ def find_similars_smiles(self, num_requested: int = 10, scaled_radius=None, force_unique=False, - sanitize=True): + sanitize=True, + compound_id=None): spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -89,20 +94,33 @@ def find_similars_smiles(self, for embedding in result.embeddings: embeddings.append(list(embedding.embedding)) dims.append(embedding.dim) - - generated_df = pd.DataFrame({'SMILES': generatedSmiles, - 'embeddings': embeddings, - 'embeddings_dim': dims, - 'Generated': [True for i in range(len(generatedSmiles))]}) - generated_df['Generated'].iat[0] = False + if not compound_id: + compound_id = 
'source' + generated_df = pd.DataFrame({ + 'SMILES': generatedSmiles, + 'embeddings': embeddings, + 'embeddings_dim': dims, + 'Generated': [False] + [True] * (len(generatedSmiles) - 1), + 'id': [ + str(compound_id)] + [f'{compound_id}-g{i + 1}' + for i in range(len(generatedSmiles) - 1) + ], + }) + #generated_df['Generated'].iat[0] = False return generated_df - def interpolate_smiles(self, - smiles: List, - num_points: int = 10, - scaled_radius=None, - force_unique=False): + def interpolate_smiles( + self, + smiles: List, + num_points: int = 10, + scaled_radius=None, + force_unique=False, + compound_ids=[] + ): + if len(compound_ids) == 0: + compound_ids = [f'source{i}' for i in range(len(smiles))] + spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -111,11 +129,27 @@ def interpolate_smiles(self, result = self.stub.Interpolate(spec) result = result.generatedSmiles - - generated_df = pd.DataFrame({'SMILES': result, - 'Generated': [True for i in range(len(result))]}) - generated_df.iat[0, 1] = False - generated_df.iat[-1, 1] = False + n_pairs = len(compound_ids) - 1 + n_generated = num_points + 2 + n_generated_total = n_generated * n_pairs + assert len(result) == n_generated_total, f"Expected generator to return {n_generated} compounds between each of the {n_pairs} compound-pairs but got {len(result)}" + generated_df = pd.DataFrame({ + 'SMILES': result, + 'Generated': [ + i % n_generated not in [0, n_generated - 1] + for i in range(n_generated_total) + ], + 'id': [ + str(compound_ids[i // n_generated]) if i % n_generated == 0 + else str(compound_ids[1 + i // n_generated]) if i % n_generated == n_generated - 1 + else f'{compound_ids[i // n_generated]}-{compound_ids[1 + i // n_generated]}_i{i % n_generated}' + for i in range(n_generated_total) + ], + }) + #generated_df = pd.DataFrame({'SMILES': result, + # 'Generated': [True for i in range(len(result))]}) + #generated_df.iat[0, 1] = False + #generated_df.iat[-1, 1] = False return generated_df @@ -239,8 +273,8 @@ def extrapolate_from_smiles(self, avg_tani = 0 embeddings = [] for i, smiles in enumerate(smiles_list): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, + spec = GenerativeSpec( + model=GenerativeModel.MegaMolBART, smiles=smiles, ) result = self.stub.SmilesToEmbedding(spec) @@ -248,7 +282,7 @@ def extrapolate_from_smiles(self, mask = result.pad_mask emb_shape = result.dim if debug: - spec = generativesampler_pb2.EmbeddingList( + spec = EmbeddingList( embedding=emb, dim=emb_shape, pad_mask=mask @@ -311,7 +345,7 @@ def extrapolate_from_smiles(self, extrap_embedding = list(extrap_embeddings[i,:]) logger.info(f'embedding: {type(extrap_embedding)}, {len(extrap_embeddings)};'\ f' dim: {type(emb_shape)}, {len(emb_shape)}; pad_mask={type(full_mask)}, {len(full_mask)}') - spec = generativesampler_pb2.EmbeddingList( + spec = EmbeddingList( embedding=extrap_embedding, dim=emb_shape, pad_mask=full_mask @@ -363,8 +397,12 @@ def fit_nn( the train and test set. 
""" logger.info(f'cluster_id_train={cluster_id_train}, cluster_id_test={cluster_id_test}, compound_property={compound_property}, compounds_df: {len(compounds_df)}, {type(compounds_df)}') - df_train = compounds_df[ compounds_df['cluster'] == int(cluster_id_train) ].dropna().reset_index(drop=True).compute() - df_test = compounds_df[ compounds_df['cluster'] == int(cluster_id_test) ].dropna().reset_index(drop=True).compute() + df_train = compounds_df[ compounds_df['cluster'] == int(cluster_id_train) ].dropna().reset_index(drop=True)#.compute() + if hasattr(df_train, 'compute'): + df_train = df_train.compute() + df_test = compounds_df[ compounds_df['cluster'] == int(cluster_id_test) ].dropna().reset_index(drop=True)#.compute() + if hasattr(df_test, 'compute'): + df_test = df_test.compute() n_train = len(df_train) n_test = len(df_test) @@ -382,8 +420,8 @@ def fit_nn( #radius = self._compute_radius(scaled_radius) for i, smiles in enumerate(smiles_list): - spec = generativesampler_pb2.GenerativeSpec( - model=generativesampler_pb2.GenerativeModel.MegaMolBART, + spec = GenerativeSpec( + model=GenerativeModel.MegaMolBART, smiles=smiles, #radius=radius ) @@ -396,7 +434,7 @@ def fit_nn( #emb = emb[2:] if debug: - spec = generativesampler_pb2.EmbeddingList( + spec = EmbeddingList( embedding=emb, dim=emb_shape, pad_mask=mask @@ -420,7 +458,8 @@ def fit_nn( logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if n_recovered > 0 else 0}') #full_mask = full_mask.bool().cuda() - embeddings = torch.nn.utils.rnn.pad_sequence(embeddings, batch_first=True, padding_value=PAD_TOKEN) + embeddings = torch.nn.utils.rnn.pad_sequence( + embeddings, batch_first=True, padding_value=PAD_TOKEN) embeddings_train = embeddings[:n_train,:] embeddings_test = embeddings[n_train:,:] logger.info(f'emb train: {type(embeddings_train)} of {type(embeddings_train[0])}, {embeddings_train.shape}') diff --git a/cuchem/requirements.txt b/cuchem/requirements.txt index 3ac133b2..77bdf034 100644 --- a/cuchem/requirements.txt +++ b/cuchem/requirements.txt @@ -25,4 +25,4 @@ plotly==4.9.0 pytest==6.2.2 umap-learn==0.5.1 grpcio -git+https://github.com/jrwnter/cddd.git@1.0 \ No newline at end of file +git+https://github.com/jrwnter/cddd.git@1.0 diff --git a/cuchem/startdash.py b/cuchem/startdash.py index 5cc5f5c8..8903b0a0 100755 --- a/cuchem/startdash.py +++ b/cuchem/startdash.py @@ -163,6 +163,7 @@ def cache(self): elif (args.cache_type == 'Embeddings'): prepocess_type = Embeddings + # TODO: when loading precomputed fingerprints, the radius and size should be specified chem_data = ChEmblData(fp_type=prepocess_type) chem_data.save_fingerprints( os.path.join(args.cache_directory, FINGER_PRINT_FILES), num_recs = args.n_mol, diff --git a/cuchem/tests/test_generative_wf.py b/cuchem/tests/test_generative_wf.py index 020d9d16..960ebc09 100644 --- a/cuchem/tests/test_generative_wf.py +++ b/cuchem/tests/test_generative_wf.py @@ -13,14 +13,14 @@ def interpolation(wf, num_points=20, force_unique=False): smiles = ['CHEMBL6328', 'CHEMBL415286'] # smiles = ['CHEMBL10454', 'CHEMBL10469'] - genreated_df = wf.interpolate_by_id(smiles, + generated_df = wf.interpolate_by_id(smiles, num_points=num_points, force_unique=force_unique) - genreated_df = MolecularStructureDecorator().decorate(genreated_df) - genreated_df = LipinskiRuleOfFiveDecorator().decorate(genreated_df) - logger.info(genreated_df.shape) - return genreated_df + generated_df = 
MolecularStructureDecorator().decorate(generated_df) + generated_df = LipinskiRuleOfFiveDecorator().decorate(generated_df) + logger.info(generated_df.shape) + return generated_df def test_cddd_interpolation(): From 55619b274790506bbad2b0ac19a06945fecf9b07 Mon Sep 17 00:00:00 2001 From: Venkatesh Mysore Date: Fri, 4 Feb 2022 05:50:34 -0800 Subject: [PATCH 16/27] ready for merge --- common/cuchemcommon/data/cluster_wf.py | 9 +- common/cuchemcommon/data/helper/chembldata.py | 50 ++++---- common/cuchemcommon/workflow.py | 6 +- cuchem/cuchem/decorator/__init__.py | 2 +- cuchem/cuchem/decorator/lipinski.py | 4 +- cuchem/cuchem/decorator/mol_structure.py | 4 +- cuchem/cuchem/interactive/chemvisualize.py | 109 +++++++----------- cuchem/cuchem/wf/cluster/gpukmeansumap.py | 24 ++-- .../cuchem/wf/generative/megatronmolbart.py | 42 ++----- cuchem/startdash.py | 13 ++- megamolbart/megamolbart/inference.py | 4 +- megamolbart/megamolbart/service.py | 8 +- 12 files changed, 122 insertions(+), 153 deletions(-) diff --git a/common/cuchemcommon/data/cluster_wf.py b/common/cuchemcommon/data/cluster_wf.py index 0f1cd993..ca3f50bc 100644 --- a/common/cuchemcommon/data/cluster_wf.py +++ b/common/cuchemcommon/data/cluster_wf.py @@ -6,6 +6,7 @@ import cudf import dask import dask_cudf +import sys from cuchemcommon.context import Context from cuchemcommon.data.helper.chembldata import BATCH_SIZE, ChEmblData from cuchemcommon.utils.singleton import Singleton @@ -43,6 +44,7 @@ def fetch_molecular_embedding(self, hdf_path = os.path.join(cache_subdir, FINGER_PRINT_FILES) else: cache_subdir = None + hdf_path = None if cache_directory and os.path.isdir(cache_subdir): # and (self.radius == radius) and (self.nBits == nBits): logger.info('Reading %d rows from %s...', n_molecules, hdf_path) mol_df = dask.dataframe.read_hdf(hdf_path, 'fingerprints') @@ -55,6 +57,7 @@ def fetch_molecular_embedding(self, self.radius = radius self.nBits = nBits logger.info(f'Reading molecules from database and computing fingerprints (radius={self.radius}, nBits={self.nBits})...') + sys.stdout.flush() mol_df = self.chem_data.fetch_mol_embedding( num_recs=n_molecules, batch_size=context.batch_size, @@ -63,9 +66,11 @@ def fetch_molecular_embedding(self, ) if cache_directory: os.mkdir(cache_subdir) + logger.info(f'Caching mol_df fingerprints to {hdf_path}') mol_df.to_hdf(hdf_path, 'fingerprints') - - logger.info(f'mol_df: {list(mol_df.columns)}')#\n{mol_df.head().compute()}') + else: + logging.info(f'cache_directory={cache_directory}, not caching!') + sys.stdout.flush() return mol_df def fetch_molecular_embedding_by_id(self, molecule_id: List, radius=2, nBits=512): diff --git a/common/cuchemcommon/data/helper/chembldata.py b/common/cuchemcommon/data/helper/chembldata.py index d7ff5124..a1502a63 100644 --- a/common/cuchemcommon/data/helper/chembldata.py +++ b/common/cuchemcommon/data/helper/chembldata.py @@ -3,10 +3,10 @@ import pandas import sqlite3 import logging - +import sys from typing import List -from dask import delayed, dataframe - +#from dask import delayed, dataframe +import dask from contextlib import closing from cuchemcommon.utils.singleton import Singleton from cuchemcommon.context import Context @@ -234,7 +234,7 @@ def _fetch_mol_embedding(self, # TODO: loading compounds from the database and computing fingerprints need to be separated # We may need to recompute fingerprints but not reload compounds. # TODO: user must be able to load compounds by specifying start and batch_size - logger.info('Fetching %d records starting %d...' 
% (batch_size, start)) + logger.info('\n_fetch_mol_embedding: Fetching %d records starting %d...' % (batch_size, start)) imp_cols = ['cp.' + col for col in IMP_PROPS] @@ -268,17 +268,15 @@ def _fetch_mol_embedding(self, # TODO: Discuss internally to find use or refactor this code to remove # model specific filtering df['transformed_smiles'] = df['canonical_smiles'] - # if smiles_transforms is not None: - # if len(smiles_transforms) > 0: - # for xf in smiles_transforms: - # df['transformed_smiles'] = df['transformed_smiles'].map(xf.transform) - # df.dropna(subset=['transformed_smiles'], axis=0, inplace=True) # Conversion to fingerprints or embeddings - # transformed_smiles = df['transformed_smiles'] transformation = self.fp_type(**transformation_kwargs) - cache_data, raw_fp_list = transformation.transform(df, return_fp=True) - #cache_data = transformation.transform(df) + + # This is where the int64 fingerprint columns are computed: + cache_data, raw_fp_list = transformation.transform( + df, + return_fp=True + ) return_df = pandas.DataFrame(cache_data) return_df = pandas.DataFrame( return_df, @@ -288,11 +286,12 @@ def _fetch_mol_embedding(self, return_df = df.merge(return_df, left_index=True, right_index=True) # TODO: expect to run into the issue that the fingerprint cannot be a cudf column # TODO: compute here so that chemvisualize does not have to - #return_df['fp'] = raw_fp_list + # The computed fingerprint columns are inserted into the df with the 'fp' prefix (to + # distinguish from PCA columns that are also numeric) for i, fp_col in enumerate(raw_fp_list): - return_df[f'fp{i}'] = fp_col + return_df[f'fp{i}'] = fp_col return_df.rename(columns={'molregno': 'id'}, inplace=True) - logger.info(f'_fetch_mol_embedding returning: {list(return_df.columns)}\n{return_df.head()}') + return return_df def fetch_mol_embedding(self, @@ -304,31 +303,32 @@ def fetch_mol_embedding(self, Returns compound properties and structure for the first N number of records in a dataframe. 
""" - logger.debug('Fetching properties for all molecules...') - if num_recs is None or num_recs < 0: num_recs = self.fetch_molecule_cnt() logger.info('num_recs %d', num_recs) logger.info('batch_size %d', batch_size) - meta_df = self._meta_df(**transformation_kwargs) dls = [] for start in range(0, num_recs, batch_size): bsize = min(num_recs - start, batch_size) - dl_data = delayed(self._fetch_mol_embedding)(start=start, - batch_size=bsize, - molregnos=molregnos, - **transformation_kwargs) + dl_data = dask.delayed(self._fetch_mol_embedding)( + start=start, + batch_size=bsize, + molregnos=molregnos, + **transformation_kwargs + ) dls.append(dl_data) + meta_df = self._meta_df( + columns=dls[0].columns.compute(), **transformation_kwargs) - return dataframe.from_delayed(dls, meta=meta_df) + return dask.dataframe.from_delayed(dls, meta=meta_df) def save_fingerprints(self, hdf_path='data/filter_*.h5', num_recs=None, batch_size=5000): """ Generates fingerprints for all ChEMBL ID's in the database """ - logger.debug('Fetching molecules from database for fingerprints...') - mol_df = self.fetch_mol_embedding(num_recs=num_recs, batch_size=batch_size) + logger.info(f'save_fingerprints writing {type(mol_df)} to {hdf_path}') mol_df.to_hdf(hdf_path, 'fingerprints') + diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py index 8219ff8c..15160ba1 100644 --- a/common/cuchemcommon/workflow.py +++ b/common/cuchemcommon/workflow.py @@ -100,12 +100,12 @@ def compute_unique_smiles(self, embeddings = interp_df['embeddings'] embeddings_dim = interp_df['embeddings_dim'] for index, row in interp_df.iterrows(): - smile_string = row['SMILES'] + smiles_string = row['SMILES'] try: - canonical_smile = CanonSmiles(smile_string) + canonical_smile = CanonSmiles(smiles_string) except: # If a SMILES cannot be canonicalized, just use the original - canonical_smile = smile_string + canonical_smile = smiles_string row['SMILES'] = canonical_smile diff --git a/cuchem/cuchem/decorator/__init__.py b/cuchem/cuchem/decorator/__init__.py index 37137aae..25f4354d 100644 --- a/cuchem/cuchem/decorator/__init__.py +++ b/cuchem/cuchem/decorator/__init__.py @@ -8,7 +8,7 @@ class BaseMolPropertyDecorator(object): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_cols: int = 0): NotImplemented diff --git a/cuchem/cuchem/decorator/lipinski.py b/cuchem/cuchem/decorator/lipinski.py index 3588df1c..6500e41c 100644 --- a/cuchem/cuchem/decorator/lipinski.py +++ b/cuchem/cuchem/decorator/lipinski.py @@ -21,7 +21,7 @@ class LipinskiRuleOfFiveDecorator(BaseMolPropertyDecorator): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_cols: int = 0): mol_wt = [] mol_logp = [] @@ -33,7 +33,7 @@ def decorate(self, for idx in range(df.shape[0]): - smiles = df.iat[idx, smile_cols] + smiles = df.iat[idx, smiles_cols] m = Chem.MolFromSmiles(smiles) if m is None: diff --git a/cuchem/cuchem/decorator/mol_structure.py b/cuchem/cuchem/decorator/mol_structure.py index 1720d1e7..d96bde34 100644 --- a/cuchem/cuchem/decorator/mol_structure.py +++ b/cuchem/cuchem/decorator/mol_structure.py @@ -17,12 +17,12 @@ class MolecularStructureDecorator(BaseMolPropertyDecorator): def decorate(self, df: Union[cudf.DataFrame, pandas.DataFrame], - smile_cols: int = 0): + smiles_col: int = 0): mol_struct = [] for idx in range(df.shape[0]): - smiles = df.iat[idx, smile_cols] + smiles = df.iat[idx, smiles_col] try: m = Chem.MolFromSmiles(smiles) drawer = 
Draw.rdMolDraw2D.MolDraw2DCairo(500, 125) diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py index 442012fa..4284ec15 100644 --- a/cuchem/cuchem/interactive/chemvisualize.py +++ b/cuchem/cuchem/interactive/chemvisualize.py @@ -136,7 +136,7 @@ def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512): self.cluster_wf_cls = 'cuchem.wf.cluster.gpukmeansumap.GpuKmeansUmapHybrid' self.generative_wf_cls = 'cuchem.wf.generative.MegatronMolBART' - self.fp_df = None # all fingerprints of all ChemBl compounds and their IDs as a pandas dataframe + self.fp_df = None # all fingerprints of all ChemBl compounds and their IDs as a pandas dataframe for use in compound similarity search on the CPU self.fingerprint_radius = fingerprint_radius self.fingerprint_nBits = fingerprint_nBits @@ -319,7 +319,6 @@ def handle_add_candidate(self, bt_add_candidate, def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candidates): comp_id, event_type = self._fetch_event_data() - logger.info(f'handle_analoguing_candidate({bt_analoguing_candidate}, {analoguing_candidates}): cid={comp_id}, et={event_type}, dash.callback_context.triggered[0]["value"]={ dash.callback_context.triggered[0]["value"]}') if event_type != 'n_clicks' or dash.callback_context.triggered[0]['value'] == 0: raise dash.exceptions.PreventUpdate @@ -333,7 +332,6 @@ def handle_analoguing_candidate(self, bt_analoguing_candidate, analoguing_candid if selected_chembl_id not in selected_candidates: selected_candidates.append(selected_chembl_id) - logger.info(f'comp_detail={comp_detail}, selected_candidates={selected_candidates}') return ','.join(selected_candidates) def _fetch_event_data(self): @@ -369,8 +367,6 @@ def handle_generation( self.generative_wf_cls = sl_generative_wf wf_class = locate(self.generative_wf_cls) generative_wf = wf_class() - logger.info(f'locate({self.generative_wf_cls}) = {wf_class}, rd_generation_type={rd_generation_type}') - sys.stdout.flush() n2generate = int(n2generate) scaled_radius = float(scaled_radius) @@ -401,7 +397,6 @@ def handle_generation( if show_generated_mol is None: show_generated_mol = 0 show_generated_mol += 1 - # Add other useful attributes to be added for rendering self.generated_df = MolecularStructureDecorator().decorate(self.generated_df) self.generated_df = LipinskiRuleOfFiveDecorator().decorate(self.generated_df) @@ -419,7 +414,7 @@ def handle_generation( self.generated_df = pd.concat([self.generated_df, df_fp], axis=1) df_fp=cudf.from_pandas(df_fp) df_fp['id'] = list(map(str, self.generated_df['id'])) - df_fp['cluster'] = list(map(int, self.generated_df['Generated'])) + df_fp['cluster'] = list(map(int, self.generated_df['Generated'])) # This controls the color n_generated = self.generated_df['Generated'].sum() if n_generated < len(self.generated_df) / 2: # Highlight the generated compounds @@ -436,65 +431,50 @@ def handle_generation( df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding) prop_series['cluster'] = cluster_col n_molecules, n_obs = df_embedding.compute().shape # needed? 
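# Aside on the projection step in this handler: the generated molecules are not
# re-clustered from scratch; their fingerprints are pushed through the PCA and
# UMAP models already fitted on the full ChEMBL set, so they land on the
# existing map next to their neighbors. A minimal sketch of that pattern,
# assuming fitted cuml models as elsewhere in this module (the sklearn and
# umap-learn APIs behave analogously):
def project_onto_existing_map(pca, umap_model, fp_matrix):
    # fp_matrix: (n_new, nBits) fingerprint matrix of the generated molecules
    reduced = pca.transform(fp_matrix) if pca is not None else fp_matrix
    return umap_model.transform(reduced)  # reuses the fitted UMAP, no refit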
- logger.info( - f'cluster_wf: {self.cluster_wf}, self.cluster_wf.pca={self.cluster_wf.pca}, isinstance(self.cluster_wf.pca, cuml.PCA)={isinstance(self.cluster_wf.pca, cuml.PCA)}'\ - f'\ndf_embedding: {type(df_embedding)}, isinstance(df_embedding, dask_cudf.DataFrame)={isinstance(df_embedding, dask_cudf.DataFrame)}\n{df_embedding.head()}') - sys.stdout.flush() #if hasattr(df_embedding, 'compute'): # df_embedding = df_embedding.compute() - # logger.info(f'df_embedding after compute(): {type(df_embedding)}\n{df_embedding.head()}') - # sys.stdout.flush() if isinstance(self.cluster_wf.pca, cuml.PCA) and isinstance(df_embedding, dask_cudf.DataFrame): + # Trying to accommodate the GpuKmeansUmapHybrid workflow df_embedding = df_embedding.compute() df_embedding = self.cluster_wf.pca.transform(df_embedding) - df_embedding = df_embedding.persist() # TODO: wait after this? - #X_train = df_embedding.compute() # needed? + if hasattr(df_embedding, 'persist'): + df_embedding = df_embedding.persist() + wait(df_embedding) Xt = self.cluster_wf.umap.transform(df_embedding) df_embedding['x'] = Xt[0] df_embedding['y'] = Xt[1] for col in prop_series.keys(): - #logger.info(f'col={col}') sys.stdout.flush() df_embedding[col] = prop_series[col]#.compute() fig, northstar_cluster = self.create_graph(df_embedding, north_stars=north_stars) - + # Create Table header table_headers = [] - columns = [ + all_columns = self.generated_df.columns.to_list() + columns_in_table = [ col_name for col_name in self.generated_df.columns.to_list() - if not isinstance(col_name, int) + if (not isinstance(col_name, int)) and (not col_name.startswith('fp')) and not ('embeddings' in col_name) ] - #columns = self.generated_df.columns.to_list() - ignore_columns = ['embeddings', 'embeddings_dim'] - for column in columns: - if column in ignore_columns: - continue + # TODO: factor this into a separate function: build table from dataframe + for column in columns_in_table: table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) - prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] - invalid_mol_cnt = 0 for row_idx in range(self.generated_df.shape[0]): td = [] - try: - col_pos = columns.index('Chemical Structure') + col_pos = all_columns.index('Chemical Structure') col_data = self.generated_df.iat[row_idx, col_pos] - - if 'value' in col_data and col_data['value'] == MolecularStructureDecorator.ERROR_VALUE: - invalid_mol_cnt += 1 + if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit': continue except ValueError: pass - - for col_id in range(len(columns)): + for col_name in columns_in_table: + col_id = all_columns.index(col_name) col_data = self.generated_df.iat[row_idx, col_id] - #if columns[col_id] in ignore_columns: - # continue - col_level = 'info' if isinstance(col_data, dict): col_value = col_data['value'] @@ -502,24 +482,12 @@ def handle_generation( col_level = col_data['level'] else: col_value = col_data - if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): td.append(html.Td(html.Img(src=col_value))) else: - td.append(html.Td(str(col_value), - style={'maxWidth': '300px', - 'wordWrap': 'break-word', - 'text-align': 'center', - 'color': LEVEL_TO_STYLE[col_level]['color'] - } - )) - - #prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) + td.append( + html.Td(str(col_value), style=LEVEL_TO_STYLE[col_level].update({'maxWidth': '100px', 'wordWrap':'break-word'}))) prop_recs.append(html.Tr(td)) - - msg_generated_molecules = 
'' - if invalid_mol_cnt > 0: - msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' return {'display': 'inline'}, fig, html.Table( prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'} @@ -533,11 +501,9 @@ def handle_fitting( fit_nn_max_epochs, fit_nn_learning_rate, fit_nn_weight_decay, fit_nn_batch_size ): comp_id, event_type = self._fetch_event_data() - #logger.info(f'handle_fitting: comp_id={comp_id}, event_type={event_type}') sys.stdout.flush() if (comp_id != 'bt_fit') or (event_type != 'n_clicks'): return dash.no_update, dash.no_update - #logger.info(f'comp_id={comp_id}, event_type={event_type}') self.featurizing_wf_cls = sl_featurizing_wf wf_class = locate(self.featurizing_wf_cls) featurizing_wf = wf_class() @@ -555,7 +521,6 @@ def handle_fitting( weight_decay=float(fit_nn_weight_decay), batch_size=int(fit_nn_batch_size) ) - #logger.info(df.head()) sys.stdout.flush() fig = self.create_plot(df, fit_nn_compound_property) return {'display': 'inline'}, fig @@ -566,7 +531,6 @@ def handle_analoguing( ): comp_id, event_type = self._fetch_event_data() - #logger.info(f'handle_analoguing: mol={analoguing_mol_id}, n={analoguing_n_analogues}, th={analoguing_threshold}, type={analoguing_type}') sys.stdout.flush() if (comp_id != 'bt_analoguing') or (event_type != 'n_clicks'): return dash.no_update, dash.no_update @@ -576,18 +540,20 @@ def handle_analoguing( smiles_column = 'canonical_smiles' else: smiles_columns = 'SMILES' - logger.info(f'self.cluster_wf.df_embedding: {self.cluster_wf.df_embedding}\n{self.cluster_wf.df_embedding.head()}') - if self.fp_df is None: # CPU-based workflow, to be deprecated - logger.info(f'self.fp_df not set, computing on CPU') + + if self.fp_df is None: + # Note: CPU-based workflow is no longer needed, can be removed + logger.info(f'CPU-based similarity search: self.fp_df not set') + # First move the smiles to the CPU: if isinstance(self.cluster_wf.df_embedding, dask_cudf.DataFrame): smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].map_partitions(cudf.DataFrame.to_pandas) elif isinstance(self.cluster_wf.df_embedding, cudf.DataFrame): smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']].to_pandas() else: smiles_df = self.cluster_wf.df_embedding[[smiles_column, 'id']] - + # Then compute fingerprints on the CPU using RDKit: if 'fp' not in self.cluster_wf.df_embedding.columns: - logger.info(f'Computing fingerprints...') + logger.info(f'Computing fingerprints with radius={self.fingerprint_radius}, nBits={self.fingerprint_nBits}...') _, v = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( smiles_df, smiles_column=smiles_column, return_fp=True, raw=True) else: @@ -596,7 +562,7 @@ def handle_analoguing( v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas()) else: v = list(self.cluster_wf.df_embedding['fp']) - + # This pandas dataframe has the fingerprints in the fp column: self.fp_df = pd.DataFrame({ 'fp': v, smiles_column: smiles_df[smiles_column], #list(self.cluster_wf.df_embedding[smiles_column].compute().to_pandas()), #smiles_df[smiles_column], @@ -608,14 +574,14 @@ def handle_analoguing( wait(self.cluster_wf.df_embedding) if 'pc' not in self.cluster_wf.df_embedding.columns: - # Pre-computing the popcounts for all compounds in the database: + # Pre-computing the popcounts for all compounds in the database for use in GPU-based similarity search: t0 = time.time() self.cluster_wf.df_embedding['op_col'] = 0 
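# What the 'pc' precomputation buys: with per-compound popcounts pc(A), pc(B)
# and the popcount of the bitwise AND against a query, the Tanimoto similarity
# is |A & B| / (pc(A) + pc(B) - |A & B|). A pure-Python reference for the same
# arithmetic over int64-packed fingerprints (the apply_rows kernels in this
# handler compute the popcount and intersection pieces of this on the GPU):
def tanimoto_packed(fp_a_ints, fp_b_ints):
    n_and = sum(bin(a & b).count('1') for a, b in zip(fp_a_ints, fp_b_ints))
    pc_a = sum(bin(a).count('1') for a in fp_a_ints)
    pc_b = sum(bin(b).count('1') for b in fp_b_ints)
    union = pc_a + pc_b - n_and
    return n_and / union if union else 0.0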
self.cluster_wf.df_embedding['pc'] = 0 - + n_fp_cols = 0 for col in self.cluster_wf.df_embedding.columns: if (type(col) == str) and col.startswith('fp') and (len(col) > 2): - logger.info(f'{col}: {self.cluster_wf.df_embedding[col]}') + n_fp_cols += 1 self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {}) # More complex syntax was not necessary: @@ -625,7 +591,7 @@ def handle_analoguing( self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() wait(self.cluster_wf.df_embedding) t1 = time.time() - logger.info(f'Time to compute partial popcounts: {t1 - t0}') + logger.info(f'Time to compute partial popcounts ({n_fp_cols} fp columns): {t1 - t0}:\n{self.cluster_wf.df_embedding["pc"].head()}') # Prepare the query compound: logger.info(f'analoguing_mol_id={analoguing_mol_id}') @@ -649,15 +615,11 @@ def handle_analoguing( #self.cluster_wf.df_embedding['op_col'] = 0 self.cluster_wf.df_embedding['n_intersection'] = 0 t4 = time.time() - logger.info(f'self.cluster_wf.df_embedding: {type(self.cluster_wf.df_embedding)}, {list(self.cluster_wf.df_embedding.columns)}\n{self.cluster_wf.df_embedding.head()}') - sys.stdout.flush() for i in range(0, self.fingerprint_nBits, INTEGER_NBITS): fp_num = i // INTEGER_NBITS - logger.info(f'i={i}, fp_num={fp_num}') self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( intersection_wrapper, incols={f'fp{fp_num}': 'fp_int_col'}, outcols={'op_col': int}, kwargs={'query_fp_int': query_fp_ints[fp_num]}) - #logging.info(f'{i}:\n{self.cluster_wf.df_embedding.head()}') #self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() #wait(self.cluster_wf.df_embedding) self.cluster_wf.df_embedding['n_intersection'] += self.cluster_wf.df_embedding['op_col'] @@ -941,7 +903,7 @@ def create_graph(self, ldf, color_col='cluster', north_stars=None, gradient_prop # Compute size of northstar and normal points df_shape = df_size.copy() - df_size = (df_size * 2) + DOT_SIZE + df_size = (df_size * 18) + DOT_SIZE df_shape = df_shape * 2 x_data = cdf['x'] y_data = cdf['y'] @@ -1145,6 +1107,8 @@ def constuct_layout(self): html.Div([ dcc.Markdown("""**Molecule(s) of Interest**"""), + dcc.Markdown(children="""Click *Highlight* to populate this list""", + style={'marginTop': 18}), dcc.Markdown("Please enter ChEMBL ID(s) separated by commas."), html.Div(className='row', children=[ @@ -1154,6 +1118,7 @@ def constuct_layout(self): className='three columns'), ], style={'marginLeft': 0, 'marginBottom': 18, }), + dcc.Markdown("For fingerprint changes to take effect, first *Apply* the *GPU KMeans-UMAP* Workflow, then *Recluster*"), html.Div(className='row', children=[ dcc.Markdown("Fingerprint Radius", style={'marginTop': 12,}), dcc.Input(id='fingerprint_radius', value=2), @@ -1278,6 +1243,8 @@ def constuct_layout(self): dcc.Markdown(children="""**Please Select Two**""", id="mk_selection_msg", style={'marginTop': 18}), + dcc.Markdown(children="""Click *Add* to populate this list""", + style={'marginTop': 18}), dcc.Checklist( id='ckl_candidate_mol_id', options=[], @@ -1397,6 +1364,8 @@ def constuct_layout(self): id="analoguing_msg", style={'marginTop': 18} ), + dcc.Markdown(children="""Click *Analogue* to populate this list""", + style={'marginTop': 18}), dcc.Checklist( id='ckl_analoguing_mol_id', options=[], @@ -1690,7 +1659,7 @@ def handle_re_cluster( north_star, radius=int(fingerprint_radius), nBits=int(fingerprint_nBits)) recluster_data = len(missing_mols) > 0 
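# Each missing molecule of interest is fingerprinted with the radius/nBits
# currently set in the UI before being appended to the embedding. Per SMILES,
# the computation reduces to the following sketch (RDKit assumed, mirroring
# the MorganFingerprint transformation used elsewhere in this repo):
from rdkit import Chem
from rdkit.Chem import AllChem

def morgan_bits(smiles: str, radius: int = 2, n_bits: int = 512):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None  # unparsable SMILES are skipped upstream
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return fp.ToBitString()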
logger.info("%d missing molecules added...", len(missing_mols)) - logger.debug("Missing molecules werew %s", missing_mols) + logger.debug("Missing molecules were %s", missing_mols) moi_molregno = " ,".join(list(map(str, molregnos))) if refresh_moi_prop_table is None: diff --git a/cuchem/cuchem/wf/cluster/gpukmeansumap.py b/cuchem/cuchem/wf/cluster/gpukmeansumap.py index 395dd419..158d0f8d 100644 --- a/cuchem/cuchem/wf/cluster/gpukmeansumap.py +++ b/cuchem/cuchem/wf/cluster/gpukmeansumap.py @@ -22,6 +22,8 @@ import cuml import dask import dask_cudf +import sys +from dask.distributed import wait from cuchemcommon.context import Context from cuchemcommon.data import ClusterWfDAO from cuchemcommon.data.cluster_wf import ChemblClusterWfDao @@ -62,7 +64,6 @@ def _(embedding, n_pca, reuse_umap, reuse_pca, self): @_gpu_cluster_wrapper.register(dask_cudf.core.DataFrame) def _(embedding, n_pca, reuse_umap, reuse_pca, self): embedding = embedding.persist() - logger.info(f'_gpu_cluster_wrapper: self={self}, {type(embedding)}, {n_pca}, {reuse_umap}, {reuse_pca}') return self._cluster(embedding, n_pca, reuse_umap, reuse_pca) @@ -182,26 +183,33 @@ def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=True): return embedding - def cluster(self, df_mol_embedding=None, reuse_umap=False, reuse_pca=True, fingerprint_radius=2, fingerprint_nBits=512): + def cluster( + self, + df_mol_embedding=None, + reuse_umap=False, + reuse_pca=True, + fingerprint_radius=2, + fingerprint_nBits=512 + ): - #logger.info("Executing GPU workflow...") logger.info(f"GpuKmeansUmap.cluster(radius={fingerprint_radius}, nBits={fingerprint_nBits}), df_mol_embedding={df_mol_embedding}") - + sys.stdout.flush() if (df_mol_embedding is None) or (fingerprint_radius != self.fingerprint_radius) or (fingerprint_nBits != self.fingerprint_nBits): self.n_molecules = self.context.n_molecule self.dao = ChemblClusterWfDao( MorganFingerprint, radius=fingerprint_radius, nBits=fingerprint_nBits) self.fingerprint_radius = fingerprint_radius self.fingerprint_nBits = fingerprint_nBits - logger.info(f'dao={self.dao}') + logger.info(f'dao={self.dao}, getting df_mol_embedding...') df_mol_embedding = self.dao.fetch_molecular_embedding( self.n_molecules, cache_directory=self.context.cache_directory, radius=fingerprint_radius, nBits=fingerprint_nBits - ) - + ) df_mol_embedding = df_mol_embedding.persist() + wait(df_mol_embedding) + self.df_embedding = _gpu_cluster_wrapper( df_mol_embedding, self.pca_comps, @@ -278,8 +286,6 @@ def add_molecules(self, chemblids: List, radius=2, nBits=512): if hasattr(self.df_embedding, 'compute'): self.df_embedding = self.df_embedding.compute() - logger.info(self.df_embedding.shape) - return chem_mol_map, molregnos, self.df_embedding diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index 075295de..44aeba5e 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -71,7 +71,7 @@ def embedding_to_smiles(self, dim=dim, pad_mask=pad_mask) - return self.stub.EmbeddingToSmiles(spec) + return self.stub.EmbeddingToSmiles(spec) # Not yet implemented in the main branch but supported elsewhere def find_similars_smiles(self, smiles: str, @@ -80,6 +80,11 @@ def find_similars_smiles(self, force_unique=False, sanitize=True, compound_id=None): + if isinstance(compound_id, list): + # Sometimes calling routine may send a list of length one containing the compound ID + if len(compound_id) > 1: + 
logger.info(f'find_similars_smiles received {compound_id}, generating neighbors only for first compound!') + compound_id = compound_id[0] spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -106,8 +111,6 @@ def find_similars_smiles(self, for i in range(len(generatedSmiles) - 1) ], }) - #generated_df['Generated'].iat[0] = False - return generated_df def interpolate_smiles( @@ -171,11 +174,7 @@ def extrapolate_from_cluster(self, radius = self._compute_radius(scaled_radius) # TO DO: User must be able to extrapolate directly from smiles in the table; # these may themselves be generated compounds without any chemblid. - logger.info(f'cluster_id={cluster_id}, compound_property={compound_property}, compounds_df: {len(compounds_df)}, {type(compounds_df)}') - logger.info(compounds_df.head()) - logger.info(f'{list(compounds_df.columns)}, {list(compounds_df.dtypes)}') df_cluster = compounds_df[ compounds_df['cluster'] == int(cluster_id) ].dropna().reset_index(drop=True).compute() - logger.info(f'df_cluster: {len(df_cluster)}\n{df_cluster.head()}') if 'transformed_smiles' in df_cluster: smiles_col = 'transformed_smiles' elif 'SMILES' in df_cluster: @@ -185,7 +184,7 @@ def extrapolate_from_cluster(self, else: logger.info(list(df_cluster.columns)) logger.info(df_cluster.head()) - raise Error('No smiles column') + raise RuntimeError('No smiles column') smiles_col = None smiles_list = df_cluster[smiles_col].to_array() return self.extrapolate_from_smiles(smiles_list, @@ -212,14 +211,7 @@ def _get_embedding_direction(self, logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)}, prop: {min(compound_property_vals)} - {max(compound_property_vals)}') n_data = compound_property_vals.shape[0] n_dimensions = embedding_list[0].shape[0] - try: - reg = Lasso()#alpha=1.0/n_dimensions)#, tol=1.0/n_dimensions) - #reg = Ridge()#alpha=1.0/n_dimensions, solver='cd') # default is 'eig' - reg = reg.fit(embedding_list, compound_property_vals) - except Exception as e: - logger.info(f'Ridge regression encountered {e}, trying Lasso regression') - reg = Lasso()#alpha=1.0/n_dimensions) - reg = reg.fit(embedding_list, compound_property_vals) + reg = reg.fit(embedding_list, compound_property_vals) n_zero_coefs = len([x for x in reg.coef_ if x == 0.0]) zero_coef_indices = [i for i, x in enumerate(reg.coef_) if x != 0.0] logger.info(f'coef: {n_zero_coefs} / {len(reg.coef_)} coefficients are zero (in some positions between {min(zero_coef_indices)} and {max(zero_coef_indices)});'\ @@ -329,8 +321,6 @@ def extrapolate_from_smiles(self, logger.info(f'direction: {type(direction)}, shape={direction.shape}, {direction}\n, embeddings: {type(embeddings)}, shape: {embeddings.shape}, embeddings[0]={embeddings[0]}') for step_num in range(1, 1 + num_points): - #noise = cp.random.normal(loc=0.0, scale=emb_std, size=emb_std.shape) - #logger.info(f'noise: {type(noise)}, {noise.shape}; dir: {type(direction)}, {direction.shape}') direction_sampled = cp.random.normal(loc=direction, scale=emb_std, size=emb_std.shape) #direction + noise logger.info(f'step ({type(step_num)} * {type(diff_size)} * {type(step_size)} * {type(direction_sampled)}') step = float(step_num * diff_size * step_size) * direction_sampled @@ -340,8 +330,6 @@ def extrapolate_from_smiles(self, smiles_gen_list = [] ids_interp_list = [] for i in range(len(extrap_embeddings)): - #diff = extrap_embeddings[i] - embeddings[i] - #logger.info(f'{i}: 
diff: {diff.argmin()}: {min(diff)} to {diff.argmax()}: {max(diff)}') extrap_embedding = list(extrap_embeddings[i,:]) logger.info(f'embedding: {type(extrap_embedding)}, {len(extrap_embeddings)};'\ f' dim: {type(emb_shape)}, {len(emb_shape)}; pad_mask={type(full_mask)}, {len(full_mask)}') @@ -429,10 +417,8 @@ def fit_nn( emb = result.embedding mask = result.pad_mask dim = result.dim - logger.info(f'{i}: smiles={smiles}, emd: {len(emb)}, {emb[:5]}; dim={dim}, mask: {len(mask)}') - emb_shape = result.dim #emb[:2] - #emb = emb[2:] - + #logger.info(f'{i}: smiles={smiles}, emd: {len(emb)}, {emb[:5]}; dim={dim}, mask: {len(mask)}') + emb_shape = result.dim if debug: spec = EmbeddingList( embedding=emb, @@ -440,7 +426,6 @@ def fit_nn( pad_mask=mask ) generated_mols = self.stub.EmbeddingToSmiles(spec).generatedSmiles - #generated_mols = self.inverse_transform([emb.reshape(emb_shape)], k=1, mem_pad_mask=mask.bool().cuda()) if len(generated_mols) > 0: m = MolFromSmiles(generated_mols[0]) if m is not None: @@ -449,15 +434,10 @@ def fit_nn( logger.info(f'{n_recovered}/ {i+1}: {smiles} ({len(smiles)} chars)--> emb:{emb_shape}, mask:{len(mask)} --> {generated_mols} (tani={tani:.2f})') avg_tani += tani embeddings.append(torch.tensor(emb, device=self.device)) #emb.detach().reshape(-1)) #torch tensor - #if full_mask is None: - # full_mask = mask - # emb_shape = emb.shape - #else: - # full_mask &= mask + if debug: logger.info(f'{n_recovered} / {len(smiles_list)} compounds yielded something after embedding, with avg tani = {avg_tani / n_recovered if n_recovered > 0 else 0}') - #full_mask = full_mask.bool().cuda() embeddings = torch.nn.utils.rnn.pad_sequence( embeddings, batch_first=True, padding_value=PAD_TOKEN) embeddings_train = embeddings[:n_train,:] diff --git a/cuchem/startdash.py b/cuchem/startdash.py index 8903b0a0..256b6ab0 100755 --- a/cuchem/startdash.py +++ b/cuchem/startdash.py @@ -164,9 +164,18 @@ def cache(self): prepocess_type = Embeddings # TODO: when loading precomputed fingerprints, the radius and size should be specified + # For now, we are hard-coding this information: + nBits = 512 + radius = 2 chem_data = ChEmblData(fp_type=prepocess_type) + subdir = f'{args.cache_directory}/fp_r{radius}_n{nBits}' + if not os.path.isdir(subdir): + os.mkdir(subdir) + + logging.info(f'client: saving fingerprints') + # This will trigger a reread if fingerprints are not found in the cache directory! 
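# For illustration, the cache layout produced by the hard-coded values above
# keeps one subdirectory per fingerprint setting, so changing radius/nBits can
# never silently reuse a stale cache. fp_cache_dir is a hypothetical helper
# named here for clarity, not part of this patch:
import os

def fp_cache_dir(cache_directory: str, radius: int, n_bits: int) -> str:
    subdir = os.path.join(cache_directory, f'fp_r{radius}_n{n_bits}')
    os.makedirs(subdir, exist_ok=True)  # e.g. data/cache/fp_r2_n512
    return subdir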
chem_data.save_fingerprints( - os.path.join(args.cache_directory, FINGER_PRINT_FILES), num_recs = args.n_mol, + os.path.join(subdir, FINGER_PRINT_FILES), num_recs = args.n_mol, batch_size=args.batch_size) logger.info('Fingerprint generated in (hh:mm:ss.ms) {}'.format( @@ -341,8 +350,8 @@ def analyze(self): pca_comps=args.pca_comps, n_clusters=args.num_clusters) + # Cluster() will trigger a read if not found in the cache dir: mol_df = workflow.cluster() - if args.benchmark: workflow.compute_qa_matric() if not args.cpu: diff --git a/megamolbart/megamolbart/inference.py b/megamolbart/megamolbart/inference.py index 4ea51b73..fcf6f368 100644 --- a/megamolbart/megamolbart/inference.py +++ b/megamolbart/megamolbart/inference.py @@ -342,6 +342,6 @@ def interpolate_smiles(self, result_df.append(interp_df) result_df = pd.concat(result_df) - smile_list = list(result_df['SMILES']) + smiles_list = list(result_df['SMILES']) - return result_df, smile_list + return result_df, smiles_list diff --git a/megamolbart/megamolbart/service.py b/megamolbart/megamolbart/service.py index a4b160d9..a22db381 100644 --- a/megamolbart/megamolbart/service.py +++ b/megamolbart/megamolbart/service.py @@ -26,9 +26,9 @@ def __init__(self, *args, **kwargs): # TODO how to handle length overrun for batch processing --> see also MegaMolBART.load_model in inference.py def SmilesToEmbedding(self, spec, context): - smile_str = ''.join(spec.smiles) + smiles_str = ''.join(spec.smiles) - embedding, pad_mask = self.megamolbart.smiles2embedding(smile_str, + embedding, pad_mask = self.megamolbart.smiles2embedding(smiles_str, pad_length=spec.padding) dim = embedding.shape embedding = embedding.flatten().tolist() @@ -53,10 +53,10 @@ def EmbeddingToSmiles(self, embedding_spec, context): def FindSimilars(self, spec, context): - smile_str = ''.join(spec.smiles) + smiles_str = ''.join(spec.smiles) generated_df = self.megamolbart.find_similars_smiles( - smile_str, + smiles_str, num_requested=spec.numRequested, scaled_radius=spec.radius, force_unique=False) From 3e3271553cc73470e1619331e41d91f02f3e023b Mon Sep 17 00:00:00 2001 From: Venkatesh Mysore Date: Thu, 24 Feb 2022 10:02:49 -0800 Subject: [PATCH 17/27] minor --- launch.sh | 22 ---------------------- 1 file changed, 22 deletions(-) diff --git a/launch.sh b/launch.sh index 849e855c..f8be2bb1 100755 --- a/launch.sh +++ b/launch.sh @@ -94,15 +94,9 @@ build() { IFS=':' read -ra MEGAMOLBART_CONT_BASENAME <<< ${MEGAMOLBART_CONT} echo "Building ${MEGAMOLBART_CONT_BASENAME}..." docker build --network host \ -<<<<<<< HEAD - -t ${MEGAMOLBART_CONT_BASENAME}:latest \ - -t ${MEGAMOLBART_CONT} \ - --build-arg SOURCE_CONTAINER=${MEGAMOLBART_TRAINING_CONT} \ -======= --build-arg GITHUB_ACCESS_TOKEN=${GITHUB_ACCESS_TOKEN} \ -t ${MEGAMOLBART_CONT_BASENAME}:latest \ -t ${MEGAMOLBART_CONT} \ ->>>>>>> dev -f Dockerfile.megamolbart . 
fi @@ -166,14 +160,9 @@ dev() { else DOCKER_CMD="${DOCKER_CMD} --privileged" DOCKER_CMD="${DOCKER_CMD} -v ${PROJECT_PATH}/chemportal/config:/etc/nvidia/cuChem/" -<<<<<<< HEAD - DOCKER_CMD="${DOCKER_CMD} -v /var/run/docker.sock:/var/run/docker.sock" - DOCKER_CMD="${DOCKER_CMD} -e PYTHONPATH=${DEV_PYTHONPATH}:" -======= DOCKER_CMD="${DOCKER_CMD} -v ${CONTENT_PATH}/logs/:/logs" DOCKER_CMD="${DOCKER_CMD} -v /var/run/docker.sock:/var/run/docker.sock" DOCKER_CMD="${DOCKER_CMD} -e PYTHONPATH=${PYTHONPATH_CUCHEM}:/workspace/benchmark" ->>>>>>> dev DOCKER_CMD="${DOCKER_CMD} -w /workspace/cuchem/" fi @@ -192,11 +181,7 @@ start() { validate_docker if [[ -d "/opt/nvidia/cheminfomatics" ]]; then -<<<<<<< HEAD - PYTHONPATH=/opt/nvidia/cheminfomatics/common/generated:/opt/nvidia/cheminfomatics/common:/opt/nvidia/cheminfomatics/cuchem:/opt/nvidia/cheminfomatics/chemportal -======= PYTHONPATH=${PYTHONPATH_CUCHEM} ->>>>>>> dev dbSetup "${DATA_MOUNT_PATH}" cd ${CHEMINFO_DIR}/cuchem/; python3 startdash.py analyze $@ else @@ -215,13 +200,6 @@ start() { export PYTHONPATH_MEGAMOLBART="${CHEMINFO_DIR}/common:/${CHEMINFO_DIR}/common/generated/" export NGINX_CONFIG=${PROJECT_PATH}/setup/config/nginx.conf -<<<<<<< HEAD - export ADDITIONAL_PARAM="$@" - export CUCHEM_PATH=/workspace - export MEGAMOLBART_PATH=/workspace/megamolbart - export WORKSPACE_DIR='.' -======= ->>>>>>> dev docker-compose --env-file .env \ -f setup/docker_compose.yml \ --project-directory . \ From d167e8a0cd5842c792130bd615f5cf38e1aa7e3b Mon Sep 17 00:00:00 2001 From: Venkatesh Mysore Date: Thu, 24 Feb 2022 12:35:07 -0800 Subject: [PATCH 18/27] minor --- common/cuchemcommon/fingerprint.py | 42 +++++++++++++++++++++++++++++- 1 file changed, 41 insertions(+), 1 deletion(-) diff --git a/common/cuchemcommon/fingerprint.py b/common/cuchemcommon/fingerprint.py index 5c86fbda..1dc25d65 100644 --- a/common/cuchemcommon/fingerprint.py +++ b/common/cuchemcommon/fingerprint.py @@ -8,6 +8,7 @@ from rdkit.Chem import AllChem from math import ceil +INTEGER_NBITS = 64 # Maximum number of bits in an integer column in a cudf Series logger = logging.getLogger(__name__) @@ -57,7 +58,7 @@ def transform_single(self, smiles): fp = cupy.asarray(fp) return fp - def transform(self, data, col_name='transformed_smiles', return_fp=False, raw=False): + def transform_new(self, data, col_name='transformed_smiles', return_fp=False, raw=False): """Single threaded processing of list""" data = data[col_name] fp_array = [] @@ -84,5 +85,44 @@ def transform(self, data, col_name='transformed_smiles', return_fp=False, raw=Fa return fp_array, np.asarray(raw_fp_array, dtype=np.uint64) return fp_array + def transform( + self, + data, + smiles_column = 'transformed_smiles', + return_fp = False, # When set to True, an additional value is returned determined by the raw parameter + raw = False # The RDKit fingerprint object is returned when raw = True, and the int64 fingerprint columns are returned when raw = False + ): + data = data[smiles_column] + fp_array = [] + self.n_fp_integers = ceil(self.kwargs['nBits'] / INTEGER_NBITS) + if raw: + raw_fp_array = [] + else: + raw_fp_array = [[] for i in range(0, self.kwargs['nBits'], INTEGER_NBITS)] + for mol_smiles in data: + m = Chem.MolFromSmiles(mol_smiles) + if not m: + fp = None + fp_bs = '0' * self.kwargs['nBits'] + else: + fp = self.func(m, **self.kwargs) + fp_bs = fp.ToBitString() + fp_array.append(cupy.asarray(np.frombuffer(fp_bs.encode(), 'u1') - ord('0'))) + if return_fp: + if raw: + raw_fp_array.append(fp) + else: + for i in range(0, 
self.kwargs['nBits'], INTEGER_NBITS): + raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2)) + #fp_array = np.asarray(fp_array) + fp_array = cupy.stack(fp_array) + + if return_fp: + if raw: + return fp_array, raw_fp_array + else: + return fp_array, np.asarray(raw_fp_array, dtype=np.uint64) + return fp_array + def __len__(self): return self.kwargs['nBits'] From 408cc138d93facfa6d93d34b54a4c056317b1c10 Mon Sep 17 00:00:00 2001 From: Venkatesh Mysore Date: Thu, 24 Feb 2022 17:18:04 -0800 Subject: [PATCH 19/27] debugging --- cuchem/cuchem/interactive/chemvisualize.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py index 37720097..7f43ca76 100644 --- a/cuchem/cuchem/interactive/chemvisualize.py +++ b/cuchem/cuchem/interactive/chemvisualize.py @@ -249,7 +249,8 @@ def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512): Output('table_generated_molecules', 'children'), Output('show_generated_mol', 'children'), Output('msg_generated_molecules', 'children'), - Output('interpolation_error', 'children')], + #Output('interpolation_error', 'children') + ], [Input("bt_generate", "n_clicks"), ], [State('sl_generative_wf', 'value'), State('ckl_candidate_mol_id', 'value'), @@ -351,19 +352,20 @@ def handle_property_tables(self, show_generated_mol, show_selected_mol): return {'display': 'block', 'width': '100%'}, {'display': 'none'} return dash.no_update, dash.no_update - @report_ui_error(4) + @report_ui_error(3) def handle_generation( self, bt_generate, sl_generative_wf, ckl_candidate_mol_id, n2generate, extrap_compound_property, extrap_cluster_number, extrap_n_compounds, extrap_step_size, scaled_radius, rd_generation_type, show_generated_mol ): + print('***handle_generation***') comp_id, event_type = self._fetch_event_data() - + logger.info(f'handle_generation: comp_id={comp_id}, event_type={event_type}, rd_generation_type={rd_generation_type}') chembl_ids = [] if comp_id == 'bt_generate' and event_type == 'n_clicks': chembl_ids = ckl_candidate_mol_id else: - return dash.no_update, dash.no_update + return dash.no_update, dash.no_update, dash.no_update, dash.no_update, dash.no_update self.generative_wf_cls = sl_generative_wf wf_class = locate(self.generative_wf_cls) @@ -1205,9 +1207,9 @@ def constuct_layout(self): dcc.RadioItems( id='rd_generation_type', options=[ - {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, - {'label': 'Fit cluster to property and extrapolate', 'value': 'EXTRAPOLATE'}, {'label': 'Sample around one molecule', 'value': 'SAMPLE'}, + {'label': 'Fit cluster to property and extrapolate', 'value': 'EXTRAPOLATE'}, + {'label': 'Interpolate between two molecules', 'value': 'INTERPOLATE'}, ], value='INTERPOLATE', style={'marginTop': 18}, @@ -1264,7 +1266,7 @@ def constuct_layout(self): labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6} ), html.Div(className='row', children=[ - dbc.Button('Generate', id='bt_generate', n_clicks=0, style={'marginRight': 12}), + dbc.Button('GENERATE', id='bt_generate', n_clicks=0, style={'marginRight': 12}), dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0), ], style={'marginLeft': 0}), ]), From dd6a7b40c14996a9922d979c9ec44e9bb2d8a365 Mon Sep 17 00:00:00 2001 From: Venkatesh Mysore Date: Thu, 24 Feb 2022 22:02:41 -0800 Subject: [PATCH 20/27] fixing --- cuchem/cuchem/interactive/chemvisualize.py | 173 +++++++++++++++------ 1 file changed, 127 
insertions(+), 46 deletions(-) diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py index 7f43ca76..1086de63 100644 --- a/cuchem/cuchem/interactive/chemvisualize.py +++ b/cuchem/cuchem/interactive/chemvisualize.py @@ -243,26 +243,6 @@ def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512): [Output('ckl_analoguing_mol_id', 'value')], [Input('ckl_analoguing_mol_id', 'value')])(self.handle_analoguing_ckl_selection) - self.app.callback( - [Output('section_generated_molecules_clustered', 'style'), - Output('gen_figure', 'figure'), - Output('table_generated_molecules', 'children'), - Output('show_generated_mol', 'children'), - Output('msg_generated_molecules', 'children'), - #Output('interpolation_error', 'children') - ], - [Input("bt_generate", "n_clicks"), ], - [State('sl_generative_wf', 'value'), - State('ckl_candidate_mol_id', 'value'), - State('n2generate', 'value'), - State('extrap_compound_property', 'value'), - State('extrap_cluster_number', 'value'), - State('extrap_n_compounds', 'value'), - State('extrap_step_size', 'value'), - State('scaled_radius', 'value'), - State('rd_generation_type', 'value'), - State('show_generated_mol', 'children')])(self.handle_generation) - self.app.callback( [Output('section_fitting', 'style'), Output('fitting_figure', 'figure')], @@ -294,6 +274,37 @@ def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512): [Input('show_generated_mol', 'children'), Input('show_selected_mol', 'children')])(self.handle_property_tables) + #self.app.callback( + # Output("n_test", "value"), + # [Input("bt_test", "n_clicks")], + # [State("n_test", "value")])(self.handle_test) + + self.app.callback( + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + Output('table_generated_molecules', 'children'), + Output('show_generated_mol', 'children'), + Output('msg_generated_molecules', 'children'), + Output('interpolation_error', 'children')], + [Input("bt_generate", "n_clicks")], + [State('sl_generative_wf', 'value'), + State('ckl_candidate_mol_id', 'value'), + State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), + State('scaled_radius', 'value'), + State('rd_generation_type', 'value'), + State('show_generated_mol', 'children')])(self.handle_generation) + + def handle_test(self, bt_test, n_test): + comp_id, event_type = self._fetch_event_data() + if comp_id == 'bt_test' and event_type == 'n_clicks': + return n_test + 1 + raise dash.exceptions.PreventUpdate + + def handle_add_candidate(self, bt_add_candidate, bt_reset_candidates, generation_candidates): @@ -357,7 +368,26 @@ def handle_generation( self, bt_generate, sl_generative_wf, ckl_candidate_mol_id, n2generate, extrap_compound_property, extrap_cluster_number, extrap_n_compounds, extrap_step_size, scaled_radius, rd_generation_type, show_generated_mol - ): + ): + """ + [Output('section_generated_molecules_clustered', 'style'), + Output('gen_figure', 'figure'), + Output('table_generated_molecules', 'children'), + Output('show_generated_mol', 'children'), + Output('msg_generated_molecules', 'children'), + Output('interpolation_error', 'children')], + [Input("bt_generate", "n_clicks")], + [State('sl_generative_wf', 'value'), + State('ckl_candidate_mol_id', 'value'), + State('n2generate', 'value'), + State('extrap_compound_property', 'value'), + 
State('extrap_cluster_number', 'value'), + State('extrap_n_compounds', 'value'), + State('extrap_step_size', 'value'), + State('scaled_radius', 'value'), + State('rd_generation_type', 'value'), + State('show_generated_mol', 'children')])(self.handle_generation) + """ print('***handle_generation***') comp_id, event_type = self._fetch_event_data() logger.info(f'handle_generation: comp_id={comp_id}, event_type={event_type}, rd_generation_type={rd_generation_type}') @@ -468,12 +498,14 @@ def handle_generation( for column in columns_in_table: table_headers.append(html.Th(column, style={'fontSize': '150%', 'text-align': 'center'})) prop_recs = [html.Tr(table_headers, style={'background': 'lightgray'})] + invalid_mol_cnt = 0 for row_idx in range(self.generated_df.shape[0]): td = [] try: col_pos = all_columns.index('Chemical Structure') col_data = self.generated_df.iat[row_idx, col_pos] if 'value' in col_data and col_data['value'] == 'Error interpreting SMILES using RDKit': + invalid_mol_cnt += 1 continue except ValueError: pass @@ -490,13 +522,29 @@ def handle_generation( if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): td.append(html.Td(html.Img(src=col_value))) else: - td.append( - html.Td(str(col_value), style=LEVEL_TO_STYLE[col_level].update({'maxWidth': '100px', 'wordWrap':'break-word'}))) - prop_recs.append(html.Tr(td)) + #td.append( + # html.Td(str(col_value), style=LEVEL_TO_STYLE[col_level].update({'maxWidth': '100px', 'wordWrap':'break-word'}))) + td.append(html.Td(str(col_value), + style={'maxWidth': '300px', + 'wordWrap': 'break-word', + 'text-align': 'center', + 'color': LEVEL_TO_STYLE[col_level]['color'] + } + )) + prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) + + #[Output('section_generated_molecules_clustered', 'style'), + # Output('gen_figure', 'figure'), + # Output('table_generated_molecules', 'children'), + # Output('show_generated_mol', 'children'), + # Output('msg_generated_molecules', 'children')], + msg_generated_molecules = '' + if invalid_mol_cnt > 0: + msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' 
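# The message wiring restored above reflects the Dash contract this handler
# kept tripping over: a callback must return exactly one value per declared
# Output, in declaration order, with dash.no_update filling any slot that
# should stay unchanged. A self-contained illustration (dash >= 2 assumed;
# the component ids are made up for the demo):
import dash
from dash import html
from dash.dependencies import Input, Output

demo = dash.Dash(__name__)
demo.layout = html.Div([html.Button('go', id='btn'),
                        html.Div(id='a'), html.Div(id='b')])

@demo.callback([Output('a', 'children'), Output('b', 'children')],
               [Input('btn', 'n_clicks')])
def _demo_cb(n_clicks):
    if not n_clicks:
        return dash.no_update, dash.no_update  # arity must match the Outputs
    return f'clicked {n_clicks}', ''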
return {'display': 'inline'}, fig, html.Table( prop_recs, style={'width': '100%', 'margin': 12, 'border': '1px solid lightgray'} - ), show_generated_mol, dash.no_update + ), show_generated_mol, msg_generated_molecules, dash.no_update @report_ui_error(3) @@ -505,6 +553,23 @@ def handle_fitting( fit_nn_compound_property, fit_nn_train_cluster_number, fit_nn_test_cluster_number, fit_nn_hidden_layer_sizes, fit_nn_activation_fn, fit_nn_final_activation_fn, fit_nn_max_epochs, fit_nn_learning_rate, fit_nn_weight_decay, fit_nn_batch_size ): + """ + self.app.callback( + [Output('section_fitting', 'style'), + Output('fitting_figure', 'figure')], + [Input("bt_fit", "n_clicks"),], + [State('sl_featurizing_wf', 'value'), + State('fit_nn_compound_property', 'value'), + State('fit_nn_train_cluster_number', 'value'), + State('fit_nn_test_cluster_number', 'value'), + State('fit_nn_hidden_layer_sizes', 'value'), + State('fit_nn_activation_fn', 'value'), + State('fit_nn_final_activation_fn', 'value'), + State('fit_nn_max_epochs', 'value'), + State('fit_nn_learning_rate', 'value'), + State('fit_nn_weight_decay', 'value'), + State('fit_nn_batch_size', 'value')])(self.handle_fitting) + """ comp_id, event_type = self._fetch_event_data() sys.stdout.flush() if (comp_id != 'bt_fit') or (event_type != 'n_clicks'): @@ -1144,6 +1209,13 @@ def constuct_layout(self): ], style={'marginLeft': 0, 'marginTop': '6px'} ), + + html.Div(className='row', children=[ + dcc.Markdown("n_test", style={'marginTop': 12,}), + dcc.Input(id='n_test', value=0), + ]#, style={'marginLeft': 0, 'marginTop': '6px'} + ), + dcc.Tabs([ dcc.Tab(label='Cluster Molecules', children=[ dcc.Markdown("""**Select Workflow**""", style={'marginTop': 18, }), @@ -1256,8 +1328,8 @@ def constuct_layout(self): dcc.Markdown(children="""**Please Select Two**""", id="mk_selection_msg", style={'marginTop': 18}), - dcc.Markdown(children="""Click *Add* to populate this list""", - style={'marginTop': 18}), + #dcc.Markdown(children="""Click *Add* to populate this list""", + # style={'marginTop': 18}), dcc.Checklist( id='ckl_candidate_mol_id', options=[], @@ -1269,11 +1341,15 @@ def constuct_layout(self): dbc.Button('GENERATE', id='bt_generate', n_clicks=0, style={'marginRight': 12}), dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0), ], style={'marginLeft': 0}), - ]), + #html.Div(className='row', children=[ + # dbc.Button('TEST', id='bt_test', n_clicks=0), #, style={'marginRight': 12}), + #]), #style={'marginLeft': 0}), + ]), + dcc.Tab(label='Predict Properties', children=[ - dcc.Markdown("""**Select Featurizing Model**""", style={'marginTop': 18,}), + dcc.Markdown("**Select Featurizing Model**", style={'marginTop': 18,}), html.Div(children=[ dcc.Dropdown(id='sl_featurizing_wf', multi=False, options=[{'label': 'CDDD Model', @@ -1304,7 +1380,7 @@ def constuct_layout(self): dcc.Input(id='fit_nn_test_cluster_number', value=1), ], style={'marginLeft': 0, 'marginTop': '6px'} ), - dcc.Markdown(children="""**Neural Network Parameters**""", + dcc.Markdown(children="**Neural Network Parameters**", id="nn_params_msg", style={'marginTop': 18} ), @@ -1373,11 +1449,11 @@ def constuct_layout(self): value='similar', clearable=False), ]), - dcc.Markdown(children="""Choose a compound""", + dcc.Markdown(children="Choose a compound", id="analoguing_msg", style={'marginTop': 18} ), - dcc.Markdown(children="""Click *Analogue* to populate this list""", + dcc.Markdown(children="Click *Analogue* to populate this list", style={'marginTop': 18}), dcc.Checklist( id='ckl_analoguing_mol_id', 
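The Predict Properties tab laid out above ultimately drives a train-on-one-cluster,
test-on-another regression. The sketch below outlines the kind of model those UI
fields (hidden layer sizes, epochs, learning rate, weight decay) parameterize; the
shipped trainer lives in the workflow classes, so treat this as an illustration
only (PyTorch assumed, full-batch for brevity, i.e. the batch-size field is ignored):

import torch
import torch.nn as nn

def fit_mlp(x_train, y_train, hidden=(64, 16), epochs=100, lr=1e-3, weight_decay=0.0):
    # x_train: (n, d) float tensor; y_train: (n, 1) float tensor
    layers, d = [], x_train.shape[1]
    for h in hidden:
        layers += [nn.Linear(d, h), nn.ReLU()]
        d = h
    layers.append(nn.Linear(d, 1))
    model = nn.Sequential(*layers)
    opt = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    loss_fn = nn.MSELoss()
    for _ in range(epochs):
        opt.zero_grad()
        loss = loss_fn(model(x_train), y_train)
        loss.backward()
        opt.step()
    return model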
@@ -1403,21 +1479,23 @@ def constuct_layout(self):
                 ], className='three columns', style={'marginLeft': 18, 'marginTop': 90, 'verticalAlign': 'text-top', }),
             ]),

-            html.Div(id='section_generated_molecules',
-                     children=[
-                         html.A(
-                             'Export to SDF',
-                             id='download-link',
-                             download="rawdata.sdf",
-                             href="/cheminfo/downloadSDF",
-                             target="_blank",
-                             n_clicks=0,
-                             style={'marginLeft': 10, 'fontSize': '150%'}
+            #html.Div(className='row', children=[
+            html.Div(id='section_generated_molecules', children=[
+                html.Div(className='row', children=[
+                    html.A('Export to SDF',
+                           id='download-link',
+                           download="rawdata.sdf",
+                           href="/cheminfo/downloadSDF",
+                           target="_blank",
+                           n_clicks=0,
+                           style={'fontSize': '150%'}
                     ),
-                         html.Div(id='table_generated_molecules', children=[]),
-                     ],
-                     style={'display': 'none'}
-                     ),
+                    html.Div(id='msg_generated_molecules', children=[],
+                             style={'color': 'red', 'fontWeight': 'bold', 'marginLeft': 12, 'fontSize': '150%'}),
+                ], style={'marginLeft': 0, 'marginBottom': 18, }),
+                html.Div(id='table_generated_molecules', children=[], style={'width': '100%'})
+            ], style={'display': 'none', 'width': '100%'}),
+
             html.Div(id='section_generated_molecules_clustered',
                      children=[
                          dcc.Graph(id='gen_figure',
                                    figure=fig,
@@ -1474,6 +1552,9 @@ def constuct_layout(self):
                 ])
             ], style={'display': 'none'}),

+            #], style={'margin': 12}),
+
+
             html.Div(id='refresh_main_fig', style={'display': 'none'}),
             html.Div(id='northstar_cluster', style={'display': 'none'}),
             html.Div(id='recluster_error', style={'display': 'none'}),

From 6a2da069428e4c40145f974652a2cc1e1cc8c715 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Fri, 25 Feb 2022 16:19:55 -0800
Subject: [PATCH 21/27] Pass the sanitize option through interpolate_smiles

---
 cuchem/cuchem/wf/generative/megatronmolbart.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py
index bbe6ced9..b78def50 100644
--- a/cuchem/cuchem/wf/generative/megatronmolbart.py
+++ b/cuchem/cuchem/wf/generative/megatronmolbart.py
@@ -126,7 +126,8 @@ def interpolate_smiles(
             num_points: int = 10,
             scaled_radius=None,
             force_unique=False,
-            compound_ids=[]
+            compound_ids=[],
+            sanitize=True
     ):
         if len(compound_ids) == 0:
             compound_ids = [f'source{i}' for i in range(len(smiles))]
@@ -135,7 +136,8 @@ def interpolate_smiles(
                               smiles=smiles,
                               radius=scaled_radius,
                               numRequested=num_points,
-                              forceUnique=force_unique)
+                              forceUnique=force_unique,
+                              sanitize=sanitize)

         result = self.stub.Interpolate(spec)
         result = result.generatedSmiles

From a6b76567bc9bf3439761903448c44e2a3766dea1 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Sat, 26 Feb 2022 12:31:17 -0800
Subject: [PATCH 22/27] Accept a list of ChEMBL IDs in find_similars_smiles_by_id

---
 common/cuchemcommon/workflow.py             | 35 +++++++++++-------
 cuchem/cuchem/interactive/chemvisualize.py  | 37 +++++++++++++------
 .../cuchem/wf/generative/megatronmolbart.py |  3 ++
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py
index fe4ed2ae..ce0ec24b 100644
--- a/common/cuchemcommon/workflow.py
+++ b/common/cuchemcommon/workflow.py
@@ -2,6 +2,7 @@
 from functools import singledispatch
 from typing import List

+import pandas as pd
 import numpy as np
 from cuchemcommon.data import GenerativeWfDao
 from cuchemcommon.fingerprint import BaseTransformation
@@ -218,29 +219,35 @@ def extrapolate_from_cluster(self,

     def find_similars_smiles_by_id(self,
-                                   chembl_id: str,
+                                   chembl_ids: List[str],
                                    id_type: str = 'chemblid',
                                    num_requested=10,
                                    force_unique=False,
                                    scaled_radius: int = 1,
                                    sanitize=True):
-        smiles = None
-
+        smiles_list = []
+
         if not self.min_jitter_radius:
             raise Exception('Property `radius_scale` must be defined in model class.')

         if id_type.lower() == 'chemblid':
-            smiles = [row[2] for row in self.dao.fetch_id_from_chembl(chembl_id)]
-            if len(smiles) != len(chembl_id):
-                raise Exception('One of the ids is invalid %s' + chembl_id)
+            smiles_list = [row[2] for row in self.dao.fetch_id_from_chembl(chembl_ids)]
+            if len(smiles_list) != len(chembl_ids):
+                raise Exception('One of the ids is invalid: %s' % chembl_ids)
         else:
             raise Exception('id type %s not supported' % id_type)

-        return self.find_similars_smiles(
-            smiles[0],
-            num_requested=num_requested,
-            scaled_radius=scaled_radius,
-            force_unique=force_unique,
-            compound_id=str(chembl_id),
-            sanitize=sanitize
-        )
+        ret_vals = [
+            self.find_similars_smiles(
+                smiles,
+                num_requested=num_requested,
+                scaled_radius=scaled_radius,
+                force_unique=force_unique,
+                compound_id=str(chembl_id),
+                sanitize=sanitize
+            )
+            for smiles, chembl_id in zip(smiles_list, chembl_ids)
+        ]
+        if len(ret_vals) == 1:
+            return ret_vals[0]
+        return pd.concat(ret_vals, ignore_index=True, copy=False)
diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index 1086de63..5993e2be 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -429,6 +429,7 @@ def handle_generation(
                 force_unique=True,
                 sanitize=True)

+        logging.info(f'RAW self.generated_df: {self.generated_df.columns}, {len(self.generated_df)}\n{self.generated_df.head()}\n{self.generated_df.tail()}')
         if show_generated_mol is None:
             show_generated_mol = 0
         show_generated_mol += 1
@@ -442,6 +443,9 @@ def handle_generation(

         # Note: we are not allowing fingerprint specification to change here because we want to see the results on the same PCA / UMAP as the original figure
         # TODO: make this clear in the UI
+
+        # The number of generated molecules is not expected to be large, so regular cudf dataframes should suffice
+        logging.info(f'VALID self.generated_df: {self.generated_df.columns}, {len(self.generated_df)}\n{self.generated_df.head()}\n{self.generated_df.tail()}')
         fps = MorganFingerprint(
             radius=self.fingerprint_radius, nBits=self.fingerprint_nBits
         ).transform(self.generated_df, smiles_column='SMILES')
@@ -459,30 +463,39 @@ def handle_generation(
             north_stars = ','.join(list(df_fp[ ~self.generated_df['Generated'] ]['id'].values_host))

         # TODO: check if all these lines are necessary!
-        chunksize=max(10, int(df_fp.shape[0] * 0.1))
-        df_embedding = dask_cudf.from_cudf(df_fp, chunksize=chunksize)
-        df_embedding = df_embedding.reset_index()
+        #chunksize=max(10, int(df_fp.shape[0] * 0.1))
+        df_embedding = df_fp #dask_cudf.from_cudf(df_fp, chunksize=chunksize)
+        #df_embedding = df_embedding.reset_index()
         cluster_col = df_embedding['cluster']
         df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding)
         prop_series['cluster'] = cluster_col
-        n_molecules, n_obs = df_embedding.compute().shape # needed?
+        #n_molecules, n_obs = df_embedding.compute().shape # needed?
+        #if hasattr(df_embedding, 'compute'):
+        #    df_embedding = df_embedding.compute()
+
-        if isinstance(self.cluster_wf.pca, cuml.PCA) and isinstance(df_embedding, dask_cudf.DataFrame):
-            # Trying to accommodate the GpuKmeansUmapHybrid workflow
-            df_embedding = df_embedding.compute()
+        #if isinstance(self.cluster_wf.pca, cuml.PCA) and isinstance(df_embedding, dask_cudf.DataFrame):
+        #    # Trying to accommodate the GpuKmeansUmapHybrid workflow
+        #    df_embedding = df_embedding.compute()

         df_embedding = self.cluster_wf.pca.transform(df_embedding)
-        if hasattr(df_embedding, 'persist'):
-            df_embedding = df_embedding.persist()
-            wait(df_embedding)
+        logging.info(f'df_embedding: {type(df_embedding)}, {len(df_embedding)}')
+        #if hasattr(df_embedding, 'persist'):
+        #    df_embedding = df_embedding.persist()
+        #    wait(df_embedding)

         Xt = self.cluster_wf.umap.transform(df_embedding)
+        logging.info(f'Xt: {type(Xt)}, {len(Xt)}')
         df_embedding['x'] = Xt[0]
         df_embedding['y'] = Xt[1]
-
+        logging.info(f'df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}')
         for col in prop_series.keys():
             sys.stdout.flush()
-            df_embedding[col] = prop_series[col]#.compute()
+            # TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed,
+            # To explicitly construct a GPU array, consider using cupy.asarray(...)
+            # To explicitly construct a host array, consider using .to_array()
+            logging.info(f'before prop_series[{col}]: {type(prop_series[col])}')
+            #if hasattr(prop_series[col], 'compute'):
+            #    prop_series[col] = prop_series[col].compute()
+            #logging.info(f'prop_series[{col}]: {type(prop_series[col])}, {prop_series[col].index}')
+            df_embedding[col] = prop_series[col]#.to_pandas() #cupy.asarray(prop_series[col]) #.to_array()#.compute() # Cannot align indices with non-unique values

         fig, northstar_cluster = self.create_graph(df_embedding, north_stars=north_stars)
diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py
index b78def50..914a07fd 100644
--- a/cuchem/cuchem/wf/generative/megatronmolbart.py
+++ b/cuchem/cuchem/wf/generative/megatronmolbart.py
@@ -88,11 +88,13 @@ def find_similars_smiles(self,
                              force_unique=False,
                              sanitize=True,
                              compound_id=None):
+        logger.info(f'find_similars_smiles: compound_id={compound_id}')
         if isinstance(compound_id, list):
             # Sometimes calling routine may send a list of length one containing the compound ID
             if len(compound_id) > 1:
                 logger.info(f'find_similars_smiles received {compound_id}, generating neighbors only for first compound!')
             compound_id = compound_id[0]
+        logger.info(f'comp_id: {compound_id}')
         spec = GenerativeSpec(model=GenerativeModel.MegaMolBART,
                               smiles=smiles,
                               radius=scaled_radius,
@@ -118,6 +120,7 @@ def find_similars_smiles(self,
                 for i in range(len(generatedSmiles) - 1)
             ],
         })
+        logging.info(f'find_similars_smiles returning: {type(generated_df)}, {len(generated_df)}\n{generated_df.head()}')
         return generated_df

     def interpolate_smiles(

From 881c9a9c54b5bb1e69ba4480e0b7fb78fd7453fe Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Sun, 27 Feb 2022 09:12:14 -0800
Subject: [PATCH 23/27] Fix cudf/dask_cudf handling when reprojecting generated molecules

---
 cuchem/cuchem/interactive/chemvisualize.py  | 26 +++++++++++++------
 .../cuchem/wf/generative/megatronmolbart.py |  5 +++-
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index 5993e2be..5c0c7731 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ 
b/cuchem/cuchem/interactive/chemvisualize.py @@ -429,7 +429,7 @@ def handle_generation( force_unique=True, sanitize=True) - logging.info(f'RAW self.generated_df: {self.generated_df.columns}, {len(self.generated_df)}\n{self.generated_df.head()}\n{self.generated_df.tail()}') + logging.info(f'RAW self.generated_df: {type(self.generated_df)}, {self.generated_df.columns}, {len(self.generated_df)}\n{self.generated_df.head()}\n{self.generated_df.tail()}') if show_generated_mol is None: show_generated_mol = 0 show_generated_mol += 1 @@ -462,8 +462,7 @@ def handle_generation( # Highlight the source compound(s) north_stars = ','.join(list(df_fp[ ~self.generated_df['Generated'] ]['id'].values_host)) - # TODO: check if all these lines are necessary! - #chunksize=max(10, int(df_fp.shape[0] * 0.1)) + # TODO: dask_cudf needs to be handled carefully as each chunk has its own index df_embedding = df_fp #dask_cudf.from_cudf(df_fp, chunksize=chunksize) #df_embedding = df_embedding.reset_index() cluster_col = df_embedding['cluster'] @@ -476,13 +475,24 @@ def handle_generation( #if isinstance(self.cluster_wf.pca, cuml.PCA) and isinstance(df_embedding, dask_cudf.DataFrame): # # Trying to accommodate the GpuKmeansUmapHybrid workflow # df_embedding = df_embedding.compute() + + # TODO: cuml.dask.decomposition.PCA needs a dask dataframe!!! + if isinstance(self.cluster_wf.pca, cuml.dask.decomposition.PCA) and not isinstance(df_embedding, dask_cudf.DataFrame): + #chunksize=max(10, int(df_fp.shape[0] * 0.1)) + df_embedding = dask_cudf.from_cudf(df_embedding, npartitions=1) #chunksize=chunksize) df_embedding = self.cluster_wf.pca.transform(df_embedding) - logging.info(f'df_embedding: {type(df_embedding)}, {len(df_embedding)}') - #if hasattr(df_embedding, 'persist'): - # df_embedding = df_embedding.persist() - # wait(df_embedding) Xt = self.cluster_wf.umap.transform(df_embedding) - logging.info(f'Xt: {type(Xt)}, {len(Xt)}') + if hasattr(Xt, 'persist'): + logging.info(f'BEFORE Xt: {type(Xt)}, {Xt.columns}, {len(Xt)}, {Xt.index}') + Xt = Xt.compute() + #wait(Xt) + logging.info(f'Xt: {type(Xt)}, {Xt.columns}, {len(Xt)}, {Xt.index}') + + if hasattr(df_embedding, 'persist'): + logging.info(f'BEFORE df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') + # Note: When converting a dask_cudf to cudf, the indices within each chunk are retained + df_embedding = df_embedding.compute().reset_index(drop=True) + #wait(df_embedding) df_embedding['x'] = Xt[0] df_embedding['y'] = Xt[1] logging.info(f'df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index 914a07fd..d7a74736 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -186,7 +186,9 @@ def extrapolate_from_cluster(self, radius = self._compute_radius(scaled_radius) # TO DO: User must be able to extrapolate directly from smiles in the table; # these may themselves be generated compounds without any chemblid. 
-        df_cluster = compounds_df[ compounds_df['cluster'] == int(cluster_id) ].dropna().reset_index(drop=True).compute()
+        df_cluster = compounds_df[ compounds_df['cluster'] == int(cluster_id) ].dropna().reset_index(drop=True)
+        if hasattr(df_cluster, 'compute'):
+            df_cluster = df_cluster.compute()
         if 'transformed_smiles' in df_cluster:
             smiles_col = 'transformed_smiles'
         elif 'SMILES' in df_cluster:
             smiles_col = 'SMILES'
@@ -223,6 +225,7 @@ def _get_embedding_direction(self,
         logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)}, prop: {min(compound_property_vals)} - {max(compound_property_vals)}')
         n_data = compound_property_vals.shape[0]
         n_dimensions = embedding_list[0].shape[0]
+        reg = Lasso()
         reg = reg.fit(embedding_list, compound_property_vals)
         n_zero_coefs = len([x for x in reg.coef_ if x == 0.0])
         zero_coef_indices = [i for i, x in enumerate(reg.coef_) if x != 0.0]

From b318c452b5ac7c440395abd16c646a86a6d4b4e2 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Sun, 27 Feb 2022 20:11:29 -0800
Subject: [PATCH 24/27] Clean up debug logging and dead code

---
 README.md                                     |  5 --
 common/cuchemcommon/data/helper/chembldata.py |  1 -
 common/cuchemcommon/fingerprint.py            |  3 +-
 common/cuchemcommon/workflow.py               |  4 -
 cuchem/cuchem/interactive/chemvisualize.py    | 77 +++----------------
 cuchem/cuchem/wf/cluster/gpukmeansumap.py     |  3 -
 .../cuchem/wf/generative/megatronmolbart.py   | 45 +++--------
 7 files changed, 21 insertions(+), 117 deletions(-)

diff --git a/README.md b/README.md
index fa1f86de..fa785866 100644
--- a/README.md
+++ b/README.md
@@ -50,11 +50,6 @@ Build your container:
 ```
 ./launch.sh build
 ```

-Download the ChEMBL database (version 27):
-```
-./launch.sh dbSetup
-```
-
 Launch the interactive ChEMBL exploration tool:
 ```
 ./launch.sh start
diff --git a/common/cuchemcommon/data/helper/chembldata.py b/common/cuchemcommon/data/helper/chembldata.py
index d164b727..7ccd7c58 100644
--- a/common/cuchemcommon/data/helper/chembldata.py
+++ b/common/cuchemcommon/data/helper/chembldata.py
@@ -5,7 +5,6 @@ import logging
 import sys
 from typing import List

-#from dask import delayed, dataframe
 import dask
 from contextlib import closing
 from cuchemcommon.utils.singleton import Singleton
diff --git a/common/cuchemcommon/fingerprint.py b/common/cuchemcommon/fingerprint.py
index 1dc25d65..9b4799a1 100644
--- a/common/cuchemcommon/fingerprint.py
+++ b/common/cuchemcommon/fingerprint.py
@@ -114,9 +114,8 @@ def transform(
             else:
                 for i in range(0, self.kwargs['nBits'], INTEGER_NBITS):
                     raw_fp_array[i // INTEGER_NBITS].append(int(fp_bs[i: i + INTEGER_NBITS], 2))
-        #fp_array = np.asarray(fp_array)
         fp_array = cupy.stack(fp_array)
-
+        # TODO: return value parameter names should be self-explanatory
         if return_fp:
             if raw:
                 return fp_array, raw_fp_array
diff --git a/common/cuchemcommon/workflow.py b/common/cuchemcommon/workflow.py
index ce0ec24b..26dacf42 100644
--- a/common/cuchemcommon/workflow.py
+++ b/common/cuchemcommon/workflow.py
@@ -1,4 +1,3 @@
-import logging
 from functools import singledispatch
 from typing import List

@@ -8,9 +7,6 @@
 from cuchemcommon.data import GenerativeWfDao
 from cuchemcommon.fingerprint import BaseTransformation
 from rdkit.Chem import PandasTools, CanonSmiles

-logger = logging.getLogger(__name__)
-
-
 @singledispatch
 def add_jitter(embedding, radius, cnt, shape):
     return NotImplemented
diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index 5c0c7731..a97021cc 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ 
b/cuchem/cuchem/interactive/chemvisualize.py @@ -388,9 +388,7 @@ def handle_generation( State('rd_generation_type', 'value'), State('show_generated_mol', 'children')])(self.handle_generation) """ - print('***handle_generation***') comp_id, event_type = self._fetch_event_data() - logger.info(f'handle_generation: comp_id={comp_id}, event_type={event_type}, rd_generation_type={rd_generation_type}') chembl_ids = [] if comp_id == 'bt_generate' and event_type == 'n_clicks': chembl_ids = ckl_candidate_mol_id @@ -429,7 +427,6 @@ def handle_generation( force_unique=True, sanitize=True) - logging.info(f'RAW self.generated_df: {type(self.generated_df)}, {self.generated_df.columns}, {len(self.generated_df)}\n{self.generated_df.head()}\n{self.generated_df.tail()}') if show_generated_mol is None: show_generated_mol = 0 show_generated_mol += 1 @@ -438,14 +435,14 @@ def handle_generation( self.generated_df = LipinskiRuleOfFiveDecorator().decorate(self.generated_df) self.generated_df = self.generated_df[ ~self.generated_df['invalid'] ].reset_index(drop=True).drop(columns=['invalid']) if len(self.generated_df) == 0: - logger.info("None of the generated smiles yielded valid molecules!!!") + logger.info("None of the generated smiles yielded valid molecules!") return dash.no_update, dash.no_update # Note: we are not allowing fingerprint specification to change here because we want to see the results on the same PCA / UMAP as the original figure # TODO: make this clear in the UI - # The number of generated molecules is not expected to be large, so regular cudf dataframes should suffice - logging.info(f'VALID self.generated_df: {self.generated_df.columns}, {len(self.generated_df)}\n{self.generated_df.head()}\n{self.generated_df.tail()}') + # The number of generated molecules is not expected to be large, so regular cudf dataframes should suffice. + # However, we have to handle both cudf and dask_cudf, perhaps more elegantly. fps = MorganFingerprint( radius=self.fingerprint_radius, nBits=self.fingerprint_nBits ).transform(self.generated_df, smiles_column='SMILES') @@ -461,54 +458,27 @@ def handle_generation( else: # Highlight the source compound(s) north_stars = ','.join(list(df_fp[ ~self.generated_df['Generated'] ]['id'].values_host)) - - # TODO: dask_cudf needs to be handled carefully as each chunk has its own index - df_embedding = df_fp #dask_cudf.from_cudf(df_fp, chunksize=chunksize) - #df_embedding = df_embedding.reset_index() + df_embedding = df_fp cluster_col = df_embedding['cluster'] df_embedding, prop_series = self.cluster_wf._remove_non_numerics(df_embedding) prop_series['cluster'] = cluster_col - #n_molecules, n_obs = df_embedding.compute().shape # needed? - #if hasattr(df_embedding, 'compute'): - # df_embedding = df_embedding.compute() - - #if isinstance(self.cluster_wf.pca, cuml.PCA) and isinstance(df_embedding, dask_cudf.DataFrame): - # # Trying to accommodate the GpuKmeansUmapHybrid workflow - # df_embedding = df_embedding.compute() - - # TODO: cuml.dask.decomposition.PCA needs a dask dataframe!!! 
if isinstance(self.cluster_wf.pca, cuml.dask.decomposition.PCA) and not isinstance(df_embedding, dask_cudf.DataFrame): - #chunksize=max(10, int(df_fp.shape[0] * 0.1)) - df_embedding = dask_cudf.from_cudf(df_embedding, npartitions=1) #chunksize=chunksize) + df_embedding = dask_cudf.from_cudf(df_embedding, npartitions=1) df_embedding = self.cluster_wf.pca.transform(df_embedding) Xt = self.cluster_wf.umap.transform(df_embedding) if hasattr(Xt, 'persist'): - logging.info(f'BEFORE Xt: {type(Xt)}, {Xt.columns}, {len(Xt)}, {Xt.index}') Xt = Xt.compute() - #wait(Xt) - logging.info(f'Xt: {type(Xt)}, {Xt.columns}, {len(Xt)}, {Xt.index}') - if hasattr(df_embedding, 'persist'): logging.info(f'BEFORE df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') - # Note: When converting a dask_cudf to cudf, the indices within each chunk are retained + # Note: When converting a dask_cudf to cudf, the indices within each chunk are retained, so we need to reset_index. df_embedding = df_embedding.compute().reset_index(drop=True) - #wait(df_embedding) df_embedding['x'] = Xt[0] df_embedding['y'] = Xt[1] logging.info(f'df_embedding: {type(df_embedding)}, {df_embedding.columns}, {len(df_embedding)}, {df_embedding.index}') for col in prop_series.keys(): - sys.stdout.flush() - # TypeError: Implicit conversion to a host NumPy array via __array__ is not allowed, - # To explicitly construct a GPU array, consider using cupy.asarray(...) - # To explicitly construct a host array, consider using .to_array() - logging.info(f'before prop_series[{col}]: {type(prop_series[col])}') - #if hasattr(prop_series[col], 'compute'): - # prop_series[col] = prop_series[col].compute() - #logging.info(f'prop_series[{col}]: {type(prop_series[col])}, {prop_series[col].index}') - df_embedding[col] = prop_series[col]#.to_pandas() #cupy.asarray(prop_series[col]) #.to_array()#.compute() # Cannot align indices with non-unique values + df_embedding[col] = prop_series[col] fig, northstar_cluster = self.create_graph(df_embedding, north_stars=north_stars) - # Create Table header table_headers = [] all_columns = self.generated_df.columns.to_list() @@ -545,8 +515,6 @@ def handle_generation( if isinstance(col_value, str) and col_value.startswith('data:image/png;base64,'): td.append(html.Td(html.Img(src=col_value))) else: - #td.append( - # html.Td(str(col_value), style=LEVEL_TO_STYLE[col_level].update({'maxWidth': '100px', 'wordWrap':'break-word'}))) td.append(html.Td(str(col_value), style={'maxWidth': '300px', 'wordWrap': 'break-word', @@ -556,11 +524,6 @@ def handle_generation( )) prop_recs.append(html.Tr(td, style={'fontSize': '125%'})) - #[Output('section_generated_molecules_clustered', 'style'), - # Output('gen_figure', 'figure'), - # Output('table_generated_molecules', 'children'), - # Output('show_generated_mol', 'children'), - # Output('msg_generated_molecules', 'children')], msg_generated_molecules = '' if invalid_mol_cnt > 0: msg_generated_molecules = f'{invalid_mol_cnt} invalid molecules were created, which were eliminated from the result.' 
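The reset_index(drop=True) in the hunk above is load-bearing: each dask_cudf partition carries its own RangeIndex, so collapsing a multi-partition frame into a single cudf frame can yield duplicate index values, which later breaks aligned column assignment. A minimal sketch of that behavior, assuming only that cudf and dask_cudf are installed (illustrative, not part of the patch):

    import cudf
    import dask_cudf

    gdf = cudf.DataFrame({'x': range(6)})
    ddf = dask_cudf.from_cudf(gdf, npartitions=2)

    # Mimic what per-partition transforms produce: every partition ends up
    # with its own 0-based RangeIndex.
    ddf = ddf.map_partitions(lambda part: part.reset_index(drop=True))

    merged = ddf.compute()                   # index is 0,1,2,0,1,2 -- not unique
    merged = merged.reset_index(drop=True)   # restore a unique 0..5 index
    assert merged.index.is_unique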
@@ -576,23 +539,6 @@ def handle_fitting( fit_nn_compound_property, fit_nn_train_cluster_number, fit_nn_test_cluster_number, fit_nn_hidden_layer_sizes, fit_nn_activation_fn, fit_nn_final_activation_fn, fit_nn_max_epochs, fit_nn_learning_rate, fit_nn_weight_decay, fit_nn_batch_size ): - """ - self.app.callback( - [Output('section_fitting', 'style'), - Output('fitting_figure', 'figure')], - [Input("bt_fit", "n_clicks"),], - [State('sl_featurizing_wf', 'value'), - State('fit_nn_compound_property', 'value'), - State('fit_nn_train_cluster_number', 'value'), - State('fit_nn_test_cluster_number', 'value'), - State('fit_nn_hidden_layer_sizes', 'value'), - State('fit_nn_activation_fn', 'value'), - State('fit_nn_final_activation_fn', 'value'), - State('fit_nn_max_epochs', 'value'), - State('fit_nn_learning_rate', 'value'), - State('fit_nn_weight_decay', 'value'), - State('fit_nn_batch_size', 'value')])(self.handle_fitting) - """ comp_id, event_type = self._fetch_event_data() sys.stdout.flush() if (comp_id != 'bt_fit') or (event_type != 'n_clicks'): @@ -635,7 +581,7 @@ def handle_analoguing( smiles_columns = 'SMILES' if self.fp_df is None: - # Note: CPU-based workflow is no longer needed, can be removed + # Note: CPU-based workflow for fingerprint similarity is no longer needed, can be removed logger.info(f'CPU-based similarity search: self.fp_df not set') # First move the smiles to the CPU: if isinstance(self.cluster_wf.df_embedding, dask_cudf.DataFrame): @@ -650,7 +596,6 @@ def handle_analoguing( _, v = MorganFingerprint(radius=self.fingerprint_radius, nBits=self.fingerprint_nBits).transform( smiles_df, smiles_column=smiles_column, return_fp=True, raw=True) else: - logger.info(f'Fingerprints already available') if hasattr(self.cluster_wf.df_embedding, 'compute'): v = list(self.cluster_wf.df_embedding['fp'].compute().to_pandas()) else: @@ -677,14 +622,12 @@ def handle_analoguing( n_fp_cols += 1 self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.apply_rows( popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {}) - # More complex syntax was not necessary: - #self.cluster_wf.df_embedding['op_col'] = self.cluster_wf.df_embedding.map_partitions(popcll_wrapper_dask, col, 'op_col') #lambda df: df = df.apply_rows(popcll_wrapper, incols = {col: 'ip_col'}, outcols = {'op_col': int}, kwargs = {})) self.cluster_wf.df_embedding['pc'] += self.cluster_wf.df_embedding['op_col'] if hasattr(self.cluster_wf.df_embedding, 'persist'): self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist() wait(self.cluster_wf.df_embedding) t1 = time.time() - logger.info(f'Time to compute partial popcounts ({n_fp_cols} fp columns): {t1 - t0}:\n{self.cluster_wf.df_embedding["pc"].head()}') + #logger.info(f'Time to compute partial popcounts ({n_fp_cols} fp columns): {t1 - t0}:\n{self.cluster_wf.df_embedding["pc"].head()}') # Prepare the query compound: logger.info(f'analoguing_mol_id={analoguing_mol_id}') @@ -737,7 +680,7 @@ def handle_analoguing( ).reset_index(drop=True) t1 = time.time() - logger.info(f'Fingerprint length={self.fingerprint_nBits}: GPU-Method: {t5 - t4}, CPU-Method: {t1 - t0}') + #logger.info(f'Fingerprint length={self.fingerprint_nBits}: GPU-Method: {t5 - t4}, CPU-Method: {t1 - t0}') #self.analoguing_df = self.fp_df[ self.fp_df['similarity_cpu'] >= float(analoguing_threshold) ] self.analoguing_df = self.cluster_wf.df_embedding[ self.cluster_wf.df_embedding['similarity'] >= float(analoguing_threshold) ] diff --git a/cuchem/cuchem/wf/cluster/gpukmeansumap.py 
b/cuchem/cuchem/wf/cluster/gpukmeansumap.py index c84b3e6b..0942ca42 100644 --- a/cuchem/cuchem/wf/cluster/gpukmeansumap.py +++ b/cuchem/cuchem/wf/cluster/gpukmeansumap.py @@ -127,7 +127,6 @@ def _cluster(self, embedding, n_pca, reuse_umap=False, reuse_pca=True): with MetricsLogger('kmeans', self.n_molecules) as ml: if self.n_molecules < self.n_clusters: # < MIN_RECLUSTER_SIZE: raise Exception('Reclustering {self.n_molecules} molecules into {self.n_clusters} clusters not supported.')# % MIN_RECLUSTER_SIZE) - #raise Exception('Reclustering less than %d molecules is not supported.' % MIN_RECLUSTER_SIZE) kmeans_cuml = cuDaskKMeans(client=dask_client, n_clusters=self.n_clusters) @@ -288,8 +287,6 @@ def add_molecules(self, chemblids: List, radius=2, nBits=512): if hasattr(self.df_embedding, 'compute'): self.df_embedding = self.df_embedding.compute() - logger.info(self.df_embedding.shape) - return chem_mol_map, molregnos, self.df_embedding diff --git a/cuchem/cuchem/wf/generative/megatronmolbart.py b/cuchem/cuchem/wf/generative/megatronmolbart.py index d7a74736..10a6842c 100644 --- a/cuchem/cuchem/wf/generative/megatronmolbart.py +++ b/cuchem/cuchem/wf/generative/megatronmolbart.py @@ -88,13 +88,11 @@ def find_similars_smiles(self, force_unique=False, sanitize=True, compound_id=None): - logger.info(f'find_similars_smiles: compound_id={compound_id}') if isinstance(compound_id, list): # Sometimes calling routine may send a list of length one containing the compound ID if len(compound_id) > 1: logger.info(f'find_similars_smiles received {compound_id}, generating neighbors only for first compound!') compound_id = compound_id[0] - logger.info(f'comp_id: {compound_id}') spec = GenerativeSpec(model=GenerativeModel.MegaMolBART, smiles=smiles, radius=scaled_radius, @@ -120,7 +118,6 @@ def find_similars_smiles(self, for i in range(len(generatedSmiles) - 1) ], }) - logging.info(f'find_similars_smiles returning: {type(generated_df)}, {len(generated_df)}\n{generated_df.head()}') return generated_df def interpolate_smiles( @@ -196,9 +193,7 @@ def extrapolate_from_cluster(self, elif 'smiles' in df_cluster: smiles_col = 'smiles' else: - logger.info(list(df_cluster.columns)) - logger.info(df_cluster.head()) - raise RuntimeError('No smiles column') + raise RuntimeError(f'No smiles column in df_cluster: {list(df_cluster.columns)}\n{df_cluster.head()}') smiles_col = None smiles_list = df_cluster[smiles_col].to_array() return self.extrapolate_from_smiles(smiles_list, @@ -222,7 +217,8 @@ def _get_embedding_direction(self, compute the step size along the direction that is expected to increase the compound_property value by step_percentage. 
""" - logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)}, prop: {min(compound_property_vals)} - {max(compound_property_vals)}') + logger.info(f'_get_embedding_direction: emb:{embedding_list.shape}, {type(embedding_list)}, prop:{compound_property_vals.shape}, {type(compound_property_vals)},'\ + f' prop range: {max(compound_property_vals)} - {min(compound_property_vals)}.\nFitting Lasso regression...') n_data = compound_property_vals.shape[0] n_dimensions = embedding_list[0].shape[0] reg = Lasso() @@ -235,15 +231,13 @@ def _get_embedding_direction(self, y_pred = reg.predict(embedding_list) rmse = sqrt(mean_squared_error(compound_property_vals, y_pred.astype('float64'))) pearson_rho = cp.corrcoef(compound_property_vals, y_pred) - logger.info(f'_get_embedding_direction: n={len(compound_property_vals)}, rho={pearson_rho}, rmse={rmse}') #:.2f}') emb_std = np.std(embedding_list, axis=0) - logger.info(f'embedding_list.std: {emb_std}') + logger.info(f'_get_embedding_direction: n={len(compound_property_vals)}, rho={pearson_rho}, rmse={rmse}, embedding_list.std: {emb_std}') emb_max = embedding_list[ np.argmax(compound_property_vals) ] emb_min = embedding_list[ np.argmin(compound_property_vals) ] diff_size = np.linalg.norm(emb_max - emb_min) / sqrt(n_dimensions) - # TODO: project on to embedding direction!!! - logger.info(f'compound_property_vals: [{np.argmin(compound_property_vals)}]={np.amin(compound_property_vals)}, [{np.argmax(compound_property_vals)}]={np.amax(compound_property_vals)}, diff_size={diff_size}') + #logger.info(f'compound_property_vals: [{np.argmin(compound_property_vals)}]={np.amin(compound_property_vals)}, [{np.argmax(compound_property_vals)}]={np.amax(compound_property_vals)}, diff_size={diff_size}') return reg.coef_, emb_std, diff_size @@ -271,9 +265,6 @@ def extrapolate_from_smiles(self, id_list = list(map(str, range(len(smiles_list)))) logger.info(f'molbart: extrapolate_from_smiles: {len(smiles_list)} smiles ({type(smiles_list)}), {num_points} extrapolations each with step_size {step_size}') data = pd.DataFrame({'transformed_smiles': smiles_list}) - logger.info(data.head()) - #pad_length = max(map(len, smiles_list)) + 2 # add 2 for start / stop - # TODO: check reversibility / recovery full_mask = None emb_shape = None n_recovered = 0 @@ -316,8 +307,6 @@ def extrapolate_from_smiles(self, emb_shape = [n_embedding_tokens, emb_shape[1], emb_shape[2]] embeddings = cp.asarray(embeddings) full_mask = [False] * n_embedding_tokens - logger.info(f'emb type: {type(embeddings)} of {type(embeddings[0])}') - logger.info(f'embeddings.shape:{embeddings.shape}, emb_shape={emb_shape}, embeddings[0]={embeddings[0]}') # Use the entire cluster to infer the direction: direction, emb_std, diff_size = self._get_embedding_direction(embeddings, compound_property_vals) @@ -333,21 +322,16 @@ def extrapolate_from_smiles(self, id_list = [id_list[i] for i in indices] result_df_list = [ pd.DataFrame({'SMILES': smiles_list, 'Generated': False, 'id': id_list}) ] - logger.info(f'direction: {type(direction)}, shape={direction.shape}, {direction}\n, embeddings: {type(embeddings)}, shape: {embeddings.shape}, embeddings[0]={embeddings[0]}') for step_num in range(1, 1 + num_points): direction_sampled = cp.random.normal(loc=direction, scale=emb_std, size=emb_std.shape) #direction + noise - logger.info(f'step ({type(step_num)} * {type(diff_size)} * {type(step_size)} * {type(direction_sampled)}') step = 
float(step_num * diff_size * step_size) * direction_sampled - logger.info(step) extrap_embeddings = embeddings + step # TODO: print and check output - logger.info(f'step ({step_num} * {diff_size} * {step_size} * direction_sampled): {type(step)}, {step.shape}, {step}\n:extrap_embeddings: {type(extrap_embeddings)}, {extrap_embeddings.shape}, extrap_embeddings[0]={extrap_embeddings[0]}') + #logger.info(f'step ({step_num} * {diff_size} * {step_size} * direction_sampled): {type(step)}, {step.shape}, {step}\n:extrap_embeddings: {type(extrap_embeddings)}, {extrap_embeddings.shape}, extrap_embeddings[0]={extrap_embeddings[0]}') smiles_gen_list = [] ids_interp_list = [] for i in range(len(extrap_embeddings)): extrap_embedding = list(extrap_embeddings[i,:]) - logger.info(f'embedding: {type(extrap_embedding)}, {len(extrap_embeddings)};'\ - f' dim: {type(emb_shape)}, {len(emb_shape)}; pad_mask={type(full_mask)}, {len(full_mask)}') spec = EmbeddingList( embedding=extrap_embedding, dim=emb_shape, @@ -362,7 +346,6 @@ def extrapolate_from_smiles(self, 'Generated': True, 'id': ids_interp_list }) - logger.info(extrap_df.head()) if force_unique: inv_transform_funct = partial(self.inverse_transform, mem_pad_mask=full_mask) @@ -370,7 +353,7 @@ def extrapolate_from_smiles(self, smiles_gen, inv_transform_funct, radius=radius) - logger.info(f'step_num={step_num} yielded {len(extrap_df)} compounds:\n{extrap_df.head()}') + logger.info(f'step_num={step_num} yielded {len(extrap_df)} compounds') result_df_list.append(extrap_df) results_df = pd.concat(result_df_list, ignore_index=True) results_df['id'] = results_df['id'].apply(str) @@ -391,7 +374,7 @@ def fit_nn( batch_size=32, learning_rate=0.001, weight_decay=0.0001, - debug=False, #82 / 88 compounds yielded something after embedding, with avg tani = 0.8287583649661866 + debug=False, #scaled_radius=None ): """ @@ -409,18 +392,13 @@ def fit_nn( n_train = len(df_train) n_test = len(df_test) - logger.info(f'df_train: {len(df_train)}\n{df_train.head()}') - logger.info(f"type(df_train['transformed_smiles'])={type(df_train['transformed_smiles'])}") - smiles_list = np.concatenate((df_train['transformed_smiles'].to_array(), df_test['transformed_smiles'].to_array()), axis=0) logger.info(f'smiles_list: {smiles_list.shape}') pad_length = max(map(len, smiles_list)) + 2 # add 2 for start / stop embeddings = [] - #full_mask = None emb_shape = None n_recovered = 0 avg_tani = 0 - #radius = self._compute_radius(scaled_radius) for i, smiles in enumerate(smiles_list): spec = GenerativeSpec( @@ -432,7 +410,6 @@ def fit_nn( emb = result.embedding mask = result.pad_mask dim = result.dim - #logger.info(f'{i}: smiles={smiles}, emd: {len(emb)}, {emb[:5]}; dim={dim}, mask: {len(mask)}') emb_shape = result.dim if debug: spec = EmbeddingList( @@ -457,10 +434,8 @@ def fit_nn( embeddings, batch_first=True, padding_value=PAD_TOKEN) embeddings_train = embeddings[:n_train,:] embeddings_test = embeddings[n_train:,:] - logger.info(f'emb train: {type(embeddings_train)} of {type(embeddings_train[0])}, {embeddings_train.shape}') - compound_property_vals_train = torch.tensor(df_train[compound_property], device=self.device, dtype=torch.float32)#.to_gpu_array() # need to move to GPU array?? - compound_property_vals_test = torch.tensor(df_test[compound_property], device=self.device, dtype=torch.float32)#.to_gpu_array() # need to move to GPU array?? 
-        logger.info(f'type(df_train[{compound_property}])={type(df_train[compound_property])}, type(compound_property_vals_train)={type(compound_property_vals_train)}')
+        compound_property_vals_train = torch.tensor(df_train[compound_property], device=self.device, dtype=torch.float32)
+        compound_property_vals_test = torch.tensor(df_test[compound_property], device=self.device, dtype=torch.float32)

         train_pred, test_pred = self._build_and_train_nn(
             embeddings_train,
             compound_property_vals_train,

From a785c00addffcf1317edb86d4732c69251e461d0 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Sun, 27 Feb 2022 20:48:55 -0800
Subject: [PATCH 25/27] Remove the test button and n_test input from the UI

---
 cuchem/cuchem/interactive/chemvisualize.py | 26 ++--------------------
 1 file changed, 2 insertions(+), 24 deletions(-)

diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index a97021cc..c856679e 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -274,11 +274,6 @@ def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512):
             [Input('show_generated_mol', 'children'),
              Input('show_selected_mol', 'children')])(self.handle_property_tables)

-        #self.app.callback(
-        #    Output("n_test", "value"),
-        #    [Input("bt_test", "n_clicks")],
-        #    [State("n_test", "value")])(self.handle_test)
-
         self.app.callback(
             [Output('section_generated_molecules_clustered', 'style'),
              Output('gen_figure', 'figure'),
@@ -298,13 +293,6 @@ def __init__(self, cluster_wf, fingerprint_radius=2, fingerprint_nBits=512):
              State('rd_generation_type', 'value'),
              State('show_generated_mol', 'children')])(self.handle_generation)

-    def handle_test(self, bt_test, n_test):
-        comp_id, event_type = self._fetch_event_data()
-        if comp_id == 'bt_test' and event_type == 'n_clicks':
-            return n_test + 1
-        raise dash.exceptions.PreventUpdate
-
-
     def handle_add_candidate(self, bt_add_candidate,
                              bt_reset_candidates,
                              generation_candidates):
@@ -1173,14 +1161,7 @@ def constuct_layout(self):
                 dcc.Markdown("Fingerprint Size", style={'marginTop': 12,}),
                 dcc.Input(id='fingerprint_nBits', value=512),
             ], style={'marginLeft': 0, 'marginTop': '6px'}
-            ),
-
-
-            html.Div(className='row', children=[
-                dcc.Markdown("n_test", style={'marginTop': 12,}),
-                dcc.Input(id='n_test', value=0),
-            ]#, style={'marginLeft': 0, 'marginTop': '6px'}
-            ),
+            ),

             dcc.Tabs([
                 dcc.Tab(label='Cluster Molecules', children=[
@@ -1307,10 +1288,7 @@ def constuct_layout(self):
                         dbc.Button('GENERATE', id='bt_generate', n_clicks=0, style={'marginRight': 12}),
                         dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0),
                     ], style={'marginLeft': 0}),
-
-                    #html.Div(className='row', children=[
-                    #    dbc.Button('TEST', id='bt_test', n_clicks=0), #, style={'marginRight': 12}),
-                    #]), #style={'marginLeft': 0}),
+
                 ]),

                 dcc.Tab(label='Predict Properties', children=[

From d32e949de2786ba5f19719c51bb838b60da09943 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Sun, 27 Feb 2022 21:04:33 -0800
Subject: [PATCH 26/27] Restore the candidate-list hint; retitle the GENERATE button

---
 cuchem/cuchem/interactive/chemvisualize.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index c856679e..b42a8eda 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -1275,8 +1275,8 @@ def constuct_layout(self):
                     dcc.Markdown(children="""**Please Select Two**""",
                                  id="mk_selection_msg",
                                  style={'marginTop': 18}),
-                    #dcc.Markdown(children="""Click *Add* to populate this list""",
-                    #             style={'marginTop': 18}),
+                    dcc.Markdown(children="""Click *Add* to populate this list""",
+                                 style={'marginTop': 18}),
                     dcc.Checklist(
                         id='ckl_candidate_mol_id',
                         options=[],
@@ -1285,7 +1285,7 @@ def constuct_layout(self):
                         labelStyle={'display': 'block', 'marginLeft': 6, 'marginRight': 6}
                     ),
                     html.Div(className='row', children=[
-                        dbc.Button('GENERATE', id='bt_generate', n_clicks=0, style={'marginRight': 12}),
+                        dbc.Button('Generate', id='bt_generate', n_clicks=0, style={'marginRight': 12}),
                         dbc.Button('Reset', id='bt_reset_candidates', n_clicks=0),
                     ], style={'marginLeft': 0}),

From 7161687ae318dec76c5466fd83f3a6d208a47af7 Mon Sep 17 00:00:00 2001
From: Venkatesh Mysore
Date: Wed, 23 Mar 2022 22:31:40 -0700
Subject: [PATCH 27/27] Guard persist/wait for non-dask dataframes; ready for merge with dev

---
 cuchem/cuchem/interactive/chemvisualize.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cuchem/cuchem/interactive/chemvisualize.py b/cuchem/cuchem/interactive/chemvisualize.py
index b42a8eda..81c2e81f 100644
--- a/cuchem/cuchem/interactive/chemvisualize.py
+++ b/cuchem/cuchem/interactive/chemvisualize.py
@@ -650,8 +650,9 @@ def handle_analoguing(
         self.cluster_wf.df_embedding['n_union'] = self.cluster_wf.df_embedding['pc'] - self.cluster_wf.df_embedding['n_intersection'] + query_pc
         self.cluster_wf.df_embedding['similarity'] = self.cluster_wf.df_embedding['n_intersection'] / self.cluster_wf.df_embedding['n_union']
-        self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist()
-        wait(self.cluster_wf.df_embedding)
+        if hasattr(self.cluster_wf.df_embedding, 'persist'):
+            self.cluster_wf.df_embedding = self.cluster_wf.df_embedding.persist()
+            wait(self.cluster_wf.df_embedding)
         t5 = time.time()

         t0 = time.time()
         self.fp_df['similarity_cpu'] = self.fp_df['fp'].apply(lambda x: DataStructs.FingerprintSimilarity(query_fp, x))
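For reference, the n_intersection / n_union columns in this final hunk compute the Tanimoto coefficient from bit population counts: T(A, B) = |A ∩ B| / (|A| + |B| - |A ∩ B|). The same arithmetic on integer-packed fingerprints, as a CPU-only sketch (tanimoto_from_popcounts is a hypothetical helper, not the apply_rows kernel used in the workflow):

    def tanimoto_from_popcounts(fp_a: int, fp_b: int) -> float:
        """Tanimoto similarity of two bit-vector fingerprints packed into ints."""
        n_intersection = bin(fp_a & fp_b).count('1')
        pc_a = bin(fp_a).count('1')   # popcount of fingerprint A
        pc_b = bin(fp_b).count('1')   # popcount of fingerprint B
        n_union = pc_a + pc_b - n_intersection
        return n_intersection / n_union if n_union else 0.0

    # Two fingerprints sharing 2 of their 4 distinct set bits -> similarity 0.5
    assert abs(tanimoto_from_popcounts(0b1101, 0b0111) - 0.5) < 1e-9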
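The hasattr guard added by this last patch is the same duck-typing used throughout the series to support both plain cudf frames and dask-backed ones; roughly (a sketch, assuming a running dask.distributed client, with materialize as a hypothetical name):

    from dask.distributed import wait

    def materialize(df):
        # Only dask-backed frames have persist(); a plain cudf DataFrame is
        # already materialized in GPU memory and can be returned unchanged.
        if hasattr(df, 'persist'):
            df = df.persist()   # schedule the partition tasks on the workers
            wait(df)            # block until every partition has been computed
        return df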