diff --git a/bin/upload_to_gcloud.py b/bin/upload_to_gcloud.py
new file mode 100644
index 00000000..799164f9
--- /dev/null
+++ b/bin/upload_to_gcloud.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+import argparse
+import os
+import sys
+from datetime import datetime
+
+from google.cloud import storage
+
+
+def upload_evidence_to_gcloud_bucket(source_file_name, destination_folder):
+    """Uploads source_file_name to destination_folder in the Google Cloud Storage bucket.
+    The file will be renamed to cttv012-[yyyy]-[mm]-[dd].json.gz"""
+    if 'OT_BUCKET_NAME' not in os.environ or 'OT_CREDS_FILE' not in os.environ:
+        sys.exit('Environment variables OT_BUCKET_NAME and OT_CREDS_FILE must be set')
+    bucket_name = os.environ['OT_BUCKET_NAME']
+    creds_json_file = os.environ['OT_CREDS_FILE']
+
+    date_string = datetime.today().strftime('%Y-%m-%d')
+    destination_blob_name = f'{destination_folder}/cttv012-{date_string}.json.gz'
+
+    storage_client = storage.Client.from_service_account_json(creds_json_file)
+    bucket = storage_client.bucket(bucket_name)
+    blob = bucket.blob(destination_blob_name)
+
+    blob.upload_from_filename(source_file_name)
+    print(f'File {source_file_name} uploaded to gs://{bucket_name}/{destination_blob_name}.')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Upload file to Open Targets Google Cloud Storage')
+    parser.add_argument('--input-file', required=True, help='File to upload')
+    parser.add_argument('--destination-folder', required=True, help='Destination folder within bucket')
+    args = parser.parse_args()
+    upload_evidence_to_gcloud_bucket(args.input_file, args.destination_folder)
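For reviewers, a minimal sketch of how the new helper can be driven from Python rather than the CLI, assuming the `bin` directory is on the import path. The credentials path is hypothetical; the bucket and destination folder are the values used in the docs below.

```python
import os

# Hypothetical configuration for illustration; real values live in the private EVA configuration
os.environ['OT_BUCKET_NAME'] = 'otar012-eva'                   # bucket from the submission docs
os.environ['OT_CREDS_FILE'] = '/path/to/service-account.json'  # hypothetical credentials path

from upload_to_gcloud import upload_evidence_to_gcloud_bucket

# Uploads the file as gs://otar012-eva/disease-target-evidence/cttv012-<today>.json.gz
upload_evidence_to_gcloud_bucket('evidence_strings.json.gz', 'disease-target-evidence')
```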
diff --git a/docs/open-targets/environment.md b/docs/open-targets/environment.md
index 3fd98914..9fd5c0d6
--- a/docs/open-targets/environment.md
+++ b/docs/open-targets/environment.md
@@ -1,10 +1,11 @@
 # Setting up the common environment
 
-1. Log in to the LSF cluster (currently `codon`), where all data processing must take place.
+1. Log in to the Codon cluster, where all data processing must take place.
 1. Using a `become` command, switch to a common EVA production user instead of your personal account.
-1. Adjust and execute the commands below to set up the environment. Notes:
-    - The first five variables are installation-specific and are blanked in this repository. You can get the values for the EVA installation from the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets-configuration.md).
-    - These instructions use the Gitlab deployment of the pipeline onto the cluster. To use another installation just modify `CODE_ROOT` and `PYTHON_BIN` accordingly.
+1. Adjust and execute the commands below to set up the environment.
+
+Note that several variables are installation-specific and are blanked in this repository. EVA users can refer to the [private repository](https://github.com/EBIvariation/configuration/blob/master/open-targets/set-up-clinvar.sh) for values
+(or `source` the file directly on the cluster).
 
 ```bash
 # This variable should point to the directory where the clone of this repository is located on the cluster
@@ -28,4 +29,8 @@ export PYTHONPATH=${PYTHON_INSTALL_PATH}
 
 # Location of Python executable, pointing to the virtualenv
 export PYTHON_BIN=${CODE_ROOT}/env/bin/python
+
+# Required for Google Cloud upload
+export OT_BUCKET_NAME=
+export OT_CREDS_FILE=
 ```
diff --git a/docs/open-targets/generate-evidence-strings.md b/docs/open-targets/generate-evidence-strings.md
index c095ea13..251189e6
--- a/docs/open-targets/generate-evidence-strings.md
+++ b/docs/open-targets/generate-evidence-strings.md
@@ -59,14 +59,19 @@ After the evidence strings have been generated, summary metrics need to be updat
 There are also a few version numbers to record. For EFO version, compare the release date [here](https://github.com/EBISPOT/efo/releases) with manual curation date. For Ensembl version, do the same with the release date [here](https://www.ensembl.org/index.html) and the evidence string generation date.
 
 ### Submit evidence strings
-The evidence string file (`evidence_strings.json`) must be uploaded to the [Open Targets Google Cloud Storage](https://console.cloud.google.com/storage/browser/otar012-eva/) and be named in the format `cttv012-[yyyy]-[mm]-[dd].json.gz` (e.g. `cttv012-2020-10-21.json.gz`).
+The evidence string file (`evidence_strings.json`) must be compressed and uploaded to the [Open Targets Google Cloud Storage](https://console.cloud.google.com/storage/browser/otar012-eva/).
+To do this, run the following:
+```shell
+gzip evidence_strings/evidence_strings.json
+${CODE_ROOT}/bin/upload_to_gcloud.py --input-file evidence_strings/evidence_strings.json.gz --destination-folder disease-target-evidence
+```
 
 Once the upload is complete, send an email to Open Targets (data [at] opentargets.org) containing the following information from the [metrics spreadsheet](https://docs.google.com/spreadsheets/d/1g_4tHNWP4VIikH7Jb0ui5aNr0PiFgvscZYOe69g191k/):
 * The number of submitted evidence strings
 * The ClinVar release date
 * The Ensembl release
 * The EFO version used for mapping
-* The `eva-opentargets` pipeline version
+* The `CMAT` pipeline version
 * The Open Targets JSON schema version
 
 ### Submit feedback to ZOOMA
@@ -76,7 +81,7 @@ The idea with [ZOOMA](http://www.ebi.ac.uk/spot/zooma/) is that we not only use
 The files are uploaded to the FTP, where ZOOMA will pick it up. At this stage, you only need to upload the **clinvar_xrefs** dataset (the *eva_clinvar* dataset is updated in the process of the manual curation).
 
-To make changes to the FTP, you will need to log in to the cluster using your **personal account** and then you will need to have set up the common environment like in step 1.
+To make changes to the FTP, you will need to log in to a datamover node.
 
 ```bash
 # Create the folder, copy the file to FTP, and update the “latest” folder
 FTP_PATH=${FTP_PATH_BASE}/`date +%Y/%m/%d`
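A quick way to confirm an upload landed where expected is to list the bucket with the same `google-cloud-storage` client the script depends on. A sketch, with a hypothetical credentials path and the bucket and folder values from the docs above:

```python
from google.cloud import storage

# Authenticate with the same service-account JSON the upload script uses (hypothetical path)
client = storage.Client.from_service_account_json('/path/to/service-account.json')

# List submissions under the destination folder used in the docs above
for blob in client.list_blobs('otar012-eva', prefix='disease-target-evidence/cttv012-'):
    print(blob.name, blob.size, blob.updated)
```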
diff --git a/requirements.txt b/requirements.txt
index c10d2182..3bd22bcd
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 coverage==6.5.0
 coveralls==3.3.1
+google-cloud-storage==3.5.0
 jsonschema==4.23.0
 numpy==1.26.0
 pandas==2.2.3
diff --git a/setup.py b/setup.py
index 46fa62e9..161c2c2e
--- a/setup.py
+++ b/setup.py
@@ -43,5 +43,6 @@ def get_requires():
     tests_require=get_requires(),
     setup_requires=get_requires(),
     test_suite='tests',
-    classifiers=classifiers
+    classifiers=classifiers,
+    scripts=[os.path.join(os.path.dirname(__file__), 'bin', 'upload_to_gcloud.py')]
 )
diff --git a/tests/trait_mapping/test_main.py b/tests/trait_mapping/test_main.py
index 1d8ebab0..1821595e
--- a/tests/trait_mapping/test_main.py
+++ b/tests/trait_mapping/test_main.py
@@ -97,17 +97,17 @@ def test_ols_exact_match(self):
     def test_zooma_high_confidence(self):
         # Finds nothing exact via OLS, so goes through Zooma as well and finds a high-confidence result
         trait = Trait('11p partial monosomy syndrome', None, None)
-        processed_trai = self.run_process_trait(trait)
-        assert len(processed_trai.ols_result_list) == 6
-        assert len(processed_trai.zooma_result_list) == 1
-        assert processed_trai.is_finished
+        processed_trait = self.run_process_trait(trait)
+        assert len(processed_trait.ols_result_list) == 6
+        assert len(processed_trait.zooma_result_list) == 2
+        assert processed_trait.is_finished
 
     def test_not_finished(self):
         # No sufficiently good mappings in OLS or Zooma
         trait = Trait('aicardi-goutieres syndrome 99', None, None)
         processed_trait = self.run_process_trait(trait)
         assert len(processed_trait.ols_result_list) == 0
-        assert len(processed_trait.zooma_result_list) == 15
+        assert len(processed_trait.zooma_result_list) == 19
         assert not processed_trait.is_finished
 
     def test_ols_exact_ascii_match(self):
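One note on the test updates: the expected ZOOMA counts (1 becomes 2, 15 becomes 19) pin whatever the live service currently returns, so they are likely to drift again with future ZOOMA releases. If that churn becomes a problem, a looser assertion is an option; a sketch of how `test_zooma_high_confidence` might be restated, not part of this change:

```python
def test_zooma_high_confidence(self):
    # Still exercises the OLS-miss-then-ZOOMA path, without pinning exact result counts
    trait = Trait('11p partial monosomy syndrome', None, None)
    processed_trait = self.run_process_trait(trait)
    assert len(processed_trait.zooma_result_list) >= 1  # at least one candidate from the live service
    assert processed_trait.is_finished                  # a high-confidence mapping was accepted
```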