From bae1fbe5bb2dc2c03934e5387c2fa262eb99a38d Mon Sep 17 00:00:00 2001
From: khetherin <hetherin@ebi.ac.uk>
Date: Thu, 12 Feb 2026 16:27:27 +0000
Subject: [PATCH] added metadataJSON.py and query_mapper.yaml

---
 convert_gvf_to_vcf/etc/query_mapper.yaml |  26 ++++
 convert_gvf_to_vcf/metadataJSON.py       | 190 +++++++++++++++++++++++
 2 files changed, 216 insertions(+)
 create mode 100644 convert_gvf_to_vcf/etc/query_mapper.yaml
 create mode 100644 convert_gvf_to_vcf/metadataJSON.py

diff --git a/convert_gvf_to_vcf/etc/query_mapper.yaml b/convert_gvf_to_vcf/etc/query_mapper.yaml
new file mode 100644
index 0000000..9b5a60d
--- /dev/null
+++ b/convert_gvf_to_vcf/etc/query_mapper.yaml
@@ -0,0 +1,26 @@
+###############################################################################
+# WARNING: never build SQL with f-strings or the % operator; use %s placeholders plus a separate list of values
+SUBMITTER_DETAILS:
+  first_name_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  first_name_query_placeholders: ["value1", "value2", 1111111]
+  last_name_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  last_name_query_placeholders: ["value3", "value4", 2222222]
+  email_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  email_query_placeholders: ["value5", "value6", 3333333]
+  centre_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  centre_query_placeholders: ["value7", "value8", 4444444]
+PROJECT_PREREGISTERED:
+  project_accession_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  project_accession_query_placeholders: ["value9", "value10", 555555]
+PROJECT_NEW:
+  title_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  title_query_placeholders: ["value11", "value12", 6666666]
+  project_description_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  project_description_query_placeholders: ["value13", "value14", 7777777]
+  tax_id_query: "SELECT * FROM table WHERE x = %s AND y = %s AND z > %s"
+  tax_id_query_placeholders: ["value15", "value16", 888888888]
+# ANALYSIS
+# SAMPLES - PREREGISTERED
+# SAMPLES - NEW
+# FILES
+###############################################################################
\ No newline at end of file
diff --git a/convert_gvf_to_vcf/metadataJSON.py b/convert_gvf_to_vcf/metadataJSON.py
new file mode 100644
index 0000000..b0ce92f
--- /dev/null
+++ b/convert_gvf_to_vcf/metadataJSON.py
@@ -0,0 +1,190 @@
+import json
+import re
+
+import psycopg2
+import yaml
+
+from ebi_eva_common_pyutils.config import cfg
+from ebi_eva_common_pyutils.logger import logging_config as log_cfg
+
+logger = log_cfg.get_logger(__name__)  # module-level logger, requested under this module's name
+
+
+class MetadataManager:
+    """
+    The responsibility of this class is to manage the metadata for submission to EVA.
+    It will extract the required properties to fill in the EVA JSON submission schema
+    (https://github.com/EBIvariation/eva-sub-cli/blob/main/eva_sub_cli/etc/eva_schema.json)
+    by querying the DGVa database.
+    It will generate the metadata submission JSON file.
+    """
+    def __init__(self, path_to_config_yaml, path_to_sql_queries_yaml):
+        # coming from the config file
+        cfg.load_config_file(path_to_config_yaml)  # cfg is a dictionary
+        # db connection setup
+        self.connection = None # no connection yet
+        self.host = cfg.get("key_to_host") # get information from the config dictionary
+        self.port = cfg.get("key_to_port")
+
+        # load sql queries and their placeholders
+        sql_map = self.load_sql_registry(path_to_sql_queries_yaml)
+        # coming from DGVa
+        # the following are dict values obtained from the DGVa database, may need to parse to obtain the value itself
+        # SUBMITTER DETAILS
+        self.first_name = self.load_from_db(sql_map["SUBMITTER_DETAILS"]["first_name_query"], sql_map["SUBMITTER_DETAILS"]["first_name_query_placeholders"])
+        self.last_name = self.load_from_db(sql_map["SUBMITTER_DETAILS"]["last_name_query"], sql_map["SUBMITTER_DETAILS"]["last_name_query_placeholders"])
+        self.email = self.load_from_db(sql_map["SUBMITTER_DETAILS"]["email_query"], sql_map["SUBMITTER_DETAILS"]["email_query_placeholders"])
+        self.laboratory = "PLACEHOLDER VALUE"
+        self.centre = self.load_from_db(sql_map["SUBMITTER_DETAILS"]["centre_query"], sql_map["SUBMITTER_DETAILS"]["centre_query_placeholders"])
+        # PROJECT - PREREGISTERED
+        self.project_accession = self.load_from_db(sql_map["PROJECT_PREREGISTERED"]["project_accession_query"],
+                                                   sql_map["PROJECT_PREREGISTERED"]["project_accession_query_placeholders"]) # required
+        regex_pattern = r"^PRJ(E|D|N)[A-Z][0-9]+$" # regex for project accession
+        assert re.fullmatch(regex_pattern, self.project_accession), f"Invalid project accession: {self.project_accession} "
+        # PROJECT - NEW
+        self.title = self.load_from_db(sql_map["PROJECT_NEW"]["title_query"], sql_map["PROJECT_NEW"]["title_query_placeholders"])
+        self.project_description = self.load_from_db(sql_map["PROJECT_NEW"]["project_description_query"], sql_map["PROJECT_NEW"]["project_description_query_placeholders"])
+        self.tax_id = self.load_from_db(sql_map["PROJECT_NEW"]["tax_id_query"],
+            sql_map["PROJECT_NEW"]["tax_id_query_placeholders"])
+        assert isinstance(self.tax_id, int), f"Taxonomy id must be an integer. Taxa ID provided: {self.tax_id}"
+        # ANALYSIS
+        # SAMPLES - PREREGISTERED
+        # SAMPLES - NEW
+        # FILES
+
+    @staticmethod
+    def load_sql_registry(path_to_sql_queries_yaml):
+        with open(path_to_sql_queries_yaml, "r") as query_file:
+            return yaml.safe_load(query_file)
+
+    def load_from_db(self, query_to_load, query_place_holder_to_load):
+        try:
+            # create a connection to the database
+            self.connection = psycopg2.connect(host=self.host, port=self.port, database=self.database,
+                                               username=self.username, password=self.pw)
+            # create the iterator to process queries
+            with self.connection.cursor() as cur:
+                # prepare to fetch data
+                ###############################################################################
+                # WARNING: do not use f-strings or % formatting here, use placeholders instead
+                query = query_to_load
+                query_placeholders = query_place_holder_to_load
+                ###############################################################################
+                # run the sql query
+                cur.execute(query, query_placeholders)
+                # fetch the data from the cursor
+                row = cur.fetchall()
+                if row:
+                    logger.info(f"Fetching metadata query - SUCCESS - {len(dict)} records found")
+                    # sql row object is converted to python dict
+                    return dict(row)
+                else:
+                    logger.info("Fetching metadata query - SUCCESS - 0 records found")
+                    return {}
+        except Exception as e:
+            logger.warning(f"Database error: {e}")
+            # rollback failed transaction
+            self.connection.rollback()
+            return {}
+        finally:
+            self.connection.close()
+            logger.info("Connection closed safely - SUCCESS")
+
+    def get_submitter_details(self):
+        submitter_details_array = [
+            {
+                "lastName": self.last_name,
+                "firstName": self.first_name,
+                "email": self.email,
+                "laboratory": self.laboratory,
+                "centre": self.centre
+            }
+        ]
+        return submitter_details_array
+
+    def get_project_pre_registered(self):
+        # return project_object
+        # requires a project accession
+        # check project accession meets the regex
+        pass
+
+    def get_project_new(self):
+        # return project object
+        # requries title, description, taxID, centre
+        # check taxID is int
+        pass
+
+    def get_analysis(self):
+        # return analysis_array
+        pass
+
+    def get_sample_pre_registered(self):
+        # return sample_array
+        # requires analysisAlias, sampleinVCF, biosample_accession
+        pass
+
+    def get_sample_new(self):
+        # return sample_array
+        # requires analysisAlias, sampleinVCF, bioSampleObject
+        # bioSampleObject requires = name, taxID, scientific_name, release (hold-date) which can be found in DGVA
+        # bioSampleObject requires = collection date, geo loc which can be set to unknown/not collected
+        pass
+
+    def get_files(self):
+        # return files_array
+        # requires analysisAlias, fileName
+        pass
+
+
+    def determine_project_pre_registered(self):
+        if self.project_accession:
+            is_project_preregistered = True
+        else:
+            is_project_preregistered = False
+        return is_project_preregistered
+
+    def determine_sample_pre_registered(self):
+        if self.biosample_accession:
+            is_sample_preregistered = True
+        else:
+            is_sample_preregistered = False
+        return is_sample_preregistered
+
+    def create_json_file(self):
+        # determine if project new or pre-registered
+        is_project_preregistered = self.determine_project_pre_registered()
+        if is_project_preregistered:
+            project_metadata = self.get_project_pre_registered()
+        else:
+            project_metadata = self.get_project_new()
+
+        # determine if sample new or pre-registered
+        is_sample_preregistered = self.determine_sample_pre_registered()
+        if is_sample_preregistered:
+            sample_metadata = self.get_sample_pre_registered()
+        else:
+            sample_metadata = self.get_sample_new()
+
+        json_in_eva_format = {
+            "submitterDetails": self.get_submitter_details(),
+            "project": project_metadata,
+            "analysis": self.get_analysis(),
+            "sample": sample_metadata,
+            "files": self.get_files()
+        }
+        return json_in_eva_format
+
+    def write_json_file(self, json_file_path, json_in_eva_format):
+        # provide it with the dictionary and write the file using json.dump
+        with open(json_file_path, 'w') as f:
+            json.dump(json_in_eva_format, f, indent=4)
+        logger.info(f"Write JSON file - SUCCESS: {json_file_path}")
+
+# def main():
+    # manager = MetadataManager(path_to_config_yaml, path_to_sql_queries_yaml)
+    # root_json = manager.create_json_file()
+    # manager.write_json_file(path_to_json_file, root_json)
+
+#
+# if __name__ == "__main__":
+#     main()