diff --git a/Makefile b/Makefile index c3baa81..8c1c446 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,6 @@ .PHONY: test test: + docker-compose down docker-compose run spec sh /app/test/run_tests.sh + docker-compose down \ No newline at end of file diff --git a/data_sources/djornl.yaml b/data_sources/djornl.yaml new file mode 100644 index 0000000..495aa8a --- /dev/null +++ b/data_sources/djornl.yaml @@ -0,0 +1,5 @@ +name: djornl +category: network +title: Jacobson Lab Exascale Networking data +home_url: https://github.com/kbase/exascale_data +data_url: https://github.com/kbase/exascale_data/releases/latest diff --git a/importers/djornl/manifest.schema.json b/importers/djornl/manifest.schema.json new file mode 100644 index 0000000..e29ab28 --- /dev/null +++ b/importers/djornl/manifest.schema.json @@ -0,0 +1,52 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "Exascale parser file manifest", + "type": "array", + "items": { + "type": "object", + "required": ["data_type", "path"], + "oneOf": [ + { + "properties": { + "data_type": { "enum": ["cluster"] } + }, + "required": [ "prefix" ] + }, + { + "properties": { + "data_type": { "enum": [ "node", "edge" ] } + } + } + ], + "properties": { + "data_type": { + "title": "Data type", + "type": "string", + "enum": ["node", "edge", "cluster"] + }, + "creation_date": { + "title": "File creation date", + "description": "date of file creation in the format YYYY-MM-DD", + "type": "string", + "format": "date" + }, + "description": { + "title": "Description of the cluster set", + "type": "string" + }, + "path": { + "title": "File path", + "type": "string" + }, + "prefix": { + "title": "Prefix", + "type": "string", + "description": "The prefix to be used for clusters, e.g. markov_i2:4. 
Required for cluster data, not used for node or edge data" + }, + "title": { + "title": "Name of the cluster set", + "type": "string" + } + } + } +} diff --git a/importers/djornl/parser.py b/importers/djornl/parser.py index 9eb5c87..5295fdb 100644 --- a/importers/djornl/parser.py +++ b/importers/djornl/parser.py @@ -8,6 +8,8 @@ import requests import os import csv +import yaml +from jsonschema.validators import Draft7Validator import importers.utils.config as config @@ -28,34 +30,49 @@ def _configure(self): configuration['_NODE_NAME'] = 'djornl_node' configuration['_EDGE_NAME'] = 'djornl_edge' - # Path config - configuration['_NODE_PATH'] = os.path.join( - configuration['ROOT_DATA_PATH'], - 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv' - ) - configuration['_NODE_FILE_COL_COUNT'] = 20 + # read the manifest file, which contains path and file type info + manifest_file = os.path.join(configuration['ROOT_DATA_PATH'], 'manifest.yaml') + + try: + with open(manifest_file) as fd: + manifest = yaml.safe_load(fd) + except FileNotFoundError: + raise RuntimeError( + f"No manifest file found at {manifest_file}.\n" + + "Please ensure that you have created a manifest that lists the files " + + "in the release" + ) + + # load the schema for the manifest and ensure that it is valid + schema_file = os.path.join(os.path.dirname(__file__), 'manifest.schema.json') + with open(schema_file) as fd: + manifest_schema = json.load(fd) + + validator = Draft7Validator(manifest_schema) + if not validator.is_valid(manifest): + raise RuntimeError( + "The manifest file failed validation with the following errors:\n" + + "\n".join(e.message for e in sorted(validator.iter_errors(manifest), key=str)) + + "\nPlease recheck the file and try again." 
+ ) + + # make sure all the files listed actually exist + for type in ['node', 'edge', 'cluster']: + configuration[type + '_files'] = [] + + for file in manifest: + file_path = os.path.join(configuration['ROOT_DATA_PATH'], file['path']) + + if not os.path.exists(file_path): + raise RuntimeError(f"{file_path}: file does not exist") + + if not os.path.isfile(file_path): + raise RuntimeError(f"{file_path}: not a file") + + # add the file to the appropriate list + file['file_path'] = file_path + configuration[file['data_type'] + '_files'].append(file) - configuration['_EDGE_PATH'] = os.path.join( - configuration['ROOT_DATA_PATH'], - 'merged_edges-AMW-060820_AF.tsv' - ) - configuration['_EDGE_FILE_COL_COUNT'] = 5 - - _CLUSTER_BASE = os.path.join(configuration['ROOT_DATA_PATH'], 'cluster_data') - configuration['_CLUSTER_PATHS'] = { - 'cluster_I2': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv' - ), - 'cluster_I4': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv' - ), - 'cluster_I6': os.path.join( - _CLUSTER_BASE, - 'out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv' - ), - } self._config = configuration return self._config @@ -76,33 +93,44 @@ def load_edges(self): node_ix = {} edges = [] node_name = self.config()['_NODE_NAME'] - expected_col_count = self.config()['_EDGE_FILE_COL_COUNT'] - - with open(self.config()['_EDGE_PATH']) as fd: - csv_reader = csv.reader(fd, delimiter='\t') - next(csv_reader, None) # skip headers - line_no = 1 - for row in csv_reader: - line_no += 1 - - cols = [c.strip() for c in row] - if len(cols) != expected_col_count: - n_cols = len(cols) - raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}") - - node_ix[cols[0]] = 1 - node_ix[cols[1]] = 1 - edge_type = cols[4] - if edge_type not in edge_remap: - raise RuntimeError(f"line {line_no}: invalid edge type: {edge_type}") - - 
edges.append({ - '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}', - '_from': f'{node_name}/{cols[0]}', - '_to': f'{node_name}/{cols[1]}', - 'score': float(cols[2]), - 'edge_type': edge_remap[edge_type], - }) + expected_col_count = 0 + headers = [] + + for file in self.config()['edge_files']: + with open(file['file_path']) as fd: + csv_reader = csv.reader(fd, delimiter='\t') + line_no = 0 + for row in csv_reader: + line_no += 1 + if len(row) <= 1 or row[0][0] == '#': + # comment / metadata + continue + + cols = [c.strip() for c in row] + + if len(cols) != expected_col_count: + n_cols = len(cols) + + if len(headers) == 0: + expected_col_count = len(cols) + headers = cols + continue + + raise RuntimeError(f"{file['path']} line {line_no}: expected {expected_col_count} cols, found {n_cols}") + + node_ix[cols[0]] = 1 + node_ix[cols[1]] = 1 + edge_type = cols[4] + if edge_type not in edge_remap: + raise RuntimeError(f"{file['path']} line {line_no}: invalid edge type: {edge_type}") + + edges.append({ + '_key': f'{cols[0]}__{cols[1]}__{edge_remap[edge_type]}__{cols[2]}', + '_from': f'{node_name}/{cols[0]}', + '_to': f'{node_name}/{cols[1]}', + 'score': float(cols[2]), + 'edge_type': edge_remap[edge_type], + }) return { 'nodes': [{'_key': n} for n in node_ix.keys()], @@ -114,73 +142,108 @@ def load_node_metadata(self): """Load node metadata""" nodes = [] - expected_col_count = self.config()['_NODE_FILE_COL_COUNT'] - with open(self.config()['_NODE_PATH']) as fd: - csv_reader = csv.reader(fd, delimiter=',') - next(csv_reader, None) # skip headers - line_no = 1 - for row in csv_reader: - line_no += 1 - - cols = [c.strip() for c in row] - if len(cols) != expected_col_count: - n_cols = len(cols) - raise RuntimeError(f"line {line_no}: expected {expected_col_count} cols, found {n_cols}") - - _key = cols[0] - node_type = cols[1] - if node_type != 'gene' and node_type != 'pheno': - raise RuntimeError(f"line {line_no}: invalid node type: {node_type}") - - go_terms 
= [c.strip() for c in cols[10].split(',')] if len(cols[10]) else [] - - doc = { - '_key': _key, - 'node_type': node_type, - 'transcript': cols[2], - 'gene_symbol': cols[3], - 'gene_full_name': cols[4], - 'gene_model_type': cols[5], - 'tair_computational_desc': cols[6], - 'tair_curator_summary': cols[7], - 'tair_short_desc': cols[8], - 'go_descr': cols[9], - 'go_terms': go_terms, - 'mapman_bin': cols[11], - 'mapman_name': cols[12], - 'mapman_desc': cols[13], - 'pheno_aragwas_id': cols[14], - 'pheno_desc1': cols[15], - 'pheno_desc2': cols[16], - 'pheno_desc3': cols[17], - 'pheno_ref': cols[18], - 'user_notes': cols[19], - } - nodes.append(doc) + headers = [] + expected_col_count = 0 + valid_node_types = ['gene', 'pheno'] + for file in self.config()['node_files']: + with open(file['file_path']) as fd: + csv_reader = csv.reader(fd, delimiter=',') + line_no = 0 + for row in csv_reader: + line_no += 1 + if len(row) <= 1 or row[0][0] == '#': + # comment / metadata + continue + + cols = [c.strip() for c in row] + if len(cols) != expected_col_count: + + if len(headers) == 0: + # this is the header row; set up the expected column count + expected_col_count = len(cols) + headers = cols + continue + + # otherwise, this row does not have the correct number of columns + n_cols = len(cols) + raise RuntimeError(f"{file['path']} line {line_no}: expected {expected_col_count} cols, found {n_cols}") + + _key = cols[0] + node_type = cols[1] + if node_type not in valid_node_types: + raise RuntimeError(f"{file['path']} line {line_no}: invalid node type: {node_type}") + + go_terms = [c.strip() for c in cols[10].split(',')] if len(cols[10]) else [] + + doc = { + '_key': _key, + 'node_type': node_type, + 'transcript': cols[2], + 'gene_symbol': cols[3], + 'gene_full_name': cols[4], + 'gene_model_type': cols[5], + 'tair_computational_desc': cols[6], + 'tair_curator_summary': cols[7], + 'tair_short_desc': cols[8], + 'go_descr': cols[9], + 'go_terms': go_terms, + 'mapman_bin': cols[11], + 
def load_cluster_data(self):
    """Annotate nodes with the IDs of the clusters they belong to.

    Every entry of ``self.config()['cluster_files']`` is read as a
    tab-separated file. The first cell of a data row names the cluster
    (e.g. ``Cluster1``) and the remaining cells are the keys of the nodes
    in that cluster. Blank lines, rows without any member cell and rows
    whose first cell starts with ``#`` (comments / metadata) are ignored.

    Returns:
        A dict with the single key ``nodes``: a list of
        ``{'_key': <node key>, 'clusters': [<prefix>:<cluster id>, ...]}``
        documents, one per node, in the order the nodes were first seen.

    Raises:
        RuntimeError: if a data row has an empty cluster name.
    """
    # node key -> list of cluster IDs, both in first-seen order
    node_ix = {}
    for cluster_file in self.config()['cluster_files']:
        cluster_label = cluster_file['prefix']
        # newline='' lets the csv module do its own newline handling
        with open(cluster_file['file_path'], newline='') as fd:
            csv_reader = csv.reader(fd, delimiter='\t')
            for line_no, row in enumerate(csv_reader, start=1):
                # startswith() is safe on an empty first cell, whereas
                # indexing row[0][0] raised IndexError for such rows
                if len(row) <= 1 or row[0].startswith('#'):
                    # comment / metadata
                    continue

                if not row[0].strip():
                    raise RuntimeError(
                        f"{cluster_file['path']} line {line_no}: missing cluster ID"
                    )

                self._parse_cluster_row(row, cluster_label, node_ix)

    # gather a list of cluster IDs for each node
    return {
        'nodes': [
            {'_key': key, 'clusters': cluster_ids}
            for key, cluster_ids in node_ix.items()
        ]
    }


def _parse_cluster_row(self, row, cluster_label, node_ix):
    """Record the cluster membership described by one row of a cluster file.

    Args:
        row: cells of the row; ``row[0]`` is the cluster name and
            ``row[1:]`` are the keys of the member nodes.
        cluster_label: prefix identifying the clustering, prepended to the
            cluster number to form the ID (``<prefix>:<number>``).
        node_ix: dict mapping node key to its list of cluster IDs; updated
            in place.
    """
    # remove the 'Cluster' text; the ID is the same for every member of the
    # row, so build it once rather than once per node
    cluster_id = cluster_label + ':' + row[0].strip().replace('Cluster', '')

    for cell in row[1:]:
        # strip cells like the node and edge loaders do; a trailing tab
        # yields an empty cell, which is not a node
        key = cell.strip()
        if not key:
            continue

        clusters = node_ix.setdefault(key, [])
        if cluster_id not in clusters:
            clusters.append(cluster_id)
def check_data_delta(self):
    """Load the edge, node metadata and cluster datasets and cross-check them.

    The three datasets are loaded with ``load_edges``, ``load_node_metadata``
    and ``load_cluster_data`` and handed to ``check_deltas``, which prints
    the report.
    """
    edge_data = self.load_edges()
    node_metadata = self.load_node_metadata()
    clusters = self.load_cluster_data()

    self.check_deltas(edge_data=edge_data, node_metadata=node_metadata, cluster_data=clusters)


def check_deltas(self, edge_data=None, node_metadata=None, cluster_data=None):
    """Print a consistency report for the three loaded datasets.

    Reports, on stdout, the node keys that appear in the cluster data or in
    the edge data but have no node metadata, followed by the total number of
    edges and of distinct nodes.

    Args:
        edge_data: dict with a ``nodes`` list of ``{'_key': ...}`` documents
            and an ``edges`` list.
        node_metadata: dict with a ``nodes`` list of documents with ``_key``.
        cluster_data: dict with a ``nodes`` list of documents with ``_key``.

    Any argument that is omitted, ``None``, or lacks the expected list is
    treated as an empty dataset.
    """
    # None instead of a shared mutable {} default; .get() keeps the omitted
    # case usable, where indexing an empty dict raised KeyError
    edge_data = edge_data or {}
    node_metadata = node_metadata or {}
    cluster_data = cluster_data or {}

    edge_nodes = {n['_key'] for n in edge_data.get('nodes', [])}
    node_metadata_nodes = {n['_key'] for n in node_metadata.get('nodes', [])}
    cluster_nodes = {n['_key'] for n in cluster_data.get('nodes', [])}
    all_nodes = edge_nodes | node_metadata_nodes | cluster_nodes

    # check all nodes in cluster_data have node_metadata
    clstr_no_node_md_set = cluster_nodes - node_metadata_nodes
    if clstr_no_node_md_set:
        print({'clusters with no node metadata': clstr_no_node_md_set})

    # check all nodes in the edge_data have node_metadata
    edge_no_node_md_set = edge_nodes - node_metadata_nodes
    if edge_no_node_md_set:
        print({'edges with no node metadata': edge_no_node_md_set})

    # count all edges
    print("Dataset contains " + str(len(edge_data.get('edges', []))) + " edges")
    # count all nodes
    print("Dataset contains " + str(len(all_nodes)) + " nodes")
properties: _key: type: string diff --git a/schemas/djornl/djornl_node.yaml b/schemas/djornl/djornl_node.yaml index 9248f1c..a7b44a8 100644 --- a/schemas/djornl/djornl_node.yaml +++ b/schemas/djornl/djornl_node.yaml @@ -2,6 +2,10 @@ name: djornl_node type: vertex delta: false +indexes: + - type: hash + fields: ["clusters[*]"] + schema: "$schema": http://json-schema.org/draft-07/schema# title: Gene and Phenotype Vertices @@ -13,21 +17,14 @@ schema: type: string title: Key examples: ["AT1G01010"] - cluster_I2: - type: integer - title: Cluster 2 ID - description: Iterative random forest cluster group ID - examples: [1] - cluster_I4: - type: integer - title: Cluster 4 ID - description: Iterative random forest cluster group ID - examples: [13] - cluster_I6: - type: integer - title: Cluster 6 ID - description: Iterative random forest cluster group ID - examples: [27] + clusters: + type: array + title: Clusters + description: Clusters to which the node has been assigned + items: + type: string +# pattern: "^\w+:\d+$" + examples: [["markov_i2:1", "markov_i4:5"], ["markov_i6:3"]] node_type: type: string title: Node type diff --git a/stored_queries/djornl/djornl_fetch_clusters.yaml b/stored_queries/djornl/djornl_fetch_clusters.yaml index 4c6b8c5..1fadca3 100644 --- a/stored_queries/djornl/djornl_fetch_clusters.yaml +++ b/stored_queries/djornl/djornl_fetch_clusters.yaml @@ -2,25 +2,13 @@ name: djornl_fetch_clusters description: Fetch all nodes that are members of the specified cluster(s), and the edges and nodes within the specified distance (number of hops) of those nodes. 
params: type: object + required: [cluster_ids] properties: - cluster_i2_ids: - title: Cluster I2 IDs - description: Cluster I2 IDs to locate - items: {type: integer} - default: [] - examples: [[1], [3, 5]] - cluster_i4_ids: - title: Cluster I4 IDs - description: Cluster I4 IDs to locate - items: {type: integer} - examples: [[2], [4, 6]] - default: [] - cluster_i6_ids: - title: Cluster I6 IDs - description: Cluster I6 IDs to locate - items: {type: integer} - examples: [[666], [999, 333]] - default: [] + cluster_ids: + title: Cluster IDs + description: Cluster IDs, in the form "clustering_system_name:cluster_id" + items: {type: string} + examples: [['markov_i2:5', 'markov_i6:2'],['markov_i6:1']] distance: type: integer title: Traversal Distance @@ -31,7 +19,7 @@ params: query: | LET node_ids = ( FOR n IN djornl_node - FILTER n.cluster_I2 IN @cluster_i2_ids OR n.cluster_I4 IN @cluster_i4_ids OR n.cluster_I6 IN @cluster_i6_ids + FILTER n.clusters ANY IN @cluster_ids FOR node IN 0..@distance ANY n djornl_edge OPTIONS {bfs: true, uniqueVertices: "global"} RETURN DISTINCT node._id diff --git a/test/djornl/col_count_errors/manifest.yaml b/test/djornl/col_count_errors/manifest.yaml new file mode 100644 index 0000000..88ab96d --- /dev/null +++ b/test/djornl/col_count_errors/manifest.yaml @@ -0,0 +1,5 @@ +- data_type: edge + path: merged_edges-AMW-060820_AF.tsv + +- data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv diff --git a/test/djornl/empty_files/manifest.yaml b/test/djornl/empty_files/manifest.yaml new file mode 100644 index 0000000..7d42ff6 --- /dev/null +++ b/test/djornl/empty_files/manifest.yaml @@ -0,0 +1,17 @@ +- data_type: edge + path: merged_edges-AMW-060820_AF.tsv + +- data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv + +- data_type: cluster + prefix: markov_i2 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv + +- data_type: cluster + prefix: markov_i4 + path: 
cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv + +- data_type: cluster + prefix: markov_i6 + path: cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv diff --git a/test/djornl/invalid_file/edges.tsv/touch b/test/djornl/invalid_file/edges.tsv/touch new file mode 100644 index 0000000..e69de29 diff --git a/test/djornl/invalid_file/manifest.yaml b/test/djornl/invalid_file/manifest.yaml new file mode 100644 index 0000000..3a12de5 --- /dev/null +++ b/test/djornl/invalid_file/manifest.yaml @@ -0,0 +1,9 @@ +- data_type: edge + path: edges.tsv + +- data_type: node + path: nodes.csv + +- data_type: cluster + prefix: markov_i2 + path: clusters.tsv diff --git a/test/djornl/invalid_manifest/manifest.yaml b/test/djornl/invalid_manifest/manifest.yaml new file mode 100644 index 0000000..e7fa88e --- /dev/null +++ b/test/djornl/invalid_manifest/manifest.yaml @@ -0,0 +1,10 @@ +- data_type: edge + path: edges.tsv + +- data_type: node + +- data_type: cluster + path: clusters.tsv + +- data_type: ping-pong balls + path: where? 
\ No newline at end of file diff --git a/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv index af5fa6c..543dd99 100644 --- a/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/test/djornl/invalid_types/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv @@ -1,4 +1,5 @@ node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes +# data_type: node As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. 
al, Nature 2010", AT1G01010,Monkey,AT1G01010.1,NTL10,NAC domain containing protein 1,protein_coding,NAC domain containing protein 1;(source:Araport11),,NAC domain containing protein 1,"DNA-binding transcription factor activity, DNA binding","GO:0003700, GO:0003677",15.5.17,.RNA biosynthesis.transcriptional regulation.transcription factor (NAC),transcription factor (NAC) (original description: pep chromosome:TAIR10:1:3631:5899:1 gene:AT1G01010 transcript:AT1G01010.1 gene_biotype:protein_coding transcript_biotype:protein_coding gene_symbol:NAC001 description:NAC domain-containing protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q0WV96]),,,,,, diff --git a/test/djornl/invalid_types/manifest.yaml b/test/djornl/invalid_types/manifest.yaml new file mode 100644 index 0000000..88ab96d --- /dev/null +++ b/test/djornl/invalid_types/manifest.yaml @@ -0,0 +1,5 @@ +- data_type: edge + path: merged_edges-AMW-060820_AF.tsv + +- data_type: node + path: aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv diff --git a/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv b/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv index f9857bd..a98f49f 100644 --- a/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv +++ b/test/djornl/invalid_types/merged_edges-AMW-060820_AF.tsv @@ -1,3 +1,4 @@ +# data_type: edge node1 node2 edge edge_descrip layer_descrip As2 AT1G01020 8.422046084731258 AraGWAS-Association_score AraGWAS-Some-Old-Rubbish-I-Made-Up As2 AT1G01040 5.422046084731258 AraGWAS-Association_score AraGWAS-Phenotype_Associations diff --git a/test/djornl/missing_files/manifest.yaml b/test/djornl/missing_files/manifest.yaml new file mode 100644 index 0000000..3a12de5 --- /dev/null +++ b/test/djornl/missing_files/manifest.yaml @@ -0,0 +1,9 @@ +- data_type: edge + path: edges.tsv + +- data_type: node + path: nodes.csv + +- data_type: cluster + prefix: markov_i2 + path: clusters.tsv diff --git a/test/djornl/results.json b/test/djornl/results.json index a844c2c..7fd3a4d 100644 --- 
a/test/djornl/results.json +++ b/test/djornl/results.json @@ -27,21 +27,15 @@ }, "load_cluster_data": { "nodes": [ - {"_key": "AT1G01010", "cluster_I2": 1}, - {"_key": "AT1G01030", "cluster_I2": 1}, - {"_key": "AT1G01040", "cluster_I2": 1}, - {"_key": "AT1G01050", "cluster_I2": 2}, - {"_key": "AT1G01060", "cluster_I2": 2}, - {"_key": "AT1G01070", "cluster_I2": 2}, - {"_key": "AT1G01080", "cluster_I2": 3}, - {"_key": "AT1G01090", "cluster_I2": 3}, - {"_key": "AT1G01020", "cluster_I2": 5}, - {"_key": "AT1G01040", "cluster_I6": 1}, - {"_key": "AT1G01090", "cluster_I6": 1}, - {"_key": "AT1G01070", "cluster_I6": 2}, - {"_key": "AT1G01010", "cluster_I6": 3}, - {"_key": "AT1G01020", "cluster_I6": 3}, - {"_key": "AT1G01030", "cluster_I6": 3} + {"_key": "AT1G01010", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01030", "clusters": ["markov_i2:1", "markov_i6:3"]}, + {"_key": "AT1G01040", "clusters": ["markov_i2:1", "markov_i6:1"]}, + {"_key": "AT1G01050", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01060", "clusters": ["markov_i2:2"]}, + {"_key": "AT1G01070", "clusters": ["markov_i2:2", "markov_i6:2"]}, + {"_key": "AT1G01080", "clusters": ["markov_i2:3"]}, + {"_key": "AT1G01090", "clusters": ["markov_i2:3", "markov_i6:1"]}, + {"_key": "AT1G01020", "clusters": ["markov_i2:5", "markov_i6:3"]} ] }, "load_node_metadata": { @@ -93,220 +87,265 @@ ] }, "fetch_genes": { - "AT1G01010": { - "0": { - "nodes": ["AT1G01010"], - "edges": [] + "keys": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": [ - "AT1G01010", - "AT1G01020", - "AT1G01030", - "AT1G01040" - ], - "edges": [ - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5" - ] + "AT1G01010": { + "distance": { + "0": { + "nodes": ["AT1G01010"], + "edges": [] + }, + "1": { + 
"nodes": [ + "AT1G01010", + "AT1G01020", + "AT1G01030", + "AT1G01040" + ], + "edges": [ + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] - } - }, - "AT1G01020__AT1G01070": { - "0": { - "nodes": ["AT1G01020", "AT1G01070"], - "edges": [] - }, - "1": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - 
"AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "AT1G01020__AT1G01070": { + "distance": { + "0": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + }, + "1": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } }, "fetch_phenotypes": { - "As2": { - "0": { - "nodes": ["As2"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01020", "AT1G01040"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] - } - }, - "As2__Na23": { - "0": { - "nodes": ["As2", "Na23"], - "edges": [] + "keys": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], 
"edges": []} + } }, - "1": { - "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4" - ] + "As2": { + "distance": { + "0": { + "nodes": ["As2"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "As2__Na23": { + "distance": { + "0": { + "nodes": ["As2", "Na23"], + "edges": [] + }, + "1": { + "nodes": ["As2", "Na23", "AT1G01020", "AT1G01040"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "Na23"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + 
"AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } }, "search_nodes": { - "Mary Poppins": { - "0": {"nodes": [], "edges": []}, - "1": {"nodes": [], "edges": []}, - "5": {"nodes": [], "edges": []} - }, - "GO:0005515": { - "0": { - "nodes": ["AT1G01040", "AT1G01090"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] + "search_text": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] + "GO:0005515": { + "distance": { + "0": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + }, + "1": { + "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__pheno_assn__5.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + 
"edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + } + } } } }, "fetch_clusters": { - "i6-1": { - "0": { - "nodes": ["AT1G01040", "AT1G01090"], - "edges": [] - }, - "1": { - "nodes": ["As2", "AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01040__pheno_assn__5.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] - }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7", - "AT1G01080__AT1G01090__ppi_liter__2.8" - ] - } - }, - "i2-5__i6-2": { - "0": { - "nodes": ["AT1G01020", "AT1G01070"], - "edges": [] + "cluster_ids": { + "Mary Poppins": { + "distance": { + "0": {"nodes": [], "edges": []}, + "1": {"nodes": [], "edges": []}, + "5": {"nodes": [], "edges": []} + } }, - "1": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3" - ] + "markov_i6:1": { + "distance": { + "0": { + "nodes": ["AT1G01040", "AT1G01090"], + "edges": [] + }, + "1": { + "nodes": ["As2", 
"AT1G01010", "AT1G01040", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01040__pheno_assn__5.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01080", "AT1G01090"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + "AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7", + "AT1G01080__AT1G01090__ppi_liter__2.8" + ] + } + } }, - "5": { - "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], - "edges": [ - "As2__AT1G01020__pheno_assn__8.4", - "As2__AT1G01040__pheno_assn__5.4", - "As75__AT1G01020__pheno_assn__39.9", - "AT1G01010__AT1G01020__ppi_hithru__2.3", - "AT1G01010__AT1G01030__ppi_hithru__2.4", - "AT1G01010__AT1G01040__domain_co_occur__2.5", - "AT1G01010__AT1G01040__ppi_liter__170.5", - "AT1G01030__AT1G01050__gene_coexpr__2.6", - "AT1G01050__AT1G01060__ppi_liter__2.7" - ] + "markov_i2:5__markov_i6:2": { + "distance": { + "0": { + "nodes": ["AT1G01020", "AT1G01070"], + "edges": [] + }, + "1": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3" + ] + }, + "5": { + "nodes": ["As2", "As75", "AT1G01010", "AT1G01020", "AT1G01030", "AT1G01040", "AT1G01050", "AT1G01060", "AT1G01070"], + "edges": [ + "As2__AT1G01020__pheno_assn__8.4", + "As2__AT1G01040__pheno_assn__5.4", + "As75__AT1G01020__pheno_assn__39.9", + "AT1G01010__AT1G01020__ppi_hithru__2.3", + 
"AT1G01010__AT1G01030__ppi_hithru__2.4", + "AT1G01010__AT1G01040__domain_co_occur__2.5", + "AT1G01010__AT1G01040__ppi_liter__170.5", + "AT1G01030__AT1G01050__gene_coexpr__2.6", + "AT1G01050__AT1G01060__ppi_liter__2.7" + ] + } + } } } } diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv b/test/djornl/test_data/I2_named.tsv similarity index 61% rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv rename to test/djornl/test_data/I2_named.tsv index 086a920..46f4498 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I2_named.tsv +++ b/test/djornl/test_data/I2_named.tsv @@ -1,3 +1,6 @@ +# data_type: cluster +# prefix: markov_i2 +# title: Markov clustering, inflation = 2 Cluster1 AT1G01010 AT1G01030 AT1G01040 Cluster2 AT1G01050 AT1G01060 AT1G01070 Cluster3 AT1G01080 AT1G01090 diff --git a/test/djornl/test_data/I4_named.tsv b/test/djornl/test_data/I4_named.tsv new file mode 100644 index 0000000..147831e --- /dev/null +++ b/test/djornl/test_data/I4_named.tsv @@ -0,0 +1,3 @@ +# prefix: markov_i4 +# title: Markov clustering, inflation = 4 +# data_type: cluster diff --git a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv b/test/djornl/test_data/I6_named.tsv similarity index 53% rename from test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv rename to test/djornl/test_data/I6_named.tsv index 389cae2..b4680eb 100644 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I6_named.tsv +++ b/test/djornl/test_data/I6_named.tsv @@ -1,3 +1,6 @@ +# data_type: cluster +# prefix: markov_i6 +# title: Markov clustering, inflation = 6 Cluster1 AT1G01040 AT1G01090 Cluster2 AT1G01070 Cluster3 AT1G01010 AT1G01020 AT1G01030 diff --git 
a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv b/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv deleted file mode 100644 index 8b13789..0000000 --- a/test/djornl/test_data/cluster_data/out.aranetv2_subnet_AT-CX_top10percent_anno_AF_082919.abc.I4_named.tsv +++ /dev/null @@ -1 +0,0 @@ - diff --git a/test/djornl/test_data/merged_edges-AMW-060820_AF.tsv b/test/djornl/test_data/edges.tsv similarity index 100% rename from test/djornl/test_data/merged_edges-AMW-060820_AF.tsv rename to test/djornl/test_data/edges.tsv diff --git a/test/djornl/test_data/manifest.yaml b/test/djornl/test_data/manifest.yaml new file mode 100644 index 0000000..2eb28e3 --- /dev/null +++ b/test/djornl/test_data/manifest.yaml @@ -0,0 +1,19 @@ +- data_type: edge + path: edges.tsv + date_created: 2020-12-25 + +- data_type: node + path: nodes.csv + date_created: 2019-01-01 + +- data_type: cluster + prefix: markov_i2 + path: I2_named.tsv + +- data_type: cluster + prefix: markov_i4 + path: I4_named.tsv + +- data_type: cluster + prefix: markov_i6 + path: I6_named.tsv diff --git a/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv b/test/djornl/test_data/nodes.csv similarity index 99% rename from test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv rename to test/djornl/test_data/nodes.csv index 5bc0e1d..a032142 100644 --- a/test/djornl/test_data/aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv +++ b/test/djornl/test_data/nodes.csv @@ -1,3 +1,4 @@ +# data_type: node node_id,node_type,transcript,gene_symbol,gene_full_name,gene_model_type,TAIR_Computational_description,TAIR_Curator_summary,TAIR_short_description,GO_descr,GO_terms,MapMan_bin,MapMan_name,MapMan_descr,pheno_AraGWAS_ID,pheno_descrip1,pheno_descrip2,pheno_descrip3,pheno_ref,UserNotes As2,pheno,,,,,,,,,,,,,10.21958/phenotype:103,,bacterial disease resistance,The resistance exhibited by 
a plant or a group of plants (population) in response to the disease caused by a bacterial pathogen infection as compared to the susceptible and/or the reference plants of the same species. [GR:pj],"Atwell et. al, Nature 2010", As75,pheno,,,,,,,,,,,,,10.21958/phenotype:67,"Arsenic concentrations in leaves, grown in soil. Elemental analysis was performed with an ICP-MS (PerkinElmer). Sample normalized to calculated weights as described in Baxter et al., 2008",arsenic concentration,A mineral and ion content related trait (TO:0000465) which is the concentration of arsenic (CHEBI:22632) in some plant structure (PO:0009011). [GR:Karthik],"Atwell et. al, Nature 2010", diff --git a/test/stored_queries/test_djornl.py b/test/stored_queries/test_djornl.py index df2a7e5..7c502a3 100644 --- a/test/stored_queries/test_djornl.py +++ b/test/stored_queries/test_djornl.py @@ -6,10 +6,8 @@ import unittest import requests import os -import glob -import yaml -from test.helpers import get_config, assert_subset, modified_environ +from test.helpers import get_config, modified_environ from test.stored_queries.helpers import create_test_docs from importers.djornl.parser import DJORNL_Parser @@ -79,6 +77,7 @@ def check_expected_results(self, description, response, expected): if _VERBOSE: print("Running test " + description) + results = response['results'][0] self.assertEqual( set([n["_key"] for n in results['nodes']]), @@ -93,44 +92,48 @@ def check_expected_results(self, description, response, expected): def test_fetch_all(self): - # expect all the nodes from load_node_metadata and all the edges from load_edges - expected = { - "nodes": [n["_key"] for n in self.json_data['load_node_metadata']['nodes']], - "edges": [ { - "_to": e["_to"], - "_from": e["_from"], - "score": e["score"], - "edge_type": e["edge_type"] } for e in self.json_data['load_edges']['edges'] - ] - } - + response = self.submit_query('djornl_fetch_all') self.check_expected_results( "djornl_fetch_all", - 
self.submit_query('djornl_fetch_all'), + response, self.json_data['fetch_all'] ) + # ensure that all the cluster data is returned OK + node_data = response['results'][0]['nodes'] + nodes_with_clusters = [json.dumps({ + '_key': n['_key'], + 'clusters': n['clusters'] + }) for n in node_data if 'clusters' in n] + self.assertEqual( + set(nodes_with_clusters), + set([json.dumps(this) for this in self.json_data['load_cluster_data']['nodes']]) + ) # indexing schema in results.json - # self.json_data[query][primary_param][distance_param] - # if primary_param is an array, join the array entities with "__" + # self.json_data[query_name][param_name][param_value]["distance"][distance_param] + # e.g. for fetch_clusters data: + # "fetch_clusters": { + # "cluster_ids": { + # "markov_i2:6__markov_i4:3": { + # "distance": { + # 1: { + # "nodes": [ node IDs ], + # "edges": [ edge data ], + # } + # } + # } + # } + # } + # if param_value is an array, join the array entities with "__" # results are in the form {"nodes": [...], "edges": [...]} # nodes are represented as a list of node[_key] # edges are objects with keys _to, _from, edge_type and score - def test_fetch_phenotypes_no_results(self): - - resp = self.submit_query('djornl_fetch_phenotypes', { - # gene node - "keys": ["AT1G01010"], - }) - self.assertEqual(resp['results'][0], self.no_results) - - def test_fetch_phenotypes(self): - for fetch_args in self.json_data['fetch_phenotypes'].keys(): - for distance in self.json_data['fetch_phenotypes'][fetch_args].keys(): + for (fetch_args, key_data) in self.json_data['fetch_phenotypes']['keys'].items(): + for (distance, distance_data) in key_data['distance'].items(): resp = self.submit_query('djornl_fetch_phenotypes', { "keys": fetch_args.split('__'), "distance": int(distance), @@ -138,22 +141,14 @@ def test_fetch_phenotypes(self): self.check_expected_results( "fetch phenotypes with args " + fetch_args + " and distance " + distance, resp, - 
self.json_data['fetch_phenotypes'][fetch_args][distance] + distance_data ) - def test_fetch_genes_no_results(self): - resp = self.submit_query('djornl_fetch_genes', { - # phenotype node - "keys": ["As2"], - }) - self.assertEqual(resp['results'][0], self.no_results) - - def test_fetch_genes(self): - for fetch_args in self.json_data['fetch_genes'].keys(): - for distance in self.json_data['fetch_genes'][fetch_args].keys(): + for (fetch_args, key_data) in self.json_data['fetch_genes']['keys'].items(): + for (distance, distance_data) in key_data['distance'].items(): resp = self.submit_query('djornl_fetch_genes', { "keys": fetch_args.split('__'), "distance": int(distance), @@ -161,54 +156,29 @@ def test_fetch_genes(self): self.check_expected_results( "fetch genes with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_genes'][fetch_args][distance] + distance_data ) - def test_fetch_clusters_no_results(self): - - resp = self.submit_query('djornl_fetch_clusters', { - 'cluster_i2_ids': [666], - 'cluster_i4_ids': [666], - 'cluster_i6_ids': [666], - }) - self.assertEqual(resp['results'][0], self.no_results) - - def test_fetch_clusters(self): - for fetch_args in self.json_data['fetch_clusters'].keys(): - cluster_args = {} - for arg in fetch_args.split('__'): - [c_name, c_id] = arg.split('-', maxsplit=1) - if "cluster_" + c_name + "_ids" in cluster_args: - cluster_args["cluster_" + c_name + "_ids"] += int(c_id) - else: - cluster_args["cluster_" + c_name + "_ids"] = [int(c_id)] - - for distance in self.json_data['fetch_clusters'][fetch_args].keys(): - cluster_args['distance'] = int(distance) - resp = self.submit_query('djornl_fetch_clusters', cluster_args) + for (fetch_args, cluster_data) in self.json_data['fetch_clusters']['cluster_ids'].items(): + for (distance, distance_data) in cluster_data['distance'].items(): + resp = self.submit_query('djornl_fetch_clusters', { + "cluster_ids": fetch_args.split('__'), + "distance": int(distance), + }) 
self.check_expected_results( "fetch clusters with args " + fetch_args + " and distance " + distance, resp, - self.json_data['fetch_clusters'][fetch_args][distance] + distance_data ) - @unittest.skip('This test is disabled until automated view loading is possible') - def test_search_nodes_no_results(self): - - resp = self.submit_query('djornl_search_nodes', { - "search_text": "Mary Poppins", - }) - self.assertEqual(resp['results'][0], self.no_results) - - @unittest.skip('This test is disabled until automated view loading is possible') def test_search_nodes(self): - for search_text in self.json_data['search_nodes'].keys(): - for distance in self.json_data['search_nodes'][search_text].keys(): + for (search_text, search_data) in self.json_data['search_nodes']['search_text'].items(): + for (distance, distance_data) in search_data['distance'].items(): resp = self.submit_query('djornl_search_nodes', { "search_text": search_text, "distance": int(distance), @@ -216,5 +186,5 @@ def test_search_nodes(self): self.check_expected_results( "search nodes with args " + search_text + " and distance " + distance, resp, - self.json_data['search_nodes'][search_text][distance] + distance_data ) diff --git a/test/stored_queries/test_djornl_parser.py b/test/stored_queries/test_djornl_parser.py index b2043b9..91dfdb5 100644 --- a/test/stored_queries/test_djornl_parser.py +++ b/test/stored_queries/test_djornl_parser.py @@ -10,7 +10,6 @@ import requests import os import contextlib - from importers.djornl.parser import DJORNL_Parser from test.helpers import get_config, assert_subset, modified_environ @@ -40,69 +39,86 @@ def init_parser_with_path(self, root_path): return parser - def test_load_empty_files(self): - """ test loading files containing no data """ + def test_load_no_manifest(self): + """ test loading when the manifest does not exist """ + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'no_manifest') + err_str = 'No manifest file found at ' + os.path.join(RES_ROOT_DATA_PATH, 
'manifest.yaml') + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) - # path: test/djornl/empty_files - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files') - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []}) - self.assertEqual(parser.load_node_metadata(), {"nodes": []}) - self.assertEqual(parser.load_cluster_data(), {"nodes": []}) + def test_load_invalid_manifest(self): + """ test an invalid manifest file """ + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_manifest') + err_str = "The manifest file failed validation with the following errors:" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) + + + def test_load_invalid_file(self): + """ test loading when a file specified in the manifest is a directory """ + + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_file') + + # edges: directory, not a file + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ": not a file" + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) def test_load_missing_files(self): """ test loading when files cannot be found """ - # this dir does not contain the correct file structure - # path: test/djornl/empty_files/cluster_data - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files', 'cluster_data') - parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'missing_files') + # not found + err_str = os.path.join(RES_ROOT_DATA_PATH, "edges.tsv") + ': file does not exist' + with self.assertRaisesRegex(RuntimeError, err_str): + self.init_parser_with_path(RES_ROOT_DATA_PATH) - err_str = "No such file or directory: '" + RES_ROOT_DATA_PATH - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_edges() - with 
self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_node_metadata() + def test_load_empty_files(self): + """ test loading files containing no data """ - with self.assertRaisesRegex(FileNotFoundError, err_str): - parser.load_cluster_data() + # path: test/djornl/empty_files + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'empty_files') + parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) + self.assertEqual(parser.load_edges(), {"nodes": [], "edges": []}) + self.assertEqual(parser.load_node_metadata(), {"nodes": []}) + self.assertEqual(parser.load_cluster_data(), {"nodes": []}) - def test_load_invalid_types(self): - """ test file format errors """ + def test_load_col_count_errors(self): + """ test files with invalid numbers of columns """ - # path: test/djornl/invalid_types - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') + # path: test/djornl/col_count_errors + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) # invalid edge type - edge_err_msg = 'line 2: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' + edge_err_msg = 'line 6: expected 5 cols, found 3' with self.assertRaisesRegex(RuntimeError, edge_err_msg): parser.load_edges() # invalid node type - node_err_msg = 'line 4: invalid node type: Monkey' + node_err_msg = 'line 3: expected 20 cols, found 22' with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() - def test_load_col_count_errors(self): - """ test files with invalid numbers of columns """ + def test_load_invalid_types(self): + """ test file format errors """ - # path: test/djornl/col_count_errors - RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'col_count_errors') + # path: test/djornl/invalid_types + RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'invalid_types') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) # invalid edge type - edge_err_msg = 'line 6: expected 
5 cols, found 3' + edge_err_msg = 'merged_edges-AMW-060820_AF.tsv line 3: invalid edge type: AraGWAS-Some-Old-Rubbish-I-Made-Up' with self.assertRaisesRegex(RuntimeError, edge_err_msg): parser.load_edges() # invalid node type - node_err_msg = 'line 3: expected 20 cols, found 22' + node_err_msg = 'aranet2-aragwas-MERGED-AMW-v2_091319_nodeTable.csv line 5: invalid node type: Monkey' with self.assertRaisesRegex(RuntimeError, node_err_msg): parser.load_node_metadata() @@ -112,8 +128,6 @@ def test_load_valid_edge_data(self): RES_ROOT_DATA_PATH = os.path.join(_TEST_DIR, 'djornl', 'test_data') parser = self.init_parser_with_path(RES_ROOT_DATA_PATH) - self.maxDiff = None - edge_data = parser.load_edges() self.assertEqual( edge_data, @@ -143,3 +157,5 @@ def test_load_valid_cluster_data(self): self.json_data["load_cluster_data"] ) + parser.check_data_delta() +