From d833d672ee09f9a3d734a8ecea5ac81b15715a7d Mon Sep 17 00:00:00 2001
From: jiarongli
Date: Mon, 3 Nov 2025 14:43:35 +0100
Subject: [PATCH 1/4] init

initialize
update
last update on the old openaire api format
create mapping for new openaire api
update
update
add new samples
remove sample data
add project
remove run_test script
---
 pygeometa/schemas/__init__.py          |   3 +-
 pygeometa/schemas/openaire/__init__.py | 376 +++++++++++++++++++++++++
 2 files changed, 378 insertions(+), 1 deletion(-)
 create mode 100644 pygeometa/schemas/openaire/__init__.py

diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py
index e9059ce..a4bae55 100644
--- a/pygeometa/schemas/__init__.py
+++ b/pygeometa/schemas/__init__.py
@@ -64,7 +64,8 @@
     'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
     'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',
     'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema',
-    'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema'
+    'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema',
+    'openaire': 'pygeometa.schemas.openaire.OpenAireOutputSchema'
 }

diff --git a/pygeometa/schemas/openaire/__init__.py b/pygeometa/schemas/openaire/__init__.py
new file mode 100644
index 0000000..24ead40
--- /dev/null
+++ b/pygeometa/schemas/openaire/__init__.py
@@ -0,0 +1,376 @@
+# =================================================================
+#
+# Terms and Conditions of Use
+#
+# Unless otherwise noted, computer program source code of this
+# distribution is covered under Crown Copyright, Government of
+# Canada, and is distributed under the MIT License.
+#
+# The Canada wordmark and related graphics associated with this
+# distribution are protected under trademark law and copyright law.
+# No permission is granted to use them outside the parameters of
+# the Government of Canada's corporate identity program. For
+# more information, see
+# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp
+#
+# Copyright title to all 3rd party software distributed with this
+# software is held by the respective copyright holders as noted in
+# those files. Users are asked to read the 3rd Party Licenses
+# referenced with those assets.
+#
+# Copyright (c) 2025 Tom Kralidis, Jiarong Li, Paul van Genuchten
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+# +# ================================================================= + +from datetime import date, datetime +import logging +import os +import json +from typing import Union +import uuid + +from pygeometa import __version__ +from pygeometa.core import get_charstring +from pygeometa.helpers import json_dumps +from pygeometa.schemas.base import BaseOutputSchema + +THISDIR = os.path.dirname(os.path.realpath(__file__)) + +LOGGER = logging.getLogger(__name__) + + +class OpenAireOutputSchema(BaseOutputSchema): + """OpenAire: record schema""" + + def __init__(self): + """ + Initialize object + + :returns: pygeometa.schemas.base.BaseOutputSchema + """ + + description = 'OpenAire' + + super().__init__('openaire', description, 'json', THISDIR) + + def import_(self, metadata: str) -> dict: + """ + Import metadata into MCF + + :param metadata: string of metadata content + + :returns: `dict` of MCF content + """ + + # Initialized mcf + mcf = { + 'mcf': { + 'version': '1.0', + }, + 'metadata': {}, + 'identification': {}, + 'contact': {}, + 'tag': 'test' + } + + # Process metadata (convert XML to JSON if needed) + metadata = xml_to_json(metadata) + md = json.loads(metadata) + + if md is None: + LOGGER.info('invalid openaire metadata') + return mcf + + + header_ = md.get('header') + metadata_ = md.get('results')[0] + + # mcf: metadata + + pids_ = metadata_.get('pids') + pids_schemevalue = [ + { + 'identifier': i.get('value'), + 'scheme': i.get('scheme') + } + for i in pids_] + children_instances_ = metadata_.get('instances') + main_id_, main_instance_ = process_id_and_instance(pids_, children_instances_) + + mcf['metadata']['identifier'] = main_id_ + mcf['metadata']['additional_identifiers'] = pids_schemevalue + + project_ = metadata_.get('projects') + if project_ is not None and isinstance(project_, list): + rel_project = [] + for p in project_: + pids = p.get('pids', []) + if pids is None or len(pids) == 0: + continue + pid = pids[0] + pro_dict = {'identifier': pid.get('value'), 'scheme': pid.get('scheme'), 'type': 'project'} + rel_project.append(pro_dict) + if len(rel_project) > 0: + mcf['metadata']['relations'] = rel_project + + + instance_type_ = main_instance_.get('type') + if instance_type_: + mcf['metadata']['hierarchylevel'] = instance_type_ + + date_of_collection = metadata_.get('dateOfCollection') + if date_of_collection: + mcf['metadata']['datestamp'] = metadata_.get('dateOfCollection') + + urls = main_instance_.get('urls') + if urls: + mcf['metadata']['dataseturi'] = urls[0] + + # mcf: identification + language_ = metadata_.get('language', {}).get('code') + if language_: + mcf['identification']['language'] = language_ + + main_title = metadata_.get('mainTitle') + # subtitle also exists + if main_title: + mcf['identification']['title'] = main_title + + description_ = metadata_.get('descriptions') + if description_: + mcf['identification']['abstract'] = description_[0] + + version_ = metadata_.get('version') + if version_: + mcf['identification']['edition'] = version_ + + ## topiccategory + + right_ = metadata_.get('bestAccessRight', {}).get('label') + instance_right_ = main_instance_.get('accessRight', {}).get('label') + if right_ is not None and right_ != 'unspecified': + mcf['identification']['rights'] = right_ + elif instance_right_ is not None and instance_right_ != 'unspecified': + mcf['identification']['rights'] = instance_right_ + + license_ = main_instance_.get('license') + if license_: + mcf['identification']['license'] = {'name': license_, 'url': ''} + + ## url + dates_dict = {} + 
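        # publicationDate / embargoEndDate feed the MCF dates mapping below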
        p_date = metadata_.get('publicationDate')
        e_date = metadata_.get('embargoEndDate')
        if p_date:
            dates_dict['publication'] = p_date
        if e_date:
            dates_dict['embargoend'] = e_date
        if dates_dict:
            mcf['identification']['dates'] = dates_dict


        subjects_ = metadata_.get('subjects')
        if isinstance(subjects_, dict):
            mcf['identification']['keywords'] = process_keywords([subjects_])
        elif isinstance(subjects_, list):
            mcf['identification']['keywords'] = process_keywords(subjects_)

        ## contact point
        authors_ = metadata_.get('authors', [])
        orgs_ = metadata_.get('organizations', [])
        authors_ = authors_ or []
        orgs_ = orgs_ or []
        contact_ = authors_ + orgs_
        if len(contact_) > 0:
            mcf['contact'] = process_contact(contact_)

        return mcf

    def write(self, mcf: dict, stringify: bool = True) -> Union[dict, str]:
        """
        Write outputschema to JSON string buffer

        :param mcf: dict of MCF content model
        :param stringify: whether to return a string representation (default)
                          else native (dict, etree)

        :returns: `dict` or `str` of MCF as an OpenAire record
        """

        # no write implementation for now

        return 'test'

        # return None


def xml_to_json(content: str) -> str:
    """
    Convert XML to JSON if content is detected as XML

    Write it later
    """
    return content


def process_id_and_instance(ids: list, instances: list) -> tuple[str, object]:
    """
    Find one pair of children instance and pid with the same doi.
    Use the instance as the entry of mcf attributes. Use the doi as the identifier.
    If can't find a match, use instance[0] and pid[0]
    """

    # get the first doi as main id
    if len(ids) == 0:
        LOGGER.info('identifier missed')
        return None, instances[0] if instances else None
    first_id = ids[0]
    main_id = first_id.get('value') if first_id else None
    if len(ids) > 1:
        for i in ids:
            if i.get('schema') == "doi":
                main_id = i.get('value')
                break
    if len(instances) == 0:
        return main_id, None

    # get the instance matched with the main id
    main_instance = instances[0]
    for ins in instances:
        pid = ins.get('pid', {})
        if isinstance(pid, list):  # instance has multiple pid
            pid_values = [i.get('value') for i in pid]
            if main_id in pid_values:
                main_instance = ins
                break
        elif isinstance(pid, dict):  # instance has one pid
            if pid.get('value') == main_id:
                main_instance = ins
                break
        else:
            continue
    return main_id, main_instance

def process_keywords(subjects: list) -> dict:
    """
    convert openaire keywords to mcf keywords

    group keywords by scheme

    """
    unique_scheme = list(set([s.get('subject', {}).get('scheme') for s in subjects]))

    scheme_uuid_dict = {scheme: str(uuid.uuid4()) for scheme in unique_scheme}

    keywords_dict = {
        value: {
            'keywords': [],
            'vocabulary': {
                'name': key
            }
        }
        for key, value in scheme_uuid_dict.items()
    }

    for s in subjects:
        s_value = s.get('subject')
        for k, v in keywords_dict.items():
            if s_value.get('scheme') == v.get('vocabulary', {}).get('name'):
                v['keywords'].append(s_value.get('value'))
                break
    return keywords_dict


def process_contact(contact_list: list) -> dict:
    """
    Process authors and organizations into MCF contact format

    :param authors: list of author objects
    :param orgs: list of organization objects

    :returns: dict with UUID keys and contact point values
    """
    contact_dict = {}

    for contact in contact_list:
        contact_uuid = str(uuid.uuid4())
        # Initialize contact point structure
        contactpoint_dict = {
            'individualname': '',
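            # filled below: authors populate individualname, organizations
            # populate organization; contacts with neither are skipped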
'organization': '', + 'url': '' + } + # Process authors + if 'fullName' in contact: + contactpoint_dict['individualname'] = contact.get('fullName') + pid = contact.get('pid') + if pid is not None and pid.get('id') is not None: + pid_scheme = pid.get('id', {}).get('scheme') + pid_value = pid.get('id', {}).get('value') + if pid_scheme is not None and pid_value is not None: + contactpoint_dict['url'] = id2url(pid_scheme, pid_value) + + # Process organizations + elif 'legalName' in contact: + org_name = contact.get('legalName') + contactpoint_dict['organization'] = org_name + pids = contact.get('pids', []) + if pids is not None: + for p in pids: + if p.get('scheme').lower() == 'ror': + contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + break + elif p.get('scheme').lower() == 'grid': + contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + break + elif p.get('scheme').lower() == 'wikidata': + contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + break + elif p.get('scheme').lower() == 'isni': + contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + break + # Add to contactpoint dict + if contactpoint_dict['individualname'] or contactpoint_dict['organization']: + contact_dict[contact_uuid] = contactpoint_dict + + return contact_dict + + + +def id2url(scheme: str, id: str) -> str: + """ + Convert orcid, wikidata, ror or grid value to url + """ + if scheme.lower() == 'orcid': + return 'https://orcid.org/' + id + elif scheme.lower() == 'ror': + return id + elif scheme.lower() == 'grid': + return id + elif scheme.lower() == 'wikidata': + return 'https://www.wikidata.org/wiki/' + id + elif scheme.lower() == 'isni': + return 'https://isni.org/isni/' + id + else: + return None + From 86ea07d079ab5d9ace8422a449db4007fac1ad3d Mon Sep 17 00:00:00 2001 From: jiarongli Date: Mon, 15 Dec 2025 15:15:52 +0100 Subject: [PATCH 2/4] fix identifier issue --- pygeometa/schemas/openaire/__init__.py | 127 +++++++++++++++++-------- 1 file changed, 88 insertions(+), 39 deletions(-) diff --git a/pygeometa/schemas/openaire/__init__.py b/pygeometa/schemas/openaire/__init__.py index 24ead40..918dcce 100644 --- a/pygeometa/schemas/openaire/__init__.py +++ b/pygeometa/schemas/openaire/__init__.py @@ -108,18 +108,18 @@ def import_(self, metadata: str) -> dict: # mcf: metadata - pids_ = metadata_.get('pids') - pids_schemevalue = [ - { - 'identifier': i.get('value'), - 'scheme': i.get('scheme') - } - for i in pids_] + pids_ = metadata_.get('pids', []) + originIds_ = metadata_.get('originalIds', []) + id_ = metadata_.get('id') + children_instances_ = metadata_.get('instances') - main_id_, main_instance_ = process_id_and_instance(pids_, children_instances_) + main_id_, altIds_, main_instance_ = process_id_and_instance(pids_, originIds_, id_, children_instances_) + + if main_id_: + mcf['metadata']['identifier'] = main_id_ - mcf['metadata']['identifier'] = main_id_ - mcf['metadata']['additional_identifiers'] = pids_schemevalue + if altIds_: + mcf['metadata']['additional_identifiers'] = altIds_ project_ = metadata_.get('projects') if project_ is not None and isinstance(project_, list): @@ -134,18 +134,19 @@ def import_(self, metadata: str) -> dict: if len(rel_project) > 0: mcf['metadata']['relations'] = rel_project - - instance_type_ = main_instance_.get('type') - if instance_type_: - mcf['metadata']['hierarchylevel'] = instance_type_ + if main_instance_: + instance_type_ = main_instance_.get('type') + if instance_type_: + mcf['metadata']['hierarchylevel'] = instance_type_ 
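+        # dateOfCollection, when present, becomes the MCF datestamp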
date_of_collection = metadata_.get('dateOfCollection') if date_of_collection: mcf['metadata']['datestamp'] = metadata_.get('dateOfCollection') - urls = main_instance_.get('urls') - if urls: - mcf['metadata']['dataseturi'] = urls[0] + if main_instance_: + urls = main_instance_.get('urls') + if urls: + mcf['metadata']['dataseturi'] = urls[0] # mcf: identification language_ = metadata_.get('language', {}).get('code') @@ -168,15 +169,18 @@ def import_(self, metadata: str) -> dict: ## topiccategory right_ = metadata_.get('bestAccessRight', {}).get('label') - instance_right_ = main_instance_.get('accessRight', {}).get('label') + instance_right_ = None + if main_instance_: + instance_right_ = main_instance_.get('accessRight', {}).get('label') if right_ is not None and right_ != 'unspecified': mcf['identification']['rights'] = right_ elif instance_right_ is not None and instance_right_ != 'unspecified': mcf['identification']['rights'] = instance_right_ - license_ = main_instance_.get('license') - if license_: - mcf['identification']['license'] = {'name': license_, 'url': ''} + if main_instance_: + license_ = main_instance_.get('license') + if license_: + mcf['identification']['license'] = {'name': license_, 'url': ''} ## url dates_dict = {} @@ -233,26 +237,70 @@ def xml_to_json(content: str) -> str: return content -def process_id_and_instance(ids: list, instances: list) -> tuple[str, object]: +def process_id_and_instance(pids: list, originIds: list, id: str, instances: list) -> tuple[str, list, dict]: """ - Find one pair of children instance and pid with the same doi. - Use the instance as the entry of mcf attributes. Use the doi as the identifier. - If can't find a match, use instance[0] and pid[0] + Get the main_id, alternative_ids and main_instance from the input data + + Alternative ids are the unique ids from pids and originIds + + Main id is the first doi from pids, otherwise the first pid, + otherwise a doi from originId, otherwise an http url from originId, otherwise the first originId. + + main_instance is the first instance with the matched id of main_id, otherwise the first instance. 
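+
+    For example (hypothetical values): given
+    pids=[{'scheme': 'handle', 'value': '11234/56'},
+          {'scheme': 'doi', 'value': '10.5281/zenodo.1'}],
+    the doi is preferred over the earlier handle, so main_id becomes
+    '10.5281/zenodo.1' and the instance whose pid matches it is used.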
+ """ - # get the first doi as main id - if len(ids) == 0: - LOGGER.info('identifier missed') - return None, instances[0] if instances else None - first_id = ids[0] - main_id = first_id.get('value') if first_id else None - if len(ids) > 1: - for i in ids: - if i.get('schema') == "doi": - main_id = i.get('value') - break - if len(instances) == 0: - return main_id, None + # altids and main_id + pids_schemevalue = None + if pids is not None: + first_id = pids[0] + main_id = first_id.get('value') if first_id else None + if len(pids) > 1: + for i in pids: + if i.get('scheme') == "doi": + main_id = i.get('value') + break + pids_schemevalue = [ + { + 'identifier': i.get('value'), + 'scheme': i.get('scheme') + } + for i in pids] + elif originIds is not None: + main_id = next((item for item in originIds if item.startswith('10.')), None) + if main_id is None: + main_id = next((item for item in originIds if item.startswith('http')), None) + if main_id is None: + main_id = originIds[0] + elif id is not None: + main_id = id + else: + LOGGER.error('no valid identifier') + main_id = None + + origin_and_ids = originIds + [id] if originIds else [id] + origin_and_ids_uni = list(set(origin_and_ids)) if origin_and_ids is not None else [] + + if pids_schemevalue: + pids_values = [i.get('identifier') for i in pids_schemevalue] + for i in origin_and_ids_uni: + if i not in pids_values and i is not None: + pids_schemevalue.append({ + 'identifier': i, + 'scheme': None + }) + else: + pids_schemevalue = [ + { + 'identifier': i, + 'scheme': None + } + for i in origin_and_ids_uni + ] + + # instance + if instances is None or len(instances) == 0: + return main_id, pids_schemevalue, None # get the instance matched with the main id main_instance = instances[0] @@ -269,7 +317,8 @@ def process_id_and_instance(ids: list, instances: list) -> tuple[str, object]: break else: continue - return main_id, main_instance + + return main_id, pids_schemevalue, main_instance def process_keywords(subjects: list) -> dict: """ From 54ea5fa667917bda8e2bdbd605c16b27cd7b5951 Mon Sep 17 00:00:00 2001 From: Paul van Genuchten Date: Tue, 13 Jan 2026 08:53:37 +0100 Subject: [PATCH 3/4] remove test string, add datestamp --- pygeometa/schemas/openaire/__init__.py | 176 ++++++++++++------------- tests/run_tests.py | 6 +- 2 files changed, 90 insertions(+), 92 deletions(-) diff --git a/pygeometa/schemas/openaire/__init__.py b/pygeometa/schemas/openaire/__init__.py index 918dcce..0d2ef4d 100644 --- a/pygeometa/schemas/openaire/__init__.py +++ b/pygeometa/schemas/openaire/__init__.py @@ -43,16 +43,12 @@ # # ================================================================= -from datetime import date, datetime import logging import os import json from typing import Union import uuid -from pygeometa import __version__ -from pygeometa.core import get_charstring -from pygeometa.helpers import json_dumps from pygeometa.schemas.base import BaseOutputSchema THISDIR = os.path.dirname(os.path.realpath(__file__)) @@ -90,30 +86,25 @@ def import_(self, metadata: str) -> dict: }, 'metadata': {}, 'identification': {}, - 'contact': {}, - 'tag': 'test' + 'contact': {} } - # Process metadata (convert XML to JSON if needed) - metadata = xml_to_json(metadata) md = json.loads(metadata) if md is None: LOGGER.info('invalid openaire metadata') return mcf - - header_ = md.get('header') - metadata_ = md.get('results')[0] + metadata_ = md.get('results')[0] # mcf: metadata - pids_ = metadata_.get('pids', []) originIds_ = metadata_.get('originalIds', []) id_ = metadata_.get('id') 
children_instances_ = metadata_.get('instances') - main_id_, altIds_, main_instance_ = process_id_and_instance(pids_, originIds_, id_, children_instances_) + main_id_, altIds_, main_instance_ = process_id_and_instance( + pids_, originIds_, id_, children_instances_) if main_id_: mcf['metadata']['identifier'] = main_id_ @@ -129,7 +120,11 @@ def import_(self, metadata: str) -> dict: if pids is None or len(pids) == 0: continue pid = pids[0] - pro_dict = {'identifier': pid.get('value'), 'scheme': pid.get('scheme'), 'type': 'project'} + pro_dict = { + 'identifier': pid.get('value'), + 'scheme': pid.get('scheme'), + 'type': 'project' + } rel_project.append(pro_dict) if len(rel_project) > 0: mcf['metadata']['relations'] = rel_project @@ -138,7 +133,7 @@ def import_(self, metadata: str) -> dict: instance_type_ = main_instance_.get('type') if instance_type_: mcf['metadata']['hierarchylevel'] = instance_type_ - + date_of_collection = metadata_.get('dateOfCollection') if date_of_collection: mcf['metadata']['datestamp'] = metadata_.get('dateOfCollection') @@ -152,12 +147,12 @@ def import_(self, metadata: str) -> dict: language_ = metadata_.get('language', {}).get('code') if language_: mcf['identification']['language'] = language_ - + main_title = metadata_.get('mainTitle') # subtitle also exists if main_title: mcf['identification']['title'] = main_title - + description_ = metadata_.get('descriptions') if description_: mcf['identification']['abstract'] = description_[0] @@ -166,41 +161,44 @@ def import_(self, metadata: str) -> dict: if version_: mcf['identification']['edition'] = version_ - ## topiccategory - + # topiccategory right_ = metadata_.get('bestAccessRight', {}).get('label') instance_right_ = None if main_instance_: - instance_right_ = main_instance_.get('accessRight', {}).get('label') + instance_right_ = main_instance_.get( + 'accessRight', {}).get('label') if right_ is not None and right_ != 'unspecified': mcf['identification']['rights'] = right_ elif instance_right_ is not None and instance_right_ != 'unspecified': mcf['identification']['rights'] = instance_right_ - + if main_instance_: license_ = main_instance_.get('license') if license_: - mcf['identification']['license'] = {'name': license_, 'url': ''} - - ## url + mcf['identification']['license'] = { + 'name': license_, + 'url': '' + } + + # url dates_dict = {} p_date = metadata_.get('publicationDate') e_date = metadata_.get('embargoEndDate') if p_date: dates_dict['publication'] = p_date + mcf['identification']['datestamp'] = [p_date] if e_date: dates_dict['embargoend'] = e_date if dates_dict: mcf['identification']['dates'] = dates_dict - subjects_ = metadata_.get('subjects') if isinstance(subjects_, dict): mcf['identification']['keywords'] = process_keywords([subjects_]) elif isinstance(subjects_, list): mcf['identification']['keywords'] = process_keywords(subjects_) - ## contact point + # contact point authors_ = metadata_.get('authors', []) orgs_ = metadata_.get('organizations', []) authors_ = authors_ or [] @@ -223,31 +221,23 @@ def write(self, mcf: dict, stringify: str = True) -> Union[dict, str]: """ # no write implementation for now - - return 'test' - - # return None - -def xml_to_json(content: str) -> str: - """ - Convert XML to JSON if content is detected as XML - - Write it later - """ - return content + return '' -def process_id_and_instance(pids: list, originIds: list, id: str, instances: list) -> tuple[str, list, dict]: +def process_id_and_instance( + pids: list, originIds: list, + id: str, instances: list) -> tuple[str, 
list, dict]: """ Get the main_id, alternative_ids and main_instance from the input data - + Alternative ids are the unique ids from pids and originIds - - Main id is the first doi from pids, otherwise the first pid, - otherwise a doi from originId, otherwise an http url from originId, otherwise the first originId. - main_instance is the first instance with the matched id of main_id, otherwise the first instance. + Main id is the first doi from pids, otherwise the first pid, + otherwise a doi from originId, otherwise an http url from originId, + otherwise the first originId. + main_instance is the first instance with the matched id of main_id, + otherwise the first instance. """ # altids and main_id @@ -261,15 +251,18 @@ def process_id_and_instance(pids: list, originIds: list, id: str, instances: lis main_id = i.get('value') break pids_schemevalue = [ - { - 'identifier': i.get('value'), - 'scheme': i.get('scheme') - } - for i in pids] + { + 'identifier': i.get('value'), + 'scheme': i.get('scheme') + } + for i in pids + ] elif originIds is not None: - main_id = next((item for item in originIds if item.startswith('10.')), None) + main_id = next((item for item in originIds + if item.startswith('10.')), None) if main_id is None: - main_id = next((item for item in originIds if item.startswith('http')), None) + main_id = next((item for item in originIds + if item.startswith('http')), None) if main_id is None: main_id = originIds[0] elif id is not None: @@ -277,68 +270,70 @@ def process_id_and_instance(pids: list, originIds: list, id: str, instances: lis else: LOGGER.error('no valid identifier') main_id = None - + origin_and_ids = originIds + [id] if originIds else [id] - origin_and_ids_uni = list(set(origin_and_ids)) if origin_and_ids is not None else [] + origin_and_ids_uni = list(set(origin_and_ids)) if ( + origin_and_ids is not None) else [] if pids_schemevalue: pids_values = [i.get('identifier') for i in pids_schemevalue] for i in origin_and_ids_uni: if i not in pids_values and i is not None: pids_schemevalue.append({ - 'identifier': i, - 'scheme': None + 'identifier': i, + 'scheme': None }) else: pids_schemevalue = [ - { - 'identifier': i, - 'scheme': None - } - for i in origin_and_ids_uni + { + 'identifier': i, + 'scheme': None + } + for i in origin_and_ids_uni ] - + # instance if instances is None or len(instances) == 0: return main_id, pids_schemevalue, None - + # get the instance matched with the main id main_instance = instances[0] for ins in instances: pid = ins.get('pid', {}) - if isinstance(pid, list): # instance has multiple pid + if isinstance(pid, list): # instance has multiple pid pid_values = [i.get('value') for i in pid] if main_id in pid_values: main_instance = ins break - elif isinstance(pid, dict): # instance has one pid + elif isinstance(pid, dict): # instance has one pid if pid.get('value') == main_id: main_instance = ins break else: continue - + return main_id, pids_schemevalue, main_instance - + + def process_keywords(subjects: list) -> dict: """ convert openaire keywords to mcf keywords group keywords by scheme - """ - unique_scheme = list(set([s.get('subject', {}).get('scheme') for s in subjects])) + unique_scheme = list(set([s.get('subject', {}).get('scheme') + for s in subjects])) scheme_uuid_dict = {scheme: str(uuid.uuid4()) for scheme in unique_scheme} keywords_dict = { - value: { - 'keywords': [], - 'vocabulary': { - 'name': key + value: { + 'keywords': [], + 'vocabulary': { + 'name': key + } } - } - for key, value in scheme_uuid_dict.items() + for key, value in 
scheme_uuid_dict.items() } for s in subjects: @@ -346,21 +341,21 @@ def process_keywords(subjects: list) -> dict: for k, v in keywords_dict.items(): if s_value.get('scheme') == v.get('vocabulary', {}).get('name'): v['keywords'].append(s_value.get('value')) - break + break return keywords_dict def process_contact(contact_list: list) -> dict: """ Process authors and organizations into MCF contact format - + :param authors: list of author objects :param orgs: list of organization objects - + :returns: dict with UUID keys and contact point values """ contact_dict = {} - + for contact in contact_list: contact_uuid = str(uuid.uuid4()) # Initialize contact point structure @@ -378,32 +373,36 @@ def process_contact(contact_list: list) -> dict: pid_value = pid.get('id', {}).get('value') if pid_scheme is not None and pid_value is not None: contactpoint_dict['url'] = id2url(pid_scheme, pid_value) - + # Process organizations elif 'legalName' in contact: - org_name = contact.get('legalName') + org_name = contact.get('legalName') contactpoint_dict['organization'] = org_name pids = contact.get('pids', []) if pids is not None: for p in pids: if p.get('scheme').lower() == 'ror': - contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + contactpoint_dict['url'] = id2url( + p.get('scheme'), p.get('value')) break elif p.get('scheme').lower() == 'grid': - contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + contactpoint_dict['url'] = id2url( + p.get('scheme'), p.get('value')) break elif p.get('scheme').lower() == 'wikidata': - contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) + contactpoint_dict['url'] = id2url( + p.get('scheme'), p.get('value')) break elif p.get('scheme').lower() == 'isni': - contactpoint_dict['url'] = id2url(p.get('scheme'), p.get('value')) - break + contactpoint_dict['url'] = id2url( + p.get('scheme'), p.get('value')) + break # Add to contactpoint dict - if contactpoint_dict['individualname'] or contactpoint_dict['organization']: + if (contactpoint_dict['individualname'] or + contactpoint_dict['organization']): contact_dict[contact_uuid] = contactpoint_dict - - return contact_dict + return contact_dict def id2url(scheme: str, id: str) -> str: @@ -422,4 +421,3 @@ def id2url(scheme: str, id: str) -> str: return 'https://isni.org/isni/' + id else: return None - diff --git a/tests/run_tests.py b/tests/run_tests.py index ed71518..317407b 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -226,11 +226,11 @@ def test_get_supported_schemas(self): schemas = sorted(get_supported_schemas()) self.assertIsInstance(schemas, list, 'Expected list') - self.assertEqual(len(schemas), 12, + self.assertEqual(len(schemas), 13, 'Expected specific number of supported schemas') self.assertEqual(sorted(schemas), - sorted(['cwl', 'csvw', 'dcat', 'iso19139', - 'iso19139-2', 'iso19139-hnap', 'oarec-record', + sorted(['cwl', 'csvw', 'dcat', 'iso19139', 'iso19139-2', + 'iso19139-hnap', 'oarec-record', 'openaire', 'schema-org', 'stac-item', 'wmo-cmp', 'wmo-wcmp2', 'wmo-wigos']), 'Expected exact list of supported schemas') From 4c2995c7b90066332d9ca4f3215f5ba62358f7db Mon Sep 17 00:00:00 2001 From: Paul van Genuchten Date: Thu, 29 Jan 2026 12:51:24 +0100 Subject: [PATCH 4/4] - rebase to master - do not require results wrapper - add import test --- README.md | 3 + pygeometa/schemas/__init__.py | 6 +- pygeometa/schemas/openaire/__init__.py | 103 ++++++----- tests/openaire.json | 243 +++++++++++++++++++++++++ tests/run_tests.py | 15 +- 5 files changed, 321 insertions(+), 
49 deletions(-) create mode 100644 tests/openaire.json diff --git a/README.md b/README.md index 750d71b..e410b28 100644 --- a/README.md +++ b/README.md @@ -75,10 +75,13 @@ pygeometa metadata transform path/to/file.xml --input-schema=autodetect --output ### Supported schemas Schemas supported by pygeometa: +* CSV on the web, [reference](https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/) * dcat, [reference](https://www.w3.org/TR/vocab-dcat-2/) * iso19139, [reference](http://www.iso.org/iso/catalogue_detail.htm?csnumber=32557) * iso19139-hnap, [reference](http://www.gcpedia.gc.ca/wiki/Federal_Geospatial_Platform/Policies_and_Standards/Catalogue/Release/Appendix_B_Guidelines_and_Best_Practices/Guide_to_Harmonized_ISO_19115:2003_NAP) * OGC API - Records - Part 1: Core, record model, [reference](https://github.com/opengeospatial/ogcapi-records/blob/master/core/openapi/schemas/recordGeoJSON.yaml) +* OpenAire metadata schema, [reference](https://graph.openaire.eu/docs/data-model/entities/research-product) +* Schema.org, [reference](https://schema.org/Dataset) * SpatioTemporal Asset Catalog [(STAC)](https://stacspec.org) * iso19139-2, [reference](https://www.iso.org/standard/67039.html) * [wmo-cmp](doc/content/reference/formats/wmo-cmp.md), [reference](http://wis.wmo.int/2013/metadata/version_1-3-0/WMO_Core_Metadata_Profile_v1.3_Part_1.pdf) diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py index a4bae55..65557fe 100644 --- a/pygeometa/schemas/__init__.py +++ b/pygeometa/schemas/__init__.py @@ -58,14 +58,14 @@ 'iso19139': 'pygeometa.schemas.iso19139.ISO19139OutputSchema', 'iso19139-2': 'pygeometa.schemas.iso19139_2.ISO19139_2OutputSchema', 'iso19139-hnap': 'pygeometa.schemas.iso19139_hnap.ISO19139HNAPOutputSchema', # noqa - 'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema', # noqa + 'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema', # noqa + 'openaire': 'pygeometa.schemas.openaire.OpenAireOutputSchema', 'schema-org': 'pygeometa.schemas.schema_org.SchemaOrgOutputSchema', 'stac-item': 'pygeometa.schemas.stac.STACItemOutputSchema', 'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema', 'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema', 'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema', - 'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema', - 'openaire': 'pygeometa.schemas.openaire.OpenAireOutputSchema' + 'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema' } diff --git a/pygeometa/schemas/openaire/__init__.py b/pygeometa/schemas/openaire/__init__.py index 0d2ef4d..9cdccb4 100644 --- a/pygeometa/schemas/openaire/__init__.py +++ b/pygeometa/schemas/openaire/__init__.py @@ -18,7 +18,9 @@ # those files. Users are asked to read the 3rd Party Licenses # referenced with those assets. 
# -# Copyright (c) 2025 Tom Kralidis, Jiarong Li, Paul van Genuchten +# Copyright (c) 2026 Tom Kralidis +# Copyright (c) 2026 Jiarong Li +# Copyright (c) 2026 Paul van Genuchten # # Permission is hereby granted, free of charge, to any person # obtaining a copy of this software and associated documentation @@ -43,11 +45,11 @@ # # ================================================================= +import json import logging import os -import json -from typing import Union import uuid +from typing import Union from pygeometa.schemas.base import BaseOutputSchema @@ -92,10 +94,15 @@ def import_(self, metadata: str) -> dict: md = json.loads(metadata) if md is None: - LOGGER.info('invalid openaire metadata') - return mcf + raise ValueError('No openaire metadata') - metadata_ = md.get('results')[0] + if 'results' in md: + LOGGER.info('Using first record from results') + md = next(iter(md.get('results')), None) + if md is None: + raise ValueError('No openaire metadata in results') + + metadata_ = md # mcf: metadata pids_ = metadata_.get('pids', []) @@ -108,6 +115,8 @@ def import_(self, metadata: str) -> dict: if main_id_: mcf['metadata']['identifier'] = main_id_ + else: + raise ValueError('No identification on record') if altIds_: mcf['metadata']['additional_identifiers'] = altIds_ @@ -135,30 +144,30 @@ def import_(self, metadata: str) -> dict: mcf['metadata']['hierarchylevel'] = instance_type_ date_of_collection = metadata_.get('dateOfCollection') - if date_of_collection: + if date_of_collection is not None: mcf['metadata']['datestamp'] = metadata_.get('dateOfCollection') - if main_instance_: + if main_instance_ is not None: urls = main_instance_.get('urls') if urls: - mcf['metadata']['dataseturi'] = urls[0] + mcf['metadata']['dataseturi'] = next(iter(urls), '') # mcf: identification language_ = metadata_.get('language', {}).get('code') - if language_: + if language_ is not None: mcf['identification']['language'] = language_ main_title = metadata_.get('mainTitle') # subtitle also exists - if main_title: + if main_title is not None: mcf['identification']['title'] = main_title description_ = metadata_.get('descriptions') - if description_: - mcf['identification']['abstract'] = description_[0] + if description_ is not None: + mcf['identification']['abstract'] = next(iter(description_), '') version_ = metadata_.get('version') - if version_: + if version_ is not None: mcf['identification']['edition'] = version_ # topiccategory @@ -172,7 +181,7 @@ def import_(self, metadata: str) -> dict: elif instance_right_ is not None and instance_right_ != 'unspecified': mcf['identification']['rights'] = instance_right_ - if main_instance_: + if main_instance_ is not None: license_ = main_instance_.get('license') if license_: mcf['identification']['license'] = { @@ -238,11 +247,18 @@ def process_id_and_instance( main_instance is the first instance with the matched id of main_id, otherwise the first instance. 
+ + :param pids: main pids + :param originIds: pids from original + :param id: id of the record + :param instances: instances in the record + + :returns: `tuple` of (main_id, alternative_ids, main_instance) """ # altids and main_id pids_schemevalue = None - if pids is not None: + if pids is not None and len(pids) > 0: first_id = pids[0] main_id = first_id.get('value') if first_id else None if len(pids) > 1: @@ -257,7 +273,7 @@ def process_id_and_instance( } for i in pids ] - elif originIds is not None: + elif originIds is not None and len(originIds) > 0: main_id = next((item for item in originIds if item.startswith('10.')), None) if main_id is None: @@ -320,6 +336,10 @@ def process_keywords(subjects: list) -> dict: convert openaire keywords to mcf keywords group keywords by scheme + + :param subjects: list + + :returns: `dict` grouped keywords """ unique_scheme = list(set([s.get('subject', {}).get('scheme') for s in subjects])) @@ -371,7 +391,7 @@ def process_contact(contact_list: list) -> dict: if pid is not None and pid.get('id') is not None: pid_scheme = pid.get('id', {}).get('scheme') pid_value = pid.get('id', {}).get('value') - if pid_scheme is not None and pid_value is not None: + if None not in [pid_scheme, pid_value]: contactpoint_dict['url'] = id2url(pid_scheme, pid_value) # Process organizations @@ -381,22 +401,12 @@ def process_contact(contact_list: list) -> dict: pids = contact.get('pids', []) if pids is not None: for p in pids: - if p.get('scheme').lower() == 'ror': - contactpoint_dict['url'] = id2url( - p.get('scheme'), p.get('value')) - break - elif p.get('scheme').lower() == 'grid': - contactpoint_dict['url'] = id2url( - p.get('scheme'), p.get('value')) - break - elif p.get('scheme').lower() == 'wikidata': - contactpoint_dict['url'] = id2url( - p.get('scheme'), p.get('value')) - break - elif p.get('scheme').lower() == 'isni': + if p.get('scheme').lower() in ['ror', 'grid', + 'wikidata', 'isni']: contactpoint_dict['url'] = id2url( p.get('scheme'), p.get('value')) break + # Add to contactpoint dict if (contactpoint_dict['individualname'] or contactpoint_dict['organization']): @@ -405,19 +415,24 @@ def process_contact(contact_list: list) -> dict: return contact_dict -def id2url(scheme: str, id: str) -> str: +def id2url(scheme: str, id_: str) -> str: """ Convert orcid, wikidata, ror or grid value to url + + :param scheme: scheme + :param id: identifier + + :returns: `str` url """ - if scheme.lower() == 'orcid': - return 'https://orcid.org/' + id - elif scheme.lower() == 'ror': - return id - elif scheme.lower() == 'grid': - return id - elif scheme.lower() == 'wikidata': - return 'https://www.wikidata.org/wiki/' + id - elif scheme.lower() == 'isni': - return 'https://isni.org/isni/' + id - else: - return None + scheme2 = scheme.lower() + + if scheme2 in ['ror', 'grid']: + return id_ + elif scheme2 == 'orcid': + return f'https://orcid.org/{id_}' + elif scheme2 == 'wikidata': + return f'https://www.wikidata.org/wiki/{id_}' + elif scheme2 == 'isni': + return f'https://isni.org/isni/{id_}' + + return None diff --git a/tests/openaire.json b/tests/openaire.json new file mode 100644 index 0000000..26aa181 --- /dev/null +++ b/tests/openaire.json @@ -0,0 +1,243 @@ +{ + "header": { + "numFound": 1, + "maxScore": 1.0, + "queryTime": 10, + "page": 1, + "pageSize": 10 + }, + "results": [ + { + "authors": [ + { + "fullName": "Fohrafellner, Julia", + "name": "Julia", + "surname": "Fohrafellner", + "rank": 1, + "pid": { + "id": { + "scheme": "orcid_pending", + "value": "0000-0001-5734-7353" + }, 
+ "provenance": null + } + } + ], + "openAccessColor": null, + "publiclyFunded": null, + "type": "dataset", + "language": { + "code": "eng", + "label": "English" + }, + "countries": null, + "subjects": [ + { + "subject": { + "scheme": "keyword", + "value": "EJP SOIL" + }, + "provenance": null + }, + { + "subject": { + "scheme": "keyword", + "value": "ProbeField" + }, + "provenance": null + }, + { + "subject": { + "scheme": "keyword", + "value": "Spectroscopy, Near-Infrared" + }, + "provenance": null + }, + { + "subject": { + "scheme": "keyword", + "value": "data" + }, + "provenance": null + } + ], + "mainTitle": "Dataset to: Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments", + "subTitle": null, + "descriptions": [ + "Dataset description This is the corresponding dataset to the publication \"Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments\" by Fohrafellner et al. (2025). In this publication, we created the first Near-Infrared (NIR) Austrian Soil Spectral Library (ASSL, 680 – 2500 nm) using 2,129 legacy samples from all environmental zones of Austria. Additionally, we utilized partial least squares regression modeling to evaluate the dataset's current effectiveness for soil health assessments. The dataset contains three tabs, \"Document meta data\", \"Legend\" and \"Dataset\". Tab \"Document meta data\" gives information on the authors, the data collection time frame, terms of use, etc. In \"Legend\", each column of the \"Dataset\" is described. The \"Dataset\" contains information on the legacy soil samples including: meta data (e.g. sample number, sampling year, zip code, environmental zone, land use), soil properties (soil organic carbon [SOC], SOC to clay ratio, total carbon, labile carbon, CaCO3, total nitrogen, plant available phosphorus, pH measured in CaCl2 and acetate, cation exchange capacity, texture [sand, silt, clay content], and clay content measured by density in suspension), and measured NIR soil spectra, also for the standards. Project description This Austrian Soil Spectral Library was built within the ProbeField project (November 2021 – January 2025), which was part of the European Joint Program for SOIL ‘Towards climate-smart sustainable management of agricultural soils’ (EJP SOIL) funded by the European Union Horizon 2020 research and innovation programme (Grant Agreement N° 862695). The project aimed to create a protocol detailing procedures and methodologies for accurately estimating fertility-related properties in agricultural soils in the field. Additionally, the potential for extending this data to two- and three-dimensional mapping using co-variates was demonstrated. ProbeField further collected field spectra that closely match laboratory spectra, enabling the prediction of soil properties using models calibrated with soil spectral libraries. References Fohrafellner, J., Lippl, M., Bajraktarevic, A., Baumgarten, A., Spiegel, H., Körner, R. and Sandén, T.: Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments, 2025, in review." 
+ ], + "publicationDate": "2025-07-07", + "publisher": "Zenodo", + "embargoEndDate": null, + "sources": null, + "formats": null, + "contributors": [ + "Fohrafellner, Julia", + "Lippl, Maximilian", + "Bajraktarevic, Armin", + "Spiegel, Heide", + "Körner, Robert", + "Sandén, Taru" + ], + "coverages": null, + "bestAccessRight": { + "code": "c_abf2", + "label": "OPEN", + "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/" + }, + "container": null, + "documentationUrls": null, + "codeRepositoryUrl": null, + "programmingLanguage": null, + "contactPeople": null, + "contactGroups": null, + "tools": null, + "size": null, + "version": "1", + "geoLocations": null, + "id": "doi_dedup___::344b6b438c623855996b060ef6656bdd", + "originalIds": [ + "50|datacite____::344b6b438c623855996b060ef6656bdd", + "10.5281/zenodo.15772619", + "50|sygma_______::344b6b438c623855996b060ef6656bdd" + ], + "pids": [ + { + "scheme": "doi", + "value": "10.5281/zenodo.15772619" + } + ], + "dateOfCollection": null, + "lastUpdateTimeStamp": null, + "indicators": { + "citationImpact": { + "citationCount": 0.0, + "influence": 2.4895952E-9, + "popularity": 2.7494755E-9, + "impulse": 0.0, + "citationClass": "C5", + "influenceClass": "C5", + "impulseClass": "C5", + "popularityClass": "C5" + } + }, + "projects": [ + { + "id": "corda__h2020::f3d09ef95fcf5a35c0e90e6560b6b2e0", + "code": "862695", + "acronym": "EJP SOIL", + "title": "Towards climate-smart sustainable management of agricultural soils", + "funder": "European Commission", + "pids": [ + { + "scheme": "doi", + "value": "10.3030/862695" + } + ] + } + ], + "organizations": [ + { + "legalName": "Austrian Agency for Health and Food Safety", + "acronym": "AGES", + "id": "openorgs____::fcf31bab84c16bfd27726caa5bb57126", + "pids": [ + { + "scheme": "OrgReg", + "value": "AT3001" + }, + { + "scheme": "GRID", + "value": "grid.414107.7" + }, + { + "scheme": "ROR", + "value": "https://ror.org/055xb4311" + }, + { + "scheme": "PIC", + "value": "998254743" + }, + { + "scheme": "ISNI", + "value": "0000000122246253" + } + ] + } + ], + "communities": [ + { + "code": "eosc", + "label": "EOSC", + "provenance": null + } + ], + "collectedFrom": [ + { + "key": "openaire____::9e3be59865b2c1c335d32dae2fe7b254", + "value": "Datacite" + }, + { + "key": "openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9", + "value": "Sygma" + } + ], + "instances": [ + { + "pids": [ + { + "scheme": "doi", + "value": "10.5281/zenodo.15772619" + } + ], + "license": "CC BY", + "type": "Dataset", + "urls": [ + "https://dx.doi.org/10.5281/zenodo.15772619" + ], + "publicationDate": "2025-07-07", + "refereed": "nonPeerReviewed", + "hostedBy": { + "key": "opendoar____::358aee4cc897452c00244351e4d91f69", + "value": "ZENODO" + }, + "collectedFrom": { + "key": "openaire____::9e3be59865b2c1c335d32dae2fe7b254", + "value": "Datacite" + } + }, + { + "alternateIdentifiers": [ + { + "scheme": "doi", + "value": "10.5281/zenodo.15772619" + } + ], + "license": "CC BY", + "accessRight": { + "code": "c_abf2", + "label": "OPEN", + "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/", + "openAccessRoute": null + }, + "type": "Dataset", + "urls": [ + "https://doi.org/10.5281/zenodo.15772619" + ], + "publicationDate": "2025-07-07", + "refereed": "nonPeerReviewed", + "hostedBy": { + "key": "openaire____::55045bd2a65019fd8e6741a755395c8c", + "value": "Unknown Repository" + }, + "collectedFrom": { + "key": "openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9", + "value": "Sygma" + } + } + ], + 
"isGreen": null, + "isInDiamondJournal": null + } + ] +} \ No newline at end of file diff --git a/tests/run_tests.py b/tests/run_tests.py index 317407b..268ce68 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -229,14 +229,14 @@ def test_get_supported_schemas(self): self.assertEqual(len(schemas), 13, 'Expected specific number of supported schemas') self.assertEqual(sorted(schemas), - sorted(['cwl', 'csvw', 'dcat', 'iso19139', 'iso19139-2', + sorted(['cwl', 'csvw', 'dcat', 'iso19139', 'iso19139-2', # noqa 'iso19139-hnap', 'oarec-record', 'openaire', 'schema-org', 'stac-item', 'wmo-cmp', 'wmo-wcmp2', 'wmo-wigos']), 'Expected exact list of supported schemas') schemas = get_supported_schemas(include_autodetect=True) - self.assertEqual(len(schemas), 13, + self.assertEqual(len(schemas), 14, 'Expected specific number of supported schemas') self.assertIn('autodetect', schemas, 'Expected autodetect in list') @@ -452,6 +452,17 @@ def test_import_metadata(self): 'WIS/GTS bulletin SMJP01 RJTD in FM12 SYNOP', 'Expected specific title') + def test_openaire(self): + """test metadata import openaire""" + + with open(get_abspath('openaire.json')) as fh: + mcf = import_metadata('openaire', fh.read()) + + self.assertEqual( + mcf['identification']['title'], + 'Dataset to: Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments', # noqa + 'Expected specific title') + with open(get_abspath('md-SMJP01RJTD-gmd.xml')) as fh: mcf = import_metadata('autodetect', fh.read())