diff --git a/README.md b/README.md
index 750d71b..e410b28 100644
--- a/README.md
+++ b/README.md
@@ -75,10 +75,13 @@ pygeometa metadata transform path/to/file.xml --input-schema=autodetect --output
 
 ### Supported schemas
 Schemas supported by pygeometa:
+* CSV on the Web, [reference](https://www.w3.org/TR/2015/REC-tabular-metadata-20151217/)
 * dcat, [reference](https://www.w3.org/TR/vocab-dcat-2/)
 * iso19139, [reference](http://www.iso.org/iso/catalogue_detail.htm?csnumber=32557)
 * iso19139-hnap, [reference](http://www.gcpedia.gc.ca/wiki/Federal_Geospatial_Platform/Policies_and_Standards/Catalogue/Release/Appendix_B_Guidelines_and_Best_Practices/Guide_to_Harmonized_ISO_19115:2003_NAP)
 * OGC API - Records - Part 1: Core, record model, [reference](https://github.com/opengeospatial/ogcapi-records/blob/master/core/openapi/schemas/recordGeoJSON.yaml)
+* OpenAIRE metadata schema, [reference](https://graph.openaire.eu/docs/data-model/entities/research-product)
+* Schema.org, [reference](https://schema.org/Dataset)
 * SpatioTemporal Asset Catalog [(STAC)](https://stacspec.org)
 * iso19139-2, [reference](https://www.iso.org/standard/67039.html)
 * [wmo-cmp](doc/content/reference/formats/wmo-cmp.md), [reference](http://wis.wmo.int/2013/metadata/version_1-3-0/WMO_Core_Metadata_Profile_v1.3_Part_1.pdf)
diff --git a/pygeometa/schemas/__init__.py b/pygeometa/schemas/__init__.py
index e9059ce..65557fe 100644
--- a/pygeometa/schemas/__init__.py
+++ b/pygeometa/schemas/__init__.py
@@ -58,7 +58,8 @@
     'iso19139': 'pygeometa.schemas.iso19139.ISO19139OutputSchema',
     'iso19139-2': 'pygeometa.schemas.iso19139_2.ISO19139_2OutputSchema',
     'iso19139-hnap': 'pygeometa.schemas.iso19139_hnap.ISO19139HNAPOutputSchema',  # noqa
-    'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema',  # noqa
+    'oarec-record': 'pygeometa.schemas.ogcapi_records.OGCAPIRecordOutputSchema',  # noqa
+    'openaire': 'pygeometa.schemas.openaire.OpenAireOutputSchema',
     'schema-org': 'pygeometa.schemas.schema_org.SchemaOrgOutputSchema',
     'stac-item': 'pygeometa.schemas.stac.STACItemOutputSchema',
     'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
diff --git a/pygeometa/schemas/openaire/__init__.py b/pygeometa/schemas/openaire/__init__.py
new file mode 100644
index 0000000..9cdccb4
--- /dev/null
+++ b/pygeometa/schemas/openaire/__init__.py
@@ -0,0 +1,438 @@
+# =================================================================
+#
+# Terms and Conditions of Use
+#
+# Unless otherwise noted, computer program source code of this
+# distribution is covered under Crown Copyright, Government of
+# Canada, and is distributed under the MIT License.
+#
+# The Canada wordmark and related graphics associated with this
+# distribution are protected under trademark law and copyright law.
+# No permission is granted to use them outside the parameters of
+# the Government of Canada's corporate identity program. For
+# more information, see
+# http://www.tbs-sct.gc.ca/fip-pcim/index-eng.asp
+#
+# Copyright title to all 3rd party software distributed with this
+# software is held by the respective copyright holders as noted in
+# those files. Users are asked to read the 3rd Party Licenses
+# referenced with those assets.
+#
+# Copyright (c) 2026 Tom Kralidis
+# Copyright (c) 2026 Jiarong Li
+# Copyright (c) 2026 Paul van Genuchten
+#
+# Permission is hereby granted, free of charge, to any person
+# obtaining a copy of this software and associated documentation
+# files (the "Software"), to deal in the Software without
+# restriction, including without limitation the rights to use,
+# copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following
+# conditions:
+#
+# The above copyright notice and this permission notice shall be
+# included in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+# OTHER DEALINGS IN THE SOFTWARE.
+#
+# =================================================================
+
+import json
+import logging
+import os
+import uuid
+from typing import Union
+
+from pygeometa.schemas.base import BaseOutputSchema
+
+THISDIR = os.path.dirname(os.path.realpath(__file__))
+
+LOGGER = logging.getLogger(__name__)
+
+
+class OpenAireOutputSchema(BaseOutputSchema):
+    """OpenAIRE: record schema"""
+
+    def __init__(self):
+        """
+        Initialize object
+
+        :returns: pygeometa.schemas.base.BaseOutputSchema
+        """
+
+        description = 'OpenAIRE'
+
+        super().__init__('openaire', description, 'json', THISDIR)
+
+    def import_(self, metadata: str) -> dict:
+        """
+        Import metadata into MCF
+
+        :param metadata: string of metadata content
+
+        :returns: `dict` of MCF content
+        """
+
+        # initialize MCF
+        mcf = {
+            'mcf': {
+                'version': '1.0',
+            },
+            'metadata': {},
+            'identification': {},
+            'contact': {}
+        }
+
+        md = json.loads(metadata)
+
+        if md is None:
+            raise ValueError('No OpenAIRE metadata')
+
+        if 'results' in md:
+            LOGGER.info('Using first record from results')
+            md = next(iter(md.get('results')), None)
+            if md is None:
+                raise ValueError('No OpenAIRE metadata in results')
+
+        metadata_ = md
+
+        # mcf: metadata
+        pids_ = metadata_.get('pids', [])
+        originIds_ = metadata_.get('originalIds', [])
+        id_ = metadata_.get('id')
+
+        children_instances_ = metadata_.get('instances')
+        main_id_, altIds_, main_instance_ = process_id_and_instance(
+            pids_, originIds_, id_, children_instances_)
+
+        if main_id_:
+            mcf['metadata']['identifier'] = main_id_
+        else:
+            raise ValueError('No identifier on record')
+
+        if altIds_:
+            mcf['metadata']['additional_identifiers'] = altIds_
+
+        project_ = metadata_.get('projects')
+        if project_ is not None and isinstance(project_, list):
+            rel_project = []
+            for p in project_:
+                pids = p.get('pids', [])
+                if pids is None or len(pids) == 0:
+                    continue
+                pid = pids[0]
+                pro_dict = {
+                    'identifier': pid.get('value'),
+                    'scheme': pid.get('scheme'),
+                    'type': 'project'
+                }
+                rel_project.append(pro_dict)
+            if len(rel_project) > 0:
+                mcf['metadata']['relations'] = rel_project
+
+        if main_instance_:
+            instance_type_ = main_instance_.get('type')
+            if instance_type_:
+                mcf['metadata']['hierarchylevel'] = instance_type_
+
+        date_of_collection = metadata_.get('dateOfCollection')
+        if date_of_collection is not None:
+            mcf['metadata']['datestamp'] = date_of_collection
+
+        if main_instance_ is not None:
+            urls = main_instance_.get('urls')
+            if urls:
+                mcf['metadata']['dataseturi'] = next(iter(urls), '')
+
+        # mcf: identification
+        language_ = metadata_.get('language', {}).get('code')
+        if language_ is not None:
+            mcf['identification']['language'] = language_
+
+        main_title = metadata_.get('mainTitle')
+        # a subTitle field also exists
+        if main_title is not None:
+            mcf['identification']['title'] = main_title
+
+        description_ = metadata_.get('descriptions')
+        if description_ is not None:
+            mcf['identification']['abstract'] = next(iter(description_), '')
+
+        version_ = metadata_.get('version')
+        if version_ is not None:
+            mcf['identification']['edition'] = version_
+
+        # rights
+        right_ = metadata_.get('bestAccessRight', {}).get('label')
+        instance_right_ = None
+        if main_instance_:
+            instance_right_ = main_instance_.get(
+                'accessRight', {}).get('label')
+        if right_ is not None and right_ != 'unspecified':
+            mcf['identification']['rights'] = right_
+        elif instance_right_ is not None and instance_right_ != 'unspecified':
+            mcf['identification']['rights'] = instance_right_
+
+        if main_instance_ is not None:
+            license_ = main_instance_.get('license')
+            if license_:
+                mcf['identification']['license'] = {
+                    'name': license_,
+                    'url': ''
+                }
+
+        # dates
+        dates_dict = {}
+        p_date = metadata_.get('publicationDate')
+        e_date = metadata_.get('embargoEndDate')
+        if p_date:
+            dates_dict['publication'] = p_date
+            mcf['identification']['datestamp'] = [p_date]
+        if e_date:
+            dates_dict['embargoend'] = e_date
+        if dates_dict:
+            mcf['identification']['dates'] = dates_dict
+
+        subjects_ = metadata_.get('subjects')
+        if isinstance(subjects_, dict):
+            mcf['identification']['keywords'] = process_keywords([subjects_])
+        elif isinstance(subjects_, list):
+            mcf['identification']['keywords'] = process_keywords(subjects_)
+
+        # contact point
+        authors_ = metadata_.get('authors', []) or []
+        orgs_ = metadata_.get('organizations', []) or []
+        contact_ = authors_ + orgs_
+        if len(contact_) > 0:
+            mcf['contact'] = process_contact(contact_)
+
+        return mcf
+
+    def write(self, mcf: dict, stringify: bool = True) -> Union[dict, str]:
+        """
+        Write MCF to OpenAIRE JSON string buffer
+
+        :param mcf: dict of MCF content model
+        :param stringify: whether to return a string representation (default)
+                          else native (dict, etree)
+
+        :returns: `dict` or `str` of MCF as an OpenAIRE record
+        """
+
+        # no write implementation for now
+        return ''
+
+
+def process_id_and_instance(
+        pids: list, originIds: list,
+        id: str, instances: list) -> tuple[str, list, dict]:
+    """
+    Get the main_id, alternative_ids and main_instance from the input data
+
+    Alternative ids are the unique ids from pids and originIds.
+
+    The main id is the first DOI from pids, otherwise the first pid,
+    otherwise a DOI from originIds, otherwise an HTTP URL from originIds,
+    otherwise the first originId.
+
+    The main_instance is the first instance whose pid matches main_id,
+    otherwise the first instance.
+
+    :param pids: main pids
+    :param originIds: pids from the original record
+    :param id: id of the record
+    :param instances: instances in the record
+
+    :returns: `tuple` of (main_id, alternative_ids, main_instance)
+    """
+
+    # altids and main_id
+    pids_schemevalue = None
+    if pids is not None and len(pids) > 0:
+        first_id = pids[0]
+        main_id = first_id.get('value') if first_id else None
+        if len(pids) > 1:
+            for i in pids:
+                if i.get('scheme') == 'doi':
+                    main_id = i.get('value')
+                    break
+        pids_schemevalue = [
+            {
+                'identifier': i.get('value'),
+                'scheme': i.get('scheme')
+            }
+            for i in pids
+        ]
+    elif originIds is not None and len(originIds) > 0:
+        main_id = next((item for item in originIds
+                        if item.startswith('10.')), None)
+        if main_id is None:
+            main_id = next((item for item in originIds
+                            if item.startswith('http')), None)
+        if main_id is None:
+            main_id = originIds[0]
+    elif id is not None:
+        main_id = id
+    else:
+        LOGGER.error('no valid identifier')
+        main_id = None
+
+    origin_and_ids = originIds + [id] if originIds else [id]
+    origin_and_ids_uni = list(set(origin_and_ids)) if (
+        origin_and_ids is not None) else []
+
+    if pids_schemevalue:
+        pids_values = [i.get('identifier') for i in pids_schemevalue]
+        for i in origin_and_ids_uni:
+            if i not in pids_values and i is not None:
+                pids_schemevalue.append({
+                    'identifier': i,
+                    'scheme': None
+                })
+    else:
+        pids_schemevalue = [
+            {
+                'identifier': i,
+                'scheme': None
+            }
+            for i in origin_and_ids_uni
+        ]
+
+    # instance
+    if instances is None or len(instances) == 0:
+        return main_id, pids_schemevalue, None
+
+    # get the instance matched with the main id; instances may carry
+    # their identifiers under 'pids' (list) or 'pid' (dict)
+    main_instance = instances[0]
+    for ins in instances:
+        pid = ins.get('pids', ins.get('pid', {}))
+        if isinstance(pid, list):  # instance has multiple pids
+            pid_values = [i.get('value') for i in pid]
+            if main_id in pid_values:
+                main_instance = ins
+                break
+        elif isinstance(pid, dict):  # instance has one pid
+            if pid.get('value') == main_id:
+                main_instance = ins
+                break
+
+    return main_id, pids_schemevalue, main_instance
+
+
+def process_keywords(subjects: list) -> dict:
+    """
+    Convert OpenAIRE subjects to MCF keywords, grouped by scheme
+
+    :param subjects: `list` of subject objects
+
+    :returns: `dict` of grouped keywords
+    """
+
+    unique_scheme = list(set([s.get('subject', {}).get('scheme')
+                              for s in subjects]))
+
+    scheme_uuid_dict = {scheme: str(uuid.uuid4()) for scheme in unique_scheme}
+
+    keywords_dict = {
+        value: {
+            'keywords': [],
+            'vocabulary': {
+                'name': key
+            }
+        }
+        for key, value in scheme_uuid_dict.items()
+    }
+
+    for s in subjects:
+        s_value = s.get('subject')
+        for k, v in keywords_dict.items():
+            if s_value.get('scheme') == v.get('vocabulary', {}).get('name'):
+                v['keywords'].append(s_value.get('value'))
+                break
+
+    return keywords_dict
+
+
+def process_contact(contact_list: list) -> dict:
+    """
+    Process authors and organizations into MCF contact format
+
+    :param contact_list: list of author and organization objects
+
+    :returns: `dict` with UUID keys and contact point values
+    """
+
+    contact_dict = {}
+
+    for contact in contact_list:
+        contact_uuid = str(uuid.uuid4())
+        # Initialize contact point structure
+        contactpoint_dict = {
+            'individualname': '',
+            'organization': '',
+            'url': ''
+        }
+        # Process authors
+        if 'fullName' in contact:
+            contactpoint_dict['individualname'] = contact.get('fullName')
+            pid = contact.get('pid')
+            if pid is not None and pid.get('id') is not None:
+                pid_scheme = pid.get('id', {}).get('scheme')
+                pid_value = pid.get('id', {}).get('value')
+                if None not in [pid_scheme, pid_value]:
+                    url = id2url(pid_scheme, pid_value)
+                    if url is not None:  # unrecognized schemes yield None
+                        contactpoint_dict['url'] = url
+
+        # Process organizations
+        elif 'legalName' in contact:
+            org_name = contact.get('legalName')
+            contactpoint_dict['organization'] = org_name
+            pids = contact.get('pids', [])
+            if pids is not None:
+                for p in pids:
+                    if (p.get('scheme') or '').lower() in ['ror', 'grid',
+                                                           'wikidata', 'isni']:
+                        contactpoint_dict['url'] = id2url(
+                            p.get('scheme'), p.get('value'))
+                        break
+
+        # Add to contactpoint dict
+        if (contactpoint_dict['individualname'] or
+                contactpoint_dict['organization']):
+            contact_dict[contact_uuid] = contactpoint_dict
+
+    return contact_dict
+
+
+def id2url(scheme: str, id_: str) -> Union[str, None]:
+    """
+    Convert an ORCID, Wikidata, ROR, GRID or ISNI identifier to a URL
+
+    :param scheme: identifier scheme
+    :param id_: identifier value
+
+    :returns: `str` of URL, or `None` for unrecognized schemes
+    """
+
+    scheme2 = scheme.lower()
+
+    if scheme2 in ['ror', 'grid']:
+        return id_
+    elif scheme2 == 'orcid':
+        return f'https://orcid.org/{id_}'
+    elif scheme2 == 'wikidata':
+        return f'https://www.wikidata.org/wiki/{id_}'
+    elif scheme2 == 'isni':
+        return f'https://isni.org/isni/{id_}'
+
+    return None
diff --git a/tests/openaire.json b/tests/openaire.json
new file mode 100644
index 0000000..26aa181
--- /dev/null
+++ b/tests/openaire.json
@@ -0,0 +1,243 @@
+{
+    "header": {
+        "numFound": 1,
+        "maxScore": 1.0,
+        "queryTime": 10,
+        "page": 1,
+        "pageSize": 10
+    },
+    "results": [
+        {
+            "authors": [
+                {
+                    "fullName": "Fohrafellner, Julia",
+                    "name": "Julia",
+                    "surname": "Fohrafellner",
+                    "rank": 1,
+                    "pid": {
+                        "id": {
+                            "scheme": "orcid_pending",
+                            "value": "0000-0001-5734-7353"
+                        },
+                        "provenance": null
+                    }
+                }
+            ],
+            "openAccessColor": null,
+            "publiclyFunded": null,
+            "type": "dataset",
+            "language": {
+                "code": "eng",
+                "label": "English"
+            },
+            "countries": null,
+            "subjects": [
+                {
+                    "subject": {
+                        "scheme": "keyword",
+                        "value": "EJP SOIL"
+                    },
+                    "provenance": null
+                },
+                {
+                    "subject": {
+                        "scheme": "keyword",
+                        "value": "ProbeField"
+                    },
+                    "provenance": null
+                },
+                {
+                    "subject": {
+                        "scheme": "keyword",
+                        "value": "Spectroscopy, Near-Infrared"
+                    },
+                    "provenance": null
+                },
+                {
+                    "subject": {
+                        "scheme": "keyword",
+                        "value": "data"
+                    },
+                    "provenance": null
+                }
+            ],
+            "mainTitle": "Dataset to: Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments",
+            "subTitle": null,
+            "descriptions": [
+                "Dataset description This is the corresponding dataset to the publication \"Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments\" by Fohrafellner et al. (2025). In this publication, we created the first Near-Infrared (NIR) Austrian Soil Spectral Library (ASSL, 680 – 2500 nm) using 2,129 legacy samples from all environmental zones of Austria. Additionally, we utilized partial least squares regression modeling to evaluate the dataset's current effectiveness for soil health assessments. The dataset contains three tabs, \"Document meta data\", \"Legend\" and \"Dataset\". Tab \"Document meta data\" gives information on the authors, the data collection time frame, terms of use, etc. In \"Legend\", each column of the \"Dataset\" is described. The \"Dataset\" contains information on the legacy soil samples including: meta data (e.g. sample number, sampling year, zip code, environmental zone, land use), soil properties (soil organic carbon [SOC], SOC to clay ratio, total carbon, labile carbon, CaCO3, total nitrogen, plant available phosphorus, pH measured in CaCl2 and acetate, cation exchange capacity, texture [sand, silt, clay content], and clay content measured by density in suspension), and measured NIR soil spectra, also for the standards. Project description This Austrian Soil Spectral Library was built within the ProbeField project (November 2021 – January 2025), which was part of the European Joint Program for SOIL ‘Towards climate-smart sustainable management of agricultural soils’ (EJP SOIL) funded by the European Union Horizon 2020 research and innovation programme (Grant Agreement N° 862695). The project aimed to create a protocol detailing procedures and methodologies for accurately estimating fertility-related properties in agricultural soils in the field. Additionally, the potential for extending this data to two- and three-dimensional mapping using co-variates was demonstrated. ProbeField further collected field spectra that closely match laboratory spectra, enabling the prediction of soil properties using models calibrated with soil spectral libraries. References Fohrafellner, J., Lippl, M., Bajraktarevic, A., Baumgarten, A., Spiegel, H., Körner, R. and Sandén, T.: Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments, 2025, in review."
+            ],
+            "publicationDate": "2025-07-07",
+            "publisher": "Zenodo",
+            "embargoEndDate": null,
+            "sources": null,
+            "formats": null,
+            "contributors": [
+                "Fohrafellner, Julia",
+                "Lippl, Maximilian",
+                "Bajraktarevic, Armin",
+                "Spiegel, Heide",
+                "Körner, Robert",
+                "Sandén, Taru"
+            ],
+            "coverages": null,
+            "bestAccessRight": {
+                "code": "c_abf2",
+                "label": "OPEN",
+                "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/"
+            },
+            "container": null,
+            "documentationUrls": null,
+            "codeRepositoryUrl": null,
+            "programmingLanguage": null,
+            "contactPeople": null,
+            "contactGroups": null,
+            "tools": null,
+            "size": null,
+            "version": "1",
+            "geoLocations": null,
+            "id": "doi_dedup___::344b6b438c623855996b060ef6656bdd",
+            "originalIds": [
+                "50|datacite____::344b6b438c623855996b060ef6656bdd",
+                "10.5281/zenodo.15772619",
+                "50|sygma_______::344b6b438c623855996b060ef6656bdd"
+            ],
+            "pids": [
+                {
+                    "scheme": "doi",
+                    "value": "10.5281/zenodo.15772619"
+                }
+            ],
+            "dateOfCollection": null,
+            "lastUpdateTimeStamp": null,
+            "indicators": {
+                "citationImpact": {
+                    "citationCount": 0.0,
+                    "influence": 2.4895952E-9,
+                    "popularity": 2.7494755E-9,
+                    "impulse": 0.0,
+                    "citationClass": "C5",
+                    "influenceClass": "C5",
+                    "impulseClass": "C5",
+                    "popularityClass": "C5"
+                }
+            },
+            "projects": [
+                {
+                    "id": "corda__h2020::f3d09ef95fcf5a35c0e90e6560b6b2e0",
+                    "code": "862695",
+                    "acronym": "EJP SOIL",
+                    "title": "Towards climate-smart sustainable management of agricultural soils",
+                    "funder": "European Commission",
+                    "pids": [
+                        {
+                            "scheme": "doi",
+                            "value": "10.3030/862695"
+                        }
+                    ]
+                }
+            ],
+            "organizations": [
+                {
+                    "legalName": "Austrian Agency for Health and Food Safety",
+                    "acronym": "AGES",
+                    "id": "openorgs____::fcf31bab84c16bfd27726caa5bb57126",
+                    "pids": [
+                        {
+                            "scheme": "OrgReg",
+                            "value": "AT3001"
+                        },
+                        {
+                            "scheme": "GRID",
+                            "value": "grid.414107.7"
+                        },
+                        {
+                            "scheme": "ROR",
+                            "value": "https://ror.org/055xb4311"
+                        },
+                        {
+                            "scheme": "PIC",
+                            "value": "998254743"
+                        },
+                        {
+                            "scheme": "ISNI",
+                            "value": "0000000122246253"
+                        }
+                    ]
+                }
+            ],
+            "communities": [
+                {
+                    "code": "eosc",
+                    "label": "EOSC",
+                    "provenance": null
+                }
+            ],
+            "collectedFrom": [
+                {
+                    "key": "openaire____::9e3be59865b2c1c335d32dae2fe7b254",
+                    "value": "Datacite"
+                },
+                {
+                    "key": "openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9",
+                    "value": "Sygma"
+                }
+            ],
+            "instances": [
+                {
+                    "pids": [
+                        {
+                            "scheme": "doi",
+                            "value": "10.5281/zenodo.15772619"
+                        }
+                    ],
+                    "license": "CC BY",
+                    "type": "Dataset",
+                    "urls": [
+                        "https://dx.doi.org/10.5281/zenodo.15772619"
+                    ],
+                    "publicationDate": "2025-07-07",
+                    "refereed": "nonPeerReviewed",
+                    "hostedBy": {
+                        "key": "opendoar____::358aee4cc897452c00244351e4d91f69",
+                        "value": "ZENODO"
+                    },
+                    "collectedFrom": {
+                        "key": "openaire____::9e3be59865b2c1c335d32dae2fe7b254",
+                        "value": "Datacite"
+                    }
+                },
+                {
+                    "alternateIdentifiers": [
+                        {
+                            "scheme": "doi",
+                            "value": "10.5281/zenodo.15772619"
+                        }
+                    ],
+                    "license": "CC BY",
+                    "accessRight": {
+                        "code": "c_abf2",
+                        "label": "OPEN",
+                        "scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/",
+                        "openAccessRoute": null
+                    },
+                    "type": "Dataset",
+                    "urls": [
+                        "https://doi.org/10.5281/zenodo.15772619"
+                    ],
+                    "publicationDate": "2025-07-07",
+                    "refereed": "nonPeerReviewed",
+                    "hostedBy": {
+                        "key": "openaire____::55045bd2a65019fd8e6741a755395c8c",
+                        "value": "Unknown Repository"
+                    },
+                    "collectedFrom": {
+                        "key": "openaire____::a8db6f6b2ce4fe72e8b2314a9a93e7d9",
+                        "value": "Sygma"
+                    }
+                }
+            ],
+            "isGreen": null,
+            "isInDiamondJournal": null
+        }
+    ]
+}
\ No newline at end of file
diff --git a/tests/run_tests.py b/tests/run_tests.py
index ed71518..268ce68 100644
--- a/tests/run_tests.py
+++ b/tests/run_tests.py
@@ -226,17 +226,17 @@ def test_get_supported_schemas(self):
         schemas = sorted(get_supported_schemas())
         self.assertIsInstance(schemas, list, 'Expected list')
 
-        self.assertEqual(len(schemas), 12,
+        self.assertEqual(len(schemas), 13,
                          'Expected specific number of supported schemas')
 
         self.assertEqual(sorted(schemas),
-                         sorted(['cwl', 'csvw', 'dcat', 'iso19139',
-                                 'iso19139-2', 'iso19139-hnap', 'oarec-record',
+                         sorted(['cwl', 'csvw', 'dcat', 'iso19139', 'iso19139-2',  # noqa
+                                 'iso19139-hnap', 'oarec-record', 'openaire',
                                  'schema-org', 'stac-item', 'wmo-cmp',
                                  'wmo-wcmp2', 'wmo-wigos']),
                          'Expected exact list of supported schemas')
 
         schemas = get_supported_schemas(include_autodetect=True)
-        self.assertEqual(len(schemas), 13,
+        self.assertEqual(len(schemas), 14,
                          'Expected specific number of supported schemas')
 
         self.assertIn('autodetect', schemas, 'Expected autodetect in list')
@@ -452,6 +452,17 @@ def test_import_metadata(self):
             'WIS/GTS bulletin SMJP01 RJTD in FM12 SYNOP',
             'Expected specific title')
 
+    def test_openaire(self):
+        """test metadata import openaire"""
+
+        with open(get_abspath('openaire.json')) as fh:
+            mcf = import_metadata('openaire', fh.read())
+
+        self.assertEqual(
+            mcf['identification']['title'],
+            'Dataset to: Foundation for an Austrian NIR Soil Spectral Library for Soil Health Assessments',  # noqa
+            'Expected specific title')
+
         with open(get_abspath('md-SMJP01RJTD-gmd.xml')) as fh:
             mcf = import_metadata('autodetect', fh.read())
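Usage note: a minimal sketch of driving the new importer the same way the test
above does. It assumes `import_metadata` is exposed from `pygeometa.core` (the
module path is inferred from the test suite, not stated in this patch) and that
the snippet is run from the repository root so the fixture path resolves:

    from pygeometa.core import import_metadata  # assumed module path

    # openaire.json is an OpenAIRE Graph API response; the importer also
    # accepts a single record without the surrounding "results" array
    with open('tests/openaire.json') as fh:
        mcf = import_metadata('openaire', fh.read())

    print(mcf['metadata']['identifier'])   # 10.5281/zenodo.15772619 (DOI pid)
    print(mcf['identification']['title'])  # 'Dataset to: Foundation for ...'

The resulting MCF dict can then be passed to any of the existing output
schemas, e.g. via `pygeometa metadata transform` as shown in the README.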