Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion pygeometa/schemas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@
'wmo-cmp': 'pygeometa.schemas.wmo_cmp.WMOCMPOutputSchema',
'wmo-wcmp2': 'pygeometa.schemas.wmo_wcmp2.WMOWCMP2OutputSchema',
'wmo-wigos': 'pygeometa.schemas.wmo_wigos.WMOWIGOSOutputSchema',
'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema'
'cwl': 'pygeometa.schemas.cwl.CWLOutputSchema',
'gbif-eml': 'pygeometa.schemas.gbif_eml.GBIF_EMLOutputSchema'
}


Expand Down
198 changes: 198 additions & 0 deletions pygeometa/schemas/gbif_eml/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
import re
from pathlib import Path

import pycountry
from bs4 import BeautifulSoup
from pygeometa.schemas.base import BaseOutputSchema

THISDIR = Path(__file__).parent


def text_or_null(node, strip=False):
    """Return the text of ``node``, or ``None`` when ``node`` is falsy.

    :param node: object exposing a ``text`` attribute (e.g. a parsed XML
                 element), or a falsy value such as ``None``
    :param strip: when true, surrounding whitespace is removed from the text

    :returns: ``node.text`` (optionally stripped) or ``None``
    """
    if node:
        content = node.text
        return content.strip() if strip else content

    return None


def text_or_empty(node, strip=False):
    """Return the text of ``node``, or an empty string when ``node`` is falsy.

    :param node: object exposing a ``text`` attribute (e.g. a parsed XML
                 element), or a falsy value such as ``None``
    :param strip: when true, surrounding whitespace is removed from the text

    :returns: ``node.text`` (optionally stripped) or ``""``
    """
    if node:
        content = node.text
        if strip:
            content = content.strip()
        return content

    return ""


def scrub_dict(d):
    """Recursively drop ``None`` values from a nested structure.

    Dictionaries are rebuilt without the keys whose value is ``None``;
    lists are rebuilt with each element scrubbed, so dictionaries nested
    inside lists are cleaned as well. List elements themselves are never
    removed, which keeps positions stable. Any other value is returned
    unchanged. The input is not modified.

    Each value is visited exactly once, so the cost is linear in the size
    of the structure.

    :param d: dictionary, list or scalar to clean

    :returns: a cleaned copy for dictionaries (always a plain ``dict``,
              also for ``dict`` subclasses) and lists, otherwise ``d``
    """
    if isinstance(d, dict):
        return {
            key: scrub_dict(value)
            for key, value in d.items()
            if value is not None
        }

    if isinstance(d, list):
        return [scrub_dict(item) for item in d]

    return d


def to_contact_role(node, role, mapped_role=None):
    """Yield MCF contact entries for every ``role`` element below ``node``.

    The first match is keyed with the bare role name, later matches get a
    ``_<index>`` suffix (``creator``, ``creator_1``, ...), so several
    parties with the same role can share one contact dictionary.

    :param node: parsed element searched with ``find_all``
    :param role: name of the EML element to look for (e.g. ``creator``)
    :param mapped_role: key prefix used in the result; defaults to ``role``

    :returns: generator of ``(key, contact_dict)`` tuples
    """
    mapped_role = mapped_role or role

    for idx, contact in enumerate(node.find_all(role)):
        name_parts = (
            text_or_empty(contact.find("surName")),
            text_or_empty(contact.find("givenName")),
        )
        # Join only the parts that are present: an organisation-only
        # contact gets "" rather than ", ", and a lone surname gets no
        # trailing separator.
        name = ", ".join(part for part in name_parts if part)

        key = f"{mapped_role}_{idx}" if idx else mapped_role
        yield (
            key,
            {
                "organization": text_or_empty(
                    contact.find("organizationName")
                ),
                "individualname": name,
                # Fall back to the <role> element when no position is given.
                "positionname": text_or_empty(contact.find("positionName"))
                or text_or_empty(contact.find("role")),
                "phone": "",
                "url": "",
                "fax": "",
                "address": "",
                "city": "",
                "administrativearea": "",
                "postalcode": "",
                "country": text_or_empty(contact.find("country")),
                "email": text_or_empty(contact.find("electronicMailAddress")),
            },
        )


class GBIF_EMLOutputSchema(BaseOutputSchema):
    """Output schema for EML documents following the GBIF metadata profile.

    Besides registering the schema, the class implements ``import_``,
    which converts an EML document into an MCF dictionary.
    """

    # EML element name -> MCF contact key; ``None`` keeps the EML name.
    _CONTACT_ROLES = (
        ("contact", "pointOfContact"),
        ("metadataProvider", "distributor"),
        ("creator", None),
        ("personnel", "projectPersonnel"),
    )

    # First http(s) URL embedded in a <keywordThesaurus> string.
    _URL_PATTERN = re.compile(r"(?P<url>https?://[^\s]+)")

    # Order of the MCF bbox: [west, south, east, north].
    _BBOX_SIDES = ("west", "south", "east", "north")

    def __init__(self):
        """Initialise the base schema as ``gbif-eml`` (type ``xml``),
        passing this package's directory (``THISDIR``)."""
        super().__init__("gbif-eml", "EML - GBIF profile", "xml", THISDIR)

    @classmethod
    def _parse_bbox(cls, coords):
        """Return ``[west, south, east, north]`` as floats.

        :param coords: parsed ``<boundingCoordinates>`` element

        :returns: list of four floats, or ``None`` when any of the four
                  coordinate elements is missing or empty
        :raises ValueError: if a coordinate is present but not numeric
        """
        bbox = []
        for side in cls._BBOX_SIDES:
            value = text_or_null(
                coords.find(f"{side}BoundingCoordinate"), strip=True
            )
            if not value:
                return None
            bbox.append(float(value))
        return bbox

    def import_(self, metadata):
        """Build an MCF dictionary from a GBIF EML document.

        :param metadata: EML document content, handed to BeautifulSoup's
                         ``lxml-xml`` parser

        :returns: `dict` of MCF content with ``None`` values removed by
                  ``scrub_dict``
        :raises ValueError: if the document has no ``<dataset>`` element,
                            or a bounding coordinate is not numeric
        """
        soup = BeautifulSoup(metadata, features="lxml-xml")
        dataset = soup.find("dataset")
        if dataset is None:
            raise ValueError("EML document has no <dataset> element")

        # Stripped because templates may pad the date with newlines.
        pub_date = text_or_null(dataset.find("pubDate"), strip=True)

        mcf = {
            "mcf": {
                "version": 1,
            },
            "metadata": {
                "charset": "utf8",
                "hierarchylevel": "dataset",
                "datestamp": pub_date or "$date$",
            },
            "identification": {},
            "contact": {},
            "distribution": {},
        }

        # With several <alternateIdentifier> elements the last one becomes
        # the metadata identifier; the first one is used as the URL below.
        for identifier in dataset.find_all("alternateIdentifier"):
            mcf["metadata"]["identifier"] = text_or_null(identifier)

        lang_code = text_or_null(dataset.find("language"), strip=True)
        if lang_code:
            language = pycountry.languages.get(alpha_3=lang_code)
            # Not every ISO 639-3 language has a two-letter code, in which
            # case the attribute does not exist on the pycountry record.
            alpha_2 = getattr(language, "alpha_2", None)
            if alpha_2:
                mcf["metadata"]["language"] = alpha_2

        idf = mcf["identification"]

        idf["title"] = text_or_null(dataset.find("title"))
        idf["abstract"] = text_or_null(dataset.find("abstract"))

        rights = dataset.find("intellectualRights")
        if rights is not None:
            link = rights.find("ulink")
            idf["rights"] = {
                "name": text_or_null(rights.find("citetitle")),
                # ``get`` tolerates a <ulink> without a url attribute.
                "url": link.get("url") if link is not None else None,
            }

        idf["url"] = text_or_null(dataset.find("alternateIdentifier"))
        idf["status"] = "completed"

        # TODO: map <maintenance>/<description> once MCF has a place for it.
        idf["maintenancefrequency"] = (
            text_or_null(dataset.find("maintenanceUpdateFrequency"))
            or "unknown"
        )

        idf["dates"] = {"publication": pub_date}
        idf["extents"] = {}

        coords = dataset.find("boundingCoordinates")
        bbox = self._parse_bbox(coords) if coords is not None else None
        if bbox is not None:
            spatial = {"bbox": bbox, "crs": 4326}
            description = text_or_null(dataset.find("geographicDescription"))
            # Added only when present, so no ``None`` ends up inside the
            # list of spatial extents.
            if description is not None:
                spatial["description"] = description
            idf["extents"]["spatial"] = [spatial]

        # TODO: map <temporalCoverage> to extents.temporal
        # (begin / end / resolution).

        idf["keywords"] = {}

        contacts = mcf["contact"]
        for role, mapped_role in self._CONTACT_ROLES:
            contacts.update(to_contact_role(dataset, role, mapped_role))

        for idx, keyword_set in enumerate(dataset.find_all("keywordSet")):
            thesaurus = text_or_null(keyword_set.find("keywordThesaurus"))
            # A keyword set may come without a thesaurus at all.
            match = self._URL_PATTERN.search(thesaurus) if thesaurus else None

            idf["keywords"][f"default-{idx}"] = {
                "keywords": [
                    text_or_null(kw) for kw in keyword_set.find_all("keyword")
                ],
                "vocabulary": {
                    "name": thesaurus,
                    "url": match.group("url") if match else None,
                },
            }

        mcf["spatial"] = {"datatype": "vector", "geomtype": "composite"}

        mcf["distribution"] = {
            "file": {
                "url": idf["url"],
                "type": "WWW:LINK",
                "function": "information",
                "description": "",
                "name": "Darwin Core Archive",
            }
        }

        return scrub_dict(mcf)
150 changes: 150 additions & 0 deletions pygeometa/schemas/gbif_eml/main.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
{#- GBIF EML profile output template.
    Renders the MCF content available as `record` into an EML 2.1.1 document.
    Contact, methods, project and additionalMetadata sections are still
    commented out below. -#}
<eml:eml xmlns:eml="eml://ecoinformatics.org/eml-2.1.1" xmlns:dc="http://purl.org/dc/terms/"
  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  xsi:schemaLocation="eml://ecoinformatics.org/eml-2.1.1 http://rs.gbif.org/schema/eml-gbif-profile/1.1/eml.xsd"
  packageId="{{ record['metadata']['dataseturi'] }}" system="http://gbif.org"
  scope="system" xml:lang="{{ record['identification']['language'] }}">

  <dataset>
    <alternateIdentifier>{{ record['identification']['doi' ]}}</alternateIdentifier>
    <title xml:lang="{{ record['identification']['language'] }}">{{ record['identification']['title'] }}</title>
    {#
    <!--creator>
      {% include 'person.j2' %}
    </creator>
    <creator>
      {% include 'person.j2' %}
    </creator>
    <metadataProvider>
      {% include 'person.j2' %}
    </metadataProvider-->
    #}
    {# Kept on one line so the date carries no surrounding whitespace. #}
    <pubDate>{{ record['identification']['dates']['publication'] }}</pubDate>
    <language>{{ record['identification']['language'] }}</language>
    <abstract>
      <para>{{ record['identification']['abstract'] }}</para>
    </abstract>
    {% for group, keywords in record['identification']['keywords'].items() %}
    <keywordSet>
      {% for kw in keywords['keywords'] %}
      <keyword>{{ kw }}</keyword>
      {% endfor %}
      <keywordThesaurus>{{ keywords['vocabulary']['name'] }}: {{ keywords['vocabulary']['url'] }}</keywordThesaurus>
    </keywordSet>
    {% endfor%}
    <intellectualRights>
      <para>This work is licensed under a <ulink url="{{ record['identification']['rights']['url'] }}">
          <citetitle>{{ record['identification']['rights']['name'] }}</citetitle>
        </ulink>.</para>
    </intellectualRights>
    <distribution scope="document">
      <online>
        {% for key, value in record['distribution'].items() %}
        <url function="{{ value['function'] }}">{{ value['url' ]}}</url>
        {% endfor %}
      </online>
    </distribution>
    {% set extents = record['identification']['extents'] %}
    {# bbox is ordered [west, south, east, north]; EML lists west, east, north, south. #}
    {% set bbox = extents['spatial'][0]['bbox'] %}
    <coverage>
      <geographicCoverage>
        <geographicDescription>{{ extents['spatial'][0]['description'] }}</geographicDescription>
        <boundingCoordinates>
          <westBoundingCoordinate>{{ bbox[0] }}</westBoundingCoordinate>
          <eastBoundingCoordinate>{{ bbox[2] }}</eastBoundingCoordinate>
          <northBoundingCoordinate>{{ bbox[3] }}</northBoundingCoordinate>
          <southBoundingCoordinate>{{ bbox[1] }}</southBoundingCoordinate>
        </boundingCoordinates>
      </geographicCoverage>
      {% if 'temporal' in extents %}
      <temporalCoverage>
        <rangeOfDates>
          <beginDate>
            <calendarDate>{{ extents['temporal'][0]['begin'] }}</calendarDate>
          </beginDate>
          {% if extents['temporal'][0]['end'] %}
          <endDate>
            <calendarDate>{{ extents['temporal'][0]['end'] }}</calendarDate>
          </endDate>
          {% endif %}
        </rangeOfDates>
      </temporalCoverage>
      {% endif %}
      <taxonomicCoverage>
        <generalTaxonomicCoverage>
        </generalTaxonomicCoverage>
        <taxonomicClassification>
          <taxonRankName></taxonRankName>
          <taxonRankValue></taxonRankValue>
          <commonName></commonName>
        </taxonomicClassification>
      </taxonomicCoverage>
    </coverage>
    <maintenance>
      <description>
        <para />
      </description>
      <maintenanceUpdateFrequency></maintenanceUpdateFrequency>
    </maintenance>

    {#
    <contact>
      {% include 'person.j2' %}
    </contact>
    <methods>
      <methodStep>
        <description>
          <para></para>
        </description>
      </methodStep>
      <sampling>
        <studyExtent>
          <description>
            <para>
            </para>
          </description>
        </studyExtent>
        <samplingDescription>
          <para></para>
        </samplingDescription>
      </sampling>
      <qualityControl>
        <description>
          <para></para>
        </description>
      </qualityControl>
    </methods>
    <project>
      <title></title>
      <personnel>
        {% include 'person.j2' %}
        <role />
      </personnel>
      <abstract>
        <para></para>
      </abstract>
      <funding>
        <para>Artsdatabanken</para>
      </funding>
      <studyAreaDescription>
        <descriptor name="generic" citableClassificationSystem="false">
          <descriptorValue></descriptorValue>
        </descriptor>
      </studyAreaDescription>
    </project>
    #}
  </dataset>
  {#
  <additionalMetadata>
    <metadata>
      <gbif>
        <dateStamp></dateStamp>
        <hierarchyLevel>dataset</hierarchyLevel>
        <citation></citation>
        <resourceLogoUrl></resourceLogoUrl>
      </gbif>
    </metadata>
  </additionalMetadata>
  #}
</eml:eml>
11 changes: 11 additions & 0 deletions pygeometa/schemas/gbif_eml/person.j2
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
{#- Partial describing one responsible party (person and/or organisation).
    Reads these template variables: first_name, last_name, org_name,
    position, country, email and, optionally, orcid.
    NOTE(review): in the visible main.j2 every include of this partial sits
    inside a Jinja comment, and nothing visible sets these variables;
    confirm how they are meant to be supplied before enabling it. -#}
<individualName>
<givenName>{{ first_name }}</givenName>
<surName>{{ last_name }}</surName>
</individualName>
<organizationName>{{ org_name }}</organizationName>
<positionName>{{ position }}</positionName>
<address>
<country>{{ country }}</country>
</address>
<electronicMailAddress>{{ email }}</electronicMailAddress>
{% if  orcid %}<userId directory="http://orcid.org/">{{ orcid }}</userId>{% endif %}
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ jsonschema
lxml
OWSLib
pyyaml
beautifulsoup4
pycountry
Loading