From 1fe48e691bc2bbc3d00d290177a20f679f2d39f2 Mon Sep 17 00:00:00 2001
From: Jacco van Ossenbruggen
Date: Thu, 9 Jan 2025 15:11:03 +0000
Subject: [PATCH] Fix literal property values that should be resources

---
 dataverse_utils/config-example.py |  6 +++++
 dataverse_utils/create-graph.py   | 43 +++++++++++++++++--------------
 dataverse_utils/requirements.txt  |  3 +++
 dataverse_utils/utils.py          | 44 +++++++++++++++++++++++++++++++
 4 files changed, 77 insertions(+), 19 deletions(-)
 create mode 100644 dataverse_utils/requirements.txt
 create mode 100644 dataverse_utils/utils.py

diff --git a/dataverse_utils/config-example.py b/dataverse_utils/config-example.py
index 4a21b85..75838e0 100644
--- a/dataverse_utils/config-example.py
+++ b/dataverse_utils/config-example.py
@@ -2,3 +2,9 @@
 PRODUCTION_API_TOKEN = ''
 ROOT = ''
 FUSEKI_PASSWORD = ''
+
+DATAVERSE_URL="https://portal.odissei.nl"
+FUSEKI_URL="https://fuseki.devstack.odissei.nl"
+FUSEKI_COLLECTION="odissei"
+FUSEKI_LOGIN=''
+DEBUG=True
diff --git a/dataverse_utils/create-graph.py b/dataverse_utils/create-graph.py
index 99c0e9a..24cc61f 100644
--- a/dataverse_utils/create-graph.py
+++ b/dataverse_utils/create-graph.py
@@ -1,29 +1,36 @@
+# coding: UTF-8
 import requests
 import xmltodict
 import json
 import pprint
 import re
-from config import FUSEKI_PASSWORD
+from config import DATAVERSE_URL, FUSEKI_URL, FUSEKI_COLLECTION, FUSEKI_LOGIN, FUSEKI_PASSWORD, DEBUG
+from utils import resolve_blank_nodes
+from rdflib import Graph
+from urllib.parse import urlparse
 
-
-url = "https://portal.odissei.nl/sitemap.xml"
-fusekiurl = "https://fuseki.odissei.nl"
-user = 'admin'
-password = FUSEKI_PASSWORD
+url = f"{DATAVERSE_URL}/sitemap.xml"
+parsed_url = urlparse(url)
+bnode_url = f"{parsed_url.scheme}://{parsed_url.netloc}/bnode"
 response = requests.get(url)
-collection = 'odissei'
 doc = xmltodict.parse(response.text)
 pp = pprint.PrettyPrinter(indent=4)
 
-
 def uploadRDF(lasturl):
     response = requests.get(lasturl)
-    print(json.loads(response.text))
-    uploadfusekiurl = "%s/%s/data" % (fusekiurl, collection)
-    resp = requests.post(uploadfusekiurl, data=response.text,
-                         auth=(user, password),
-                         headers={"Content-Type": "application/ld+json"})
-    print(resp.text)
+    print("uploadRDF(%s)"%lasturl)
+    # print(json.loads(response.text))
+    uploadfusekiurl = "%s/%s/data" % (FUSEKI_URL, FUSEKI_COLLECTION)
+    g = Graph()
+    g.parse(data=response.text, format="json-ld")
+    # Resolve blank nodes
+    resolved_graph = resolve_blank_nodes(bnode_url, g)
+    json_ser = resolved_graph.serialize(format="json-ld").encode('utf8')
+    resp = requests.post(uploadfusekiurl, data=json_ser,
+                         auth=(FUSEKI_LOGIN, FUSEKI_PASSWORD),
+                         headers={"Content-Type": "application/ld+json; charset=utf8"})
+    if DEBUG:
+        print(resp.text)
     return
 
 
@@ -32,8 +39,6 @@ def uploadRDF(lasturl):
     if hostitems:
         dvnurl = "%s/api/datasets/export?exporter=OAI_ORE&%s" % (
             hostitems.group(1), hostitems.group(2))
-        print(dvnurl)
-        try:
-            uploadRDF(dvnurl)
-        except:
-            print("Ignore %s" % dvnurl)
+        # print(dvnurl)
+        uploadRDF(dvnurl)
+        # except: print("UploadRDF() failed, Ignore %s" % dvnurl)
diff --git a/dataverse_utils/requirements.txt b/dataverse_utils/requirements.txt
new file mode 100644
index 0000000..bf5ccd7
--- /dev/null
+++ b/dataverse_utils/requirements.txt
@@ -0,0 +1,3 @@
+rdflib
+xmltodict
+requests
diff --git a/dataverse_utils/utils.py b/dataverse_utils/utils.py
new file mode 100644
index 0000000..3003161
--- /dev/null
+++ b/dataverse_utils/utils.py
@@ -0,0 +1,44 @@
+from rdflib import Graph, URIRef, Literal, BNode
+from urllib.parse import quote, urlparse
+
+_URL_PREFIXES = ("http://", "https://")
+
+
+def _as_resource(bnode_url, term):
+    """Return the URI form of a blank node or URL-like literal, else the term itself."""
+    if isinstance(term, BNode):
+        # Derive the URI from the blank node identifier so that every
+        # occurrence of the same node maps to the same URI.
+        return URIRef(f"{bnode_url}/{quote(str(term), safe='')}")
+    if isinstance(term, Literal):
+        text = str(term)
+        # Text containing whitespace is not a usable URL; keep it as a literal.
+        if text.startswith(_URL_PREFIXES) and not any(c.isspace() for c in text):
+            return URIRef(text)
+    return term
+
+
+def resolve_blank_nodes(bnode_url, graph):
+    """
+    Return a copy of an RDF graph in which blank nodes are replaced by URIs.
+
+    Every blank node, in subject or object position, is replaced by the URI
+    "<bnode_url>/<blank node identifier>". Literals whose text starts with
+    "http://" or "https://" and contains no whitespace become URI references.
+    The input graph is not modified.
+
+    Args:
+        bnode_url (str): Base URL to use for generating new URIs for blank nodes
+        graph (rdflib.Graph): Input RDF graph containing blank nodes
+
+    Returns:
+        rdflib.Graph: New graph with resolved blank nodes
+    """
+    resolved_graph = Graph()
+    for subj, pred, obj in graph:
+        resolved_graph.add((
+            _as_resource(bnode_url, subj),
+            _as_resource(bnode_url, pred),
+            _as_resource(bnode_url, obj),
+        ))
+    return resolved_graph