Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions dataverse_utils/config-example.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,9 @@
PRODUCTION_API_TOKEN = ''
ROOT = ''
FUSEKI_PASSWORD = ''

DATAVERSE_URL="https://portal.odissei.nl"
FUSEKI_URL="https://fuseki.devstack.odissei.nl"
FUSEKI_COLLECTION="odissei"
FUSEKI_LOGIN=''
DEBUG=True
43 changes: 24 additions & 19 deletions dataverse_utils/create-graph.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,36 @@
# coding: UTF-8
import requests
import xmltodict
import json
import pprint
import re
from config import FUSEKI_PASSWORD
from config import DATAVERSE_URL, FUSEKI_URL, FUSEKI_COLLECTION, FUSEKI_LOGIN, FUSEKI_PASSWORD, DEBUG
from utils import resolve_blank_nodes
from rdflib import Graph
from urllib.parse import urlparse


url = "https://portal.odissei.nl/sitemap.xml"
fusekiurl = "https://fuseki.odissei.nl"
user = 'admin'
password = FUSEKI_PASSWORD
url = f"{DATAVERSE_URL}/sitemap.xml"
parsed_url = urlparse(url)
bnode_url = f"{parsed_url.scheme}://{parsed_url.netloc}/bnode"
response = requests.get(url)
collection = 'odissei'
doc = xmltodict.parse(response.text)
pp = pprint.PrettyPrinter(indent=4)


def uploadRDF(lasturl):
response = requests.get(lasturl)
print(json.loads(response.text))
uploadfusekiurl = "%s/%s/data" % (fusekiurl, collection)
resp = requests.post(uploadfusekiurl, data=response.text,
auth=(user, password),
headers={"Content-Type": "application/ld+json"})
print(resp.text)
print("uploadRDF(%s)"%lasturl)
# print(json.loads(response.text))
uploadfusekiurl = "%s/%s/data" % (FUSEKI_URL, FUSEKI_COLLECTION)
g = Graph()
g.parse(data=response.text, format="json-ld")
# Resolve blank nodes
resolved_graph = resolve_blank_nodes(bnode_url, g)
json_ser = resolved_graph.serialize(format="json-ld").encode('utf8')
resp = requests.post(uploadfusekiurl, data=json_ser,
auth=(FUSEKI_LOGIN, FUSEKI_PASSWORD),
headers={"Content-Type": "application/ld+json; charset=utf8"})
if DEBUG:
print(resp.text)
return


Expand All @@ -32,8 +39,6 @@ def uploadRDF(lasturl):
if hostitems:
dvnurl = "%s/api/datasets/export?exporter=OAI_ORE&%s" % (
hostitems.group(1), hostitems.group(2))
print(dvnurl)
try:
uploadRDF(dvnurl)
except:
print("Ignore %s" % dvnurl)
# print(dvnurl)
uploadRDF(dvnurl)
# except: print("UploadRDF() failed, Ignore %s" % dvnurl)
3 changes: 3 additions & 0 deletions dataverse_utils/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
rdflib
xmltodict
requests
41 changes: 41 additions & 0 deletions dataverse_utils/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
from rdflib import Graph, URIRef, Literal, BNode
from urllib.parse import urlparse

def resolve_blank_nodes(bnode_url, graph):
    """
    Skolemize an RDF graph: replace every blank node with a stable URI
    under *bnode_url*, and promote literal http(s) URLs to URIRefs.

    Args:
        bnode_url (str): Base URL used to mint URIs for blank nodes
            (no trailing slash expected).
        graph (rdflib.Graph): Input RDF graph, possibly containing
            blank nodes in subject and/or object position.

    Returns:
        rdflib.Graph: A new graph with the same triples, where every
        blank node has been replaced by a deterministic URI and every
        literal that looks like an http(s) URL has been converted to a
        URIRef. The input graph is not modified.
    """

    def _skolemize(node):
        # Use the BNode's internal label (str(node)) so the minted URI is
        # stable across runs. The previous hash(obj) approach was salted
        # per-process by PYTHONHASHSEED, producing different URIs on every
        # run and duplicating data in a persistent store.
        return URIRef(f"{bnode_url}/{node}")

    def _urify(term):
        # The JSON-LD source sometimes carries URLs as plain literals;
        # promote those to URIRefs so they link properly in the store.
        if isinstance(term, Literal) and str(term).startswith(("http://", "https://")):
            return URIRef(str(term))
        return term

    resolved_graph = Graph()
    for subj, pred, obj in graph:
        # Rewrite blank nodes wherever they occur. Handling the subject
        # position too guarantees no BNode survives into the output, and
        # processing triple-by-triple (instead of collecting a dict keyed
        # by predicate) preserves multiple objects for the same predicate.
        if isinstance(subj, BNode):
            subj = _skolemize(subj)
        if isinstance(obj, BNode):
            obj = _skolemize(obj)
        else:
            obj = _urify(obj)
        # Predicates should already be URIRefs, but mirror the original
        # defensive conversion in case the parser yielded a Literal.
        resolved_graph.add((subj, _urify(pred), obj))
    return resolved_graph