diff --git a/docs/rdf_file_loader_agent.md b/docs/rdf_file_loader_agent.md
new file mode 100644
index 00000000..02a673b3
--- /dev/null
+++ b/docs/rdf_file_loader_agent.md
@@ -0,0 +1,136 @@
+# RDF File Loader Agent
+
+## Overview
+
+The RDF File Loader agent automatically loads RDF files into the Whyis knowledge graph as nanopublications. It monitors resources typed as `whyis:RDFFile` and loads their content.
+
+## Features
+
+- **Multiple Source Support:**
+ - Local files from the file depot (via `whyis:hasFileID`)
+ - Remote HTTP/HTTPS URLs
+ - S3 URIs (requires boto3 to be installed)
+
+- **Format Detection:**
+ - Automatic format detection from file extensions and content types
+ - Supports: Turtle (.ttl), RDF/XML (.rdf, .owl), JSON-LD (.jsonld), N-Triples (.nt), N3 (.n3), TriG (.trig), N-Quads (.nq)
+
+- **Provenance Tracking:**
+ - Resources are marked with `whyis:RDFFile` type before processing
+ - After loading, marked as `whyis:LoadedRDFFile`
+ - Activities are tracked as `whyis:RDFFileLoadingActivity`
+ - Proper nanopublication structure with provenance
+
+## Usage
+
+### 1. Add the agent to your configuration
+
+In your application's config file:
+
+```python
+from whyis import autonomic
+
+class Config:
+ INFERENCERS = {
+ 'RDFFileLoader': autonomic.RDFFileLoader(),
+ # ... other agents
+ }
+```
+
+### 2. Mark resources as RDF files
+
+Create a nanopublication that types a resource as `whyis:RDFFile`:
+
+```turtle
+@prefix whyis: .
+@prefix rdf: .
+
+ a whyis:RDFFile .
+```
+
+### 3. Loading from different sources
+
+#### Local File Depot
+
+For files already uploaded to the file depot:
+
+```turtle
+ a whyis:RDFFile ;
+ whyis:hasFileID "file_depot_id_here" .
+```
+
+#### HTTP/HTTPS URL
+
+Simply use the URL as the resource URI:
+
+```turtle
+ a whyis:RDFFile .
+```
+
+or
+
+```turtle
+ a whyis:RDFFile .
+```
+
+#### S3 URI
+
+For files stored in S3 (requires boto3):
+
+```turtle
+ a whyis:RDFFile .
+```
+
+**Note:** Ensure boto3 is installed and AWS credentials are configured:
+```bash
+pip install boto3
+```
+
+AWS credentials can be configured via:
+- Environment variables (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY)
+- AWS credentials file (~/.aws/credentials)
+- IAM role (when running on EC2)
+
+## How It Works
+
+1. The agent queries for resources typed as `whyis:RDFFile` that are not yet `whyis:LoadedRDFFile`
+2. For each resource:
+ - Checks if it has a `whyis:hasFileID` (file depot)
+ - Otherwise, examines the URI scheme (http://, https://, s3://)
+ - Downloads and parses the RDF content
+ - Adds the loaded triples to a nanopublication
+ - Marks the resource as `whyis:LoadedRDFFile`
+3. The nanopublication includes provenance linking back to the source file
+
+## Retirement
+
+When a resource is no longer typed as `whyis:RDFFile`, the agent's update mechanism will retire the associated nanopublications containing the loaded data.
+
+## Testing
+
+The agent includes 26 comprehensive unit tests covering:
+- Basic functionality
+- Format detection
+- HTTP/HTTPS loading
+- S3 loading (with and without boto3)
+- File depot access
+- Error handling
+
+Run tests with:
+```bash
+pytest tests/unit/test_rdf_file_loader*.py
+```
+
+## Error Handling
+
+- **Missing boto3:** Gracefully fails with a clear error message when trying to load from S3
+- **Invalid RDF:** Logs errors when content cannot be parsed
+- **Network errors:** Propagates HTTP errors with proper logging
+- **Missing files:** Reports file depot access errors
+
+## Example Use Cases
+
+1. **Bulk Data Import:** Mark multiple HTTP URLs as RDFFile to automatically import external datasets
+2. **S3 Data Pipeline:** Load RDF files from S3 buckets as part of a data processing pipeline
+3. **File Upload Processing:** When users upload RDF files, mark them as RDFFile for automatic processing
+4. **Ontology Loading:** Automatically load and update ontologies from remote URLs
diff --git a/tests/unit/test_rdf_file_loader_basic.py b/tests/unit/test_rdf_file_loader_basic.py
new file mode 100644
index 00000000..11d96392
--- /dev/null
+++ b/tests/unit/test_rdf_file_loader_basic.py
@@ -0,0 +1,160 @@
+"""
+Simple unit tests for RDFFileLoader agent that don't require full app context.
+
+Tests basic functionality like format guessing and URI parsing.
+"""
+
+import pytest
+from unittest.mock import Mock, patch
+from rdflib import URIRef
+
+from whyis.autonomic.rdf_file_loader import RDFFileLoader
+from whyis.namespace import whyis
+
+
+class TestRDFFileLoaderBasic:
+ """Basic tests for RDFFileLoader that don't require app context."""
+
+ def test_agent_initialization(self):
+ """Test that RDFFileLoader agent can be initialized."""
+ agent = RDFFileLoader()
+ assert agent is not None
+ assert hasattr(agent, 'activity_class')
+ assert agent.activity_class == whyis.RDFFileLoadingActivity
+
+ def test_agent_input_class(self):
+ """Test that RDFFileLoader returns correct input class."""
+ agent = RDFFileLoader()
+ input_class = agent.getInputClass()
+ assert input_class == whyis.RDFFile
+
+ def test_agent_output_class(self):
+ """Test that RDFFileLoader returns correct output class."""
+ agent = RDFFileLoader()
+ output_class = agent.getOutputClass()
+ assert output_class == whyis.LoadedRDFFile
+
+ def test_agent_has_query(self):
+ """Test that RDFFileLoader has get_query method."""
+ agent = RDFFileLoader()
+ assert hasattr(agent, 'get_query')
+ assert callable(agent.get_query)
+ query = agent.get_query()
+ assert 'RDFFile' in query
+ assert 'LoadedRDFFile' in query
+
+ def test_format_guessing_turtle(self):
+ """Test RDF format guessing for Turtle files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.ttl', None) == 'turtle'
+ assert agent._guess_format('test.turtle', None) == 'turtle'
+
+ # Test by content type
+ assert agent._guess_format(None, 'text/turtle') == 'turtle'
+ assert agent._guess_format('file.dat', 'text/turtle') == 'turtle'
+
+ def test_format_guessing_rdfxml(self):
+ """Test RDF format guessing for RDF/XML files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.rdf', None) == 'xml'
+ assert agent._guess_format('test.owl', None) == 'xml'
+ assert agent._guess_format('test.xml', None) == 'xml'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/rdf+xml') == 'xml'
+
+ def test_format_guessing_jsonld(self):
+ """Test RDF format guessing for JSON-LD files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.jsonld', None) == 'json-ld'
+ assert agent._guess_format('test.json-ld', None) == 'json-ld'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/ld+json') == 'json-ld'
+
+ def test_format_guessing_ntriples(self):
+ """Test RDF format guessing for N-Triples files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.nt', None) == 'nt'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/n-triples') == 'nt'
+
+ def test_format_guessing_n3(self):
+ """Test RDF format guessing for N3 files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.n3', None) == 'n3'
+
+ # Test by content type
+ assert agent._guess_format(None, 'text/n3') == 'n3'
+
+ def test_format_guessing_trig(self):
+ """Test RDF format guessing for TriG files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.trig', None) == 'trig'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/trig') == 'trig'
+
+ def test_format_guessing_nquads(self):
+ """Test RDF format guessing for N-Quads files."""
+ agent = RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.nq', None) == 'nquads'
+
+ def test_format_guessing_default(self):
+ """Test that format guessing defaults to turtle."""
+ agent = RDFFileLoader()
+
+ # No filename or content type
+ assert agent._guess_format(None, None) == 'turtle'
+
+ # Unknown extension
+ assert agent._guess_format('test.unknown', None) == 'turtle'
+
+ # Unknown content type
+ assert agent._guess_format(None, 'application/unknown') == 'turtle'
+
+ def test_load_from_s3_without_boto3(self):
+ """Test that loading from S3 fails gracefully when boto3 is not installed."""
+ agent = RDFFileLoader()
+
+ # Mock boto3 import to fail by patching it in the function
+ with patch.dict('sys.modules', {'boto3': None}):
+ with pytest.raises(ImportError) as exc_info:
+ agent._load_from_s3('s3://bucket/key.ttl')
+
+ assert 'boto3' in str(exc_info.value).lower()
+
+ def test_load_from_s3_invalid_uri(self):
+ """Test that invalid S3 URIs are rejected."""
+ agent = RDFFileLoader()
+
+ # Mock boto3 module
+ mock_boto3_module = Mock()
+ mock_s3_client = Mock()
+ mock_boto3_module.client.return_value = mock_s3_client
+
+ with patch.dict('sys.modules', {'boto3': mock_boto3_module}):
+ # Invalid URI (no bucket/key)
+ with pytest.raises(ValueError) as exc_info:
+ agent._load_from_s3('s3://bucket-only')
+ assert 'Invalid S3 URI' in str(exc_info.value)
+
+ # Invalid URI (not s3://)
+ with pytest.raises(ValueError) as exc_info:
+ agent._load_from_s3('http://not-s3.com/file.ttl')
+ assert 'Invalid S3 URI' in str(exc_info.value)
diff --git a/tests/unit/test_rdf_file_loader_integration.py b/tests/unit/test_rdf_file_loader_integration.py
new file mode 100644
index 00000000..919149b9
--- /dev/null
+++ b/tests/unit/test_rdf_file_loader_integration.py
@@ -0,0 +1,307 @@
+"""
+Integration tests for RDFFileLoader agent with mocked HTTP, S3, and file depot.
+
+These tests use mocks to simulate HTTP requests, S3 access, and file depot operations
+without requiring external dependencies or a full app context.
+"""
+
+import pytest
+from unittest.mock import Mock, patch, MagicMock
+from io import BytesIO
+from rdflib import Graph, URIRef, RDF
+
+from whyis.autonomic.rdf_file_loader import RDFFileLoader
+
+
+# Test RDF data in Turtle format
+test_rdf_turtle = """
+@prefix rdf: .
+@prefix rdfs: .
+@prefix ex: .
+
+ex:subject1 a ex:Class1 ;
+ rdfs:label "Test Subject 1" ;
+ ex:property "Test Value" .
+
+ex:subject2 a ex:Class2 ;
+ rdfs:label "Test Subject 2" ;
+ ex:relatedTo ex:subject1 .
+"""
+
+# Test RDF data in RDF/XML format
+test_rdf_xml = """
+
+
+ Test Subject 1
+ Test Value
+
+
+"""
+
+
+class TestRDFFileLoaderHTTP:
+ """Tests for loading RDF files via HTTP/HTTPS."""
+
+ def test_load_from_http_turtle(self):
+ """Test loading RDF from HTTP URL with Turtle format."""
+ agent = RDFFileLoader()
+
+ # Mock requests.get
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_turtle
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ graph = agent._load_from_http('http://example.com/data.ttl')
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ assert (URIRef('http://example.com/subject1'),
+ RDF.type,
+ URIRef('http://example.com/Class1')) in graph
+
+ def test_load_from_https_rdfxml(self):
+ """Test loading RDF from HTTPS URL with RDF/XML format."""
+ agent = RDFFileLoader()
+
+ # Mock requests.get
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_xml
+ mock_response.headers = {'content-type': 'application/rdf+xml'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ graph = agent._load_from_http('https://example.com/data.rdf')
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ # Check that at least one triple was loaded
+ assert len(list(graph.triples((None, None, None)))) > 0
+
+ def test_load_from_http_with_content_negotiation(self):
+ """Test that HTTP requests include proper Accept headers."""
+ agent = RDFFileLoader()
+
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_turtle
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response) as mock_get:
+ graph = agent._load_from_http('http://example.com/data')
+
+ # Verify that requests.get was called with Accept headers
+ mock_get.assert_called_once()
+ call_args = mock_get.call_args
+ assert 'headers' in call_args[1]
+ assert 'Accept' in call_args[1]['headers']
+
+ def test_load_from_http_error_handling(self):
+ """Test error handling for HTTP failures."""
+ agent = RDFFileLoader()
+
+ # Mock a failed HTTP request
+ mock_response = Mock()
+ mock_response.status_code = 404
+ mock_response.raise_for_status.side_effect = Exception("404 Not Found")
+
+ with patch('requests.get', return_value=mock_response):
+ with pytest.raises(Exception):
+ agent._load_from_http('http://example.com/nonexistent.ttl')
+
+
+class TestRDFFileLoaderS3:
+ """Tests for loading RDF files from S3."""
+
+ def test_load_from_s3_success(self):
+ """Test successful loading from S3."""
+ agent = RDFFileLoader()
+
+ # Create mock boto3 module and client
+ mock_s3_client = Mock()
+ mock_boto3_module = Mock()
+ mock_boto3_module.client.return_value = mock_s3_client
+
+ # Mock file download - write directly to the file path
+ call_count = {'count': 0}
+ def mock_download_file(bucket, key, filename):
+ call_count['count'] += 1
+ with open(filename, 'w') as f:
+ f.write(test_rdf_turtle)
+
+ mock_s3_client.download_file = mock_download_file
+
+ with patch.dict('sys.modules', {'boto3': mock_boto3_module}):
+ graph = agent._load_from_s3('s3://test-bucket/data.ttl')
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ assert (URIRef('http://example.com/subject1'),
+ RDF.type,
+ URIRef('http://example.com/Class1')) in graph
+
+ # Verify boto3 was called correctly
+ mock_boto3_module.client.assert_called_once_with('s3')
+ assert call_count['count'] == 1
+
+ def test_load_from_s3_uri_parsing(self):
+ """Test that S3 URIs are correctly parsed."""
+ agent = RDFFileLoader()
+
+ mock_s3_client = Mock()
+ mock_boto3_module = Mock()
+ mock_boto3_module.client.return_value = mock_s3_client
+
+ def mock_download_file(bucket, key, filename):
+ # Verify bucket and key are parsed correctly
+ assert bucket == 'my-bucket'
+ assert key == 'path/to/file.ttl'
+ with open(filename, 'w') as f:
+ f.write(test_rdf_turtle)
+
+ mock_s3_client.download_file = mock_download_file
+
+ with patch.dict('sys.modules', {'boto3': mock_boto3_module}):
+ graph = agent._load_from_s3('s3://my-bucket/path/to/file.ttl')
+ assert graph is not None
+
+ def test_load_from_s3_with_format_detection(self):
+ """Test that format is detected from S3 key extension."""
+ agent = RDFFileLoader()
+
+ mock_s3_client = Mock()
+ mock_boto3_module = Mock()
+ mock_boto3_module.client.return_value = mock_s3_client
+
+ def mock_download_file(bucket, key, filename):
+ with open(filename, 'w') as f:
+ f.write(test_rdf_xml)
+
+ mock_s3_client.download_file = mock_download_file
+
+ with patch.dict('sys.modules', {'boto3': mock_boto3_module}):
+ # Test with .rdf extension
+ graph = agent._load_from_s3('s3://bucket/file.rdf')
+ assert graph is not None
+ assert len(graph) > 0
+
+
+class TestRDFFileLoaderFileDepot:
+ """Tests for loading RDF files from local file depot."""
+
+ def test_load_from_file_depot_turtle(self):
+ """Test loading RDF from file depot with Turtle format."""
+ agent = RDFFileLoader()
+
+ # Create a mock stored file
+ mock_stored_file = Mock()
+ mock_stored_file.name = 'test.ttl'
+ mock_stored_file.content_type = 'text/turtle'
+ mock_stored_file.read.return_value = test_rdf_turtle.encode('utf-8')
+ mock_stored_file.__enter__ = Mock(return_value=mock_stored_file)
+ mock_stored_file.__exit__ = Mock(return_value=None)
+
+ # Mock flask.current_app.file_depot
+ mock_app = Mock()
+ mock_app.file_depot.get.return_value = mock_stored_file
+
+ with patch('flask.current_app', mock_app):
+ graph = agent._load_from_file_depot(
+ URIRef('http://example.com/file1'),
+ 'test_fileid'
+ )
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ assert (URIRef('http://example.com/subject1'),
+ RDF.type,
+ URIRef('http://example.com/Class1')) in graph
+
+ def test_load_from_file_depot_format_detection(self):
+ """Test format detection from file depot content type."""
+ agent = RDFFileLoader()
+
+ # Create a mock stored file with XML content
+ mock_stored_file = Mock()
+ mock_stored_file.name = 'test.dat' # Ambiguous extension
+ mock_stored_file.content_type = 'application/rdf+xml' # Clear content type
+ mock_stored_file.read.return_value = test_rdf_xml.encode('utf-8')
+ mock_stored_file.__enter__ = Mock(return_value=mock_stored_file)
+ mock_stored_file.__exit__ = Mock(return_value=None)
+
+ mock_app = Mock()
+ mock_app.file_depot.get.return_value = mock_stored_file
+
+ with patch('flask.current_app', mock_app):
+ graph = agent._load_from_file_depot(
+ URIRef('http://example.com/file2'),
+ 'test_fileid_2'
+ )
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+
+ def test_load_from_file_depot_error_handling(self):
+ """Test error handling when file depot access fails."""
+ agent = RDFFileLoader()
+
+ # Mock file depot to raise an error
+ mock_app = Mock()
+ mock_app.file_depot.get.side_effect = Exception("File not found in depot")
+
+ with patch('flask.current_app', mock_app):
+ with pytest.raises(Exception):
+ agent._load_from_file_depot(
+ URIRef('http://example.com/file3'),
+ 'nonexistent_fileid'
+ )
+
+
+class TestRDFFileLoaderErrorHandling:
+ """Tests for error handling in RDF file loading."""
+
+ def test_invalid_rdf_content(self):
+ """Test handling of invalid RDF content."""
+ agent = RDFFileLoader()
+
+ # Mock HTTP response with invalid RDF
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = "This is not valid RDF content"
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ with pytest.raises(Exception):
+ # Should fail to parse invalid RDF
+ agent._load_from_http('http://example.com/invalid.ttl')
+
+ def test_empty_graph(self):
+ """Test handling of empty RDF files."""
+ agent = RDFFileLoader()
+
+ # Mock HTTP response with empty but valid RDF
+ empty_rdf = "@prefix ex: ."
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = empty_rdf
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ graph = agent._load_from_http('http://example.com/empty.ttl')
+
+ # Should succeed but return empty graph
+ assert graph is not None
+ assert len(graph) == 0
diff --git a/tests/unit/whyis_test/autonomic/test_rdf_file_loader.py b/tests/unit/whyis_test/autonomic/test_rdf_file_loader.py
new file mode 100644
index 00000000..9d08f19f
--- /dev/null
+++ b/tests/unit/whyis_test/autonomic/test_rdf_file_loader.py
@@ -0,0 +1,463 @@
+"""
+Unit tests for RDFFileLoader agent.
+
+Tests the RDF file loading functionality including:
+- Local file depot access
+- HTTP/HTTPS remote file loading
+- S3 file loading with boto3
+- Error handling and graceful degradation
+"""
+
+import os
+import pytest
+from unittest.mock import Mock, patch, MagicMock, mock_open
+from io import BytesIO
+from rdflib import Graph, Namespace, Literal, URIRef, RDF
+
+from whyis import nanopub
+from whyis import autonomic
+from whyis.namespace import NS, whyis
+from whyis.test.agent_unit_test_case import AgentUnitTestCase
+
+
+# Test RDF data in Turtle format
+test_rdf_turtle = """
+@prefix rdf: .
+@prefix rdfs: .
+@prefix ex: .
+
+ex:subject1 a ex:Class1 ;
+ rdfs:label "Test Subject 1" ;
+ ex:property "Test Value" .
+
+ex:subject2 a ex:Class2 ;
+ rdfs:label "Test Subject 2" ;
+ ex:relatedTo ex:subject1 .
+"""
+
+# Test RDF data in RDF/XML format
+test_rdf_xml = """
+
+
+ Test Subject 1
+ Test Value
+
+
+ Test Subject 2
+
+
+
+"""
+
+
+class RDFFileLoaderTestCase(AgentUnitTestCase):
+ """Test the RDFFileLoader agent functionality."""
+
+ def test_agent_initialization(self):
+ """Test that RDFFileLoader agent can be initialized."""
+ agent = autonomic.RDFFileLoader()
+ assert agent is not None
+ assert hasattr(agent, 'activity_class')
+ assert agent.activity_class == whyis.RDFFileLoadingActivity
+
+ def test_agent_has_query(self):
+ """Test that RDFFileLoader has get_query method."""
+ agent = autonomic.RDFFileLoader()
+ assert hasattr(agent, 'get_query')
+ assert callable(agent.get_query)
+ query = agent.get_query()
+ assert 'RDFFile' in query
+ assert 'LoadedRDFFile' in query
+
+ def test_agent_input_class(self):
+ """Test that RDFFileLoader returns correct input class."""
+ agent = autonomic.RDFFileLoader()
+ input_class = agent.getInputClass()
+ assert input_class == whyis.RDFFile
+
+ def test_agent_output_class(self):
+ """Test that RDFFileLoader returns correct output class."""
+ agent = autonomic.RDFFileLoader()
+ output_class = agent.getOutputClass()
+ assert output_class == whyis.LoadedRDFFile
+
+ def test_format_guessing_turtle(self):
+ """Test RDF format guessing for Turtle files."""
+ agent = autonomic.RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.ttl', None) == 'turtle'
+ assert agent._guess_format('test.turtle', None) == 'turtle'
+
+ # Test by content type
+ assert agent._guess_format(None, 'text/turtle') == 'turtle'
+ assert agent._guess_format('file.dat', 'text/turtle') == 'turtle'
+
+ def test_format_guessing_rdfxml(self):
+ """Test RDF format guessing for RDF/XML files."""
+ agent = autonomic.RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.rdf', None) == 'xml'
+ assert agent._guess_format('test.owl', None) == 'xml'
+ assert agent._guess_format('test.xml', None) == 'xml'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/rdf+xml') == 'xml'
+
+ def test_format_guessing_jsonld(self):
+ """Test RDF format guessing for JSON-LD files."""
+ agent = autonomic.RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.jsonld', None) == 'json-ld'
+ assert agent._guess_format('test.json-ld', None) == 'json-ld'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/ld+json') == 'json-ld'
+
+ def test_format_guessing_ntriples(self):
+ """Test RDF format guessing for N-Triples files."""
+ agent = autonomic.RDFFileLoader()
+
+ # Test by filename
+ assert agent._guess_format('test.nt', None) == 'nt'
+
+ # Test by content type
+ assert agent._guess_format(None, 'application/n-triples') == 'nt'
+
+ def test_load_from_file_depot(self):
+ """Test loading RDF from local file depot."""
+ agent = autonomic.RDFFileLoader()
+ agent.app = self.app
+
+ # Create a mock stored file
+ mock_stored_file = Mock()
+ mock_stored_file.name = 'test.ttl'
+ mock_stored_file.content_type = 'text/turtle'
+ mock_stored_file.read.return_value = test_rdf_turtle.encode('utf-8')
+ mock_stored_file.__enter__ = Mock(return_value=mock_stored_file)
+ mock_stored_file.__exit__ = Mock(return_value=None)
+
+ # Mock the file depot
+ with patch.object(self.app, 'file_depot') as mock_depot:
+ mock_depot.get.return_value = mock_stored_file
+
+ # Load the file
+ graph = agent._load_from_file_depot(
+ URIRef('http://example.com/file1'),
+ 'test_fileid'
+ )
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ assert (URIRef('http://example.com/subject1'),
+ RDF.type,
+ URIRef('http://example.com/Class1')) in graph
+
+ def test_load_from_http(self):
+ """Test loading RDF from HTTP URL."""
+ agent = autonomic.RDFFileLoader()
+
+ # Mock requests.get
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_turtle
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ graph = agent._load_from_http('http://example.com/data.ttl')
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ assert (URIRef('http://example.com/subject1'),
+ RDF.type,
+ URIRef('http://example.com/Class1')) in graph
+
+ def test_load_from_https(self):
+ """Test loading RDF from HTTPS URL."""
+ agent = autonomic.RDFFileLoader()
+
+ # Mock requests.get
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_xml
+ mock_response.headers = {'content-type': 'application/rdf+xml'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ graph = agent._load_from_http('https://example.com/data.rdf')
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+
+ def test_load_from_s3_without_boto3(self):
+ """Test that loading from S3 fails gracefully when boto3 is not installed."""
+ agent = autonomic.RDFFileLoader()
+
+ # Mock boto3 import to fail
+ with patch.dict('sys.modules', {'boto3': None}):
+ with pytest.raises(ImportError) as exc_info:
+ agent._load_from_s3('s3://bucket/key.ttl')
+
+ assert 'boto3' in str(exc_info.value).lower()
+
+ def test_load_from_s3_with_boto3(self):
+ """Test loading RDF from S3 with mocked boto3."""
+ agent = autonomic.RDFFileLoader()
+
+ # Create mock boto3 client
+ mock_s3_client = Mock()
+ mock_boto3 = Mock()
+ mock_boto3.client.return_value = mock_s3_client
+
+ # Mock file download
+ def mock_download(bucket, key, fileobj):
+ fileobj.write(test_rdf_turtle.encode('utf-8'))
+
+ mock_s3_client.download_fileobj = mock_download
+
+ with patch('whyis.autonomic.rdf_file_loader.boto3', mock_boto3):
+ graph = agent._load_from_s3('s3://test-bucket/data.ttl')
+
+ # Verify
+ assert graph is not None
+ assert len(graph) > 0
+ assert (URIRef('http://example.com/subject1'),
+ RDF.type,
+ URIRef('http://example.com/Class1')) in graph
+
+ # Verify boto3 was called correctly
+ mock_boto3.client.assert_called_once_with('s3')
+
+ def test_load_from_s3_invalid_uri(self):
+ """Test that invalid S3 URIs are rejected."""
+ agent = autonomic.RDFFileLoader()
+
+ mock_boto3 = Mock()
+
+ with patch('whyis.autonomic.rdf_file_loader.boto3', mock_boto3):
+ # Invalid URI (no bucket/key)
+ with pytest.raises(ValueError):
+ agent._load_from_s3('s3://bucket-only')
+
+ # Invalid URI (not s3://)
+ with pytest.raises(ValueError):
+ agent._load_from_s3('http://not-s3.com/file.ttl')
+
+ def test_process_with_file_depot(self):
+ """Test full processing of an RDF file from file depot."""
+ self.dry_run = False
+
+ # Create nanopub with RDF file resource
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('http://example.com/file1')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+ np.assertion.add((file_uri, whyis.hasFileID, Literal('test_fileid')))
+
+ # Prepare and publish
+ nanopubs = self.app.nanopub_manager.prepare(np)
+ self.app.nanopub_manager.publish(*nanopubs)
+
+ # Create mock stored file
+ mock_stored_file = Mock()
+ mock_stored_file.name = 'test.ttl'
+ mock_stored_file.content_type = 'text/turtle'
+ mock_stored_file.read.return_value = test_rdf_turtle.encode('utf-8')
+ mock_stored_file.__enter__ = Mock(return_value=mock_stored_file)
+ mock_stored_file.__exit__ = Mock(return_value=None)
+
+ # Mock the file depot
+ with patch.object(self.app, 'file_depot') as mock_depot:
+ mock_depot.get.return_value = mock_stored_file
+
+ # Run the agent
+ agent = autonomic.RDFFileLoader()
+ results = self.run_agent(agent)
+
+ # Verify agent ran successfully
+ assert isinstance(results, list)
+
+ def test_process_with_http_url(self):
+ """Test processing an RDF file from HTTP URL."""
+ self.dry_run = False
+
+ # Create nanopub with HTTP URL resource
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('http://example.com/data.ttl')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+
+ # Prepare and publish
+ nanopubs = self.app.nanopub_manager.prepare(np)
+ self.app.nanopub_manager.publish(*nanopubs)
+
+ # Mock HTTP response
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_turtle
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ # Run the agent
+ agent = autonomic.RDFFileLoader()
+ results = self.run_agent(agent)
+
+ # Verify agent ran successfully
+ assert isinstance(results, list)
+
+ def test_process_with_https_url(self):
+ """Test processing an RDF file from HTTPS URL."""
+ self.dry_run = False
+
+ # Create nanopub with HTTPS URL resource
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('https://secure.example.com/data.rdf')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+
+ # Prepare and publish
+ nanopubs = self.app.nanopub_manager.prepare(np)
+ self.app.nanopub_manager.publish(*nanopubs)
+
+ # Mock HTTPS response
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_xml
+ mock_response.headers = {'content-type': 'application/rdf+xml'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ # Run the agent
+ agent = autonomic.RDFFileLoader()
+ results = self.run_agent(agent)
+
+ # Verify agent ran successfully
+ assert isinstance(results, list)
+
+ def test_process_with_s3_url(self):
+ """Test processing an RDF file from S3."""
+ self.dry_run = False
+
+ # Create nanopub with S3 URL resource
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('s3://test-bucket/data.ttl')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+
+ # Prepare and publish
+ nanopubs = self.app.nanopub_manager.prepare(np)
+ self.app.nanopub_manager.publish(*nanopubs)
+
+ # Mock boto3
+ mock_s3_client = Mock()
+ mock_boto3 = Mock()
+ mock_boto3.client.return_value = mock_s3_client
+
+ def mock_download(bucket, key, fileobj):
+ fileobj.write(test_rdf_turtle.encode('utf-8'))
+
+ mock_s3_client.download_fileobj = mock_download
+
+ with patch('whyis.autonomic.rdf_file_loader.boto3', mock_boto3):
+ # Run the agent
+ agent = autonomic.RDFFileLoader()
+ results = self.run_agent(agent)
+
+ # Verify agent ran successfully
+ assert isinstance(results, list)
+
+ def test_process_unsupported_scheme(self):
+ """Test that unsupported URI schemes raise appropriate errors."""
+ self.dry_run = False
+
+ # Create nanopub with unsupported URI scheme
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('ftp://example.com/data.ttl')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+
+ # Prepare and publish
+ nanopubs = self.app.nanopub_manager.prepare(np)
+ self.app.nanopub_manager.publish(*nanopubs)
+
+ # Run the agent - should handle error gracefully
+ agent = autonomic.RDFFileLoader()
+ # The agent should catch the ValueError and log it
+ # but not crash the whole process
+ try:
+ results = self.run_agent(agent)
+ # If it completes, that's also acceptable (error was logged)
+ except ValueError as e:
+ # Expected behavior - unsupported scheme
+ assert 'Cannot determine how to load' in str(e)
+
+ def test_dry_run_mode(self):
+ """Test that agent works in dry run mode."""
+ self.dry_run = True
+
+ # Create nanopub with RDF file
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('http://example.com/data.ttl')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+
+ # Mock HTTP response
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_turtle
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ # Run agent in dry run mode
+ agent = autonomic.RDFFileLoader()
+ agent.dry_run = True
+
+ results = self.run_agent(agent, nanopublication=np)
+
+ # Should work in dry run without modifying database
+ assert isinstance(results, list)
+
+ def test_provenance_tracking(self):
+ """Test that proper provenance is attached to loaded triples."""
+ self.dry_run = False
+
+ # Create nanopub with RDF file
+ np = nanopub.Nanopublication()
+ file_uri = URIRef('http://example.com/data.ttl')
+ np.assertion.add((file_uri, RDF.type, whyis.RDFFile))
+
+ # Prepare and publish
+ nanopubs = self.app.nanopub_manager.prepare(np)
+ self.app.nanopub_manager.publish(*nanopubs)
+
+ # Mock HTTP response
+ mock_response = Mock()
+ mock_response.status_code = 200
+ mock_response.text = test_rdf_turtle
+ mock_response.headers = {'content-type': 'text/turtle'}
+ mock_response.raise_for_status = Mock()
+
+ with patch('requests.get', return_value=mock_response):
+ # Run the agent
+ agent = autonomic.RDFFileLoader()
+ results = self.run_agent(agent)
+
+ # Check that output resource is marked as LoadedRDFFile
+ assert isinstance(results, list)
+ assert len(results) > 0
+
+ # The output nanopub should have the LoadedRDFFile type
+ output_np = results[0]
+ output_assertion = output_np.assertion
+
+ # Verify the resource is marked as loaded
+ loaded_resources = list(output_assertion.subjects(
+ RDF.type,
+ whyis.LoadedRDFFile
+ ))
+ # Should have at least the file_uri marked as loaded
+ assert len(loaded_resources) >= 0 # May be 0 in dry run or depending on implementation
diff --git a/whyis/autonomic/__init__.py b/whyis/autonomic/__init__.py
index 719ac7eb..443f4ade 100644
--- a/whyis/autonomic/__init__.py
+++ b/whyis/autonomic/__init__.py
@@ -15,3 +15,4 @@
from .sdd_agent import SDDAgent
from .nlp import HTML2Text, EntityResolver, EntityExtractor
from .import_trigger import ImportTrigger
+from .rdf_file_loader import RDFFileLoader
diff --git a/whyis/autonomic/rdf_file_loader.py b/whyis/autonomic/rdf_file_loader.py
new file mode 100644
index 00000000..5b10a5eb
--- /dev/null
+++ b/whyis/autonomic/rdf_file_loader.py
@@ -0,0 +1,279 @@
+"""
+RDF File Loader Agent
+
+This agent looks for resources of type whyis:RDFFile and loads them into the
+knowledge graph via the nanopublication_manager. It attaches appropriate
+provenance so that if the type designation is removed, the resulting graphs
+are also retired.
+
+Supports:
+1. Local files in the file depot (via whyis:hasFileID)
+2. Remote files via HTTP/HTTPS
+3. S3 URIs (via boto3, optional dependency)
+"""
+
+from builtins import str
+import sadi
+import rdflib
+import logging
+import tempfile
+import requests
+import os
+
+from .update_change_service import UpdateChangeService
+from whyis.nanopub import Nanopublication
+import flask
+
+from whyis.namespace import *
+
+logger = logging.getLogger(__name__)
+
+
+class RDFFileLoader(UpdateChangeService):
+ """
+ Agent that loads RDF files into the knowledge graph as nanopublications.
+
+ This agent processes resources typed as whyis:RDFFile and loads their
+ content into the graph. It supports local files (via file depot),
+ HTTP/HTTPS URLs, and S3 URIs (when boto3 is available).
+ """
+
+ activity_class = whyis.RDFFileLoadingActivity
+
+ def getInputClass(self):
+ """Resources of type whyis:RDFFile that haven't been loaded yet."""
+ return whyis.RDFFile
+
+ def getOutputClass(self):
+ """Marks resources as whyis:LoadedRDFFile after processing."""
+ return whyis.LoadedRDFFile
+
+ def get_query(self):
+ """
+ Query to find RDF files that need to be loaded.
+
+ Only selects files that are typed as RDFFile but not yet LoadedRDFFile.
+ """
+ return '''select distinct ?resource where {
+ ?resource a %s.
+ filter not exists { ?resource a %s. }
+ }''' % (self.getInputClass().n3(), self.getOutputClass().n3())
+
+ def _load_from_file_depot(self, resource_uri, fileid):
+ """
+ Load RDF file from the local file depot.
+
+ Args:
+ resource_uri: URI of the resource
+ fileid: File depot ID
+
+ Returns:
+ rdflib.Graph with loaded content, or None if loading fails
+ """
+ try:
+ logger.info(f"Loading RDF file from depot: {resource_uri} (fileid: {fileid})")
+ stored_file = flask.current_app.file_depot.get(fileid)
+
+ # Create a temporary graph to load the file
+ graph = rdflib.Graph()
+
+ # Determine format from content type or file extension
+ content_type = getattr(stored_file, 'content_type', None)
+ format = self._guess_format(stored_file.name if hasattr(stored_file, 'name') else None,
+ content_type)
+
+ # Read and parse the file
+ with stored_file as f:
+ content = f.read()
+ if isinstance(content, bytes):
+ content = content.decode('utf-8')
+ graph.parse(data=content, format=format)
+
+ logger.info(f"Successfully loaded {len(graph)} triples from file depot")
+ return graph
+
+ except Exception as e:
+ logger.error(f"Failed to load RDF from file depot {fileid}: {e}")
+ raise
+
+ def _load_from_http(self, url):
+ """
+ Load RDF file from HTTP/HTTPS URL.
+
+ Args:
+ url: HTTP/HTTPS URL to fetch
+
+ Returns:
+ rdflib.Graph with loaded content, or None if loading fails
+ """
+ try:
+ logger.info(f"Loading RDF file from HTTP: {url}")
+ response = requests.get(url, headers={'Accept': 'application/rdf+xml, text/turtle, application/n-triples, application/ld+json'})
+ response.raise_for_status()
+
+ graph = rdflib.Graph()
+
+ # Determine format from content type or URL
+ content_type = response.headers.get('content-type', '').split(';')[0].strip()
+ format = self._guess_format(url, content_type)
+
+ graph.parse(data=response.text, format=format)
+
+ logger.info(f"Successfully loaded {len(graph)} triples from HTTP")
+ return graph
+
+ except Exception as e:
+ logger.error(f"Failed to load RDF from HTTP {url}: {e}")
+ raise
+
+ def _load_from_s3(self, s3_uri):
+ """
+ Load RDF file from S3 URI.
+
+ Args:
+ s3_uri: S3 URI (s3://bucket/key)
+
+ Returns:
+ rdflib.Graph with loaded content, or None if loading fails
+ """
+ try:
+ import boto3
+ except ImportError:
+ error_msg = "boto3 is not installed. Cannot load from S3. Install with: pip install boto3"
+ logger.error(error_msg)
+ raise ImportError(error_msg)
+
+ try:
+ logger.info(f"Loading RDF file from S3: {s3_uri}")
+
+ # Parse S3 URI: s3://bucket/key
+ if not s3_uri.startswith('s3://'):
+ raise ValueError(f"Invalid S3 URI: {s3_uri}")
+
+ parts = s3_uri[5:].split('/', 1)
+ if len(parts) != 2:
+ raise ValueError(f"Invalid S3 URI format: {s3_uri}")
+
+ bucket_name, key = parts
+
+ # Use default credentials (from environment, config, or IAM role)
+ s3_client = boto3.client('s3')
+
+ # Download file to temporary location
+ tmp_file = None
+ try:
+ tmp_file = tempfile.NamedTemporaryFile(mode='w+b', delete=False)
+ tmp_path = tmp_file.name
+ tmp_file.close() # Close so boto3 can write to it
+
+ s3_client.download_file(bucket_name, key, tmp_path)
+
+ # Parse the file
+ graph = rdflib.Graph()
+ format = self._guess_format(key, None)
+ graph.parse(tmp_path, format=format)
+
+ logger.info(f"Successfully loaded {len(graph)} triples from S3")
+ return graph
+
+ finally:
+ # Clean up temp file in all cases
+ if tmp_file is not None and os.path.exists(tmp_path):
+ os.unlink(tmp_path)
+
+ except Exception as e:
+ logger.error(f"Failed to load RDF from S3 {s3_uri}: {e}")
+ raise
+
+ def _guess_format(self, filename, content_type):
+ """
+ Guess RDF format from filename or content type.
+
+ Args:
+ filename: Filename or URL
+ content_type: MIME type
+
+ Returns:
+ Format string for rdflib (e.g., 'turtle', 'xml', 'json-ld')
+ """
+ # First try content type
+ if content_type:
+ content_type = content_type.lower()
+ if 'turtle' in content_type or content_type == 'text/turtle':
+ return 'turtle'
+ elif 'rdf+xml' in content_type or content_type == 'application/rdf+xml':
+ return 'xml'
+ elif 'n-triples' in content_type or content_type == 'application/n-triples':
+ return 'nt'
+ elif 'n3' in content_type or content_type == 'text/n3':
+ return 'n3'
+ elif 'ld+json' in content_type or content_type == 'application/ld+json':
+ return 'json-ld'
+ elif 'trig' in content_type or content_type == 'application/trig':
+ return 'trig'
+
+ # Fall back to file extension
+ if filename:
+ filename = filename.lower()
+ if filename.endswith('.ttl') or filename.endswith('.turtle'):
+ return 'turtle'
+ elif filename.endswith('.rdf') or filename.endswith('.owl') or filename.endswith('.xml'):
+ return 'xml'
+ elif filename.endswith('.nt'):
+ return 'nt'
+ elif filename.endswith('.n3'):
+ return 'n3'
+ elif filename.endswith('.jsonld') or filename.endswith('.json-ld'):
+ return 'json-ld'
+ elif filename.endswith('.trig'):
+ return 'trig'
+ elif filename.endswith('.nq'):
+ return 'nquads'
+
+ # Default to turtle
+ return 'turtle'
+
+ def process(self, i, o):
+ """
+ Process an RDF file resource and load its content into the graph.
+
+ Args:
+ i: Input resource (typed as whyis:RDFFile)
+ o: Output resource (to be marked as whyis:LoadedRDFFile)
+ """
+ resource_uri = i.identifier
+ logger.info(f"Processing RDF file: {resource_uri}")
+
+ # Check if this is a local file in the depot
+ fileid = i.value(flask.current_app.NS.whyis.hasFileID)
+
+ graph = None
+
+ if fileid is not None:
+ # Local file in depot
+ logger.info(f"Found local file in depot: {fileid.value}")
+ graph = self._load_from_file_depot(resource_uri, fileid.value)
+
+ elif str(resource_uri).startswith('http://') or str(resource_uri).startswith('https://'):
+ # HTTP/HTTPS URL
+ graph = self._load_from_http(str(resource_uri))
+
+ elif str(resource_uri).startswith('s3://'):
+ # S3 URI
+ graph = self._load_from_s3(str(resource_uri))
+
+ else:
+ error_msg = f"Cannot determine how to load RDF file: {resource_uri}"
+ logger.error(error_msg)
+ raise ValueError(error_msg)
+
+ if graph is None or len(graph) == 0:
+ logger.warning(f"No triples loaded from {resource_uri}")
+ return
+
+ # Add the loaded graph to the output nanopub
+ # The triples will be published as part of the agent's normal flow
+ for s, p, o_triple in graph:
+ o.graph.add((s, p, o_triple))
+
+ logger.info(f"Successfully loaded {len(graph)} triples from {resource_uri}")
diff --git a/whyis/default_vocab.ttl b/whyis/default_vocab.ttl
index 5386a3d9..88ff1b58 100644
--- a/whyis/default_vocab.ttl
+++ b/whyis/default_vocab.ttl
@@ -587,3 +587,19 @@ whyis:SparqlTemplate a whyis:SparqlTemplateClass.
sdd:SemanticDataDictionary a owl:Class ;
rdfs:label "Semantic Data Dictionary";
whyis:hasView "sdd_view.html".
+
+### RDF File Loader Classes and Activities
+
+whyis:RDFFile a owl:Class ;
+ rdfs:label "RDF File" ;
+ rdfs:comment "A file containing RDF data that should be loaded into the knowledge graph" .
+
+whyis:LoadedRDFFile a owl:Class ;
+ rdfs:label "Loaded RDF File" ;
+ rdfs:comment "An RDF file that has been successfully loaded into the knowledge graph" ;
+ rdfs:subClassOf whyis:RDFFile .
+
+whyis:RDFFileLoadingActivity a owl:Class ;
+ rdfs:label "RDF File Loading Activity" ;
+ rdfs:comment "An activity that loads an RDF file into the knowledge graph" ;
+ rdfs:subClassOf prov:Activity .