diff --git a/.gitignore b/.gitignore
index c0c5d24b3..63de6b912 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,3 +10,4 @@ pom.xml.versionsBackup
**/maven-eclipse.xml
**/any23-site/
**/nb*.xml
+**/c/build/
diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties
index 4f68586d3..3d211ff7e 100644
--- a/api/src/main/resources/default-configuration.properties
+++ b/api/src/main/resources/default-configuration.properties
@@ -43,10 +43,18 @@ any23.extraction.metadata.domain.per.entity=off
# Allows to decide which RDFa Extractor to enable.
# If 'on' will be activated the programmatic RDFa 1.1 Extractor
-# (org.deri.any23.extractor.rdfa.RDFa11Extractor) otherwise will be
-# registered the RDFa 1.0 legacy one (org.deri.any23.extractor.rdfa.RDFaExtractor).
+# (org.apache.any23.extractor.rdfa.RDFa11Extractor) otherwise will be
+# registered the RDFa 1.0 legacy one (org.apache.any23.extractor.rdfa.RDFaExtractor).
any23.extraction.rdfa.programmatic=on
+# Allows to enable Librdfa Extractor.
+# If 'on' will override the extractors with the programmatic option,
+# RDFa 1.1 Extractor (org.apache.any23.extractor.rdfa.RDFa11Extractor) and
+# RDFa 1.0 Extractor (org.apache.any23.extractor.rdfa.RDFaExtractor).
+# If the option is 'off' (by default), it will choose the specified extractor
+# in the programmatic option (any23.extraction.rdfa.programmatic).
+any23.extraction.rdfa.librdfa=off
+
# The extraction context IRI to be used by the
# SingleDocumentExtraction. If == '?' the document IRI will
# be used. It can be overriden by specifying a different
diff --git a/core/pom.xml b/core/pom.xml
index 49a1bfcb9..b066d55f9 100644
--- a/core/pom.xml
+++ b/core/pom.xml
@@ -62,6 +62,14 @@
+
+
+ ${project.groupId}
+ apache-any23-librdfa
+ ${project.version}
+
+
+
org.apache.httpcomponents
@@ -347,7 +355,7 @@
-
+
diff --git a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
index ca3bb982b..4482e5a8e 100644
--- a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
+++ b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java
@@ -17,15 +17,15 @@
package org.apache.any23.extractor;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.html.HTMLMetaExtractorFactory;
+import org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory;
import org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory;
import org.apache.any23.extractor.rdfa.RDFaExtractorFactory;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
/**
* Singleton class acting as a register for all the various
* {@link Extractor}.
@@ -55,12 +55,17 @@ public static ExtractorRegistry getInstance() {
if (instance == null) {
instance = new ExtractorRegistryImpl();
- if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) {
+ if(conf.getFlagProperty("any23.extraction.rdfa.librdfa")){
+ instance.unregister(RDFaExtractorFactory.NAME);
+ instance.unregister(RDFa11ExtractorFactory.NAME);
+ } else if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) {
+ instance.unregister(LibRdfaExtractorFactory.NAME);
instance.unregister(RDFaExtractorFactory.NAME);
// FIXME: Unregister RDFaExtractor if flag is not set
//instance.register(RDFa11Extractor.factory);
} else {
instance.unregister(RDFa11ExtractorFactory.NAME);
+ instance.unregister(LibRdfaExtractorFactory.NAME);
// FIXME: Unregister RDFaExtractor if flag is set
//instance.register(RDFaExtractor.factory);
}
diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
index 6b4406a5b..e2618ffce 100644
--- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
+++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java
@@ -17,10 +17,16 @@
package org.apache.any23.extractor.rdf;
-import org.apache.any23.extractor.IssueReport;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.util.Collections;
+import java.util.HashSet;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.IssueReport;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
+import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.ParseErrorListener;
import org.eclipse.rdf4j.rio.RDFFormat;
@@ -36,12 +42,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.util.Collections;
-import java.util.HashSet;
-
/**
* This factory provides a common logic for creating and configuring correctly
* any RDF parser used within the library.
@@ -124,6 +124,27 @@ public RDFParser getRDFa11Parser(
configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
return parser;
}
+
+ /**
+ * Returns a new instance of a configured RDFa parser backed by the librdfa library.
+ *
+ * @param verifyDataType data verification enable if true.
+ * @param stopAtFirstError the parser stops at first error if true.
+ * @param extractionContext the extraction context where the parser is used.
+ * @param extractionResult the output extraction result.
+ * @return a new instance of a configured librdfa-based RDFa parser.
+ */
+ public RDFParser getRDFaLibrdfaParser(
+ final boolean verifyDataType,
+ final boolean stopAtFirstError,
+ final ExtractionContext extractionContext,
+ final ExtractionResult extractionResult
+ ) {
+ final RDFParser parser = new LibrdfaRDFaParser();
+ // Request RDFa 1.1 compatibility before the shared parser configuration is applied.
+ parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1);
+ configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
+ return parser;
+ }
/**
* Returns a new instance of a configured RDFXMLParser.
diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java
new file mode 100644
index 000000000..e1d598a28
--- /dev/null
+++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.rdfa;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.rdf.BaseRDFExtractor;
+import org.apache.any23.extractor.rdf.RDFParserFactory;
+import org.eclipse.rdf4j.rio.RDFParser;
+
+/**
+ * RDF extractor whose parser is obtained from
+ * {@link RDFParserFactory#getRDFaLibrdfaParser}, i.e. the librdfa-based RDFa parser.
+ *
+ * @author Julio Caguano
+ */
+public class LibRdfaExtractor extends BaseRDFExtractor {
+
+ /**
+ * Creates an extractor with explicit parser settings.
+ *
+ * @param verifyDataType forwarded to the base extractor; later passed to the parser factory.
+ * @param stopAtFirstError forwarded to the base extractor; later passed to the parser factory.
+ */
+ public LibRdfaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
+ super(verifyDataType, stopAtFirstError);
+ }
+
+ /**
+ * Creates an extractor with both data type verification and
+ * stop-at-first-error disabled.
+ */
+ public LibRdfaExtractor() {
+ this(false, false);
+ }
+
+ /**
+ * Builds the librdfa-based parser for the given extraction, using this
+ * extractor's current verify/stop settings.
+ *
+ * @param extractionContext the extraction context handed to the parser factory.
+ * @param extractionResult the extraction result handed to the parser factory.
+ * @return the parser produced by {@link RDFParserFactory#getRDFaLibrdfaParser}.
+ */
+ @Override
+ protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) {
+ return RDFParserFactory.getInstance().getRDFaLibrdfaParser(
+ isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult
+ );
+ }
+
+ /**
+ * @return the shared description instance exposed by {@link LibRdfaExtractorFactory}.
+ */
+ @Override
+ public ExtractorDescription getDescription() {
+ return LibRdfaExtractorFactory.getDescriptionInstance();
+ }
+
+}
diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java
new file mode 100644
index 000000000..6d1d51e8c
--- /dev/null
+++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.rdfa;
+
+import java.util.Arrays;
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.Prefixes;
+
+/**
+ * Factory and description for {@link LibRdfaExtractor}.
+ *
+ * @author Julio Caguano
+ */
+public class LibRdfaExtractorFactory extends SimpleExtractorFactory
+ implements ExtractorFactory {
+
+ /** Name under which this extractor is registered ("html-librdfa"). */
+ public static final String NAME = "html-librdfa";
+ /** No prefixes are declared for this extractor. */
+ public static final Prefixes PREFIXES = null;
+
+ // Single shared instance handed out by getDescriptionInstance().
+ private static final ExtractorDescription descriptionInstance = new LibRdfaExtractorFactory();
+
+ /**
+ * Configures the factory with its name, prefixes, the HTML/XHTML MIME types
+ * (each with q=0.3) and the resource name "example-rdfa11.html".
+ * NOTE(review): the last argument is presumably the example input shipped
+ * for this extractor - confirm against SimpleExtractorFactory.
+ */
+ public LibRdfaExtractorFactory() {
+ super(LibRdfaExtractorFactory.NAME,
+ LibRdfaExtractorFactory.PREFIXES,
+ Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
+ "example-rdfa11.html");
+ }
+
+ /**
+ * @return a new {@link LibRdfaExtractor} created with its default constructor.
+ */
+ @Override
+ public LibRdfaExtractor createExtractor() {
+ return new LibRdfaExtractor();
+ }
+
+ /**
+ * @return the shared description instance of this factory.
+ */
+ public static ExtractorDescription getDescriptionInstance() {
+ return descriptionInstance;
+ }
+}
diff --git a/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
index 2b1df7996..7303bcbba 100644
--- a/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
+++ b/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory
@@ -27,5 +27,6 @@ org.apache.any23.extractor.rdf.TriXExtractorFactory
org.apache.any23.extractor.rdf.TurtleExtractorFactory
org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory
org.apache.any23.extractor.rdfa.RDFaExtractorFactory
+org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory
org.apache.any23.extractor.xpath.XPathExtractorFactory
org.apache.any23.extractor.yaml.YAMLExtractorFactory
\ No newline at end of file
diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java
new file mode 100644
index 000000000..d0572c4d5
--- /dev/null
+++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java
@@ -0,0 +1,335 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.extractor.rdfa;
+
+import java.io.IOException;
+import java.util.List;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractorFactory;
+import static org.apache.any23.extractor.rdfa.AbstractRDFaExtractorTestCase.vFOAF;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.vocab.FOAF;
+import org.apache.any23.vocab.OGP;
+import org.apache.any23.vocab.OGPMusic;
+import org.eclipse.rdf4j.model.Literal;
+import org.eclipse.rdf4j.model.Statement;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
+import org.eclipse.rdf4j.repository.RepositoryException;
+import org.eclipse.rdf4j.repository.RepositoryResult;
+import org.eclipse.rdf4j.rio.RDFHandlerException;
+import org.eclipse.rdf4j.rio.RDFParseException;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Reference Test Class for {@link RDFaExtractor}.
+ * @author Julio Caguano
+ */
+public class RDFaLibrdfaExtractorTest extends AbstractRDFaExtractorTestCase {
+
+ /**
+ * Taken from the
+ * GoodRelations
+ * test cases. It checks if the extraction is the same when the
+ * namespaces are defined in RDFa1.0 or
+ * RDFa1.1 respectively.
+ *
+ * @throws org.eclipse.rdf4j.repository.RepositoryException
+ * @throws java.io.IOException
+ * @throws org.eclipse.rdf4j.rio.RDFHandlerException
+ * @throws org.eclipse.rdf4j.rio.RDFParseException
+ */
+ @Test
+ public void testRDFa11PrefixBackwardCompatibility()
+ throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
+ final int EXPECTED_STATEMENTS = 31;
+
+ assertExtract("/html/rdfa/goodrelations-rdfa10.html");
+ logger.debug("Model 1 " + dumpHumanReadableTriples());
+ Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
+ List rdfa10Stmts = dumpAsListOfStatements();
+
+ //assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
+ assertExtract("/html/rdfa/goodrelations-rdfa11.html");
+ logger.debug("Model 2 " + dumpHumanReadableTriples());
+ Assert.assertTrue(dumpAsListOfStatements().size() >= EXPECTED_STATEMENTS);
+
+ for (Statement stmt : rdfa10Stmts) {
+ assertContains(stmt);
+ }
+ }
+
+ /**
+ * This test verifies the correct object resource conversion.
+ *
+ * @throws RepositoryException
+ */
+ @Test
+ public void testObjectResourceConversion() throws RepositoryException {
+ assertExtract("/html/rdfa/object-resource-test.html");
+ logger.debug(dumpModelToTurtle());
+ assertContains(
+ null,
+ FOAF.getInstance().page,
+ RDFUtils.iri("http://en.wikipedia.org/New_York")
+ );
+ }
+
+ /**
+ * This test checks the behavior of the RDFa extraction where the
+ * datatype of a property is explicitly set. For details see the
+ * RDFa in XHTML: Syntax and
+ * Processing
+ * recommendation.
+ *
+ * @throws RepositoryException
+ */
+ @Test
+ public void testExplicitDatatypeDeclaration() throws RepositoryException {
+ assertExtract("/html/rdfa/xmlliteral-datatype-test.html");
+ logger.debug(dumpModelToTurtle());
+
+ RepositoryResult stmts
+ = conn.getStatements(RDFUtils.iri("http://dbpedia.org/resource/Albert_Einstein"),
+ vFOAF.name, null, false);
+ Assert.assertTrue(stmts.hasNext());
+ Value obj = stmts.next().getObject();
+ Assert.assertTrue(obj instanceof Literal);
+ Literal lit = (Literal) obj;
+ Assert.assertEquals(lit.getDatatype(), RDF.XMLLITERAL);
+ Assert.assertEquals(lit.getLabel(), "Albert "
+ + "Einstein");
+ }
+
+ /**
+ * Tests the correct behavior of REL and HREF.
+ *
+ * @throws RepositoryException
+ */
+ @Test
+ public void testRelWithHref() throws RepositoryException {
+ assertExtract("/html/rdfa/rel-href.html");
+ logger.debug(dumpModelToTurtle());
+
+ assertContains(
+ RDFUtils.iri(baseIRI.toString(), "#me"),
+ FOAF.getInstance().name,
+ "John Doe"
+ );
+ assertContains(
+ RDFUtils.iri(baseIRI.toString(), "#me"),
+ FOAF.getInstance().homepage,
+ RDFUtils.iri("http://example.org/blog/")
+ );
+ }
+
+ /**
+ * This test verifies the correct REL/REV attribute usage.
+ *
+ * @throws RepositoryException
+ */
+ @Test
+ public void testRelRevSupport() throws RepositoryException {
+ assertExtract("/html/rdfa/rel-rev.html");
+ logger.debug(dumpModelToTurtle());
+
+ assertContains(
+ baseIRI,
+ RDFUtils.iri("http://bob.example.com/cite"),
+ RDFUtils.iri("http://www.example.com/books/the_two_towers")
+ );
+ assertContains(
+ RDFUtils.iri("http://path/to/chapter"),
+ RDFUtils.iri("http://bob.example.com/isChapterOf"),
+ baseIRI
+ );
+ }
+
+ /**
+ * Tests the @vocab support.
+ *
+ * @throws RepositoryException
+ */
+ @Test
+ public void testVocabSupport() throws RepositoryException {
+ assertExtract("/html/rdfa/vocab.html");
+ logger.debug(dumpModelToTurtle());
+
+ assertContains(
+ RDFUtils.iri(baseIRI.toString(), "#me"),
+ RDFUtils.iri("http://xmlns.com/foaf/0.1/name"),
+ RDFUtils.literal("John Doe")
+ );
+ assertContains(
+ RDFUtils.iri(baseIRI.toString(), "#me"),
+ RDFUtils.iri("http://xmlns.com/foaf/0.1/homepage"),
+ RDFUtils.iri("http://example.org/blog/")
+ );
+ }
+
+ /**
+ * Tests the correct support of alternate
+ * Open Graph Protocol Object Types
+ *
+ * @throws IOException
+ * @throws org.apache.any23.extractor.ExtractionException
+ * @throws RepositoryException
+ */
+ @Test
+ public void testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException {
+ assertExtract("/html/rdfa/opengraph-music-song-object-type.html");
+ logger.info(dumpHumanReadableTriples());
+
+ Assert.assertEquals(9, getStatementsSize(null, null, null));
+ final OGPMusic vOGPMusic = OGPMusic.getInstance();
+ assertContains(baseIRI, vOGPMusic.musicDuration, RDFUtils.literal("447"));
+ assertContains(
+ baseIRI,
+ vOGPMusic.musicMusician,
+ RDFUtils.literal(
+ "Jono Grant / Tony McGuinness / Ashley Tomberlin"
+ )
+ );
+ assertContains(baseIRI, vOGPMusic.musicAlbum, RDFUtils.literal("Tri-State"));
+ }
+
+ /**
+ * Taken from the
+ * GoodRelations
+ * test cases. It checks if the extraction is the same when the
+ * namespaces are defined in RDFa1.0.
+ *
+ * @throws RepositoryException
+ * @throws java.io.IOException
+ * @throws org.eclipse.rdf4j.rio.RDFHandlerException
+ * @throws org.eclipse.rdf4j.rio.RDFParseException
+ */
+ @Test
+ public void testRDFa10Extraction()
+ throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
+ final int EXPECTED_STATEMENTS = 31;
+
+ assertExtract("/html/rdfa/goodrelations-rdfa10.html");
+ logger.debug(dumpModelToNQuads());
+
+ Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
+ assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
+ }
+
+ /**
+ * Taken from the
+ * GoodRelations
+ * test cases. It checks if the extraction is the same when the
+ * namespaces are defined in RDFa1.1.
+ *
+ * @throws RepositoryException
+ * @throws java.io.IOException
+ * @throws org.eclipse.rdf4j.rio.RDFHandlerException
+ * @throws org.eclipse.rdf4j.rio.RDFParseException
+ */
+ @Test
+ public void testRDFa11Extraction()
+ throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
+ final int EXPECTED_STATEMENTS = 31;
+
+ assertExtract("/html/rdfa/goodrelations-rdfa11.html");
+ logger.debug(dumpHumanReadableTriples());
+
+ Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
+ assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
+ }
+
+ /**
+ * Tests the correct support of Open Graph
+ * Protocol's
+ * Basic Metadata,
+ * Optional Metadata,
+ * Structured Properties and
+ * Arrays.
+ *
+ * @throws IOException
+ * @throws org.apache.any23.extractor.ExtractionException
+ * @throws RepositoryException
+ */
+ @Test
+ public void testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException {
+ assertExtract("/html/rdfa/opengraph-structured-properties.html");
+ logger.info(dumpHumanReadableTriples());
+
+ Assert.assertEquals(31, getStatementsSize(null, null, null));
+ final OGP vOGP = OGP.getInstance();
+ assertContains(baseIRI, vOGP.audio, RDFUtils.literal("http://example.com/sound.mp3"));
+ assertContains(
+ baseIRI,
+ vOGP.description,
+ RDFUtils.literal(
+ "Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond."
+ )
+ );
+ assertContains(baseIRI, vOGP.determiner, RDFUtils.literal("the"));
+ assertContains(baseIRI, vOGP.locale, RDFUtils.literal("en_GB"));
+ assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("fr_FR"));
+ assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("es_ES"));
+ assertContains(baseIRI, vOGP.siteName, RDFUtils.literal("IMDb"));
+ assertContains(baseIRI, vOGP.video, RDFUtils.literal("http://example.com/bond/trailer.swf"));
+ }
+
+ /**
+ * Tests that the default parser settings enable tolerance in data type
+ * parsing.
+ */
+ @Test
+ public void testTolerantParsing() {
+ assertExtract("/html/rdfa/oreilly-invalid-datatype.html");
+ }
+
+ @Override
+ protected ExtractorFactory> getExtractorFactory() {
+ return new LibRdfaExtractorFactory();
+ }
+
+}
diff --git a/librdfa-rdf4j/README.MD b/librdfa-rdf4j/README.MD
new file mode 100644
index 000000000..2e73818f9
--- /dev/null
+++ b/librdfa-rdf4j/README.MD
@@ -0,0 +1,56 @@
+# Librdfa - RDF4J
+
+RDF4J parser that uses [librdfa](https://github.com/rdfa/librdfa) to parse RDFa to triples. See the [documentation](https://cwiki.apache.org/confluence/display/ANY23/Librdfa-rdf4j+documentation) for more information.
+
+## Prerequisites
+
+You need to install the [librdfa](https://github.com/rdfa/librdfa) library.
+
+## Install
+
+``` xml
+<dependency>
+  <groupId>org.apache.any23</groupId>
+  <artifactId>apache-any23-librdfa</artifactId>
+  <version>${project.version}</version>
+</dependency>
+```
+
+## Compile
+
+`mvn clean install`
+
+## Use
+
+Add the library and you can parse an `InputStream` as you would do with [`Rio`](http://docs.rdf4j.org/javadoc/2.1/org/eclipse/rdf4j/rio/Rio.html).
+
+``` java
+RDFParser rdfParser = Rio.createParser(RDFFormat.RDFA);
+Model model = new LinkedHashModel();
+rdfParser.setRDFHandler(new StatementCollector(model));
+rdfParser.parse(in, "http://www.example.org./");
+```
+
+## Benchmarking
+
+In general librdfa is 2-5 seconds faster than semargl.
+
+### librdfa-rdf4j
+- round: 0.11 [+- 0.00]
+- round.block: 0.00 [+- 0.00]
+- round.gc: 0.00 [+- 0.00]
+- GC.calls: 0
+- GC.time: 0.00
+- time.total: 0.11
+- time.warmup: 0.00
+- time.bench: 0.11
+
+### semargl-rdf4j
+- round: 0.15 [+- 0.00]
+- round.block: 0.00 [+- 0.00]
+- round.gc: 0.00 [+- 0.00]
+- GC.calls: 1
+- GC.time: 0.00
+- time.total: 0.15
+- time.warmup: 0.00
+- time.bench: 0.15
diff --git a/librdfa-rdf4j/pom.xml b/librdfa-rdf4j/pom.xml
new file mode 100644
index 000000000..4ead804b5
--- /dev/null
+++ b/librdfa-rdf4j/pom.xml
@@ -0,0 +1,152 @@
+
+
+
+
+
+ org.apache.any23
+ apache-any23
+ 2.3-SNAPSHOT
+ ../
+
+
+ 4.0.0
+ org.apache.any23
+ apache-any23-librdfa
+ jar
+ Apache Any23 :: Librdfa-RDF4J
+
+
+ ${project.basedir}/src/main/c/
+ ${jni.base}/build/
+
+ 4.12
+ 0.7.2
+
+
+
+
+
+ com.googlecode.cmake-maven-project
+ cmake-maven-plugin
+ 3.7.2-b1
+
+
+ cmake-generate
+
+ generate
+
+
+ ${jni.base}
+ ${jni.build}
+ Unix Makefiles
+ linux-x86_64
+
+ ${jni.build}
+
+
+
+
+ cmake-compile
+ process-resources
+
+ compile
+
+
+ ${jni.build}
+ linux-x86_64
+
+
+
+
+
+ org.codehaus.mojo
+ build-helper-maven-plugin
+ 3.0.0
+
+
+ add-source
+ generate-sources
+
+ add-source
+
+
+
+ ${jni.build}
+
+
+
+
+
+
+ maven-antrun-plugin
+ 1.8
+
+
+ process-classes
+
+
+ ${jni.base}
+
+
+
+
+
+
+
+
+ run
+
+
+
+
+
+
+
+
+
+ org.eclipse.rdf4j
+ rdf4j-rio-api
+
+
+
+
+ junit
+ junit
+ test
+
+
+ com.carrotsearch
+ junit-benchmarks
+ ${junit.benchmarks.version}
+ test
+
+
+ org.slf4j
+ slf4j-log4j12
+ test
+
+
+ org.semarglproject
+ semargl-rdf4j
+ test
+
+
+
+
\ No newline at end of file
diff --git a/librdfa-rdf4j/src/main/c/CMakeLists.txt b/librdfa-rdf4j/src/main/c/CMakeLists.txt
new file mode 100644
index 000000000..ef5f08a6c
--- /dev/null
+++ b/librdfa-rdf4j/src/main/c/CMakeLists.txt
@@ -0,0 +1,54 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 2.8)
+
+# Check if required packages are installed
+find_package(SWIG REQUIRED)
+find_package(Java REQUIRED)
+find_package(JNI REQUIRED)
+find_package(LibXml2 REQUIRED)
+
+# Add modules
+include(UseJava)
+include(UseSWIG)
+
+# Add directories to the build process
+include_directories(${LIBXML2_INCLUDE_DIR})
+include_directories(${JNI_INCLUDE_DIRS})
+include_directories(${CMAKE_CURRENT_BINARY_DIR}/..)
+
+# Link library. TODO: replace because of deprecation
+link_libraries(rdfa)
+
+# Build the C++ code into a dynamic library: rdfaJava.dll (on Windows) or librdfaJava.so (on Linux)
+# The generated Java sources are written into a directory tree that matches
+# the Java package passed to SWIG, so they can be added as a source root.
+set(CMAKE_SWIG_FLAGS -package org.apache.any23.rdf.librdfa)
+set(CMAKE_SWIG_OUTDIR "${CMAKE_CURRENT_BINARY_DIR}/org/apache/any23/rdf/librdfa")
+set_property(SOURCE rdfa.i PROPERTY CPLUSPLUS ON)
+swig_add_module(
+    rdfaJava
+    java
+    rdfa.i
+    RdfaParser.cpp
+)
+
+# For convenience we copy the dynamic library to the current build folder
+add_custom_command(
+    TARGET rdfaJava
+    POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E copy_if_different $<TARGET_FILE:rdfaJava> ${CMAKE_CURRENT_BINARY_DIR}
+)
diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.cpp b/librdfa-rdf4j/src/main/c/RdfaParser.cpp
new file mode 100644
index 000000000..990756449
--- /dev/null
+++ b/librdfa-rdf4j/src/main/c/RdfaParser.cpp
@@ -0,0 +1,18 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "RdfaParser.h"
\ No newline at end of file
diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.h b/librdfa-rdf4j/src/main/c/RdfaParser.h
new file mode 100644
index 000000000..7a8f769af
--- /dev/null
+++ b/librdfa-rdf4j/src/main/c/RdfaParser.h
@@ -0,0 +1,121 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef _RDFA_PARSER_H_
+#define _RDFA_PARSER_H_
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+struct rdfacontext;
+
+/**
+ * Callback interface invoked by RdfaParser while parsing. Every method has a
+ * do-nothing default so that subclasses only override what they need.
+ */
+class Callback {
+public:
+
+    virtual ~Callback() {
+    }
+
+    /** Receives one triple of the default graph. The default does nothing. */
+    virtual void default_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) {
+    }
+
+    /** Receives one triple of the processor graph. The default does nothing. */
+    virtual void processor_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) {
+    }
+
+    /**
+     * Supplies the next chunk of input, at most buffer_length bytes.
+     * The default supplies nothing; a non-void function must return a value,
+     * otherwise calling it is undefined behaviour.
+     */
+    virtual char* fill_data(size_t buffer_length) {
+        return 0;
+    }
+
+    /**
+     * Reports how many bytes the last fill_data() call supplied.
+     * The default reports zero bytes.
+     */
+    virtual size_t fill_len() {
+        return 0;
+    }
+};
+
+/**
+ * The RdfaParser class is a wrapper class for Java to provide a
+ * simple API for using librdfa in Java.
+ *
+ * The parser owns the Callback installed through setCallback() and deletes it
+ * when it is replaced or when the parser is destroyed.
+ */
+class RdfaParser {
+private:
+    Callback *_callback;
+public:
+    /**
+     * The base URI that will be used when resolving relative pathnames
+     * in the document.
+     */
+    std::string mBaseUri;
+
+    /**
+     * The base RDFa context to use when setting the triple handler callback,
+     * buffer filler callback, and executing the parser call.
+     */
+    rdfacontext* mBaseContext;
+
+    /**
+     * Creates the librdfa context for the given base URI.
+     */
+    RdfaParser(const char* baseUri) : _callback(0) {
+        // Constructing a std::string from a null pointer is undefined behaviour.
+        mBaseUri = (baseUri != 0) ? baseUri : "";
+        mBaseContext = rdfa_create_context(baseUri);
+    }
+
+    /**
+     * Standard destructor.
+     */
+    ~RdfaParser() {
+        rdfa_free_context(mBaseContext);
+        delCallback();
+    }
+
+    /**
+     * Forwards a default-graph triple to the callback (if any) and then
+     * releases the triple.
+     */
+    void c_process_default_graph_triple(rdftriple* triple, void* callback_data) {
+        if (_callback != 0) {
+            _callback->default_graph(triple->subject, triple->predicate, triple->object,
+                    triple->object_type, triple->datatype, triple->language);
+        }
+        // The triple is always released, even when nobody is listening.
+        rdfa_free_triple(triple);
+    }
+
+    /**
+     * Forwards a processor-graph triple to the callback (if any) and then
+     * releases the triple.
+     */
+    void c_process_processor_graph_triple(rdftriple* triple, void* callback_data) {
+        if (_callback != 0) {
+            _callback->processor_graph(triple->subject, triple->predicate, triple->object,
+                    triple->object_type, triple->datatype, triple->language);
+        }
+        rdfa_free_triple(triple);
+    }
+
+    /**
+     * Fills the parser's input buffer with the next chunk obtained from the
+     * callback and returns the number of bytes written.
+     *
+     * Returns 0 when no callback is installed or the callback supplies no
+     * data. The amount copied never exceeds buffer_length.
+     */
+    size_t c_fill_buffer(char* buffer, size_t buffer_length, void* callback_data) {
+        if (_callback == 0) {
+            return 0;
+        }
+
+        char* data = _callback->fill_data(buffer_length);
+        size_t size = _callback->fill_len();
+        memset(buffer, ' ', buffer_length);
+
+        if (data == 0) {
+            return 0;
+        }
+        // Never write past the end of the destination buffer, whatever
+        // length the callback reports.
+        if (size > buffer_length) {
+            size = buffer_length;
+        }
+        memcpy(buffer, data, size);
+
+        return size;
+    }
+
+    /**
+     * Starts the parsing process for librdfa. When more data is
+     * required by the XML parser, the buffer filler callback is
+     * called. If triples are found, then the triple handler callback
+     * is called.
+     */
+    int parse() {
+        return rdfa_parse(mBaseContext);
+    }
+
+    /**
+     * Deletes the current callback, if any, and clears the pointer.
+     */
+    void delCallback() {
+        delete _callback;
+        _callback = 0;
+    }
+
+    /**
+     * Installs a new callback, deleting the previously installed one.
+     */
+    void setCallback(Callback *cb) {
+        delCallback();
+        _callback = cb;
+    }
+};
+
+#endif /* _RDFA_PARSER_H_ */
diff --git a/librdfa-rdf4j/src/main/c/rdfa.i b/librdfa-rdf4j/src/main/c/rdfa.i
new file mode 100644
index 000000000..68b8f1d23
--- /dev/null
+++ b/librdfa-rdf4j/src/main/c/rdfa.i
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements. See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// SWIG interface for the librdfa Java binding. Callback is exposed as a
+// director class so that it can be subclassed from Java.
+%module(directors="1") rdfa
+%feature("director") Callback;
+
+%{
+    #include "RdfaParser.h"
+
+    // Last parser on which init() was called. Kept only as a fallback for
+    // callbacks that arrive without callback_data; the callbacks below prefer
+    // the parser stored in the context so that several parsers can coexist.
+    RdfaParser* gRdfaParser = NULL;
+    void process_default_graph_triple(rdftriple* triple, void* callback_data);
+    void process_processor_graph_triple(rdftriple* triple, void* callback_data);
+    size_t fill_buffer(char* buffer, size_t buffer_length, void* callback_data);
+%}
+
+// Object type codes, re-exported so Java code can compare against them.
+%constant int RDF_TYPE_NAMESPACE_PREFIX = RDF_TYPE_NAMESPACE_PREFIX;
+%constant int RDF_TYPE_IRI = RDF_TYPE_IRI;
+%constant int RDF_TYPE_PLAIN_LITERAL = RDF_TYPE_PLAIN_LITERAL;
+%constant int RDF_TYPE_XML_LITERAL = RDF_TYPE_XML_LITERAL;
+%constant int RDF_TYPE_TYPED_LITERAL = RDF_TYPE_TYPED_LITERAL;
+
+%{
+    // Resolves the parser a callback belongs to: the one registered as
+    // callback_data by init(), or the global fallback when none was passed.
+    // NOTE(review): assumes librdfa hands rdfacontext::callback_data back to
+    // every callback -- confirm against the bundled rdfa.h.
+    static RdfaParser* rdfa_parser_for(void* callback_data){
+        return callback_data != NULL ? static_cast<RdfaParser*>(callback_data) : gRdfaParser;
+    }
+    void process_default_graph_triple(rdftriple* triple, void* callback_data){
+        RdfaParser* parser = rdfa_parser_for(callback_data);
+        if (parser != NULL) {
+            parser->c_process_default_graph_triple(triple, callback_data);
+        } else if (triple != NULL) {
+            // Nobody to deliver to: still release the triple.
+            rdfa_free_triple(triple);
+        }
+    }
+    void process_processor_graph_triple(rdftriple* triple, void* callback_data){
+        RdfaParser* parser = rdfa_parser_for(callback_data);
+        if (parser != NULL) {
+            parser->c_process_processor_graph_triple(triple, callback_data);
+        } else if (triple != NULL) {
+            rdfa_free_triple(triple);
+        }
+    }
+    size_t fill_buffer(char* buffer, size_t buffer_length, void* callback_data){
+        RdfaParser* parser = rdfa_parser_for(callback_data);
+        // Reporting 0 bytes tells the caller there is no more input.
+        return parser != NULL ? parser->c_fill_buffer(buffer, buffer_length, callback_data) : 0;
+    }
+%}
+
+%include RdfaParser.h
+
+%extend RdfaParser {
+    // Registers this parser with its librdfa context and installs the three
+    // C callbacks. Must be called before parse().
+    void init (){
+        gRdfaParser = self;
+        self->mBaseContext->callback_data = self;
+        rdfa_set_default_graph_triple_handler(self->mBaseContext, &process_default_graph_triple);
+        rdfa_set_processor_graph_triple_handler(self->mBaseContext, &process_processor_graph_triple);
+        rdfa_set_buffer_filler(self->mBaseContext, &fill_buffer);
+    }
+}
diff --git a/librdfa-rdf4j/src/main/c/readme.md b/librdfa-rdf4j/src/main/c/readme.md
new file mode 100644
index 000000000..f3538b428
--- /dev/null
+++ b/librdfa-rdf4j/src/main/c/readme.md
@@ -0,0 +1,8 @@
+# Librdfa Integration with Java
+
+Build:
+
+```sh
+mkdir build
+cd build
+cmake ..
+cmake --build .
+```
diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java
new file mode 100644
index 000000000..53cdf101f
--- /dev/null
+++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.rdf.rdfa;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.logging.Level;
+import java.util.logging.Logger;
+import org.apache.any23.rdf.librdfa.Callback;
+import org.apache.any23.rdf.librdfa.rdfa;
+import org.eclipse.rdf4j.model.IRI;
+import org.eclipse.rdf4j.model.Statement;
+import org.eclipse.rdf4j.model.Value;
+import org.eclipse.rdf4j.model.ValueFactory;
+import org.eclipse.rdf4j.model.vocabulary.RDF;
+import org.eclipse.rdf4j.rio.RDFHandler;
+
+/**
+ * Bridges the native librdfa callbacks to an RDF4J {@link RDFHandler}.
+ *
+ * <p>The native parser pulls its input through {@link #fill_data(long)} and
+ * {@link #fill_len()} and pushes triples back through
+ * {@link #default_graph} and {@link #processor_graph}. Default-graph triples
+ * become RDF4J statements; namespace prefixes from the processor graph are
+ * reported through {@link RDFHandler#handleNamespace(String, String)}.</p>
+ *
+ * <p>Instances are stateful and not thread-safe.</p>
+ *
+ * @author Julio Caguano
+ */
+public class LibrdfaFilter extends Callback {
+
+    private static final Logger LOG = Logger.getLogger(LibrdfaFilter.class.getName());
+
+    /** Label prefix treated as a blank node. NOTE(review): presumably what librdfa emits; confirm. */
+    private static final String BNODE_PREFIX = "_:";
+
+    /**
+     * Upper bound of bytes one Java {@code char} occupies in modified UTF-8.
+     * NOTE(review): assumes the SWIG wrapper hands the returned String to
+     * native code via JNI {@code GetStringUTFChars}; confirm in the generated code.
+     */
+    private static final int MAX_BYTES_PER_CHAR = 3;
+
+    private final BufferedReader bis;
+    /** Encoded byte length of the chunk returned by the last {@link #fill_data(long)} call. */
+    private int len = 0;
+    private RDFHandler handler;
+    private ValueFactory valueFactory;
+
+    /**
+     * Creates a filter reading the document from a byte stream, decoded as UTF-8.
+     *
+     * @param is the document source.
+     */
+    public LibrdfaFilter(InputStream is) {
+        super();
+        // Explicit charset: the platform default would make parsing machine dependent.
+        bis = new BufferedReader(new InputStreamReader(is, java.nio.charset.StandardCharsets.UTF_8));
+    }
+
+    /**
+     * Creates a filter reading the document from a character stream.
+     *
+     * @param reader the document source.
+     */
+    public LibrdfaFilter(Reader reader) {
+        super();
+        bis = new BufferedReader(reader);
+    }
+
+    /**
+     * Receives one default-graph triple and forwards it to the handler as a statement.
+     * Triples whose object cannot be mapped, or that arrive while no handler is
+     * set, are logged and dropped.
+     */
+    @Override
+    public void default_graph(String subject, String predicate, String object, int object_type, String datatype, String language) {
+        org.eclipse.rdf4j.model.Resource s = toResource(subject);
+        IRI p = valueFactory.createIRI(predicate);
+        Value o = toObject(object, object_type, datatype, language);
+
+        if (handler != null && o != null) {
+            Statement stmt = valueFactory.createStatement(s, p, o);
+            handler.handleStatement(stmt);
+        } else if (LOG.isLoggable(Level.WARNING)) {
+            LOG.warning("VALIDATE: S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + "DT=" + datatype + "LANG=" + language);
+        }
+    }
+
+    /**
+     * Receives one processor-graph triple. Namespace prefix declarations are
+     * forwarded to the handler; everything else is only logged at debug level
+     * so that it never pollutes standard output.
+     */
+    @Override
+    public void processor_graph(String subject, String predicate, String object, int object_type, String datatype, String language) {
+        if (handler != null && rdfa.RDF_TYPE_NAMESPACE_PREFIX == object_type) { // 0
+            handler.handleNamespace(predicate, object);
+        } else if (LOG.isLoggable(Level.FINE)) {
+            LOG.fine("Processor: S=" + subject + "\tP=" + predicate + "\tO=" + object + "\tOT=" + object_type + "\tDT:" + datatype + "\tLANG=" + language);
+        }
+    }
+
+    /**
+     * Reads the next chunk of the document.
+     *
+     * <p>At most {@code buffer_length / 3} characters are read so that the
+     * encoded chunk always fits into {@code buffer_length} bytes, and the
+     * encoded byte length is remembered for {@link #fill_len()}.</p>
+     *
+     * @param buffer_length capacity of the native buffer in bytes.
+     * @return the characters read, or an empty string at end of input or on I/O failure.
+     */
+    @Override
+    public String fill_data(long buffer_length) {
+        // Reset first: a failed or empty read must never report the previous length.
+        len = 0;
+
+        int maxChars = (int) Math.min(Integer.MAX_VALUE, buffer_length / MAX_BYTES_PER_CHAR);
+        if (maxChars <= 0) {
+            return "";
+        }
+
+        char[] chunk = new char[maxChars];
+        int read;
+        try {
+            read = bis.read(chunk, 0, maxChars);
+        } catch (IOException ex) {
+            // The callback signature cannot propagate the failure; treat it as end of input.
+            LOG.log(Level.SEVERE, null, ex);
+            return "";
+        }
+        if (read <= 0) {
+            return "";
+        }
+
+        len = encodedLength(chunk, read);
+        return new String(chunk, 0, read);
+    }
+
+    /**
+     * @return the byte length of the chunk produced by the last
+     *         {@link #fill_data(long)} call; 0 at end of input.
+     */
+    @Override
+    public long fill_len() {
+        if (len == -1) {
+            return 0;
+        }
+        return len;
+    }
+
+    public RDFHandler getHandler() {
+        return handler;
+    }
+
+    public void setHandler(RDFHandler handler) {
+        this.handler = handler;
+    }
+
+    public ValueFactory getValueFactory() {
+        return valueFactory;
+    }
+
+    public void setValueFactory(ValueFactory valueFactory) {
+        this.valueFactory = valueFactory;
+    }
+
+    /** Maps a subject/object label to a blank node ({@code _:id}) or an IRI. */
+    private org.eclipse.rdf4j.model.Resource toResource(String label) {
+        if (label.startsWith(BNODE_PREFIX)) {
+            return valueFactory.createBNode(label.substring(BNODE_PREFIX.length()));
+        }
+        return valueFactory.createIRI(label);
+    }
+
+    /** Builds the object value for the given object type, or {@code null} when it cannot be mapped. */
+    private Value toObject(String object, int objectType, String datatype, String language) {
+        if (object == null) {
+            return null;
+        }
+        if (objectType == rdfa.RDF_TYPE_IRI) { // 1
+            return toResource(object);
+        }
+        if (objectType == rdfa.RDF_TYPE_PLAIN_LITERAL) { // 2
+            return plainLiteral(object, language);
+        }
+        if (objectType == rdfa.RDF_TYPE_XML_LITERAL) { // 3
+            return valueFactory.createLiteral(object, RDF.XMLLITERAL);
+        }
+        if (objectType == rdfa.RDF_TYPE_TYPED_LITERAL) { // 4
+            if (datatype != null) {
+                return valueFactory.createLiteral(object, valueFactory.createIRI(datatype));
+            }
+            return plainLiteral(object, language);
+        }
+        return null;
+    }
+
+    /** Creates a language-tagged literal when a tag is present, a simple literal otherwise. */
+    private Value plainLiteral(String label, String language) {
+        if (language != null && !language.isEmpty()) {
+            return valueFactory.createLiteral(label, language);
+        }
+        return valueFactory.createLiteral(label);
+    }
+
+    /** Computes how many bytes the first {@code count} chars occupy in modified UTF-8. */
+    private static int encodedLength(char[] chars, int count) {
+        int bytes = 0;
+        for (int i = 0; i < count; i++) {
+            char c = chars[i];
+            if (c >= 0x0001 && c <= 0x007F) {
+                bytes += 1;
+            } else if (c <= 0x07FF) {
+                bytes += 2; // includes U+0000, which modified UTF-8 encodes in two bytes
+            } else {
+                bytes += 3; // surrogates are encoded individually
+            }
+        }
+        return bytes;
+    }
+
+}
diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java
new file mode 100644
index 000000000..cff321e74
--- /dev/null
+++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.rdf.rdfa;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import org.apache.any23.rdf.librdfa.RdfaParser;
+import org.apache.any23.rdf.rdfa.utils.LibraryLoader;
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.RDFHandlerException;
+import org.eclipse.rdf4j.rio.RDFParseException;
+import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser;
+
+/**
+ * RDF4J parser for RDFa documents backed by the native librdfa library.
+ *
+ * <p>The native library {@code rdfaJava} is loaded once when this class is
+ * initialised. Each {@code parse} call creates its own native parser and
+ * {@link LibrdfaFilter}, and releases both when it returns, whether parsing
+ * succeeded or not.</p>
+ *
+ * @author Julio Caguano
+ */
+public class LibrdfaRDFaParser extends AbstractRDFParser {
+
+    static {
+        try {
+            LibraryLoader.loadLibrary("rdfaJava");
+        } catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+    }
+
+    @Override
+    public RDFFormat getRDFFormat() {
+        return RDFFormat.RDFA;
+    }
+
+    /**
+     * Parses an RDFa document from a byte stream.
+     *
+     * @param in      the document; must not be {@code null}.
+     * @param baseURI the base URI used to resolve relative references; must not be {@code null}.
+     * @throws IllegalArgumentException if either argument is {@code null}.
+     */
+    @Override
+    public void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
+        if (in == null) {
+            throw new IllegalArgumentException("Input stream cannot be 'null'");
+        }
+        if (baseURI == null) {
+            throw new IllegalArgumentException("Base URI cannot be 'null'");
+        }
+
+        parse(new LibrdfaFilter(in), baseURI);
+    }
+
+    /**
+     * Parses an RDFa document from a character stream.
+     *
+     * @param reader  the document; must not be {@code null}.
+     * @param baseURI the base URI used to resolve relative references; must not be {@code null}.
+     * @throws IllegalArgumentException if either argument is {@code null}.
+     */
+    @Override
+    public void parse(Reader reader, String baseURI) throws IOException, RDFParseException, RDFHandlerException {
+        if (reader == null) {
+            throw new IllegalArgumentException("Reader cannot be 'null'");
+        }
+        if (baseURI == null) {
+            throw new IllegalArgumentException("Base URI cannot be 'null'");
+        }
+
+        parse(new LibrdfaFilter(reader), baseURI);
+    }
+
+    /**
+     * Runs the native parser with the given filter and always releases the
+     * native parser and the filter afterwards, so both entry points clean up
+     * the same way.
+     */
+    private void parse(LibrdfaFilter filter, String baseURI) throws RDFHandlerException {
+        RdfaParser parser = null;
+        try {
+            parser = new RdfaParser(baseURI);
+            parser.init();
+            parser.setCallback(filter);
+
+            filter.setHandler(rdfHandler);
+            filter.setValueFactory(valueFactory);
+
+            // Handlers such as RDF writers require start/end notifications around the statements.
+            if (rdfHandler != null) {
+                rdfHandler.startRDF();
+            }
+            parser.parse();
+            if (rdfHandler != null) {
+                rdfHandler.endRDF();
+            }
+        } finally {
+            // Same order the original entry points used: detach the callback
+            // first, then release the Java proxies.
+            if (parser != null) {
+                parser.delCallback();
+            }
+            filter.delete();
+            if (parser != null) {
+                parser.delete();
+            }
+        }
+    }
+
+}
diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java
new file mode 100644
index 000000000..996a61c4f
--- /dev/null
+++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.rdf.rdfa;
+
+import org.eclipse.rdf4j.rio.RDFFormat;
+import org.eclipse.rdf4j.rio.RDFParser;
+import org.eclipse.rdf4j.rio.RDFParserFactory;
+
+/**
+ * {@link RDFParserFactory} that makes the librdfa-backed
+ * {@link LibrdfaRDFaParser} available to RDF4J for the RDFa format.
+ *
+ * @author Julio Caguano
+ */
+public class LibrdfaRDFaParserFactory implements RDFParserFactory {
+
+    /**
+     * @return a new {@link LibrdfaRDFaParser} on every call.
+     */
+    @Override
+    public RDFParser getParser() {
+        final RDFParser librdfaParser = new LibrdfaRDFaParser();
+        return librdfaParser;
+    }
+
+    /**
+     * @return {@link RDFFormat#RDFA}, the only format this factory serves.
+     */
+    @Override
+    public RDFFormat getRDFFormat() {
+        final RDFFormat supportedFormat = RDFFormat.RDFA;
+        return supportedFormat;
+    }
+
+}
diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java
new file mode 100644
index 000000000..29cbe1049
--- /dev/null
+++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.rdf.rdfa.utils;
+
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+/**
+ * Loads native libraries, falling back to a copy bundled on the classpath.
+ *
+ * @author Julio Caguano
+ *
+ */
+public final class LibraryLoader {
+
+    private static final int BUFFER_SIZE = 4096;
+
+    /** {@link File#createTempFile(String, String)} rejects prefixes shorter than this. */
+    private static final int MIN_PREFIX_LENGTH = 3;
+
+    private LibraryLoader() {
+        // static utility, not meant to be instantiated
+    }
+
+    /**
+     * Loads the named native library.
+     *
+     * <p>{@link System#loadLibrary(String)} is tried first. If that fails, the
+     * platform-specific file name is looked up as a classpath resource, copied
+     * to a temporary file (deleted on JVM exit) and loaded from there.</p>
+     *
+     * @param name the platform-independent library name, e.g. {@code "rdfaJava"}.
+     * @throws IOException if the library is not on the classpath either, or
+     *                     cannot be copied to a temporary file; the original
+     *                     {@link UnsatisfiedLinkError} is attached as the cause
+     *                     when the resource is missing.
+     */
+    public static void loadLibrary(String name) throws IOException {
+        try {
+            System.loadLibrary(name);
+        } catch (UnsatisfiedLinkError e) {
+            String filename = System.mapLibraryName(name);
+            File file = extractToTempFile(filename, e);
+            System.load(file.getAbsolutePath());
+        }
+    }
+
+    /** Copies the classpath resource {@code filename} into a fresh temporary file. */
+    private static File extractToTempFile(String filename, UnsatisfiedLinkError cause) throws IOException {
+        try (InputStream in = LibraryLoader.class.getClassLoader().getResourceAsStream(filename)) {
+            if (in == null) {
+                throw new IOException("Native library '" + filename
+                        + "' was found neither on java.library.path nor on the classpath", cause);
+            }
+
+            int pos = filename.lastIndexOf('.');
+            String prefix = pos > 0 ? filename.substring(0, pos) : filename;
+            String suffix = pos > 0 ? filename.substring(pos) : null;
+            while (prefix.length() < MIN_PREFIX_LENGTH) {
+                prefix += "_";
+            }
+
+            File file = File.createTempFile(prefix, suffix);
+            file.deleteOnExit();
+
+            try (OutputStream out = new FileOutputStream(file)) {
+                byte[] buf = new byte[BUFFER_SIZE];
+                int len;
+                // Read until end of stream; available() may report 0 before the data is exhausted.
+                while ((len = in.read(buf)) != -1) {
+                    out.write(buf, 0, len);
+                }
+            }
+            return file;
+        }
+    }
+
+}
diff --git a/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory b/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory
new file mode 100644
index 000000000..463600e9b
--- /dev/null
+++ b/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory
@@ -0,0 +1 @@
+org.apache.any23.rdf.rdfa.LibrdfaRDFaParserFactory
diff --git a/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java
new file mode 100644
index 000000000..c37f3d5c1
--- /dev/null
+++ b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.rdf.librdfa;
+
+import com.carrotsearch.junitbenchmarks.AbstractBenchmark;
+import com.carrotsearch.junitbenchmarks.BenchmarkOptions;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser;
+import org.eclipse.rdf4j.rio.RDFParser;
+import org.eclipse.rdf4j.rio.helpers.StatementCollector;
+import static org.junit.Assert.assertEquals;
+import org.junit.Before;
+import org.junit.Test;
+import org.semarglproject.rdf4j.rdf.rdfa.RDF4JRDFaParser;
+
+/**
+ * Benchmarks the librdfa-backed parser against the Semargl RDFa parser on a
+ * generated document that carries one RDFa statement per iteration.
+ *
+ * @author Julio Caguano
+ */
+@BenchmarkOptions(callgc = false, benchmarkRounds = 20, warmupRounds = 0)
+public class LibrdfaRDFaBenchmarkTest extends AbstractBenchmark {
+
+    /** Number of RDFa-annotated elements, and therefore expected statements. */
+    private static final int ITERATIONS = 2000;
+
+    private String document = "";
+
+    /**
+     * Builds the benchmark document.
+     * NOTE(review): the markup inside the original string literals was lost,
+     * so the HTML below is a reconstruction that yields exactly one triple
+     * per element; confirm against the intended fixture.
+     */
+    @Before
+    public void init() {
+        StringBuilder html = new StringBuilder();
+        html.append("<!DOCTYPE html>\n")
+                .append("<html>\n")
+                .append("<head>\n")
+                .append("<title>Speed Test</title>\n")
+                .append("</head>\n")
+                .append("<body>\n");
+        for (int i = 0; i < ITERATIONS; i++) {
+            html.append("<span about=\"http://example.org/item/").append(i)
+                    .append("\" property=\"http://purl.org/dc/terms/title\" content=\"Item ")
+                    .append(i).append("\"></span>\n");
+        }
+        html.append("</body>\n")
+                .append("</html>\n");
+        document = html.toString();
+    }
+
+    @Test
+    public void testSemargl() throws Exception {
+        runTest(new RDF4JRDFaParser());
+    }
+
+    @Test
+    public void testLibrdfa() throws IOException {
+        runTest(new LibrdfaRDFaParser());
+    }
+
+    /** Parses the generated document and checks that every element produced one statement. */
+    private void runTest(RDFParser parser) throws IOException {
+        InputStream in = new ByteArrayInputStream(document.getBytes(StandardCharsets.UTF_8));
+        StatementCollector sc = new StatementCollector();
+        parser.setRDFHandler(sc);
+        parser.parse(in, "http://example.org/");
+        assertEquals(ITERATIONS, sc.getStatements().size());
+    }
+}
diff --git a/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java
new file mode 100644
index 000000000..7974c4919
--- /dev/null
+++ b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java
@@ -0,0 +1,63 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.rdf.librdfa;
+
+import java.io.IOException;
+import java.io.InputStream;
+import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser;
+import org.eclipse.rdf4j.model.ValueFactory;
+import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
+import org.eclipse.rdf4j.rio.RDFParser;
+import org.eclipse.rdf4j.rio.helpers.ParseErrorCollector;
+import org.eclipse.rdf4j.rio.helpers.StatementCollector;
+import static org.junit.Assert.assertEquals;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * Smoke test for {@link LibrdfaRDFaParser}: parses a small HTML fixture and
+ * checks the number of statements reported to the handler.
+ *
+ * @author Julio Caguano
+ *
+ */
+public class LibrdfaRDFaParserTest {
+
+    private RDFParser parser;
+    private StatementCollector sc;
+
+    /** Creates a fresh parser wired to a fresh statement collector for every test. */
+    @Before
+    public void setUp() throws Exception {
+        parser = new LibrdfaRDFaParser();
+        sc = new StatementCollector();
+        parser.setRDFHandler(sc);
+    }
+
+    /** The fixture {@code site.html} is expected to yield four statements. */
+    @Test
+    public void testHtml() throws IOException {
+        try (final InputStream in = this.getClass().getResourceAsStream(
+                "/org/apache/any23/rdf/librdfa/site.html")) {
+            parser.parse(in, "http://example.org/");
+            assertEquals(4, sc.getStatements().size());
+        }
+    }
+
+}
diff --git a/librdfa-rdf4j/src/test/resources/log4j.properties b/librdfa-rdf4j/src/test/resources/log4j.properties
new file mode 100644
index 000000000..32492dd43
--- /dev/null
+++ b/librdfa-rdf4j/src/test/resources/log4j.properties
@@ -0,0 +1,35 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Root logger: INFO and above, sent to the console appender "O".
+log4j.rootCategory=INFO, O
+
+# Stdout
+log4j.appender.O=org.apache.log4j.ConsoleAppender
+log4j.appender.O.layout=org.apache.log4j.PatternLayout
+log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
+
+# File (disabled). To enable, uncomment every "R" line below and add R to
+# log4j.rootCategory above.
+#log4j.appender.R=org.apache.log4j.RollingFileAppender
+#log4j.appender.R.File=log4j.log
+
+# Control the maximum log file size
+#log4j.appender.R.MaxFileSize=100KB
+
+# Archive log files (one backup file here)
+#log4j.appender.R.MaxBackupIndex=1
+
+#log4j.appender.R.layout=org.apache.log4j.PatternLayout
+#log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n
diff --git a/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html b/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html
new file mode 100644
index 000000000..b80aca1a8
--- /dev/null
+++ b/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html
@@ -0,0 +1,17 @@
+
+
+
+
+ Test
+
+
+
+
+ Julio Caguano
+ Julio Caguano
+
+
+
+
\ No newline at end of file
diff --git a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
index e8e4505dc..7d97b2f21 100644
--- a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
+++ b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java
@@ -17,22 +17,20 @@
package org.apache.any23.plugin;
-import org.apache.any23.cli.Crawler;
-import org.apache.any23.cli.Tool;
-import org.apache.any23.extractor.ExtractorGroup;
-import org.apache.any23.extractor.ExtractorRegistryImpl;
-import org.junit.After;
-import org.junit.Before;
-import org.junit.Test;
-
import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
-
+import org.apache.any23.cli.Crawler;
+import org.apache.any23.cli.Tool;
+import org.apache.any23.extractor.ExtractorGroup;
+import org.apache.any23.extractor.ExtractorRegistryImpl;
+import org.junit.After;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
+import org.junit.Before;
+import org.junit.Test;
/**
* Integration test for plugins.
@@ -41,7 +39,7 @@
*/
public class PluginIT {
- private static final int NUM_OF_EXTRACTORS_INCL_OPENIE = 34;
+ private static final int NUM_OF_EXTRACTORS_INCL_OPENIE = 35;
private static final int NUM_OF_EXTRACTORS_EXCL_OPENIE = 33;
diff --git a/pom.xml b/pom.xml
index ce2ee5d17..e5d1038e1 100644
--- a/pom.xml
+++ b/pom.xml
@@ -232,6 +232,7 @@
csvutilsmimeencoding
+ librdfa-rdf4jcorecliplugins/basic-crawler
diff --git a/test-resources/src/test/resources/html/rdfa/basic.html b/test-resources/src/test/resources/html/rdfa/basic.html
index 542b88b73..f9ffab285 100644
--- a/test-resources/src/test/resources/html/rdfa/basic.html
+++ b/test-resources/src/test/resources/html/rdfa/basic.html
@@ -14,7 +14,7 @@
See the License for the specific language governing permissions and
limitations under the License.
-->
-
+