diff --git a/.gitignore b/.gitignore index c0c5d24b3..63de6b912 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ pom.xml.versionsBackup **/maven-eclipse.xml **/any23-site/ **/nb*.xml +**/c/build/ diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties index 4f68586d3..3d211ff7e 100644 --- a/api/src/main/resources/default-configuration.properties +++ b/api/src/main/resources/default-configuration.properties @@ -43,10 +43,18 @@ any23.extraction.metadata.domain.per.entity=off # Allows to decide which RDFa Extractor to enable. # If 'on' will be activated the programmatic RDFa 1.1 Extractor -# (org.deri.any23.extractor.rdfa.RDFa11Extractor) otherwise will be -# registered the RDFa 1.0 legacy one (org.deri.any23.extractor.rdfa.RDFaExtractor). +# (org.apache.any23.extractor.rdfa.RDFa11Extractor) otherwise will be +# registered the RDFa 1.0 legacy one (org.apache.any23.extractor.rdfa.RDFaExtractor). any23.extraction.rdfa.programmatic=on +# Enables the librdfa extractor. +# If 'on', it overrides the extractors selected by the programmatic option: +# the RDFa 1.1 Extractor (org.apache.any23.extractor.rdfa.RDFa11Extractor) and +# the RDFa 1.0 Extractor (org.apache.any23.extractor.rdfa.RDFaExtractor). +# If 'off' (the default), the extractor specified by the programmatic option +# (any23.extraction.rdfa.programmatic) is used. +any23.extraction.rdfa.librdfa=off + # The extraction context IRI to be used by the # SingleDocumentExtraction. If == '?' the document IRI will # be used. 
It can be overriden by specifying a different diff --git a/core/pom.xml b/core/pom.xml index 49a1bfcb9..b066d55f9 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -62,6 +62,14 @@ + + + ${project.groupId} + apache-any23-librdfa + ${project.version} + + + org.apache.httpcomponents @@ -347,7 +355,7 @@ - + diff --git a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java index ca3bb982b..4482e5a8e 100644 --- a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java +++ b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java @@ -17,15 +17,15 @@ package org.apache.any23.extractor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import org.apache.any23.configuration.DefaultConfiguration; import org.apache.any23.extractor.html.HTMLMetaExtractorFactory; +import org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory; import org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory; import org.apache.any23.extractor.rdfa.RDFaExtractorFactory; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - /** * Singleton class acting as a register for all the various * {@link Extractor}. 
@@ -55,12 +55,17 @@ public static ExtractorRegistry getInstance() { if (instance == null) { instance = new ExtractorRegistryImpl(); - if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) { + if(conf.getFlagProperty("any23.extraction.rdfa.librdfa")){ + instance.unregister(RDFaExtractorFactory.NAME); + instance.unregister(RDFa11ExtractorFactory.NAME); + } else if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) { + instance.unregister(LibRdfaExtractorFactory.NAME); instance.unregister(RDFaExtractorFactory.NAME); // FIXME: Unregister RDFaExtractor if flag is not set //instance.register(RDFa11Extractor.factory); } else { instance.unregister(RDFa11ExtractorFactory.NAME); + instance.unregister(LibRdfaExtractorFactory.NAME); // FIXME: Unregister RDFaExtractor if flag is set //instance.register(RDFaExtractor.factory); } diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java index 6b4406a5b..e2618ffce 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java @@ -17,10 +17,16 @@ package org.apache.any23.extractor.rdf; -import org.apache.any23.extractor.IssueReport; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Collections; +import java.util.HashSet; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.IssueReport; import org.apache.any23.rdf.Any23ValueFactoryWrapper; +import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.rio.ParseErrorListener; import org.eclipse.rdf4j.rio.RDFFormat; @@ -36,12 +42,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.io.InputStream; -import 
java.io.Reader; -import java.util.Collections; -import java.util.HashSet; - /** * This factory provides a common logic for creating and configuring correctly * any RDF parser used within the library. @@ -124,6 +124,27 @@ public RDFParser getRDFa11Parser( configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult); return parser; } + + /** + * Returns a new, configured {@link LibrdfaRDFaParser} (RDFa parsing through the librdfa library) with RDFa 1.1 compatibility set. + * + * @param verifyDataType data verification enable if true. + * @param stopAtFirstError the parser stops at first error if true. + * @param extractionContext the extraction context where the parser is used. + * @param extractionResult the output extraction result. + * @return a new instance of a configured librdfa-backed RDFa parser. + */ + public RDFParser getRDFaLibrdfaParser( + final boolean verifyDataType, + final boolean stopAtFirstError, + final ExtractionContext extractionContext, + final ExtractionResult extractionResult + ) { + final RDFParser parser = new LibrdfaRDFaParser(); + parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1); + configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult); + return parser; + } /** * Returns a new instance of a configured RDFXMLParser. diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java new file mode 100644 index 000000000..e1d598a28 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.extractor.rdfa; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.rdf.BaseRDFExtractor; +import org.apache.any23.extractor.rdf.RDFParserFactory; +import org.eclipse.rdf4j.rio.RDFParser; + +/** + * {@link BaseRDFExtractor} whose parser is obtained from {@link RDFParserFactory#getRDFaLibrdfaParser}; the no-argument constructor passes {@code false} for both verifyDataType and stopAtFirstError, and the description is the shared {@link LibRdfaExtractorFactory} instance. + * @author Julio Caguano + */ +public class LibRdfaExtractor extends BaseRDFExtractor { + + public LibRdfaExtractor(boolean verifyDataType, boolean stopAtFirstError) { + super(verifyDataType, stopAtFirstError); + } + + public LibRdfaExtractor() { + this(false, false); + } + + @Override + protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) { + return RDFParserFactory.getInstance().getRDFaLibrdfaParser( + isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult + ); + } + + @Override + public ExtractorDescription getDescription() { + return LibRdfaExtractorFactory.getDescriptionInstance(); + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java new file mode 100644 index 000000000..6d1d51e8c --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed 
to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.extractor.rdfa; + +import java.util.Arrays; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +/** + * Factory that creates {@link LibRdfaExtractor} instances under the extractor name {@value #NAME}; one shared factory instance doubles as the extractor description returned by {@link #getDescriptionInstance()}. + * @author Julio Caguano + */ +public class LibRdfaExtractorFactory extends SimpleExtractorFactory + implements ExtractorFactory { + + public static final String NAME = "html-librdfa"; + public static final Prefixes PREFIXES = null; + + private static final ExtractorDescription descriptionInstance = new LibRdfaExtractorFactory(); + + public LibRdfaExtractorFactory() { + super(LibRdfaExtractorFactory.NAME, + LibRdfaExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"), + "example-rdfa11.html"); + } + + @Override + public LibRdfaExtractor createExtractor() { + return new LibRdfaExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} diff --git a/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory 
b/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory index 2b1df7996..7303bcbba 100644 --- a/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory +++ b/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory @@ -27,5 +27,6 @@ org.apache.any23.extractor.rdf.TriXExtractorFactory org.apache.any23.extractor.rdf.TurtleExtractorFactory org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory org.apache.any23.extractor.rdfa.RDFaExtractorFactory +org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory org.apache.any23.extractor.xpath.XPathExtractorFactory org.apache.any23.extractor.yaml.YAMLExtractorFactory \ No newline at end of file diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java new file mode 100644 index 000000000..d0572c4d5 --- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java @@ -0,0 +1,335 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.extractor.rdfa; + +import java.io.IOException; +import java.util.List; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractorFactory; +import static org.apache.any23.extractor.rdfa.AbstractRDFaExtractorTestCase.vFOAF; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.vocab.FOAF; +import org.apache.any23.vocab.OGP; +import org.apache.any23.vocab.OGPMusic; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.repository.RepositoryException; +import org.eclipse.rdf4j.repository.RepositoryResult; +import org.eclipse.rdf4j.rio.RDFHandlerException; +import org.eclipse.rdf4j.rio.RDFParseException; +import org.junit.Assert; +import org.junit.Test; + +/** + * Reference Test Class for {@link LibRdfaExtractor}. + * @author Julio Caguano + */ +public class RDFaLibrdfaExtractorTest extends AbstractRDFaExtractorTestCase { + + /** + * Taken from the + * GoodRelations + * test cases. It checks if the extraction is the same when the + * namespaces are defined in RDFa1.0 or + * RDFa1.1 respectively. 
+ * + * @throws org.eclipse.rdf4j.repository.RepositoryException + * @throws java.io.IOException + * @throws org.eclipse.rdf4j.rio.RDFHandlerException + * @throws org.eclipse.rdf4j.rio.RDFParseException + */ + @Test + public void testRDFa11PrefixBackwardCompatibility() + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + final int EXPECTED_STATEMENTS = 31; + + assertExtract("/html/rdfa/goodrelations-rdfa10.html"); + logger.debug("Model 1 " + dumpHumanReadableTriples()); + Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size()); + List<Statement> rdfa10Stmts = dumpAsListOfStatements(); + + //assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); + assertExtract("/html/rdfa/goodrelations-rdfa11.html"); + logger.debug("Model 2 " + dumpHumanReadableTriples()); + Assert.assertTrue(dumpAsListOfStatements().size() >= EXPECTED_STATEMENTS); + + for (Statement stmt : rdfa10Stmts) { + assertContains(stmt); + } + } + + /** + * This test verifies the correct object resource conversion. + * + * @throws RepositoryException + */ + @Test + public void testObjectResourceConversion() throws RepositoryException { + assertExtract("/html/rdfa/object-resource-test.html"); + logger.debug(dumpModelToTurtle()); + assertContains( + null, + FOAF.getInstance().page, + RDFUtils.iri("http://en.wikipedia.org/New_York") + ); + } + + /** + * This test checks the behavior of the RDFa extraction where the + * datatype of a property is explicitly set. For details see the + * RDFa in XHTML: Syntax and + * Processing + * recommendation. 
+ * + * @throws RepositoryException + */ + @Test + public void testExplicitDatatypeDeclaration() throws RepositoryException { + assertExtract("/html/rdfa/xmlliteral-datatype-test.html"); + logger.debug(dumpModelToTurtle()); + + RepositoryResult<Statement> stmts + = conn.getStatements(RDFUtils.iri("http://dbpedia.org/resource/Albert_Einstein"), + vFOAF.name, null, false); + Assert.assertTrue(stmts.hasNext()); + Value obj = stmts.next().getObject(); + Assert.assertTrue(obj instanceof Literal); + Literal lit = (Literal) obj; + Assert.assertEquals(RDF.XMLLITERAL, lit.getDatatype()); /* JUnit order: expected first, actual second */ + Assert.assertEquals("Albert " + + "Einstein", lit.getLabel()); + } + + /** + * Tests the correct behavior of REL and HREF. + * + * @throws RepositoryException + */ + @Test + public void testRelWithHref() throws RepositoryException { + assertExtract("/html/rdfa/rel-href.html"); + logger.debug(dumpModelToTurtle()); + + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + FOAF.getInstance().name, + "John Doe" + ); + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + FOAF.getInstance().homepage, + RDFUtils.iri("http://example.org/blog/") + ); + } + + /** + * This test verifies the correct REL/REV attribute usage. + * + * @throws RepositoryException + */ + @Test + public void testRelRevSupport() throws RepositoryException { + assertExtract("/html/rdfa/rel-rev.html"); + logger.debug(dumpModelToTurtle()); + + assertContains( + baseIRI, + RDFUtils.iri("http://bob.example.com/cite"), + RDFUtils.iri("http://www.example.com/books/the_two_towers") + ); + assertContains( + RDFUtils.iri("http://path/to/chapter"), + RDFUtils.iri("http://bob.example.com/isChapterOf"), + baseIRI + ); + } + + /** + * Tests the @vocab support. 
+ * + * @throws RepositoryException + */ + @Test + public void testVocabSupport() throws RepositoryException { + assertExtract("/html/rdfa/vocab.html"); + logger.debug(dumpModelToTurtle()); + + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + RDFUtils.iri("http://xmlns.com/foaf/0.1/name"), + RDFUtils.literal("John Doe") + ); + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + RDFUtils.iri("http://xmlns.com/foaf/0.1/homepage"), + RDFUtils.iri("http://example.org/blog/") + ); + } + + /** + * Tests the correct support of alternate + * Open Graph Protocol Object Types + * + * @throws IOException + * @throws org.apache.any23.extractor.ExtractionException + * @throws RepositoryException + */ + @Test + public void testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException { + assertExtract("/html/rdfa/opengraph-music-song-object-type.html"); + logger.info(dumpHumanReadableTriples()); + + Assert.assertEquals(9, getStatementsSize(null, null, null)); + final OGPMusic vOGPMusic = OGPMusic.getInstance(); + assertContains(baseIRI, vOGPMusic.musicDuration, RDFUtils.literal("447")); + assertContains( + baseIRI, + vOGPMusic.musicMusician, + RDFUtils.literal( + "Jono Grant / Tony McGuinness / Ashley Tomberlin" + ) + ); + assertContains(baseIRI, vOGPMusic.musicAlbum, RDFUtils.literal("Tri-State")); + } + + /** + * Taken from the + * GoodRelations + * test cases. It checks if the extraction is the same when the + * namespaces are defined in RDFa1.0. 
+ * + * @throws RepositoryException + * @throws java.io.IOException + * @throws org.eclipse.rdf4j.rio.RDFHandlerException + * @throws org.eclipse.rdf4j.rio.RDFParseException + */ + @Test + public void testRDFa10Extraction() + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + final int EXPECTED_STATEMENTS = 31; + + assertExtract("/html/rdfa/goodrelations-rdfa10.html"); + logger.debug(dumpModelToNQuads()); + + Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size()); + assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); + } + + /** + * Taken from the + * GoodRelations + * test cases. It checks if the extraction is the same when the + * namespaces are defined in RDFa1.1. + * + * @throws RepositoryException + * @throws java.io.IOException + * @throws org.eclipse.rdf4j.rio.RDFHandlerException + * @throws org.eclipse.rdf4j.rio.RDFParseException + */ + @Test + public void testRDFa11Extraction() + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + final int EXPECTED_STATEMENTS = 31; + + assertExtract("/html/rdfa/goodrelations-rdfa11.html"); + logger.debug(dumpHumanReadableTriples()); + + Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size()); + assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); + } + + /** + * Tests the correct support of Open Graph + * Protocol's + * Basic Metadata, + * Optional Metadata, + * Structured Properties and + * Arrays. 
+ * + * @throws IOException + * @throws org.apache.any23.extractor.ExtractionException + * @throws RepositoryException + */ + @Test + public void testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException { + assertExtract("/html/rdfa/opengraph-structured-properties.html"); + logger.info(dumpHumanReadableTriples()); + + Assert.assertEquals(31, getStatementsSize(null, null, null)); + final OGP vOGP = OGP.getInstance(); + assertContains(baseIRI, vOGP.audio, RDFUtils.literal("http://example.com/sound.mp3")); + assertContains( + baseIRI, + vOGP.description, + RDFUtils.literal( + "Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond." + ) + ); + assertContains(baseIRI, vOGP.determiner, RDFUtils.literal("the")); + assertContains(baseIRI, vOGP.locale, RDFUtils.literal("en_GB")); + assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("fr_FR")); + assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("es_ES")); + assertContains(baseIRI, vOGP.siteName, RDFUtils.literal("IMDb")); + assertContains(baseIRI, vOGP.video, RDFUtils.literal("http://example.com/bond/trailer.swf")); + } + + /** + * Tests that the default parser settings enable tolerance in data type + * parsing. + */ + @Test + public void testTolerantParsing() { + assertExtract("/html/rdfa/oreilly-invalid-datatype.html"); + } + + @Override + protected ExtractorFactory getExtractorFactory() { + return new LibRdfaExtractorFactory(); + } + +} diff --git a/librdfa-rdf4j/README.MD b/librdfa-rdf4j/README.MD new file mode 100644 index 000000000..2e73818f9 --- /dev/null +++ b/librdfa-rdf4j/README.MD @@ -0,0 +1,56 @@ +# Librdfa - RDF4J + +RDF4J parser that uses [librdfa](https://github.com/rdfa/librdfa) to parse RDFa to triples. See the [documentation](https://cwiki.apache.org/confluence/display/ANY23/Librdfa-rdf4j+documentation) for more information. 
+ +## Prerequisites + +You need to install the [librdfa](https://github.com/rdfa/librdfa) library. + +## Install + +``` mvn + + org.apache.any23 + apache-any23-librdfa + ${project.version} + +``` + +## Compile + +`mvn clean install` + +## Use + +Add the library and you can parse an `InputStream` as you would do with [`Rio`](http://docs.rdf4j.org/javadoc/2.1/org/eclipse/rdf4j/rio/Rio.html). + +``` java +RDFParser rdfParser = Rio.createParser(RDFFormat.RDFA); +Model model = new LinkedHashModel(); +rdfParser.setRDFHandler(new StatementCollector(model)); +rdfParser.parse(in, "http://www.example.org./"); +``` + +## Benchmarking + +In general librdfa is 2-5 seconds faster than semargl. + +### librdfa-rdf4j +- round: 0.11 [+- 0.00] +- round.block: 0.00 [+- 0.00] +- round.gc: 0.00 [+- 0.00] +- GC.calls: 0 +- GC.time: 0.00 +- time.total: 0.11 +- time.warmup: 0.00 +- time.bench: 0.11 + +### semargl-rdf4j +- round: 0.15 [+- 0.00] +- round.block: 0.00 [+- 0.00] +- round.gc: 0.00 [+- 0.00] +- GC.calls: 1 +- GC.time: 0.00 +- time.total: 0.15 +- time.warmup: 0.00 +- time.bench: 0.15 diff --git a/librdfa-rdf4j/pom.xml b/librdfa-rdf4j/pom.xml new file mode 100644 index 000000000..4ead804b5 --- /dev/null +++ b/librdfa-rdf4j/pom.xml @@ -0,0 +1,152 @@ + + + + + + org.apache.any23 + apache-any23 + 2.3-SNAPSHOT + ../ + + + 4.0.0 + org.apache.any23 + apache-any23-librdfa + jar + Apache Any23 :: Librdfa-RDF4J + + + ${project.basedir}/src/main/c/ + ${jni.base}/build/ + + 4.12 + 0.7.2 + + + + + + com.googlecode.cmake-maven-project + cmake-maven-plugin + 3.7.2-b1 + + + cmake-generate + + generate + + + ${jni.base} + ${jni.build} + Unix Makefiles + linux-x86_64 + + ${jni.build} + + + + + cmake-compile + process-resources + + compile + + + ${jni.build} + linux-x86_64 + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.0.0 + + + add-source + generate-sources + + add-source + + + + ${jni.build} + + + + + + + maven-antrun-plugin + 1.8 + + + process-classes + + + ${jni.base} + + + + + + 
+ + + run + + + + + + + + + + org.eclipse.rdf4j + rdf4j-rio-api + + + + + junit + junit + test + + + com.carrotsearch + junit-benchmarks + ${junit.benchmarks.version} + test + + + org.slf4j + slf4j-log4j12 + test + + + org.semarglproject + semargl-rdf4j + test + + + + \ No newline at end of file diff --git a/librdfa-rdf4j/src/main/c/CMakeLists.txt b/librdfa-rdf4j/src/main/c/CMakeLists.txt new file mode 100644 index 000000000..ef5f08a6c --- /dev/null +++ b/librdfa-rdf4j/src/main/c/CMakeLists.txt @@ -0,0 +1,54 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cmake_minimum_required(VERSION 2.8) + +# Check if required packages are installed +find_package(SWIG REQUIRED) +find_package(Java REQUIRED) +find_package(JNI REQUIRED) +find_package(LibXml2 REQUIRED) + +# Add modules +include(UseJava) +include(UseSWIG) + +# Add directories to the build process +set( CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR} ) + +include_directories(${LIBXML2_INCLUDE_DIR}) +include_directories(${JNI_INCLUDE_DIRS}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/..) + +# Link library. 
TODO: replace because of deprecation +link_libraries(rdfa) + +# Build the C++ code into a dynamic library: rdfaJava.dll (on Windows) or librdfaJava.so (on Linux) +set(CMAKE_SWIG_FLAGS -package org.apache.any23.rdf.librdfa) +set(CMAKE_SWIG_OUTDIR "${CMAKE_CURRENT_BINARY_DIR}/org/apache/any23/rdf/librdfa") +set_property(SOURCE rdfa.i PROPERTY CPLUSPLUS ON) +swig_add_module( + rdfaJava + java + rdfa.i + RdfaParser.cpp +) + +# For convenience we copy the dynamic library to the current build folder +add_custom_command( + TARGET rdfaJava + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${CMAKE_CURRENT_BINARY_DIR} +) diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.cpp b/librdfa-rdf4j/src/main/c/RdfaParser.cpp new file mode 100644 index 000000000..990756449 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/RdfaParser.cpp @@ -0,0 +1,18 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "RdfaParser.h" \ No newline at end of file diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.h b/librdfa-rdf4j/src/main/c/RdfaParser.h new file mode 100644 index 000000000..7a8f769af --- /dev/null +++ b/librdfa-rdf4j/src/main/c/RdfaParser.h @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#ifndef _RDFA_PARSER_H_ +#define _RDFA_PARSER_H_ + +#include +#include +#include +#include +#include +#include + +struct rdfacontext; + +/** + * Callback interface that the Java side implements through a SWIG director. + * The defaults ignore triples and report that no input data is available. + */ +class Callback { +public: + + virtual ~Callback() { + } + + virtual void default_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) { + } + + virtual void processor_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) { + } + + virtual char* fill_data(size_t buffer_length) { + return 0; /* a non-void function must return a value; 0 means "no data" */ + } + + virtual size_t fill_len() { + return 0; /* no bytes available by default */ + } +}; + +/** + * The RdfaParser class is a wrapper class for Java to provide a + * simple API for using librdfa in Java. + */ +class RdfaParser { +private: + Callback *_callback; +public: + /** + * The base URI that will be used when resolving relative pathnames + * in the document. 
+ */ + std::string mBaseUri; + + /** + * The base RDFa context to use when setting the triple handler callback, + * buffer filler callback, and executing the parser call. + */ + rdfacontext* mBaseContext; + + RdfaParser(const char* baseUri) : _callback(0) { + mBaseUri = baseUri; + mBaseContext = rdfa_create_context(baseUri); + } + + /** + * Standard destructor. + */ + ~RdfaParser() { + rdfa_free_context(mBaseContext); + delCallback(); + } + + /** Hands a default-graph triple to the callback (if one is set) and then frees the triple. */ + void c_process_default_graph_triple(rdftriple* triple, void* callback_data) { + if (_callback) _callback->default_graph(triple->subject, triple-> predicate, triple->object, triple->object_type, triple-> datatype, triple-> language); + rdfa_free_triple(triple); + } + + /** Hands a processor-graph triple to the callback (if one is set) and then frees the triple. */ + void c_process_processor_graph_triple(rdftriple* triple, void* callback_data) { + if (_callback) _callback->processor_graph(triple->subject, triple-> predicate, triple->object, triple->object_type, triple-> datatype, triple-> language); + rdfa_free_triple(triple); + } + + /** + * Copies the chunk supplied by the callback into the caller's buffer. + * The copy is capped at buffer_length bytes so an oversized chunk cannot overrun + * the buffer; returns the number of bytes written (0 without a callback or data). + */ + size_t c_fill_buffer(char* buffer, size_t buffer_length, void* callback_data) { + char* data = _callback ? _callback->fill_data(buffer_length) : 0; + size_t size = data ? _callback->fill_len() : 0; + if (size > buffer_length) size = buffer_length; /* never write past the end of buffer */ + memset(buffer, ' ', buffer_length); + if (size > 0) memcpy(buffer, data, size); + return size; + } + + /** + * Starts the parsing process for librdfa. When more data is + * required by the XML parser, the buffer filler callback is + * called. If triples are found, then the triple handler callback + * is called. + */ + int parse() { + return rdfa_parse(mBaseContext); + } + + /** Deletes the current callback, if any, and clears the pointer. */ + void delCallback() { + delete _callback; + _callback = 0; + } + + /** Replaces the current callback; the previously set one is deleted. */ + void setCallback(Callback *cb) { + if (cb != _callback) delCallback(); /* re-setting the same object must not delete it */ + _callback = cb; + } +}; + +#endif /* _RDFA_PARSER_H_ */ diff --git a/librdfa-rdf4j/src/main/c/rdfa.i b/librdfa-rdf4j/src/main/c/rdfa.i new file mode 100644 index 000000000..68b8f1d23 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/rdfa.i @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. 
See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +%module(directors="1") rdfa +%feature("director") Callback; + +%{ + #include "RdfaParser.h" + + RdfaParser* gRdfaParser = NULL; /* parser that the free-function trampolines below dispatch to; assigned in init() */ + void process_default_graph_triple(rdftriple* triple, void* callback_data); + void process_processor_graph_triple(rdftriple* triple, void* callback_data); + size_t fill_buffer(char* buffer, size_t buffer_length, void* callback_data); +%} + +%constant int RDF_TYPE_NAMESPACE_PREFIX = RDF_TYPE_NAMESPACE_PREFIX; +%constant int RDF_TYPE_IRI = RDF_TYPE_IRI; +%constant int RDF_TYPE_PLAIN_LITERAL = RDF_TYPE_PLAIN_LITERAL; +%constant int RDF_TYPE_XML_LITERAL = RDF_TYPE_XML_LITERAL; +%constant int RDF_TYPE_TYPED_LITERAL = RDF_TYPE_TYPED_LITERAL; + +%{ + void process_default_graph_triple(rdftriple* triple, void* callback_data){ + gRdfaParser->c_process_default_graph_triple(triple, callback_data); + } + void process_processor_graph_triple(rdftriple* triple, void* callback_data){ + gRdfaParser->c_process_processor_graph_triple( triple, callback_data); + } + size_t fill_buffer(char* buffer, size_t buffer_length, void* callback_data){ + return gRdfaParser->c_fill_buffer(buffer, buffer_length, callback_data); + } +%} + +%include RdfaParser.h + +%extend RdfaParser { + void init (){ + gRdfaParser = self; /* NOTE(review): a single global is shared by all parsers, so a later init() redirects every trampoline to the newest parser; looks unsafe for concurrent parsers, confirm */ + rdfa_set_default_graph_triple_handler(gRdfaParser->mBaseContext, 
&process_default_graph_triple); + rdfa_set_processor_graph_triple_handler(gRdfaParser->mBaseContext, &process_processor_graph_triple); + rdfa_set_buffer_filler(gRdfaParser->mBaseContext, &fill_buffer); + } +} diff --git a/librdfa-rdf4j/src/main/c/readme.md b/librdfa-rdf4j/src/main/c/readme.md new file mode 100644 index 000000000..f3538b428 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/readme.md @@ -0,0 +1,8 @@ +# Librdfa Integration with Java + +Build: + +`mkdir build` +`cd build` +`cmake ..` +`cmake --build .` diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java new file mode 100644 index 000000000..53cdf101f --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.rdf.rdfa; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.any23.rdf.librdfa.Callback; +import org.apache.any23.rdf.librdfa.rdfa; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.rio.RDFHandler; + +/** + * + * @author Julio Caguano + */ +public class LibrdfaFilter extends Callback { + + private BufferedReader bis = null; + private int len = 0; + private RDFHandler handler; + private ValueFactory valueFactory; + + public LibrdfaFilter(InputStream is) { + super(); + bis = new BufferedReader(new InputStreamReader(is)); + } + + public LibrdfaFilter(Reader reader) { + super(); + bis = new BufferedReader(reader); + } + + @Override + public void default_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { + IRI s = valueFactory.createIRI(subject); + IRI p = valueFactory.createIRI(predicate); + Value o = null; + + if (object_type == rdfa.RDF_TYPE_IRI) { // 1 + o = valueFactory.createIRI(object); + } else if (object_type == rdfa.RDF_TYPE_PLAIN_LITERAL) { // 2 + o = valueFactory.createLiteral(object); + } else if (object_type == rdfa.RDF_TYPE_XML_LITERAL) { // 3 + o = valueFactory.createLiteral(object, RDF.XMLLITERAL); + } else if (object_type == rdfa.RDF_TYPE_TYPED_LITERAL) { // 4 + if (datatype != null) { + IRI dt = valueFactory.createIRI(datatype); + o = valueFactory.createLiteral(object, dt); + } else { + o = valueFactory.createLiteral(object, language); + } + } + if (handler != null && o != null) { + Statement stmt = valueFactory.createStatement(s, p, o); + handler.handleStatement(stmt); + } else { + 
System.err.println("VALIDATE: S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + "DT=" + datatype + "LANG=" + language); + } + } + + @Override + public void processor_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { + if (handler != null && rdfa.RDF_TYPE_NAMESPACE_PREFIX == object_type) { // 0 + handler.handleNamespace(predicate, object); + } else { + System.out.println("Processor: S=" + subject + "\tP=" + predicate + "\tO=" + object + "\tOT=" + object_type + "\tDT:" + datatype + "\tLANG=" + language); + } + } + + @Override + public String fill_data(long buffer_length) { + char[] d = new char[(int) buffer_length]; + + try { + len = bis.read(d, 0, (int) buffer_length); + } catch (IOException ex) { + Logger.getLogger(LibrdfaFilter.class.getName()).log(Level.SEVERE, null, ex); + } + + return new String(d); + } + + @Override + public long fill_len() { + if (len == -1) { + return 0; + } + return len; + } + + public RDFHandler getHandler() { + return handler; + } + + public void setHandler(RDFHandler handler) { + this.handler = handler; + } + + public ValueFactory getValueFactory() { + return valueFactory; + } + + public void setValueFactory(ValueFactory valueFactory) { + this.valueFactory = valueFactory; + } + +} diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java new file mode 100644 index 000000000..cff321e74 --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.rdf.rdfa; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import org.apache.any23.rdf.librdfa.RdfaParser; +import org.apache.any23.rdf.rdfa.utils.LibraryLoader; +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFHandlerException; +import org.eclipse.rdf4j.rio.RDFParseException; +import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser; + +/** + * + * @author Julio Caguano + */ +public class LibrdfaRDFaParser extends AbstractRDFParser { + + static { + try { + LibraryLoader.loadLibrary("rdfaJava"); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + @Override + public RDFFormat getRDFFormat() { + return RDFFormat.RDFA; + } + + @Override + public void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException { + if (in == null) { + throw new IllegalArgumentException("Input stream cannot be 'null'"); + } + if (baseURI == null) { + throw new IllegalArgumentException("Base URI cannot be 'null'"); + } + + RdfaParser parser = new RdfaParser(baseURI); + parser.init(); + + LibrdfaFilter filter = new LibrdfaFilter(in); + parser.setCallback(filter); + + filter.setHandler(rdfHandler); + filter.setValueFactory(valueFactory); + + int status = parser.parse(); + + parser.delCallback(); + filter.delete(); + } + + @Override + public void parse(Reader reader, 
String baseURI) throws IOException, RDFParseException, RDFHandlerException { + if (reader == null) { + throw new IllegalArgumentException("Input stream cannot be 'null'"); + } + if (baseURI == null) { + throw new IllegalArgumentException("Base URI cannot be 'null'"); + } + + RdfaParser parser = new RdfaParser(baseURI); + parser.init(); + LibrdfaFilter filter = new LibrdfaFilter(reader); + parser.setCallback(filter); + + filter.setHandler(rdfHandler); + filter.setValueFactory(valueFactory); + + parser.parse(); + + parser.delCallback(); + parser.delete(); + } + +} diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java new file mode 100644 index 000000000..996a61c4f --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.rdf.rdfa; + +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFParser; +import org.eclipse.rdf4j.rio.RDFParserFactory; + +/** + * Parser factory to integrate the {@link LibrdfaRDFaParser} into RDF4j. 
+ * + * @author Julio Caguano + */ +public class LibrdfaRDFaParserFactory implements RDFParserFactory { + + @Override + public RDFFormat getRDFFormat() { + return RDFFormat.RDFA; + } + + @Override + public RDFParser getParser() { + return new LibrdfaRDFaParser(); + } + +} diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java new file mode 100644 index 000000000..29cbe1049 --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.rdf.rdfa.utils; + + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +/** + * + * @author Julio Caguano + * + */ +public final class LibraryLoader { + + public static void loadLibrary(String name) throws IOException { + try { + System.loadLibrary(name); + } catch (UnsatisfiedLinkError e) { + String filename = System.mapLibraryName(name); + InputStream in = LibraryLoader.class.getClassLoader().getResourceAsStream(filename); + int pos = filename.lastIndexOf('.'); + File file = File.createTempFile(filename.substring(0, pos), filename.substring(pos)); + file.deleteOnExit(); + try { + byte[] buf = new byte[4096]; + OutputStream out = new FileOutputStream(file); + try { + while (in.available() > 0) { + int len = in.read(buf); + if (len >= 0) { + out.write(buf, 0, len); + } + } + } finally { + out.close(); + } + } finally { + in.close(); + } + System.load(file.getAbsolutePath()); + } + } + +} diff --git a/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory b/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory new file mode 100644 index 000000000..463600e9b --- /dev/null +++ b/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory @@ -0,0 +1 @@ +org.apache.any23.rdf.rdfa.LibrdfaRDFaParserFactory diff --git a/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java new file mode 100644 index 000000000..c37f3d5c1 --- /dev/null +++ b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.rdf.librdfa; + +import com.carrotsearch.junitbenchmarks.AbstractBenchmark; +import com.carrotsearch.junitbenchmarks.BenchmarkOptions; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser; +import org.eclipse.rdf4j.rio.RDFParser; +import org.eclipse.rdf4j.rio.helpers.StatementCollector; +import static org.junit.Assert.assertEquals; +import org.junit.Before; +import org.junit.Test; +import org.semarglproject.rdf4j.rdf.rdfa.RDF4JRDFaParser; + +/** + * + * @author Julio Caguano + */ +@BenchmarkOptions(callgc = false, benchmarkRounds = 20, warmupRounds = 0) +public class LibrdfaRDFaBenchmarkTest extends AbstractBenchmark { + + private final int ITERATIONS = 2000; + private String DOCUMENT = ""; + + @Before + public void init() { + DOCUMENT = "\n" + + "\n" + + "\n" + + "Speed Test

"; + for (int i = 0; i < ITERATIONS; i++) { + DOCUMENT += ""; + } + DOCUMENT += "

"; + } + + @Test + public void testSemargl() throws Exception { + runTest(new RDF4JRDFaParser()); + } + + @Test + public void testLibrdfa() throws IOException { + runTest(new LibrdfaRDFaParser()); + } + + private void runTest(RDFParser parser) throws IOException { + InputStream in = new ByteArrayInputStream(DOCUMENT.getBytes(StandardCharsets.UTF_8)); + StatementCollector sc = new StatementCollector(); + parser.setRDFHandler(sc); + parser.parse(in, "http://example.org/"); + assertEquals(ITERATIONS, sc.getStatements().size()); + } +} diff --git a/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java new file mode 100644 index 000000000..7974c4919 --- /dev/null +++ b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.rdf.librdfa; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.rio.RDFParser; +import org.eclipse.rdf4j.rio.helpers.ParseErrorCollector; +import org.eclipse.rdf4j.rio.helpers.StatementCollector; +import static org.junit.Assert.assertEquals; +import org.junit.Before; +import org.junit.Test; + +/** + * + * @author Julio Caguano + * + */ +public class LibrdfaRDFaParserTest { + + private ValueFactory vf; + private RDFParser parser; + private StatementCollector sc; + private ParseErrorCollector el; + + @Before + public void setUp() throws Exception { + + vf = SimpleValueFactory.getInstance(); + parser = new LibrdfaRDFaParser(); + sc = new StatementCollector(); + parser.setRDFHandler(sc); + el = new ParseErrorCollector(); +// parser.setParseErrorListener(el); + } + + @Test + public void testHtml() throws IOException { + try (final InputStream in = this.getClass().getResourceAsStream( + "/org/apache/any23/rdf/librdfa/site.html");) { + parser.parse(in, "http://example.org/"); + assertEquals(4, sc.getStatements().size()); + } + } + +} diff --git a/librdfa-rdf4j/src/test/resources/log4j.properties b/librdfa-rdf4j/src/test/resources/log4j.properties new file mode 100644 index 000000000..32492dd43 --- /dev/null +++ b/librdfa-rdf4j/src/test/resources/log4j.properties @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +log4j.rootCategory=INFO, O + +# Stdout +log4j.appender.O=org.apache.log4j.ConsoleAppender + +# File +#log4j.appender.R=org.apache.log4j.RollingFileAppender +#log4j.appender.R.File=log4j.log + +# Control the maximum log file size +#log4j.appender.R.MaxFileSize=100KB + +# Archive log files (one backup file here) +log4j.appender.R.MaxBackupIndex=1 + +log4j.appender.R.layout=org.apache.log4j.PatternLayout +log4j.appender.O.layout=org.apache.log4j.PatternLayout + +log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n +log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n diff --git a/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html b/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html new file mode 100644 index 000000000..b80aca1a8 --- /dev/null +++ b/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html @@ -0,0 +1,17 @@ + + + + + Test + + +

+ + Julio Caguano + Julio Caguano + +

+ + \ No newline at end of file diff --git a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java index e8e4505dc..7d97b2f21 100644 --- a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java +++ b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java @@ -17,22 +17,20 @@ package org.apache.any23.plugin; -import org.apache.any23.cli.Crawler; -import org.apache.any23.cli.Tool; -import org.apache.any23.extractor.ExtractorGroup; -import org.apache.any23.extractor.ExtractorRegistryImpl; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; - +import org.apache.any23.cli.Crawler; +import org.apache.any23.cli.Tool; +import org.apache.any23.extractor.ExtractorGroup; +import org.apache.any23.extractor.ExtractorRegistryImpl; +import org.junit.After; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import org.junit.Before; +import org.junit.Test; /** * Integration test for plugins. 
@@ -41,7 +39,7 @@ */ public class PluginIT { - private static final int NUM_OF_EXTRACTORS_INCL_OPENIE = 34; + private static final int NUM_OF_EXTRACTORS_INCL_OPENIE = 35; private static final int NUM_OF_EXTRACTORS_EXCL_OPENIE = 33; diff --git a/pom.xml b/pom.xml index ce2ee5d17..e5d1038e1 100644 --- a/pom.xml +++ b/pom.xml @@ -232,6 +232,7 @@ csvutils mime encoding + librdfa-rdf4j core cli plugins/basic-crawler diff --git a/test-resources/src/test/resources/html/rdfa/basic.html b/test-resources/src/test/resources/html/rdfa/basic.html index 542b88b73..f9ffab285 100644 --- a/test-resources/src/test/resources/html/rdfa/basic.html +++ b/test-resources/src/test/resources/html/rdfa/basic.html @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. --> - +