From a9f37b2293fb371eda431b4385e26cf99fbff365 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Thu, 28 Jun 2018 22:03:56 -0500 Subject: [PATCH 01/11] Add extractors for librdfa. Signed-off-by: Julio Caguano --- .../extractor/rdfa/LibRdfaExtractor.java | 49 ++++++++++++++++++ .../rdfa/LibRdfaExtractorFactory.java | 51 +++++++++++++++++++ 2 files changed, 100 insertions(+) create mode 100644 core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java create mode 100644 core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java new file mode 100644 index 000000000..4172c6262 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.extractor.rdfa; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.rdf.BaseRDFExtractor; +import org.eclipse.rdf4j.rio.RDFParser; + +/** + * + * @author Julio Caguano + */ +public class LibRdfaExtractor extends BaseRDFExtractor { + + public LibRdfaExtractor(boolean verifyDataType, boolean stopAtFirstError) { + super(verifyDataType, stopAtFirstError); + } + + public LibRdfaExtractor() { + this(false, false); + } + + @Override + protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) { + throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. + } + + @Override + public ExtractorDescription getDescription() { + return LibRdfaExtractorFactory.getDescriptionInstance(); + } + +} diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java new file mode 100644 index 000000000..ce7229d85 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java @@ -0,0 +1,51 @@ +/* + * Copyright 2018 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.extractor.rdfa; + +import java.util.Arrays; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +/** + * + * @author Julio Caguano + */ +public class LibRdfaExtractorFactory extends SimpleExtractorFactory + implements ExtractorFactory { + + public static final String NAME = "html-librdfa"; + public static final Prefixes PREFIXES = null; + + private static final ExtractorDescription descriptionInstance = new RDFa11ExtractorFactory(); + + public LibRdfaExtractorFactory() { + super(RDFa11ExtractorFactory.NAME, + RDFa11ExtractorFactory.PREFIXES, + Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"), + "example-rdfa11.html"); + } + + @Override + public LibRdfaExtractor createExtractor() { + return new LibRdfaExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} From 64082692139f69df16d2985b6e9591d000e6457b Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Mon, 16 Jul 2018 10:18:09 -0500 Subject: [PATCH 02/11] add librdfa extractor --- core/pom.xml | 5 ++ .../any23/extractor/rdf/RDFParserFactory.java | 35 ++++++-- .../extractor/rdfa/LibRdfaExtractor.java | 5 +- .../rdfa/LibRdfaExtractorFactory.java | 6 +- .../rdfa/RDFaLibrdfaExtractorTest.java | 83 +++++++++++++++++++ 5 files changed, 123 insertions(+), 11 deletions(-) create mode 100644 core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java diff --git a/core/pom.xml b/core/pom.xml index c7bc08174..5d4f39d65 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -60,6 +60,11 @@ test-jar test + + ${project.groupId} + apache-any23-librdfa + 0.0.1-SNAPSHOT + diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java index 277862134..549053a5e 
100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java @@ -17,10 +17,16 @@ package org.apache.any23.extractor.rdf; -import org.apache.any23.extractor.IssueReport; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.util.Collections; +import java.util.HashSet; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.IssueReport; import org.apache.any23.rdf.Any23ValueFactoryWrapper; +import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser; import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.eclipse.rdf4j.rio.ParseErrorListener; import org.eclipse.rdf4j.rio.RDFFormat; @@ -36,12 +42,6 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.io.IOException; -import java.io.InputStream; -import java.io.Reader; -import java.util.Collections; -import java.util.HashSet; - /** * This factory provides a common logic for creating and configuring correctly * any RDF parser used within the library. @@ -124,6 +124,27 @@ public RDFParser getRDFa11Parser( configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult); return parser; } + + /** + * Returns a new instance of a configured RDFaParser using the librdfa library. + * + * @param verifyDataType data verification enable if true. + * @param stopAtFirstError the parser stops at first error if true. + * @param extractionContext the extraction context where the parser is used. + * @param extractionResult the output extraction result. + * @return a new instance of a configured librdfa-based RDFa parser. 
+ */ + public RDFParser getRDFaLibrdfaParser( + final boolean verifyDataType, + final boolean stopAtFirstError, + final ExtractionContext extractionContext, + final ExtractionResult extractionResult + ) { + final RDFParser parser = new LibrdfaRDFaParser(); + parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1); + configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult); + return parser; + } /** * Returns a new instance of a configured RDFXMLParser. diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java index 4172c6262..e1d598a28 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractor.java @@ -20,6 +20,7 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.extractor.rdf.BaseRDFExtractor; +import org.apache.any23.extractor.rdf.RDFParserFactory; import org.eclipse.rdf4j.rio.RDFParser; /** @@ -38,7 +39,9 @@ public LibRdfaExtractor() { @Override protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) { - throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. 
+ return RDFParserFactory.getInstance().getRDFaLibrdfaParser( + isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult + ); } @Override diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java index ce7229d85..b79e8733a 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java @@ -31,11 +31,11 @@ public class LibRdfaExtractorFactory extends SimpleExtractorFactoryGoodRelations test cases. + * It checks if the extraction is the same when the namespaces are defined in RDFa1.0 or + * RDFa1.1 respectively. + * + * @throws org.eclipse.rdf4j.repository.RepositoryException + * @throws java.io.IOException + * @throws org.eclipse.rdf4j.rio.RDFHandlerException + * @throws org.eclipse.rdf4j.rio.RDFParseException + */ + @Test + public void testRDFa11PrefixBackwardCompatibility() + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + final int EXPECTED_STATEMENTS = 31; + + assertExtract("/html/rdfa/goodrelations-rdfa10.html"); + logger.debug("Model 1 " + dumpHumanReadableTriples()); + Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size()); + List rdfa10Stmts = dumpAsListOfStatements(); + + //assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); + + assertExtract("/html/rdfa/goodrelations-rdfa11.html"); + logger.debug("Model 2 " + dumpHumanReadableTriples()); + Assert.assertTrue(dumpAsListOfStatements().size() >= EXPECTED_STATEMENTS); + + for(Statement stmt : rdfa10Stmts) { + assertContains(stmt); + } + } + + @Test + public void testRDFa11CURIEs() throws Exception { + } + + /** + * Tests that the default parser settings enable tolerance in data type parsing. 
+ */ + @Test + public void testTolerantParsing() { + assertExtract("/html/rdfa/oreilly-invalid-datatype.html"); + } + + @Override + protected ExtractorFactory getExtractorFactory() { + return new LibRdfaExtractorFactory(); + } + +} From 68f0d8078fb2adc3d11d9e8ebf83c6e7e58aa9b3 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Wed, 18 Jul 2018 22:14:56 -0500 Subject: [PATCH 03/11] ignore basic test --- .../rdfa/RDFaLibrdfaExtractorTest.java | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java index 9e3135fd6..c0daa57ce 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java @@ -14,7 +14,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.apache.any23.extractor.rdfa; import java.io.IOException; @@ -25,6 +24,7 @@ import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RDFParseException; import org.junit.Assert; +import org.junit.Ignore; import org.junit.Test; /** @@ -33,8 +33,10 @@ public class RDFaLibrdfaExtractorTest extends AbstractRDFaExtractorTestCase { /** - * Taken from the GoodRelations test cases. - * It checks if the extraction is the same when the namespaces are defined in RDFa1.0 or + * Taken from the + * GoodRelations + * test cases. It checks if the extraction is the same when the + * namespaces are defined in RDFa1.0 or * RDFa1.1 respectively. 
* * @throws org.eclipse.rdf4j.repository.RepositoryException @@ -44,7 +46,7 @@ public class RDFaLibrdfaExtractorTest extends AbstractRDFaExtractorTestCase { */ @Test public void testRDFa11PrefixBackwardCompatibility() - throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { final int EXPECTED_STATEMENTS = 31; assertExtract("/html/rdfa/goodrelations-rdfa10.html"); @@ -53,22 +55,23 @@ public void testRDFa11PrefixBackwardCompatibility() List rdfa10Stmts = dumpAsListOfStatements(); //assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); - assertExtract("/html/rdfa/goodrelations-rdfa11.html"); logger.debug("Model 2 " + dumpHumanReadableTriples()); Assert.assertTrue(dumpAsListOfStatements().size() >= EXPECTED_STATEMENTS); - for(Statement stmt : rdfa10Stmts) { + for (Statement stmt : rdfa10Stmts) { assertContains(stmt); } } @Test - public void testRDFa11CURIEs() throws Exception { + @Ignore(value = "ERROR: Corrupted STDOUT by directly writing to native stream in forked JVM 1") + public void testBasic() throws Exception { } - + /** - * Tests that the default parser settings enable tolerance in data type parsing. + * Tests that the default parser settings enable tolerance in data type + * parsing. */ @Test public void testTolerantParsing() { From bd70dfc1abc4864fcd3857291d009e2a45d9b556 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Wed, 25 Jul 2018 22:40:57 -0500 Subject: [PATCH 04/11] Make libdrfa configurable. 
--- .../resources/default-configuration.properties | 8 ++++++++ .../any23/extractor/ExtractorRegistryImpl.java | 15 ++++++++++----- .../org.apache.any23.extractor.ExtractorFactory | 1 + 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties index 4f68586d3..15077711d 100644 --- a/api/src/main/resources/default-configuration.properties +++ b/api/src/main/resources/default-configuration.properties @@ -47,6 +47,14 @@ any23.extraction.metadata.domain.per.entity=off # registered the RDFa 1.0 legacy one (org.deri.any23.extractor.rdfa.RDFaExtractor). any23.extraction.rdfa.programmatic=on +# Enables the librdfa extractor. +# If 'on', it overrides the extractors selected by the programmatic option: +# RDFa 1.1 Extractor (org.deri.any23.extractor.rdfa.RDFa11Extractor) and +# RDFa 1.0 Extractor (org.deri.any23.extractor.rdfa.RDFaExtractor). +# If 'off' (the default), the extractor specified by the programmatic +# option (any23.extraction.rdfa.programmatic) is used. +any23.extraction.rdfa.librdfa=off + # The extraction context IRI to be used by the # SingleDocumentExtraction. If == '?' the document IRI will # be used. 
It can be overriden by specifying a different diff --git a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java index ca3bb982b..4482e5a8e 100644 --- a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java +++ b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java @@ -17,15 +17,15 @@ package org.apache.any23.extractor; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import org.apache.any23.configuration.DefaultConfiguration; import org.apache.any23.extractor.html.HTMLMetaExtractorFactory; +import org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory; import org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory; import org.apache.any23.extractor.rdfa.RDFaExtractorFactory; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - /** * Singleton class acting as a register for all the various * {@link Extractor}. 
@@ -55,12 +55,17 @@ public static ExtractorRegistry getInstance() { if (instance == null) { instance = new ExtractorRegistryImpl(); - if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) { + if(conf.getFlagProperty("any23.extraction.rdfa.librdfa")){ + instance.unregister(RDFaExtractorFactory.NAME); + instance.unregister(RDFa11ExtractorFactory.NAME); + } else if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) { + instance.unregister(LibRdfaExtractorFactory.NAME); instance.unregister(RDFaExtractorFactory.NAME); // FIXME: Unregister RDFaExtractor if flag is not set //instance.register(RDFa11Extractor.factory); } else { instance.unregister(RDFa11ExtractorFactory.NAME); + instance.unregister(LibRdfaExtractorFactory.NAME); // FIXME: Unregister RDFaExtractor if flag is set //instance.register(RDFaExtractor.factory); } diff --git a/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory index 2b1df7996..7303bcbba 100644 --- a/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory +++ b/core/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory @@ -27,5 +27,6 @@ org.apache.any23.extractor.rdf.TriXExtractorFactory org.apache.any23.extractor.rdf.TurtleExtractorFactory org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory org.apache.any23.extractor.rdfa.RDFaExtractorFactory +org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory org.apache.any23.extractor.xpath.XPathExtractorFactory org.apache.any23.extractor.yaml.YAMLExtractorFactory \ No newline at end of file From a271a21760ca4caee0ec9359008cd446fe8b950a Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Sun, 29 Jul 2018 19:39:19 -0500 Subject: [PATCH 05/11] solve integration test. Librdfa is loaded with SPI. 
--- .../java/org/apache/any23/plugin/PluginIT.java | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java index e8e4505dc..7d97b2f21 100644 --- a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java +++ b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java @@ -17,22 +17,20 @@ package org.apache.any23.plugin; -import org.apache.any23.cli.Crawler; -import org.apache.any23.cli.Tool; -import org.apache.any23.extractor.ExtractorGroup; -import org.apache.any23.extractor.ExtractorRegistryImpl; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; - import java.io.File; import java.io.IOException; import java.util.HashSet; import java.util.Iterator; import java.util.Set; - +import org.apache.any23.cli.Crawler; +import org.apache.any23.cli.Tool; +import org.apache.any23.extractor.ExtractorGroup; +import org.apache.any23.extractor.ExtractorRegistryImpl; +import org.junit.After; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import org.junit.Before; +import org.junit.Test; /** * Integration test for plugins. 
@@ -41,7 +39,7 @@ */ public class PluginIT { - private static final int NUM_OF_EXTRACTORS_INCL_OPENIE = 34; + private static final int NUM_OF_EXTRACTORS_INCL_OPENIE = 35; private static final int NUM_OF_EXTRACTORS_EXCL_OPENIE = 33; From 5dbc86c7601f3e9abd1aec08cb5e41716c2e7cab Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Sun, 29 Jul 2018 22:18:25 -0500 Subject: [PATCH 06/11] Add test suite --- .../rdfa/RDFaLibrdfaExtractorTest.java | 255 +++++++++++++++++- 1 file changed, 252 insertions(+), 3 deletions(-) diff --git a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java index c0daa57ce..d0572c4d5 100644 --- a/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/rdfa/RDFaLibrdfaExtractorTest.java @@ -18,17 +18,27 @@ import java.io.IOException; import java.util.List; +import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractorFactory; +import static org.apache.any23.extractor.rdfa.AbstractRDFaExtractorTestCase.vFOAF; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.vocab.FOAF; +import org.apache.any23.vocab.OGP; +import org.apache.any23.vocab.OGPMusic; +import org.eclipse.rdf4j.model.Literal; import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.vocabulary.RDF; import org.eclipse.rdf4j.repository.RepositoryException; +import org.eclipse.rdf4j.repository.RepositoryResult; import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RDFParseException; import org.junit.Assert; -import org.junit.Ignore; import org.junit.Test; /** * Reference Test Class for {@link RDFaExtractor}. 
+ * @author Julio Caguano */ public class RDFaLibrdfaExtractorTest extends AbstractRDFaExtractorTestCase { @@ -64,9 +74,248 @@ public void testRDFa11PrefixBackwardCompatibility() } } + /** + * This test verifies the correct object resource conversion. + * + * @throws RepositoryException + */ @Test - @Ignore(value = "ERROR: Corrupted STDOUT by directly writing to native stream in forked JVM 1") - public void testBasic() throws Exception { + public void testObjectResourceConversion() throws RepositoryException { + assertExtract("/html/rdfa/object-resource-test.html"); + logger.debug(dumpModelToTurtle()); + assertContains( + null, + FOAF.getInstance().page, + RDFUtils.iri("http://en.wikipedia.org/New_York") + ); + } + + /** + * This test checks the behavior of the RDFa extraction where the + * datatype of a property is explicitly set. For details see the + * RDFa in XHTML: Syntax and + * Processing + * recommendation. + * + * @throws RepositoryException + */ + @Test + public void testExplicitDatatypeDeclaration() throws RepositoryException { + assertExtract("/html/rdfa/xmlliteral-datatype-test.html"); + logger.debug(dumpModelToTurtle()); + + RepositoryResult stmts + = conn.getStatements(RDFUtils.iri("http://dbpedia.org/resource/Albert_Einstein"), + vFOAF.name, null, false); + Assert.assertTrue(stmts.hasNext()); + Value obj = stmts.next().getObject(); + Assert.assertTrue(obj instanceof Literal); + Literal lit = (Literal) obj; + Assert.assertEquals(lit.getDatatype(), RDF.XMLLITERAL); + Assert.assertEquals(lit.getLabel(), "Albert " + + "Einstein"); + } + + /** + * Tests the correct behavior of REL and HREF. 
+ * + * @throws RepositoryException + */ + @Test + public void testRelWithHref() throws RepositoryException { + assertExtract("/html/rdfa/rel-href.html"); + logger.debug(dumpModelToTurtle()); + + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + FOAF.getInstance().name, + "John Doe" + ); + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + FOAF.getInstance().homepage, + RDFUtils.iri("http://example.org/blog/") + ); + } + + /** + * This test verifies the correct REL/REV attribute usage. + * + * @throws RepositoryException + */ + @Test + public void testRelRevSupport() throws RepositoryException { + assertExtract("/html/rdfa/rel-rev.html"); + logger.debug(dumpModelToTurtle()); + + assertContains( + baseIRI, + RDFUtils.iri("http://bob.example.com/cite"), + RDFUtils.iri("http://www.example.com/books/the_two_towers") + ); + assertContains( + RDFUtils.iri("http://path/to/chapter"), + RDFUtils.iri("http://bob.example.com/isChapterOf"), + baseIRI + ); + } + + /** + * Tests the @vocab support. 
+ * + * @throws RepositoryException + */ + @Test + public void testVocabSupport() throws RepositoryException { + assertExtract("/html/rdfa/vocab.html"); + logger.debug(dumpModelToTurtle()); + + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + RDFUtils.iri("http://xmlns.com/foaf/0.1/name"), + RDFUtils.literal("John Doe") + ); + assertContains( + RDFUtils.iri(baseIRI.toString(), "#me"), + RDFUtils.iri("http://xmlns.com/foaf/0.1/homepage"), + RDFUtils.iri("http://example.org/blog/") + ); + } + + /** + * Tests the correct support of alternate + * Open Graph Protocol Object Types + * + * @throws IOException + * @throws org.apache.any23.extractor.ExtractionException + * @throws RepositoryException + */ + @Test + public void testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException { + assertExtract("/html/rdfa/opengraph-music-song-object-type.html"); + logger.info(dumpHumanReadableTriples()); + + Assert.assertEquals(9, getStatementsSize(null, null, null)); + final OGPMusic vOGPMusic = OGPMusic.getInstance(); + assertContains(baseIRI, vOGPMusic.musicDuration, RDFUtils.literal("447")); + assertContains( + baseIRI, + vOGPMusic.musicMusician, + RDFUtils.literal( + "Jono Grant / Tony McGuinness / Ashley Tomberlin" + ) + ); + assertContains(baseIRI, vOGPMusic.musicAlbum, RDFUtils.literal("Tri-State")); + } + + /** + * Taken from the + * GoodRelations + * test cases. It checks if the extraction is the same when the + * namespaces are defined in RDFa1.0. 
+ * + * @throws RepositoryException + * @throws java.io.IOException + * @throws org.eclipse.rdf4j.rio.RDFHandlerException + * @throws org.eclipse.rdf4j.rio.RDFParseException + */ + @Test + public void testRDFa10Extraction() + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + final int EXPECTED_STATEMENTS = 31; + + assertExtract("/html/rdfa/goodrelations-rdfa10.html"); + logger.debug(dumpModelToNQuads()); + + Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size()); + assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); + } + + /** + * Taken from the + * GoodRelations + * test cases. It checks if the extraction is the same when the + * namespaces are defined in RDFa1.1. + * + * @throws RepositoryException + * @throws java.io.IOException + * @throws org.eclipse.rdf4j.rio.RDFHandlerException + * @throws org.eclipse.rdf4j.rio.RDFParseException + */ + @Test + public void testRDFa11Extraction() + throws RepositoryException, RDFHandlerException, IOException, RDFParseException { + final int EXPECTED_STATEMENTS = 31; + + assertExtract("/html/rdfa/goodrelations-rdfa11.html"); + logger.debug(dumpHumanReadableTriples()); + + Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size()); + assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq"); + } + + /** + * Tests the correct support of Open Graph + * Protocol's + * Basic Metadata, + * Optional Metadata, + * Structured Properties and + * Arrays. 
+ * + * @throws IOException + * @throws org.apache.any23.extractor.ExtractionException + * @throws RepositoryException + */ + @Test + public void testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException { + assertExtract("/html/rdfa/opengraph-structured-properties.html"); + logger.info(dumpHumanReadableTriples()); + + Assert.assertEquals(31, getStatementsSize(null, null, null)); + final OGP vOGP = OGP.getInstance(); + assertContains(baseIRI, vOGP.audio, RDFUtils.literal("http://example.com/sound.mp3")); + assertContains( + baseIRI, + vOGP.description, + RDFUtils.literal( + "Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond." + ) + ); + assertContains(baseIRI, vOGP.determiner, RDFUtils.literal("the")); + assertContains(baseIRI, vOGP.locale, RDFUtils.literal("en_GB")); + assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("fr_FR")); + assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("es_ES")); + assertContains(baseIRI, vOGP.siteName, RDFUtils.literal("IMDb")); + assertContains(baseIRI, vOGP.video, RDFUtils.literal("http://example.com/bond/trailer.swf")); } /** From c4b5dccbbd004e480494f38f57238936d3e8942d Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Sun, 29 Jul 2018 22:18:52 -0500 Subject: [PATCH 07/11] add last version of librdfa-rdf4j --- core/pom.xml | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index 5d4f39d65..4ea203071 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -60,13 +60,16 @@ test-jar test + + + ${project.groupId} apache-any23-librdfa - 0.0.1-SNAPSHOT + 1.0.0 - - + + org.apache.httpcomponents @@ -334,7 +337,14 @@ - + + + + librdfa-rdf4j + https://raw.github.com/JulioCCBUcuenca/librdfa-java/repository/ + + + From 85e0c7e13df92e457d8144c9cf66f67fe85bd9e5 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Sun, 29 Jul 2018 22:20:52 -0500 Subject: [PATCH 08/11] Add lang tag. 
lang tag is used to identify language in HTML pages, and xml:lang is used to identify in xml files. --- test-resources/src/test/resources/html/rdfa/basic.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test-resources/src/test/resources/html/rdfa/basic.html b/test-resources/src/test/resources/html/rdfa/basic.html index 542b88b73..f9ffab285 100644 --- a/test-resources/src/test/resources/html/rdfa/basic.html +++ b/test-resources/src/test/resources/html/rdfa/basic.html @@ -14,7 +14,7 @@ See the License for the specific language governing permissions and limitations under the License. --> - +
From 3ad4fc6c1d7c5fdfbaea8d2acd4049b4f1426f58 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Fri, 3 Aug 2018 10:42:58 -0500 Subject: [PATCH 09/11] add librdfa-rdf4j as a separate module of any23 --- librdfa-rdf4j/README.MD | 63 ++++++++ librdfa-rdf4j/pom.xml | 139 ++++++++++++++++++ librdfa-rdf4j/src/main/c/CMakeLists.txt | 39 +++++ librdfa-rdf4j/src/main/c/RdfaParser.cpp | 19 +++ librdfa-rdf4j/src/main/c/RdfaParser.h | 124 ++++++++++++++++ librdfa-rdf4j/src/main/c/main.java | 82 +++++++++++ librdfa-rdf4j/src/main/c/rdfa.i | 40 +++++ librdfa-rdf4j/src/main/c/readme.md | 12 ++ .../apache/any23/rdf/rdfa/LibrdfaFilter.java | 130 ++++++++++++++++ .../any23/rdf/rdfa/LibrdfaRDFaParser.java | 95 ++++++++++++ .../rdf/rdfa/LibrdfaRDFaParserFactory.java | 40 +++++ .../any23/rdf/rdfa/utils/LibraryLoader.java | 62 ++++++++ .../org.eclipse.rdf4j.rio.RDFParserFactory | 1 + .../rdf/librdfa/LibrdfaRDFaBenchmarkTest.java | 72 +++++++++ .../rdf/librdfa/LibrdfaRDFaParserTest.java | 63 ++++++++ .../org/apache/any23/rdf/librdfa/site.html | 17 +++ 16 files changed, 998 insertions(+) create mode 100644 librdfa-rdf4j/README.MD create mode 100644 librdfa-rdf4j/pom.xml create mode 100644 librdfa-rdf4j/src/main/c/CMakeLists.txt create mode 100644 librdfa-rdf4j/src/main/c/RdfaParser.cpp create mode 100644 librdfa-rdf4j/src/main/c/RdfaParser.h create mode 100644 librdfa-rdf4j/src/main/c/main.java create mode 100644 librdfa-rdf4j/src/main/c/rdfa.i create mode 100644 librdfa-rdf4j/src/main/c/readme.md create mode 100644 librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java create mode 100644 librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java create mode 100644 librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java create mode 100644 librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java create mode 100644 librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory 
create mode 100644 librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java create mode 100644 librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java create mode 100644 librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html diff --git a/librdfa-rdf4j/README.MD b/librdfa-rdf4j/README.MD new file mode 100644 index 000000000..4df84f96b --- /dev/null +++ b/librdfa-rdf4j/README.MD @@ -0,0 +1,63 @@ +# Librdfa - RDF4J + +RDF4J parser that uses [librdfa](https://github.com/rdfa/librdfa) to parse RDFa to triples. + +## Prerequisites + +You need to install the [librdfa](https://github.com/rdfa/librdfa) library. + +## Install + +``` mvn + + org.apache.any23 + apache-any23-librdfa + 1.0.0 + + + + + librdfa-rdf4j + https://raw.github.com/JulioCCBUcuenca/librdfa-java/repository/ + + +``` + +## Compile + +`mvn clean install` + +## Use + +Add the library and you can parse an `InputStream` as you would do with [`Rio`](http://docs.rdf4j.org/javadoc/2.1/org/eclipse/rdf4j/rio/Rio.html). + +``` java +RDFParser rdfParser = Rio.createParser(RDFFormat.RDFA); +Model model = new LinkedHashModel(); +rdfParser.setRDFHandler(new StatementCollector(model)); +rdfParser.parse(in, "http://www.example.org./"); +``` + +## Benchmarking + +In general librdfa is 2-5 seconds faster than semargl. 
+ +### librdfa-rdf4j +- round: 0.11 [+- 0.00] +- round.block: 0.00 [+- 0.00] +- round.gc: 0.00 [+- 0.00] +- GC.calls: 0 +- GC.time: 0.00 +- time.total: 0.11 +- time.warmup: 0.00 +- time.bench: 0.11 + +### semargl-rdf4j +- round: 0.15 [+- 0.00] +- round.block: 0.00 [+- 0.00] +- round.gc: 0.00 [+- 0.00] +- GC.calls: 1 +- GC.time: 0.00 +- time.total: 0.15 +- time.warmup: 0.00 +- time.bench: 0.15 diff --git a/librdfa-rdf4j/pom.xml b/librdfa-rdf4j/pom.xml new file mode 100644 index 000000000..ac760a221 --- /dev/null +++ b/librdfa-rdf4j/pom.xml @@ -0,0 +1,139 @@ + + + 4.0.0 + org.apache.any23 + apache-any23-librdfa + 0.0.1-SNAPSHOT + jar + Apache Any23 :: Librdfa + + UTF-8 + 1.8 + 1.8 + + ${project.basedir}/src/main/c/ + ${jni.base}/build/ + + 2.2.4 + 4.12 + 0.7.2 + 1.7.25 + 0.7 + + + + + + com.googlecode.cmake-maven-project + cmake-maven-plugin + 3.7.2-b1 + + + cmake-generate + + generate + + + ${jni.base} + ${jni.build} + Unix Makefiles + linux-x86_64 + + ${jni.build} + + + + + cmake-compile + process-resources + + compile + + + ${jni.build} + linux-x86_64 + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.0.0 + + + add-source + generate-sources + + add-source + + + + ${jni.build} + + + + + + + maven-antrun-plugin + 1.8 + + + process-classes + + + ${jni.base} + + + + + + + + + run + + + + + + + + + + org.eclipse.rdf4j + rdf4j-rio-api + ${rdf4j.version} + + + + + junit + junit + ${junit.version} + test + + + com.carrotsearch + junit-benchmarks + ${junit.benchmarks.version} + test + + + org.slf4j + slf4j-simple + ${slf4j.version} + test + + + org.semarglproject + semargl-rdf4j + ${semargl.rdf4j.version} + test + + + + \ No newline at end of file diff --git a/librdfa-rdf4j/src/main/c/CMakeLists.txt b/librdfa-rdf4j/src/main/c/CMakeLists.txt new file mode 100644 index 000000000..ddd9e6ebc --- /dev/null +++ b/librdfa-rdf4j/src/main/c/CMakeLists.txt @@ -0,0 +1,39 @@ +cmake_minimum_required(VERSION 2.8) + +# Check if required packages are installed 
+find_package(SWIG REQUIRED) +find_package(Java REQUIRED) +find_package(JNI REQUIRED) +find_package(LibXml2 REQUIRED) + +# Add modules +include(UseJava) +include(UseSWIG) + +# Add directories to the build process +set( CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR} ) + +include_directories(${LIBXML2_INCLUDE_DIR}) +include_directories(${JNI_INCLUDE_DIRS}) +include_directories(${CMAKE_CURRENT_BINARY_DIR}/..) + +# Link library. TODO: replace because of deprecation +link_libraries(rdfa) + +# Build the C++ code into a dynamic library: rdfaJava.dll (on Windows) or librdfaJava.so (on Linux) +set(CMAKE_SWIG_FLAGS -package org.apache.any23.rdf.librdfa) +set(CMAKE_SWIG_OUTDIR "${CMAKE_CURRENT_BINARY_DIR}/org/apache/any23/rdf/librdfa") +set_property(SOURCE rdfa.i PROPERTY CPLUSPLUS ON) +swig_add_module( + rdfaJava + java + rdfa.i + RdfaParser.cpp +) + +# For convenience we copy the dynamic library to the current build folder +add_custom_command( + TARGET rdfaJava + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different $ ${CMAKE_CURRENT_BINARY_DIR} +) diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.cpp b/librdfa-rdf4j/src/main/c/RdfaParser.cpp new file mode 100644 index 000000000..4d844c31b --- /dev/null +++ b/librdfa-rdf4j/src/main/c/RdfaParser.cpp @@ -0,0 +1,19 @@ +/* + * + * This file is part of librdfa. + * + * librdfa is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * librdfa is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with librdfa. If not, see . 
+ * + */ +#include "RdfaParser.h" \ No newline at end of file diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.h b/librdfa-rdf4j/src/main/c/RdfaParser.h new file mode 100644 index 000000000..b9cc4d988 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/RdfaParser.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2008 Digital Bazaar, Inc. + * + * This file is part of librdfa. + * + * librdfa is free software: you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation, either version 3 of the + * License, or (at your option) any later version. + * + * librdfa is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with librdfa. If not, see . + */ +#ifndef _RDFA_PARSER_H_ +#define _RDFA_PARSER_H_ + +#include +#include +#include +#include +#include +#include + +struct rdfacontext; + +class Callback { +public: + + virtual ~Callback() { + //std::cout << "Callback::~Callback()" << std::endl; + } + + virtual void default_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) { + } + + virtual void processor_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) { + } + + virtual char* fill_data(size_t buffer_length) { + } + + virtual size_t fill_len() { + } +}; + +/** + * The RdfaParser class is a wrapper class for Java to provide a + * simple API for using librdfa in Java. + */ +class RdfaParser { +private: + Callback *_callback; +public: + /** + * The base URI that will be used when resolving relative pathnames + * in the document. 
+ */ + std::string mBaseUri; + + /** + * The base RDFa context to use when setting the triple handler callback, + * buffer filler callback, and executing the parser call. + */ + rdfacontext* mBaseContext; + + RdfaParser(const char* baseUri) : _callback(0) { + mBaseUri = baseUri; + mBaseContext = rdfa_create_context(baseUri); + } + + /** + * Standard destructor. + */ + ~RdfaParser() { + rdfa_free_context(mBaseContext); + delCallback(); + } + + void c_process_default_graph_triple(rdftriple* triple, void* callback_data) { + _callback->default_graph(triple->subject, triple-> predicate, triple->object, triple->object_type, triple-> datatype, triple-> language); + rdfa_free_triple(triple); + } + + void c_process_processor_graph_triple(rdftriple* triple, void* callback_data) { + _callback->processor_graph(triple->subject, triple-> predicate, triple->object, triple->object_type, triple-> datatype, triple-> language); + rdfa_free_triple(triple); + } + + size_t c_fill_buffer(char* buffer, size_t buffer_length, void* callback_data) { + char* data = _callback->fill_data(buffer_length); + size_t size = _callback -> fill_len(); + memset(buffer, ' ', buffer_length); + memcpy(buffer, data, size); + + return size; + } + + /** + * Starts the parsing process for librdfa. When more data is + * required by the XML parser, the buffer filler callback is + * called. If triples are found, then the triple handler callback + * is called. 
+ */ + int parse() { + return rdfa_parse(mBaseContext); + } + + void delCallback() { + delete _callback; + _callback = 0; + } + + void setCallback(Callback *cb) { + delCallback(); + _callback = cb; + } +}; + +#endif /* _RDFA_PARSER_H_ */ diff --git a/librdfa-rdf4j/src/main/c/main.java b/librdfa-rdf4j/src/main/c/main.java new file mode 100644 index 000000000..6e7c50813 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/main.java @@ -0,0 +1,82 @@ + +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import org.apache.any23.rdf.librdfa.Callback; +import org.apache.any23.rdf.librdfa.RdfaParser; + +public class main { + + public static void main(String argv[]) { + System.loadLibrary("rdfaJava"); // Attempts to load example.dll (on Windows) or libexample.so (on Linux) + + System.out.println("Adding and calling a normal C++ callback"); + System.out.println("----------------------------------------"); + String ds = "\n" + + "\n" + + "\n" + + "\n" + + " Test 0001\n" + + "\n" + + "\n" + + "

This photo was taken by Mark Birbeck.

\n" + + "\n" + + ""; + + RdfaParser caller = new RdfaParser("http://www.google.com/"); + caller.init(); + Callback callback = new JavaCallback(new ByteArrayInputStream(ds.getBytes(StandardCharsets.UTF_8))); + caller.setCallback(callback); + + caller.parse(); + //rdfa.set_rdfa_parser(caller); + + } +} + +class JavaCallback extends Callback { + + BufferedReader bis = null; + int len = 0; + + public JavaCallback(InputStream is) { + super(); + bis = new BufferedReader(new InputStreamReader(is)); + } + + @Override + public void default_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { + System.out.println("default_graph(...)"); + System.out.println("S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + "DT=" + datatype + "LANG=" + language); + } + + @Override + public void processor_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { + System.out.println("processor_graph(...)"); + System.out.println("S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + datatype + "LANG=" + language); + } + + @Override + public String fill_data(long buffer_length) { + System.out.println("buffer_length:" + buffer_length); + StringBuilder sb = new StringBuilder(new StringBuffer((int) buffer_length)); + len = 0; + try { + for (int c; (c = bis.read()) != -1;) { + sb.append((char) c); + len++; + } + } catch (IOException ex) { + } + return sb.toString(); + } + + @Override + public long fill_len() { + return len; + } +} diff --git a/librdfa-rdf4j/src/main/c/rdfa.i b/librdfa-rdf4j/src/main/c/rdfa.i new file mode 100644 index 000000000..9f5213ca7 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/rdfa.i @@ -0,0 +1,40 @@ +%module(directors="1") rdfa +%feature("director") Callback; + +%{ + #include "RdfaParser.h" + + RdfaParser* gRdfaParser = NULL; + void process_default_graph_triple(rdftriple* triple, void* callback_data); + void 
process_processor_graph_triple(rdftriple* triple, void* callback_data); + size_t fill_buffer(char* buffer, size_t buffer_length, void* callback_data); +%} + +%constant int RDF_TYPE_NAMESPACE_PREFIX = RDF_TYPE_NAMESPACE_PREFIX; +%constant int RDF_TYPE_IRI = RDF_TYPE_IRI; +%constant int RDF_TYPE_PLAIN_LITERAL = RDF_TYPE_PLAIN_LITERAL; +%constant int RDF_TYPE_XML_LITERAL = RDF_TYPE_XML_LITERAL; +%constant int RDF_TYPE_TYPED_LITERAL = RDF_TYPE_TYPED_LITERAL; + +%{ + void process_default_graph_triple(rdftriple* triple, void* callback_data){ + gRdfaParser->c_process_default_graph_triple(triple, callback_data); + } + void process_processor_graph_triple(rdftriple* triple, void* callback_data){ + gRdfaParser->c_process_processor_graph_triple( triple, callback_data); + } + size_t fill_buffer(char* buffer, size_t buffer_length, void* callback_data){ + return gRdfaParser->c_fill_buffer(buffer, buffer_length, callback_data); + } +%} + +%include RdfaParser.h + +%extend RdfaParser { + void init (){ + gRdfaParser = self; + rdfa_set_default_graph_triple_handler(gRdfaParser->mBaseContext, &process_default_graph_triple); + rdfa_set_processor_graph_triple_handler(gRdfaParser->mBaseContext, &process_processor_graph_triple); + rdfa_set_buffer_filler(gRdfaParser->mBaseContext, &fill_buffer); + } +} diff --git a/librdfa-rdf4j/src/main/c/readme.md b/librdfa-rdf4j/src/main/c/readme.md new file mode 100644 index 000000000..5b1fd4a57 --- /dev/null +++ b/librdfa-rdf4j/src/main/c/readme.md @@ -0,0 +1,12 @@ +# Librdfa Integration with Java + +Build: + +`mkdir build` +`cd build` +`cmake ..` +`cmake --build .` + +Run: + +`java -Djava.library.path=. 
-jar Main.jar` diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java new file mode 100644 index 000000000..53cdf101f --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaFilter.java @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.rdf.rdfa; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.logging.Level; +import java.util.logging.Logger; +import org.apache.any23.rdf.librdfa.Callback; +import org.apache.any23.rdf.librdfa.rdfa; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.rio.RDFHandler; + +/** + * + * @author Julio Caguano + */ +public class LibrdfaFilter extends Callback { + + private BufferedReader bis = null; + private int len = 0; + private RDFHandler handler; + private ValueFactory valueFactory; + + public LibrdfaFilter(InputStream is) { + super(); + bis = new BufferedReader(new InputStreamReader(is)); + } + + public LibrdfaFilter(Reader reader) { + super(); + bis = new BufferedReader(reader); + } + + @Override + public void default_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { + IRI s = valueFactory.createIRI(subject); + IRI p = valueFactory.createIRI(predicate); + Value o = null; + + if (object_type == rdfa.RDF_TYPE_IRI) { // 1 + o = valueFactory.createIRI(object); + } else if (object_type == rdfa.RDF_TYPE_PLAIN_LITERAL) { // 2 + o = valueFactory.createLiteral(object); + } else if (object_type == rdfa.RDF_TYPE_XML_LITERAL) { // 3 + o = valueFactory.createLiteral(object, RDF.XMLLITERAL); + } else if (object_type == rdfa.RDF_TYPE_TYPED_LITERAL) { // 4 + if (datatype != null) { + IRI dt = valueFactory.createIRI(datatype); + o = valueFactory.createLiteral(object, dt); + } else { + o = valueFactory.createLiteral(object, language); + } + } + if (handler != null && o != null) { + Statement stmt = valueFactory.createStatement(s, p, o); + handler.handleStatement(stmt); + } else { + 
System.err.println("VALIDATE: S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + "DT=" + datatype + "LANG=" + language); + } + } + + @Override + public void processor_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { + if (handler != null && rdfa.RDF_TYPE_NAMESPACE_PREFIX == object_type) { // 0 + handler.handleNamespace(predicate, object); + } else { + System.out.println("Processor: S=" + subject + "\tP=" + predicate + "\tO=" + object + "\tOT=" + object_type + "\tDT:" + datatype + "\tLANG=" + language); + } + } + + @Override + public String fill_data(long buffer_length) { + char[] d = new char[(int) buffer_length]; + + try { + len = bis.read(d, 0, (int) buffer_length); + } catch (IOException ex) { + Logger.getLogger(LibrdfaFilter.class.getName()).log(Level.SEVERE, null, ex); + } + + return new String(d); + } + + @Override + public long fill_len() { + if (len == -1) { + return 0; + } + return len; + } + + public RDFHandler getHandler() { + return handler; + } + + public void setHandler(RDFHandler handler) { + this.handler = handler; + } + + public ValueFactory getValueFactory() { + return valueFactory; + } + + public void setValueFactory(ValueFactory valueFactory) { + this.valueFactory = valueFactory; + } + +} diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java new file mode 100644 index 000000000..cff321e74 --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParser.java @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.rdf.rdfa; + +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import org.apache.any23.rdf.librdfa.RdfaParser; +import org.apache.any23.rdf.rdfa.utils.LibraryLoader; +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFHandlerException; +import org.eclipse.rdf4j.rio.RDFParseException; +import org.eclipse.rdf4j.rio.helpers.AbstractRDFParser; + +/** + * + * @author Julio Caguano + */ +public class LibrdfaRDFaParser extends AbstractRDFParser { + + static { + try { + LibraryLoader.loadLibrary("rdfaJava"); + } catch (IOException ex) { + throw new RuntimeException(ex); + } + } + + @Override + public RDFFormat getRDFFormat() { + return RDFFormat.RDFA; + } + + @Override + public void parse(InputStream in, String baseURI) throws IOException, RDFParseException, RDFHandlerException { + if (in == null) { + throw new IllegalArgumentException("Input stream cannot be 'null'"); + } + if (baseURI == null) { + throw new IllegalArgumentException("Base URI cannot be 'null'"); + } + + RdfaParser parser = new RdfaParser(baseURI); + parser.init(); + + LibrdfaFilter filter = new LibrdfaFilter(in); + parser.setCallback(filter); + + filter.setHandler(rdfHandler); + filter.setValueFactory(valueFactory); + + int status = parser.parse(); + + parser.delCallback(); + filter.delete(); + } + + @Override + public void parse(Reader reader, 
String baseURI) throws IOException, RDFParseException, RDFHandlerException { + if (reader == null) { + throw new IllegalArgumentException("Input stream cannot be 'null'"); + } + if (baseURI == null) { + throw new IllegalArgumentException("Base URI cannot be 'null'"); + } + + RdfaParser parser = new RdfaParser(baseURI); + parser.init(); + LibrdfaFilter filter = new LibrdfaFilter(reader); + parser.setCallback(filter); + + filter.setHandler(rdfHandler); + filter.setValueFactory(valueFactory); + + parser.parse(); + + parser.delCallback(); + parser.delete(); + } + +} diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java new file mode 100644 index 000000000..996a61c4f --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/LibrdfaRDFaParserFactory.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.rdf.rdfa; + +import org.eclipse.rdf4j.rio.RDFFormat; +import org.eclipse.rdf4j.rio.RDFParser; +import org.eclipse.rdf4j.rio.RDFParserFactory; + +/** + * Parser factory to integrate the {@link LibrdfaRDFaParser} into RDF4j. 
+ * + * @author Julio Caguano + */ +public class LibrdfaRDFaParserFactory implements RDFParserFactory { + + @Override + public RDFFormat getRDFFormat() { + return RDFFormat.RDFA; + } + + @Override + public RDFParser getParser() { + return new LibrdfaRDFaParser(); + } + +} diff --git a/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java new file mode 100644 index 000000000..29cbe1049 --- /dev/null +++ b/librdfa-rdf4j/src/main/java/org/apache/any23/rdf/rdfa/utils/LibraryLoader.java @@ -0,0 +1,62 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.rdf.rdfa.utils; + + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +/** + * + * @author Julio Caguano + * + */ +public final class LibraryLoader { + + public static void loadLibrary(String name) throws IOException { + try { + System.loadLibrary(name); + } catch (UnsatisfiedLinkError e) { + String filename = System.mapLibraryName(name); + InputStream in = LibraryLoader.class.getClassLoader().getResourceAsStream(filename); + int pos = filename.lastIndexOf('.'); + File file = File.createTempFile(filename.substring(0, pos), filename.substring(pos)); + file.deleteOnExit(); + try { + byte[] buf = new byte[4096]; + OutputStream out = new FileOutputStream(file); + try { + while (in.available() > 0) { + int len = in.read(buf); + if (len >= 0) { + out.write(buf, 0, len); + } + } + } finally { + out.close(); + } + } finally { + in.close(); + } + System.load(file.getAbsolutePath()); + } + } + +} diff --git a/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory b/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory new file mode 100644 index 000000000..463600e9b --- /dev/null +++ b/librdfa-rdf4j/src/main/resources/META-INF/services/org.eclipse.rdf4j.rio.RDFParserFactory @@ -0,0 +1 @@ +org.apache.any23.rdf.rdfa.LibrdfaRDFaParserFactory diff --git a/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java new file mode 100644 index 000000000..c37f3d5c1 --- /dev/null +++ b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaBenchmarkTest.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.rdf.librdfa; + +import com.carrotsearch.junitbenchmarks.AbstractBenchmark; +import com.carrotsearch.junitbenchmarks.BenchmarkOptions; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser; +import org.eclipse.rdf4j.rio.RDFParser; +import org.eclipse.rdf4j.rio.helpers.StatementCollector; +import static org.junit.Assert.assertEquals; +import org.junit.Before; +import org.junit.Test; +import org.semarglproject.rdf4j.rdf.rdfa.RDF4JRDFaParser; + +/** + * + * @author Julio Caguano + */ +@BenchmarkOptions(callgc = false, benchmarkRounds = 20, warmupRounds = 0) +public class LibrdfaRDFaBenchmarkTest extends AbstractBenchmark { + + private final int ITERATIONS = 2000; + private String DOCUMENT = ""; + + @Before + public void init() { + DOCUMENT = "\n" + + "\n" + + "\n" + + "Speed Test

"; + for (int i = 0; i < ITERATIONS; i++) { + DOCUMENT += ""; + } + DOCUMENT += "

"; + } + + @Test + public void testSemargl() throws Exception { + runTest(new RDF4JRDFaParser()); + } + + @Test + public void testLibrdfa() throws IOException { + runTest(new LibrdfaRDFaParser()); + } + + private void runTest(RDFParser parser) throws IOException { + InputStream in = new ByteArrayInputStream(DOCUMENT.getBytes(StandardCharsets.UTF_8)); + StatementCollector sc = new StatementCollector(); + parser.setRDFHandler(sc); + parser.parse(in, "http://example.org/"); + assertEquals(ITERATIONS, sc.getStatements().size()); + } +} diff --git a/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java new file mode 100644 index 000000000..7974c4919 --- /dev/null +++ b/librdfa-rdf4j/src/test/java/org/apache/any23/rdf/librdfa/LibrdfaRDFaParserTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.rdf.librdfa; + +import java.io.IOException; +import java.io.InputStream; +import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.rio.RDFParser; +import org.eclipse.rdf4j.rio.helpers.ParseErrorCollector; +import org.eclipse.rdf4j.rio.helpers.StatementCollector; +import static org.junit.Assert.assertEquals; +import org.junit.Before; +import org.junit.Test; + +/** + * + * @author Julio Caguano + * + */ +public class LibrdfaRDFaParserTest { + + private ValueFactory vf; + private RDFParser parser; + private StatementCollector sc; + private ParseErrorCollector el; + + @Before + public void setUp() throws Exception { + + vf = SimpleValueFactory.getInstance(); + parser = new LibrdfaRDFaParser(); + sc = new StatementCollector(); + parser.setRDFHandler(sc); + el = new ParseErrorCollector(); +// parser.setParseErrorListener(el); + } + + @Test + public void testHtml() throws IOException { + try (final InputStream in = this.getClass().getResourceAsStream( + "/org/apache/any23/rdf/librdfa/site.html");) { + parser.parse(in, "http://example.org/"); + assertEquals(4, sc.getStatements().size()); + } + } + +} diff --git a/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html b/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html new file mode 100644 index 000000000..b80aca1a8 --- /dev/null +++ b/librdfa-rdf4j/src/test/resources/org/apache/any23/rdf/librdfa/site.html @@ -0,0 +1,17 @@ + + + + + Test + + +

+ + Julio Caguano + Julio Caguano + +

+ + \ No newline at end of file From a7969e2fc1b4accbfda3dcd5a74a6b7a267cd7a8 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Tue, 7 Aug 2018 18:41:02 -0500 Subject: [PATCH 10/11] Add header files, use any23 version, and correct typo. --- .gitignore | 1 + .../default-configuration.properties | 8 +- core/pom.xml | 6 +- .../rdfa/LibRdfaExtractorFactory.java | 13 +- librdfa-rdf4j/pom.xml | 271 +++++++++--------- librdfa-rdf4j/src/main/c/CMakeLists.txt | 15 + librdfa-rdf4j/src/main/c/RdfaParser.cpp | 27 +- librdfa-rdf4j/src/main/c/RdfaParser.h | 27 +- librdfa-rdf4j/src/main/c/main.java | 82 ------ librdfa-rdf4j/src/main/c/rdfa.i | 15 + .../src/test/resources/log4j.properties | 35 +++ 11 files changed, 247 insertions(+), 253 deletions(-) delete mode 100644 librdfa-rdf4j/src/main/c/main.java create mode 100644 librdfa-rdf4j/src/test/resources/log4j.properties diff --git a/.gitignore b/.gitignore index c0c5d24b3..63de6b912 100644 --- a/.gitignore +++ b/.gitignore @@ -10,3 +10,4 @@ pom.xml.versionsBackup **/maven-eclipse.xml **/any23-site/ **/nb*.xml +**/c/build/ diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties index 15077711d..3d211ff7e 100644 --- a/api/src/main/resources/default-configuration.properties +++ b/api/src/main/resources/default-configuration.properties @@ -43,14 +43,14 @@ any23.extraction.metadata.domain.per.entity=off # Allows to decide which RDFa Extractor to enable. # If 'on' will be activated the programmatic RDFa 1.1 Extractor -# (org.deri.any23.extractor.rdfa.RDFa11Extractor) otherwise will be -# registered the RDFa 1.0 legacy one (org.deri.any23.extractor.rdfa.RDFaExtractor). +# (org.apache.any23.extractor.rdfa.RDFa11Extractor) otherwise will be +# registered the RDFa 1.0 legacy one (org.apache.any23.extractor.rdfa.RDFaExtractor). any23.extraction.rdfa.programmatic=on # Allows to enable Librdfa Extractor. 
# If 'on' will override the extractors with the programmatic option, -# RDFa 1.1 Extractor (org.deri.any23.extractor.rdfa.RDFa11Extractor) and -# RDFa 1.0 Exctractor (org.deri.any23.extractor.rdfa.RDFaExtractor). +# RDFa 1.1 Extractor (org.apache.any23.extractor.rdfa.RDFa11Extractor) and +# RDFa 1.0 Extractor (org.apache.any23.extractor.rdfa.RDFaExtractor). # If the option is 'off' (by default), it will choose the especfied extractor # in the programmatic option (any23.extraction.rdfa.programmatic). any23.extraction.rdfa.librdfa=off diff --git a/core/pom.xml b/core/pom.xml index 3c54414d9..0a65ca7bf 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -64,9 +64,9 @@ - ${project.groupId} - apache-any23-librdfa - 1.0.0 + ${project.groupId} + apache-any23-librdfa + 1.0.0 diff --git a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java index b79e8733a..6d1d51e8c 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/rdfa/LibRdfaExtractorFactory.java @@ -1,11 +1,12 @@ /* - * Copyright 2018 The Apache Software Foundation. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 + * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, diff --git a/librdfa-rdf4j/pom.xml b/librdfa-rdf4j/pom.xml index ac760a221..1f6b82628 100644 --- a/librdfa-rdf4j/pom.xml +++ b/librdfa-rdf4j/pom.xml @@ -1,139 +1,152 @@ + - 4.0.0 + + org.apache.any23 - apache-any23-librdfa - 0.0.1-SNAPSHOT - jar - Apache Any23 :: Librdfa - - UTF-8 - 1.8 - 1.8 + apache-any23 + 2.3-SNAPSHOT + ../ + + + 4.0.0 + org.apache.any23 + apache-any23-librdfa + jar + Apache Any23 :: Librdfa + + + ${project.basedir}/src/main/c/ + ${jni.base}/build/ - ${project.basedir}/src/main/c/ - ${jni.base}/build/ - - 2.2.4 - 4.12 - 0.7.2 - 1.7.25 - 0.7 - + 4.12 + 0.7.2 + - - - - com.googlecode.cmake-maven-project - cmake-maven-plugin - 3.7.2-b1 - - - cmake-generate - - generate - - - ${jni.base} - ${jni.build} - Unix Makefiles - linux-x86_64 - - ${jni.build} - - - - - cmake-compile - process-resources - - compile - - - ${jni.build} - linux-x86_64 - - - - - - org.codehaus.mojo - build-helper-maven-plugin - 3.0.0 - - - add-source - generate-sources - - add-source - - - - ${jni.build} - - - - - - - maven-antrun-plugin - 1.8 - - - process-classes - - - ${jni.base} + + + + com.googlecode.cmake-maven-project + cmake-maven-plugin + 3.7.2-b1 + + + cmake-generate + + generate + + + ${jni.base} + ${jni.build} + Unix Makefiles + linux-x86_64 + + ${jni.build} + + + + + cmake-compile + process-resources + + compile + + + ${jni.build} + linux-x86_64 + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.0.0 + + + add-source + generate-sources + + add-source + + + + ${jni.build} + + + + + + + maven-antrun-plugin + 1.8 + + + process-classes + + + ${jni.base} - - - - - - - - run - - - - - - + + + + + + + + run + + + + + + - - - org.eclipse.rdf4j - rdf4j-rio-api - ${rdf4j.version} - + + + 
org.eclipse.rdf4j + rdf4j-rio-api + - - - junit - junit - ${junit.version} - test - - - com.carrotsearch - junit-benchmarks - ${junit.benchmarks.version} - test - - - org.slf4j - slf4j-simple - ${slf4j.version} - test - - - org.semarglproject - semargl-rdf4j - ${semargl.rdf4j.version} - test - - + + + junit + junit + test + + + com.carrotsearch + junit-benchmarks + ${junit.benchmarks.version} + test + + + org.slf4j + slf4j-log4j12 + test + + + org.semarglproject + semargl-rdf4j + test + + \ No newline at end of file diff --git a/librdfa-rdf4j/src/main/c/CMakeLists.txt b/librdfa-rdf4j/src/main/c/CMakeLists.txt index ddd9e6ebc..ef5f08a6c 100644 --- a/librdfa-rdf4j/src/main/c/CMakeLists.txt +++ b/librdfa-rdf4j/src/main/c/CMakeLists.txt @@ -1,3 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + cmake_minimum_required(VERSION 2.8) # Check if required packages are installed diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.cpp b/librdfa-rdf4j/src/main/c/RdfaParser.cpp index 4d844c31b..990756449 100644 --- a/librdfa-rdf4j/src/main/c/RdfaParser.cpp +++ b/librdfa-rdf4j/src/main/c/RdfaParser.cpp @@ -1,19 +1,18 @@ /* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * This file is part of librdfa. - * - * librdfa is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * librdfa is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with librdfa. If not, see . + * http://www.apache.org/licenses/LICENSE-2.0 * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + #include "RdfaParser.h" \ No newline at end of file diff --git a/librdfa-rdf4j/src/main/c/RdfaParser.h b/librdfa-rdf4j/src/main/c/RdfaParser.h index b9cc4d988..7a8f769af 100644 --- a/librdfa-rdf4j/src/main/c/RdfaParser.h +++ b/librdfa-rdf4j/src/main/c/RdfaParser.h @@ -1,20 +1,18 @@ /* - * Copyright (c) 2008 Digital Bazaar, Inc. + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at * - * This file is part of librdfa. + * http://www.apache.org/licenses/LICENSE-2.0 * - * librdfa is free software: you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * librdfa is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with librdfa. If not, see . + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
*/ #ifndef _RDFA_PARSER_H_ #define _RDFA_PARSER_H_ @@ -32,7 +30,6 @@ class Callback { public: virtual ~Callback() { - //std::cout << "Callback::~Callback()" << std::endl; } virtual void default_graph(char* subject, char* predicate, char* object, int object_type, char* datatype, char* language) { diff --git a/librdfa-rdf4j/src/main/c/main.java b/librdfa-rdf4j/src/main/c/main.java deleted file mode 100644 index 6e7c50813..000000000 --- a/librdfa-rdf4j/src/main/c/main.java +++ /dev/null @@ -1,82 +0,0 @@ - -import java.io.BufferedReader; -import java.io.ByteArrayInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.nio.charset.StandardCharsets; -import org.apache.any23.rdf.librdfa.Callback; -import org.apache.any23.rdf.librdfa.RdfaParser; - -public class main { - - public static void main(String argv[]) { - System.loadLibrary("rdfaJava"); // Attempts to load example.dll (on Windows) or libexample.so (on Linux) - - System.out.println("Adding and calling a normal C++ callback"); - System.out.println("----------------------------------------"); - String ds = "\n" - + "\n" - + "\n" - + "\n" - + " Test 0001\n" - + "\n" - + "\n" - + "

This photo was taken by Mark Birbeck.

\n" - + "\n" - + ""; - - RdfaParser caller = new RdfaParser("http://www.google.com/"); - caller.init(); - Callback callback = new JavaCallback(new ByteArrayInputStream(ds.getBytes(StandardCharsets.UTF_8))); - caller.setCallback(callback); - - caller.parse(); - //rdfa.set_rdfa_parser(caller); - - } -} - -class JavaCallback extends Callback { - - BufferedReader bis = null; - int len = 0; - - public JavaCallback(InputStream is) { - super(); - bis = new BufferedReader(new InputStreamReader(is)); - } - - @Override - public void default_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { - System.out.println("default_graph(...)"); - System.out.println("S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + "DT=" + datatype + "LANG=" + language); - } - - @Override - public void processor_graph(String subject, String predicate, String object, int object_type, String datatype, String language) { - System.out.println("processor_graph(...)"); - System.out.println("S=" + subject + "P=" + predicate + "O=" + object + "OT=" + object_type + datatype + "LANG=" + language); - } - - @Override - public String fill_data(long buffer_length) { - System.out.println("buffer_length:" + buffer_length); - StringBuilder sb = new StringBuilder(new StringBuffer((int) buffer_length)); - len = 0; - try { - for (int c; (c = bis.read()) != -1;) { - sb.append((char) c); - len++; - } - } catch (IOException ex) { - } - return sb.toString(); - } - - @Override - public long fill_len() { - return len; - } -} diff --git a/librdfa-rdf4j/src/main/c/rdfa.i b/librdfa-rdf4j/src/main/c/rdfa.i index 9f5213ca7..68b8f1d23 100644 --- a/librdfa-rdf4j/src/main/c/rdfa.i +++ b/librdfa-rdf4j/src/main/c/rdfa.i @@ -1,3 +1,18 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. 
+// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + %module(directors="1") rdfa %feature("director") Callback; diff --git a/librdfa-rdf4j/src/test/resources/log4j.properties b/librdfa-rdf4j/src/test/resources/log4j.properties new file mode 100644 index 000000000..32492dd43 --- /dev/null +++ b/librdfa-rdf4j/src/test/resources/log4j.properties @@ -0,0 +1,35 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +log4j.rootCategory=INFO, O + +# Stdout +log4j.appender.O=org.apache.log4j.ConsoleAppender + +# File +#log4j.appender.R=org.apache.log4j.RollingFileAppender +#log4j.appender.R.File=log4j.log + +# Control the maximum log file size +#log4j.appender.R.MaxFileSize=100KB + +# Archive log files (one backup file here) +log4j.appender.R.MaxBackupIndex=1 + +log4j.appender.R.layout=org.apache.log4j.PatternLayout +log4j.appender.O.layout=org.apache.log4j.PatternLayout + +log4j.appender.R.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n +log4j.appender.O.layout.ConversionPattern=[%d{ISO8601}]%5p%6.6r[%t]%x - %C.%M(%F:%L) - %m%n From 04f2adf33bf83fbdedf0c8b276bb7f56081e7141 Mon Sep 17 00:00:00 2001 From: Julio Caguano Date: Wed, 8 Aug 2018 18:01:12 -0500 Subject: [PATCH 11/11] Add librdfa-rdf4j --- core/pom.xml | 11 ++--------- librdfa-rdf4j/README.MD | 11 ++--------- librdfa-rdf4j/pom.xml | 2 +- librdfa-rdf4j/src/main/c/readme.md | 4 ---- pom.xml | 1 + 5 files changed, 6 insertions(+), 23 deletions(-) diff --git a/core/pom.xml b/core/pom.xml index de8a57026..b066d55f9 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -62,11 +62,11 @@ - + ${project.groupId} apache-any23-librdfa - 1.0.0 + ${project.version} @@ -356,13 +356,6 @@ - - - librdfa-rdf4j - https://raw.github.com/JulioCCBUcuenca/librdfa-java/repository/ - - - diff --git a/librdfa-rdf4j/README.MD b/librdfa-rdf4j/README.MD index 4df84f96b..2e73818f9 100644 --- a/librdfa-rdf4j/README.MD +++ b/librdfa-rdf4j/README.MD @@ -1,6 +1,6 @@ # Librdfa - RDF4J -RDF4J parser that uses [librdfa](https://github.com/rdfa/librdfa) to parse RDFa to triples. +RDF4J parser that uses [librdfa](https://github.com/rdfa/librdfa) to parse RDFa to triples. See the [documentation](https://cwiki.apache.org/confluence/display/ANY23/Librdfa-rdf4j+documentation) for more information. ## Prerequisites @@ -12,15 +12,8 @@ You need to install the [librdfa](https://github.com/rdfa/librdfa) library. 
org.apache.any23 apache-any23-librdfa - 1.0.0 + ${project.version} - - - - librdfa-rdf4j - https://raw.github.com/JulioCCBUcuenca/librdfa-java/repository/ - - ``` ## Compile diff --git a/librdfa-rdf4j/pom.xml b/librdfa-rdf4j/pom.xml index 1f6b82628..4ead804b5 100644 --- a/librdfa-rdf4j/pom.xml +++ b/librdfa-rdf4j/pom.xml @@ -28,7 +28,7 @@ org.apache.any23 apache-any23-librdfa jar - Apache Any23 :: Librdfa + Apache Any23 :: Librdfa-RDF4J ${project.basedir}/src/main/c/ diff --git a/librdfa-rdf4j/src/main/c/readme.md b/librdfa-rdf4j/src/main/c/readme.md index 5b1fd4a57..f3538b428 100644 --- a/librdfa-rdf4j/src/main/c/readme.md +++ b/librdfa-rdf4j/src/main/c/readme.md @@ -6,7 +6,3 @@ Build: `cd build` `cmake ..` `cmake --build .` - -Run: - -`java -Djava.library.path=. -jar Main.jar` diff --git a/pom.xml b/pom.xml index ce2ee5d17..e5d1038e1 100644 --- a/pom.xml +++ b/pom.xml @@ -232,6 +232,7 @@ csvutils mime encoding + librdfa-rdf4j core cli plugins/basic-crawler