Skip to content
This repository was archived by the owner on Jul 3, 2023. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@ pom.xml.versionsBackup
**/maven-eclipse.xml
**/any23-site/
**/nb*.xml
**/c/build/
12 changes: 10 additions & 2 deletions api/src/main/resources/default-configuration.properties
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,18 @@ any23.extraction.metadata.domain.per.entity=off

# Allows to decide which RDFa Extractor to enable.
# If 'on' will be activated the programmatic RDFa 1.1 Extractor
# (org.deri.any23.extractor.rdfa.RDFa11Extractor) otherwise will be
# registered the RDFa 1.0 legacy one (org.deri.any23.extractor.rdfa.RDFaExtractor).
# (org.apache.any23.extractor.rdfa.RDFa11Extractor) otherwise will be
# registered the RDFa 1.0 legacy one (org.apache.any23.extractor.rdfa.RDFaExtractor).
any23.extraction.rdfa.programmatic=on

# Allows enabling the Librdfa Extractor.
# If 'on', it overrides the extractors selected by the programmatic option:
# the RDFa 1.1 Extractor (org.apache.any23.extractor.rdfa.RDFa11Extractor) and
# the RDFa 1.0 Extractor (org.apache.any23.extractor.rdfa.RDFaExtractor).
# If 'off' (the default), the extractor specified by the programmatic
# option (any23.extraction.rdfa.programmatic) is used.
any23.extraction.rdfa.librdfa=off

# The extraction context IRI to be used by the
# SingleDocumentExtraction. If == '?' the document IRI will
# be used. It can be overridden by specifying a different
Expand Down
10 changes: 9 additions & 1 deletion core/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,14 @@
</dependency>
<!-- END: Any23 -->

<!-- BEGIN: Librdfa-RDF4J-->
<dependency>
<groupId>${project.groupId}</groupId>
<artifactId>apache-any23-librdfa</artifactId>
<version>${project.version}</version>
</dependency>
<!-- END: Librdfa-RDF4J -->

<!-- BEGIN: httpcomponents -->
<dependency> <!-- used by RDF4J, Tika -->
<groupId>org.apache.httpcomponents</groupId>
Expand Down Expand Up @@ -347,7 +355,7 @@
</dependency>
<!-- END: Test Dependencies -->
</dependencies>

<build>
<resources>
<resource>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,15 +17,15 @@

package org.apache.any23.extractor;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.extractor.html.HTMLMetaExtractorFactory;
import org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory;
import org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory;
import org.apache.any23.extractor.rdfa.RDFaExtractorFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

/**
* Singleton class acting as a register for all the various
* {@link Extractor}.
Expand Down Expand Up @@ -55,12 +55,17 @@ public static ExtractorRegistry getInstance() {
if (instance == null) {
instance = new ExtractorRegistryImpl();

if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) {
if(conf.getFlagProperty("any23.extraction.rdfa.librdfa")){
instance.unregister(RDFaExtractorFactory.NAME);
instance.unregister(RDFa11ExtractorFactory.NAME);
} else if(conf.getFlagProperty("any23.extraction.rdfa.programmatic")) {
instance.unregister(LibRdfaExtractorFactory.NAME);
instance.unregister(RDFaExtractorFactory.NAME);
// FIXME: Unregister RDFaExtractor if flag is not set
//instance.register(RDFa11Extractor.factory);
} else {
instance.unregister(RDFa11ExtractorFactory.NAME);
instance.unregister(LibRdfaExtractorFactory.NAME);
// FIXME: Unregister RDFaExtractor if flag is set
//instance.register(RDFaExtractor.factory);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,16 @@

package org.apache.any23.extractor.rdf;

import org.apache.any23.extractor.IssueReport;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;
import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.IssueReport;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.rdfa.LibrdfaRDFaParser;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.rio.ParseErrorListener;
import org.eclipse.rdf4j.rio.RDFFormat;
Expand All @@ -36,12 +42,6 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
import java.util.Collections;
import java.util.HashSet;

/**
* This factory provides a common logic for creating and configuring correctly
* any <i>RDF</i> parser used within the library.
Expand Down Expand Up @@ -124,6 +124,27 @@ public RDFParser getRDFa11Parser(
configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
return parser;
}

/**
* Returns a new instance of a configured RDFaParser using the librdfa library.
*
* @param verifyDataType data verification enable if <code>true</code>.
* @param stopAtFirstError the parser stops at first error if <code>true</code>.
* @param extractionContext the extraction context where the parser is used.
* @param extractionResult the output extraction result.
* @return a new instance of a configured RDFXML parser.
*/
public RDFParser getRDFaLibrdfaParser(
final boolean verifyDataType,
final boolean stopAtFirstError,
final ExtractionContext extractionContext,
final ExtractionResult extractionResult
) {
final RDFParser parser = new LibrdfaRDFaParser();
parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1);
configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
return parser;
}

/**
* Returns a new instance of a configured RDFXMLParser.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.rdfa;

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.rdf.BaseRDFExtractor;
import org.apache.any23.extractor.rdf.RDFParserFactory;
import org.eclipse.rdf4j.rio.RDFParser;

/**
*
* @author Julio Caguano
*/
public class LibRdfaExtractor extends BaseRDFExtractor {

public LibRdfaExtractor(boolean verifyDataType, boolean stopAtFirstError) {
super(verifyDataType, stopAtFirstError);
}

public LibRdfaExtractor() {
this(false, false);
}

@Override
protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) {
return RDFParserFactory.getInstance().getRDFaLibrdfaParser(
isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult
);
}

@Override
public ExtractorDescription getDescription() {
return LibRdfaExtractorFactory.getDescriptionInstance();
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.any23.extractor.rdfa;

import java.util.Arrays;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.extractor.ExtractorFactory;
import org.apache.any23.extractor.SimpleExtractorFactory;
import org.apache.any23.rdf.Prefixes;

/**
*
* @author Julio Caguano
*/
public class LibRdfaExtractorFactory extends SimpleExtractorFactory<LibRdfaExtractor>
implements ExtractorFactory<LibRdfaExtractor> {

public static final String NAME = "html-librdfa";
public static final Prefixes PREFIXES = null;

private static final ExtractorDescription descriptionInstance = new LibRdfaExtractorFactory();

public LibRdfaExtractorFactory() {
super(LibRdfaExtractorFactory.NAME,
LibRdfaExtractorFactory.PREFIXES,
Arrays.asList("text/html;q=0.3", "application/xhtml+xml;q=0.3"),
"example-rdfa11.html");
}

@Override
public LibRdfaExtractor createExtractor() {
return new LibRdfaExtractor();
}

public static ExtractorDescription getDescriptionInstance() {
return descriptionInstance;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,6 @@ org.apache.any23.extractor.rdf.TriXExtractorFactory
org.apache.any23.extractor.rdf.TurtleExtractorFactory
org.apache.any23.extractor.rdfa.RDFa11ExtractorFactory
org.apache.any23.extractor.rdfa.RDFaExtractorFactory
org.apache.any23.extractor.rdfa.LibRdfaExtractorFactory
org.apache.any23.extractor.xpath.XPathExtractorFactory
org.apache.any23.extractor.yaml.YAMLExtractorFactory
Loading