From db176414b03c8c9f66acdc3f9e85a700f52f304a Mon Sep 17 00:00:00 2001 From: Taher Rahgooy Date: Tue, 18 Jul 2017 09:56:47 +0430 Subject: [PATCH 1/5] Code refactoring --- .../SpRL2013/LANDMARK.java | 21 +++++---- .../SpRL2013/SPATIALINDICATOR.java | 21 +++++---- .../SpRL2013/TRAJECTOR.java | 21 +++++---- .../SpatialRoleLabeling/SpRLAnnotation.java | 6 +-- .../SpatialRoleLabeling/SpRLDataReader.java | 44 +++++++++++++------ 5 files changed, 63 insertions(+), 50 deletions(-) diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/LANDMARK.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/LANDMARK.java index 8135c62e..78a92bbe 100755 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/LANDMARK.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/LANDMARK.java @@ -17,7 +17,6 @@ import edu.illinois.cs.cogcomp.saulexamples.nlp.SpatialRoleLabeling.SpRLAnnotation; import javax.xml.bind.annotation.*; -import java.math.BigInteger; @XmlAccessorType(XmlAccessType.FIELD) @@ -28,9 +27,9 @@ public class LANDMARK implements SpRLAnnotation { @XmlAttribute(name = "id", required = true) protected String id; @XmlAttribute(name = "start", required = true) - protected BigInteger start; + protected Integer start; @XmlAttribute(name = "end", required = true) - protected BigInteger end; + protected Integer end; @XmlAttribute(name = "text", required = true) protected String text; @@ -63,10 +62,10 @@ public void setId(String value) { * * @return * possible object is - * {@link BigInteger } + * {@link Integer } * */ - public BigInteger getStart() { + public Integer getStart() { return start; } @@ -75,10 +74,10 @@ public BigInteger getStart() { * * @param value * allowed object is - * {@link BigInteger } + * {@link Integer } * */ - public void setStart(BigInteger value) { + 
public void setStart(Integer value) { this.start = value; } @@ -87,10 +86,10 @@ public void setStart(BigInteger value) { * * @return * possible object is - * {@link BigInteger } + * {@link Integer } * */ - public BigInteger getEnd() { + public Integer getEnd() { return end; } @@ -99,10 +98,10 @@ public BigInteger getEnd() { * * @param value * allowed object is - * {@link BigInteger } + * {@link Integer } * */ - public void setEnd(BigInteger value) { + public void setEnd(Integer value) { this.end = value; } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/SPATIALINDICATOR.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/SPATIALINDICATOR.java index 81e79884..c10dcdbb 100755 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/SPATIALINDICATOR.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/SPATIALINDICATOR.java @@ -16,7 +16,6 @@ import edu.illinois.cs.cogcomp.saulexamples.nlp.SpatialRoleLabeling.SpRLAnnotation; -import java.math.BigInteger; import javax.xml.bind.annotation.XmlAccessType; import javax.xml.bind.annotation.*; @@ -28,9 +27,9 @@ public class SPATIALINDICATOR implements SpRLAnnotation { @XmlAttribute(name = "id", required = true) protected String id; @XmlAttribute(name = "start", required = true) - protected BigInteger start; + protected Integer start; @XmlAttribute(name = "end", required = true) - protected BigInteger end; + protected Integer end; @XmlAttribute(name = "text", required = true) protected String text; @@ -63,10 +62,10 @@ public void setId(String value) { * * @return * possible object is - * {@link BigInteger } + * {@link Integer } * */ - public BigInteger getStart() { + public Integer getStart() { return start; } @@ -75,10 +74,10 @@ public BigInteger getStart() { * * @param value * allowed object is - * {@link 
BigInteger } + * {@link Integer } * */ - public void setStart(BigInteger value) { + public void setStart(Integer value) { this.start = value; } @@ -87,10 +86,10 @@ public void setStart(BigInteger value) { * * @return * possible object is - * {@link BigInteger } + * {@link Integer } * */ - public BigInteger getEnd() { + public Integer getEnd() { return end; } @@ -99,10 +98,10 @@ public BigInteger getEnd() { * * @param value * allowed object is - * {@link BigInteger } + * {@link Integer } * */ - public void setEnd(BigInteger value) { + public void setEnd(Integer value) { this.end = value; } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/TRAJECTOR.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/TRAJECTOR.java index f0af8dfc..234f8e57 100755 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/TRAJECTOR.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRL2013/TRAJECTOR.java @@ -16,7 +16,6 @@ import edu.illinois.cs.cogcomp.saulexamples.nlp.SpatialRoleLabeling.SpRLAnnotation; -import java.math.BigInteger; import javax.xml.bind.annotation.*; @@ -28,9 +27,9 @@ public class TRAJECTOR implements SpRLAnnotation { @XmlAttribute(name = "id", required = true) protected String id; @XmlAttribute(name = "start", required = true) - protected BigInteger start; + protected Integer start; @XmlAttribute(name = "end", required = true) - protected BigInteger end; + protected Integer end; @XmlAttribute(name = "text", required = true) protected String text; @@ -58,9 +57,9 @@ public void setId(String value) { * Gets the value of the start property. 
* * @return possible object is - * {@link BigInteger } + * {@link Integer } */ - public BigInteger getStart() { + public Integer getStart() { return start; } @@ -68,9 +67,9 @@ public BigInteger getStart() { * Sets the value of the start property. * * @param value allowed object is - * {@link BigInteger } + * {@link Integer } */ - public void setStart(BigInteger value) { + public void setStart(Integer value) { this.start = value; } @@ -78,9 +77,9 @@ public void setStart(BigInteger value) { * Gets the value of the end property. * * @return possible object is - * {@link BigInteger } + * {@link Integer } */ - public BigInteger getEnd() { + public Integer getEnd() { return end; } @@ -88,9 +87,9 @@ public BigInteger getEnd() { * Sets the value of the end property. * * @param value allowed object is - * {@link BigInteger } + * {@link Integer } */ - public void setEnd(BigInteger value) { + public void setEnd(Integer value) { this.end = value; } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLAnnotation.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLAnnotation.java index 5388f638..698c9bbe 100644 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLAnnotation.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLAnnotation.java @@ -6,14 +6,12 @@ */ package edu.illinois.cs.cogcomp.saulexamples.nlp.SpatialRoleLabeling; -import java.math.BigInteger; - /** * Created by taher on 7/30/16. 
*/ public interface SpRLAnnotation { - BigInteger getStart(); - BigInteger getEnd(); + Integer getStart(); + Integer getEnd(); String getText(); String getId(); } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLDataReader.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLDataReader.java index 3303ef83..f713e8b6 100644 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLDataReader.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/SpatialRoleLabeling/SpRLDataReader.java @@ -24,35 +24,53 @@ public class SpRLDataReader { private final Class jaxbClass; public List documents; public final String corpusPath; + public final String extension; - public SpRLDataReader(String corpusPath, Class jaxbClass) { + public SpRLDataReader(String corpusPath, Class jaxbClass) { + this(corpusPath, jaxbClass, ".xml"); + } + + public SpRLDataReader(String corpusPath, Class jaxbClass, String extension) { this.corpusPath = corpusPath; documents = new ArrayList<>(); this.jaxbClass = jaxbClass; + this.extension = extension; } public void readData() throws ParserConfigurationException, IOException, SAXException, JAXBException { File dir = new File(corpusPath); - if(!dir.exists()) - throw new IOException("Cannot find '" + dir.getAbsolutePath() + "' path."); - Collection files = getAllFiles(dir); - documents = new ArrayList<>(); - for(File f : files) { - T doc = XmlModel.load(jaxbClass,f); - doc.setFilename(f.getName()); + if (dir.isFile() && dir.getName().toLowerCase().endsWith(extension)) { + + documents = new ArrayList<>(); + T doc = XmlModel.load(jaxbClass, dir); + doc.setFilename(dir.getName()); documents.add(doc); + + } else { + + if (!dir.isDirectory()) + throw new IOException("Cannot find '" + dir.getAbsolutePath() + "' path."); + + Collection files = getAllFiles(dir); + documents = new ArrayList<>(); + for (File 
f : files) { + T doc = XmlModel.load(jaxbClass, f); + doc.setFilename(f.getName()); + documents.add(doc); + } } } - private static Collection getAllFiles(File dir) { + private Collection getAllFiles(File dir) { Set files = new HashSet<>(); for (File file : dir.listFiles()) { if (file.isFile()) { - if(file.getName().toLowerCase().endsWith(".xml")) + if (file.getName().toLowerCase().endsWith(extension)) files.add(file); - } - else files.addAll(getAllFiles(file)); + } else files.addAll(getAllFiles(file)); } - return files; + List sortedFiles = new ArrayList<>(files); + sortedFiles.sort((f1, f2) -> f1.getPath().compareToIgnoreCase(f2.getPath())); + return sortedFiles; } } From 96b0143e4ff3a8e4a05b8dcaba62f7536f1c0b67 Mon Sep 17 00:00:00 2001 From: Taher Rahgooy Date: Tue, 18 Jul 2017 09:58:08 +0430 Subject: [PATCH 2/5] base types added. --- .../saulexamples/nlp/BaseTypes/BaseTyps.md | 31 ++++++ .../saulexamples/nlp/BaseTypes/Document.java | 32 ++++++ .../nlp/BaseTypes/ExactMatching.java | 18 ++++ .../nlp/BaseTypes/ISpanElement.java | 22 ++++ .../nlp/BaseTypes/ISpanElementMatching.java | 16 +++ .../nlp/BaseTypes/InclusionMatching.java | 18 ++++ .../nlp/BaseTypes/NlpBaseElement.java | 90 ++++++++++++++++ .../nlp/BaseTypes/NlpBaseElementTypes.java | 14 +++ .../nlp/BaseTypes/OverlapMatching.java | 17 +++ .../nlp/BaseTypes/PartOfMatching.java | 18 ++++ .../saulexamples/nlp/BaseTypes/Phrase.java | 41 +++++++ .../saulexamples/nlp/BaseTypes/Relation.java | 102 ++++++++++++++++++ .../saulexamples/nlp/BaseTypes/Sentence.java | 36 +++++++ .../nlp/BaseTypes/SpanBasedElement.java | 75 +++++++++++++ .../saulexamples/nlp/BaseTypes/Token.java | 61 +++++++++++ 15 files changed, 591 insertions(+) create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/BaseTyps.md create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Document.java create mode 100644 
saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElement.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElement.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElementTypes.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Phrase.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Relation.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Sentence.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/SpanBasedElement.java create mode 100644 saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Token.java diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/BaseTyps.md b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/BaseTyps.md new file mode 100644 index 00000000..aea1f084 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/BaseTyps.md @@ -0,0 +1,31 @@ +# Saul NLP Base Types +Built in hierarchy of common linguistic units for processing text and NLP tasks +and a relation class 
that connects various linguistic units. +NLP base types help to design data models in Saul and provide many built-in feature +extractors and helpers on which the NLP sensors operate. + +## Hierarchy +All classes of the hierarchy are assumed to have a mandatory **id** that is unique throughout the entire corpus, +an optional **text**, and an optional character-based span given by the **start** index and the exclusive +**end** index of each linguistic unit. + +At the top level of the hierarchy we have the [`Document`](Document.java) class. +Each document contains many [`Sentences`](Sentence.java), which in turn can +have many [`Phrases`](Phrase.java), and finally each phrase can contain many [`Tokens`](Token.java). +Note that you can omit one or more of these linguistic units for specific use cases. + +### Properties +We can specify additional properties for all hierarchy classes using +`addPropertyValue`. This function adds a value to the list of values for that property. +The value list can be retrieved with the `getPropertyValues` function. +There is also `getPropertyFirstValue`, which returns the first +value of the list for that property. + +## Relation +Data modeling in Saul usually requires having edges between the model's nodes. +[Relations](Relation.java) provide a container that holds the information +needed to construct those edges. + +Each relation should have a unique **Id** and two or more **argumentId** values, which give +the ids of the linguistic units that are used in this relation. Additional properties can be added +using the `setProperty` function. 
diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Document.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Document.java new file mode 100644 index 00000000..e0d55aa1 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Document.java @@ -0,0 +1,32 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-18. + */ +public class Document extends NlpBaseElement { + public Document() { + } + + public Document(String id) { + super(id, -1, -1, ""); + } + + public Document(String id, Integer start, Integer end) { + super(id, start, end, ""); + } + + public Document(String id, Integer start, Integer end, String text) { + super(id, start, end, text); + } + + @Override + public NlpBaseElementTypes getType() { + return NlpBaseElementTypes.Document; + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java new file mode 100644 index 00000000..f463b1f2 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java @@ -0,0 +1,18 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-28. + */ +public class ExactMatching implements ISpanElementMatching { + + @Override + public boolean matches(ISpanElement xmlElement, ISpanElement element) { + return xmlElement.matches(element); + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElement.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElement.java new file mode 100644 index 00000000..e58eb2de --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElement.java @@ -0,0 +1,22 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-28. 
+ */ +public interface ISpanElement { + int getStart(); + void setStart(int start); + void setEnd(int end); + int getEnd(); + String getText(); + boolean matches(ISpanElement e); + boolean contains(ISpanElement e); + boolean isPartOf(ISpanElement e); + boolean overlaps(ISpanElement e); +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java new file mode 100644 index 00000000..4eb16d6a --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java @@ -0,0 +1,16 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +import edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes.ISpanElement; + +/** + * Created by Taher on 2016-12-28. + */ +public interface ISpanElementMatching { + boolean matches(ISpanElement xmlElement, ISpanElement element); +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java new file mode 100644 index 00000000..00eee8e2 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java @@ -0,0 +1,18 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-28. + */ +public class InclusionMatching implements ISpanElementMatching { + + @Override + public boolean matches(ISpanElement xmlElement, ISpanElement element) { + return xmlElement.contains(element); + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElement.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElement.java new file mode 100644 index 00000000..6d381e6c --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElement.java @@ -0,0 +1,90 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Created by Taher on 2016-12-18. 
+ */ +public abstract class NlpBaseElement extends SpanBasedElement { + private String id; + private String text; + private Map> properties = new HashMap<>(); + + public NlpBaseElement() { + setStart(-1); + setEnd(-1); + } + + public NlpBaseElement(String id, Integer start, Integer end, String text) { + this.setId(id); + this.setStart(start); + this.setEnd(end); + this.setText(text); + } + + public abstract NlpBaseElementTypes getType(); + + public boolean containsProperty(String name) { + return properties.containsKey(name) && !properties.get(name).isEmpty(); + } + + public String getPropertyFirstValue(String name) { + if (containsProperty(name)) + return properties.get(name).get(0); + return null; + } + + public List getPropertyValues(String name) { + if (containsProperty(name)) + return properties.get(name); + return new ArrayList<>(); + } + + public void addPropertyValue(String name, String value) { + if (!containsProperty(name)) + properties.put(name, new ArrayList<>()); + properties.get(name).add(value); + } + + public void removeProperty(String name) { + if (containsProperty(name)) + properties.remove(name); + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public static NlpBaseElement create(NlpBaseElementTypes type) { + + switch (type) { + case Document: + return new Document(); + case Sentence: + return new Sentence(); + case Phrase: + return new Phrase(); + case Token: + return new Token(); + } + return null; + } + + @Override + public String toString() { + return getText(); + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElementTypes.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElementTypes.java new file mode 100644 index 00000000..3c068c87 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/NlpBaseElementTypes.java @@ -0,0 +1,14 @@ +/** This 
software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-24. + */ +public enum NlpBaseElementTypes { + Document, Sentence, Phrase, Token +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java new file mode 100644 index 00000000..e2bdf807 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java @@ -0,0 +1,17 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-28. + */ +public class OverlapMatching implements ISpanElementMatching{ + @Override + public boolean matches(ISpanElement xmlElement, ISpanElement element) { + return xmlElement.overlaps(element); + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java new file mode 100644 index 00000000..203bc4c4 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java @@ -0,0 +1,18 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. 
See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-28. + */ +public class PartOfMatching implements ISpanElementMatching { + + @Override + public boolean matches(ISpanElement xmlElement, ISpanElement element) { + return xmlElement.isPartOf(element); + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Phrase.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Phrase.java new file mode 100644 index 00000000..b0363e5e --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Phrase.java @@ -0,0 +1,41 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-24. 
+ */ +public class Phrase extends NlpBaseElement { + + private Sentence sentence; + + public Phrase(){ + + } + + public Phrase(Sentence sentence, String id, Integer start, Integer end, String text) { + super(id, start, end, text); + this.sentence = sentence; + } + + @Override + public NlpBaseElementTypes getType() { + return NlpBaseElementTypes.Phrase; + } + + public Document getDocument() { + return getSentence() == null ? null : getSentence().getDocument(); + } + + public Sentence getSentence() { + return sentence; + } + + public void setSentence(Sentence sentence) { + this.sentence = sentence; + } +} \ No newline at end of file diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Relation.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Relation.java new file mode 100644 index 00000000..b7b57aa5 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Relation.java @@ -0,0 +1,102 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + *

+ * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +import java.util.*; + +/** + * Created by Taher on 2016-12-25. + */ +public class Relation { + private String id; + private NlpBaseElement parent; + private Map properties = new HashMap<>(); + private Map argumentIds = new HashMap<>(); + private Map arguments = new HashMap<>(); + + public Relation() { + id = ""; + } + + public Relation(String id) { + this.setId(id); + } + + public boolean containsProperty(String name) { + return properties.containsKey(name); + } + + public String getProperty(String name) { + if (properties.containsKey(name)) + return properties.get(name); + return null; + } + + public void setProperty(String name, String value) { + properties.put(name, value); + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public int getArgumentsCount() { + return argumentIds.size(); + } + + public void setArgumentId(int index, String argumentId) { + argumentIds.put(index, argumentId); + } + + public String getArgumentId(int index) { + if (!argumentIds.containsKey(index)) + return null; + return argumentIds.get(index); + } + + public void setArgument(int index, NlpBaseElement argument) { + arguments.put(index, argument); + } + + public NlpBaseElement getArgument(int index) { + if (!arguments.containsKey(index)) + return null; + return arguments.get(index); + } + public Collection getArgumentIds() { + return argumentIds.values(); + } + + public Collection getArguments() { + return arguments.values(); + } + + public boolean hasSameArguments(Relation r) { + if (r == null) + return getArgumentsCount() > 0; + if (getArgumentsCount() != r.getArgumentsCount()) + return false; + for (int i = 0; i < getArgumentsCount(); i++) { + if (!Objects.equals(getArgumentId(i), r.getArgumentId(i))) + return false; + } + return true; + } + + public 
NlpBaseElement getParent() { + return parent; + } + + public void setParent(NlpBaseElement parent) { + this.parent = parent; + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Sentence.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Sentence.java new file mode 100644 index 00000000..90cf2348 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Sentence.java @@ -0,0 +1,36 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-18. + */ +public class Sentence extends NlpBaseElement { + + private Document document; + + public Sentence() { + } + + public Sentence(Document document, String id, Integer start, Integer end, String text) { + super(id, start, end, text); + this.setDocument(document); + } + + @Override + public NlpBaseElementTypes getType() { + return NlpBaseElementTypes.Sentence; + } + + public Document getDocument() { + return document; + } + + public void setDocument(Document document) { + this.document = document; + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/SpanBasedElement.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/SpanBasedElement.java new file mode 100644 index 00000000..89500c8e --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/SpanBasedElement.java @@ -0,0 +1,75 @@ +/** + * This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + *

+ * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-28. + */ +public class SpanBasedElement implements ISpanElement { + + private int start; + private int end; + private String text; + + public int getStart() { + return start; + } + + public void setStart(int start) { + this.start = start; + } + + public int getEnd() { + return end; + } + + public void setEnd(int end) { + this.end = end; + } + + @Override + public boolean matches(ISpanElement e) { + if (e == null) + return false; + return getStart() == e.getStart() && + getEnd() == e.getEnd(); + } + + @Override + public boolean contains(ISpanElement e) { + if (e == null) + return false; + return matches(e) || + (getStart() <= e.getStart() && getEnd() >= e.getEnd()); + } + + @Override + public boolean isPartOf(ISpanElement e) { + if (e == null) + return false; + return e.contains(this); + } + + @Override + public boolean overlaps(ISpanElement e) { + if (e == null) + return false; + return matches(e) || + (getStart() <= e.getStart() && e.getStart() < getEnd()) || + (e.getStart() <= getStart() && getStart() < e.getEnd()); + } + + @Override + public String getText() { + return text; + } + + public void setText(String text) { + this.text = text; + } +} diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Token.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Token.java new file mode 100644 index 00000000..7901db37 --- /dev/null +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/Token.java @@ -0,0 +1,61 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes; + +/** + * Created by Taher on 2016-12-18. + */ +public class Token extends NlpBaseElement { + + private Sentence sentence; + private Phrase phrase; + + public Token() { + + } + + public Token(Sentence sentence, String id, Integer start, Integer end, String text) { + this(sentence, null, id, start, end, text); + } + + public Token(Phrase phrase, String id, Integer start, Integer end, String text) { + this(phrase.getSentence(), phrase, id, start, end, text); + } + + public Token(Sentence sentence, Phrase phrase, String id, Integer start, Integer end, String text) { + super(id, start, end, text); + this.setSentence(sentence); + this.setPhrase(phrase); + } + + @Override + public NlpBaseElementTypes getType() { + return NlpBaseElementTypes.Token; + } + + public Document getDocument() { + return getSentence().getDocument(); + } + + public Sentence getSentence() { + if (sentence != null) + return sentence; + return phrase != null ? phrase.getSentence() : null; + } + + public void setSentence(Sentence sentence) { + this.sentence = sentence; + } + + public Phrase getPhrase() { + return phrase; + } + + public void setPhrase(Phrase phrase) { + this.phrase = phrase; + } +} From 0fc68e1699bbb0ea1c330d1e908e7ae532232df8 Mon Sep 17 00:00:00 2001 From: Taher Rahgooy Date: Tue, 18 Jul 2017 09:58:24 +0430 Subject: [PATCH 3/5] base type sensors added. 
--- .../nlp/LanguageBaseTypeSensors.scala | 376 ++++++++++++++++++ .../nlp/LanguageBaseTypeSensorTests.scala | 118 ++++++ 2 files changed, 494 insertions(+) create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala create mode 100644 saul-examples/src/test/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensorTests.scala diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala new file mode 100644 index 00000000..43ee51c1 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala @@ -0,0 +1,376 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp + +import java.util.Properties + +import edu.illinois.cs.cogcomp.core.datastructures.{ViewNames, _} +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{Relation => _, Sentence => _, _} +import edu.illinois.cs.cogcomp.edison.features.FeatureUtilities +import edu.illinois.cs.cogcomp.edison.features.factory.{SubcategorizationFrame, WordFeatureExtractorFactory} +import edu.illinois.cs.cogcomp.edison.features.helpers.PathFeatureHelper +import edu.illinois.cs.cogcomp.nlp.common.PipelineConfigurator._ +import edu.illinois.cs.cogcomp.nlp.utilities.CollinsHeadFinder +import edu.illinois.cs.cogcomp.saul.util.Logging +import edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes._ +import edu.illinois.cs.cogcomp.saulexamples.nlp.SpatialRoleLabeling.SpRLSensors.dependencyView + +import scala.collection.JavaConverters._ 
+import scala.collection.mutable + +/** Created by parisakordjamshidi on 12/25/16. + */ +object LanguageBaseTypeSensors extends Logging { + private val dependencyView = ViewNames.DEPENDENCY_STANFORD + private val parserView = ViewNames.PARSE_STANFORD + private val sentenceById = mutable.HashMap[String, TextAnnotation]() + private val settings = new Properties() + TextAnnotationFactory.disableSettings(settings, USE_SRL_NOM, USE_NER_CONLL, USE_NER_ONTONOTES, USE_SRL_VERB) + private val as = TextAnnotationFactory.createPipelineAnnotatorService(settings) + + def documentToSentenceMatching(d: Document, s: Sentence): Boolean = { + d.getId == s.getDocument.getId + } + + def documentToSentenceGenerating(d: Document): Seq[Sentence] = { + getSentences(d) + } + + def sentenceToPhraseGenerating(s: Sentence): Seq[Phrase] = { + getPhrases(s) + } + + def sentenceToPhraseMatching(s: Sentence, p: Phrase): Boolean = { + s.getId == p.getSentence.getId + } + + def phraseToTokenGenerating(p: Phrase): Seq[Token] = { + getTokens(p) + } + + def phraseToTokenMatching(p: Phrase, t: Token): Boolean = { + if (t.getPhrase != null) + p.getId == t.getPhrase.getId + else + p.contains(t) + } + + def sentenceToTokenGenerating(s: Sentence): Seq[Token] = { + getTokens(s) + } + + def sentenceToTokenMatching(s: Sentence, t: Token): Boolean = { + s.getId == t.getSentence.getId + } + + def documentToRelationMatching(d: Document, r: Relation): Boolean = { + r.getParent != null && d.getId == getDocument(r.getParent).getId + } + + def sentenceToRelationMatching(s: Sentence, r: Relation): Boolean = { + r.getParent != null && s.getId == getSentence(r.getParent).getId + } + + def phraseToRelationMatching(p: Phrase, r: Relation): Boolean = { + r.getParent != null && p.getId == getPhrase(r.getParent).getId + } + + def relationToTokenMatching(r: Relation, t: Token): Boolean = { + r.getArgumentIds.contains(t.getId) + } + + def getPos(e: NlpBaseElement): Seq[String] = { + val constituents = 
getElementConstituents(e) + constituents.map(x => WordFeatureExtractorFactory.pos.getFeatures(x).asScala.mkString) + } + + def getPhrasePos(p: Phrase): String = { + val ta = getTextAnnotation(p) + val v = ta.getView(ViewNames.SHALLOW_PARSE) + v.getLabelsCoveringSpan(getStartTokenId(p), getEndTokenId(p) + 1).asScala.head + } + + def getLemma(e: NlpBaseElement): Seq[String] = { + val constituents = getElementConstituents(e) + constituents.map(x => WordFeatureExtractorFactory.lemma.getFeatures(x).asScala.mkString) + } + + def getHeadword(p: Phrase): Token = { + var ta = getTextAnnotation(p) + val (startId: Int, endId: Int) = getTextAnnotationSpan(p) + var phrase = new Constituent("temp", "", ta, startId, endId + 1) + var headId: Int = getHeadwordId(ta, phrase) + if (headId < startId || headId > endId) { + //when out of phrase, create a text annotation using just the phrase text + ta = TextAnnotationFactory.createTextAnnotation(as, "", "", p.getText) + phrase = new Constituent("temp", "", ta, 0, ta.getTokens.length) + headId = getHeadwordId(ta, phrase) + } + val head = ta.getView(ViewNames.TOKENS).asInstanceOf[TokenLabelView].getConstituentAtToken(headId) + new Token(p, p.getId + head.getSpan, head.getStartCharOffset, head.getEndCharOffset, head.toString) + } + + def getTokens(text: String): List[Token] = { + val ta = TextAnnotationFactory.createTextAnnotation(as, "", "", text) + ta.getView(ViewNames.TOKENS).getConstituents.asScala.map(x=> + new Token(null.asInstanceOf[Sentence], null, x.getStartCharOffset, x.getEndCharOffset, x.toString)).toList + } + + def getHeadword(text: String): (String, Int, Int) = { + val ta = TextAnnotationFactory.createTextAnnotation(as, "", "", text) + val phrase = new Constituent("temp", "", ta, 0, ta.getTokens.length) + val headId = getHeadwordId(ta, phrase) + val head = ta.getView(ViewNames.TOKENS).asInstanceOf[TokenLabelView].getConstituentAtToken(headId) + (head.toString, head.getStartCharOffset, head.getEndCharOffset) + } + + def 
getSemanticRole(e: NlpBaseElement): String = { + val ta = getTextAnnotation(e) + val view = if (ta.hasView(ViewNames.SRL_VERB)) { + ta.getView(ViewNames.SRL_VERB) + } else { + logger.warn("Cannot find SRL view") + null + } + val (startId: Int, endId: Int) = getTextAnnotationSpan(e) + view match { + case null => "" + case _ => view.getLabelsCoveringSpan(startId, endId + 1).asScala.mkString(",") + } + } + + def isBefore(t1: NlpBaseElement, t2: NlpBaseElement): Boolean = { + getStartTokenId(t1) < getStartTokenId(t2) + } + + def getTokenDistance(t1: NlpBaseElement, t2: NlpBaseElement): Int = { + Math.abs(getStartTokenId(t1) - getStartTokenId(t2)) + } + + def getCandidateRelations[T <: NlpBaseElement](argumentInstances: List[T]*): List[Relation] = { + if (argumentInstances.length < 2) { + List.empty + } else { + crossProduct(argumentInstances.seq.toList) + // don't consider elements that are from different parents(sentences) + .filter(args => args.filter(_ != null).groupBy { + case x: Token => x.getSentence.getId + case x: Phrase => x.getSentence.getId + case x: Sentence => x.getDocument.getId + case _ => null + }.size <= 1 && args.filter(_ != null) + .groupBy(_.getId).size == args.count(_ != null) // distinct arguments + ) + .map(args => { + val r = new Relation() + args.zipWithIndex.filter(x => x._1 != null).foreach { + case (a, i) => { + r.setArgumentId(i, a.getId) + r.setArgument(i, a) + r.setId(r.getId + "[" + i + ", " + a.getId + "]") + } + } + r + }) + } + } + + def getSubCategorization(e: NlpBaseElement): String = { + val (startId: Int, endId: Int) = getTextAnnotationSpan(e) + val ta = getTextAnnotation(e) + val v = ta.getView(ViewNames.TOKENS) + val constituents = v.getConstituentsCoveringSpan(startId, endId + 1).asScala + constituents + .map(x => FeatureUtilities.getFeatureSet(new SubcategorizationFrame(ViewNames.PARSE_STANFORD), x) + .asScala.mkString(",")).mkString(";") + } + + def getWindow(t: Token, before: Int, after: Int): Seq[String] = { + val id = 
getStartTokenId(t) + val ta = getTextAnnotation(t) + val start = Math.max(0, id - before) + val end = Math.min(ta.getTokens.length - 1, id + after) + ta.getTokens.slice(start, end) + } + + def getDependencyRelation(t: Token): String = { + val relations = getDependencyRelations(getTextAnnotation(t)) + val root = getDependencyRoot(relations) + if (root != null && root.getStartCharOffset == t.getStart) + "root" + else + relations.find(r => r.getTarget.getStartCharOffset == t.getStart) match { + case Some(r) => r.getRelationName + case _ => "" + } + } + + def getDependencyPath(t1: Token, t2: Token): String = { + + def getRelationName(relations: List[textannotation.Relation], c1: Constituent, c2: Constituent, dir: String): String = { + val r = relations.find(x => (x.getSource == c1 && x.getTarget == c2) || (x.getSource == c2 && x.getTarget == c1)) + r match { + case Some(r) => dir + r.getRelationName + case None => "" + } + } + + val ta = getTextAnnotation(t1) + val c1 = ta.getView(dependencyView).getConstituentsCoveringToken(getStartTokenId(t1)).get(0) + val c2 = ta.getView(dependencyView).getConstituentsCoveringToken(getStartTokenId(t2)).get(0) + + val parse: TreeView = ta.getView(dependencyView).asInstanceOf[TreeView] + + val relations = parse.getRelations.asScala.toList + val paths = PathFeatureHelper.getPathsToCommonAncestor(c1, c2, 400) + + val up = paths.getFirst.asScala.toList + val down = paths.getSecond.asScala.toList + + val path: StringBuilder = new StringBuilder + var i = 0 + while (i < up.size - 1) { + path.append(getRelationName(relations, up(i), up(i + 1), "↑")) + i += 1 + } + i = down.size - 1 + while (i > 0) { + path.append(getRelationName(relations, down(i), down(i - 1), "↓")) + i -= 1 + } + + path.toString.toUpperCase + } + + + //////////////////////////////////////////////////////////////////////////// + /// private methods + //////////////////////////////////////////////////////////////////////////// + private def crossProduct[T](input: 
List[List[T]]): List[List[T]] = input match { + case Nil => Nil + case head :: Nil => head.map(_ :: Nil) + case head :: tail => for (elem <- head; sub <- crossProduct(tail)) yield elem :: sub + } + + private def getHeadwordId(ta: TextAnnotation, phrase: Constituent) = { + val tree: TreeView = ta.getView(ViewNames.PARSE_STANFORD).asInstanceOf[TreeView] + val parsePhrase = tree.getParsePhrase(phrase) + val headId = CollinsHeadFinder.getInstance.getHeadWordPosition(parsePhrase) + headId + } + + private def getDependencyRoot(relations: Seq[textannotation.Relation]): Constituent = { + relations.find(x => relations.count(r => r.getTarget == x.getSource) == 0) match { + case Some(x) => x.getSource + case _ => null + } + } + + private def getDependencyRelations(ta: TextAnnotation): Seq[textannotation.Relation] = { + ta.getView(dependencyView).asInstanceOf[TreeView].getRelations.asScala + } + + private def getPhrase(e: NlpBaseElement) = e match { + case p: Phrase => p + case t: Token => t.getPhrase + case _ => + logger.warn("cannot use 'getPhrase' for document or Sentence type.") + null + } + + private def getSentence(e: NlpBaseElement) = e match { + case s: Sentence => s + case p: Phrase => p.getSentence + case t: Token => t.getSentence + case _ => + logger.warn("cannot use 'getSentence' for document type.") + null + } + + private def getDocument(e: NlpBaseElement) = e match { + case x: Document => x + case _ => getSentence(e).getDocument + } + + private def getSentences(document: Document): Seq[Sentence] = { + val ta = as.createBasicTextAnnotation("", document.getId, document.getText) + ta.sentences().asScala.map(x => + new Sentence(document, document.getId + "_" + x.getSentenceId, x.getSentenceConstituent.getStartCharOffset, + x.getSentenceConstituent.getEndCharOffset, x.getText)) + } + + private def getPhrases(sentence: Sentence): Seq[Phrase] = { + val ta = getTextAnnotation(sentence) + val v = ta.getView(ViewNames.SHALLOW_PARSE) + v.getConstituents.asScala.map(x => + 
new Phrase(sentence, generateId(sentence, x), x.getStartCharOffset, x.getEndCharOffset, x.toString)) + } + + def getTokens(e: NlpBaseElement): Seq[Token] = { + val ta = getTextAnnotation(e) + if (ta == null) + return Seq() + val v = ta.getView(ViewNames.TOKENS) + val (startId: Int, endId: Int) = getTextAnnotationSpan(e) + v.getConstituentsCoveringSpan(startId, endId + 1).asScala.map(x => + e match { + case p: Phrase => new Token(p, generateId(e, x), x.getStartCharOffset, x.getEndCharOffset, x.toString) + case s: Sentence => new Token(s, generateId(e, x), x.getStartCharOffset, x.getEndCharOffset, x.toString) + case _ => + logger.warn("cannot find tokens for base types other than phrase and sentence.") + null + }) + } + + private def generateId(e: NlpBaseElement, x: Constituent): String = { + e.getId + x.getSpan + } + + private def getElementConstituents(e: NlpBaseElement): Seq[Constituent] = { + val s = getSentence(e) + if (s == null) + return Seq() + val ta = getTextAnnotation(s) + val v = ta.getView(ViewNames.TOKENS) + val startId = ta.getTokenIdFromCharacterOffset(e.getStart) + val endId = ta.getTokenIdFromCharacterOffset(e.getEnd - 1) + v.getConstituentsCoveringSpan(startId, endId + 1).asScala + } + + private def getTextAnnotation(e: NlpBaseElement): TextAnnotation = { + val sentence = getSentence(e) + if (sentence == null) + return null + if (!sentenceById.contains(sentence.getId)) { + val ta = TextAnnotationFactory.createTextAnnotation(as, sentence.getDocument.getId, sentence.getId, sentence.getText) + sentenceById.put(sentence.getId, ta) + } + sentenceById(sentence.getId) + } + + private def getTextAnnotationSpan(e: NlpBaseElement): (Int, Int) = { + (getStartTokenId(e), getEndTokenId(e)) + } + + private def getStartTokenId(e: NlpBaseElement): Int = { + val ta = getTextAnnotation(e) + val start = e match { + case _: Document | _: Sentence => 0 + case _ => e.getStart + } + ta.getTokenIdFromCharacterOffset(start) + } + + private def getEndTokenId(e: 
NlpBaseElement): Int = { + val ta = getTextAnnotation(e) + val end = e match { + case _: Document | _: Sentence => ta.getView(ViewNames.TOKENS).getConstituents.asScala.last.getEndCharOffset + case _ => e.getEnd + } + ta.getTokenIdFromCharacterOffset(end - 1) + } +} \ No newline at end of file diff --git a/saul-examples/src/test/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensorTests.scala b/saul-examples/src/test/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensorTests.scala new file mode 100644 index 00000000..10bc975a --- /dev/null +++ b/saul-examples/src/test/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensorTests.scala @@ -0,0 +1,118 @@ +package edu.illinois.cs.cogcomp.saulexamples.nlp + +import edu.illinois.cs.cogcomp.saulexamples.nlp.BaseTypes.{ Document, Sentence } +import edu.illinois.cs.cogcomp.saulexamples.nlp.LanguageBaseTypeSensors._ +import org.scalatest.{ FlatSpec, Matchers } + +/** Created by Taher on 2017-01-11. + */ +class LanguageBaseTypeSensorTests extends FlatSpec with Matchers { + + "Extracted features for 'I received the book.'" should "be correct" in { + val text = "I received the book." 
+ val document = new Document("doc1", 0, text.length, text) + val sentence = new Sentence(document, "s1", 0, text.length, text) + val tokens = sentenceToTokenGenerating(sentence) + + tokens.length should be(5) + + tokens(0).getDocument.getId should be("doc1") + tokens(1).getDocument.getId should be("doc1") + tokens(2).getDocument.getId should be("doc1") + tokens(3).getDocument.getId should be("doc1") + tokens(4).getDocument.getId should be("doc1") + + tokens(0).getSentence.getId should be("s1") + tokens(1).getSentence.getId should be("s1") + tokens(2).getSentence.getId should be("s1") + tokens(3).getSentence.getId should be("s1") + tokens(4).getSentence.getId should be("s1") + + tokens(0).getText should be("I") + tokens(1).getText should be("received") + tokens(2).getText should be("the") + tokens(3).getText should be("book") + tokens(4).getText should be(".") + + getPos(tokens(0)).mkString should be("PRP") + getPos(tokens(1)).mkString should be("VBD") + getPos(tokens(2)).mkString should be("DT") + getPos(tokens(3)).mkString should be("NN") + getPos(tokens(4)).mkString should be(".") + + getLemma(tokens(0)).mkString should be("i") + getLemma(tokens(1)).mkString should be("receive") + getLemma(tokens(2)).mkString should be("the") + getLemma(tokens(3)).mkString should be("book") + getLemma(tokens(4)).mkString should be(".") + + getDependencyRelation(tokens(0)).mkString should be("nsubj") + getDependencyRelation(tokens(1)).mkString should be("root") + getDependencyRelation(tokens(2)).mkString should be("det") + getDependencyRelation(tokens(3)).mkString should be("dobj") + getDependencyRelation(tokens(4)).mkString should be("") + + getSemanticRole(tokens(0)).mkString should be("") + getSemanticRole(tokens(1)).mkString should be("") + getSemanticRole(tokens(2)).mkString should be("") + getSemanticRole(tokens(3)).mkString should be("") + getSemanticRole(tokens(4)).mkString should be("") + + getSubCategorization(tokens(0)).mkString should be("S>(NP)VP.") + 
getSubCategorization(tokens(1)).mkString should be("VP>(VBD)NP") + getSubCategorization(tokens(2)).mkString should be("NP>(DT)NN") + getSubCategorization(tokens(3)).mkString should be("NP>DT(NN)") + getSubCategorization(tokens(4)).mkString should be("S>NPVP(.)") + + } + + "Extracted features for 'I am going to eat lunch.'" should "be correct" in { + val text = "I am going to eat lunch." + val document = new Document("doc1", 0, text.length, text) + val sentence = new Sentence(document, "s2", 0, text.length, text) + val tokens = sentenceToTokenGenerating(sentence) + + tokens.length should be(7) + + tokens(0).getText should be("I") + tokens(1).getText should be("am") + tokens(2).getText should be("going") + tokens(3).getText should be("to") + tokens(4).getText should be("eat") + tokens(5).getText should be("lunch") + tokens(6).getText should be(".") + + getPos(tokens(0)).mkString should be("PRP") + getPos(tokens(1)).mkString should be("VBP") + getPos(tokens(2)).mkString should be("VBG") + getPos(tokens(3)).mkString should be("TO") + getPos(tokens(4)).mkString should be("VB") + getPos(tokens(5)).mkString should be("NN") + getPos(tokens(6)).mkString should be(".") + + getLemma(tokens(0)).mkString should be("i") + getLemma(tokens(1)).mkString should be("be") + getLemma(tokens(2)).mkString should be("go") + getLemma(tokens(3)).mkString should be("to") + getLemma(tokens(4)).mkString should be("eat") + getLemma(tokens(5)).mkString should be("lunch") + getLemma(tokens(6)).mkString should be(".") + + getDependencyRelation(tokens(0)).mkString should be("nsubj") + getDependencyRelation(tokens(1)).mkString should be("aux") + getDependencyRelation(tokens(2)).mkString should be("root") + getDependencyRelation(tokens(3)).mkString should be("aux") + getDependencyRelation(tokens(4)).mkString should be("xcomp") + getDependencyRelation(tokens(5)).mkString should be("dobj") + getDependencyRelation(tokens(6)).mkString should be("") + + getSubCategorization(tokens(0)).mkString should 
be("S>(NP)VP.") + getSubCategorization(tokens(1)).mkString should be("VP>(VBP)VP") + getSubCategorization(tokens(2)).mkString should be("VP>(VBG)S") + getSubCategorization(tokens(3)).mkString should be("VP>(TO)VP") + getSubCategorization(tokens(4)).mkString should be("VP>(VB)NP") + getSubCategorization(tokens(5)).mkString should be("VP>VB(NP)") + getSubCategorization(tokens(6)).mkString should be("S>NPVP(.)") + + } +} From dd747b302493a6d422347dbf42829e6dd344d514 Mon Sep 17 00:00:00 2001 From: Taher Rahgooy Date: Tue, 18 Jul 2017 10:04:54 +0430 Subject: [PATCH 4/5] refactoring --- .../cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java | 4 ++-- .../saulexamples/nlp/BaseTypes/ISpanElementMatching.java | 2 +- .../cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java | 4 ++-- .../cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java | 4 ++-- .../cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java | 4 ++-- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java index f463b1f2..3fa72b94 100644 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ExactMatching.java @@ -12,7 +12,7 @@ public class ExactMatching implements ISpanElementMatching { @Override - public boolean matches(ISpanElement xmlElement, ISpanElement element) { - return xmlElement.matches(element); + public boolean matches(ISpanElement e1, ISpanElement e2) { + return e1.matches(e2); } } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java index 4eb16d6a..a934ffa3 100644 --- 
a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/ISpanElementMatching.java @@ -12,5 +12,5 @@ * Created by Taher on 2016-12-28. */ public interface ISpanElementMatching { - boolean matches(ISpanElement xmlElement, ISpanElement element); + boolean matches(ISpanElement e1, ISpanElement e2); } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java index 00eee8e2..ca788f7e 100644 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/InclusionMatching.java @@ -12,7 +12,7 @@ public class InclusionMatching implements ISpanElementMatching { @Override - public boolean matches(ISpanElement xmlElement, ISpanElement element) { - return xmlElement.contains(element); + public boolean matches(ISpanElement e1, ISpanElement e2) { + return e1.contains(e2); } } diff --git a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java index e2bdf807..8b8a242e 100644 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/OverlapMatching.java @@ -11,7 +11,7 @@ */ public class OverlapMatching implements ISpanElementMatching{ @Override - public boolean matches(ISpanElement xmlElement, ISpanElement element) { - return xmlElement.overlaps(element); + public boolean matches(ISpanElement e1, ISpanElement e2) { + return e1.overlaps(e2); } } diff --git 
a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java index 203bc4c4..9f529d0c 100644 --- a/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java +++ b/saul-examples/src/main/java/edu/illinois/cs/cogcomp/saulexamples/nlp/BaseTypes/PartOfMatching.java @@ -12,7 +12,7 @@ public class PartOfMatching implements ISpanElementMatching { @Override - public boolean matches(ISpanElement xmlElement, ISpanElement element) { - return xmlElement.isPartOf(element); + public boolean matches(ISpanElement e1, ISpanElement e2) { + return e1.isPartOf(e2); } } From 1288317fea05e85fd2a1f48b98e1b36650ef074e Mon Sep 17 00:00:00 2001 From: Taher Rahgooy Date: Tue, 18 Jul 2017 10:08:13 +0430 Subject: [PATCH 5/5] format --- .../nlp/LanguageBaseTypeSensors.scala | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala index 43ee51c1..a6883985 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/LanguageBaseTypeSensors.scala @@ -8,10 +8,10 @@ package edu.illinois.cs.cogcomp.saulexamples.nlp import java.util.Properties -import edu.illinois.cs.cogcomp.core.datastructures.{ViewNames, _} -import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{Relation => _, Sentence => _, _} +import edu.illinois.cs.cogcomp.core.datastructures.{ ViewNames, _ } +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Relation => _, Sentence => _, _ } import edu.illinois.cs.cogcomp.edison.features.FeatureUtilities -import 
edu.illinois.cs.cogcomp.edison.features.factory.{SubcategorizationFrame, WordFeatureExtractorFactory} +import edu.illinois.cs.cogcomp.edison.features.factory.{ SubcategorizationFrame, WordFeatureExtractorFactory } import edu.illinois.cs.cogcomp.edison.features.helpers.PathFeatureHelper import edu.illinois.cs.cogcomp.nlp.common.PipelineConfigurator._ import edu.illinois.cs.cogcomp.nlp.utilities.CollinsHeadFinder @@ -116,7 +116,7 @@ object LanguageBaseTypeSensors extends Logging { def getTokens(text: String): List[Token] = { val ta = TextAnnotationFactory.createTextAnnotation(as, "", "", text) - ta.getView(ViewNames.TOKENS).getConstituents.asScala.map(x=> + ta.getView(ViewNames.TOKENS).getConstituents.asScala.map(x => new Token(null.asInstanceOf[Sentence], null, x.getStartCharOffset, x.getEndCharOffset, x.toString)).toList } @@ -158,13 +158,13 @@ object LanguageBaseTypeSensors extends Logging { crossProduct(argumentInstances.seq.toList) // don't consider elements that are from different parents(sentences) .filter(args => args.filter(_ != null).groupBy { - case x: Token => x.getSentence.getId - case x: Phrase => x.getSentence.getId - case x: Sentence => x.getDocument.getId - case _ => null - }.size <= 1 && args.filter(_ != null) - .groupBy(_.getId).size == args.count(_ != null) // distinct arguments - ) + case x: Token => x.getSentence.getId + case x: Phrase => x.getSentence.getId + case x: Sentence => x.getDocument.getId + case _ => null + }.size <= 1 && args.filter(_ != null) + .groupBy(_.getId).size == args.count(_ != null) // distinct arguments + ) .map(args => { val r = new Relation() args.zipWithIndex.filter(x => x._1 != null).foreach { @@ -246,7 +246,6 @@ object LanguageBaseTypeSensors extends Logging { path.toString.toUpperCase } - //////////////////////////////////////////////////////////////////////////// /// private methods ////////////////////////////////////////////////////////////////////////////