CogComp · bhargav · Oct 6, 2016 · Oct 7, 2016 · Feb 16, 2017 · Mar 1, 2017
diff --git a/saul-examples/README.md b/saul-examples/README.md
@@ -31,12 +31,15 @@ In Spatial Role Labeling, we try to find spatial relations and label spatial rol
 6. [Part-of-Speech Tagging](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/POSTagger/README.md): 
 Part-of-Speech Tagging is the identification of words as nouns, verbs, adjectives, adverbs, etc.
 
-7. [Twitter Sentiment Analysis] (src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/TwitterSentimentAnalysis/README.md):
+7. [Twitter Sentiment Analysis](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/TwitterSentimentAnalysis/README.md):
 This example trains models for classifying twitter posts as positive, negative or neutral. It also includes a twitter client for real time processing of the tweets.
 
-8. [Question Type Classification] (src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/QuestionTypeClassification/README.md):
+8. [Question Type Classification](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/QuestionTypeClassification/README.md):
 This example contains a classifications to categorize questions into different semantic classes based on the possible semantic types of the answers. 
 
+9. [Chunker](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md):
+This example contains an implementation of a Shallow Parsing system.
+
 
 * Note: Examples are under active development. 
 

diff --git a/...es/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala b/...es/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala
@@ -0,0 +1,68 @@
+/** This software is released under the University of Illinois/Research and Academic Use License. See
+  * the LICENSE file in the root folder for details. Copyright (c) 2016
+  *
+  * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign
+  * http://cogcomp.cs.illinois.edu/
+  */
+package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker
+
+import edu.illinois.cs.cogcomp.annotation.Annotator
+import edu.illinois.cs.cogcomp.core.datastructures.ViewNames
+import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{Constituent, TextAnnotation, TokenLabelView}
+import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager
+import edu.illinois.cs.cogcomp.saulexamples.nlp.POSTagger.{POSAnnotator, POSTaggerApp}
+
+import scala.collection.JavaConversions._
+
+/** Annotator that adds the chunker (shallow parse) BIO and span views to a TextAnnotation.
+  *
+  * @param useHeuristics Whether to use heuristics to fix the BIO annotation when building the span view.
+  */
+class ChunkerAnnotator(val useHeuristics: Boolean = true)
+  extends Annotator(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW, Array(ViewNames.TOKENS)) {
+
+  override def initialize(rm: ResourceManager): Unit = {} // nothing to initialize
+
+  /** Adds the chunker BIO view and span view to a TextAnnotation, adding a POS view first if missing.
+    * Note: Assumes that the classifiers are populated with required models
+    * @param ta TextAnnotation instance
+    */
+  override def addView(ta: TextAnnotation): Unit = {
+    if (!ta.hasView(ViewNames.POS)) {
+      ChunkerAnnotator.localPOSAnnotator.addView(ta)
+    }
+
+    val tokens = ta.getView(ViewNames.TOKENS).getConstituents
+
+    ChunkerDataModel.sentence.clear() // reset the sentence node before populating it with this TA
+    val sentences = (0 until ta.getNumberOfSentences).map(ta.getSentence)
+    ChunkerDataModel.sentence.populate(sentences, train = false)
+
+    val chunkerBIOView = new TokenLabelView(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_BIO_VIEW, ta)
+
+    tokens.foreach({ cons: Constituent =>
+      val label = ChunkerClassifiers.ChunkerClassifier(cons)
+      val posCons = cons.cloneForNewViewWithDestinationLabel(chunkerBIOView.getViewName, label)
+      chunkerBIOView.addConstituent(posCons)
+    })
+
+    ta.addView(chunkerBIOView.getViewName, chunkerBIOView)
+
+    if (useHeuristics) {
+      ChunkerUtilities.addSpanLabelViewUsingHeuristics(
+        ta,
+        chunkerBIOView.getViewName,
+        ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW)
+    } else { // NOTE(review): the "Gold" helper is applied to the predicted BIO view here — confirm intended
+      ChunkerUtilities.addGoldSpanLabelView(
+        ta,
+        chunkerBIOView.getViewName,
+        ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW)
+    }
+  }
+}
+
+object ChunkerAnnotator {
+  /** Lazily created pretrained POS annotator, used only when the input has no POS view. */
+  private lazy val localPOSAnnotator: POSAnnotator = POSTaggerApp.getPretrainedAnnotator()
+}
diff --git a/...examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala b/...examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala
@@ -0,0 +1,143 @@
+/** This software is released under the University of Illinois/Research and Academic Use License. See
+  * the LICENSE file in the root folder for details. Copyright (c) 2016
+  *
+  * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign
+  * http://cogcomp.cs.illinois.edu/
+  */
+package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker
+
+import java.util.Properties
+
+import edu.illinois.cs.cogcomp.core.datastructures.ViewNames
+import edu.illinois.cs.cogcomp.core.datastructures.textannotation._
+import edu.illinois.cs.cogcomp.core.experiments.ClassificationTester
+import edu.illinois.cs.cogcomp.core.experiments.evaluators.ConstituentLabelingEvaluator
+import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager
+import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer
+import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder
+import edu.illinois.cs.cogcomp.saul.classifier.ClassifierUtils
+import edu.illinois.cs.cogcomp.saul.util.Logging
+
+import scala.collection.JavaConversions._
+import scala.io.StdIn
+
+
+object ChunkerApp extends Logging {
+  import ChunkerConstants._
+
+  val trainFile = "../data/conll2000chunking/train.txt"
+  val testFile = "../data/conll2000chunking/test.txt"
+
+  val jarModelPath = "" // in this file, only referenced by the commented-out jar loading code
+
+  object ChunkerExperimentType extends Enumeration {
+    val TrainAndTest, TestFromModel, Interactive = Value
+
+    def withNameOpt(s: String): Option[Value] = values.find(_.toString == s) // None instead of throwing
+  }
+
+  def main(args: Array[String]): Unit = {
+    /** Parse the experiment type from the first argument; default to Interactive if absent or unknown */
+    val testType = args.headOption
+      .flatMap(ChunkerExperimentType.withNameOpt)
+      .getOrElse(ChunkerExperimentType.Interactive)
+
+    testType match {
+      case ChunkerExperimentType.TrainAndTest => trainAndTest()
+      case ChunkerExperimentType.TestFromModel => testWithPretrainedModels()
+      case ChunkerExperimentType.Interactive => interactiveWithPretrainedModels()
+    }
+  }
+
+  private def loadModelFromJarPath(): Unit = {
+    // Jar-based loading (below) is commented out; the classifier's own load() is used instead.
+//    ClassifierUtils.LoadClassifier(
+//      jarModelPath,
+//      ChunkerClassifiers.ChunkerClassifier)
+    ChunkerClassifiers.ChunkerClassifier.load()
+  }
+
+  private def getSentencesInTextAnnotation(taSeq: Seq[TextAnnotation]) = {
+    taSeq.flatMap({ textAnnotation: TextAnnotation =>
+      (0 until textAnnotation.getNumberOfSentences).map(textAnnotation.getSentence)
+    })
+  }
+
+  lazy val trainData = ChunkerDataReader.parseData(trainFile)
+  lazy val testData = ChunkerDataReader.parseData(testFile)
+
+  lazy val preTrainedAnnotator: ChunkerAnnotator = {
+    loadModelFromJarPath()
+
+    val annotatorInstance = new ChunkerAnnotator()
+    annotatorInstance.initialize(new ResourceManager(new Properties()))
+    annotatorInstance
+  }
+
+  /** Note: This function does NOT populate testing instances.
+    * Also does not use GOLD POS tags. Instead a trained POSAnnotator is used. */
+  private def testModelImpl(): Unit = {
+    ClassifierUtils.TestClassifiers(ChunkerClassifiers.ChunkerClassifier)
+
+    val evaluator = new ConstituentLabelingEvaluator()
+    val tester = new ClassificationTester()
+
+    testData.foreach({ textAnnotation: TextAnnotation =>
+      // Remove POS View before evaluation.
+      textAnnotation.removeView(ViewNames.POS)
+
+      preTrainedAnnotator.addView(textAnnotation)
+
+      val goldView = textAnnotation.getView(SHALLOW_PARSE_GOLD_SPAN_VIEW)
+      val annotatedView = textAnnotation.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW)
+
+      // Workaround for incorrect ConstituentLabelingEvaluator behaviour.
+      val predictedView = new SpanLabelView(SHALLOW_PARSE_GOLD_SPAN_VIEW, textAnnotation)
+      annotatedView.getConstituents.foreach({ cons: Constituent =>
+        predictedView.addConstituent(cons.cloneForNewView(SHALLOW_PARSE_GOLD_SPAN_VIEW))
+      })
+
+      evaluator.evaluate(tester, goldView, predictedView)
+    })
+
+    println(tester.getPerformanceTable.toOrgTable)
+  }
+
+  def trainAndTest(): Unit = {
+    ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(trainData), train = true)
+    ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(testData), train = false)
+
+    ChunkerClassifiers.ChunkerClassifier.learn(50)
+    ClassifierUtils.SaveClassifiers(ChunkerClassifiers.ChunkerClassifier)
+
+    testModelImpl()
+  }
+
+  def testWithPretrainedModels(): Unit = {
+    loadModelFromJarPath()
+
+    ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(testData), train = false)
+
+    testModelImpl()
+  }
+
+  def interactiveWithPretrainedModels(): Unit = {
+    val taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer())
+
+    while (true) {
+      println("Enter a sentence to annotate (or Press Enter to exit)")
+      val input = StdIn.readLine()
+
+      input match {
+        case sentence: String if sentence.trim.nonEmpty =>
+          // Create a Text Annotation with the current input sentence.
+          val ta = taBuilder.createTextAnnotation(sentence.trim)
+          preTrainedAnnotator.addView(ta)
+          println("POS View            : " + ta.getView(ViewNames.POS).toString)
+          println("Annotated BIO View  : " + ta.getView(SHALLOW_PARSE_ANNOTATED_BIO_VIEW))
+          println("Annotated Span View : " + ta.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW))
+        case _ => return // blank input (or null at end of input) ends the loop
+      }
+    }
+  }
+}
diff --git a/.../src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala b/.../src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala
@@ -0,0 +1,35 @@
+/** This software is released under the University of Illinois/Research and Academic Use License. See
+  * the LICENSE file in the root folder for details. Copyright (c) 2016
+  *
+  * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign
+  * http://cogcomp.cs.illinois.edu/
+  */
+package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker
+
+import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent
+import edu.illinois.cs.cogcomp.lbjava.learn.{ SparseAveragedPerceptron, SparseNetworkLearner }
+import edu.illinois.cs.cogcomp.saul.classifier.Learnable
+
+object ChunkerClassifiers {
+  import ChunkerDataModel._
+
+  object ChunkerClassifier extends Learnable[Constituent](tokens) {
+
+    override lazy val classifier = {
+      // Parameters of the base averaged perceptron wrapped by the network learner
+      val params = new SparseAveragedPerceptron.Parameters()
+      params.learningRate = 0.1
+      params.thickness = 0.2
+      val baseLTU = new SparseAveragedPerceptron(params)
+
+      new SparseNetworkLearner(baseLTU)
+    }
+
+    /** Label property for the chunker classifier */
+    override def label = chunkLabel
+
+    override def feature = using(
+      wordTypeInformation, affixes, posWindow, capitalizationWindowProperty, previousTags,
+      forms, formpp, mixed, SOPrevious)
+  }
+}
diff --git a/...es/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerConstants.scala b/...es/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerConstants.scala
@@ -0,0 +1,16 @@
+/** This software is released under the University of Illinois/Research and Academic Use License. See
+  * the LICENSE file in the root folder for details. Copyright (c) 2016
+  *
+  * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign
+  * http://cogcomp.cs.illinois.edu/
+  */
+package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker
+
+/** View-name constants used by the Chunker experiment (gold views and annotator-produced views) */
+object ChunkerConstants {
+  val SHALLOW_PARSE_GOLD_SPAN_VIEW = "SHALLOW_PARSE_GOLD"
+  val SHALLOW_PARSE_GOLD_BIO_VIEW = "SHALLOW_PARSE_GOLD_BIO"
+
+  val SHALLOW_PARSE_ANNOTATED_SPAN_VIEW = "SHALLOW_PARSE_ANNOTATED"
+  val SHALLOW_PARSE_ANNOTATED_BIO_VIEW = "SHALLOW_PARSE_ANNOTATED_BIO"
+}