From 760868a3b7ee79676a49ff463328b3ef58f0114d Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Thu, 6 Oct 2016 16:34:06 -0500 Subject: [PATCH 01/11] Add Chunker basic structure and some edison based features. --- .../saulexamples/nlp/Chunker/ChunkerApp.scala | 120 ++++++++++++++++++ .../nlp/Chunker/ChunkerClassifiers.scala | 33 +++++ .../nlp/Chunker/ChunkerDataModel.scala | 49 +++++++ .../nlp/Chunker/ChunkerSensors.scala | 24 ++++ 4 files changed, 226 insertions(+) create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala new file mode 100644 index 00000000..2cc0b9a5 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala @@ -0,0 +1,120 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames +import edu.illinois.cs.cogcomp.core.datastructures.textannotation._ +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder +import edu.illinois.cs.cogcomp.saulexamples.nlp.POSTagger.POSTaggerApp + +import scala.collection.JavaConversions._ +import scala.collection.mutable +import scala.io.{ Source, StdIn } + +object ChunkerApp extends App { + val trainFile = "../data/conll2000chunking/train.txt" + val testFile = "../data/conll2000chunking/test.txt" + + def parseData(fileName: String): Seq[TextAnnotation] = { + val arrayBuffer = mutable.Buffer[TextAnnotation]() + + val tokenConstituents = mutable.ArrayBuffer[String]() + val posLabels = mutable.ArrayBuffer[String]() + val chunkLabels = mutable.ArrayBuffer[String]() + var numSentences = 0 + + Source.fromFile(fileName) + .getLines() + .foreach({ line: String => + if (line.isEmpty) { + val sentenceList = List(tokenConstituents.toArray[String]) + val textAnnotation = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(sentenceList) + + val posView = new TokenLabelView(ViewNames.POS, textAnnotation) + val chunkLabelView = new SpanLabelView(ViewNames.SHALLOW_PARSE, textAnnotation) + + textAnnotation.getView(ViewNames.TOKENS) + .getConstituents + .zipWithIndex + .foreach({ + case (constituent: Constituent, idx: Int) => + val posCons = constituent.cloneForNewViewWithDestinationLabel(ViewNames.POS, posLabels(idx)) + posView.addConstituent(posCons) + + val chunkCons = constituent.cloneForNewViewWithDestinationLabel(ViewNames.SHALLOW_PARSE, chunkLabels(idx)) + 
chunkLabelView.addConstituent(chunkCons) + }) + + textAnnotation.addView(ViewNames.POS, posView) + textAnnotation.addView(ViewNames.SHALLOW_PARSE, chunkLabelView) + + arrayBuffer.append(textAnnotation) + tokenConstituents.clear() + posLabels.clear() + chunkLabels.clear() + + numSentences += 1 + } else { + val reader = line.split(" ") + tokenConstituents.append(reader(0)) + posLabels.append(reader(1)) + chunkLabels.append(reader(2)) + } + }) + + println("Number of sentences = " + numSentences) + + arrayBuffer + } + + lazy val trainData = parseData(trainFile) + lazy val testData = parseData(testFile) + + val jarModelPath = "" + + trainData.foreach({ textAnnotation: TextAnnotation => + val numberOfSentences = textAnnotation.getNumberOfSentences + val sentences = (0 until numberOfSentences).map(textAnnotation.getSentence) + ChunkerDataModel.sentence.populate(sentences, train = true) + }) + + testData.foreach({ textAnnotation: TextAnnotation => + val numberOfSentences = textAnnotation.getNumberOfSentences + val sentences = (0 until numberOfSentences).map(textAnnotation.getSentence) + ChunkerDataModel.sentence.populate(sentences, train = false) + }) + + ChunkerClassifiers.ChunkerClassifier.learn(10) + println(ChunkerClassifiers.ChunkerClassifier.test()) + + /** Interactive model to annotate input sentences with Pre-trained models + */ + def interactiveWithPretrainedModels(): Unit = { + val posAnnotator = POSTaggerApp.getPretrainedAnnotator(ViewNames.POS) + val taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer()) + + while (true) { + println("Enter a sentence to annotate (or Press Enter to exit)") + val input = StdIn.readLine() + + input match { + case sentence: String if sentence.trim.nonEmpty => + // Create a Text Annotation with the current input sentence. 
+ val ta = taBuilder.createTextAnnotation(sentence.trim) + posAnnotator.addView(ta) + + val tokens = ta.getView(ViewNames.TOKENS).getConstituents + ChunkerDataModel.tokens.populate(tokens) + + println("Tokens: " + ta.getView(ViewNames.TOKENS)) + case _ => return + } + } + } +} diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala new file mode 100644 index 00000000..bdbf460c --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala @@ -0,0 +1,33 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.Constituent +import edu.illinois.cs.cogcomp.lbjava.learn.{ SparseAveragedPerceptron, SparseNetworkLearner } +import edu.illinois.cs.cogcomp.saul.classifier.Learnable + +object ChunkerClassifiers { + import ChunkerDataModel._ + + object ChunkerClassifier extends Learnable[Constituent](tokens) { + + override lazy val classifier = { + // Parameters + val params = new SparseAveragedPerceptron.Parameters() + params.learningRate = 0.1 + params.thickness = 0.2 + val baseLTU = new SparseAveragedPerceptron(params) + + new SparseNetworkLearner(baseLTU) + } + + /** Label property for users classifier */ + override def label = chunkLabel + + override def feature = using(wordTypeInformation, affixes, posWindow) + } +} diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala 
b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala new file mode 100644 index 00000000..408b213a --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -0,0 +1,49 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, Sentence } +import edu.illinois.cs.cogcomp.edison.features.lrec.{ Affixes, POSWindow, WordTypeInformation } +import edu.illinois.cs.cogcomp.saul.datamodel.DataModel + +import scala.collection.JavaConversions._ + +object ChunkerDataModel extends DataModel { + val sentence = node[Sentence] + val tokens = node[Constituent] + + val sentenceToTokens = edge(sentence, tokens) + sentenceToTokens.addSensor(ChunkerSensors.getTokensInSentence _) + + // Label + val chunkLabel = property(tokens, "ChunkLabel") { token: Constituent => token.getLabel } + + // Affixes feature + private val affixFeatureExtractor = new Affixes(ViewNames.TOKENS) + val affixes = property(tokens, "Affixes") { token: Constituent => + affixFeatureExtractor.getFeatures(token) + .map(_.getName) + .toList + } + + // WordTypeInformation feature + private val wordTypeInformationExtractor = new WordTypeInformation(ViewNames.TOKENS) + val wordTypeInformation = property(tokens, "WordTypeInformation") { token: Constituent => + wordTypeInformationExtractor.getFeatures(token) + .map(_.getName) + .toList + } + + // POS Window features + private val posWindowExtractor = new POSWindow(ViewNames.POS) + val posWindow = 
property(tokens, "POSWindow") { token: Constituent => + posWindowExtractor.getFeatures(token) + .map(_.getName) + .toList + } +} diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala new file mode 100644 index 00000000..85ade7a7 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala @@ -0,0 +1,24 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, Sentence, TextAnnotation } + +import scala.collection.JavaConversions._ + +object ChunkerSensors { + + def getSentencesInDocument(document: TextAnnotation): Seq[Sentence] = { + val numberOfSentences = document.getNumberOfSentences + (0 until numberOfSentences).map(document.getSentence) + } + + def getTokensInSentence(sentence: Sentence): Seq[Constituent] = { + sentence.getView(ViewNames.SHALLOW_PARSE).getConstituents + } +} From 4e6b4927e5dbf747c99fb3412f63b7b757d54757 Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Fri, 7 Oct 2016 04:47:00 -0500 Subject: [PATCH 02/11] Chunker Context feature + Training. 
--- .../saulexamples/nlp/Chunker/ChunkerApp.scala | 32 ++----------------- .../nlp/Chunker/ChunkerClassifiers.scala | 2 +- .../nlp/Chunker/ChunkerDataModel.scala | 26 +++++++++++++++ 3 files changed, 30 insertions(+), 30 deletions(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala index 2cc0b9a5..ca833d26 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala @@ -9,13 +9,11 @@ package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder import edu.illinois.cs.cogcomp.core.datastructures.ViewNames import edu.illinois.cs.cogcomp.core.datastructures.textannotation._ -import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer -import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder -import edu.illinois.cs.cogcomp.saulexamples.nlp.POSTagger.POSTaggerApp +import edu.illinois.cs.cogcomp.saul.classifier.ClassifierUtils import scala.collection.JavaConversions._ import scala.collection.mutable -import scala.io.{ Source, StdIn } +import scala.io.Source object ChunkerApp extends App { val trainFile = "../data/conll2000chunking/train.txt" @@ -92,29 +90,5 @@ object ChunkerApp extends App { ChunkerClassifiers.ChunkerClassifier.learn(10) println(ChunkerClassifiers.ChunkerClassifier.test()) - - /** Interactive model to annotate input sentences with Pre-trained models - */ - def interactiveWithPretrainedModels(): Unit = { - val posAnnotator = POSTaggerApp.getPretrainedAnnotator(ViewNames.POS) - val taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer()) - - while (true) { - println("Enter a sentence to annotate (or Press Enter to exit)") - val input = StdIn.readLine() - - 
input match { - case sentence: String if sentence.trim.nonEmpty => - // Create a Text Annotation with the current input sentence. - val ta = taBuilder.createTextAnnotation(sentence.trim) - posAnnotator.addView(ta) - - val tokens = ta.getView(ViewNames.TOKENS).getConstituents - ChunkerDataModel.tokens.populate(tokens) - - println("Tokens: " + ta.getView(ViewNames.TOKENS)) - case _ => return - } - } - } + ClassifierUtils.SaveClassifiers(ChunkerClassifiers.ChunkerClassifier) } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala index bdbf460c..17d5f274 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala @@ -28,6 +28,6 @@ object ChunkerClassifiers { /** Label property for users classifier */ override def label = chunkLabel - override def feature = using(wordTypeInformation, affixes, posWindow) + override def feature = using(wordTypeInformation, affixes, posWindow, capitalizationWindowProperty, previousTags) } } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala index 408b213a..a7d40e38 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -8,6 +8,8 @@ package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker import edu.illinois.cs.cogcomp.core.datastructures.ViewNames import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, Sentence } +import 
edu.illinois.cs.cogcomp.edison.features.ContextFeatureExtractor +import edu.illinois.cs.cogcomp.edison.features.factory.WordFeatureExtractorFactory import edu.illinois.cs.cogcomp.edison.features.lrec.{ Affixes, POSWindow, WordTypeInformation } import edu.illinois.cs.cogcomp.saul.datamodel.DataModel @@ -46,4 +48,28 @@ object ChunkerDataModel extends DataModel { .map(_.getName) .toList } + + // Capitalization features + private val capitalizationExtractor = new ContextFeatureExtractor(2, true, true, + WordFeatureExtractorFactory.capitalization) + val capitalizationWindowProperty = property(tokens, "Capitalization") { token: Constituent => + capitalizationExtractor.getFeatures(token) + .map(_.getName) + .toList + } + + // Filter to restrict window to current sentence's tokens only. + val previousTagsFilter = Seq({ token: Constituent => tokens(token) ~> -sentenceToTokens }) + val previousTags = property(tokens, "PreviousTags", cache = true) { token: Constituent => + tokens.getWithWindow(token, -2, -1, previousTagsFilter) + .flatten + .map({ previousCons: Constituent => + // Use Label while training and prediction while testing. + if (ChunkerClassifiers.ChunkerClassifier.isTraining) { + chunkLabel(previousCons) + } else { + ChunkerClassifiers.ChunkerClassifier(previousCons) + } + }) + } } From b48e21e60d8f9b8a2d7b085b73660a4ff0fc9e36 Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 15 Feb 2017 19:51:53 -0600 Subject: [PATCH 03/11] Update some features. 
--- .../nlp/Chunker/ChunkerDataModel.scala | 37 ++++++++++++++++++- 1 file changed, 35 insertions(+), 2 deletions(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala index a7d40e38..fe230581 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -59,9 +59,11 @@ object ChunkerDataModel extends DataModel { } // Filter to restrict window to current sentence's tokens only. - val previousTagsFilter = Seq({ token: Constituent => tokens(token) ~> -sentenceToTokens }) + val sameSentenceTokensFilter = Seq({ token: Constituent => tokens(token) ~> -sentenceToTokens }) + + // Get Previous Chunk labels val previousTags = property(tokens, "PreviousTags", cache = true) { token: Constituent => - tokens.getWithWindow(token, -2, -1, previousTagsFilter) + tokens.getWithWindow(token, -2, -1, sameSentenceTokensFilter) .flatten .map({ previousCons: Constituent => // Use Label while training and prediction while testing. 
@@ -72,4 +74,35 @@ object ChunkerDataModel extends DataModel { } }) } + + // Get surface forms in context window + val forms = property(tokens, "Forms") { token: Constituent => + tokens.getWithWindow(token, -2, +2, sameSentenceTokensFilter) + .flatten + .map(_.getSurfaceForm) + } + + // Formpp Feature + val formpp = property(tokens, "Formpp") { token: Constituent => + val window = 2 + val surfaceForms: List[String] = forms(token) + + // Feature range + val range = for { + j <- 0 until window + i <- surfaceForms.indices + } yield (j, i) + + range.map({ case (j: Int, i: Int) => + val contextStrings = for { + context <- 0 until window + if i + context < surfaceForms.length + } yield s"${i}_${j}:${surfaceForms(i + context)}" + + contextStrings.mkString("_") + }) + .toList + } + + } From 0b6cf41bef76be5641f1d792ac315284f1f1353f Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 05:24:22 -0600 Subject: [PATCH 04/11] Add evaluation using ConstituentLabelingEvaluator. --- .../nlp/Chunker/ChunkerAnnotator.scala | 53 +++++++++++++++ .../saulexamples/nlp/Chunker/ChunkerApp.scala | 45 +++++++++++-- .../nlp/Chunker/ChunkerSensors.scala | 10 +-- .../nlp/Chunker/ChunkerUtilities.scala | 64 +++++++++++++++++++ 4 files changed, 160 insertions(+), 12 deletions(-) create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala new file mode 100644 index 00000000..eea96de0 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala @@ -0,0 +1,53 @@ +/** This software is released under the University of Illinois/Research 
and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.annotation.Annotator +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{Constituent, TextAnnotation, TokenLabelView} +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager +import edu.illinois.cs.cogcomp.saulexamples.nlp.POSTagger.{POSAnnotator, POSTaggerApp} + +import scala.collection.JavaConversions._ + +class ChunkerAnnotator extends Annotator(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW, Array(ViewNames.TOKENS)) { + + override def initialize(rm: ResourceManager): Unit = {} + + /** Adds the POS view to a TextAnnotation + * Note: Assumes that the classifiers are populated with required models + * @param ta TextAnnotation instance + */ + override def addView(ta: TextAnnotation): Unit = { + if (!ta.hasView(ViewNames.POS)) { + ChunkerAnnotator.localPOSAnnotator.addView(ta) + } + + val tokens = ta.getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW).getConstituents + + ChunkerDataModel.sentence.clear() + val sentences = (0 until ta.getNumberOfSentences).map(ta.getSentence) + ChunkerDataModel.sentence.populate(sentences, train = false) + + val chunkerBIOView = new TokenLabelView(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_BIO_VIEW, ta) + + tokens.foreach({ cons: Constituent => + val label = ChunkerClassifiers.ChunkerClassifier(cons) + val posCons = cons.cloneForNewViewWithDestinationLabel(chunkerBIOView.getViewName, label) + chunkerBIOView.addConstituent(posCons) + }) + + ta.addView(chunkerBIOView.getViewName, chunkerBIOView) + + ChunkerUtilities.addGoldSpanLabelView(ta, chunkerBIOView.getViewName, 
ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) + } +} + +object ChunkerAnnotator { + /** Instance of a local POS Annotator if required */ + private lazy val localPOSAnnotator: POSAnnotator = POSTaggerApp.getPretrainedAnnotator() +} \ No newline at end of file diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala index ca833d26..d71bd07a 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala @@ -9,13 +9,25 @@ package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder import edu.illinois.cs.cogcomp.core.datastructures.ViewNames import edu.illinois.cs.cogcomp.core.datastructures.textannotation._ +import edu.illinois.cs.cogcomp.core.experiments.ClassificationTester +import edu.illinois.cs.cogcomp.core.experiments.evaluators.ConstituentLabelingEvaluator import edu.illinois.cs.cogcomp.saul.classifier.ClassifierUtils import scala.collection.JavaConversions._ import scala.collection.mutable import scala.io.Source +object ChunkerConstants { + val SHALLOW_PARSE_GOLD_SPAN_VIEW = "SHALLOW_PARSE_GOLD" + val SHALLOW_PARSE_GOLD_BIO_VIEW = "SHALLOW_PARSE_GOLD_BIO" + + val SHALLOW_PARSE_ANNOTATED_SPAN_VIEW = "SHALLOW_PARSE_ANNOTATED" + val SHALLOW_PARSE_ANNOTATED_BIO_VIEW = "SHALLOW_PARSE_ANNOTATED_BIO" +} + object ChunkerApp extends App { + import ChunkerConstants._ + val trainFile = "../data/conll2000chunking/train.txt" val testFile = "../data/conll2000chunking/test.txt" @@ -35,7 +47,7 @@ object ChunkerApp extends App { val textAnnotation = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(sentenceList) val posView = new TokenLabelView(ViewNames.POS, textAnnotation) - val chunkLabelView = new 
SpanLabelView(ViewNames.SHALLOW_PARSE, textAnnotation) + val chunkLabelView = new SpanLabelView(SHALLOW_PARSE_GOLD_BIO_VIEW, textAnnotation) textAnnotation.getView(ViewNames.TOKENS) .getConstituents @@ -45,12 +57,14 @@ object ChunkerApp extends App { val posCons = constituent.cloneForNewViewWithDestinationLabel(ViewNames.POS, posLabels(idx)) posView.addConstituent(posCons) - val chunkCons = constituent.cloneForNewViewWithDestinationLabel(ViewNames.SHALLOW_PARSE, chunkLabels(idx)) + val chunkCons = constituent.cloneForNewViewWithDestinationLabel(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabels(idx)) chunkLabelView.addConstituent(chunkCons) }) textAnnotation.addView(ViewNames.POS, posView) - textAnnotation.addView(ViewNames.SHALLOW_PARSE, chunkLabelView) + textAnnotation.addView(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabelView) + + ChunkerUtilities.addGoldSpanLabelView(textAnnotation, SHALLOW_PARSE_GOLD_BIO_VIEW, SHALLOW_PARSE_GOLD_SPAN_VIEW) arrayBuffer.append(textAnnotation) tokenConstituents.clear() @@ -90,5 +104,28 @@ object ChunkerApp extends App { ChunkerClassifiers.ChunkerClassifier.learn(10) println(ChunkerClassifiers.ChunkerClassifier.test()) - ClassifierUtils.SaveClassifiers(ChunkerClassifiers.ChunkerClassifier) + + val evaluator = new ConstituentLabelingEvaluator() + val tester = new ClassificationTester() + + val chunkerAnnotator = new ChunkerAnnotator() + testData.foreach({ textAnnotation: TextAnnotation => + // Remove POS View before evaluation. + textAnnotation.removeView(ViewNames.POS) + + chunkerAnnotator.addView(textAnnotation) + + val goldView = textAnnotation.getView(SHALLOW_PARSE_GOLD_SPAN_VIEW) + val annotatedView = textAnnotation.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) + + // Workaround for incorrect ConstituentLabelingEvaluator behaviour. 
+ val predictedView = new SpanLabelView(SHALLOW_PARSE_GOLD_SPAN_VIEW, textAnnotation) + annotatedView.getConstituents.foreach({ cons: Constituent => + predictedView.addConstituent(cons.cloneForNewView(SHALLOW_PARSE_GOLD_SPAN_VIEW)) + }) + + evaluator.evaluate(tester, goldView, predictedView) + }) + + println(tester.getPerformanceTable.toOrgTable) } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala index 85ade7a7..b6d44d01 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala @@ -6,19 +6,13 @@ */ package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker -import edu.illinois.cs.cogcomp.core.datastructures.ViewNames -import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, Sentence, TextAnnotation } +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, Sentence } import scala.collection.JavaConversions._ object ChunkerSensors { - def getSentencesInDocument(document: TextAnnotation): Seq[Sentence] = { - val numberOfSentences = document.getNumberOfSentences - (0 until numberOfSentences).map(document.getSentence) - } - def getTokensInSentence(sentence: Sentence): Seq[Constituent] = { - sentence.getView(ViewNames.SHALLOW_PARSE).getConstituents + sentence.getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW).getConstituents } } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala new file mode 100644 index 00000000..14374cd4 --- /dev/null +++ 
b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala @@ -0,0 +1,64 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{SpanLabelView, TextAnnotation, TokenLabelView} +import edu.illinois.cs.cogcomp.saul.util.Logging + +import scala.collection.JavaConversions._ + +object ChunkerUtilities extends Logging { + + /** Convert the gold BIO labelling to Span Label View + * Note: Use this method only for the GOLD view as this does not perform error handling. */ + def addGoldSpanLabelView(ta: TextAnnotation, sourceBIOView: String, destView: String): Unit = { + assert(ta.hasView(sourceBIOView)) + assert(!ta.hasView(destView)) + + val destinationView = new SpanLabelView(destView, ta) + + var currentSpanStart = -1 + var currentSpanEnd = -1 + var currentTag = "" + + ta.getView(sourceBIOView).getConstituents.foreach({ constituent => + val inASpan = currentSpanStart != -1 + + if (inASpan) { + if (constituent.getLabel.startsWith("O") || constituent.getLabel.startsWith("B-")) { + destinationView.addSpanLabel(currentSpanStart, currentSpanEnd, currentTag, 1.0d) + currentSpanStart = -1 + currentSpanEnd = -1 + currentTag = "" + } else { + // Label Starts with I- + if (constituent.getLabel.endsWith(currentTag)) { + currentSpanEnd = constituent.getEndSpan + } else { + destinationView.addSpanLabel(currentSpanStart, currentSpanEnd, currentTag, 1.0d) + logger.info("Dangling I-label") + + currentSpanStart = -1 + currentSpanEnd = -1 + currentTag = "" + } + } + } + + if (constituent.getLabel.startsWith("B-")) { + currentSpanStart = constituent.getStartSpan + 
currentSpanEnd = constituent.getEndSpan + currentTag = constituent.getLabel.substring(2) + } + else if (!inASpan && constituent.getLabel.startsWith("I-")) { + logger.info(s"Dangling I- label for constituent - $constituent") + } + }) + + ta.addView(destView, destinationView) + } +} From 0ff8d2f2e54ca06cdc1c54d99a135d0ff543cbcb Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 05:59:05 -0600 Subject: [PATCH 05/11] Add heuristic decoding of BIO annotation. --- .../nlp/Chunker/ChunkerAnnotator.scala | 19 +++- .../nlp/Chunker/ChunkerUtilities.scala | 87 +++++++++++++++---- 2 files changed, 86 insertions(+), 20 deletions(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala index eea96de0..8c818d6f 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala @@ -14,7 +14,12 @@ import edu.illinois.cs.cogcomp.saulexamples.nlp.POSTagger.{POSAnnotator, POSTagg import scala.collection.JavaConversions._ -class ChunkerAnnotator extends Annotator(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW, Array(ViewNames.TOKENS)) { +/** Chunker Annotator implementation + * + * @param useHeuristics To use heuristics to fix BIO annotation. 
+ */ +class ChunkerAnnotator(val useHeuristics: Boolean = true) + extends Annotator(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW, Array(ViewNames.TOKENS)) { override def initialize(rm: ResourceManager): Unit = {} @@ -43,7 +48,17 @@ class ChunkerAnnotator extends Annotator(ChunkerConstants.SHALLOW_PARSE_ANNOTATE ta.addView(chunkerBIOView.getViewName, chunkerBIOView) - ChunkerUtilities.addGoldSpanLabelView(ta, chunkerBIOView.getViewName, ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) + if (useHeuristics) { + ChunkerUtilities.addSpanLabelViewUsingHeuristics( + ta, + chunkerBIOView.getViewName, + ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) + } else { + ChunkerUtilities.addGoldSpanLabelView( + ta, + chunkerBIOView.getViewName, + ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) + } } } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala index 14374cd4..a48e81ff 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala @@ -6,7 +6,7 @@ */ package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker -import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{SpanLabelView, TextAnnotation, TokenLabelView} +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, SpanLabelView, TextAnnotation } import edu.illinois.cs.cogcomp.saul.util.Logging import scala.collection.JavaConversions._ @@ -21,44 +21,95 @@ object ChunkerUtilities extends Logging { val destinationView = new SpanLabelView(destView, ta) - var currentSpanStart = -1 - var currentSpanEnd = -1 - var currentTag = "" + var currentChunkStart = -1 + var currentChunkEnd = -1 + var cLabel = "" ta.getView(sourceBIOView).getConstituents.foreach({ 
constituent => - val inASpan = currentSpanStart != -1 + val inASpan = currentChunkStart != -1 if (inASpan) { if (constituent.getLabel.startsWith("O") || constituent.getLabel.startsWith("B-")) { - destinationView.addSpanLabel(currentSpanStart, currentSpanEnd, currentTag, 1.0d) - currentSpanStart = -1 - currentSpanEnd = -1 - currentTag = "" + destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d) + currentChunkStart = -1 + currentChunkEnd = -1 + cLabel = "" } else { // Label Starts with I- - if (constituent.getLabel.endsWith(currentTag)) { - currentSpanEnd = constituent.getEndSpan + if (constituent.getLabel.endsWith(cLabel)) { + currentChunkEnd = constituent.getEndSpan } else { - destinationView.addSpanLabel(currentSpanStart, currentSpanEnd, currentTag, 1.0d) + destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d) logger.info("Dangling I-label") - currentSpanStart = -1 - currentSpanEnd = -1 - currentTag = "" + currentChunkStart = -1 + currentChunkEnd = -1 + cLabel = "" } } } if (constituent.getLabel.startsWith("B-")) { - currentSpanStart = constituent.getStartSpan - currentSpanEnd = constituent.getEndSpan - currentTag = constituent.getLabel.substring(2) + currentChunkStart = constituent.getStartSpan + currentChunkEnd = constituent.getEndSpan + cLabel = constituent.getLabel.substring(2) } else if (!inASpan && constituent.getLabel.startsWith("I-")) { logger.info(s"Dangling I- label for constituent - $constituent") } }) + if (currentChunkStart != -1 && currentChunkEnd != -1 && cLabel.nonEmpty) { + destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d) + } + + ta.addView(destView, destinationView) + } + + def addSpanLabelViewUsingHeuristics(ta: TextAnnotation, sourceBIOView: String, destView: String): Unit = { + assert(ta.hasView(sourceBIOView)) + assert(!ta.hasView(destView)) + + val destinationView = new SpanLabelView(destView, ta) + + var currentChunkStart = -1 + var currentChunkEnd = -1 + var 
cLabel = "" + var previousConstituent: Option[Constituent] = None + + ta.getView(sourceBIOView).getConstituents.foreach({ constituent => + // Running version of current constituent's predicted label. + var currentLabel = constituent.getLabel + + if (currentLabel.startsWith("I-")) { + if (cLabel.isEmpty) { + currentLabel = "B" + currentLabel.substring(1) + } else if (!currentLabel.endsWith(cLabel)) { + currentLabel = "B" + currentLabel.substring(1) + } + } + + if ((currentLabel.startsWith("B-") || currentLabel.startsWith("O")) && cLabel.nonEmpty) { + if (previousConstituent.nonEmpty) { + currentChunkEnd = previousConstituent.get.getEndSpan + destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d) + cLabel = "" + } + } + + if (currentLabel.startsWith("B-")) { + currentChunkStart = constituent.getStartSpan + cLabel = currentLabel.substring(2) + } + + previousConstituent = Some(constituent) + }) + + if (cLabel.nonEmpty && previousConstituent.nonEmpty) { + currentChunkEnd = previousConstituent.get.getEndSpan + destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d) + } + ta.addView(destView, destinationView) } } From 6b23f6c7164788fbb04618ca934b76acc0787c34 Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 15:38:43 -0600 Subject: [PATCH 06/11] Cleanup the main class and some logical separation. 
--- .../saulexamples/nlp/Chunker/ChunkerApp.scala | 177 +++++++++--------- .../nlp/Chunker/ChunkerClassifiers.scala | 4 +- .../nlp/Chunker/ChunkerConstants.scala | 16 ++ .../nlp/Chunker/ChunkerDataModel.scala | 24 ++- .../nlp/Chunker/ChunkerDataReader.scala | 78 ++++++++ .../nlp/Chunker/ChunkerSensors.scala | 1 + .../nlp/Chunker/ChunkerUtilities.scala | 1 + 7 files changed, 201 insertions(+), 100 deletions(-) create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerConstants.scala create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataReader.scala diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala index d71bd07a..49195f3e 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala @@ -6,126 +6,119 @@ */ package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker -import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder +import java.util.Properties + import edu.illinois.cs.cogcomp.core.datastructures.ViewNames import edu.illinois.cs.cogcomp.core.datastructures.textannotation._ import edu.illinois.cs.cogcomp.core.experiments.ClassificationTester import edu.illinois.cs.cogcomp.core.experiments.evaluators.ConstituentLabelingEvaluator +import edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager import edu.illinois.cs.cogcomp.saul.classifier.ClassifierUtils +import edu.illinois.cs.cogcomp.saul.util.Logging import scala.collection.JavaConversions._ -import scala.collection.mutable -import scala.io.Source - -object ChunkerConstants { - val SHALLOW_PARSE_GOLD_SPAN_VIEW = "SHALLOW_PARSE_GOLD" - val SHALLOW_PARSE_GOLD_BIO_VIEW = "SHALLOW_PARSE_GOLD_BIO" - 
val SHALLOW_PARSE_ANNOTATED_SPAN_VIEW = "SHALLOW_PARSE_ANNOTATED" - val SHALLOW_PARSE_ANNOTATED_BIO_VIEW = "SHALLOW_PARSE_ANNOTATED_BIO" -} -object ChunkerApp extends App { +object ChunkerApp extends Logging { import ChunkerConstants._ val trainFile = "../data/conll2000chunking/train.txt" val testFile = "../data/conll2000chunking/test.txt" - def parseData(fileName: String): Seq[TextAnnotation] = { - val arrayBuffer = mutable.Buffer[TextAnnotation]() - - val tokenConstituents = mutable.ArrayBuffer[String]() - val posLabels = mutable.ArrayBuffer[String]() - val chunkLabels = mutable.ArrayBuffer[String]() - var numSentences = 0 - - Source.fromFile(fileName) - .getLines() - .foreach({ line: String => - if (line.isEmpty) { - val sentenceList = List(tokenConstituents.toArray[String]) - val textAnnotation = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(sentenceList) - - val posView = new TokenLabelView(ViewNames.POS, textAnnotation) - val chunkLabelView = new SpanLabelView(SHALLOW_PARSE_GOLD_BIO_VIEW, textAnnotation) - - textAnnotation.getView(ViewNames.TOKENS) - .getConstituents - .zipWithIndex - .foreach({ - case (constituent: Constituent, idx: Int) => - val posCons = constituent.cloneForNewViewWithDestinationLabel(ViewNames.POS, posLabels(idx)) - posView.addConstituent(posCons) - - val chunkCons = constituent.cloneForNewViewWithDestinationLabel(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabels(idx)) - chunkLabelView.addConstituent(chunkCons) - }) - - textAnnotation.addView(ViewNames.POS, posView) - textAnnotation.addView(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabelView) - - ChunkerUtilities.addGoldSpanLabelView(textAnnotation, SHALLOW_PARSE_GOLD_BIO_VIEW, SHALLOW_PARSE_GOLD_SPAN_VIEW) - - arrayBuffer.append(textAnnotation) - tokenConstituents.clear() - posLabels.clear() - chunkLabels.clear() - - numSentences += 1 - } else { - val reader = line.split(" ") - tokenConstituents.append(reader(0)) - posLabels.append(reader(1)) - chunkLabels.append(reader(2)) - } - }) + val 
jarModelPath = "" - println("Number of sentences = " + numSentences) + object ChunkerExperimentType extends Enumeration { + val TrainAndTest, TestFromModel, Interactive = Value - arrayBuffer + def withNameOpt(s: String): Option[Value] = values.find(_.toString == s) } - lazy val trainData = parseData(trainFile) - lazy val testData = parseData(testFile) + def main(args: Array[String]): Unit = { + /** Try to parse the experiment type as input argument or use default */ + val testType = args.headOption + .flatMap(ChunkerExperimentType.withNameOpt) + .getOrElse(ChunkerExperimentType.Interactive) + + testType match { + case ChunkerExperimentType.TrainAndTest => trainAndTest() + case ChunkerExperimentType.TestFromModel => testWithPretrainedModels() + case ChunkerExperimentType.Interactive => interactiveWithPretrainedModels() + } + } - val jarModelPath = "" + private def loadModelFromJarPath(): Unit = { + // Load model from jar path +// ClassifierUtils.LoadClassifier( +// jarModelPath, +// ChunkerClassifiers.ChunkerClassifier) + ChunkerClassifiers.ChunkerClassifier.load() + } + + private def getSentencesInTextAnnotation(taSeq: Seq[TextAnnotation]) = { + taSeq.flatMap({ textAnnotation: TextAnnotation => + (0 until textAnnotation.getNumberOfSentences).map(textAnnotation.getSentence) + }) + } + + lazy val trainData = ChunkerDataReader.parseData(trainFile) + lazy val testData = ChunkerDataReader.parseData(testFile) + + lazy val preTrainedAnnotator: ChunkerAnnotator = { + loadModelFromJarPath() - trainData.foreach({ textAnnotation: TextAnnotation => - val numberOfSentences = textAnnotation.getNumberOfSentences - val sentences = (0 until numberOfSentences).map(textAnnotation.getSentence) - ChunkerDataModel.sentence.populate(sentences, train = true) - }) + val annotatorInstance = new ChunkerAnnotator() + annotatorInstance.initialize(new ResourceManager(new Properties())) + annotatorInstance + } - testData.foreach({ textAnnotation: TextAnnotation => - val numberOfSentences = 
textAnnotation.getNumberOfSentences - val sentences = (0 until numberOfSentences).map(textAnnotation.getSentence) - ChunkerDataModel.sentence.populate(sentences, train = false) - }) + /** Note: This function does NOT populate testing instances. + * Also does not use GOLD POS tags. Instead a trained POSAnnotater is used. */ + private def testModelImpl(): Unit = { + ClassifierUtils.TestClassifiers(ChunkerClassifiers.ChunkerClassifier) - ChunkerClassifiers.ChunkerClassifier.learn(10) - println(ChunkerClassifiers.ChunkerClassifier.test()) + val evaluator = new ConstituentLabelingEvaluator() + val tester = new ClassificationTester() - val evaluator = new ConstituentLabelingEvaluator() - val tester = new ClassificationTester() + testData.foreach({ textAnnotation: TextAnnotation => + // Remove POS View before evaluation. + textAnnotation.removeView(ViewNames.POS) - val chunkerAnnotator = new ChunkerAnnotator() - testData.foreach({ textAnnotation: TextAnnotation => - // Remove POS View before evaluation. - textAnnotation.removeView(ViewNames.POS) + preTrainedAnnotator.addView(textAnnotation) - chunkerAnnotator.addView(textAnnotation) + val goldView = textAnnotation.getView(SHALLOW_PARSE_GOLD_SPAN_VIEW) + val annotatedView = textAnnotation.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) - val goldView = textAnnotation.getView(SHALLOW_PARSE_GOLD_SPAN_VIEW) - val annotatedView = textAnnotation.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW) + // Workaround for incorrect ConstituentLabelingEvaluator behaviour. + val predictedView = new SpanLabelView(SHALLOW_PARSE_GOLD_SPAN_VIEW, textAnnotation) + annotatedView.getConstituents.foreach({ cons: Constituent => + predictedView.addConstituent(cons.cloneForNewView(SHALLOW_PARSE_GOLD_SPAN_VIEW)) + }) - // Workaround for incorrect ConstituentLabelingEvaluator behaviour. 
- val predictedView = new SpanLabelView(SHALLOW_PARSE_GOLD_SPAN_VIEW, textAnnotation) - annotatedView.getConstituents.foreach({ cons: Constituent => - predictedView.addConstituent(cons.cloneForNewView(SHALLOW_PARSE_GOLD_SPAN_VIEW)) + evaluator.evaluate(tester, goldView, predictedView) }) - evaluator.evaluate(tester, goldView, predictedView) - }) + println(tester.getPerformanceTable.toOrgTable) + } + + def trainAndTest(): Unit = { + ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(trainData), train = true) + ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(testData), train = false) + + ChunkerClassifiers.ChunkerClassifier.learn(50) + ClassifierUtils.SaveClassifiers(ChunkerClassifiers.ChunkerClassifier) + + testModelImpl() + } - println(tester.getPerformanceTable.toOrgTable) + def testWithPretrainedModels(): Unit = { + loadModelFromJarPath() + + ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(testData), train = false) + + testModelImpl() + } + + def interactiveWithPretrainedModels(): Unit = { + + } } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala index 17d5f274..392781c7 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala @@ -28,6 +28,8 @@ object ChunkerClassifiers { /** Label property for users classifier */ override def label = chunkLabel - override def feature = using(wordTypeInformation, affixes, posWindow, capitalizationWindowProperty, previousTags) + override def feature = using( + wordTypeInformation, affixes, posWindow, capitalizationWindowProperty, previousTags, + mixed, SOPrevious) } } diff --git 
a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerConstants.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerConstants.scala new file mode 100644 index 00000000..bd83ba40 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerConstants.scala @@ -0,0 +1,16 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +/** Constants used by the Chunker experiment */ +object ChunkerConstants { + val SHALLOW_PARSE_GOLD_SPAN_VIEW = "SHALLOW_PARSE_GOLD" + val SHALLOW_PARSE_GOLD_BIO_VIEW = "SHALLOW_PARSE_GOLD_BIO" + + val SHALLOW_PARSE_ANNOTATED_SPAN_VIEW = "SHALLOW_PARSE_ANNOTATED" + val SHALLOW_PARSE_ANNOTATED_BIO_VIEW = "SHALLOW_PARSE_ANNOTATED_BIO" +} diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala index fe230581..24878d3f 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -13,6 +13,7 @@ import edu.illinois.cs.cogcomp.edison.features.factory.WordFeatureExtractorFacto import edu.illinois.cs.cogcomp.edison.features.lrec.{ Affixes, POSWindow, WordTypeInformation } import edu.illinois.cs.cogcomp.saul.datamodel.DataModel +import scala.collection.mutable import scala.collection.JavaConversions._ object ChunkerDataModel extends DataModel { @@ -85,24 +86,33 @@ object ChunkerDataModel extends DataModel 
{ // Formpp Feature val formpp = property(tokens, "Formpp") { token: Constituent => val window = 2 + val contextBuffer = new mutable.ArrayBuffer[String]() + val surfaceForms: List[String] = forms(token) // Feature range - val range = for { + for { j <- 0 until window i <- surfaceForms.indices - } yield (j, i) - - range.map({ case (j: Int, i: Int) => + } { val contextStrings = for { context <- 0 until window if i + context < surfaceForms.length } yield s"${i}_${j}:${surfaceForms(i + context)}" - contextStrings.mkString("_") - }) - .toList + contextBuffer.append(contextStrings.mkString("_")) + } + + contextBuffer.toList } + // Mixed Feature + val mixed = property(tokens, "Mixed") { token: Constituent => + "" + } + // SO Previous Feature + val SOPrevious = property(tokens, "SOPrevious") { token: Constituent => + "" + } } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataReader.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataReader.scala new file mode 100644 index 00000000..4df460a7 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataReader.scala @@ -0,0 +1,78 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. See + * the LICENSE file in the root folder for details. 
Copyright (c) 2016 + * + * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign + * http://cogcomp.cs.illinois.edu/ + */ +package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker + +import edu.illinois.cs.cogcomp.annotation.BasicTextAnnotationBuilder +import edu.illinois.cs.cogcomp.core.datastructures.ViewNames +import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{Constituent, SpanLabelView, TextAnnotation, TokenLabelView} +import edu.illinois.cs.cogcomp.saul.util.Logging + +import scala.collection.mutable +import scala.collection.JavaConversions._ +import scala.io.Source + +/** Data Reader for the CONLL format for training the Chunker */ +object ChunkerDataReader extends Logging { + import ChunkerConstants._ + + /** Parse the input data and create the POS View and GOLD Shallow Parse BIO View */ + def parseData(fileName: String): Seq[TextAnnotation] = { + logger.info(s"Parsing file - $fileName") + + val arrayBuffer = mutable.Buffer[TextAnnotation]() + + val tokenConstituents = mutable.ArrayBuffer[String]() + val posLabels = mutable.ArrayBuffer[String]() + val chunkLabels = mutable.ArrayBuffer[String]() + var numSentences = 0 + + Source.fromFile(fileName) + .getLines() + .foreach({ line: String => + if (line.isEmpty) { + val sentenceList = List(tokenConstituents.toArray[String]) + val textAnnotation = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(sentenceList) + + val posView = new TokenLabelView(ViewNames.POS, textAnnotation) + val chunkLabelView = new SpanLabelView(SHALLOW_PARSE_GOLD_BIO_VIEW, textAnnotation) + + textAnnotation.getView(ViewNames.TOKENS) + .getConstituents + .zipWithIndex + .foreach({ + case (constituent: Constituent, idx: Int) => + val posCons = constituent.cloneForNewViewWithDestinationLabel(ViewNames.POS, posLabels(idx)) + posView.addConstituent(posCons) + + val chunkCons = constituent.cloneForNewViewWithDestinationLabel(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabels(idx)) + 
chunkLabelView.addConstituent(chunkCons) + }) + + textAnnotation.addView(ViewNames.POS, posView) + textAnnotation.addView(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabelView) + + ChunkerUtilities.addGoldSpanLabelView(textAnnotation, SHALLOW_PARSE_GOLD_BIO_VIEW, SHALLOW_PARSE_GOLD_SPAN_VIEW) + + arrayBuffer.append(textAnnotation) + tokenConstituents.clear() + posLabels.clear() + chunkLabels.clear() + + numSentences += 1 + } else { + val reader = line.split(" ") + tokenConstituents.append(reader(0)) + posLabels.append(reader(1)) + chunkLabels.append(reader(2)) + } + }) + + logger.info(s"Number of sentences : $numSentences") + + arrayBuffer + } +} diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala index b6d44d01..c5cc479a 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala @@ -12,6 +12,7 @@ import scala.collection.JavaConversions._ object ChunkerSensors { + /** Sensor to populate tokens node from a Sentence instance */ def getTokensInSentence(sentence: Sentence): Seq[Constituent] = { sentence.getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW).getConstituents } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala index a48e81ff..269f6323 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerUtilities.scala @@ -66,6 +66,7 @@ object ChunkerUtilities extends Logging { ta.addView(destView, destinationView) } + /** Convert BIO labelled annotation to 
SpanLabelView using some heuristics to handle error scenarios */ def addSpanLabelViewUsingHeuristics(ta: TextAnnotation, sourceBIOView: String, destView: String): Unit = { assert(ta.hasView(sourceBIOView)) assert(!ta.hasView(destView)) From e1a82e3de3a0e3c74351a0edbf93ff49452ccf7b Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 15:40:00 -0600 Subject: [PATCH 07/11] Add all required features. --- .../cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala index 392781c7..5dc3a964 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerClassifiers.scala @@ -30,6 +30,6 @@ object ChunkerClassifiers { override def feature = using( wordTypeInformation, affixes, posWindow, capitalizationWindowProperty, previousTags, - mixed, SOPrevious) + forms, formpp, mixed, SOPrevious) } } From 6f9e56278495b96085d82c9e6ecf0a8b2079a5da Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 16:09:23 -0600 Subject: [PATCH 08/11] Some fixes. Remove redundant files etc. 
--- .../nlp/Chunker/ChunkerAnnotator.scala | 2 +- .../saulexamples/nlp/Chunker/ChunkerApp.scala | 21 ++++++++++++++++++- .../nlp/Chunker/ChunkerDataModel.scala | 15 +++++++++---- .../nlp/Chunker/ChunkerSensors.scala | 19 ----------------- 4 files changed, 32 insertions(+), 25 deletions(-) delete mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala index 8c818d6f..ef8a37f5 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala @@ -32,7 +32,7 @@ class ChunkerAnnotator(val useHeuristics: Boolean = true) ChunkerAnnotator.localPOSAnnotator.addView(ta) } - val tokens = ta.getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW).getConstituents + val tokens = ta.getView(ViewNames.TOKENS).getConstituents ChunkerDataModel.sentence.clear() val sentences = (0 until ta.getNumberOfSentences).map(ta.getSentence) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala index 49195f3e..bb9fef14 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerApp.scala @@ -13,10 +13,13 @@ import edu.illinois.cs.cogcomp.core.datastructures.textannotation._ import edu.illinois.cs.cogcomp.core.experiments.ClassificationTester import edu.illinois.cs.cogcomp.core.experiments.evaluators.ConstituentLabelingEvaluator import 
edu.illinois.cs.cogcomp.core.utilities.configuration.ResourceManager +import edu.illinois.cs.cogcomp.nlp.tokenizer.StatefulTokenizer +import edu.illinois.cs.cogcomp.nlp.utility.TokenizerTextAnnotationBuilder import edu.illinois.cs.cogcomp.saul.classifier.ClassifierUtils import edu.illinois.cs.cogcomp.saul.util.Logging import scala.collection.JavaConversions._ +import scala.io.StdIn object ChunkerApp extends Logging { @@ -119,6 +122,22 @@ object ChunkerApp extends Logging { } def interactiveWithPretrainedModels(): Unit = { - + val taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer()) + + while (true) { + println("Enter a sentence to annotate (or Press Enter to exit)") + val input = StdIn.readLine() + + input match { + case sentence: String if sentence.trim.nonEmpty => + // Create a Text Annotation with the current input sentence. + val ta = taBuilder.createTextAnnotation(sentence.trim) + preTrainedAnnotator.addView(ta) + println("POS View : " + ta.getView(ViewNames.POS).toString) + println("Annotated BIO View : " + ta.getView(SHALLOW_PARSE_ANNOTATED_BIO_VIEW)) + println("Annotated Span View : " + ta.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW)) + case _ => return + } + } } } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala index 24878d3f..a3ff1a20 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -12,6 +12,7 @@ import edu.illinois.cs.cogcomp.edison.features.ContextFeatureExtractor import edu.illinois.cs.cogcomp.edison.features.factory.WordFeatureExtractorFactory import edu.illinois.cs.cogcomp.edison.features.lrec.{ Affixes, POSWindow, WordTypeInformation } import edu.illinois.cs.cogcomp.saul.datamodel.DataModel 
+import edu.illinois.cs.cogcomp.saulexamples.nlp.CommonSensors import scala.collection.mutable import scala.collection.JavaConversions._ @@ -21,10 +22,16 @@ object ChunkerDataModel extends DataModel { val tokens = node[Constituent] val sentenceToTokens = edge(sentence, tokens) - sentenceToTokens.addSensor(ChunkerSensors.getTokensInSentence _) - - // Label - val chunkLabel = property(tokens, "ChunkLabel") { token: Constituent => token.getLabel } + sentenceToTokens.addSensor(CommonSensors.sentenceToTokens _) + + // GOLD BIO label for SHALLOW_PARSE + val chunkLabel = property(tokens, "ChunkLabel") { token: Constituent => + token.getTextAnnotation + .getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW) + .getConstituentsCovering(token) + .head + .getLabel + } // Affixes feature private val affixFeatureExtractor = new Affixes(ViewNames.TOKENS) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala deleted file mode 100644 index c5cc479a..00000000 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerSensors.scala +++ /dev/null @@ -1,19 +0,0 @@ -/** This software is released under the University of Illinois/Research and Academic Use License. See - * the LICENSE file in the root folder for details. 
Copyright (c) 2016 - * - * Developed by: The Cognitive Computations Group, University of Illinois at Urbana-Champaign - * http://cogcomp.cs.illinois.edu/ - */ -package edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker - -import edu.illinois.cs.cogcomp.core.datastructures.textannotation.{ Constituent, Sentence } - -import scala.collection.JavaConversions._ - -object ChunkerSensors { - - /** Sensor to populate tokens node from a Sentence instance */ - def getTokensInSentence(sentence: Sentence): Seq[Constituent] = { - sentence.getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW).getConstituents - } -} From ed0d77f27855f96c5b0aca01e7ee02d3a5e5acaa Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 17:05:33 -0600 Subject: [PATCH 09/11] Add the remaining features.` --- .../nlp/Chunker/ChunkerDataModel.scala | 86 ++++++++++++++++++- 1 file changed, 83 insertions(+), 3 deletions(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala index a3ff1a20..bc4e0284 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -114,12 +114,92 @@ object ChunkerDataModel extends DataModel { } // Mixed Feature + private val mixedBefore = 2 + private val mixedAfter = 2 + private val mixedK = 2 val mixed = property(tokens, "Mixed") { token: Constituent => - "" + val tokenNeighborhood = tokens.getWithWindow(token, -mixedBefore, mixedAfter, sameSentenceTokensFilter).flatten + + val tags = new mutable.ArrayBuffer[String](mixedBefore + mixedAfter + 1) + val forms = new mutable.ArrayBuffer[String](mixedBefore + mixedAfter + 1) + + tokenNeighborhood.foreach({ tokenNear: Constituent => + val posCons = tokenNear.getTextAnnotation + 
.getView(ViewNames.POS) + .getConstituentsCovering(tokenNear) + .head + + tags.append(posCons.getLabel) + forms.append(tokenNear.getSurfaceForm) + }) + + val mixedFeatures = new mutable.ArrayBuffer[String]() + + for { + j <- 1 to mixedK + x <- 0 to 2 + } { + var t: Boolean = true + tags.zipWithIndex + .foreach({ case (tag: String, i: Int) => + val stringBuffer = new StringBuffer() + + for { + context <- 0 until j + if i + context < tags.size + } { + if (context != 0) stringBuffer.append("_") + + if (t && x == 0) { + stringBuffer.append(tags(i + context)) + } else { + stringBuffer.append(forms(i + context)) + } + + t = !t + } + + mixedFeatures.append(s"${i}_${j}:${stringBuffer.toString}") + }) + } + + mixedFeatures.toList } // SO Previous Feature - val SOPrevious = property(tokens, "SOPrevious") { token: Constituent => - "" + val SOPrevious = property(tokens, "SOPrevious", cache = true) { token: Constituent => + val tokenNeighborhood = tokens.getWithWindow(token, -2, 0, sameSentenceTokensFilter).flatten + + val tags = new mutable.ArrayBuffer[String](3) + val labels = new mutable.ArrayBuffer[String](2) + + tokenNeighborhood.foreach({ tokenNear: Constituent => + val posCons = tokenNear.getTextAnnotation + .getView(ViewNames.POS) + .getConstituentsCovering(tokenNear) + .head + + tags.append(posCons.getLabel) + + // Use Label while training and prediction while testing. 
+ if (ChunkerClassifiers.ChunkerClassifier.isTraining) { + labels.append(chunkLabel(tokenNear)) + } else { + labels.append(ChunkerClassifiers.ChunkerClassifier(tokenNear)) + } + }) + + tags.append(tags.last) + + val features = new mutable.ArrayBuffer[String]() + + if (labels.size >= 2) { + features.append(s"ll:${labels(0)}_${labels(1)}") + features.append(s"lt2:${labels(1)}_${tags(2)}") + } + + features.append(s"lt1:${labels(0)}_${tags(1)}") + + features.toList } } From cfdef89094cc20cf23df443dfca6bf4dca37a865 Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 18:55:17 -0600 Subject: [PATCH 10/11] Add Readme for Chunker. --- saul-examples/README.md | 7 ++- .../nlp/Chunker/ChunkerDataModel.scala | 25 ++++------ .../saulexamples/nlp/Chunker/README.md | 46 +++++++++++++++++++ 3 files changed, 61 insertions(+), 17 deletions(-) create mode 100644 saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md diff --git a/saul-examples/README.md b/saul-examples/README.md index fbca5a9e..16a0094e 100644 --- a/saul-examples/README.md +++ b/saul-examples/README.md @@ -31,12 +31,15 @@ In Spatial Role Labeling, we try to find spatial relations and label spatial rol 6. [Part-of-Speech Tagging](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/POSTagger/README.md): Part-of-Speech Tagging is the identification of words as nouns, verbs, adjectives, adverbs, etc. -7. [Twitter Sentiment Analysis] (src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/TwitterSentimentAnalysis/README.md): +7. [Twitter Sentiment Analysis](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/TwitterSentimentAnalysis/README.md): This example trains models for classifying twitter posts as positive, negative or neutral. It also includes a twitter client for real time processing of the tweets. -8. [Question Type Classification] (src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/QuestionTypeClassification/README.md): +8. 
[Question Type Classification](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/QuestionTypeClassification/README.md): This example contains a classifications to categorize questions into different semantic classes based on the possible semantic types of the answers. +9. [Chunker](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md): +This example contains an implementation of a Shallow Parsing system. + * Note: Examples are under active development. diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala index bc4e0284..5dff966c 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerDataModel.scala @@ -124,12 +124,7 @@ object ChunkerDataModel extends DataModel { val forms = new mutable.ArrayBuffer[String](mixedBefore + mixedAfter + 1) tokenNeighborhood.foreach({ tokenNear: Constituent => - val posCons = tokenNear.getTextAnnotation - .getView(ViewNames.POS) - .getConstituentsCovering(tokenNear) - .head - - tags.append(posCons.getLabel) + tags.append(CommonSensors.getPosTag(tokenNear)) forms.append(tokenNear.getSurfaceForm) }) @@ -168,18 +163,13 @@ object ChunkerDataModel extends DataModel { // SO Previous Feature val SOPrevious = property(tokens, "SOPrevious", cache = true) { token: Constituent => - val tokenNeighborhood = tokens.getWithWindow(token, -2, 0, sameSentenceTokensFilter).flatten + val tokenNeighborhood = tokens.getWithWindow(token, -2, -1, sameSentenceTokensFilter).flatten val tags = new mutable.ArrayBuffer[String](3) val labels = new mutable.ArrayBuffer[String](2) tokenNeighborhood.foreach({ tokenNear: Constituent => - val posCons = tokenNear.getTextAnnotation - .getView(ViewNames.POS) -
.getConstituentsCovering(tokenNear) - .head - - tags.append(posCons.getLabel) + tags.append(CommonSensors.getPosTag(tokenNear)) // Use Label while training and prediction while testing. if (ChunkerClassifiers.ChunkerClassifier.isTraining) { @@ -189,16 +179,21 @@ object ChunkerDataModel extends DataModel { } }) - tags.append(tags.last) + tags.append(CommonSensors.getPosTag(token)) val features = new mutable.ArrayBuffer[String]() if (labels.size >= 2) { features.append(s"ll:${labels(0)}_${labels(1)}") + } + + if (labels.size >= 2 && tags.size >= 3) { features.append(s"lt2:${labels(1)}_${tags(2)}") } - features.append(s"lt1:${labels(0)}_${tags(1)}") + if (tags.size >= 2) { + features.append(s"lt1:${labels(0)}_${tags(1)}") + } features.toList } diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md new file mode 100644 index 00000000..a38939b3 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md @@ -0,0 +1,46 @@ +# Chunker (Shallow Parser) + +Chunking (Shallow Parsing) is the identification of constituents (noun groups, verbs, verb groups etc.) in a sentence. +The system implemented here is based on the following paper: + +``` +@inproceedings{PunyakanokRo01, + author = {V. Punyakanok and D.
Roth}, + title = {The Use of Classifiers in Sequential Inference}, + booktitle = {NIPS}, + pages = {995--1001}, + year = {2001}, + publisher = {MIT Press}, + acceptance = {25/514 (4.8\%) Oral Presentations; 152/514 (29%) overall}, + url = " http://cogcomp.cs.illinois.edu/papers/nips01.pdf", + funding = {NSF98 CAREER}, + projects = {LnI,SI,IE,NE,NLP,CCM}, + comment = {Structured, sequential output; Sequence Prediction: HMM with classifiers, Conditional Models, Constraint Satisfaction}, +} +``` + +## Performance + + +The data for the experiments was extracted from the dataset for the [CONLL 2000 Chunking Shared Task](http://www.cnts.ua.ac.be/conll2000/chunking/). +The training corpus consists of 8,936 sentences composed of 210,996 tokens totally. +The test corpus consists of 2,012 sentences composed of $$ tokens totally. + +### Evaluation: BIO Labeling + +### Evaluation: Span Labeling + +Note: While evaluation (testing), POS tags are provided by an implementation of [POSTagger in Saul](https://github.com/IllinoisCogComp/saul/blob/master/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/POSTagger/README.md). + +## Testing the Chunker interactively + +For a quick demo of the Chunker Tagger, you can run the following command in the project's root folder. + +```shell +sbt "project saulExamples" "runMain edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker.ChunkerApp" +``` + + +## Related + +If you are looking for an implementation of the Chunker in Java, have a look at [this repository](https://github.com/IllinoisCogComp/illinois-cogcomp-nlp/blob/master/chunker/README.md). \ No newline at end of file From 93bacfeaa800d240e73e80fdc36d3b8791a1afc9 Mon Sep 17 00:00:00 2001 From: Bhargav Mangipudi Date: Wed, 1 Mar 2017 22:27:49 -0600 Subject: [PATCH 11/11] Evaluation. 
--- .../saulexamples/nlp/Chunker/README.md | 43 ++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md index a38939b3..b5a9ff17 100644 --- a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md @@ -24,12 +24,53 @@ The system implemented here is based of the following paper: The data for the experiments was extracted from the dataset for the [CONLL 2000 Chunking Shared Task](http://www.cnts.ua.ac.be/conll2000/chunking/). The training corpus consists of 8,936 sentences composed of 210,996 tokens totally. -The test corpus consists of 2,012 sentences composed of $$ tokens totally. +The test corpus consists of 2,012 sentences composed of 47,372 tokens totally. ### Evaluation: BIO Labeling +| Label | Precision | Recall | F1 | LCount | PCount | +|-----------|-----------:|-------:|-------:|-------:|-------:| +| B-ADJP | 80.323 | 68.192 | 73.762 | 437 | 371 | +| B-ADVP | 82.275 | 79.330 | 80.776 | 866 | 835 | +| B-CONJP | 40.000 | 66.667 | 50.000 | 9 | 15 | +| B-INTJ | 100.000 | 50.000 | 66.667 | 2 | 1 | +| B-LST | 0.000 | 0.000 | 0.000 | 5 | 3 | +| B-NP | 95.718 | 96.412 | 96.064 | 12404 | 12494 | +| B-PP | 96.456 | 97.359 | 96.905 | 4808 | 4853 | +| B-PRT | 79.048 | 78.302 | 78.673 | 106 | 105 | +| B-SBAR | 87.674 | 82.430 | 84.971 | 535 | 503 | +| B-UCP | 0.000 | 0.000 | 0.000 | 0 | 52 | +| B-VP | 94.581 | 95.292 | 94.935 | 4652 | 4687 | +| I-ADJP | 77.982 | 50.898 | 61.594 | 167 | 109 | +| I-ADVP | 60.274 | 49.438 | 54.321 | 89 | 73 | +| I-CONJP | 55.556 | 76.923 | 64.516 | 13 | 18 | +| I-LST | 0.000 | 0.000 | 0.000 | 2 | 0 | +| I-NP | 96.251 | 95.795 | 96.023 | 14365 | 14297 | +| I-PP | 86.111 | 64.583 | 73.810 | 48 | 36 | +| I-PRT | 0.000 | 
0.000 | 0.000 | 0 | 1 | +| I-SBAR | 10.526 | 50.000 | 17.391 | 4 | 19 | +| I-UCP | 0.000 | 0.000 | 0.000 | 0 | 9 | +| I-VP | 94.935 | 93.712 | 94.319 | 2640 | 2606 | +| O | 95.172 |96.174 | 95.670 | 6169 | 6234 | +| **Accuracy** | **94.945** | **-** | **-** | **-** | **47321** | + ### Evaluation: Span Labeling +| Label | Total Gold | Total Predicted | Correct Prediction | Precision | Recall | F1 | +| ----- | ---:| ---:| ---:| ---:| ---:| ---:| +| ADJP | 438 | 515 | 296 | 57.48 | 67.58 | 62.12 | +| ADVP | 866 | 1032 | 670 | 64.92 | 77.37 | 70.6 | +| CONJP | 9 | 19 | 6 | 31.58 | 66.67 | 42.86 | +| INTJ | 2 | 2 | 1 | 50 | 50 | 50 | +| LST | 5 | 7 | 0 | 0 | 0 | 0 | +| NP | 12422 | 13376 | 11574 | 86.53 | 93.17 | 89.73 | +| PP | 4811 | 4994 | 4684 | 93.79 | 97.36 | 95.54 | +| PRT | 106 | 138 | 86 | 62.32 | 81.13 | 70.49 | +| SBAR | 535 | 603 | 444 | 73.63 | 82.99 | 78.03 | +| UCP | 0 | 63 | 0 | 0 | 0 | 0 | +| VP | 4658 | 5014 | 4335 | 86.46 | 93.07 | 89.64 | +| **All** | **23852** | **25763** | **22096** | **85.77** | **92.64** | **89.07** | + Note: While evaluation (testing), POS tags are provided by an implementation of [POSTagger in Saul](https://github.com/IllinoisCogComp/saul/blob/master/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/POSTagger/README.md). ## Testing the Chunker interactively