diff --git a/saul-examples/README.md b/saul-examples/README.md index fbca5a9e..16a0094e 100644 --- a/saul-examples/README.md +++ b/saul-examples/README.md @@ -31,12 +31,15 @@ In Spatial Role Labeling, we try to find spatial relations and label spatial rol 6. [Part-of-Speech Tagging](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/POSTagger/README.md): Part-of-Speech Tagging is the identification of words as nouns, verbs, adjectives, adverbs, etc. -7. [Twitter Sentiment Analysis] (src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/TwitterSentimentAnalysis/README.md): +7. [Twitter Sentiment Analysis](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/TwitterSentimentAnalysis/README.md): This example trains models for classifying twitter posts as positive, negative or neutral. It also includes a twitter client for real time processing of the tweets. -8. [Question Type Classification] (src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/QuestionTypeClassification/README.md): +8. [Question Type Classification](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/QuestionTypeClassification/README.md): This example contains a classifications to categorize questions into different semantic classes based on the possible semantic types of the answers. +9. [Chunker](src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/README.md): +This example contains an implementation of a Shallow Parsing system. + * Note: Examples are under active development. diff --git a/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala new file mode 100644 index 00000000..ef8a37f5 --- /dev/null +++ b/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/Chunker/ChunkerAnnotator.scala @@ -0,0 +1,68 @@ +/** This software is released under the University of Illinois/Research and Academic Use License. 
/** Annotator that adds shallow-parse (chunk) views to a TextAnnotation.
  *
  * The annotator is registered under `ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW`
  * and declares `ViewNames.TOKENS` as its only required view.
  *
  * @param useHeuristics when true, the predicted BIO labels are converted to spans with
  *                      `ChunkerUtilities.addSpanLabelViewUsingHeuristics`; otherwise
  *                      `ChunkerUtilities.addGoldSpanLabelView` is used on the predictions.
  */
class ChunkerAnnotator(val useHeuristics: Boolean = true)
  extends Annotator(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW, Array(ViewNames.TOKENS)) {

  /** Nothing to initialize from the resource manager. */
  override def initialize(rm: ResourceManager): Unit = {}

  /** Adds the predicted BIO view and the derived span view to `ta`.
    *
    * Steps, in order:
    *  1. add a POS view with the local POS annotator if `ta` has none;
    *  2. clear the data model's sentence node and populate it with the sentences of `ta`;
    *  3. classify every token and store the prediction in a new `TokenLabelView`
    *     named `ChunkerConstants.SHALLOW_PARSE_ANNOTATED_BIO_VIEW`;
    *  4. convert that BIO view into `ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW`.
    *
    * Note: assumes the chunker classifier has already been loaded or trained.
    *
    * @param ta TextAnnotation instance to annotate in place
    */
  override def addView(ta: TextAnnotation): Unit = {
    if (!ta.hasView(ViewNames.POS)) ChunkerAnnotator.localPOSAnnotator.addView(ta)

    val tokenConstituents = ta.getView(ViewNames.TOKENS).getConstituents

    // The data model is shared global state: reset it so only this document is populated.
    ChunkerDataModel.sentence.clear()
    ChunkerDataModel.sentence.populate(
      (0 until ta.getNumberOfSentences).map(ta.getSentence),
      train = false)

    val bioView = new TokenLabelView(ChunkerConstants.SHALLOW_PARSE_ANNOTATED_BIO_VIEW, ta)
    for (token <- tokenConstituents) {
      val predictedLabel = ChunkerClassifiers.ChunkerClassifier(token)
      bioView.addConstituent(
        token.cloneForNewViewWithDestinationLabel(bioView.getViewName, predictedLabel))
    }
    ta.addView(bioView.getViewName, bioView)

    // Pick the BIO -> span conversion once, then apply it with the shared arguments.
    val toSpanView: (TextAnnotation, String, String) => Unit =
      if (useHeuristics) ChunkerUtilities.addSpanLabelViewUsingHeuristics _
      else ChunkerUtilities.addGoldSpanLabelView _

    toSpanView(ta, bioView.getViewName, ChunkerConstants.SHALLOW_PARSE_ANNOTATED_SPAN_VIEW)
  }
}

object ChunkerAnnotator {
  /** POS annotator used only when the incoming TextAnnotation lacks a POS view; created on first use. */
  private lazy val localPOSAnnotator: POSAnnotator = POSTaggerApp.getPretrainedAnnotator()
}
/** Command-line entry point for training, evaluating and interactively running the Chunker.
  *
  * The first program argument selects the mode by the name of a
  * [[ChunkerApp.ChunkerExperimentType]] value; with no argument the interactive mode is used.
  */
object ChunkerApp extends Logging {
  import ChunkerConstants._

  val trainFile = "../data/conll2000chunking/train.txt"
  val testFile = "../data/conll2000chunking/test.txt"

  /** Currently unused: loading from a jar path is disabled in `loadModelFromJarPath`. */
  val jarModelPath = ""

  /** Modes in which the application can run. */
  object ChunkerExperimentType extends Enumeration {
    val TrainAndTest, TestFromModel, Interactive = Value

    /** Looks up a value by its exact name; `None` if no value has that name. */
    def withNameOpt(s: String): Option[Value] = values.find(_.toString == s)
  }

  /** Runs the mode named by `args(0)`, or the interactive mode when it is absent or unknown. */
  def main(args: Array[String]): Unit = {
    val experimentType = args.headOption match {
      case Some(name) =>
        ChunkerExperimentType.withNameOpt(name).getOrElse {
          // Previously an unknown name fell back to Interactive without any trace.
          logger.info(s"Unrecognized experiment type '$name' " +
            s"(expected one of ${ChunkerExperimentType.values.mkString(", ")}); " +
            "falling back to Interactive")
          ChunkerExperimentType.Interactive
        }
      case None => ChunkerExperimentType.Interactive
    }

    experimentType match {
      case ChunkerExperimentType.TrainAndTest => trainAndTest()
      case ChunkerExperimentType.TestFromModel => testWithPretrainedModels()
      case ChunkerExperimentType.Interactive => interactiveWithPretrainedModels()
    }
  }

  /** Loads the chunker model through the classifier's own `load()`.
    * NOTE(review): despite the name, `jarModelPath` is not used here; the jar-based
    * `ClassifierUtils.LoadClassifier(jarModelPath, ...)` call was commented out upstream.
    */
  private def loadModelFromJarPath(): Unit = {
    ChunkerClassifiers.ChunkerClassifier.load()
  }

  /** Flattens a sequence of TextAnnotations into the sequence of all their sentences. */
  private def getSentencesInTextAnnotation(taSeq: Seq[TextAnnotation]) = {
    taSeq.flatMap({ textAnnotation: TextAnnotation =>
      (0 until textAnnotation.getNumberOfSentences).map(textAnnotation.getSentence)
    })
  }

  // Parsed lazily so that the interactive mode never touches the data files.
  lazy val trainData = ChunkerDataReader.parseData(trainFile)
  lazy val testData = ChunkerDataReader.parseData(testFile)

  /** Annotator backed by the loaded model; the model is loaded on first access. */
  lazy val preTrainedAnnotator: ChunkerAnnotator = {
    loadModelFromJarPath()

    val annotatorInstance = new ChunkerAnnotator()
    annotatorInstance.initialize(new ResourceManager(new Properties()))
    annotatorInstance
  }

  /** Evaluates the classifier on BIO labels and on spans, printing the span results.
    *
    * Note: this function does NOT populate testing instances; callers must do so first.
    * Gold POS tags are not used: the POS view is removed from every test document so that
    * the annotator re-creates it with a trained POS annotator.
    */
  private def testModelImpl(): Unit = {
    ClassifierUtils.TestClassifiers(ChunkerClassifiers.ChunkerClassifier)

    val evaluator = new ConstituentLabelingEvaluator()
    val tester = new ClassificationTester()

    testData.foreach({ textAnnotation: TextAnnotation =>
      // Remove POS View before evaluation.
      textAnnotation.removeView(ViewNames.POS)

      preTrainedAnnotator.addView(textAnnotation)

      val goldView = textAnnotation.getView(SHALLOW_PARSE_GOLD_SPAN_VIEW)
      val annotatedView = textAnnotation.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW)

      // Workaround for incorrect ConstituentLabelingEvaluator behaviour: the predicted
      // constituents are copied into a view that carries the gold view's name.
      val predictedView = new SpanLabelView(SHALLOW_PARSE_GOLD_SPAN_VIEW, textAnnotation)
      annotatedView.getConstituents.foreach({ cons: Constituent =>
        predictedView.addConstituent(cons.cloneForNewView(SHALLOW_PARSE_GOLD_SPAN_VIEW))
      })

      evaluator.evaluate(tester, goldView, predictedView)
    })

    println(tester.getPerformanceTable.toOrgTable)
  }

  /** Trains the classifier on the training file, saves it, then evaluates on the test file. */
  def trainAndTest(): Unit = {
    ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(trainData), train = true)
    ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(testData), train = false)

    ChunkerClassifiers.ChunkerClassifier.learn(50)
    ClassifierUtils.SaveClassifiers(ChunkerClassifiers.ChunkerClassifier)

    testModelImpl()
  }

  /** Loads the saved model and evaluates it on the test file. */
  def testWithPretrainedModels(): Unit = {
    loadModelFromJarPath()

    ChunkerDataModel.sentence.populate(getSentencesInTextAnnotation(testData), train = false)

    testModelImpl()
  }

  /** Reads sentences from standard input and prints their POS, BIO and span views.
    *
    * The loop ends on the first blank line or at end of input (`readLine` returning null).
    */
  def interactiveWithPretrainedModels(): Unit = {
    val taBuilder = new TokenizerTextAnnotationBuilder(new StatefulTokenizer())

    Iterator
      .continually {
        println("Enter a sentence to annotate (or Press Enter to exit)")
        // Option(...) maps the null returned at end of input to the same "stop" signal as a blank line.
        Option(StdIn.readLine()).fold("")(_.trim)
      }
      .takeWhile(_.nonEmpty)
      .foreach { sentence =>
        // Create a Text Annotation with the current input sentence.
        val ta = taBuilder.createTextAnnotation(sentence)
        preTrainedAnnotator.addView(ta)
        println("POS View : " + ta.getView(ViewNames.POS).toString)
        println("Annotated BIO View : " + ta.getView(SHALLOW_PARSE_ANNOTATED_BIO_VIEW))
        println("Annotated Span View : " + ta.getView(SHALLOW_PARSE_ANNOTATED_SPAN_VIEW))
      }
  }
}

// ---- ChunkerClassifiers.scala ----

/** Classifiers used by the Chunker example. */
object ChunkerClassifiers {
  import ChunkerDataModel._

  /** Per-token BIO chunk-label classifier defined over the data model's `tokens` node. */
  object ChunkerClassifier extends Learnable[Constituent](tokens) {

    /** Sparse network learner whose base unit is an averaged perceptron
      * with learning rate 0.1 and thickness 0.2.
      */
    override lazy val classifier = {
      val perceptronParameters = new SparseAveragedPerceptron.Parameters()
      perceptronParameters.learningRate = 0.1
      perceptronParameters.thickness = 0.2

      new SparseNetworkLearner(new SparseAveragedPerceptron(perceptronParameters))
    }

    /** Gold BIO label the classifier is trained to predict. */
    override def label = chunkLabel

    /** Feature properties fed to the learner. */
    override def feature = using(
      wordTypeInformation, affixes, posWindow, capitalizationWindowProperty, previousTags,
      forms, formpp, mixed, SOPrevious)
  }
}
/** View names shared by the Chunker reader, data model, annotator and application. */
object ChunkerConstants {
  // Per-token BIO views.
  /** Name of the view holding the gold per-token BIO labels. */
  val SHALLOW_PARSE_GOLD_BIO_VIEW: String = "SHALLOW_PARSE_GOLD_BIO"
  /** Name of the view holding the predicted per-token BIO labels. */
  val SHALLOW_PARSE_ANNOTATED_BIO_VIEW: String = "SHALLOW_PARSE_ANNOTATED_BIO"

  // Span views derived from the BIO views.
  /** Name of the view holding the gold chunk spans. */
  val SHALLOW_PARSE_GOLD_SPAN_VIEW: String = "SHALLOW_PARSE_GOLD"
  /** Name of the view holding the predicted chunk spans. */
  val SHALLOW_PARSE_ANNOTATED_SPAN_VIEW: String = "SHALLOW_PARSE_ANNOTATED"
}
/** Saul data model for the Chunker: sentence and token nodes plus the token-level
  * label and feature properties used by `ChunkerClassifiers.ChunkerClassifier`.
  */
object ChunkerDataModel extends DataModel {
  val sentence = node[Sentence]
  val tokens = node[Constituent]

  val sentenceToTokens = edge(sentence, tokens)
  sentenceToTokens.addSensor(CommonSensors.sentenceToTokens _)

  /** Gold BIO label: label of the first constituent of the gold BIO view covering the token. */
  val chunkLabel = property(tokens, "ChunkLabel") { tok: Constituent =>
    val goldBioView = tok.getTextAnnotation.getView(ChunkerConstants.SHALLOW_PARSE_GOLD_BIO_VIEW)
    goldBioView.getConstituentsCovering(tok).head.getLabel
  }

  // Affixes feature
  private val affixFeatureExtractor = new Affixes(ViewNames.TOKENS)
  val affixes = property(tokens, "Affixes") { tok: Constituent =>
    val extracted = affixFeatureExtractor.getFeatures(tok)
    extracted.map(_.getName).toList
  }

  // WordTypeInformation feature
  private val wordTypeInformationExtractor = new WordTypeInformation(ViewNames.TOKENS)
  val wordTypeInformation = property(tokens, "WordTypeInformation") { tok: Constituent =>
    val extracted = wordTypeInformationExtractor.getFeatures(tok)
    extracted.map(_.getName).toList
  }

  // POS Window features
  private val posWindowExtractor = new POSWindow(ViewNames.POS)
  val posWindow = property(tokens, "POSWindow") { tok: Constituent =>
    val extracted = posWindowExtractor.getFeatures(tok)
    extracted.map(_.getName).toList
  }

  // Capitalization features over a context window of size 2
  private val capitalizationExtractor = new ContextFeatureExtractor(2, true, true,
    WordFeatureExtractorFactory.capitalization)
  val capitalizationWindowProperty = property(tokens, "Capitalization") { tok: Constituent =>
    val extracted = capitalizationExtractor.getFeatures(tok)
    extracted.map(_.getName).toList
  }

  /** Filter to restrict window to current sentence's tokens only. */
  val sameSentenceTokensFilter = Seq({ token: Constituent => tokens(token) ~> -sentenceToTokens })

  /** Chunk label to condition on for an earlier token:
    * the gold label while training and the classifier's prediction while testing.
    */
  private def labelOrPrediction(earlier: Constituent): String =
    if (ChunkerClassifiers.ChunkerClassifier.isTraining) chunkLabel(earlier)
    else ChunkerClassifiers.ChunkerClassifier(earlier)

  /** Chunk labels of the (up to) two preceding tokens of the same sentence. */
  val previousTags = property(tokens, "PreviousTags", cache = true) { tok: Constituent =>
    val preceding = tokens.getWithWindow(tok, -2, -1, sameSentenceTokensFilter).flatten
    preceding.map({ earlier: Constituent => labelOrPrediction(earlier) })
  }

  /** Surface forms in the [-2, +2] context window, limited to the same sentence. */
  val forms = property(tokens, "Forms") { tok: Constituent =>
    val neighbourhood = tokens.getWithWindow(tok, -2, +2, sameSentenceTokensFilter).flatten
    neighbourhood.map(_.getSurfaceForm)
  }

  /** Formpp feature: for every pass `j` and window position `i`, the (up to two) surface
    * forms starting at `i`, each prefixed with `i_j:` and joined with `_`.
    */
  val formpp = property(tokens, "Formpp") { tok: Constituent =>
    val span = 2
    val surface: List[String] = forms(tok)

    val combined = for {
      j <- 0 until span
      i <- surface.indices
    } yield {
      (0 until span)
        .filter(offset => i + offset < surface.length)
        .map(offset => s"${i}_${j}:${surface(i + offset)}")
        .mkString("_")
    }

    combined.toList
  }

  // Mixed Feature
  private val mixedBefore = 2
  private val mixedAfter = 2
  private val mixedK = 2

  /** Mixed POS-tag / surface-form n-grams over the [-mixedBefore, +mixedAfter] window.
    * The tag/form alternation flag is deliberately carried across window positions within
    * one (j, x) pass, exactly as in the original formulation.
    */
  val mixed = property(tokens, "Mixed") { tok: Constituent =>
    val neighbourhood =
      tokens.getWithWindow(tok, -mixedBefore, mixedAfter, sameSentenceTokensFilter).flatten

    val (posTags, surfaceForms) = neighbourhood
      .map({ near: Constituent => (CommonSensors.getPosTag(near), near.getSurfaceForm) })
      .unzip

    val collected = new mutable.ArrayBuffer[String]()

    for (j <- 1 to mixedK; x <- 0 to 2) {
      // Reset once per (j, x) pass; toggled after every emitted element.
      var pickTag = true

      for (i <- posTags.indices) {
        val pieces = new mutable.ArrayBuffer[String]()

        for (offset <- 0 until j if i + offset < posTags.size) {
          pieces.append(
            if (pickTag && x == 0) posTags(i + offset) else surfaceForms(i + offset))
          pickTag = !pickTag
        }

        collected.append(s"${i}_${j}:${pieces.mkString("_")}")
      }
    }

    collected.toList
  }

  /** SOPrevious feature: conjunctions of the previous chunk labels with POS tags.
    * `tags` holds the POS tags of the preceding tokens followed by the current token's tag.
    */
  val SOPrevious = property(tokens, "SOPrevious", cache = true) { tok: Constituent =>
    val preceding = tokens.getWithWindow(tok, -2, -1, sameSentenceTokensFilter).flatten

    val (precedingTags, labels) = preceding
      .map({ near: Constituent => (CommonSensors.getPosTag(near), labelOrPrediction(near)) })
      .unzip
    val tags = precedingTags :+ CommonSensors.getPosTag(tok)

    val labelPair =
      if (labels.size >= 2) Some(s"ll:${labels(0)}_${labels(1)}") else None
    val secondLabelWithTag =
      if (labels.size >= 2 && tags.size >= 3) Some(s"lt2:${labels(1)}_${tags(2)}") else None
    val firstLabelWithTag =
      if (tags.size >= 2) Some(s"lt1:${labels(0)}_${tags(1)}") else None

    // Emission order is ll, lt2, lt1.
    List(labelPair, secondLabelWithTag, firstLabelWithTag).flatten
  }
}
/** Data Reader for the CONLL format for training the Chunker */
object ChunkerDataReader extends Logging {
  import ChunkerConstants._

  /** Parses a CoNLL-style chunking file into one TextAnnotation per sentence.
    *
    * Every non-blank line is expected to hold `token POS chunk-label` separated by single
    * spaces; blank (or whitespace-only) lines separate sentences. Each returned
    * TextAnnotation carries a POS view, the gold BIO view and the gold span view.
    *
    * Compared with a naive line loop this reader
    *  - always closes the underlying file,
    *  - keeps the last sentence even when the file does not end with a blank line,
    *  - ignores repeated blank lines instead of building empty sentences.
    *
    * @param fileName path of the file to read
    * @return the parsed sentences, in file order
    * @throws IllegalArgumentException if a non-blank line has fewer than three columns
    */
  def parseData(fileName: String): Seq[TextAnnotation] = {
    logger.info(s"Parsing file - $fileName")

    val annotations = mutable.Buffer[TextAnnotation]()

    // Columns of the sentence currently being accumulated.
    val words = mutable.ArrayBuffer[String]()
    val posLabels = mutable.ArrayBuffer[String]()
    val chunkLabels = mutable.ArrayBuffer[String]()

    // Turns the accumulated columns into a TextAnnotation; a no-op when nothing is pending.
    def flushSentence(): Unit = {
      if (words.nonEmpty) {
        annotations.append(buildTextAnnotation(words, posLabels, chunkLabels))
        words.clear()
        posLabels.clear()
        chunkLabels.clear()
      }
    }

    val source = Source.fromFile(fileName)
    try {
      source.getLines().foreach({ line: String =>
        if (line.trim.isEmpty) {
          flushSentence()
        } else {
          line.split(" ") match {
            case Array(word, pos, chunk, _*) =>
              words.append(word)
              posLabels.append(pos)
              chunkLabels.append(chunk)
            case _ =>
              throw new IllegalArgumentException(
                s"Malformed line in $fileName (expected 'token POS chunk'): '$line'")
          }
        }
      })
    } finally {
      source.close()
    }

    // The last sentence is not followed by a blank line in every file.
    flushSentence()

    logger.info(s"Number of sentences : ${annotations.size}")

    annotations.toList
  }

  /** Builds a single-sentence TextAnnotation with POS, gold BIO and gold span views. */
  private def buildTextAnnotation(
    words: Seq[String],
    posLabels: Seq[String],
    chunkLabels: Seq[String]
  ): TextAnnotation = {
    val sentenceList = List(words.toArray[String])
    val textAnnotation = BasicTextAnnotationBuilder.createTextAnnotationFromTokens(sentenceList)

    val posView = new TokenLabelView(ViewNames.POS, textAnnotation)
    val chunkLabelView = new SpanLabelView(SHALLOW_PARSE_GOLD_BIO_VIEW, textAnnotation)

    textAnnotation.getView(ViewNames.TOKENS)
      .getConstituents
      .zipWithIndex
      .foreach({
        case (constituent: Constituent, idx: Int) =>
          posView.addConstituent(
            constituent.cloneForNewViewWithDestinationLabel(ViewNames.POS, posLabels(idx)))
          chunkLabelView.addConstituent(
            constituent.cloneForNewViewWithDestinationLabel(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabels(idx)))
      })

    textAnnotation.addView(ViewNames.POS, posView)
    textAnnotation.addView(SHALLOW_PARSE_GOLD_BIO_VIEW, chunkLabelView)

    ChunkerUtilities.addGoldSpanLabelView(textAnnotation, SHALLOW_PARSE_GOLD_BIO_VIEW, SHALLOW_PARSE_GOLD_SPAN_VIEW)

    textAnnotation
  }
}

// ---- ChunkerUtilities.scala ----

/** Conversions from per-token BIO label views to chunk span views. */
object ChunkerUtilities extends Logging {

  /** True when `label` is an inside tag (`I-X`) whose chunk type `X` equals `chunkType`.
    * An exact comparison is required: a suffix test would treat `I-ADVP` as a
    * continuation of a `VP` chunk.
    */
  private def continuesChunk(label: String, chunkType: String): Boolean =
    label.startsWith("I-") && label.substring(2) == chunkType

  /** Convert the gold BIO labelling to Span Label View
    * Note: Use this method only for the GOLD view as this does not perform error handling:
    * an inside tag that does not continue the open chunk closes that chunk and is dropped.
    *
    * @param ta TextAnnotation to read from and add the new view to
    * @param sourceBIOView name of the existing BIO view (must be present)
    * @param destView name of the span view to create (must not be present)
    */
  def addGoldSpanLabelView(ta: TextAnnotation, sourceBIOView: String, destView: String): Unit = {
    assert(ta.hasView(sourceBIOView))
    assert(!ta.hasView(destView))

    val destinationView = new SpanLabelView(destView, ta)

    // State of the chunk currently open; start == -1 means no chunk is open.
    var currentChunkStart = -1
    var currentChunkEnd = -1
    var cLabel = ""

    def closeCurrentChunk(): Unit = {
      destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d)
      currentChunkStart = -1
      currentChunkEnd = -1
      cLabel = ""
    }

    ta.getView(sourceBIOView).getConstituents.foreach({ constituent =>
      val label = constituent.getLabel
      // Captured before any update so the dangling-I check below refers to the state on entry.
      val inASpan = currentChunkStart != -1

      if (inASpan) {
        if (label.startsWith("O") || label.startsWith("B-")) {
          closeCurrentChunk()
        } else if (continuesChunk(label, cLabel)) {
          currentChunkEnd = constituent.getEndSpan
        } else {
          closeCurrentChunk()
          logger.info("Dangling I-label")
        }
      }

      if (label.startsWith("B-")) {
        currentChunkStart = constituent.getStartSpan
        currentChunkEnd = constituent.getEndSpan
        cLabel = label.substring(2)
      } else if (!inASpan && label.startsWith("I-")) {
        logger.info(s"Dangling I- label for constituent - $constituent")
      }
    })

    // Emit the chunk that is still open at the end of the view.
    if (currentChunkStart != -1 && currentChunkEnd != -1 && cLabel.nonEmpty) {
      destinationView.addSpanLabel(currentChunkStart, currentChunkEnd, cLabel, 1.0d)
    }

    ta.addView(destView, destinationView)
  }

  /** Convert BIO labelled annotation to SpanLabelView using some heuristics to handle error scenarios.
    *
    * Heuristic: an inside tag (`I-X`) that has no open chunk, or whose type differs from the
    * open chunk's type, is treated as a begin tag (`B-X`).
    *
    * @param ta TextAnnotation to read from and add the new view to
    * @param sourceBIOView name of the existing BIO view (must be present)
    * @param destView name of the span view to create (must not be present)
    */
  def addSpanLabelViewUsingHeuristics(ta: TextAnnotation, sourceBIOView: String, destView: String): Unit = {
    assert(ta.hasView(sourceBIOView))
    assert(!ta.hasView(destView))

    val destinationView = new SpanLabelView(destView, ta)

    var currentChunkStart = -1
    // Type of the open chunk; empty means no chunk is open.
    var cLabel = ""
    var previousConstituent: Option[Constituent] = None

    // Closes the open chunk at the end of the previously seen constituent.
    def closeCurrentChunk(): Unit = {
      previousConstituent.foreach({ previous =>
        destinationView.addSpanLabel(currentChunkStart, previous.getEndSpan, cLabel, 1.0d)
        cLabel = ""
      })
    }

    ta.getView(sourceBIOView).getConstituents.foreach({ constituent =>
      val predictedLabel = constituent.getLabel

      // Repair inside tags that cannot continue the open chunk by promoting them to begin tags.
      val currentLabel =
        if (predictedLabel.startsWith("I-") && !(cLabel.nonEmpty && continuesChunk(predictedLabel, cLabel))) {
          "B" + predictedLabel.substring(1)
        } else {
          predictedLabel
        }

      if (cLabel.nonEmpty && (currentLabel.startsWith("B-") || currentLabel.startsWith("O"))) {
        closeCurrentChunk()
      }

      if (currentLabel.startsWith("B-")) {
        currentChunkStart = constituent.getStartSpan
        cLabel = currentLabel.substring(2)
      }

      previousConstituent = Some(constituent)
    })

    // Emit the chunk that is still open at the end of the view.
    if (cLabel.nonEmpty) {
      closeCurrentChunk()
    }

    ta.addView(destView, destinationView)
  }
}
Roth}, + title = {The Use of Classifiers in Sequential Inference}, + booktitle = {NIPS}, + pages = {995--1001}, + year = {2001}, + publisher = {MIT Press}, + acceptance = {25/514 (4.8\%) Oral Presentations; 152/514 (29%) overall}, + url = "http://cogcomp.cs.illinois.edu/papers/nips01.pdf", + funding = {NSF98 CAREER}, + projects = {LnI,SI,IE,NE,NLP,CCM}, + comment = {Structured, sequential output; Sequence Prediction: HMM with classifiers, Conditional Models, Constraint Satisfaction}, +} +``` + +## Performance + + +The data for the experiments was extracted from the dataset for the [CONLL 2000 Chunking Shared Task](http://www.cnts.ua.ac.be/conll2000/chunking/). +The training corpus consists of 8,936 sentences composed of 210,996 tokens in total. +The test corpus consists of 2,012 sentences composed of 47,372 tokens in total. + +### Evaluation: BIO Labeling + +| Label | Precision | Recall | F1 | LCount | PCount | +|-----------|-----------:|-------:|-------:|-------:|-------:| +| B-ADJP | 80.323 | 68.192 | 73.762 | 437 | 371 | +| B-ADVP | 82.275 | 79.330 | 80.776 | 866 | 835 | +| B-CONJP | 40.000 | 66.667 | 50.000 | 9 | 15 | +| B-INTJ | 100.000 | 50.000 | 66.667 | 2 | 1 | +| B-LST | 0.000 | 0.000 | 0.000 | 5 | 3 | +| B-NP | 95.718 | 96.412 | 96.064 | 12404 | 12494 | +| B-PP | 96.456 | 97.359 | 96.905 | 4808 | 4853 | +| B-PRT | 79.048 | 78.302 | 78.673 | 106 | 105 | +| B-SBAR | 87.674 | 82.430 | 84.971 | 535 | 503 | +| B-UCP | 0.000 | 0.000 | 0.000 | 0 | 52 | +| B-VP | 94.581 | 95.292 | 94.935 | 4652 | 4687 | +| I-ADJP | 77.982 | 50.898 | 61.594 | 167 | 109 | +| I-ADVP | 60.274 | 49.438 | 54.321 | 89 | 73 | +| I-CONJP | 55.556 | 76.923 | 64.516 | 13 | 18 | +| I-LST | 0.000 | 0.000 | 0.000 | 2 | 0 | +| I-NP | 96.251 | 95.795 | 96.023 | 14365 | 14297 | +| I-PP | 86.111 | 64.583 | 73.810 | 48 | 36 | +| I-PRT | 0.000 | 0.000 | 0.000 | 0 | 1 | +| I-SBAR | 10.526 | 50.000 | 17.391 | 4 | 19 | +| I-UCP | 0.000 | 0.000 | 0.000 | 0 | 9 | +| I-VP | 94.935 | 93.712 | 94.319 | 2640 
| 2606 | +| O | 95.172 | 96.174 | 95.670 | 6169 | 6234 | +| **Accuracy** | **94.945** | **-** | **-** | **-** | **47321** | + +### Evaluation: Span Labeling + +| Label | Total Gold | Total Predicted | Correct Prediction | Precision | Recall | F1 | +| ----- | ---:| ---:| ---:| ---:| ---:| ---:| +| ADJP | 438 | 515 | 296 | 57.48 | 67.58 | 62.12 | +| ADVP | 866 | 1032 | 670 | 64.92 | 77.37 | 70.6 | +| CONJP | 9 | 19 | 6 | 31.58 | 66.67 | 42.86 | +| INTJ | 2 | 2 | 1 | 50 | 50 | 50 | +| LST | 5 | 7 | 0 | 0 | 0 | 0 | +| NP | 12422 | 13376 | 11574 | 86.53 | 93.17 | 89.73 | +| PP | 4811 | 4994 | 4684 | 93.79 | 97.36 | 95.54 | +| PRT | 106 | 138 | 86 | 62.32 | 81.13 | 70.49 | +| SBAR | 535 | 603 | 444 | 73.63 | 82.99 | 78.03 | +| UCP | 0 | 63 | 0 | 0 | 0 | 0 | +| VP | 4658 | 5014 | 4335 | 86.46 | 93.07 | 89.64 | +| **All** | **23852** | **25763** | **22096** | **85.77** | **92.64** | **89.07** | + +Note: During evaluation (testing), POS tags are provided by an implementation of [POSTagger in Saul](https://github.com/IllinoisCogComp/saul/blob/master/saul-examples/src/main/scala/edu/illinois/cs/cogcomp/saulexamples/nlp/POSTagger/README.md). + +## Testing the Chunker interactively + +For a quick demo of the Chunker, you can run the following command in the project's root folder. + +```shell +sbt "project saulExamples" "runMain edu.illinois.cs.cogcomp.saulexamples.nlp.Chunker.ChunkerApp" +``` + + +## Related + +If you are looking for an implementation of the Chunker in Java, have a look at [this repository](https://github.com/IllinoisCogComp/illinois-cogcomp-nlp/blob/master/chunker/README.md). \ No newline at end of file