Skip to content
This repository was archived by the owner on Jul 15, 2019. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Yara Parser
===================

© Copyright 2014-2015, Yahoo! Inc.
© Copyright 2014-2016, Yahoo! Inc.

© Licensed under the terms of the Apache License 2.0. See LICENSE file at the project root for terms.

Expand All @@ -15,14 +15,16 @@ __Please cite the following technical report if you use Yara in your research:__
* Mohammad Sadegh Rasooli and Joel Tetreault. [Yara Parser: A Fast and Accurate Dependency Parser](http://arxiv.org/abs/1503.06733). arXiv:1503.06733v2 [cs.CL] Mar 2015.

### Version Log
- v0.3 (21 Apr. 2016) Fix a bug in third-order feature computation.
- v0.2.1 (18 Mar. 2015) Some minor changes to packaging and punctuation defaults.
- v0.2 (10 Feb. 2015) Fixed some problems in search pruning and dependency features; added Brown cluster features and compressed model file saving.
- v0.1 (January 2015) First version of the parser, with features roughly the same as Zhang and Nivre (2011).

# WARNING
If you use the extended feature set or Brown cluster features, the parser currently supports only 64 unique dependency relations and 1M unique words in the training data. If your training data contains more than 64 unique relations, your results with the extended or Brown cluster features may not be accurate!

## Performance and Speed on WSJ/Penn Treebank
Performance and speed really depends on the quality of POS taggers and machine power and memory. I used [my own pos tagger v0.2](https://github.com/rasoolims/SemiSupervisedPosTagger/releases/tag/v0.2) and tagged the train file with 10-way jackknifing. I got POS accuracy of 97.14, 97.18 and 97.37 in the train, dev and test files respectively. I converted the data to dependencies with [Penn2Malt tool](http://stp.lingfil.uu.se/~nivre/research/Penn2Malt.html). The following tables are the results.
## Performance and Speed on WSJ/Penn Treebank (v0.2)
Performance and speed depend heavily on the quality of the POS tagger and on machine power and memory. I used [my own POS tagger v0.2](https://github.com/rasoolims/SemiSupervisedPosTagger/releases/tag/v0.2) and tagged the train file with 10-way jackknifing. I got POS accuracies of 97.14, 97.18 and 97.37 on the train, dev and test files respectively. I converted the data to dependencies with the [Penn2Malt tool](http://stp.lingfil.uu.se/~nivre/research/Penn2Malt.html). The following table shows the results.


|YaraParser.Parser| Dep. Rep. |beam| Features |Iter#| Dev UAS | Test UAS | Test LAS | sen/sec|
Expand Down
Binary file removed jar/YaraParser.jar
Binary file not shown.
34 changes: 31 additions & 3 deletions src/YaraParser/TransitionBasedSystem/Configuration/State.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ public class State implements Cloneable {
protected Pair<Integer, Integer>[] arcs;
protected int[] leftMostArcs;
protected int[] rightMostArcs;
protected int[] secondLeftMostArcs;
protected int[] secondRightMostArcs;
protected int[] leftValency;
protected int[] rightValency;
protected long[] rightDepLabels;
Expand All @@ -40,7 +42,9 @@ public State(int size) {
arcs = new Pair[size + 1];

leftMostArcs = new int[size + 1];
secondLeftMostArcs = new int[size + 1];
rightMostArcs = new int[size + 1];
secondRightMostArcs = new int[size + 1];
leftValency = new int[size + 1];
rightValency = new int[size + 1];
rightDepLabels = new long[size + 1];
Expand Down Expand Up @@ -86,14 +90,23 @@ public void addArc(int dependent, int head, int dependency) {
assert dependency<64;

if (dependent > head) { //right dep
if (rightMostArcs[head] == 0 || dependent > rightMostArcs[head])
if (rightMostArcs[head] == 0 )
rightMostArcs[head] = dependent;
else if(dependent > rightMostArcs[head]){
secondRightMostArcs[head] = rightMostArcs[head];
rightMostArcs[head] = dependent;
} else if(dependent > secondRightMostArcs[head])
secondRightMostArcs[head] = dependent;
rightValency[head] += 1;
rightDepLabels[head] = rightDepLabels[head] | value;

} else { //left dependency
if (leftMostArcs[head] == 0 || dependent < leftMostArcs[head])
if (leftMostArcs[head] == 0 )
leftMostArcs[head] = dependent;
else if(dependent < leftMostArcs[head]){
secondLeftMostArcs[head] = leftMostArcs[head];
leftMostArcs[head] = dependent;
} else if(dependent<secondLeftMostArcs[head])
secondLeftMostArcs[head] = dependent;
leftDepLabels[head] = leftDepLabels[head] | value;
leftValency[head] += 1;
}
Expand Down Expand Up @@ -159,10 +172,18 @@ public int rightMostModifier(int index) {
return (rightMostArcs[index] == 0 ? -1 : rightMostArcs[index]);
}

/**
 * Looks up the entry recorded for {@code index} in {@code secondRightMostArcs}.
 *
 * @param index position to look up in {@code secondRightMostArcs}
 * @return the stored value, or -1 when the stored value is 0
 */
public int secondRightMostModifier(int index) {
    final int modifier = secondRightMostArcs[index];
    // A stored 0 is reported to callers as -1.
    if (modifier == 0) {
        return -1;
    }
    return modifier;
}

/**
 * Looks up the entry recorded for {@code index} in {@code leftMostArcs}.
 *
 * @param index position to look up in {@code leftMostArcs}
 * @return the stored value, or -1 when the stored value is 0
 */
public int leftMostModifier(int index) {
    final int modifier = leftMostArcs[index];
    // A stored 0 is reported to callers as -1.
    if (modifier == 0) {
        return -1;
    }
    return modifier;
}

/**
 * Looks up the entry recorded for {@code index} in {@code secondLeftMostArcs}.
 *
 * @param index position to look up in {@code secondLeftMostArcs}
 * @return the stored value, or -1 when the stored value is 0
 */
public int secondLeftMostModifier(int index) {
    final int modifier = secondLeftMostArcs[index];
    // A stored 0 is reported to callers as -1.
    if (modifier == 0) {
        return -1;
    }
    return modifier;
}

/**
* @param head
* @return the current number of dependents
Expand Down Expand Up @@ -230,12 +251,19 @@ public State clone() {
state.rightValency[h] = rightValency[h];
state.rightDepLabels[h] = rightDepLabels[h];
}
if (secondRightMostArcs[h] != 0) {
state.secondRightMostArcs[h] = secondRightMostArcs[h];
}

if (leftMostArcs[h] != 0) {
state.leftMostArcs[h] = leftMostArcs[h];
state.leftValency[h] = leftValency[h];
state.leftDepLabels[h] = leftDepLabels[h];
}

if (secondLeftMostArcs[h] != 0) {
state.secondLeftMostArcs[h] = secondLeftMostArcs[h];
}
}
}
state.rootIndex = rootIndex;
Expand Down
Loading