diff --git a/README b/README
index 0a1bc02..a31b1cd 100644
--- a/README
+++ b/README
@@ -10,3 +10,29 @@ src - The source code.
LISENSE - The license of LitWay.
README - This file.
+
+
+How to run the program?
+
+1. Build path -> Add libraries -> ee, uima
+
+2. build.xml
+Run as -> Ant build... -> Refresh -> The entire workspace -> the output is "./dist/litway-0.8.jar" -> Build path -> Add to build path
+
+3. Train
+Package: info.chenli.litway.bionlp13.ge
+(1) TriggerRecogniser.java
+The argument is the absolute path of the training data.
+(2) BindingRecogniser.java
+The argument is the absolute path of the training data.
+(3) ArgumentRecogniser.java
+The argument is the absolute path of the training data.
+
+Note: the word2vec file is located at "./word2vec/word2vec100".
+
+4. Test
+Package: info.chenli.litway.bionlp13.ge
+Run EventExtractorBind2.java; the argument is the absolute path of the test data.
+
+5. The results are written to "./result/".
+
diff --git a/conf/config.xsd b/conf/config.xsd
new file mode 100644
index 0000000..6081039
--- /dev/null
+++ b/conf/config.xsd
@@ -0,0 +1,57 @@
+
+
+
+
+ * Filename : config.xsd
+ * Description: XML Schema for BioNLP
+ * Author(s) : Chen Li
+ * Revision : $Id: config.xsd 8275 2013-10-16 16:28:25Z $
+ * $HeadURL: https://github.com/li-chen/ee $
+ *
+ * Copyright 2013 Chen Li
+ *
+ * This software is licensed according to the terms described in the file
+ * named "LICENSE.txt" included with this distribution and available
+ * online at https://github.com/li-chen/ee/LICENSE.txt
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/conf/config_cg.xml b/conf/config_cg.xml
new file mode 100644
index 0000000..55d0d6b
--- /dev/null
+++ b/conf/config_cg.xml
@@ -0,0 +1,730 @@
+
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_entity
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+
+ Acetylation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Amino_acid_catabolism
+
+
+ Gene_or_gene_product
+ Simple_chemical
+
+
+
+
+ Binding
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+ Protein_domain_or_region
+ DNA_domain_or_region
+
+
+
+
+ Blood_vessel_development
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+
+
+
+ Breakdown
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Carcinogenesis
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Catabolism
+
+
+ Gene_or_gene_product
+ Simple_chemical
+
+
+
+
+ Cell_death
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Cell_differentiation
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Cell_division
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Cell_proliferation
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Cell_transformation
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ DNA_demethylation
+
+
+ Gene_or_gene_product
+
+
+
+
+ DNA_methylation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Death
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Dephosphorylation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Development
+
+
+ Gene_or_gene_product
+
+
+
+
+ Dissociation
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+
+
+ Gene_expression
+
+
+ Gene_or_gene_product
+
+
+
+
+ Glycolysis
+
+
+
+
+
+
+
+ Glycosylation
+
+
+
+
+
+
+
+ Growth
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Infection
+
+
+
+
+
+
+
+ Localization
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+
+
+ Metabolism
+
+
+ Gene_or_gene_product
+ Simple_chemical
+
+
+
+
+ Metastasis
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Mutation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Negative_regulation
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+ Acetylation
+ Amino_acid_catabolism
+ Binding
+ Blood_vessel_development
+ Breakdown
+ Carcinogenesis
+ Catabolism
+ Cell_death
+ Cell_differentiation
+ Cell_division
+ Cell_proliferation
+ Cell_transformation
+ DNA_demethylation
+ DNA_domain_or_region
+ DNA_methylation
+ Death
+ Dephosphorylation
+ Development
+ Dissociation
+ Gene_expression
+ Glycolysis
+ Glycosylation
+ Growth
+ Infection
+ Localization
+ Metabolism
+ Metastasis
+ Mutation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Planned_process
+ Positive_regulation
+ Protein_domain_or_region
+ Protein_processing
+ Regulation
+ Remodeling
+ Reproduction
+ Synthesis
+ Transcription
+ Translation
+ Ubiquitination
+
+
+
+
+ Pathway
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+
+
+ Phosphorylation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Planned_process
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+ Acetylation
+ Amino_acid_catabolism
+ Binding
+ Blood_vessel_development
+ Breakdown
+ Carcinogenesis
+ Catabolism
+ Cell_death
+ Cell_differentiation
+ Cell_division
+ Cell_proliferation
+ Cell_transformation
+ DNA_demethylation
+ DNA_domain_or_region
+ DNA_methylation
+ Death
+ Dephosphorylation
+ Development
+ Dissociation
+ Gene_expression
+ Glycolysis
+ Glycosylation
+ Growth
+ Infection
+ Localization
+ Metabolism
+ Metastasis
+ Mutation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Planned_process
+ Positive_regulation
+ Protein_domain_or_region
+ Protein_processing
+ Regulation
+ Remodeling
+ Reproduction
+ Synthesis
+ Transcription
+ Translation
+ Ubiquitination
+
+
+
+
+ Positive_regulation
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+ Acetylation
+ Amino_acid_catabolism
+ Binding
+ Blood_vessel_development
+ Breakdown
+ Carcinogenesis
+ Catabolism
+ Cell_death
+ Cell_differentiation
+ Cell_division
+ Cell_proliferation
+ Cell_transformation
+ DNA_demethylation
+ DNA_domain_or_region
+ DNA_methylation
+ Death
+ Dephosphorylation
+ Development
+ Dissociation
+ Gene_expression
+ Glycolysis
+ Glycosylation
+ Growth
+ Infection
+ Localization
+ Metabolism
+ Metastasis
+ Mutation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Planned_process
+ Positive_regulation
+ Protein_domain_or_region
+ Protein_processing
+ Regulation
+ Remodeling
+ Reproduction
+ Synthesis
+ Transcription
+ Translation
+ Ubiquitination
+
+
+
+
+ Protein_processing
+
+
+ Gene_or_gene_product
+
+
+
+
+ Regulation
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+ Acetylation
+ Amino_acid_catabolism
+ Binding
+ Blood_vessel_development
+ Breakdown
+ Carcinogenesis
+ Catabolism
+ Cell_death
+ Cell_differentiation
+ Cell_division
+ Cell_proliferation
+ Cell_transformation
+ DNA_demethylation
+ DNA_domain_or_region
+ DNA_methylation
+ Death
+ Dephosphorylation
+ Development
+ Dissociation
+ Gene_expression
+ Glycolysis
+ Glycosylation
+ Growth
+ Infection
+ Localization
+ Metabolism
+ Metastasis
+ Mutation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Planned_process
+ Positive_regulation
+ Protein_domain_or_region
+ Protein_processing
+ Regulation
+ Remodeling
+ Reproduction
+ Synthesis
+ Transcription
+ Translation
+ Ubiquitination
+
+
+
+
+ Remodeling
+
+
+ Tissue
+
+
+
+
+ Reproduction
+
+
+ Organism
+
+
+
+
+ Synthesis
+
+
+ Simple_chemical
+
+
+
+
+ Transcription
+
+
+ Gene_or_gene_product
+
+
+
+
+ Translation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Ubiquitination
+
+
+ Gene_or_gene_product
+
+
+
+
+
+
diff --git a/conf/config_ge.xml b/conf/config_ge.xml
new file mode 100644
index 0000000..02c35b3
--- /dev/null
+++ b/conf/config_ge.xml
@@ -0,0 +1,237 @@
+
+
+
+ Protein
+
+
+
+ Gene_expression
+
+
+
+
+
+ Transcription
+
+
+
+
+
+ Protein_catabolism
+
+
+
+
+
+ Localization
+
+
+
+
+
+ Binding
+
+
+
+
+
+ Protein_modification
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Phosphorylation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Ubiquitination
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Acetylation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Deacetylation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Regulation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Positive_regulation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Negative_regulation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/conf/config_pc.xml b/conf/config_pc.xml
new file mode 100644
index 0000000..a73725d
--- /dev/null
+++ b/conf/config_pc.xml
@@ -0,0 +1,543 @@
+
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+
+ Acetylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Activation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+
+
+ Binding
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Complex
+
+
+
+
+ Conversion
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+
+
+ Deacetylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Degradation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+
+
+ Demethylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Dephosphorylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Deubiquitination
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Dissociation
+
+
+ Complex
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+
+
+ Gene_expression
+
+
+ Gene_or_gene_product
+
+
+
+
+ Hydroxylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Inactivation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+
+
+ Localization
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Cellular_component
+ Cellular_component
+ Cellular_component
+
+
+
+
+ Methylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Negative_regulation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+
+
+ Pathway
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+
+
+ Phosphorylation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+ Positive_regulation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+
+
+ Regulation
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+ Acetylation
+ Activation
+ Binding
+ Conversion
+ Deacetylation
+ Degradation
+ Demethylation
+ Dephosphorylation
+ Deubiquitination
+ Dissociation
+ Gene_expression
+ Hydroxylation
+ Inactivation
+ Localization
+ Methylation
+ Negative_regulation
+ Pathway
+ Phosphorylation
+ Positive_regulation
+ Regulation
+ Transcription
+ Translation
+ Transport
+ Ubiquitination
+
+
+
+
+ Transcription
+
+
+ Gene_or_gene_product
+
+
+
+
+ Translation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Transport
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Cellular_component
+ Cellular_component
+
+
+
+
+ Ubiquitination
+
+
+ Simple_chemical
+ Gene_or_gene_product
+ Complex
+ Cellular_component
+
+
+ Simple_chemical
+
+
+
+
+
+
diff --git a/src/info/chenli/classifier/AbstractClassifier.java b/src/info/chenli/classifier/AbstractClassifier.java
index 004368d..0718933 100644
--- a/src/info/chenli/classifier/AbstractClassifier.java
+++ b/src/info/chenli/classifier/AbstractClassifier.java
@@ -31,7 +31,7 @@ public void train(List trainingInstances, int trainingRound) {
public int predict(Instance instance) {
- return predict(instance.getFeaturesNumeric());
+ return predict(instance.getFeaturesNumeric(), instance);
}
/**
@@ -40,7 +40,7 @@ public int predict(Instance instance) {
* @param featureVector
* @return The predicted label.
*/
- public abstract int predict(int[] featureSparseVector);
+ public abstract int predict(int[] featureSparseVector, Instance instance);
public abstract String modelToString();
diff --git a/src/info/chenli/classifier/Instance.java b/src/info/chenli/classifier/Instance.java
index f9e6b68..0172abd 100644
--- a/src/info/chenli/classifier/Instance.java
+++ b/src/info/chenli/classifier/Instance.java
@@ -5,12 +5,49 @@
public class Instance {
// instance ID
+ public boolean isReference;
+ public int tokenId;
+ public int sentenceId;
+ public String fileId;
private String id;
private int label;
private String labelString;
private List featuresString;
private int[] featuresNumeric = null;
+ private double[] featuresNumericWord2vec = null;
+
+ public boolean getIsReference() {
+ return this.isReference;
+ }
+
+ public void setIsReference(boolean isReference) {
+ this.isReference = isReference;
+ }
+
+ public int getTokenId() {
+ return this.tokenId;
+ }
+
+ public void setTokenId(int tokenId) {
+ this.tokenId = tokenId;
+ }
+
+ public int getSentenceId() {
+ return this.sentenceId;
+ }
+ public void setSentenceId(int sentenceId) {
+ this.sentenceId = sentenceId;
+ }
+
+ public String getFileId() {
+ return this.fileId;
+ }
+
+ public void setFileId(String fileId) {
+ this.fileId = fileId;
+ }
+
public String getId() {
return this.id;
}
@@ -34,6 +71,14 @@ public int[] getFeaturesNumeric() {
public void setFeaturesNumeric(int[] featuresNumeric) {
this.featuresNumeric = featuresNumeric;
}
+
+ public double[] getFeaturesNumericWord2vec() {
+ return featuresNumericWord2vec;
+ }
+
+ public void setFeaturesNumericWord2vec(double[] featuresNumericWord2vec) {
+ this.featuresNumericWord2vec = featuresNumericWord2vec;
+ }
public String getLabelString() {
return labelString;
diff --git a/src/info/chenli/classifier/LibLinearFacade.java b/src/info/chenli/classifier/LibLinearFacade.java
index 8b88645..c3a8fde 100644
--- a/src/info/chenli/classifier/LibLinearFacade.java
+++ b/src/info/chenli/classifier/LibLinearFacade.java
@@ -24,7 +24,7 @@ public class LibLinearFacade extends AbstractClassifier {
private final static Logger logger = Logger.getLogger(LibLinearFacade.class
.getName());
- private Model model;
+ public Model model;
public void train(List instances) {
@@ -40,6 +40,13 @@ public void train(List instances) {
}
}
}
+
+ double[] fs0 = instances.get(0).getFeaturesNumericWord2vec();
+ if (null != fs0) {
+ featureNum += fs0.length;
+ }
+
+ System.out.println("number of features:" + featureNum);
problem.n = featureNum; // number of features
problem.x = new Feature[instances.size()][]; // feature nodes
problem.y = new double[instances.size()]; // target values
@@ -50,9 +57,20 @@ public void train(List instances) {
int previousIndex = 0;
List featureNodes = new ArrayList();
+ double[] fs = instance.getFeaturesNumericWord2vec();
+ if (null != fs) {
+ for (int m=0; m previousIndex) {
- featureNodes.add(new FeatureNode(index, 1));
+ if (null != fs) {
+ featureNodes.add(new FeatureNode(fs.length + index, 1));
+ }else {
+ featureNodes.add(new FeatureNode(index, 1));
+ }
// System.out.print("\t" + (index ));
}
previousIndex = index;
@@ -82,7 +100,7 @@ public void train(List instances) {
}
@Override
- public int predict(int[] featureSparseVector) {
+ public int predict(int[] featureSparseVector, Instance instance) {
if (featureSparseVector == null) {
throw new IllegalArgumentException(
@@ -98,10 +116,21 @@ public int predict(int[] featureSparseVector) {
}
List featureNodes = new ArrayList();
+ double[] fs = instance.getFeaturesNumericWord2vec();
+ if (null != fs) {
+ for (int m=0; m previousIndex) {
- featureNodes.add(new FeatureNode(index, 1));
+ if (null != fs) {
+ featureNodes.add(new FeatureNode(fs.length + index, 1));
+ }else {
+ featureNodes.add(new FeatureNode(index, 1));
+ }
}
previousIndex = index;
}
@@ -109,9 +138,9 @@ public int predict(int[] featureSparseVector) {
Feature node = new FeatureNode(n, model.getBias());
featureNodes.add(node);
}
- Feature[] instance = new FeatureNode[featureNodes.size()];
- instance = featureNodes.toArray(instance);
- return (int) Math.round(Linear.predict(this.model, instance));
+ Feature[] instance0 = new FeatureNode[featureNodes.size()];
+ instance0 = featureNodes.toArray(instance0);
+ return (int) Math.round(Linear.predict(this.model, instance0));
}
diff --git a/src/info/chenli/litway/bionlp13/CellularComponent.java b/src/info/chenli/litway/bionlp13/CellularComponent.java
index 083bbfe..261278a 100644
--- a/src/info/chenli/litway/bionlp13/CellularComponent.java
+++ b/src/info/chenli/litway/bionlp13/CellularComponent.java
@@ -11,8 +11,8 @@
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
- * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
+ * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml
* @generated */
public class CellularComponent extends Annotation {
/** @generated
@@ -54,10 +54,13 @@ public CellularComponent(JCas jcas, int begin, int end) {
readObject();
}
- /**
+ /**
+ *
* Write your own initialization here
*
- @generated modifiable */
+ *
+ * @generated modifiable
+ */
private void readObject() {/*default - does nothing empty block */}
@@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */}
//* Feature: id
/** getter for id - gets
- * @generated */
+ * @generated
+ * @return value of the feature
+ */
public String getId() {
if (CellularComponent_Type.featOkTst && ((CellularComponent_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.CellularComponent");
return jcasType.ll_cas.ll_getStringValue(addr, ((CellularComponent_Type)jcasType).casFeatCode_id);}
/** setter for id - sets
- * @generated */
+ * @generated
+ * @param v value to set into the feature
+ */
public void setId(String v) {
if (CellularComponent_Type.featOkTst && ((CellularComponent_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.CellularComponent");
diff --git a/src/info/chenli/litway/bionlp13/CellularComponent_Type.java b/src/info/chenli/litway/bionlp13/CellularComponent_Type.java
index d405ae1..92d0239 100644
--- a/src/info/chenli/litway/bionlp13/CellularComponent_Type.java
+++ b/src/info/chenli/litway/bionlp13/CellularComponent_Type.java
@@ -14,7 +14,7 @@
import org.apache.uima.jcas.tcas.Annotation_Type;
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
* @generated */
public class CellularComponent_Type extends Annotation_Type {
/** @generated */
diff --git a/src/info/chenli/litway/bionlp13/Chemical.java b/src/info/chenli/litway/bionlp13/Chemical.java
index 4f40c8a..8bd227a 100644
--- a/src/info/chenli/litway/bionlp13/Chemical.java
+++ b/src/info/chenli/litway/bionlp13/Chemical.java
@@ -11,8 +11,8 @@
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
- * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
+ * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml
* @generated */
public class Chemical extends Annotation {
/** @generated
@@ -54,10 +54,13 @@ public Chemical(JCas jcas, int begin, int end) {
readObject();
}
- /**
+ /**
+ *
* Write your own initialization here
*
- @generated modifiable */
+ *
+ * @generated modifiable
+ */
private void readObject() {/*default - does nothing empty block */}
@@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */}
//* Feature: id
/** getter for id - gets
- * @generated */
+ * @generated
+ * @return value of the feature
+ */
public String getId() {
if (Chemical_Type.featOkTst && ((Chemical_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Chemical");
return jcasType.ll_cas.ll_getStringValue(addr, ((Chemical_Type)jcasType).casFeatCode_id);}
/** setter for id - sets
- * @generated */
+ * @generated
+ * @param v value to set into the feature
+ */
public void setId(String v) {
if (Chemical_Type.featOkTst && ((Chemical_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Chemical");
diff --git a/src/info/chenli/litway/bionlp13/Chemical_Type.java b/src/info/chenli/litway/bionlp13/Chemical_Type.java
index 4937af0..6af8c66 100644
--- a/src/info/chenli/litway/bionlp13/Chemical_Type.java
+++ b/src/info/chenli/litway/bionlp13/Chemical_Type.java
@@ -14,7 +14,7 @@
import org.apache.uima.jcas.tcas.Annotation_Type;
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
* @generated */
public class Chemical_Type extends Annotation_Type {
/** @generated */
diff --git a/src/info/chenli/litway/bionlp13/Complex.java b/src/info/chenli/litway/bionlp13/Complex.java
index 6230172..9d0ee60 100644
--- a/src/info/chenli/litway/bionlp13/Complex.java
+++ b/src/info/chenli/litway/bionlp13/Complex.java
@@ -11,8 +11,8 @@
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
- * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
+ * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml
* @generated */
public class Complex extends Annotation {
/** @generated
@@ -54,10 +54,13 @@ public Complex(JCas jcas, int begin, int end) {
readObject();
}
- /**
+ /**
+ *
* Write your own initialization here
*
- @generated modifiable */
+ *
+ * @generated modifiable
+ */
private void readObject() {/*default - does nothing empty block */}
@@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */}
//* Feature: id
/** getter for id - gets
- * @generated */
+ * @generated
+ * @return value of the feature
+ */
public String getId() {
if (Complex_Type.featOkTst && ((Complex_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Complex");
return jcasType.ll_cas.ll_getStringValue(addr, ((Complex_Type)jcasType).casFeatCode_id);}
/** setter for id - sets
- * @generated */
+ * @generated
+ * @param v value to set into the feature
+ */
public void setId(String v) {
if (Complex_Type.featOkTst && ((Complex_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Complex");
diff --git a/src/info/chenli/litway/bionlp13/Complex_Type.java b/src/info/chenli/litway/bionlp13/Complex_Type.java
index 92fc4d4..bc2863f 100644
--- a/src/info/chenli/litway/bionlp13/Complex_Type.java
+++ b/src/info/chenli/litway/bionlp13/Complex_Type.java
@@ -14,7 +14,7 @@
import org.apache.uima.jcas.tcas.Annotation_Type;
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
* @generated */
public class Complex_Type extends Annotation_Type {
/** @generated */
diff --git a/src/info/chenli/litway/bionlp13/EntityTypes.java b/src/info/chenli/litway/bionlp13/EntityTypes.java
index 6f87120..41c39bb 100644
--- a/src/info/chenli/litway/bionlp13/EntityTypes.java
+++ b/src/info/chenli/litway/bionlp13/EntityTypes.java
@@ -1,6 +1,10 @@
package info.chenli.litway.bionlp13;
+
+
public enum EntityTypes {
Cellular_component, Complex, Gene_or_gene_product, Protein, Simple_chemical;
}
+
+
diff --git a/src/info/chenli/litway/bionlp13/Gene.java b/src/info/chenli/litway/bionlp13/Gene.java
index d2870f7..73b1d67 100644
--- a/src/info/chenli/litway/bionlp13/Gene.java
+++ b/src/info/chenli/litway/bionlp13/Gene.java
@@ -11,8 +11,8 @@
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
- * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
+ * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml
* @generated */
public class Gene extends Annotation {
/** @generated
@@ -54,10 +54,13 @@ public Gene(JCas jcas, int begin, int end) {
readObject();
}
- /**
+ /**
+ *
* Write your own initialization here
*
- @generated modifiable */
+ *
+ * @generated modifiable
+ */
private void readObject() {/*default - does nothing empty block */}
@@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */}
//* Feature: id
/** getter for id - gets
- * @generated */
+ * @generated
+ * @return value of the feature
+ */
public String getId() {
if (Gene_Type.featOkTst && ((Gene_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Gene");
return jcasType.ll_cas.ll_getStringValue(addr, ((Gene_Type)jcasType).casFeatCode_id);}
/** setter for id - sets
- * @generated */
+ * @generated
+ * @param v value to set into the feature
+ */
public void setId(String v) {
if (Gene_Type.featOkTst && ((Gene_Type)jcasType).casFeat_id == null)
jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Gene");
diff --git a/src/info/chenli/litway/bionlp13/Gene_Type.java b/src/info/chenli/litway/bionlp13/Gene_Type.java
index 467b0cd..0b711a0 100644
--- a/src/info/chenli/litway/bionlp13/Gene_Type.java
+++ b/src/info/chenli/litway/bionlp13/Gene_Type.java
@@ -14,7 +14,7 @@
import org.apache.uima.jcas.tcas.Annotation_Type;
/**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
* @generated */
public class Gene_Type extends Annotation_Type {
/** @generated */
diff --git a/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java b/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java
index af0ca2e..70ddb11 100644
--- a/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java
+++ b/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java
@@ -12,6 +12,7 @@
import info.chenli.litway.util.DependencyExtractor;
import info.chenli.litway.util.FileFilterImpl;
import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.UimaUtil;
import info.chenli.litway.util.StanfordDependencyReader.Pair;
import java.io.File;
@@ -41,6 +42,8 @@
import org.apache.uima.util.XMLInputSource;
import org.uimafit.util.JCasUtil;
+import de.bwaldvogel.liblinear.FeatureNode;
+
public abstract class AbstractInstances {
private final static Logger logger = Logger
@@ -145,7 +148,7 @@ protected JCas processSingleFile(File aFile) {
String document = null;
try {
-
+
document = FileUtils.file2String(aFile);
} catch (IOException e) {
@@ -155,7 +158,6 @@ protected JCas processSingleFile(File aFile) {
}
document = document.trim();
-
try {
// create a CAS
CAS cas = ae.newCAS();
@@ -173,6 +175,10 @@ protected JCas processSingleFile(File aFile) {
FSIterator annoIter = null;
JCas jcas = null;
jcas = cas.getJCas();
+
+
+ //System.out.println(UimaUtil.getJCasFilePath(jcas));
+
for (int annotationType : annotationTypes) {
annoIter = jcas.getAnnotationIndex(annotationType).iterator();
structuredInstances.addAll(getStructuredInstances(jcas,
@@ -237,7 +243,10 @@ protected Token getTriggerToken(List tokens) {
new POSPrioritizer());
for (Token token : tokens) {
-
+ //System.out.println(token.getStem());
+ if (!POS.isPos(token.getPos())) {
+ continue;
+ }
sortedTokens.put(POS.valueOf(token.getPos()), token);
if (TriggerWord.isATriggerWord(token.getCoveredText()) != null) {
return token;
@@ -331,11 +340,22 @@ public void saveSvmLightInstances(File file) {
for (Instance instance : instances) {
sb.append(String.valueOf(instance.getLabel()));
+
+ double[] fs = instance.getFeaturesNumericWord2vec();
+ if (null != fs) {
+ for (int m=0; m previousIndex) {
- sb.append(" ".concat(String.valueOf(feature)).concat(":1"));
+ if (null != fs) {
+ sb.append(" ".concat(String.valueOf(fs.length + feature)).concat(":1"));
+ }else {
+ sb.append(" ".concat(String.valueOf(feature)).concat(":1"));
+ }
}
previousIndex = feature;
}
@@ -377,72 +397,367 @@ protected Instance causeToInstance(JCas jcas, Sentence sentence,
* @param themeToken
* @return
*/
- private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
- Annotation anno, Trigger trigger, Set pairsOfSentence,
- DependencyExtractor dependencyExtractor, boolean isTruepositive,
- Stage stage, Token themeToken) {
+ protected Instance argumentToInstance(JCas jcas, Sentence sentence,
+ Annotation annotation, Trigger trigger, Set pairsOfSentence,
+ DependencyExtractor dependencyExtractor, boolean isTheme,
+ boolean isCause, Stage stage) {
- if (!(anno instanceof Trigger) && !(anno instanceof Protein)) {
- throw new IllegalArgumentException(
- "The theme/cause has to be a protein or trigger.");
+ Instance instance = new Instance();
+ List featuresString = new ArrayList();
+ instance.setFeaturesString(featuresString);
+
+ // get trigger token
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ //System.out.println(annotation.getCoveredText());
+ Token annotationToken = getToken(jcas, annotation);
+ if (annotation instanceof Trigger) {
+ annotationToken = getTriggerToken(jcas, (Trigger)annotation);
}
+ // parser : dependency path between trigger-argument
+ //int dependencyPathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ // triggerToken, annotationToken);
+ //featuresString.add(new String[] { "dependencyPathLength_" + String.valueOf(dependencyPathLength) });
+ String featurePath = dependencyExtractor.getShortestPath(
+ triggerToken, annotationToken, stage);
+/* if ( isTruepositive && null == featurePath && !areSameTokens) {
+ int i = sentence.getId();
+ String s = triggerToken.getCoveredText();
+ String s2 = annoToken.getCoveredText();
+ return null;
+ }*/
+ boolean areSameTokens = (annotationToken.getId() == triggerToken.getId());
+ featurePath = areSameTokens ? "SAMETOKEN" : featurePath;
+ featurePath = (null == featurePath ? null : "featurePath_".concat(featurePath));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { featurePath });
+ //instance.setId(featurePath);
+ //instance.setFileId(trigger.getEventType());
+ /*if (areSameTokens && isTheme) {
+ System.out.println("theme" + "\t" + trigger.getEventType());
+ } else if (areSameTokens && !isTheme && !isCause) {
+ System.out.println("not" + "\t" + trigger.getEventType());
+ } else if (areSameTokens && isCause) {
+ System.out.println("isCause" + "\t" + trigger.getEventType());
+ } */
+ // parser refined?
- List annoTokens = JCasUtil
- .selectCovered(jcas, Token.class, anno);
+ // parser_simple: grouping of dpendency type;
+ // amod, nn --> nmod
+ // anything ending in subj --> subj
+ // anything ending in subjpass --> subjpass
+ /*String simplifiedFeaturePath = dependencyExtractor
+ .getSimplifiedShortestPath(triggerToken, annotationToken, stage);
+ simplifiedFeaturePath = areSameTokens ? "SAMETOKEN"
+ : simplifiedFeaturePath;
+ simplifiedFeaturePath = (null == simplifiedFeaturePath ? null
+ : "simplifiedFeaturePath_".concat(simplifiedFeaturePath));
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { simplifiedFeaturePath });*/
- // if protein/trigger is within a token
- if (annoTokens.size() == 0) {
- FSIterator iter = jcas.getAnnotationIndex(Token.type)
- .iterator();
- annoTokens = new ArrayList();
- while (iter.hasNext()) {
- Token token = (Token) iter.next();
- if (token.getBegin() <= anno.getBegin()
- && token.getEnd() >= anno.getEnd()) {
- annoTokens.add(token);
- break;
+ // trigger class
+ String triggerClassString;
+ if (EventType.isSimpleEvent(trigger.getEventType())) {
+ triggerClassString = "class_Simple";
+ } else if (EventType.isBindingEvent(trigger.getEventType())) {
+ triggerClassString = "class_Binding";
+ } else if (EventType.isRegulatoryEvent(trigger.getEventType())) {
+ triggerClassString = "class_Regulation";
+ } else {
+ triggerClassString = "class_Complex";
+ }
+ //featuresString.add(new String[] { triggerClassString });
+ /*featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_").concat(
+ featurePath) });*/
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString
+ //.concat("_").concat(featurePath)
+ });
+
+ // trigger token & trigger type
+ /*String triggerText = "text_".concat(trigger.getCoveredText()
+ .toLowerCase());
+ featuresString.add(new String[] { triggerText });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(featurePath) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ simplifiedFeaturePath) });*/
+
+ /*String eventType = "eventType_".concat(trigger.getEventType());
+ //featuresString.add(new String[] { eventType });
+ //featuresString.add(null == featurePath ? new String[0]
+ // : new String[] { eventType.concat("_").concat(featurePath) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { eventType
+ //.concat("_").concat(featurePath)
+ });
+*/
+ // trigger lemma (using the token's POS, which may be inaccurate)
+ String triggerLemma = "triggerLemma_".concat(triggerToken.getLemma());
+ triggerLemma = (null == triggerToken.getSubLemma() ? triggerLemma
+ : "triggerLemma_".concat(triggerToken.getSubLemma()
+ .toLowerCase()));
+ featuresString.add(new String[] { triggerLemma });
+ /*featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_")
+ .concat(featurePath) });*/
+
+ // trigger POS
+ /*String triggerPos = "triggerPos_".concat(triggerToken.getPos());
+ //featuresString.add(new String[] { triggerPos });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPos.concat("_").concat(featurePath) });*/
+ String triggerPosShort = "triggerShortPos_".concat(triggerToken
+ .getPos().substring(0, 1));
+ //featuresString.add(new String[] { triggerPosShort });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPosShort.concat("_")
+ .concat(featurePath) });
+
+ /*featuresString.add(new String[] { triggerLemma.concat("_").concat(
+ triggerPos) });
+ featuresString.add(new String[] { triggerLemma.concat("_").concat(
+ triggerPosShort) });*/
+
+ // argument type
+ String argClassString, argType, argLemma;
+ if (annotation instanceof Trigger) {
+ argType = ((Trigger)annotation).getEventType();
+ argLemma = "argLemma_".concat(annotationToken.getLemma());
+ argLemma = (null == annotationToken.getSubLemma() ? argLemma
+ : "argLemma_".concat(annotationToken.getSubLemma()
+ .toLowerCase()));
+ if (EventType.isSimpleEvent(((Trigger)annotation).getEventType())) {
+ argClassString = "arg_class_Simple";
+ } else if (EventType.isBindingEvent(((Trigger)annotation).getEventType())) {
+ argClassString = "arg_class_Binding";
+ } else if (EventType.isRegulatoryEvent(((Trigger)annotation).getEventType())) {
+ argClassString = "arg_class_Regulation";
+ } else {
+ argClassString = "arg_class_Complex";
+ }
+ } else {
+ argClassString = "arg_class_Protein";
+ argType = "arg_class_Protein";
+ argLemma = "arg_class_Protein";
+ }
+ //featuresString.add(new String[] { argClassString + triggerClassString});
+ //featuresString.add(new String[] { argClassString + triggerLemma});
+ //featuresString.add(new String[] { argType });
+ //featuresString.add(new String[] { argLemma });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_")
+ .concat(featurePath).concat("_").concat(argClassString) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_")
+ .concat(featurePath).concat("_").concat(argClassString) });
+ /*featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_")
+ .concat(simplifiedFeaturePath).concat("_").concat(argLemma) });*/
+/* String argText = "text_Protein";
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(featurePath).
+ concat("_").concat(argText) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ simplifiedFeaturePath).concat("_").concat(
+ argText) });
+
+ String argLemma = "argLemma_Protein";
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ featurePath).concat("_").concat(
+ argLemma) });
+
+ String argPos = "argPos_Protein";
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPos
+ .concat("_").concat(argPos) });
+
+ String argType = "argType_Protein";
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(featurePath)
+ .concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerSubLemma.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });*/
+
+ // text string from trigger to theme/cause: compensate when parsing
+ // fails
+ String textBetween = "", textAbsBetween = "", textShortBetween = "";
+
+ List tokensBetween = JCasUtil.selectCovered(jcas,
+ Token.class, sentence);
+ List proteinsBetween = JCasUtil.selectCovered(jcas,
+ Protein.class, sentence);
+ int start = Math.min(annotationToken.getBegin(), triggerToken.getBegin());
+ int end = Math.max(annotationToken.getEnd(), triggerToken.getEnd());
+ boolean reversed = (start != triggerToken.getBegin());
+
+ List tokensTextBetween = new ArrayList();
+ List tokensAbsTextBetween = new ArrayList();
+
+ tokensLoop: for (Token aToken : tokensBetween) {
+
+ if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) {
+ continue tokensLoop;
+ } else if (aToken.getEnd() >= end) {
+ break tokensLoop;
+ }
+
+ // if it is a protein
+ for (Protein aProtein : proteinsBetween) {
+ if (aToken.getBegin() == aProtein.getBegin()) {
+ tokensTextBetween.add("PROTEIN");
+ tokensAbsTextBetween.add("PROTEIN");
+ continue tokensLoop;
+ } else if (aToken.getBegin() > aProtein.getBegin()
+ && aToken.getEnd() <= aProtein.getEnd()) {
+ continue tokensLoop;
}
}
+ if (aToken.getBegin() == trigger.getBegin()) {
+ tokensAbsTextBetween.add(trigger.getEventType());
+ continue tokensLoop;
+ } else if (aToken.getBegin() > trigger.getBegin()
+ && aToken.getEnd() <= trigger.getEnd()) {
+ continue tokensLoop;
+ }
+
+ tokensTextBetween.add(aToken.getLemma().toLowerCase());
+ tokensAbsTextBetween.add(aToken.getLemma().toLowerCase());
+
}
- Token annoToken = null;
- if (anno instanceof Protein)
- // Take the last non-digital token if protein is
- // multi-token.
- {
- annoToken = annoTokens.get(annoTokens.size() - 1);
- // for (Token aToken : annoTokens) {
- //
- // try {
- // Double.parseDouble(aToken.getLemma());
- // break;
- // } catch (NumberFormatException e) {
- // token = aToken;
- // }
- //
- // }
- } else if (anno instanceof Trigger) {
- annoToken = getTriggerToken(jcas, (Trigger) anno);
+ /*int tokensTextBetweenLength = tokensTextBetween.size();
+ String[] tokensTextBetweenString = new String[tokensTextBetweenLength];
+ featuresString.add(new String[] { "tokensTextBetweenLength_" + String.valueOf(tokensTextBetweenLength) });
+ int j= 0;
+ for (String aText : tokensTextBetween) {
+ tokensTextBetweenString[j] = aText;
+ j++;
+ }
+ featuresString.add(tokensTextBetweenString);*/
+ for (String aText : tokensTextBetween) {
+ if (reversed) {
+ textBetween = aText.concat(textBetween.equals("") ? ""
+ : "_".concat(textBetween));
+ } else {
+ textBetween = textBetween.equals("") ? aText : textBetween
+ .concat("_").concat(aText);
+ }
+ }
+ for (String aText : tokensAbsTextBetween) {
+ if (reversed) {
+ textAbsBetween = aText
+ .concat(textAbsBetween.equals("") ? "" : "_"
+ .concat(textAbsBetween));
+ } else {
+ textAbsBetween = textAbsBetween.equals("") ? aText
+ : textAbsBetween.concat("_").concat(aText);
+ }
+ }
+ // concatenate text between trigger and theme/cause with the
+ // previous
+ // features.
+ textBetween = textBetween.equals("") ? null : "textString_".concat(
+ reversed ? "reversed_" : "").concat(textBetween);
+ textAbsBetween = textAbsBetween.equals("") ? null
+ : "textStringAbs_".concat(reversed ? "reversed_" : "")
+ .concat(textAbsBetween);
+ for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) {
+ if (reversed) {
+ textShortBetween = tokensAbsTextBetween.get(i).concat(
+ textShortBetween.equals("") ? "" : "_"
+ .concat(textShortBetween));
+ } else {
+ textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween
+ .get(i) : textShortBetween.concat("_").concat(
+ tokensAbsTextBetween.get(i));
+ }
+ }
+ textShortBetween = textShortBetween.equals("") ? null
+ : "textStringShort_".concat(reversed ? "reversed_" : "")
+ .concat(textShortBetween);
+ if (areSameTokens) {
+ textBetween = "SAMETOKEN";
+ textAbsBetween = "SAMETOKEN";
+ textShortBetween = "SAMETOKEN";
+ }
+
+ featuresString.add(null == textBetween ? new String[0]
+ : new String[] { textBetween });
+ featuresString.add(null == textBetween ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(textBetween) });
+ featuresString
+ .add(null != textBetween && null != featurePath ? new String[] { featurePath
+ .concat("_").concat(textBetween) } : new String[0]);
+
+ /**/featuresString.add(null == textAbsBetween ? new String[0]
+ : new String[] { textAbsBetween });
+ featuresString
+ .add(null == textAbsBetween ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ textAbsBetween) });
+ /*featuresString
+ .add(null != textAbsBetween && null != featurePath ? new String[] { featurePath
+ .concat("_").concat(textAbsBetween) } : new String[0]);*/
+
+ featuresString.add(null == textShortBetween ? new String[0]
+ : new String[] { textShortBetween });
+ /*featuresString.add(null == textShortBetween ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ textShortBetween) });*/
+ /*featuresString
+ .add(null != textShortBetween && null != featurePath ? new String[] { featurePath
+ .concat("_").concat(textShortBetween) } : new String[0]);*/
+
+ if (isTheme) {
+ instance.setLabelString("Theme");
+ } else if (isCause){
+ instance.setLabelString("Cause");
+ } else {
+ instance.setLabelString("Non_Argument");
}
+ return instance;
+ }
+
+ protected Instance triggerArgumentToInstance(JCas jcas, Sentence sentence,
+ Trigger arguTrigger, Trigger trigger, Set pairsOfSentence,
+ DependencyExtractor dependencyExtractor, boolean isTheme,
+ boolean isCause, Stage stage) {
+
Instance instance = new Instance();
List featuresString = new ArrayList();
instance.setFeaturesString(featuresString);
// get trigger token
Token triggerToken = getTriggerToken(jcas, trigger);
-
+ Token arguToken = getTriggerToken(jcas, arguTrigger);
// parser : dependency path between trigger-argument
String dependencyPath = dependencyExtractor.getShortestPath(
- triggerToken, annoToken, stage);
+ triggerToken, arguToken, stage);
String featurePath = dependencyPath;
if (null == dependencyPath) {
featurePath = dependencyExtractor.getReversedShortestPath(
- triggerToken, annoToken, stage);
+ triggerToken, arguToken, stage);
}
- boolean areSameTokens = (annoToken.getBegin() == triggerToken
- .getBegin() && annoToken.getEnd() == triggerToken.getEnd());
+
+ boolean areSameTokens = (arguToken.getBegin() == triggerToken
+ .getBegin() && arguToken.getEnd() == triggerToken.getEnd());
+
+/* if ( isTruepositive && null == featurePath && !areSameTokens) {
+ int i = sentence.getId();
+ String s = triggerToken.getCoveredText();
+ String s2 = annoToken.getCoveredText();
+ return null;
+ }*/
featurePath = areSameTokens ? "SAMETOKEN" : featurePath;
featurePath = (null == featurePath ? null : "dep_".concat(featurePath));
featuresString.add(null == featurePath ? new String[0]
@@ -457,10 +772,10 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
String simplifiedFeaturePath = null;
if (null != dependencyPath) {
simplifiedFeaturePath = dependencyExtractor
- .getSimplifiedShortestPath(triggerToken, annoToken, stage);
+ .getSimplifiedShortestPath(triggerToken, arguToken, stage);
} else {
simplifiedFeaturePath = dependencyExtractor
- .getSimplifiedReversedShortestPath(triggerToken, annoToken,
+ .getSimplifiedReversedShortestPath(triggerToken, arguToken,
stage);
}
simplifiedFeaturePath = areSameTokens ? "SAMETOKEN"
@@ -481,9 +796,11 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
} else {
triggerClassString = "class_Complex";
}
+
featuresString.add(null == featurePath ? new String[0]
: new String[] { triggerClassString.concat("_").concat(
featurePath) });
+
featuresString.add(null == simplifiedFeaturePath ? new String[0]
: new String[] { triggerClassString.concat("_").concat(
simplifiedFeaturePath) });
@@ -493,6 +810,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
.toLowerCase());
featuresString.add(null == featurePath ? new String[0]
: new String[] { triggerText.concat("_").concat(featurePath) });
+
String eventType = "eventType_".concat(trigger.getEventType());
featuresString.add(null == featurePath ? new String[0]
: new String[] { eventType.concat("_").concat(featurePath) });
@@ -500,6 +818,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
featuresString.add(null == simplifiedFeaturePath ? new String[0]
: new String[] { triggerText.concat("_").concat(
simplifiedFeaturePath) });
+
featuresString.add(null == simplifiedFeaturePath ? new String[0]
: new String[] { eventType.concat("_").concat(
simplifiedFeaturePath) });
@@ -512,6 +831,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
.add(null == featurePath ? new String[0]
: new String[] { triggerLemma.concat("_").concat(
featurePath) });
+
// trigger sublemma
String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma
@@ -525,6 +845,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
String triggerPos = "triggerPos_".concat(triggerToken.getPos());
featuresString.add(null == featurePath ? new String[0]
: new String[] { triggerPos.concat("_").concat(featurePath) });
+
String triggerPosShort = "triggerShortPos_".concat(triggerToken
.getPos().substring(0, 1));
featuresString.add(null == featurePath ? new String[0]
@@ -541,22 +862,60 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
triggerPosShort) });
// argument type
- String argType = null;
- if (anno instanceof Protein) {
- argType = "argType_Protein";
- } else if (anno instanceof Trigger) {
- argType = "argType_".concat(((Trigger) anno).getEventType());
- }
+ String argClassString;
+ if (EventType.isSimpleEvent(arguTrigger.getEventType())) {
+ argClassString = "class_Simple";
+ } else if (EventType.isBindingEvent(arguTrigger.getEventType())) {
+ argClassString = "class_Binding";
+ } else if (EventType.isRegulatoryEvent(arguTrigger.getEventType())) {
+ argClassString = "class_Regulation";
+ } else {
+ argClassString = "class_Complex";
+ }
featuresString.add(null == featurePath ? new String[0]
: new String[] { triggerLemma.concat("_").concat(featurePath)
- .concat("_").concat(argType) });
+ .concat("_").concat(argClassString) });
featuresString.add(null == featurePath ? new String[0]
: new String[] { triggerSubLemma.concat("_")
- .concat(featurePath).concat("_").concat(argType) });
+ .concat(featurePath).concat("_").concat(argClassString) });
+ String argType = "argType_".concat(arguTrigger.getEventType());
featuresString.add(null == featurePath ? new String[0]
: new String[] { triggerClassString.concat("_")
.concat(featurePath).concat("_").concat(argType) });
+/* String argText = "text_".concat(arguTrigger.getCoveredText()
+ .toLowerCase());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(featurePath).
+ concat("_").concat(argText) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ simplifiedFeaturePath).concat("_").concat(
+ argText) });
+
+ String argLemma = "argLemma_".concat(BioLemmatizerUtil
+ .lemmatizeWord(arguTrigger.getCoveredText(), arguToken.getPos())
+ .toLowerCase());
+ featuresString
+ .add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ featurePath).concat("_").concat(
+ argLemma) });
+
+ String argPos = "argPos_".concat(arguToken.getPos());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPos
+ .concat("_").concat(argPos) });
+ String argType = "argType_".concat(arguTrigger.getEventType());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(featurePath)
+ .concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerSubLemma.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });*/
// text string from trigger to theme/cause: compensate when parsing
// fails
String textBetween = "", textAbsBetween = "", textShortBetween = "";
@@ -566,8 +925,8 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
Token.class, sentence);
List proteinsBetween = JCasUtil.selectCovered(jcas,
Protein.class, sentence);
- int start = Math.min(annoToken.getBegin(), triggerToken.getBegin());
- int end = Math.max(annoToken.getEnd(), triggerToken.getEnd());
+ int start = Math.min(arguToken.getBegin(), triggerToken.getBegin());
+ int end = Math.max(arguToken.getEnd(), triggerToken.getEnd());
boolean reversed = (start != triggerToken.getBegin());
List tokensTextBetween = new ArrayList();
@@ -676,118 +1035,787 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
.add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath
.concat("_").concat(textShortBetween) } : new String[0]);
- if (stage.equals(Stage.CAUSE)) {
- String pathToTheme = null;
- if (null != themeToken) {
- pathToTheme = dependencyExtractor.getShortestPath(annoToken,
- themeToken, stage);
- if (null == pathToTheme) {
- pathToTheme = dependencyExtractor.getReversedShortestPath(
- annoToken, themeToken, stage);
- }
- }
- featuresString
- .add(null != pathToTheme && themeToken != null ? new String[] { pathToTheme }
- : new String[0]);
- }
-
- String label;
- switch (stage) {
- case THEME:
- label = "Theme";
- break;
- case CAUSE:
- label = "Cause";
- break;
- default:
- label = null;
- }
- if (isTruepositive) {
-
- instance.setLabelString(label);
+ if (isTheme) {
+ instance.setLabelString("Theme");
+ } else if (isCause){
+ instance.setLabelString("Cause");
} else {
- instance.setLabelString("Non_".concat(label.toLowerCase()));
+ instance.setLabelString("Non_Argument");
}
return instance;
}
- protected Instance bindingEventToInstance(JCas jcas, Sentence sentence,
- Event bindingEvent, List themes,
- DependencyExtractor dependencyExtractor) {
-
- boolean truepositive = true;
- if (null != bindingEvent.getThemes()
- && themes.size() == bindingEvent.getThemes().size()) {
- themeSearchingLoop: for (Protein protein : themes) {
- boolean foundTheProtein = false;
- for (int i = 0; i < bindingEvent.getThemes().size(); i++) {
- if (protein.getId().equals(bindingEvent.getThemes(i))) {
- foundTheProtein = true;
- break;
- }
- }
- if (foundTheProtein == false) {
- truepositive = false;
- break themeSearchingLoop;
- }
- }
- } else {
- truepositive = false;
- }
+ protected Instance triggerSpeicalArgumentToInstance(JCas jcas, Sentence sentence,
+ Trigger arguTrigger, Trigger trigger, Set pairsOfSentence,
+ DependencyExtractor dependencyExtractor, boolean isTheme,
+ boolean isCause, Stage stage) {
Instance instance = new Instance();
-
List featuresString = new ArrayList();
instance.setFeaturesString(featuresString);
- Trigger trigger = bindingEvent.getTrigger();
+ // get trigger token
Token triggerToken = getTriggerToken(jcas, trigger);
+ Token arguToken = getTriggerToken(jcas, arguTrigger);
+ // parser : dependency path between trigger-argument
+ String dependencyPath = dependencyExtractor.getShortestPath(
+ triggerToken, arguToken, stage);
+ String featurePath = dependencyPath;
- List themeTokens = new ArrayList();
- for (Protein aProtein : themes) {
- List annoTokens = JCasUtil.selectCovered(jcas, Token.class,
- aProtein);
-
- // if protein/trigger is within a token
- if (annoTokens.size() == 0) {
- FSIterator iter = jcas.getAnnotationIndex(
- Token.type).iterator();
- annoTokens = new ArrayList();
- while (iter.hasNext()) {
- Token token = (Token) iter.next();
- if (token.getBegin() <= aProtein.getBegin()
- && token.getEnd() >= aProtein.getEnd()) {
- annoTokens.add(token);
- break;
- }
- }
+ if (null == dependencyPath) {
+ featurePath = dependencyExtractor.getReversedShortestPath(
+ triggerToken, arguToken, stage);
+ }
+
+ boolean areSameTokens = (arguToken.getBegin() == triggerToken
+ .getBegin() && arguToken.getEnd() == triggerToken.getEnd());
+
+/* if ( isTruepositive && null == featurePath && !areSameTokens) {
+ int i = sentence.getId();
+ String s = triggerToken.getCoveredText();
+ String s2 = annoToken.getCoveredText();
+ return null;
+ }*/
+ featurePath = areSameTokens ? "SAMETOKEN" : featurePath;
+ featurePath = (null == featurePath ? null : "dep_".concat(featurePath));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { featurePath });
+
+ // parser refined?
+
+ // parser_simple: grouping of dependency type;
+ // amod, nn --> nmod
+ // anything ending in subj --> subj
+ // anything ending in subjpass --> subjpass
+ String simplifiedFeaturePath = null;
+ if (null != dependencyPath) {
+ simplifiedFeaturePath = dependencyExtractor
+ .getSimplifiedShortestPath(triggerToken, arguToken, stage);
+ } else {
+ simplifiedFeaturePath = dependencyExtractor
+ .getSimplifiedReversedShortestPath(triggerToken, arguToken,
+ stage);
+ }
+ simplifiedFeaturePath = areSameTokens ? "SAMETOKEN"
+ : simplifiedFeaturePath;
+ simplifiedFeaturePath = (null == simplifiedFeaturePath ? null
+ : "dep_simple_".concat(simplifiedFeaturePath));
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { simplifiedFeaturePath });
+
+ // trigger class
+ String triggerClassString;
+ if (EventType.isSimpleEvent(trigger.getEventType())) {
+ triggerClassString = "class_Simple";
+ } else if (EventType.isBindingEvent(trigger.getEventType())) {
+ triggerClassString = "class_Binding";
+ } else if (EventType.isRegulatoryEvent(trigger.getEventType())) {
+ triggerClassString = "class_Regulation";
+ } else {
+ triggerClassString = "class_Complex";
+ }
+
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_").concat(
+ featurePath) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerClassString.concat("_").concat(
+ simplifiedFeaturePath) });
+
+ // trigger token & trigger type
+ String triggerText = "text_".concat(trigger.getCoveredText()
+ .toLowerCase());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(featurePath) });
+
+ String eventType = "eventType_".concat(trigger.getEventType());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { eventType.concat("_").concat(featurePath) });
+
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ simplifiedFeaturePath) });
+
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { eventType.concat("_").concat(
+ simplifiedFeaturePath) });
+
+ // trigger lemma (using the token's POS, which may be inaccurate)
+ String triggerLemma = "triggerLemma_".concat(BioLemmatizerUtil
+ .lemmatizeWord(trigger.getCoveredText(), triggerToken.getPos())
+ .toLowerCase());
+ featuresString
+ .add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ featurePath) });
+
+ // trigger sublemma
+ String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma
+ : "triggerSubLemma_".concat(triggerToken.getSubLemma()
+ .toLowerCase()));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerSubLemma.concat("_")
+ .concat(featurePath) });
+
+ // trigger POS
+ String triggerPos = "triggerPos_".concat(triggerToken.getPos());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPos.concat("_").concat(featurePath) });
+ String triggerPosShort = "triggerShortPos_".concat(triggerToken
+ .getPos().substring(0, 1));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPosShort.concat("_")
+ .concat(featurePath) });
+
+ featuresString.add(new String[] { triggerLemma.concat("_").concat(
+ triggerPos) });
+ featuresString.add(new String[] { triggerLemma.concat("_").concat(
+ triggerPosShort) });
+ featuresString.add(new String[] { triggerSubLemma.concat("_").concat(
+ triggerPos) });
+ featuresString.add(new String[] { triggerSubLemma.concat("_").concat(
+ triggerPosShort) });
+
+ // argument type
+ String argClassString;
+ if (EventType.isSimpleEvent(arguTrigger.getEventType())) {
+ argClassString = "class_Simple";
+ } else if (EventType.isBindingEvent(arguTrigger.getEventType())) {
+ argClassString = "class_Binding";
+ } else if (EventType.isRegulatoryEvent(arguTrigger.getEventType())) {
+ argClassString = "class_Regulation";
+ } else {
+ argClassString = "class_Complex";
+ }
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_").concat(
+ featurePath).concat("_").concat(argClassString) });
+
+ String argText = "text_".concat(arguTrigger.getCoveredText()
+ .toLowerCase());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(featurePath).
+ concat("_").concat(argText) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ simplifiedFeaturePath).concat("_").concat(
+ argText) });
+
+ String argLemma = "argLemma_".concat(BioLemmatizerUtil
+ .lemmatizeWord(arguTrigger.getCoveredText(), arguToken.getPos())
+ .toLowerCase());
+ featuresString
+ .add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ featurePath).concat("_").concat(
+ argLemma) });
+
+ String argPos = "argPos_".concat(arguToken.getPos());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPos
+ .concat("_").concat(argPos) });
+
+ String argType = "argType_".concat(arguTrigger.getEventType());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(featurePath)
+ .concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerSubLemma.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });
+
+ // text string from trigger to theme/cause: compensate when parsing
+ // fails
+ String textBetween = "", textAbsBetween = "", textShortBetween = "";
+
+ if (!areSameTokens) {
+ List tokensBetween = JCasUtil.selectCovered(jcas,
+ Token.class, sentence);
+ List proteinsBetween = JCasUtil.selectCovered(jcas,
+ Protein.class, sentence);
+ int start = Math.min(arguToken.getBegin(), triggerToken.getBegin());
+ int end = Math.max(arguToken.getEnd(), triggerToken.getEnd());
+ boolean reversed = (start != triggerToken.getBegin());
+
+ List tokensTextBetween = new ArrayList();
+ List tokensAbsTextBetween = new ArrayList();
+
+ tokensLoop: for (Token aToken : tokensBetween) {
+
+ if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) {
+ continue tokensLoop;
+ } else if (aToken.getEnd() >= end) {
+ break tokensLoop;
+ }
+
+ // if it is a protein
+ for (Protein aProtein : proteinsBetween) {
+ if (aToken.getBegin() == aProtein.getBegin()) {
+ tokensTextBetween.add("PROTEIN");
+ tokensAbsTextBetween.add("PROTEIN");
+ continue tokensLoop;
+ } else if (aToken.getBegin() > aProtein.getBegin()
+ && aToken.getEnd() <= aProtein.getEnd()) {
+ continue tokensLoop;
+ }
+ }
+ if (aToken.getBegin() == trigger.getBegin()) {
+ tokensAbsTextBetween.add(trigger.getEventType());
+ continue tokensLoop;
+ } else if (aToken.getBegin() > trigger.getBegin()
+ && aToken.getEnd() <= trigger.getEnd()) {
+ continue tokensLoop;
+ }
+
+ tokensTextBetween.add(aToken.getLemma().toLowerCase());
+ tokensAbsTextBetween.add(aToken.getLemma().toLowerCase());
+
+ }
+
+ for (String aText : tokensTextBetween) {
+ if (reversed) {
+ textBetween = aText.concat(textBetween.equals("") ? ""
+ : "_".concat(textBetween));
+ } else {
+ textBetween = textBetween.equals("") ? aText : textBetween
+ .concat("_").concat(aText);
+ }
+ }
+ for (String aText : tokensAbsTextBetween) {
+ if (reversed) {
+ textAbsBetween = aText
+ .concat(textAbsBetween.equals("") ? "" : "_"
+ .concat(textAbsBetween));
+ } else {
+ textAbsBetween = textAbsBetween.equals("") ? aText
+ : textAbsBetween.concat("_").concat(aText);
+ }
+ }
+ // concatenate text between trigger and theme/cause with the
+ // previous
+ // features.
+ textBetween = textBetween.equals("") ? null : "textString_".concat(
+ reversed ? "reversed_" : "").concat(textBetween);
+ textAbsBetween = textAbsBetween.equals("") ? null
+ : "textStringAbs_".concat(reversed ? "reversed_" : "")
+ .concat(textAbsBetween);
+ for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) {
+ if (reversed) {
+ textShortBetween = tokensAbsTextBetween.get(i).concat(
+ textShortBetween.equals("") ? "" : "_"
+ .concat(textShortBetween));
+ } else {
+ textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween
+ .get(i) : textShortBetween.concat("_").concat(
+ tokensAbsTextBetween.get(i));
+ }
+ }
+ textShortBetween = textShortBetween.equals("") ? null
+ : "textStringShort_".concat(reversed ? "reversed_" : "")
+ .concat(textShortBetween);
+ } else {
+ textBetween = "SAMETOKEN";
+ textAbsBetween = "SAMETOKEN";
+ textShortBetween = "SAMETOKEN";
+ }
+
+ featuresString.add(null == textBetween ? new String[0]
+ : new String[] { textBetween });
+ featuresString.add(null == textBetween ? new String[0]
+ : new String[] { triggerText.concat("_").concat(textBetween) });
+ featuresString
+ .add(null != textBetween && null != dependencyPath ? new String[] { dependencyPath
+ .concat("_").concat(textBetween) } : new String[0]);
+
+ featuresString.add(null == textAbsBetween ? new String[0]
+ : new String[] { textAbsBetween });
+ featuresString
+ .add(null == textAbsBetween ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ textAbsBetween) });
+
+ featuresString.add(null == textShortBetween ? new String[0]
+ : new String[] { textShortBetween });
+ featuresString.add(null == textShortBetween ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ textShortBetween) });
+ featuresString
+ .add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath
+ .concat("_").concat(textShortBetween) } : new String[0]);
+
+
+ if (isTheme) {
+ instance.setLabelString("Theme");
+ } else if (isCause){
+ instance.setLabelString("Cause");
+ } else {
+ instance.setLabelString("Non_Argument");
+ }
+
+ return instance;
+ }
+
+ private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
+ Annotation anno, Trigger trigger, Set pairsOfSentence,
+ DependencyExtractor dependencyExtractor, boolean isTruepositive,
+ Stage stage, Token themeToken) {
+ if (!(anno instanceof Trigger) && !(anno instanceof Protein)) {
+ throw new IllegalArgumentException(
+ "The theme/cause has to be a protein or trigger.");
+ }
+
+ List annoTokens = JCasUtil
+ .selectCovered(jcas, Token.class, anno);
+
+ // if protein/trigger is within a token
+ if (annoTokens.size() == 0) {
+ List tokens = JCasUtil.selectCovered(jcas, Token.class,
+ sentence);
+ annoTokens = new ArrayList();
+ for (Token token : tokens) {
+ if (token.getBegin() <= anno.getBegin()
+ && token.getEnd() >= anno.getEnd()) {
+ annoTokens.add(token);
+ break;
+ }
+ }
+ }
+/* if (annoTokens.size() == 0) {
+ int i = anno.getBegin();
+ int j = anno.getEnd();
+ String s = anno.getCoveredText();
+ }*/
+ Token annoToken = null;
+ if (anno instanceof Protein)
+ // Take the last non-numeric token if protein is
+ // multi-token.
+ {
+ annoToken = annoTokens.get(annoTokens.size() - 1);
+ // for (Token aToken : annoTokens) {
+ //
+ // try {
+ // Double.parseDouble(aToken.getLemma());
+ // break;
+ // } catch (NumberFormatException e) {
+ // token = aToken;
+ // }
+ //
+ // }
+ } else if (anno instanceof Trigger) {
+ annoToken = getTriggerToken(jcas, (Trigger) anno);
+ }
+
+ Instance instance = new Instance();
+ List featuresString = new ArrayList();
+ instance.setFeaturesString(featuresString);
+
+ // get trigger token
+ Token triggerToken = getTriggerToken(jcas, trigger);
+
+ // parser : dependency path between trigger-argument
+ String dependencyPath = dependencyExtractor.getShortestPath(
+ triggerToken, annoToken, stage);
+ String featurePath = dependencyPath;
+
+ if (null == dependencyPath) {
+ featurePath = dependencyExtractor.getReversedShortestPath(
+ triggerToken, annoToken, stage);
+ }
+
+ boolean areSameTokens = (annoToken.getBegin() == triggerToken
+ .getBegin() && annoToken.getEnd() == triggerToken.getEnd());
+
+/* if ( isTruepositive && null == featurePath && !areSameTokens) {
+ int i = sentence.getId();
+ String s = triggerToken.getCoveredText();
+ String s2 = annoToken.getCoveredText();
+ return null;
+ }*/
+ featurePath = areSameTokens ? "SAMETOKEN" : featurePath;
+ featurePath = (null == featurePath ? null : "dep_".concat(featurePath));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { featurePath });
+
+ // parser refined?
+
+ // parser_simple: grouping of dependency type;
+ // amod, nn --> nmod
+ // anything ending in subj --> subj
+ // anything ending in subjpass --> subjpass
+ String simplifiedFeaturePath = null;
+ if (null != dependencyPath) {
+ simplifiedFeaturePath = dependencyExtractor
+ .getSimplifiedShortestPath(triggerToken, annoToken, stage);
+ } else {
+ simplifiedFeaturePath = dependencyExtractor
+ .getSimplifiedReversedShortestPath(triggerToken, annoToken,
+ stage);
+ }
+ simplifiedFeaturePath = areSameTokens ? "SAMETOKEN"
+ : simplifiedFeaturePath;
+ simplifiedFeaturePath = (null == simplifiedFeaturePath ? null
+ : "dep_simple_".concat(simplifiedFeaturePath));
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { simplifiedFeaturePath });
+
+ // trigger class
+ String triggerClassString;
+ if (EventType.isSimpleEvent(trigger.getEventType())) {
+ triggerClassString = "class_Simple";
+ } else if (EventType.isBindingEvent(trigger.getEventType())) {
+ triggerClassString = "class_Binding";
+ } else if (EventType.isRegulatoryEvent(trigger.getEventType())) {
+ triggerClassString = "class_Regulation";
+ } else {
+ triggerClassString = "class_Complex";
+ }
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_").concat(
+ featurePath) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerClassString.concat("_").concat(
+ simplifiedFeaturePath) });
+
+ // trigger token & trigger type
+ String triggerText = "text_".concat(trigger.getCoveredText()
+ .toLowerCase());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(featurePath) });
+ String eventType = "eventType_".concat(trigger.getEventType());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { eventType.concat("_").concat(featurePath) });
+
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ simplifiedFeaturePath) });
+ featuresString.add(null == simplifiedFeaturePath ? new String[0]
+ : new String[] { eventType.concat("_").concat(
+ simplifiedFeaturePath) });
+
+ // trigger lemma (using the token's POS, which may be inaccurate)
+ String triggerLemma = "triggerLemma_".concat(BioLemmatizerUtil
+ .lemmatizeWord(trigger.getCoveredText(), triggerToken.getPos())
+ .toLowerCase());
+ featuresString
+ .add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(
+ featurePath) });
+
+ // trigger sublemma
+ String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma
+ : "triggerSubLemma_".concat(triggerToken.getSubLemma()
+ .toLowerCase()));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerSubLemma.concat("_")
+ .concat(featurePath) });
+
+ // trigger POS
+ String triggerPos = "triggerPos_".concat(triggerToken.getPos());
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPos.concat("_").concat(featurePath) });
+ String triggerPosShort = "triggerShortPos_".concat(triggerToken
+ .getPos().substring(0, 1));
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerPosShort.concat("_")
+ .concat(featurePath) });
+
+ featuresString.add(new String[] { triggerLemma.concat("_").concat(
+ triggerPos) });
+ featuresString.add(new String[] { triggerLemma.concat("_").concat(
+ triggerPosShort) });
+ featuresString.add(new String[] { triggerSubLemma.concat("_").concat(
+ triggerPos) });
+ featuresString.add(new String[] { triggerSubLemma.concat("_").concat(
+ triggerPosShort) });
+
+ // argument type
+ String argType = null;
+ if (anno instanceof Protein) {
+ argType = "argType_Protein";
+ } else if (anno instanceof Trigger) {
+ argType = "argType_".concat(((Trigger) anno).getEventType());
+ }
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerLemma.concat("_").concat(featurePath)
+ .concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerSubLemma.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });
+ featuresString.add(null == featurePath ? new String[0]
+ : new String[] { triggerClassString.concat("_")
+ .concat(featurePath).concat("_").concat(argType) });
+
+ // text string from trigger to theme/cause: compensate when parsing
+ // fails
+ String textBetween = "", textAbsBetween = "", textShortBetween = "";
+
+ if (!areSameTokens) {
+ List tokensBetween = JCasUtil.selectCovered(jcas,
+ Token.class, sentence);
+ List proteinsBetween = JCasUtil.selectCovered(jcas,
+ Protein.class, sentence);
+ int start = Math.min(annoToken.getBegin(), triggerToken.getBegin());
+ int end = Math.max(annoToken.getEnd(), triggerToken.getEnd());
+ boolean reversed = (start != triggerToken.getBegin());
+
+ List tokensTextBetween = new ArrayList();
+ List tokensAbsTextBetween = new ArrayList();
+
+ tokensLoop: for (Token aToken : tokensBetween) {
+
+ if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) {
+ continue tokensLoop;
+ } else if (aToken.getEnd() >= end) {
+ break tokensLoop;
+ }
+
+ // if it is a protein
+ for (Protein aProtein : proteinsBetween) {
+ if (aToken.getBegin() == aProtein.getBegin()) {
+ tokensTextBetween.add("PROTEIN");
+ tokensAbsTextBetween.add("PROTEIN");
+ continue tokensLoop;
+ } else if (aToken.getBegin() > aProtein.getBegin()
+ && aToken.getEnd() <= aProtein.getEnd()) {
+ continue tokensLoop;
+ }
+ }
+ if (aToken.getBegin() == trigger.getBegin()) {
+ tokensAbsTextBetween.add(trigger.getEventType());
+ continue tokensLoop;
+ } else if (aToken.getBegin() > trigger.getBegin()
+ && aToken.getEnd() <= trigger.getEnd()) {
+ continue tokensLoop;
+ }
+
+ tokensTextBetween.add(aToken.getLemma().toLowerCase());
+ tokensAbsTextBetween.add(aToken.getLemma().toLowerCase());
+
+ }
+
+ for (String aText : tokensTextBetween) {
+ if (reversed) {
+ textBetween = aText.concat(textBetween.equals("") ? ""
+ : "_".concat(textBetween));
+ } else {
+ textBetween = textBetween.equals("") ? aText : textBetween
+ .concat("_").concat(aText);
+ }
+ }
+ for (String aText : tokensAbsTextBetween) {
+ if (reversed) {
+ textAbsBetween = aText
+ .concat(textAbsBetween.equals("") ? "" : "_"
+ .concat(textAbsBetween));
+ } else {
+ textAbsBetween = textAbsBetween.equals("") ? aText
+ : textAbsBetween.concat("_").concat(aText);
+ }
+ }
+ // concatenate text between trigger and theme/cause with the
+ // previous
+ // features.
+ textBetween = textBetween.equals("") ? null : "textString_".concat(
+ reversed ? "reversed_" : "").concat(textBetween);
+ textAbsBetween = textAbsBetween.equals("") ? null
+ : "textStringAbs_".concat(reversed ? "reversed_" : "")
+ .concat(textAbsBetween);
+ for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) {
+ if (reversed) {
+ textShortBetween = tokensAbsTextBetween.get(i).concat(
+ textShortBetween.equals("") ? "" : "_"
+ .concat(textShortBetween));
+ } else {
+ textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween
+ .get(i) : textShortBetween.concat("_").concat(
+ tokensAbsTextBetween.get(i));
+ }
}
+ textShortBetween = textShortBetween.equals("") ? null
+ : "textStringShort_".concat(reversed ? "reversed_" : "")
+ .concat(textShortBetween);
+ } else {
+ textBetween = "SAMETOKEN";
+ textAbsBetween = "SAMETOKEN";
+ textShortBetween = "SAMETOKEN";
+ }
- Token token = null;
- token = annoTokens.get(0);
- for (Token aToken : annoTokens) {
+ featuresString.add(null == textBetween ? new String[0]
+ : new String[] { textBetween });
+ featuresString.add(null == textBetween ? new String[0]
+ : new String[] { triggerText.concat("_").concat(textBetween) });
+ featuresString
+ .add(null != textBetween && null != dependencyPath ? new String[] { dependencyPath
+ .concat("_").concat(textBetween) } : new String[0]);
- try {
- Double.parseDouble(aToken.getLemma());
- break;
- } catch (NumberFormatException e) {
- token = aToken;
+ featuresString.add(null == textAbsBetween ? new String[0]
+ : new String[] { textAbsBetween });
+ featuresString
+ .add(null == textAbsBetween ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ textAbsBetween) });
+
+ featuresString.add(null == textShortBetween ? new String[0]
+ : new String[] { textShortBetween });
+ featuresString.add(null == textShortBetween ? new String[0]
+ : new String[] { triggerText.concat("_").concat(
+ textShortBetween) });
+ featuresString
+ .add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath
+ .concat("_").concat(textShortBetween) } : new String[0]);
+
+ if (stage.equals(Stage.CAUSE)) {
+ String pathToTheme = null;
+ if (null != themeToken) {
+ pathToTheme = dependencyExtractor.getShortestPath(annoToken,
+ themeToken, stage);
+ if (null == pathToTheme) {
+ pathToTheme = dependencyExtractor.getReversedShortestPath(
+ annoToken, themeToken, stage);
}
}
- themeTokens.add(token);
+ featuresString
+ .add(null != pathToTheme && themeToken != null ? new String[] { pathToTheme }
+ : new String[0]);
+ }
+
+ String label;
+ switch (stage) {
+ case THEME:
+ label = "Theme";
+ break;
+ case CAUSE:
+ label = "Cause";
+ break;
+ default:
+ label = null;
+ }
+ if (isTruepositive) {
+
+ instance.setLabelString(label);
+
+ } else {
+ instance.setLabelString("Non_".concat(label.toLowerCase()));
+ }
+
+ return instance;
+ }
+
+ protected Instance bindingEventToInstance(JCas jcas, Sentence sentence,
+ Trigger trigger, List themes,
+ DependencyExtractor dependencyExtractor, boolean truepositive) {
+
+ Instance instance = new Instance();
+ List featuresString = new ArrayList();
+ instance.setFeaturesString(featuresString);
+
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ List themeTokens = new ArrayList();
+ for (Protein aProtein : themes) {
+ themeTokens.add(getToken(jcas, aProtein));
}
+
+ List tokensBetween = JCasUtil.selectCovered(jcas,
+ Token.class, sentence);
+ List proteinsBetween = JCasUtil.selectCovered(jcas,
+ Protein.class, sentence);
+
+ String[] themePaths = new String[2];
+ if (themeTokens.size() == 1) {
+ themePaths[0] = "themeSize=1";
+ themePaths[1] = null;
+ //themePaths[2] = null;
+ //themePaths[3] = null;
+ }else if (themeTokens.size() == 2) {
+ String themePath0 = dependencyExtractor.getShortestPath(
+ themeTokens.get(0), themeTokens.get(1), Stage.BINDING);
+ String themePath1 = dependencyExtractor.getShortestPath(
+ themeTokens.get(1), themeTokens.get(0), Stage.BINDING);
+ //String themePath2 = dependencyExtractor
+ // .getSimplifiedShortestPath(themeTokens.get(0), themeTokens.get(1), Stage.BINDING);
+ //String themePath3 = dependencyExtractor
+ // .getSimplifiedShortestPath(themeTokens.get(1), themeTokens.get(0), Stage.BINDING);
+ themePaths[0] = null == themePath0 ? null : "themePath_" + themePath0;
+ themePaths[1] = null == themePath1 ? null : "themePath_" + themePath1;
+ //themePaths[0] = null == themePath2 ? null : "themeSimplifiedPath_" + themePath2;
+ //themePaths[1] = null == themePath3 ? null : "themeSimplifiedPath_" + themePath3;
+ //int dependencyPathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ // themeTokens.get(1), themeTokens.get(0));
+ //themePaths[2] = "dependencyPathLength_" + String.valueOf(dependencyPathLength) ;
+ }
+ featuresString.add(themePaths);
+
+/* String[] themeTextBetween = new String[2];
+ if (themeTokens.size() == 1) {
+ themeTextBetween[0] = "themeSize=1";
+ themeTextBetween[1] = null;
+ //themeTextBetween[2] = null;
+ }else if (themeTokens.size() == 2) {
+ int start = Math.min(themeTokens.get(0).getBegin(),
+ themeTokens.get(1).getBegin());
+ int end = Math.max(themeTokens.get(0).getEnd(), themeTokens.get(1).getEnd());
+
+ List tokensTextBetween = new ArrayList();
+ List tokensAbsTextBetween = new ArrayList();
+
+ tokensLoop: for (Token aToken : tokensBetween) {
+
+ if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) {
+ continue tokensLoop;
+ } else if (aToken.getEnd() >= end) {
+ break tokensLoop;
+ }
+
+ // if it is a protein
+ for (Protein aProtein : proteinsBetween) {
+ if (aToken.getBegin() == aProtein.getBegin()) {
+ tokensTextBetween.add("PROTEIN");
+ tokensAbsTextBetween.add("PROTEIN");
+ continue tokensLoop;
+ } else if (aToken.getBegin() > aProtein.getBegin()
+ && aToken.getEnd() <= aProtein.getEnd()) {
+ continue tokensLoop;
+ }
+ }
+ if (aToken.getBegin() == trigger.getBegin()) {
+ tokensAbsTextBetween.add(trigger.getEventType());
+ continue tokensLoop;
+ } else if (aToken.getBegin() > trigger.getBegin()
+ && aToken.getEnd() <= trigger.getEnd()) {
+ continue tokensLoop;
+ }
+
+ tokensTextBetween.add(aToken.getLemma().toLowerCase());
+ tokensAbsTextBetween.add(aToken.getLemma().toLowerCase());
- if (themeTokens.size() == 0) {
- throw new RuntimeException("Theme number is zero. Please check.");
+ }
+
+ String textBetween = "", textAbsBetween = "";
+ for (String aText : tokensTextBetween) {
+ textBetween = textBetween.equals("") ? aText : textBetween
+ .concat("_").concat(aText);
+ }
+ for (String aText : tokensAbsTextBetween) {
+ textAbsBetween = textAbsBetween.equals("") ? aText
+ : textAbsBetween.concat("_").concat(aText);
+ }
+ themeTextBetween[0] = null == textBetween ? null : "themeTextBetween_" + textBetween;
+ themeTextBetween[1] = null == textAbsBetween ? null : "themeTextAbsBetween_" + textAbsBetween;
+ int tokensTextBetweenLength = tokensTextBetween.size();
+ //themeTextBetween[2] = "tokensTextBetweenLength_" + String.valueOf(tokensTextBetweenLength);
}
+ featuresString.add(themeTextBetween);*/
+
String triggerText = "text_".concat(triggerToken.getCoveredText()
.toLowerCase());
String triggerLemma = "triggerLemma_".concat(triggerToken.getLemma()
.toLowerCase());
- String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerToken
- .getLemma() : "triggerSubLemma_".concat(triggerToken
+ triggerLemma = (null == triggerToken.getSubLemma() ? triggerToken
+ .getLemma() : "triggerLemma_".concat(triggerToken
.getSubLemma().toLowerCase()));
String triggerPos = "triggerPos_".concat(triggerToken.getPos());
String triggerPosShort = "triggerShortPos_".concat(triggerToken
@@ -796,62 +1824,71 @@ protected Instance bindingEventToInstance(JCas jcas, Sentence sentence,
// parser : dependency path between trigger-argument
int i = 0;
String[] dependencyPaths = new String[themeTokens.size()];
+ //String[] pathLength = new String[themeTokens.size()];
String[] simplifiedFeaturePaths = new String[themeTokens.size()];
String[] triggerTextPaths = new String[themeTokens.size()];
- String[] triggerTextSimplifiedPaths = new String[themeTokens.size()];
- String[] triggerLemmaPaths = new String[themeTokens.size()];
- String[] triggerSubLemmaPaths = new String[themeTokens.size()];
+ String[] triggerLemmaPaths = new String[themeTokens.size()];
+ String[] triggerLemmaSimplifiedPaths = new String[themeTokens.size()];
String[] triggerPosPaths = new String[themeTokens.size()];
String[] triggerPosShortPaths = new String[themeTokens.size()];
String[] textBetweens = new String[themeTokens.size()];
- String[] triggerTextBetweens = new String[themeTokens.size()];
+ //String[] textBetweenLength = new String[themeTokens.size()];
+ //String[] triggerLemmaBetweens = new String[themeTokens.size()];
String[] textBetweenDependencies = new String[themeTokens.size()];
String[] textAbsBetweenDependencies = new String[themeTokens.size()];
- String[] textShortBetweens = new String[themeTokens.size()];
- String[] textShortBetweenDependencyPaths = new String[themeTokens
- .size()];
+ //String[] textShortBetweens = new String[themeTokens.size()];
+ //String[] textShortBetweenDependencyPaths = new String[themeTokens.size()];
for (Token aThemeToken : themeTokens) {
- String dependencyPath = dependencyExtractor.getShortestPath(
+ /*int triggerPathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ triggerToken, aThemeToken);
+ pathLength[i] = "triggerPathLength_" + String.valueOf(triggerPathLength);
+ if (i==1 && pathLength[1].equals(pathLength[0])) {
+ pathLength[i] = pathLength[i] + "twice";
+ }*/
+ String featurePath = dependencyExtractor.getShortestPath(
triggerToken, aThemeToken, Stage.BINDING);
- String featurePath = dependencyPath;
-
- if (null == dependencyPath) {
- featurePath = dependencyExtractor.getReversedShortestPath(
- triggerToken, aThemeToken, Stage.BINDING);
+ boolean areSameTokens = (aThemeToken.getId() == triggerToken.getId());
+ featurePath = areSameTokens ? "SAMETOKEN" : featurePath;
+ featurePath = (null == featurePath ? null : "featurePath_".concat(featurePath));
+ if (null != featurePath) {
+ for (int m=0; m nmod
// anything ending in subj --> subj
// anything ending in subjpass --> subjpass
- if (null != dependencyPath) {
- simplifiedFeaturePath = dependencyExtractor
- .getSimplifiedShortestPath(triggerToken, aThemeToken, Stage.BINDING);
- } else {
- simplifiedFeaturePath = dependencyExtractor
- .getSimplifiedReversedShortestPath(triggerToken,
- aThemeToken, Stage.BINDING);
- }
+
+ simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" : simplifiedFeaturePath;
simplifiedFeaturePath = (null == simplifiedFeaturePath ? null
- : "dep_simple_".concat(simplifiedFeaturePath));
+ : "simplifiedFeaturePath_".concat(simplifiedFeaturePath));
+ if (null != simplifiedFeaturePath) {
+ for (int m=0; m tokensBetween = JCasUtil.selectCovered(jcas,
- Token.class, sentence);
- List proteinsBetween = JCasUtil.selectCovered(jcas,
- Protein.class, sentence);
int start = Math.min(aThemeToken.getBegin(),
triggerToken.getBegin());
int end = Math.max(aThemeToken.getEnd(), triggerToken.getEnd());
@@ -924,19 +1957,38 @@ protected Instance bindingEventToInstance(JCas jcas, Sentence sentence,
: textAbsBetween.concat("_").concat(aText);
}
}
-
+ if (null != textBetween) {
+ for (int m=0; m iter = jcas.getAnnotationIndex(
+ Token.type).iterator();
+ proteinTokens = new ArrayList();
+ while (iter.hasNext()) {
+ Token token = (Token) iter.next();
+ if (token.getBegin() < protein.getBegin()
+ && token.getEnd() > protein.getBegin()) {
+ proteinTokens.add(token);
+ break;
+ }
+ }
+ }
if (proteinTokens.size() == 0) {
logger.warning("No token found for protein.");
return null;
}
- return proteinTokens.get(proteinTokens.size() - 1);
+ Token token = proteinTokens.get(0);
+ for (Token aToken : proteinTokens) {
+
+ try {
+ Double.parseDouble(aToken.getLemma());
+ break;
+ } catch (NumberFormatException e) {
+ token = aToken;
+ }
+ }
+ return token;
+ }
+
+ protected Token getToken(JCas jcas, Annotation annotation) {
+
+ List<Token> tokens = JCasUtil.selectCovered(jcas, Token.class,
+ annotation);
+ // if protein/trigger is within a token
+ if (tokens.size() == 0) {
+ FSIterator<Annotation> iter = jcas.getAnnotationIndex(
+ Token.type).iterator();
+ tokens = new ArrayList<Token>();
+ while (iter.hasNext()) {
+ Token token = (Token) iter.next();
+ if (token.getBegin() <= annotation.getBegin()
+ && token.getEnd() >= annotation.getEnd()) {
+ tokens.add(token);
+ break;
+ }
+ }
+ }
+ if (tokens.size() == 0) {
+ FSIterator<Annotation> iter = jcas.getAnnotationIndex(
+ Token.type).iterator();
+ tokens = new ArrayList<Token>();
+ while (iter.hasNext()) {
+ Token token = (Token) iter.next();
+ if (token.getBegin() < annotation.getBegin()
+ && token.getEnd() > annotation.getBegin()) {
+ tokens.add(token);
+ break;
+ }
+ }
+ }
+ if (tokens.size() == 0) {
+ logger.warning("No token found for annotation.");
+ return null;
+ }
+
+ Token token = tokens.get(0);
+ for (Token aToken : tokens) {
+
+ try {
+ Double.parseDouble(aToken.getLemma());
+ break;
+ } catch (NumberFormatException e) {
+ token = aToken;
+ }
+ }
+ return token;
}
}
diff --git a/src/info/chenli/litway/bionlp13/ge/ArgumentInstances.java b/src/info/chenli/litway/bionlp13/ge/ArgumentInstances.java
new file mode 100644
index 0000000..f2f3bd9
--- /dev/null
+++ b/src/info/chenli/litway/bionlp13/ge/ArgumentInstances.java
@@ -0,0 +1,280 @@
+package info.chenli.litway.bionlp13.ge;
+
+import info.chenli.classifier.Instance;
+import info.chenli.classifier.InstanceDictionary;
+import info.chenli.litway.corpora.Event;
+import info.chenli.litway.corpora.POS;
+import info.chenli.litway.corpora.Protein;
+import info.chenli.litway.corpora.Sentence;
+import info.chenli.litway.corpora.Token;
+import info.chenli.litway.corpora.Trigger;
+import info.chenli.litway.searn.StructuredInstance;
+import info.chenli.litway.util.DependencyExtractor;
+import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.StanfordDependencyReader;
+import info.chenli.litway.util.StanfordDependencyReader.Pair;
+import info.chenli.litway.util.UimaUtil;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.uimafit.util.JCasUtil;
+
+public class ArgumentInstances extends AbstractInstances {
+
+ private final static Logger logger = Logger.getLogger(ArgumentInstances.class
+ .getName());
+
+ public ArgumentInstances() {
+ super(new int[] { Protein.type, Event.type });
+
+ }
+
+ @Override
+ protected List<String> getLabelsString() {
+
+ ArrayList<String> argumentTypes = new ArrayList<String>();
+
+ argumentTypes.add("Theme");
+ argumentTypes.add("Cause");
+ argumentTypes.add("Non_Argument");
+
+ return argumentTypes;
+
+ }
+
+ @Override
+ protected List<StructuredInstance> getStructuredInstances(JCas jcas,
+ FSIterator<Annotation> tokenIter) {
+
+ List<StructuredInstance> results = new LinkedList<StructuredInstance>();
+
+ AnnotationIndex sentenceIndex = jcas
+ .getAnnotationIndex(Sentence.type);
+
+ FSIterator sentenceIter = sentenceIndex.iterator();
+ Map<Integer, Set<Pair>> pairsOfArticle = new HashMap<Integer, Set<Pair>>();
+ if (new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) {
+ pairsOfArticle = StanfordDependencyReader
+ .getPairs(new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")));
+ } else {
+ pairsOfArticle = StanfordDependencyReader
+ .getPairs(new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sd")));
+ }
+
+ /*String s = FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas));*/
+ // Currently, one sentence is considered as one structured instance.
+ while (sentenceIter.hasNext()) {
+
+ StructuredInstance si = new StructuredInstance();
+ List<Instance> argumentCandidates = new LinkedList<Instance>();
+ si.setNodes(argumentCandidates);
+
+ Sentence sentence = (Sentence) sentenceIter.next();
+ Set<Pair> pairsOfSentence = pairsOfArticle.get(sentence.getId());
+
+ DependencyExtractor dependencyExtractor = new DependencyExtractor(
+ JCasUtil.selectCovered(jcas, Token.class, sentence),
+ pairsOfSentence);
+
+ List<Event> events = JCasUtil.selectCovered(jcas, Event.class,
+ sentence);
+ List<Trigger> triggers= JCasUtil.selectCovered(jcas, Trigger.class,
+ sentence);
+ List<Protein> proteins = JCasUtil.selectCovered(jcas,
+ Protein.class, sentence);
+ if (proteins.size() < 1) {
+ continue;
+ }
+ if (triggers.size() < 1) {
+ continue;
+ }
+ Map<Integer, Set<Event>> triggerEvevts = new TreeMap<Integer, Set<Event>>();
+ for (Trigger trigger : triggers) {
+ for (Event event : events) {
+ if (event.getTrigger().getBegin() == trigger.getBegin()) {
+ Token themeToken = getThemeToken(jcas, event, sentence);
+ if (null == themeToken) {
+ // There are cross sentence themes, which are not considered
+ // at the moment.
+ //logger.warning(fileName.concat(": An event must have a theme. It may be caused by cross-sentence event."));
+ continue;
+ }
+
+ Set<Event> triggerEvevt = new HashSet<Event>();
+ if (triggerEvevts.containsKey(trigger.getId())) {
+ triggerEvevt = triggerEvevts.get(trigger.getId());
+ }
+ triggerEvevt.add(event);
+ triggerEvevts.put(trigger.getId(), triggerEvevt);
+ }
+ }
+ }
+
+ for (Trigger trigger : triggers) {
+ /*if (!EventType.isBindingEvent(trigger.getEventType())) {
+ continue;
+ }*/
+ // check protein arguments
+ for (Protein protein : proteins) {
+ boolean isTheme = false, isCause = false;
+ if (triggerEvevts.containsKey(trigger.getId())) {
+ loop : for (Event event : triggerEvevts.get(trigger.getId())) {
+ for (int i = 0; i < event.getThemes().size(); i++) {
+ isTheme = event.getThemes(i).equals(
+ protein.getId());
+ if (isTheme == true) {
+ break loop;
+ }
+ }
+ }
+
+ if (!isTheme
+ && EventType.isComplexEvent(trigger.getEventType())) {
+
+ for (Event event : triggerEvevts.get(trigger.getId())) {
+ if (null != event.getCause()) {
+ isCause = event.getCause().equals(
+ protein.getId());
+ if (isCause == true) {
+ break;
+ }
+ }
+ }
+ }
+ }
+/* Token triggerToken = getToken(jcas, trigger);
+ Token token = getToken(jcas, protein);
+ int pathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ triggerToken, token);
+ int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId()
+ : triggerToken.getId() - token.getId();
+ if (pathLength > 6) {
+ if (isTheme || isCause) {
+ System.out.println("error");
+ }
+ //continue;
+ }
+ if (distance > 10) {
+ //notTheme.add(protein.getId());
+ }*/
+
+ Instance instance = argumentToInstance(jcas, sentence,
+ protein, trigger, pairsOfSentence,
+ dependencyExtractor, isTheme, isCause, Stage.THEME);
+ if ( instance != null) {
+ argumentCandidates.add(instance);
+ }
+ }
+
+ // check event arguments
+ if (EventType.isComplexEvent(trigger.getEventType())) {
+ for (Trigger argumentTrigger : triggers) {
+ if (argumentTrigger.getBegin() == trigger.getBegin()) {
+ continue;
+ }
+
+ boolean isTheme =false, isCause =false;
+ if (triggerEvevts.containsKey(trigger.getId())
+ && triggerEvevts.containsKey(argumentTrigger.getId())) {
+ if (EventType.isRegulatoryEvent(trigger.getEventType())) {
+ loop : for (Event event : triggerEvevts.get(trigger.getId())) {
+ for (Event themeEvent : triggerEvevts.get(argumentTrigger.getId())) {
+ if (event.getThemes(0).equalsIgnoreCase(themeEvent.getId())) {
+ isTheme = true;
+ break loop;
+ }
+ }
+ }
+ }
+
+ if (!isTheme) {
+ loop : for (Event event : triggerEvevts.get(trigger.getId())) {
+ for (Event themeEvent : triggerEvevts.get(argumentTrigger.getId())) {
+ if (null != event.getCause()
+ && event.getCause().equalsIgnoreCase(themeEvent.getId())) {
+ isCause = true;
+ break loop;
+ }
+ }
+ }
+ }
+ }
+
+ /*Token triggerToken = getToken(jcas, trigger);
+ Token token = getToken(jcas, argumentTrigger);
+ int pathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ triggerToken, token);
+ int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId()
+ : triggerToken.getId() - token.getId();
+ if (pathLength > 6) {
+ if (isTheme || isCause) {
+ System.out.println("error");
+ }
+ //continue;
+ }
+ if (distance > 10) {
+ //notTheme.add(protein.getId());
+ }*/
+
+ argumentCandidates.add(argumentToInstance(jcas, sentence,
+ argumentTrigger, trigger, pairsOfSentence,
+ dependencyExtractor, isTheme, isCause, Stage.THEME));
+ }
+ }
+ }
+ results.add(si);
+ }
+
+ return results;
+ }
+
+ public static void main(String[] args) {
+
+ ArgumentInstances ti = new ArgumentInstances();
+ ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+
+ List instances = ti.getInstances(new File(args[0]));
+
+ InstanceDictionary dict = new InstanceDictionary();
+ dict.creatNumericDictionary(instances);
+ String classifierName = "liblinear";
+ dict.saveDictionary(new File("./model/arguments.".concat(classifierName)
+ .concat(".dict")));
+
+ ti.saveInstances(new File("./model/instances.arguments.txt"));
+ ti.saveSvmLightInstances(new File("./model/instances.arguments.svm.txt"));
+
+ if (args.length == 2 && args[1].equals("dev")) {
+
+ ArgumentInstances testInstances = new ArgumentInstances();
+ testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+ List tInstances = testInstances.getInstances(new File(
+ "./data/development/"));
+
+ tInstances = dict.instancesToNumeric(tInstances);
+
+ testInstances.saveInstances(new File(
+ "./model/instances.arguments.dev.txt"));
+ testInstances.saveSvmLightInstances(new File(
+ "./model/instances.arguments.svm.dev.txt"));
+ }
+
+ }
+}
diff --git a/src/info/chenli/litway/bionlp13/ge/ArgumentRecogniser.java b/src/info/chenli/litway/bionlp13/ge/ArgumentRecogniser.java
new file mode 100644
index 0000000..3b1c43e
--- /dev/null
+++ b/src/info/chenli/litway/bionlp13/ge/ArgumentRecogniser.java
@@ -0,0 +1,245 @@
+package info.chenli.litway.bionlp13.ge;
+
+import info.chenli.classifier.Accurary;
+import info.chenli.classifier.Instance;
+import info.chenli.classifier.InstanceDictionary;
+import info.chenli.classifier.LibLinearFacade;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.logging.Logger;
+
+import de.bwaldvogel.liblinear.Feature;
+import de.bwaldvogel.liblinear.FeatureNode;
+import de.bwaldvogel.liblinear.Linear;
+
+/**
+ *
+ * @author rqsong
+ *
+ */
+public class ArgumentRecogniser extends LibLinearFacade {
+
+ private final static Logger logger = Logger.getLogger(ArgumentRecogniser.class
+ .getName());
+ private final String classifierName = "liblinear";
+
+ public void train(File trainingSet, boolean useSearn) {
+
+ if (useSearn) {
+
+ } else {
+
+ InstanceDictionary dict = new InstanceDictionary();
+
+ ArgumentInstances trainingInstances = new ArgumentInstances();
+ trainingInstances
+ .setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+ List instances = trainingInstances
+ .getInstances(trainingSet);
+
+ dict.creatNumericDictionary(instances);
+ dict.saveDictionary(new File("./model/arguments.".concat(
+ classifierName).concat(".dict")));
+ /*
+ trainingInstances.saveInstances(new File(
+ "./model/instances.theme.txt"));
+ trainingInstances.saveSvmLightInstances(new File(
+ "./model/instances.theme.svm.txt"));
+ */
+ train(dict.instancesToNumeric(instances));
+ saveModel(new File("./model/arguments.".concat(classifierName)
+ .concat(".model")));
+ // System.out.println(accuracy(instances));
+
+
+
+ // System.out.println(accuracy(instances));
+ }
+
+ }
+
+ public void train2(File trainingSet, boolean useSearn) {
+
+ if (useSearn) {
+
+ } else {
+
+ InstanceDictionary dict = new InstanceDictionary();
+
+ ArgumentInstances trainingInstances = new ArgumentInstances();
+ trainingInstances
+ .setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+ List instances = trainingInstances
+ .getInstances(trainingSet);
+
+ dict.creatNumericDictionary(instances);
+ dict.saveDictionary(new File("./model/arguments.train.devel.".concat(
+ classifierName).concat(".dict")));
+ /*
+ trainingInstances.saveInstances(new File(
+ "./model/instances.theme.txt"));
+ trainingInstances.saveSvmLightInstances(new File(
+ "./model/instances.theme.svm.txt"));
+ */
+ train(dict.instancesToNumeric(instances));
+ saveModel(new File("./model/arguments.train.devel.".concat(classifierName)
+ .concat(".model")));
+ // System.out.println(accuracy(instances));
+
+
+
+ // System.out.println(accuracy(instances));
+ }
+
+ }
+
+ public static void main(String[] args) {
+
+ ArgumentRecogniser tr = new ArgumentRecogniser();
+ //tr.train2(new File("/media/songrq/soft/litway/数据/BioNLP13/"
+ // + "BioNLP-ST-2013_GE_train_devel_data_yuanShuJu"), false);
+
+ tr.train2(new File(args[0]), false);
+
+ //tr.train(new File("/media/songrq/soft/litway/数据/BioNLP13/"
+ // + "BioNLP-ST-2013_GE_train_data_yuanShuJu"), false);
+
+ /*tr.train(new File("/media/songrq/soft/litway/数据/BioNLP11/"
+ + "BioNLP-ST-2011-2013_GE_train_data"), false);
+ tr.test(new File("/media/songrq/soft/litway/数据/BioNLP13/"
+ + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));*/
+
+ //tr.train(new File("/media/songrq/soft/litway/数据/BioNLP11/"
+ // + "b"), false);
+
+ //tr.test(new File("/media/songrq/soft/litway/数据/BioNLP13/"
+ // + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));
+
+ tr.train2(new File("/media/songrq/soft/litway/数据/BioNLP11/"
+ + "BioNLP-ST-2011-2013_GE_train_devel_data"), false);
+
+
+
+ /*
+ tr.loadModel(new File("./model/themes.liblinear.model".concat(tr.classifierName)
+ .concat(".model")));
+
+ InstanceDictionary dict = new InstanceDictionary();
+ dict.loadDictionary(new File("./model/themes."
+ .concat(tr.classifierName).concat(".dict")));
+
+ ThemeInstances ti = new ThemeInstances();
+ ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+
+ List instances = ti.getInstances(new File(args[0]));
+
+ instances = dict.instancesToNumeric(instances);
+
+ int total = 0, correct = 0;
+ for (Instance instance : instances) {
+ int prediction = tr.predict(instance);
+ System.out.print(instance.getLabel() + ":" + prediction);
+ for (String[] values : instance.getFeaturesString()) {
+ for (String value : values) {
+ System.out.print("\t" + value);
+ }
+ }
+ System.out.println();
+ for (int value : instance.getFeaturesNumeric()) {
+ System.out.print("\t" + value);
+ }
+ System.out.println();
+ if (prediction == instance.getLabel()) {
+ correct++;
+ }
+ total++;
+ }
+ System.out.println(new Accurary(correct, total));
+ */
+ }
+
+ private void test(File file) {
+ // TODO Auto-generated method stub
+ ArgumentInstances testInstances = new ArgumentInstances();
+ testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+ List instances = testInstances.getInstances(file);
+ InstanceDictionary dict = new InstanceDictionary();
+ dict.loadDictionary(new File("./model/arguments."
+ .concat(classifierName).concat(".dict")));
+ this.loadModel(new File("./model/arguments.".concat(
+ classifierName).concat(".model")));
+ instances = dict.instancesToNumeric(instances);
+ testInstances.saveSvmLightInstances(new File(
+ "./model/instances.arguments.svm.dev.txt"));
+ int total = 0, correct = 0, tp, tn = 0, n = 0, fn, fp;
+ float p, r, f;
+
+ for (Instance instance : instances) {
+ int prediction = predict(instance);
+ if (prediction == instance.getLabel()) {
+ if (instance.getLabelString().equalsIgnoreCase("Non_Argument")){
+ tn++;
+ }//else if (instance.getFileId().equals("Binding")){
+ //System.out.println("TP " + instance.getFileId() + " " + instance.getLabelString() + " " + instance.getId());
+ //}
+ correct++;
+ }//else if (!instance.getLabelString().equalsIgnoreCase("Non_Argument")
+ // && prediction == dict.getLabelNumeric("Non_Argument")
+ // && instance.getFileId().equals("Binding")) {
+ //System.out.println("FN " + instance.getFileId() + " " + instance.getId());
+ //}
+
+ if (instance.getLabelString().equalsIgnoreCase("Non_Argument")){
+ n++;
+ }
+ total++;
+ }
+
+ fp = n - tn;
+ tp = correct - tn;
+ fn = total - n - tp;
+ p = (float) tp / (tp + fp);
+ r = (float) tp / (tp + fn);
+ f = (float) 2 * p * r / (p + r);
+
+ System.out.println(new Accurary(correct, total));
+ System.out.println("tp: " + tp + " fp: " + fp + " fn: " + fn);
+ System.out.println("p: " + p + " r: " + r + " f: " + f);
+ }
+
+ public double predict_values(int[] featureSparseVector) {
+
+ if (featureSparseVector == null) {
+ throw new IllegalArgumentException(
+ "Empty sparse vector. This probably due to that the dictionary hasn't converted instances to numeric features yet.");
+ }
+
+ int n;
+ int nr_feature = this.model.getNrFeature();
+ if (this.model.getBias() >= 0) {
+ n = nr_feature + 1;
+ } else {
+ n = nr_feature;
+ }
+
+ List featureNodes = new ArrayList();
+ int previousIndex = 0;
+ for (int index : featureSparseVector) {
+ if (index > previousIndex) {
+ featureNodes.add(new FeatureNode(index, 1));
+ }
+ previousIndex = index;
+ }
+ if (model.getBias() >= 0) {
+ Feature node = new FeatureNode(n, model.getBias());
+ featureNodes.add(node);
+ }
+ Feature[] instance = new FeatureNode[featureNodes.size()];
+ instance = featureNodes.toArray(instance);
+ double[] dec_values = new double[this.model.getNrClass()];
+ int type = (int) Math.round(Linear.predictValues(this.model, instance, dec_values));
+ return dec_values[type];
+ }
+}
diff --git a/src/info/chenli/litway/bionlp13/ge/BindingInstances.java b/src/info/chenli/litway/bionlp13/ge/BindingInstances.java
index 5491ec8..fb998fe 100644
--- a/src/info/chenli/litway/bionlp13/ge/BindingInstances.java
+++ b/src/info/chenli/litway/bionlp13/ge/BindingInstances.java
@@ -18,14 +18,18 @@
import java.io.File;
import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
+import java.util.TreeMap;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.StringArray;
import org.apache.uima.jcas.tcas.Annotation;
import org.uimafit.util.JCasUtil;
@@ -46,20 +50,38 @@ protected List getLabelsString() {
@Override
protected List getStructuredInstances(JCas jcas,
FSIterator annoIter) {
-
+ final String classifierName = "liblinear";
+
+ boolean test = true;
+ ArgumentRecogniser argumentRecogniser = new ArgumentRecogniser();
+ argumentRecogniser.loadModel(new File("./model/arguments.".concat(
+ classifierName).concat(".model")));
+ InstanceDictionary argumentDict = new InstanceDictionary();
+ argumentDict.loadDictionary(new File("./model/arguments.".concat(
+ classifierName).concat(".dict")));
+
List results = new LinkedList();
AnnotationIndex sentenceIndex = jcas
.getAnnotationIndex(Sentence.type);
FSIterator sentenceIter = sentenceIndex.iterator();
- Map> pairsOfArticle = StanfordDependencyReader
- .getPairs(new File(FileUtil.removeFileNameExtension(
- UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")));
+ Map> pairsOfArticle = new HashMap>();
+ if (new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) {
+ pairsOfArticle = StanfordDependencyReader
+ .getPairs(new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")));
+ } else {
+ pairsOfArticle = StanfordDependencyReader
+ .getPairs(new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sd")));
+ }
+ int sentenceid = 0;
// Currently, one sentence is considered as one structured instance.
while (sentenceIter.hasNext()) {
-
+ sentenceid++;
StructuredInstance si = new StructuredInstance();
List bindingEventCandidates = new LinkedList();
si.setNodes(bindingEventCandidates);
@@ -67,29 +89,196 @@ protected List getStructuredInstances(JCas jcas,
Sentence sentence = (Sentence) sentenceIter.next();
Set pairsOfSentence = pairsOfArticle.get(sentence.getId());
+ List tokens = JCasUtil.selectCovered(jcas, Token.class, sentence);
DependencyExtractor dependencyExtractor = new DependencyExtractor(
- JCasUtil.selectCovered(jcas, Token.class, sentence),
- pairsOfSentence);
+ tokens, pairsOfSentence);
List events = JCasUtil.selectCovered(jcas, Event.class,
sentence);
+ List triggers= JCasUtil.selectCovered(jcas, Trigger.class,
+ sentence);
List proteins = JCasUtil.selectCovered(jcas,
Protein.class, sentence);
-
- for (Event event : events) {
-
- if (event.getTrigger().getEventType()
- .equals(String.valueOf(EventType.Binding))) {
-
- Combinations combs = new Combinations(
- proteins);
- for (List themes : combs.getCombinations()) {
- bindingEventCandidates.add(bindingEventToInstance(jcas,
- sentence, event, themes, dependencyExtractor));
+ if (proteins.size() < 1) {
+ continue;
+ }
+ if (triggers.size() < 1) {
+ continue;
+ }
+ //binding trigger's event
+ Map> triggerEvevts = new TreeMap>();
+ for (Trigger trigger : triggers) {
+
+ if (!trigger.getEventType().equals(String.valueOf(EventType.Binding))) {
+ continue;
+ }
+ for (Event event : events) {
+ if (event.getTrigger().getBegin() == trigger.getBegin()) {
+ int themeToken = getThemeToken2(jcas, event, sentence);
+ if (event.getThemes().size() != themeToken) {
+ // There are cross sentence themes, which are not considered at the moment.
+ continue;
+ }
+
+ Set triggerEvevt = new HashSet();
+ if (triggerEvevts.containsKey(trigger.getId())) {
+ triggerEvevt = triggerEvevts.get(trigger.getId());
+ }
+ triggerEvevt.add(event);
+ triggerEvevts.put(trigger.getId(), triggerEvevt);
}
}
}
+ String[] bindingLemma = {"assembly", "recruitment", "ligand", "interact", "association",
+ "ligation", "binding", "interaction", "recover", "recognize", "bind", "recruit",
+ "dna-binding", "complex", "form", "immunoprecipitate", "heteromultimer"};
+ //proteins that relation is and
+ Set andProtein = getAndProtein(jcas, proteins, dependencyExtractor);
+ //extract instances
+ for (Trigger trigger : triggers) {
+ //int triggerEventSize = 0;
+ if (!trigger.getEventType().equals(String.valueOf(EventType.Binding))) {
+ continue;
+ }
+ /*boolean bind =false;
+ Token token = getTriggerToken(jcas, trigger);
+ for (int i=0; i farProtein = getFarProtein( jcas, trigger, proteins, dependencyExtractor);
+ Set notTheme = getNotProtein( jcas, trigger, proteins, dependencyExtractor);
+
+ for (Protein protein : proteins) {
+ Instance proteinInstance = argumentToInstance(jcas,
+ sentence, protein, trigger, pairsOfSentence,
+ dependencyExtractor, false, false, Stage.THEME);
+ if ( proteinInstance != null) {
+ double prediction = argumentRecogniser.predict(argumentDict
+ .instanceToNumeric(proteinInstance)
+ .getFeaturesNumeric(), proteinInstance);
+ if (prediction != argumentDict.getLabelNumeric("Theme")) {
+ notTheme.add(protein.getId());
+ }
+ }
+ }
+ Set triggerEvevt = triggerEvevts.get(trigger.getId());
+ List themeProteins = new LinkedList();
+ for (Protein protein : proteins) {
+ if (!notTheme.contains(protein.getId()) && !farProtein.contains(protein.getId())) {
+ themeProteins.add(protein);
+ }
+ }
+ Combinations combs = new Combinations(
+ themeProteins);
+
+ loop2 : for (List themes : combs.getCombinations()) {
+ boolean truepositive = false;
+ int equalNum = 0;
+ if (triggerEvevts.containsKey(trigger.getId())) {
+ loop : for (Event bindingEvent : triggerEvevt) {
+ equalNum = 0;
+ if (null != bindingEvent.getThemes()
+ && themes.size() == bindingEvent.getThemes().size()) {
+ for (Protein protein : themes) {
+ boolean foundTheProtein = false;
+ for (int i = 0; i < bindingEvent.getThemes().size(); i++) {
+ if (protein.getId().equals(bindingEvent.getThemes(i))) {
+ equalNum++;
+ if (equalNum == themes.size()) {
+ truepositive = true;
+ break loop;
+ }
+ foundTheProtein = true;
+ break;
+ }
+ }
+ if (foundTheProtein == false) {
+ break;
+ }
+ }
+ }
+ }
+ }
+ /*if (truepositive) {
+ triggerEventSize++;
+ } */
+
+ if (themes.size() > 2) {
+ /*if (truepositive) {
+ System.out.println("error");
+ } */
+ continue;
+ }
+
+ for (Protein p : themes) {
+ if (test && farProtein.contains(p.getId())) {
+ /*if (truepositive) {
+ System.out.println("farerror");
+ } */
+ continue loop2;
+ }
+ if (test && notTheme.contains(p.getId())) {
+ /*if (truepositive) {
+ System.out.println("noterror");
+ } */
+ continue loop2;
+ }
+ }
+ int num = 0;
+ if (themes.size() > 1) {
+ for (Protein p : themes) {
+ if (andProtein.contains(p.getId())) {
+ num++;
+ }
+ }
+ for(Protein p : themes) {
+ for(Protein p2 : themes) {
+ if (p.getId().equalsIgnoreCase(p2.getId())) {
+ continue;
+ }
+ if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText())
+ || p.getCoveredText().contains(p2.getCoveredText())
+ || p2.getCoveredText().contains(p.getCoveredText())) {
+ /*if (truepositive) {
+ System.out.println("sameerror");
+ } else {
+ System.out.println("same");
+ }*/
+ continue loop2;
+ }
+ }
+ }
+ }
+
+ if (test && num > 1) {
+ /*if (truepositive) {
+ System.out.println("anderror");
+ } */
+ continue;
+ }
+
+
+
+ Instance instance = bindingEventToInstance(jcas,
+ sentence, trigger, themes, dependencyExtractor, truepositive);
+ instance.setSentenceId(sentenceid);
+ instance.setFileId(trigger.getCoveredText() + "\t");
+ for (Protein p : themes) {
+ instance.setFileId(instance.getFileId() + p.getCoveredText() + "\t");
+ }
+ bindingEventCandidates.add(instance);
+ }
+ //System.out.println(triggerEventSize);
+ }
+
results.add(si);
}
@@ -101,17 +290,18 @@ public static void main(String[] args) {
BindingInstances ti = new BindingInstances();
ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
- List instances = ti.getInstances(new File(args[0]));
+ List instances = ti.getInstances(
+ new File("/media/songrq/soft/litway/数据/BioNLP13/b"));
- InstanceDictionary dict = new InstanceDictionary();
+/* InstanceDictionary dict = new InstanceDictionary();
dict.creatNumericDictionary(instances);
String classifierName = "liblinear";
ti.saveInstances(new File("./model/instances.binding.txt"));
ti.saveSvmLightInstances(new File(
"./model/instances.binding.svm.no_dum.txt"));
-
- if (args.length == 2 && args[1].equals("dev")) {
+*/
+ /*if (args.length == 2 && args[1].equals("dev")) {
dict.saveDictionary(new File("./model/binding.".concat(
classifierName).concat(".dict")));
@@ -125,6 +315,150 @@ public static void main(String[] args) {
ti.saveInstances(new File("./model/instances.binding.dev.txt"));
testInstances.saveSvmLightInstances(new File(
"./model/instances.binding.svm.dev.no_dum.txt"));
+ }*/
+ }
+ protected int getThemeToken2(JCas jcas, Event event, Sentence sentence) {
+
+ int tokenNum = 0;
+ StringArray themes = event.getThemes();
+
+ for (Protein protein : JCasUtil.selectCovered(jcas, Protein.class,
+ sentence)) {
+ for (int i=0; i getNotProtein(JCas jcas, Trigger trigger, List sentenceProteins, DependencyExtractor dependencyExtractor) {
+
+ //delete protein whose token is too far from triggertoken
+ Set notTheme = new HashSet();
+ for (Protein protein : sentenceProteins) {
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token token = getToken(jcas, protein);
+ int pathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ triggerToken, token);
+ //int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId()
+ // : triggerToken.getId() - token.getId();
+ if (pathLength > 6) {
+ notTheme.add(protein.getId());
+ }
+ /*if (distance > 10) {
+ notTheme.add(protein.getId());
+ }*/
+ }
+ return notTheme;
+ }
+
+ protected Set getFarProtein(JCas jcas, Trigger trigger, List sentenceProteins, DependencyExtractor dependencyExtractor) {
+
+ //the same protein, delete far protein
+ Set farProtein = new HashSet();
+ if (sentenceProteins.size() > 1) {
+ for(Protein p : sentenceProteins) {
+ for(Protein p2 : sentenceProteins) {
+ if (p.getId().equalsIgnoreCase(p2.getId())) {
+ continue;
+ }
+ if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText())
+ || p.getCoveredText().contains(p2.getCoveredText())
+ || p2.getCoveredText().contains(p.getCoveredText())
+ ) {
+
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token token = getToken(jcas, p);
+ Token token2 = getToken(jcas, p2);
+
+ int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId()
+ : triggerToken.getId() - token.getId();
+ int distance2 = token2.getId() > triggerToken.getId() ? token2.getId() - triggerToken.getId()
+ : triggerToken.getId() - token2.getId();
+ int pathLength = dependencyExtractor.getDijkstraShortestPathLength(
+ triggerToken, token);
+ int pathLength2 = dependencyExtractor.getDijkstraShortestPathLength(
+ triggerToken, token2);
+ if (pathLength > pathLength2) {
+ farProtein.add(p.getId());
+ }else if (pathLength < pathLength2) {
+ farProtein.add(p2.getId());
+ }else if (pathLength == pathLength2 && distance > distance2) {
+ farProtein.add(p.getId());
+ }else if (pathLength == pathLength2 && distance < distance2) {
+ farProtein.add(p2.getId());
+ }
+ }
+ }
+ }
+ }
+ return farProtein;
+ }
+
+ protected Set getAndProtein(JCas jcas, List sentenceProteins, DependencyExtractor dependencyExtractor) {
+
+ Set andProtein = new HashSet();
+ if (sentenceProteins.size() > 1) {
+ for (Protein protein : sentenceProteins) {
+ for (Protein protein2 : sentenceProteins) {
+ if (protein.getId().equalsIgnoreCase(protein2.getId())) {
+ continue;
+ }
+ Token token2 = getToken(jcas, protein2);
+ Token token = getToken(jcas, protein);
+ String dependencyPath = dependencyExtractor.getShortestPath(
+ token, token2, Stage.BINDING);
+ if (dependencyPath != null
+ && (dependencyPath.equalsIgnoreCase("conj_and")
+ || dependencyPath.equalsIgnoreCase("-conj_and")
+ ||dependencyPath.equalsIgnoreCase("conj_or")
+ || dependencyPath.equalsIgnoreCase("-conj_or")
+ ||dependencyPath.equalsIgnoreCase("abbrev") //abbreviation, equivalent form
+ || dependencyPath.equalsIgnoreCase("-abbrev")
+ //||dependencyPath.equalsIgnoreCase("appos")
+ //|| dependencyPath.equalsIgnoreCase("-appos")
+ )) {
+ andProtein.add(protein.getId());
+ andProtein.add(protein2.getId());
+ }
+ /*if (token2.getId() == token.getId()) {
+ //andProtein.add(protein.getId());
+ //andProtein.add(protein2.getId());
+ }*/
+ /*List between = new LinkedList();
+ for (Token token : tokens) {
+ if (protein2.getBegin() >= protein.getEnd()) {
+ if (token.getBegin() >= protein.getEnd()
+ && token.getEnd() <= protein2.getBegin()) {
+ between.add(token);
+ }
+ }else if(protein.getBegin() >= protein2.getEnd()) {
+ if (token.getBegin() >= protein2.getEnd()
+ && token.getEnd() <= protein.getBegin()) {
+ between.add(token);
+ }
+ }
+ }
+ boolean isAnd = true;
+ for (Token token : between) {
+ if (!token.getCoveredText().equalsIgnoreCase(",")
+ && !token.getCoveredText().equalsIgnoreCase("and")
+ && !token.getCoveredText().equalsIgnoreCase("or")
+ ) {
+ isAnd = false;
+ break;
+ }
+ }
+ if (isAnd) {
+ andProtein.add(protein.getId());
+ andProtein.add(protein2.getId());
+ }*/
+ }
+ }
}
+ return andProtein;
}
+
}
diff --git a/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java b/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java
index 9cfd27a..50085d2 100644
--- a/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java
+++ b/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java
@@ -1,15 +1,21 @@
package info.chenli.litway.bionlp13.ge;
+import info.chenli.classifier.Accurary;
import info.chenli.classifier.Instance;
import info.chenli.classifier.InstanceDictionary;
import info.chenli.classifier.LibLinearFacade;
import info.chenli.litway.util.Timer;
import java.io.File;
+import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;
+import de.bwaldvogel.liblinear.Feature;
+import de.bwaldvogel.liblinear.FeatureNode;
+import de.bwaldvogel.liblinear.Linear;
+
public class BindingRecogniser extends LibLinearFacade {
private final static Logger logger = Logger
@@ -34,10 +40,10 @@ public void train(String trainingDir, int round) {
logger.info("Save dictionary.");
// save instances
- trainingInstances.saveInstances(new File(
+ /*trainingInstances.saveInstances(new File(
"./model/instances.binding.txt"));
trainingInstances.saveSvmLightInstances(new File(
- "./model/instances.binding.svm.txt"));
+ "./model/instances.binding.svm.txt"));*/
// shuffle
Collections.shuffle(instances);
@@ -56,9 +62,152 @@ public void train(String trainingDir, int round) {
}
+ public void train2(String trainingDir, int round) {
+ //
+ // collect all instances and fetch syntactical information
+ //
+ BindingInstances trainingInstances = new BindingInstances();
+ trainingInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+ List instances = trainingInstances.getInstances(new File(
+ trainingDir));
+ logger.info(String.valueOf(instances.size()).concat(
+ " instances are collected."));
+
+ InstanceDictionary dict = new InstanceDictionary();
+ dict.creatNumericDictionary(instances);
+ dict.saveDictionary(new File("./model/bindings.train.devel.".concat(classifierName)
+ .concat(".dict")));
+ logger.info("Save dictionary.");
+
+ // save instances
+/* trainingInstances.saveInstances(new File(
+ "./model/instances.binding.txt"));
+ trainingInstances.saveSvmLightInstances(new File(
+ "./model/instances.binding.svm.txt"));
+*/
+ // shuffle
+ Collections.shuffle(instances);
+ logger.info("Shuffle instances.");
+
+ Timer timer = new Timer();
+ timer.start();
+
+ train(instances, round);
+ timer.stop();
+ logger.info("Training takes ".concat(String.valueOf(timer
+ .getRunningTime())));
+
+ saveModel(new File("./model/bindings.train.devel.".concat(classifierName).concat(
+ ".model")));
+
+ }
+
+ private void test(File file) {
+ // Evaluate the trained binding classifier on a held-out set and print P/R/F.
+ BindingInstances testInstances = new BindingInstances();
+ testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+ List instances = testInstances.getInstances(file);
+ logger.info(String.valueOf(instances.size()).concat(
+ " instances are collected."));
+ InstanceDictionary dict = new InstanceDictionary();
+ dict.loadDictionary(new File("./model/bindings."
+ .concat(classifierName).concat(".dict")));
+ this.loadModel(new File("./model/bindings.".concat(
+ classifierName).concat(".model")));
+ instances = dict.instancesToNumeric(instances);
+ testInstances.saveSvmLightInstances(new File(
+ "./model/instances.bindings.svm.dev.txt"));
+ int total = 0, correct = 0, tp = 0, tn = 0, fn, fp, pp = 0;
+ float p, r, f;
+
+ for (Instance instance : instances) {
+ int prediction = predict(instance);
+ if (prediction == instance.getLabel()) {
+ if (instance.getLabelString().equalsIgnoreCase("Binding")){
+ tp++;
+ }
+ correct++;
+ }else if (prediction != instance.getLabel()
+ && instance.getLabelString().equalsIgnoreCase("Non_binding")) {
+ //System.out.print(instance.getSentenceId());
+ //System.out.println("\t" + "fp" + "\t" + instance.getFileId());
+ }else if (prediction != instance.getLabel()
+ && instance.getLabelString().equalsIgnoreCase("Binding")) {
+ //System.out.print(instance.getSentenceId());
+ //System.out.println("\t" + "fn" + "\t" + instance.getFileId());
+ }
+
+ if (instance.getLabelString().equalsIgnoreCase("Binding")){
+ pp++;
+ }
+ total++;
+ }
+
+ fn = pp - tp;
+ tn = correct - tp;
+ fp = total - pp - tn;
+ p = (float) tp / (tp + fp);
+ r = (float) tp / (tp + fn);
+ f = (float) 2 * p * r / (p + r);
+
+ System.out.println(new Accurary(correct, total));
+ System.out.println("tp: " + tp + " fp: " + fp + " fn: " + fn);
+ System.out.println("p: " + p + " r: " + r + " f: " + f);
+ }
public static void main(String[] args) {
BindingRecogniser br = new BindingRecogniser();
- br.train(args[0], 1);
+ //br.train2("/media/songrq/soft/litway/数据/BioNLP13/"
+ // + "BioNLP-ST-2013_GE_train_devel_data_yuanShuJu", 1);
+
+ br.train2(args[0], 1);
+
+ /*br.train("/media/songrq/soft/litway/数据/BioNLP13/"
+ + "BioNLP-ST-2013_GE_train_data_yuanShuJu", 1);
+ br.test(new File("/media/songrq/soft/litway/数据/BioNLP13/"
+ + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));*/
+
+ /*br.train("/media/songrq/soft/litway/数据/BioNLP11/"
+ + "BioNLP-ST-2011-2013_GE_train_data", 1);
+ br.test(new File("/media/songrq/soft/litway/数据/BioNLP13/"
+ + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));*/
+
+ //br.train2("/media/songrq/soft/litway/数据/BioNLP11/"
+ // + "BioNLP-ST-2011-2013_GE_train_devel_data", 1);
+ }
+
+
+ public double predict_values(int[] featureSparseVector) {
+
+ if (featureSparseVector == null) {
+ throw new IllegalArgumentException(
+ "Empty sparse vector. This probably due to that the dictionary hasn't converted instances to numeric features yet.");
+ }
+
+ int n;
+ int nr_feature = this.model.getNrFeature();
+ if (this.model.getBias() >= 0) {
+ n = nr_feature + 1;
+ } else {
+ n = nr_feature;
+ }
+
+ List featureNodes = new ArrayList();
+ int previousIndex = 0;
+ for (int index : featureSparseVector) {
+ if (index > previousIndex) {
+ featureNodes.add(new FeatureNode(index, 1));
+ }
+ previousIndex = index;
+ }
+ if (model.getBias() >= 0) {
+ Feature node = new FeatureNode(n, model.getBias());
+ featureNodes.add(node);
+ }
+ Feature[] instance = new FeatureNode[featureNodes.size()];
+ instance = featureNodes.toArray(instance);
+ double[] dec_values = new double[this.model.getNrClass()];
+ int type = (int) Math.round(Linear.predictValues(this.model, instance, dec_values));
+ return dec_values[type];
}
}
diff --git a/src/info/chenli/litway/bionlp13/ge/CauseInstances.java b/src/info/chenli/litway/bionlp13/ge/CauseInstances.java
index 9b8bcd1..2efea54 100644
--- a/src/info/chenli/litway/bionlp13/ge/CauseInstances.java
+++ b/src/info/chenli/litway/bionlp13/ge/CauseInstances.java
@@ -102,10 +102,13 @@ protected List getStructuredInstances(JCas jcas,
boolean isCause = event.getCause() == null ? false : event
.getCause().equals(protein.getId());
-
- causeCandidates.add(causeToInstance(jcas, sentence,
+ Instance instance = causeToInstance(jcas, sentence,
protein, event.getTrigger(), pairsOfSentence,
- dependencyExtractor, isCause, themeToken));
+ dependencyExtractor, isCause, themeToken);
+
+ if ( instance != null) {
+ causeCandidates.add(instance);
+ }
}
// check event causes
diff --git a/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java b/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java
index d0ed467..85cb0a2 100644
--- a/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java
+++ b/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java
@@ -34,7 +34,7 @@ public void train(File trainingSet, boolean useSearn) {
.getInstances(trainingSet);
dict.creatNumericDictionary(instances);
- dict.saveDictionary(new File("./model/causes.dict"));
+ dict.saveDictionary(new File("./model/causes.liblinear.dict"));
this.train(instances);
diff --git a/src/info/chenli/litway/bionlp13/ge/Classify.java b/src/info/chenli/litway/bionlp13/ge/Classify.java
new file mode 100644
index 0000000..6daa424
--- /dev/null
+++ b/src/info/chenli/litway/bionlp13/ge/Classify.java
@@ -0,0 +1,49 @@
+package info.chenli.litway.bionlp13.ge;
+
+import info.chenli.classifier.Instance;
+import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.UimaUtil;
+
+import java.io.*;
+import java.util.List;
+
+public class Classify {
+
+ public static void main(String[] args) {
+ File trainFile = new File("./model/instances.trigger.txt");
+ File develFile = new File("./model/instances.trigger.dev.txt");
+ List instances;
+ Instance instance;
+ try {
+ InputStreamReader trainFileStream = new InputStreamReader(
+ new FileInputStream(trainFile), "UTF8");
+ BufferedReader trainFileBuffer = new BufferedReader(trainFileStream);
+
+ InputStreamReader develFileStream = new InputStreamReader(
+ new FileInputStream(develFile), "UTF8");
+ BufferedReader develFileBuffer = new BufferedReader(develFileStream);
+
+ String trainFileCh;
+ while ((trainFileCh = trainFileBuffer.readLine()) != null) {
+ String[] trainInstance = trainFileCh.split("\t");
+ //instance.setLabel(Integer.parseInt(trainInstance[0]));
+ //instance.getFeaturesNumeric();
+ //instances.add(instance);
+
+ }
+ trainFileBuffer.close();
+ trainFileStream.close();
+ develFileStream.close();
+ develFileBuffer.close();
+
+ } catch (UnsupportedEncodingException e) {
+ e.printStackTrace();
+ } catch (FileNotFoundException e) {
+ e.printStackTrace();
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+
+ }
+
+}
diff --git a/src/info/chenli/litway/bionlp13/ge/EventExtractor.java b/src/info/chenli/litway/bionlp13/ge/EventExtractor.java
index 872ca24..aa4e726 100644
--- a/src/info/chenli/litway/bionlp13/ge/EventExtractor.java
+++ b/src/info/chenli/litway/bionlp13/ge/EventExtractor.java
@@ -3,7 +3,6 @@
import info.chenli.classifier.Instance;
import info.chenli.classifier.InstanceDictionary;
import info.chenli.litway.corpora.Event;
-import info.chenli.litway.corpora.POS;
import info.chenli.litway.corpora.Protein;
import info.chenli.litway.corpora.Sentence;
import info.chenli.litway.corpora.Token;
@@ -92,6 +91,10 @@ public String extractFromSingleFile(File file) {
Map> pairsOfArticle = StanfordDependencyReader
.getPairs(new File(FileUtil.removeFileNameExtension(
file.getAbsolutePath()).concat(".sdepcc")));
+
+ File word2vecFile = new File("/home/songrq/word2vec/data/word2vec100");
+ Map word2vec = ReadWord2vec.word2vec(word2vecFile);
+
//
// Initialize the classifiers
//
@@ -136,15 +139,18 @@ public String extractFromSingleFile(File file) {
while (sentenceIter.hasNext()) {
Sentence sentence = (Sentence) sentenceIter.next();
- Set pairsOfSentence = pairsOfArticle.get(sentence.getId());
-
- // The queue where newly generated events are put
- LinkedBlockingQueue newEvents = new LinkedBlockingQueue();
-
// protein
List sentenceProteins = JCasUtil.selectCovered(jcas,
Protein.class, sentence);
+ if (sentenceProteins.size() <= 0) {
+ continue;
+ }
+
+ Set pairsOfSentence = pairsOfArticle.get(sentence.getId());
+ // The queue where newly generated events are put
+ LinkedBlockingQueue newEvents = new LinkedBlockingQueue();
+
//
// trigger detection
//
@@ -157,9 +163,9 @@ public String extractFromSingleFile(File file) {
triggers.put(sentence.getId(), new ArrayList());
}
- triggerDetectionLoop: for (Token token : tokens) {
+ for (Token token : tokens) {
- if (!POS.isPos(token.getPos())) {
+ /*if (!POS.isPos(token.getPos())) {
continue triggerDetectionLoop;
}
for (Protein protein : sentenceProteins) {
@@ -169,26 +175,28 @@ public String extractFromSingleFile(File file) {
.getEnd() <= protein.getEnd())) {
continue triggerDetectionLoop;
}
- }
+ }*/
Instance tokenInstance = tokenToInstance(jcas, token, null,
tokens, sentenceProteins, pairsOfSentence,
- dependencyExtractor);
+ dependencyExtractor, word2vec);
// set the token filter here
// if (!TriggerRecogniser.isConsidered(tokenInstance
// .getFeaturesString().get(2))) {
// continue;
// }
- int prediction = triggerRecogniser.predict(triggerDict
- .instanceToNumeric(tokenInstance));
-
- if (prediction != triggerDict.getLabelNumeric(String
- .valueOf(EventType.Non_trigger))) {
-
- Trigger trigger = new Trigger(jcas, token.getBegin(),
- token.getEnd());
- trigger.setEventType(triggerDict.getLabelString(prediction));
- trigger.setId("T".concat(String.valueOf(++proteinNum)));
- triggers.get(sentence.getId()).add(trigger);
+ if (tokenInstance != null) {
+ int prediction = triggerRecogniser.predict(triggerDict
+ .instanceToNumeric(tokenInstance));
+
+ if (prediction != triggerDict.getLabelNumeric(String
+ .valueOf(EventType.Non_trigger))) {
+
+ Trigger trigger = new Trigger(jcas, token.getBegin(),
+ token.getEnd());
+ trigger.setEventType(triggerDict.getLabelString(prediction));
+ trigger.setId("T".concat(String.valueOf(++proteinNum)));
+ triggers.get(sentence.getId()).add(trigger);
+ }
}
}
@@ -197,7 +205,9 @@ public String extractFromSingleFile(File file) {
//
// 1. iterate through all proteins
-
+ if (null == triggers.get(sentence.getId())) {
+ continue;
+ }
if (null == events.get(sentence.getId())) {
events.put(sentence.getId(), new ArrayList());
}
@@ -213,7 +223,7 @@ public String extractFromSingleFile(File file) {
dependencyExtractor, false);
double prediction = themeRecogniser.predict(themeDict
.instanceToNumeric(proteinInstance)
- .getFeaturesNumeric());
+ .getFeaturesNumeric(), proteinInstance);
// if (trigger.getEventType().equals(
// String.valueOf(EventType.Localization))
@@ -264,7 +274,7 @@ public String extractFromSingleFile(File file) {
dependencyExtractor, false);
double prediction = themeRecogniser.predict(themeDict
.instanceToNumeric(proteinInstance)
- .getFeaturesNumeric());
+ .getFeaturesNumeric(), proteinInstance);
if (prediction == themeDict.getLabelNumeric("Theme")) {
themes.add(protein);
@@ -273,8 +283,6 @@ public String extractFromSingleFile(File file) {
}
if (themes.size() > 0) {
- Event event = new Event(jcas);
- event.setTrigger(trigger);
// event.setId(String.valueOf(eventIndex++));
List> predictedThemesComb = new ArrayList>();
@@ -285,11 +293,14 @@ public String extractFromSingleFile(File file) {
for (List candidateThemes : combs
.getCombinations()) {
+ if (candidateThemes.size() > 3) {
+ continue;
+ }
Instance bindingInstance = bindingDict
.instanceToNumeric(bindingEventToInstance(
- jcas, sentence, event,
+ jcas, sentence, trigger,
candidateThemes,
- dependencyExtractor));
+ dependencyExtractor, false));
if (bindingRecogniser.predict(bindingInstance) == bindingDict
.getLabelNumeric("Binding")) {
predictedThemesComb.add(candidateThemes);
@@ -358,7 +369,7 @@ public String extractFromSingleFile(File file) {
false));
double prediction = themeRecogniser
- .predict(proteinInstance.getFeaturesNumeric());
+ .predict(proteinInstance.getFeaturesNumeric(), proteinInstance);
if (prediction == themeDict.getLabelNumeric("Theme")) {
@@ -399,7 +410,7 @@ public String extractFromSingleFile(File file) {
double prediction = themeRecogniser.predict(themeDict
.instanceToNumeric(triggerTokenInstance)
- .getFeaturesNumeric());
+ .getFeaturesNumeric(), triggerTokenInstance);
if (prediction == themeDict.getLabelNumeric("Theme")) {
@@ -436,7 +447,7 @@ public String extractFromSingleFile(File file) {
getThemeToken(jcas, event, sentence));
double prediction = causeRecogniser.predict(causeDict
.instanceToNumeric(proteinInstance)
- .getFeaturesNumeric());
+ .getFeaturesNumeric(), proteinInstance);
if (prediction == causeDict.getLabelNumeric(String
.valueOf("Cause"))) {
@@ -464,7 +475,7 @@ public String extractFromSingleFile(File file) {
dependencyExtractor, false, themeToken);
double prediction = causeRecogniser.predict(causeDict
.instanceToNumeric(causeEventInstance)
- .getFeaturesNumeric());
+ .getFeaturesNumeric(), causeEventInstance);
// if (event
// .getTrigger()
// .getEventType()
diff --git a/src/info/chenli/litway/bionlp13/ge/EventExtractorBind2.java b/src/info/chenli/litway/bionlp13/ge/EventExtractorBind2.java
new file mode 100644
index 0000000..1b23d9d
--- /dev/null
+++ b/src/info/chenli/litway/bionlp13/ge/EventExtractorBind2.java
@@ -0,0 +1,2295 @@
+package info.chenli.litway.bionlp13.ge;
+
+import info.chenli.classifier.Instance;
+import info.chenli.classifier.InstanceDictionary;
+import info.chenli.litway.corpora.Event;
+import info.chenli.litway.corpora.Protein;
+import info.chenli.litway.corpora.Sentence;
+import info.chenli.litway.corpora.Token;
+import info.chenli.litway.corpora.Trigger;
+import info.chenli.litway.util.Combinations;
+import info.chenli.litway.util.DependencyExtractor;
+import info.chenli.litway.util.FileFilterImpl;
+import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.StanfordDependencyReader;
+import info.chenli.litway.util.UimaUtil;
+import info.chenli.litway.util.StanfordDependencyReader.Pair;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import libsvm.svm;
+import libsvm.svm_model;
+import libsvm.svm_node;
+
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.StringArray;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.uimafit.util.JCasUtil;
+
+import de.bwaldvogel.liblinear.Feature;
+import de.bwaldvogel.liblinear.FeatureNode;
+
+
+public class EventExtractorBind2 extends TokenInstances {
+
+ private final static Logger logger = Logger.getLogger(EventExtractor.class
+ .getName());
+
+ private String classifierName = "liblinear";
+
+ public void train(File dir) {
+
+ if (!dir.isDirectory()) {
+ logger.info(dir.getAbsolutePath().concat(" is not a directory."));
+ }
+
+ //
+ // train trigger
+ //
+
+ //
+ // train theme
+ //
+
+ //
+ // train cause
+ //
+ }
+
+ public void extract(File file) throws IOException {
+
+ int[] perform = {0, 0, 0, 0};//tp, tn, fp, fn
+
+ if (file.isDirectory()) {
+
+ for (File f : file.listFiles(new FileFilterImpl(".txt"))) {
+
+ /*perform = extractFromSingleFile(f, perform);
+
+ for(int i:perform) {
+ System.out.println(i);
+ }*/
+
+ extract(f);
+ }
+
+
+
+ } else if (file.isFile()) {
+
+
+ logger.info("Extracting from ".concat(file.getName()));
+ String newFileName = "./result/".concat(
+ file.getName()
+ .substring(0, file.getName().lastIndexOf(".")))
+ .concat(".a2");
+ FileUtil.saveFile(extractFromSingleFile(file, perform),
+ new File(newFileName));
+ logger.info("Result saved in ".concat(newFileName));
+ }
+ }
+
+ /**
+ * Extract events from the given file.
+ *
+ * @param file the input article (.txt) file to extract events from
+ * @throws IOException if the article or its dependency (.sd/.sdepcc) files cannot be read
+ */
+ public String extractFromSingleFile(File file, int[] perform) throws IOException {
+
+ boolean test = true;
+
+ File word2vecFile = new File("./word2vec/word2vec100");
+ //File word2vecFile = new File("/home/songrq/word2vec/data/word2vec100");
+
+ Map word2vec = ReadWord2vec.word2vec(word2vecFile);
+
+ Map> triggers = new TreeMap>();
+ Map> events = new TreeMap>();
+ // Initialize the file
+ JCas jcas = this.processSingleFile(file);
+ int proteinNum = 0;
+ FSIterator proteinIter = jcas.getAnnotationIndex(
+ Protein.type).iterator();
+ while(proteinIter.hasNext()) {
+ Protein protein = (Protein) proteinIter.next();
+ String s = protein.getId().replace('T', '0');
+ proteinNum = proteinNum < Integer.valueOf(s) ? Integer.valueOf(s) : proteinNum;
+ }
+ Map> pairsOfArticle = new HashMap>();
+ if (new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) {
+ pairsOfArticle = StanfordDependencyReader
+ .getPairs(new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")));
+ } else {
+ pairsOfArticle = StanfordDependencyReader
+ .getPairs(new File(FileUtil.removeFileNameExtension(
+ UimaUtil.getJCasFilePath(jcas)).concat(".sd")));
+ }
+
+
+
+ //
+ // Initialize the classifiers
+ //
+
+ // trigger
+ TriggerRecogniser triggerRecogniser = new TriggerRecogniser();
+ InstanceDictionary triggerDict = new InstanceDictionary();
+ triggerRecogniser.loadModel(new File("./model/triggers.".concat(
+ classifierName).concat(".model")));
+ //String triggerModel = "./model/triggers.model";
+ triggerDict.loadDictionary(new File("./model/triggers.".concat(
+ classifierName).concat(".dict")));
+ if (test) {
+ triggerRecogniser.loadModel(new File("./model/triggers.train.devel.".concat(
+ classifierName).concat(".model")));
+ triggerDict.loadDictionary(new File("./model/triggers.train.devel.".concat(
+ classifierName).concat(".dict")));
+
+ }else {
+ triggerRecogniser.loadModel(new File("./model/triggers.".concat(
+ classifierName).concat(".model")));
+ triggerDict.loadDictionary(new File("./model/triggers.".concat(
+ classifierName).concat(".dict")));
+ }
+ // argument
+ ArgumentRecogniser argumentRecogniser = new ArgumentRecogniser();
+ InstanceDictionary argumentDict = new InstanceDictionary();
+ if (test) {
+ argumentRecogniser.loadModel(new File("./model/arguments.train.devel.".concat(
+ classifierName).concat(".model")));
+ argumentDict.loadDictionary(new File("./model/arguments.train.devel.".concat(
+ classifierName).concat(".dict")));
+ }else {
+ argumentRecogniser.loadModel(new File("./model/arguments.".concat(
+ classifierName).concat(".model")));
+ argumentDict.loadDictionary(new File("./model/arguments.".concat(
+ classifierName).concat(".dict")));
+ }
+ /*ArgumentRecogniser reguArgumentRecogniser = new ArgumentRecogniser();
+ reguArgumentRecogniser.loadModel(new File("./model/reguArguments.".concat(
+ classifierName).concat(".model")));
+ InstanceDictionary reguArgumentDict = new InstanceDictionary();
+ reguArgumentDict.loadDictionary(new File("./model/reguArguments.".concat(
+ classifierName).concat(".dict")));*/
+ /*TriggerArgumentRecogniser triggerArgumentRecogniser = new TriggerArgumentRecogniser();
+ triggerArgumentRecogniser.loadModel(new File("./model/triggerArguments.".concat(
+ classifierName).concat(".model")));
+ InstanceDictionary triggerArgumentDict = new InstanceDictionary();
+ triggerArgumentDict.loadDictionary(new File("./model/triggerArguments.".concat(
+ classifierName).concat(".dict")));*/
+
+ // binding
+ BindingRecogniser bindingRecogniser = new BindingRecogniser();
+ InstanceDictionary bindingDict = new InstanceDictionary();
+ if (test) {
+ bindingRecogniser.loadModel(new File("./model/bindings.train.devel.".concat(
+ classifierName).concat(".model")));
+ bindingDict.loadDictionary(new File("./model/bindings.train.devel.".concat(
+ classifierName).concat(".dict")));
+ }else {
+ bindingRecogniser.loadModel(new File("./model/bindings.".concat(
+ classifierName).concat(".model")));
+ bindingDict.loadDictionary(new File("./model/bindings.".concat(
+ classifierName).concat(".dict")));
+ }
+
+
+ // Initialize the iterator and counter
+ FSIterator sentenceIter = jcas.getAnnotationIndex(
+ Sentence.type).iterator();
+ int eventIndex = 1;
+
+ while (sentenceIter.hasNext()) {
+
+ Sentence sentence = (Sentence) sentenceIter.next();
+ Set pairsOfSentence = pairsOfArticle.get(sentence.getId());
+
+ // protein
+ List sentenceProteins = JCasUtil.selectCovered(jcas,
+ Protein.class, sentence);
+
+ if (sentenceProteins.size() <= 0) {
+ continue;
+ }
+ //
+ // trigger detection
+ //
+ List tokens = JCasUtil.selectCovered(jcas, Token.class,
+ sentence);
+
+ DependencyExtractor dependencyExtractor = new DependencyExtractor(
+ tokens, pairsOfSentence);
+
+ if (null == triggers.get(sentence.getId())) {
+ triggers.put(sentence.getId(), new ArrayList());
+ }
+
+ Map triggerId = new HashMap();
+
+ for (Token token : tokens) {
+ if(isProtein(token, sentenceProteins)) {
+ continue;
+ }
+
+ int tokenBegin = token.getBegin();
+ int tokenEnd = token.getEnd();
+ token = containsProtein(token, sentenceProteins);
+ /*if (shouldDelete(jcas, token, sentenceProteins)) {
+ continue;
+ }*/
+
+ Instance tokenInstance = tokenToInstance(jcas, token, null,
+ tokens, sentenceProteins, pairsOfSentence,
+ dependencyExtractor, word2vec);
+ if (tokenInstance != null) {
+ tokenInstance = triggerDict.instanceToNumeric(tokenInstance);
+ int prediction = triggerRecogniser.predict(tokenInstance);
+ //int[] featureSparseVector = tokenInstance.getFeaturesNumeric();
+
+ //int prediction = this.predict2(featureSparseVector, tokenInstance, triggerModel);
+
+ /*String temp = shouldChange(jcas, token, sentenceProteins);
+ if (!temp.equals("Non_trigger")) {
+ prediction = triggerDict.getLabelNumeric(temp);
+ }*/
+ if (prediction != triggerDict.getLabelNumeric(String
+ .valueOf(EventType.Non_trigger))) {
+
+ Trigger trigger = new Trigger(jcas, token.getBegin(),
+ token.getEnd());
+ trigger.setEventType(triggerDict.getLabelString(prediction));
+ trigger.setId("T".concat(String.valueOf(++proteinNum)));
+ triggers.get(sentence.getId()).add(trigger);
+ triggerId.put(trigger.getId(), trigger);
+ }
+ }
+
+ token.setBegin(tokenBegin);
+ token.setEnd(tokenEnd);
+ }
+
+ //
+ // argument assignment
+ //
+
+ // 1. iterate through all proteins
+ if (null == events.get(sentence.getId())) {
+ events.put(sentence.getId(), new LinkedList());
+ }
+ Set sameToken = new HashSet();
+
+ Map> eventArg = new HashMap>();
+ Map> triggerEvents = new HashMap>();
+ Map> triggerCauses = new HashMap>();
+
+ Set andProtein = getAndProtein(jcas, sentenceProteins, dependencyExtractor);
+
+ for (Trigger trigger : triggers.get(sentence.getId())) {
+ for (Protein protein : sentenceProteins) {
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token proteinToken = getToken(jcas, protein);
+ boolean areSameTokens = (proteinToken.getId() == triggerToken.getId());
+ if (areSameTokens) {
+ sameToken.add(protein.getId());
+ }
+ }
+ }
+ for (Trigger trigger : triggers.get(sentence.getId())) {
+ Set triggerEvent = new HashSet();
+ Set triggerCause = new HashSet();
+
+ if (EventType.isSimpleEvent(trigger.getEventType())) {
+ for (Protein protein : sentenceProteins) {
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token proteinToken = getToken(jcas, protein);
+ boolean areSameTokens = (proteinToken.getId() == triggerToken.getId());
+ Instance proteinInstance = argumentToInstance(jcas,
+ sentence, protein, trigger, pairsOfSentence,
+ dependencyExtractor, false, false, Stage.THEME);
+ if ( proteinInstance != null) {
+ double prediction = argumentRecogniser.predict(argumentDict
+ .instanceToNumeric(proteinInstance)
+ .getFeaturesNumeric(), proteinInstance);
+ if (areSameTokens) {
+ prediction = argumentDict.getLabelNumeric("Theme");
+ }
+ if (prediction == argumentDict.getLabelNumeric("Theme")) {
+ Event event = new Event(jcas);
+ event.setId(String.valueOf(eventIndex++));
+ event.setTrigger(trigger);
+ StringArray themes = new StringArray(jcas, 1);
+ themes.set(0, protein.getId());
+ event.setThemes(themes);
+ events.get(sentence.getId()).add(event);
+ triggerEvent.add(event);
+
+ Argument arg = new Argument();
+ arg.setId(protein.getId());
+ arg.setRelation("Theme");
+ List args = new LinkedList();
+ args.add(arg);
+ eventArg.put("E".concat(event.getId()), args);
+ }
+ }
+ }
+ } else if (EventType.isBindingEvent(trigger.getEventType())) {
+ List> predictedThemesComb = new ArrayList>();
+ Set farProtein = getFarProtein( jcas, trigger, sentenceProteins, dependencyExtractor);
+ Set notTheme = getNotProtein( jcas, trigger, sentenceProteins, dependencyExtractor);
+ for (Protein protein : sentenceProteins) {
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token proteinToken = getToken(jcas, protein);
+ boolean areSameTokens = (proteinToken.getId() == triggerToken.getId());
+
+ Instance proteinInstance = argumentToInstance(jcas,
+ sentence, protein, trigger, pairsOfSentence,
+ dependencyExtractor, false, false, Stage.THEME);
+ if ( proteinInstance != null) {
+ double prediction = argumentRecogniser.predict(argumentDict
+ .instanceToNumeric(proteinInstance)
+ .getFeaturesNumeric(), proteinInstance);
+ if (areSameTokens) {
+ prediction = argumentDict.getLabelNumeric("Theme");
+ }
+ if (prediction != argumentDict.getLabelNumeric("Theme")) {
+ notTheme.add(protein.getId());
+ }
+ }
+ }
+ List proteins = new LinkedList();
+ for (Protein protein : sentenceProteins) {
+ if (!notTheme.contains(protein.getId()) && !farProtein.contains(protein.getId())) {
+ proteins.add(protein);
+ }
+ }
+ if (proteins.size() == 1) {
+ /*Instance bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, proteins, dependencyExtractor, false);
+ double prediction = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token proteinToken = getToken(jcas, proteins.get(0));
+ boolean areSameTokens = (proteinToken.getId() == triggerToken.getId());
+ if (areSameTokens) {
+ prediction = bindingDict.getLabelNumeric("Binding");
+ }
+ if (prediction == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(proteins);
+ //}
+ } else if (proteins.size() > 1) {
+ Combinations combs = new Combinations(
+ proteins);
+ loop : for (List themes : combs.getCombinations()) {
+ boolean truepositive = false;
+ if (themes.size() != 2) {
+ continue;
+ }
+ /*for (Protein p : themes) {
+ if (farProtein.contains(p.getId())) {
+ continue loop;
+ }
+ if (notTheme.contains(p.getId())) {
+ continue loop;
+ }
+ }*/
+ int num = 0;
+ for (Protein p : themes) {
+ if (andProtein.contains(p.getId())) {
+ num++;
+ if (num > 1) {
+ List theme = new LinkedList();
+ theme.add(themes.get(0));
+ /*Instance bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, theme, dependencyExtractor, truepositive);
+ double prediction = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ if (prediction == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(theme);
+ //}
+
+ theme.remove(0);
+ theme.add(themes.get(1));
+ /*bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, theme, dependencyExtractor, truepositive);
+ prediction = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ if (prediction == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(theme);
+ //}
+
+ continue loop;
+ }
+ }
+ }
+ for(Protein p : themes) {
+ for(Protein p2 : themes) {
+ if (p.getId().equalsIgnoreCase(p2.getId())) {
+ continue;
+ }
+ if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText())
+ || p.getCoveredText().contains(p2.getCoveredText())
+ || p2.getCoveredText().contains(p.getCoveredText())) {
+ List theme = new LinkedList();
+ theme.add(themes.get(0));
+ /*Instance bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, theme, dependencyExtractor, truepositive);
+ double prediction = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ if (prediction == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(theme);
+ //}
+
+ theme.remove(0);
+ theme.add(themes.get(1));
+ /*bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, theme, dependencyExtractor, truepositive);
+ prediction = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ if (prediction == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(theme);
+ //}
+ continue loop;
+ }
+ }
+ }
+ Instance instance = bindingEventToInstance(jcas,
+ sentence, trigger, themes, dependencyExtractor, truepositive);
+ double prediction = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(instance)
+ .getFeaturesNumeric(), instance);
+ if (prediction == bindingDict
+ .getLabelNumeric("Binding")) {
+ List theme0 = new LinkedList();
+ theme0.add(themes.get(0));
+ List theme1 = new LinkedList();
+ theme1.add(themes.get(1));
+ Instance bindInstance0 = bindingEventToInstance(jcas,
+ sentence, trigger, theme0, dependencyExtractor, truepositive);
+ Instance bindInstance1 = bindingEventToInstance(jcas,
+ sentence, trigger, theme1, dependencyExtractor, truepositive);
+
+ double prediction0 = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindInstance0)
+ .getFeaturesNumeric(), bindInstance0);
+ double prediction1 = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindInstance1)
+ .getFeaturesNumeric(), bindInstance1);
+ double predictionValue1 = bindingRecogniser.predict_values(bindingDict
+ .instanceToNumeric(bindingEventToInstance(jcas,
+ sentence, trigger, theme1, dependencyExtractor, truepositive))
+ .getFeaturesNumeric());
+ double predictionValue0 = bindingRecogniser.predict_values(bindingDict
+ .instanceToNumeric(bindingEventToInstance(jcas,
+ sentence, trigger, theme0, dependencyExtractor, truepositive))
+ .getFeaturesNumeric());
+
+ if (prediction0 == bindingDict.getLabelNumeric("Binding")
+ && prediction1 == bindingDict.getLabelNumeric("Binding")) {
+ prediction = bindingRecogniser.predict_values(bindingDict
+ .instanceToNumeric(bindingEventToInstance(jcas,
+ sentence, trigger, themes, dependencyExtractor, truepositive))
+ .getFeaturesNumeric());
+ if (predictionValue1 > prediction && predictionValue0 > prediction) {
+ predictedThemesComb.add(theme0);
+ predictedThemesComb.add(theme1);
+ } /*else if (prediction0 < prediction && prediction1 < prediction) {
+ predictedThemesComb.add(themes);
+ }*/ else {
+ predictedThemesComb.add(themes);
+ }
+ } else {
+ predictedThemesComb.add(themes);
+ }
+ } else {
+ List theme = new LinkedList();
+ theme.add(themes.get(0));
+ /*Instance bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, theme, dependencyExtractor, truepositive);
+ double prediction0 = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ if (prediction0 == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(theme);
+ //}
+
+ theme.remove(0);
+ theme.add(themes.get(1));
+ /*bindingInstance = bindingEventToInstance(jcas,
+ sentence, trigger, theme, dependencyExtractor, truepositive);
+ prediction0 = bindingRecogniser.predict(bindingDict
+ .instanceToNumeric(bindingInstance)
+ .getFeaturesNumeric());
+ if (prediction0 == bindingDict
+ .getLabelNumeric("Binding")) {*/
+ predictedThemesComb.add(theme);
+ //}
+ }
+ }
+ }
+ // remove theme combinations that are fully contained in another predicted combination
+ List> checkedThemesComb = new ArrayList>();
+ checkingTheme: for (List beingCheckedThemes : predictedThemesComb) {
+ if (checkedThemesComb.contains(beingCheckedThemes)) {
+ continue;
+ } else {
+
+ List> copy = new ArrayList>(
+ checkedThemesComb);
+ for (List checkedThemes : copy) {
+
+ if (checkedThemes
+ .containsAll(beingCheckedThemes)) {
+ continue checkingTheme;
+ } else if (beingCheckedThemes
+ .containsAll(checkedThemes)) {
+ checkedThemesComb.remove(checkedThemes);
+ }
+ }
+ checkedThemesComb.add(beingCheckedThemes);
+ }
+ }
+
+ for (List predictedThemes : checkedThemesComb) {
+ if (checkedThemesComb.size() > andProtein.size() + 2
+ && predictedThemes.size() == 2
+ && checkedThemesComb.size() >= 10) {
+ break;
+ }
+ Event newBindingEvent = new Event(jcas);
+ newBindingEvent.setTrigger(trigger);
+ newBindingEvent.setId(String.valueOf(eventIndex++));
+ StringArray eventThemes = new StringArray(jcas,
+ predictedThemes.size());
+ List args = new LinkedList();
+ for (Protein theme : predictedThemes) {
+ eventThemes.set(predictedThemes.indexOf(theme),
+ theme.getId());
+ Argument arg = new Argument();
+ arg.setId(theme.getId());
+ arg.setRelation("Theme");
+ args.add(arg);
+ }
+ eventArg.put("E".concat(newBindingEvent.getId()), args);
+
+ newBindingEvent.setThemes(eventThemes);
+ events.get(sentence.getId()).add(newBindingEvent);
+ triggerEvent.add(newBindingEvent);
+ }
+ } else if (EventType.isComplexEvent(trigger.getEventType())
+ && !EventType.isRegulatoryEvent(trigger.getEventType())) {
+
+ for (Protein protein : sentenceProteins) {
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token proteinToken = getToken(jcas, protein);
+ boolean areSameTokens = (proteinToken.getId() == triggerToken.getId());
+
+ Instance proteinInstance = argumentDict
+ .instanceToNumeric(argumentToInstance(jcas,
+ sentence, protein, trigger,
+ pairsOfSentence, dependencyExtractor,
+ false, false, Stage.THEME));
+ if ( proteinInstance != null) {
+ double prediction = argumentRecogniser
+ .predict(proteinInstance.getFeaturesNumeric(), proteinInstance);
+ if (areSameTokens) {
+ prediction = argumentDict.getLabelNumeric("Theme");
+ }
+ if (prediction == argumentDict.getLabelNumeric("Theme")) {
+
+ Event event = new Event(jcas);
+ event.setId(String.valueOf(eventIndex++));
+ event.setTrigger(trigger);
+ StringArray themes = new StringArray(jcas, 1);
+ themes.set(0, protein.getId());
+ event.setThemes(themes);
+ events.get(sentence.getId()).add(event);
+ triggerEvent.add(event);
+
+ Argument arg = new Argument();
+ arg.setId(protein.getId());
+ arg.setRelation("Theme");
+ List args = new LinkedList();
+ args.add(arg);
+ eventArg.put("E".concat(event.getId()), args);
+ }else if(prediction == argumentDict.getLabelNumeric("Cause")) {
+ if (sameToken.contains(protein.getId())) {
+ continue;
+ }
+ triggerCause.add(protein.getId());
+ }
+ }
+ }
+ } else if (EventType.isRegulatoryEvent(trigger.getEventType())) {
+
+ for (Protein protein : sentenceProteins) {
+ Token triggerToken = getTriggerToken(jcas, trigger);
+ Token proteinToken = getToken(jcas, protein);
+ boolean areSameTokens = (proteinToken.getId() == triggerToken.getId());
+ if (!areSameTokens && sameToken.contains(protein.getId())) {
+ continue;
+ }
+
+ Instance proteinInstance = argumentDict
+ .instanceToNumeric(argumentToInstance(jcas,
+ sentence, protein, trigger,
+ pairsOfSentence, dependencyExtractor,
+ false, false, Stage.THEME));
+ if ( proteinInstance != null) {
+ double prediction = argumentRecogniser
+ .predict(proteinInstance.getFeaturesNumeric(), proteinInstance);
+
+ if (prediction == argumentDict.getLabelNumeric("Theme")) {
+ Event event = new Event(jcas);
+ event.setId(String.valueOf(eventIndex++));
+ event.setTrigger(trigger);
+ StringArray themes = new StringArray(jcas, 1);
+ themes.set(0, protein.getId());
+ event.setThemes(themes);
+ events.get(sentence.getId()).add(event);
+ triggerEvent.add(event);
+
+ Argument arg = new Argument();
+ arg.setId(protein.getId());
+ arg.setRelation("Theme");
+ List args = new LinkedList();
+ args.add(arg);
+ eventArg.put("E".concat(event.getId()), args);
+
+ }else if(prediction == argumentDict.getLabelNumeric("Cause")) {
+ triggerCause.add(protein.getId());
+ }
+ }
+ }
+ }
+ triggerEvents.put(trigger.getId(), triggerEvent);
+ triggerCauses.put(trigger.getId(), triggerCause);
+ }
+
+
+ // 2. check each discovered trigger to see whether it can serve as an
+ // argument (Theme/Cause) of another complex event's trigger
+ Map> arguments = new TreeMap>();
+ for (Trigger argumentTrigger : triggers.get(sentence.getId())) {
+ loop : for (Trigger trigger : triggers.get(sentence.getId())) {
+ if (!EventType.isComplexEvent(trigger.getEventType())) {
+ continue;
+ }
+
+ if (argumentTrigger.getBegin() == trigger.getBegin()) {
+ continue;
+ }
+
+ Instance triggerTokenInstance = argumentToInstance(jcas,
+ sentence, argumentTrigger, trigger,
+ pairsOfSentence, dependencyExtractor, false, false, Stage.THEME);
+
+ double prediction = argumentRecogniser.predict(argumentDict
+ .instanceToNumeric(triggerTokenInstance)
+ .getFeaturesNumeric(), triggerTokenInstance);
+
+ if (prediction == argumentDict.getLabelNumeric("Non_Argument")) {
+ continue;
+ }
+
+ if (arguments.containsKey(argumentTrigger.getId())) {
+ Set removeArg = new HashSet(arguments.get(argumentTrigger.getId()));
+ for (Argument arg : arguments.get(argumentTrigger.getId())) {
+ if (arg.getId().equals(trigger.getId())) {
+ double prediction0 = argumentRecogniser.predict_values(argumentDict
+ .instanceToNumeric(argumentToInstance(jcas,
+ sentence, argumentTrigger, trigger,
+ pairsOfSentence, dependencyExtractor, false, false, Stage.THEME))
+ .getFeaturesNumeric());
+ double prediction1 = argumentRecogniser.predict_values(argumentDict
+ .instanceToNumeric(argumentToInstance(jcas,
+ sentence, trigger, argumentTrigger,
+ pairsOfSentence, dependencyExtractor, false, false, Stage.THEME))
+ .getFeaturesNumeric());
+ if (prediction0 < prediction1) {
+ continue loop;
+ }else {
+ removeArg.remove(arg);
+ }
+ }
+ }
+ arguments.put(argumentTrigger.getId(), removeArg);
+ }
+
+ Set tris = arguments.keySet();
+ if (prediction == argumentDict.getLabelNumeric("Cause")) {
+ Set args = new HashSet();
+ if (tris.contains(trigger.getId())) {
+ args = arguments.get(trigger.getId());
+ }
+
+ Argument arg = new Argument();
+ arg.setId(argumentTrigger.getId());
+ arg.setRelation("Cause");
+ args.add(arg);
+ arguments.put(trigger.getId(), args) ;
+ }else if (prediction == argumentDict.getLabelNumeric("Theme")) {
+
+ if (EventType.isRegulatoryEvent(trigger.getEventType())) {
+
+ Set args = new HashSet();
+ if (tris.contains(trigger.getId())) {
+ args = arguments.get(trigger.getId());
+ }
+
+ Argument arg = new Argument();
+ arg.setId(argumentTrigger.getId());
+ arg.setRelation("Theme");
+ args.add(arg);
+ arguments.put(trigger.getId(), args) ;
+ }
+ }
+ }
+ }
+ // 3. build new events from the trigger/argument relations collected above,
+ // starting with the Theme arguments
+ Map> newtriggerEvents = new HashMap