diff --git a/README b/README
index 0a1bc02..a31b1cd 100644
--- a/README
+++ b/README
@@ -10,3 +10,29 @@
 src - The source code.
 LISENSE - The license of LitWay.
 README - This file.
+
+
+How to run the program
+
+1. Build path: add the required libraries ("ee", "uima") via
+   Build Path -> Add Libraries.
+
+2. build.xml: Run As -> Ant Build... -> Refresh: "The entire workspace".
+   The build output is "./dist/litway-0.8.jar"; add the jar to the build
+   path (Build Path -> Add to Build Path).
+
+3. Training (package info.chenli.litway.bionlp13.ge); each class takes the
+   absolute path of the training data as its only argument:
+   (1) TriggerRecogniser.java
+   (2) BindingRecogniser.java
+   (3) ArgumentRecogniser.java
+
+   NB: the word2vec file is located at "./word2vec/word2vec100".
+
+4. Testing (package info.chenli.litway.bionlp13.ge): run
+   EventExtractorBind2.java with the absolute path of the test data as its
+   only argument.
+
+5. The results are written to "./result/".
+
+
diff --git a/conf/config.xsd b/conf/config.xsd
new file mode 100644
index 0000000..6081039
--- /dev/null
+++ b/conf/config.xsd
@@ -0,0 +1,57 @@
+
+
+
+ * Filename : config.xsd
+ * Description: XML Schema for BioNLP
+ * Author(s) : Chen Li
+ * Revision : $Id: config.xsd 8275 2013-10-16 16:28:25Z $
+ * $HeadURL: https://github.com/li-chen/ee $
+ *
+ * Copyright 2013 Chen Li
+ *
+ * This software is licensed according to the terms described in the file
+ named "LICENSE.txt" included with this distribution and available
+ online at https://github.com/li-chen/ee/LICENSE.txt
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/conf/config_cg.xml b/conf/config_cg.xml
new file mode 100644
index 0000000..55d0d6b
--- /dev/null
+++ b/conf/config_cg.xml
@@ -0,0 +1,730 @@
+
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_entity
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+
+ Acetylation
+
+
+ Gene_or_gene_product
+
+
+
+
+ Amino_acid_catabolism
+
+
+ Gene_or_gene_product
+ Simple_chemical
+
+
+
+
+ Binding
+
+
+ Amino_acid
+ Anatomical_system
+ Cancer
+ Cell
+ Cellular_component
+ Developing_anatomical_structure
+ Gene_or_gene_product
+ Immaterial_anatomical_theme
+ Multi-tissue_structure
+ Organ
+ Organism
+ Organism_subdivision
+ Organism_substance
+ Pathological_formation
+ Simple_chemical
+ Tissue
+
+
+ Protein_domain_or_region
+ DNA_domain_or_region
+
+
+
+
+ Blood_vessel_development
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+
+
+
+ Breakdown
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Carcinogenesis
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Catabolism
+
+
+ Gene_or_gene_product
+ Simple_chemical
+
+
+
+
+ Cell_death
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Cell_differentiation
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+ Multi-tissue_structure
+ Tissue
+ Developing_anatomical_structure
+ Cell
+ Cellular_component
+ Cancer
+
+
+
+
+ Cell_division
+
+
+ Organism_subdivision
+ Anatomical_system
+ Organ
+
Multi-tissue_structure + Tissue + Developing_anatomical_structure + Cell + Cellular_component + Cancer + + + + + Cell_proliferation + + + Organism_subdivision + Anatomical_system + Organ + Multi-tissue_structure + Tissue + Developing_anatomical_structure + Cell + Cellular_component + Cancer + + + + + Cell_transformation + + + Organism_subdivision + Anatomical_system + Organ + Multi-tissue_structure + Tissue + Developing_anatomical_structure + Cell + Cellular_component + Cancer + + + + + DNA_demethylation + + + Gene_or_gene_product + + + + + DNA_methylation + + + Gene_or_gene_product + + + + + Death + + + Organism_subdivision + Anatomical_system + Organ + Multi-tissue_structure + Tissue + Developing_anatomical_structure + Cell + Cellular_component + Cancer + + + + + Dephosphorylation + + + Gene_or_gene_product + + + + + Development + + + Gene_or_gene_product + + + + + Dissociation + + + Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + + + + + Gene_expression + + + Gene_or_gene_product + + + + + Glycolysis + + + + + + + + Glycosylation + + + + + + + + Growth + + + Organism_subdivision + Anatomical_system + Organ + Multi-tissue_structure + Tissue + Developing_anatomical_structure + Cell + Cellular_component + Cancer + + + + + Infection + + + + + + + + Localization + + + Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + + + + + Metabolism + + + Gene_or_gene_product + Simple_chemical + + + + + Metastasis + + + Organism_subdivision + Anatomical_system + Organ + Multi-tissue_structure + Tissue + Developing_anatomical_structure + Cell + Cellular_component + Cancer + + + + + Mutation + + + Gene_or_gene_product + + + + + Negative_regulation + + + Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + Acetylation + Amino_acid_catabolism + Binding + Blood_vessel_development + Breakdown + Carcinogenesis + Catabolism + Cell_death + Cell_differentiation + Cell_division + Cell_proliferation + Cell_transformation + DNA_demethylation + DNA_domain_or_region + DNA_methylation + Death + Dephosphorylation + Development + Dissociation + Gene_expression + Glycolysis + Glycosylation + Growth + Infection + Localization + Metabolism + Metastasis + Mutation + Negative_regulation + Pathway + Phosphorylation + Planned_process + Positive_regulation + Protein_domain_or_region + Protein_processing + Regulation + Remodeling + Reproduction + Synthesis + Transcription + Translation + Ubiquitination + + + + + Pathway + + + Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + + + + + Phosphorylation + + + Gene_or_gene_product + + + + + Planned_process + + + 
Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + Acetylation + Amino_acid_catabolism + Binding + Blood_vessel_development + Breakdown + Carcinogenesis + Catabolism + Cell_death + Cell_differentiation + Cell_division + Cell_proliferation + Cell_transformation + DNA_demethylation + DNA_domain_or_region + DNA_methylation + Death + Dephosphorylation + Development + Dissociation + Gene_expression + Glycolysis + Glycosylation + Growth + Infection + Localization + Metabolism + Metastasis + Mutation + Negative_regulation + Pathway + Phosphorylation + Planned_process + Positive_regulation + Protein_domain_or_region + Protein_processing + Regulation + Remodeling + Reproduction + Synthesis + Transcription + Translation + Ubiquitination + + + + + Positive_regulation + + + Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + Acetylation + Amino_acid_catabolism + Binding + Blood_vessel_development + Breakdown + Carcinogenesis + Catabolism + Cell_death + Cell_differentiation + Cell_division + Cell_proliferation + Cell_transformation + DNA_demethylation + DNA_domain_or_region + DNA_methylation + Death + Dephosphorylation + Development + Dissociation + Gene_expression + Glycolysis + Glycosylation + Growth + Infection + Localization + Metabolism + Metastasis + Mutation + Negative_regulation + Pathway + Phosphorylation + Planned_process + Positive_regulation + Protein_domain_or_region + Protein_processing + Regulation + Remodeling + Reproduction + Synthesis + Transcription + Translation + Ubiquitination + + + + + Protein_processing + + + Gene_or_gene_product + + + + + Regulation + + + Amino_acid + Anatomical_system + Cancer + Cell + Cellular_component + Developing_anatomical_structure + Gene_or_gene_product + Immaterial_anatomical_theme + Multi-tissue_structure + Organ + Organism + Organism_subdivision + Organism_substance + Pathological_formation + Simple_chemical + Tissue + Acetylation + Amino_acid_catabolism + Binding + Blood_vessel_development + Breakdown + Carcinogenesis + Catabolism + Cell_death + Cell_differentiation + Cell_division + Cell_proliferation + Cell_transformation + DNA_demethylation + DNA_domain_or_region + DNA_methylation + Death + Dephosphorylation + Development + Dissociation + Gene_expression + Glycolysis + Glycosylation + Growth + Infection + Localization + Metabolism + Metastasis + Mutation + Negative_regulation + Pathway + Phosphorylation + Planned_process + Positive_regulation + Protein_domain_or_region + Protein_processing + Regulation + Remodeling + Reproduction + Synthesis + Transcription + Translation + Ubiquitination + + + + + Remodeling + + + Tissue + + + + + Reproduction + + + Organism + + + + + Synthesis + + + Simple_chemical + + + + + Transcription + + + Gene_or_gene_product + + + + + Translation + + + Gene_or_gene_product + + + + + Ubiquitination + + + Gene_or_gene_product + + + + + + diff --git a/conf/config_ge.xml b/conf/config_ge.xml new file mode 100644 index 0000000..02c35b3 --- /dev/null +++ b/conf/config_ge.xml @@ -0,0 +1,237 @@ + + + + Protein + + + + Gene_expression + + + + + + 
Transcription + + + + + + Protein_catabolism + + + + + + Localization + + + + + + Binding + + + + + + Protein_modification + + + + + + + + + + + + + + + + + + + + Phosphorylation + + + + + + + + + + + + + + + + + + + + Ubiquitination + + + + + + + + + + + + + + + + + + + + Acetylation + + + + + + + + + + + + + + + + + + + + Deacetylation + + + + + + + + + + + + + + + + + + + + Regulation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Positive_regulation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Negative_regulation + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/conf/config_pc.xml b/conf/config_pc.xml new file mode 100644 index 0000000..a73725d --- /dev/null +++ b/conf/config_pc.xml @@ -0,0 +1,543 @@ + + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + + Acetylation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Activation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + + + Binding + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Complex + + + + + Conversion + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + + + Deacetylation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Degradation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + + + Demethylation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Dephosphorylation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Deubiquitination + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Dissociation + + + Complex + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + + + Gene_expression + + + Gene_or_gene_product + + + + + Hydroxylation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Inactivation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + + + Localization + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Cellular_component + Cellular_component + Cellular_component + + + + + Methylation + + + Simple_chemical + Gene_or_gene_product + Complex + 
Cellular_component + + + Simple_chemical + + + + + Negative_regulation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + + + Pathway + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + + + Phosphorylation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + Positive_regulation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + + + Regulation + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + Acetylation + Activation + Binding + Conversion + Deacetylation + Degradation + Demethylation + Dephosphorylation + Deubiquitination + Dissociation + Gene_expression + Hydroxylation + Inactivation + Localization + Methylation + Negative_regulation + Pathway + Phosphorylation + Positive_regulation + Regulation + Transcription + Translation + Transport + Ubiquitination + + + + + Transcription + + + Gene_or_gene_product + + + + + Translation + + + Gene_or_gene_product + + + + + Transport + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Cellular_component + Cellular_component + + + + + Ubiquitination + + + Simple_chemical + Gene_or_gene_product + Complex + Cellular_component + + + Simple_chemical + + + + + + diff --git a/src/info/chenli/classifier/AbstractClassifier.java b/src/info/chenli/classifier/AbstractClassifier.java index 004368d..0718933 100644 
--- a/src/info/chenli/classifier/AbstractClassifier.java
+++ b/src/info/chenli/classifier/AbstractClassifier.java
@@ -31,7 +31,7 @@ public void train(List<Instance> trainingInstances, int trainingRound) {
 
 	public int predict(Instance instance) {
 
-		return predict(instance.getFeaturesNumeric());
+		return predict(instance.getFeaturesNumeric(), instance);
 	}
 
 	/**
@@ -40,7 +40,7 @@ public int predict(Instance instance) {
	 * @param featureVector
	 * @return The predicted label.
	 */
-	public abstract int predict(int[] featureSparseVector);
+	public abstract int predict(int[] featureSparseVector, Instance instance);
 
 	public abstract String modelToString();
 
diff --git a/src/info/chenli/classifier/Instance.java b/src/info/chenli/classifier/Instance.java
index f9e6b68..0172abd 100644
--- a/src/info/chenli/classifier/Instance.java
+++ b/src/info/chenli/classifier/Instance.java
@@ -5,12 +5,49 @@
 public class Instance {
 
 	// instance ID
+	public boolean isReference;
+	public int tokenId;
+	public int sentenceId;
+	public String fileId;
 	private String id;
 	private int label;
 	private String labelString;
 	private List<String[]> featuresString;
 	private int[] featuresNumeric = null;
+	private double[] featuresNumericWord2vec = null;
+
+	public boolean getIsReference() {
+		return this.isReference;
+	}
+
+	public void setIsReference(boolean isReference) {
+		this.isReference = isReference;
+	}
+
+	public int getTokenId() {
+		return this.tokenId;
+	}
+
+	public void setTokenId(int tokenId) {
+		this.tokenId = tokenId;
+	}
+
+	public int getSentenceId() {
+		return this.sentenceId;
+	}
+	public void setSentenceId(int sentenceId) {
+		this.sentenceId = sentenceId;
+	}
+
+	public String getFileId() {
+		return this.fileId;
+	}
+
+	public void setFileId(String fileId) {
+		this.fileId = fileId;
+	}
+
 	public String getId() {
 		return this.id;
 	}
@@ -34,6 +71,14 @@ public int[] getFeaturesNumeric() {
 	public void setFeaturesNumeric(int[] featuresNumeric) {
 		this.featuresNumeric = featuresNumeric;
 	}
+
+	public double[] getFeaturesNumericWord2vec() {
+		return featuresNumericWord2vec;
+	}
+
+	public void setFeaturesNumericWord2vec(double[] featuresNumericWord2vec) {
+		this.featuresNumericWord2vec = featuresNumericWord2vec;
+	}
 
 	public String getLabelString() {
 		return labelString;
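Note: the Instance class above now carries an optional dense word2vec vector
next to the sparse indicator features, and the LibLinearFacade diff below
consumes it by giving the dense values liblinear indices 1..d and shifting
every sparse index up by d. A minimal sketch of that layout follows; the
class and method names are illustrative only (not part of the patch), and it
assumes the sparse indices arrive in ascending order, as liblinear requires:

	import java.util.ArrayList;
	import java.util.List;

	import de.bwaldvogel.liblinear.Feature;
	import de.bwaldvogel.liblinear.FeatureNode;

	public class FeatureLayoutSketch {

		/** Dense word2vec block first (indices 1..d), then the shifted sparse block. */
		public static Feature[] build(double[] word2vec, int[] sparseIndices) {
			List<Feature> nodes = new ArrayList<Feature>();
			int offset = (word2vec == null) ? 0 : word2vec.length;
			if (word2vec != null) {
				for (int m = 0; m < word2vec.length; m++) {
					// real-valued features, 1-based as liblinear expects
					nodes.add(new FeatureNode(m + 1, word2vec[m]));
				}
			}
			for (int index : sparseIndices) {
				// binary indicators, moved past the dense block so the
				// two feature spaces cannot collide
				nodes.add(new FeatureNode(offset + index, 1));
			}
			return nodes.toArray(new Feature[nodes.size()]);
		}
	}

The patched train() and predict() additionally skip repeated sparse indices
via previousIndex and append a bias node built from model.getBias(); both
details are omitted here for brevity.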
diff --git a/src/info/chenli/classifier/LibLinearFacade.java b/src/info/chenli/classifier/LibLinearFacade.java
index 8b88645..c3a8fde 100644
--- a/src/info/chenli/classifier/LibLinearFacade.java
+++ b/src/info/chenli/classifier/LibLinearFacade.java
@@ -24,7 +24,7 @@ public class LibLinearFacade extends AbstractClassifier {
 	private final static Logger logger = Logger.getLogger(LibLinearFacade.class
 			.getName());
 
-	private Model model;
+	public Model model;
 
 	public void train(List<Instance> instances) {
 
@@ -40,6 +40,13 @@ public void train(List<Instance> instances) {
 			}
 		}
 	}
+
+		double[] fs0 = instances.get(0).getFeaturesNumericWord2vec();
+		if (null != fs0) {
+			featureNum += fs0.length;
+		}
+
+		System.out.println("number of features:" + featureNum);
 		problem.n = featureNum; // number of features
 		problem.x = new Feature[instances.size()][]; // feature nodes
 		problem.y = new double[instances.size()]; // target values
@@ -50,9 +57,20 @@
 			int previousIndex = 0;
 			List<Feature> featureNodes = new ArrayList<Feature>();
+			double[] fs = instance.getFeaturesNumericWord2vec();
+			if (null != fs) {
+				for (int m = 0; m < fs.length; m++) {
+					featureNodes.add(new FeatureNode(m + 1, fs[m]));
+				}
+			}
 			for (int index : instance.getFeaturesNumeric()) {
 				if (index > previousIndex) {
-					featureNodes.add(new FeatureNode(index, 1));
+					if (null != fs) {
+						featureNodes.add(new FeatureNode(fs.length + index, 1));
+					} else {
+						featureNodes.add(new FeatureNode(index, 1));
+					}
 					// System.out.print("\t" + (index ));
 				}
 				previousIndex = index;
@@ -82,7 +100,7 @@
 	}
 
 	@Override
-	public int predict(int[] featureSparseVector) {
+	public int predict(int[] featureSparseVector, Instance instance) {
 
 		if (featureSparseVector == null) {
 			throw new IllegalArgumentException(
@@ -98,10 +116,21 @@
 		}
 
 		List<Feature> featureNodes = new ArrayList<Feature>();
+		double[] fs = instance.getFeaturesNumericWord2vec();
+		if (null != fs) {
+			for (int m = 0; m < fs.length; m++) {
+				featureNodes.add(new FeatureNode(m + 1, fs[m]));
+			}
+		}
 		for (int index : featureSparseVector) {
 			if (index > previousIndex) {
-				featureNodes.add(new FeatureNode(index, 1));
+				if (null != fs) {
+					featureNodes.add(new FeatureNode(fs.length + index, 1));
+				} else {
+					featureNodes.add(new FeatureNode(index, 1));
+				}
 			}
 			previousIndex = index;
 		}
@@ -109,9 +138,9 @@
 			Feature node = new FeatureNode(n, model.getBias());
 			featureNodes.add(node);
 		}
-		Feature[] instance = new FeatureNode[featureNodes.size()];
-		instance = featureNodes.toArray(instance);
-		return (int) Math.round(Linear.predict(this.model, instance));
+		Feature[] instance0 = new FeatureNode[featureNodes.size()];
+		instance0 = featureNodes.toArray(instance0);
+		return (int) Math.round(Linear.predict(this.model, instance0));
 	}
 
diff --git a/src/info/chenli/litway/bionlp13/CellularComponent.java b/src/info/chenli/litway/bionlp13/CellularComponent.java
index 083bbfe..261278a 100644
--- a/src/info/chenli/litway/bionlp13/CellularComponent.java
+++ b/src/info/chenli/litway/bionlp13/CellularComponent.java
@@ -11,8 +11,8 @@
 
 /**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
- * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
+ * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml
  * @generated */
 public class CellularComponent extends Annotation {
   /** @generated
@@ -54,10 +54,13 @@ public CellularComponent(JCas jcas, int begin, int end) {
     readObject();
   }
 
-  /**
+  /**
+   *
    * Write your own initialization here
    *
-  @generated modifiable */
+   *
+   * @generated modifiable
+   */
   private void readObject() {/*default - does nothing empty block */}
 
@@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */}
 
   //* Feature: id
 
   /** getter for id - gets
-   * @generated */
+   * @generated
+   * @return value of the feature
+   */
   public String getId() {
     if (CellularComponent_Type.featOkTst && ((CellularComponent_Type)jcasType).casFeat_id == null)
       jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.CellularComponent");
     return jcasType.ll_cas.ll_getStringValue(addr, ((CellularComponent_Type)jcasType).casFeatCode_id);}
 
   /** setter for id - sets
-   * @generated */
+   * @generated
+   * @param v value to set into the feature
+   */
   public void setId(String v) {
     if (CellularComponent_Type.featOkTst && ((CellularComponent_Type)jcasType).casFeat_id == null)
       jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.CellularComponent");
 
diff --git a/src/info/chenli/litway/bionlp13/CellularComponent_Type.java b/src/info/chenli/litway/bionlp13/CellularComponent_Type.java
index d405ae1..92d0239 100644
--- a/src/info/chenli/litway/bionlp13/CellularComponent_Type.java
+++ b/src/info/chenli/litway/bionlp13/CellularComponent_Type.java
@@ -14,7 +14,7 @@
 import org.apache.uima.jcas.tcas.Annotation_Type;
 
 /**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
  * @generated */
 public class CellularComponent_Type
extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/bionlp13/Chemical.java b/src/info/chenli/litway/bionlp13/Chemical.java index 4f40c8a..8bd227a 100644 --- a/src/info/chenli/litway/bionlp13/Chemical.java +++ b/src/info/chenli/litway/bionlp13/Chemical.java @@ -11,8 +11,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Chemical extends Annotation { /** @generated @@ -54,10 +54,13 @@ public Chemical(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} @@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */} //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getId() { if (Chemical_Type.featOkTst && ((Chemical_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Chemical"); return jcasType.ll_cas.ll_getStringValue(addr, ((Chemical_Type)jcasType).casFeatCode_id);} /** setter for id - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setId(String v) { if (Chemical_Type.featOkTst && ((Chemical_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Chemical"); diff --git a/src/info/chenli/litway/bionlp13/Chemical_Type.java b/src/info/chenli/litway/bionlp13/Chemical_Type.java index 4937af0..6af8c66 100644 --- a/src/info/chenli/litway/bionlp13/Chemical_Type.java +++ b/src/info/chenli/litway/bionlp13/Chemical_Type.java @@ -14,7 +14,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Chemical_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/bionlp13/Complex.java b/src/info/chenli/litway/bionlp13/Complex.java index 6230172..9d0ee60 100644 --- a/src/info/chenli/litway/bionlp13/Complex.java +++ b/src/info/chenli/litway/bionlp13/Complex.java @@ -11,8 +11,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Complex extends Annotation { /** @generated @@ -54,10 +54,13 @@ public Complex(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} @@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */} //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getId() { if (Complex_Type.featOkTst && ((Complex_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Complex"); return jcasType.ll_cas.ll_getStringValue(addr, 
((Complex_Type)jcasType).casFeatCode_id);}
 
   /** setter for id - sets
-   * @generated */
+   * @generated
+   * @param v value to set into the feature
+   */
   public void setId(String v) {
     if (Complex_Type.featOkTst && ((Complex_Type)jcasType).casFeat_id == null)
       jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Complex");
 
diff --git a/src/info/chenli/litway/bionlp13/Complex_Type.java b/src/info/chenli/litway/bionlp13/Complex_Type.java
index 92fc4d4..bc2863f 100644
--- a/src/info/chenli/litway/bionlp13/Complex_Type.java
+++ b/src/info/chenli/litway/bionlp13/Complex_Type.java
@@ -14,7 +14,7 @@
 import org.apache.uima.jcas.tcas.Annotation_Type;
 
 /**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
  * @generated */
 public class Complex_Type extends Annotation_Type {
   /** @generated */
 
diff --git a/src/info/chenli/litway/bionlp13/EntityTypes.java b/src/info/chenli/litway/bionlp13/EntityTypes.java
index 6f87120..41c39bb 100644
--- a/src/info/chenli/litway/bionlp13/EntityTypes.java
+++ b/src/info/chenli/litway/bionlp13/EntityTypes.java
@@ -1,6 +1,10 @@
 package info.chenli.litway.bionlp13;
 
+
+
 public enum EntityTypes {
 
 	Cellular_component, Complex, Gene_or_gene_product, Protein, Simple_chemical;
 }
+
+
 
diff --git a/src/info/chenli/litway/bionlp13/Gene.java b/src/info/chenli/litway/bionlp13/Gene.java
index d2870f7..73b1d67 100644
--- a/src/info/chenli/litway/bionlp13/Gene.java
+++ b/src/info/chenli/litway/bionlp13/Gene.java
@@ -11,8 +11,8 @@
 
 /**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
- * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
+ * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml
  * @generated */
 public class Gene extends Annotation {
   /** @generated
@@ -54,10 +54,13 @@ public Gene(JCas jcas, int begin, int end) {
     readObject();
   }
 
-  /**
+  /**
+   *
    * Write your own initialization here
    *
-  @generated modifiable */
+   *
+   * @generated modifiable
+   */
   private void readObject() {/*default - does nothing empty block */}
 
@@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */}
 
   //* Feature: id
 
   /** getter for id - gets
-   * @generated */
+   * @generated
+   * @return value of the feature
+   */
   public String getId() {
     if (Gene_Type.featOkTst && ((Gene_Type)jcasType).casFeat_id == null)
       jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Gene");
     return jcasType.ll_cas.ll_getStringValue(addr, ((Gene_Type)jcasType).casFeatCode_id);}
 
   /** setter for id - sets
-   * @generated */
+   * @generated
+   * @param v value to set into the feature
+   */
   public void setId(String v) {
     if (Gene_Type.featOkTst && ((Gene_Type)jcasType).casFeat_id == null)
       jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.bionlp13.Gene");
 
diff --git a/src/info/chenli/litway/bionlp13/Gene_Type.java b/src/info/chenli/litway/bionlp13/Gene_Type.java
index 467b0cd..0b711a0 100644
--- a/src/info/chenli/litway/bionlp13/Gene_Type.java
+++ b/src/info/chenli/litway/bionlp13/Gene_Type.java
@@ -14,7 +14,7 @@
 import org.apache.uima.jcas.tcas.Annotation_Type;
 
 /**
- * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013
+ * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015
  * @generated */
 public class Gene_Type extends Annotation_Type {
   /** @generated */
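Note on the AbstractInstances.java diff that follows: saveSvmLightInstances
now writes the dense word2vec values first and shifts every sparse indicator
index by the word2vec dimensionality, mirroring the LibLinearFacade layout
above. Assuming the 100-dimensional vectors that the file name "word2vec100"
suggests, an output line would look like this (label and values invented for
illustration):

	3 1:0.0137 2:-0.0821 ... 100:0.0099 113:1 245:1 1042:1

i.e. the numeric label, then features 1..100 carrying the real-valued
word2vec dimensions, then the binary indicator features with their original
indices offset by 100.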
diff --git a/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java b/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java
index af0ca2e..70ddb11 100644
--- a/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java
+++ b/src/info/chenli/litway/bionlp13/ge/AbstractInstances.java
@@ -12,6 +12,7 @@
 import info.chenli.litway.util.DependencyExtractor;
 import info.chenli.litway.util.FileFilterImpl;
 import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.UimaUtil;
 import info.chenli.litway.util.StanfordDependencyReader.Pair;
 
 import java.io.File;
@@ -41,6 +42,8 @@
 import org.apache.uima.util.XMLInputSource;
 import org.uimafit.util.JCasUtil;
 
+import de.bwaldvogel.liblinear.FeatureNode;
+
 public abstract class AbstractInstances {
 
 	private final static Logger logger = Logger
@@ -145,7 +148,7 @@ protected JCas processSingleFile(File aFile) {
 
 		String document = null;
 		try {
-
+
 			document = FileUtils.file2String(aFile);
 
 		} catch (IOException e) {
@@ -155,7 +158,6 @@
 		}
 
 		document = document.trim();
-
 		try {
 			// create a CAS
 			CAS cas = ae.newCAS();
@@ -173,6 +175,10 @@
 			FSIterator<Annotation> annoIter = null;
 			JCas jcas = null;
 			jcas = cas.getJCas();
+
+
+			//System.out.println(UimaUtil.getJCasFilePath(jcas));
+
 			for (int annotationType : annotationTypes) {
 				annoIter = jcas.getAnnotationIndex(annotationType).iterator();
 				structuredInstances.addAll(getStructuredInstances(jcas,
@@ -237,7 +243,10 @@ protected Token getTriggerToken(List<Token> tokens) {
 				new POSPrioritizer());
 
 		for (Token token : tokens) {
-
+			//System.out.println(token.getStem());
+			if (!POS.isPos(token.getPos())) {
+				continue;
+			}
 			sortedTokens.put(POS.valueOf(token.getPos()), token);
 			if (TriggerWord.isATriggerWord(token.getCoveredText()) != null) {
 				return token;
@@ -331,11 +340,22 @@ public void saveSvmLightInstances(File file) {
 
 		for (Instance instance : instances) {
 
 			sb.append(String.valueOf(instance.getLabel()));
+
+			double[] fs = instance.getFeaturesNumericWord2vec();
+			if (null != fs) {
+				for (int m = 0; m < fs.length; m++) {
+					sb.append(" ".concat(String.valueOf(m + 1)).concat(":").concat(String.valueOf(fs[m])));
+				}
+			}
 			int previousIndex = 0;
 			for (int feature : instance.getFeaturesNumeric()) {
 				if (feature > previousIndex) {
-					sb.append(" ".concat(String.valueOf(feature)).concat(":1"));
+					if (null != fs) {
+						sb.append(" ".concat(String.valueOf(fs.length + feature)).concat(":1"));
+					} else {
+						sb.append(" ".concat(String.valueOf(feature)).concat(":1"));
+					}
 				}
 				previousIndex = feature;
 			}
@@ -377,72 +397,367 @@ protected Instance causeToInstance(JCas jcas, Sentence sentence,
	 * @param themeToken
	 * @return
	 */
-	private Instance themeCauseToInstance(JCas jcas, Sentence sentence,
-			Annotation anno, Trigger trigger, Set<Pair> pairsOfSentence,
-			DependencyExtractor dependencyExtractor, boolean isTruepositive,
-			Stage stage, Token themeToken) {
+	protected Instance argumentToInstance(JCas jcas, Sentence sentence,
+			Annotation annotation, Trigger trigger, Set<Pair> pairsOfSentence,
+			DependencyExtractor dependencyExtractor, boolean isTheme,
+			boolean isCause, Stage stage) {
 
-		if (!(anno instanceof Trigger) && !(anno instanceof Protein)) {
-			throw new IllegalArgumentException(
-					"The theme/cause has to be a protein or trigger.");
+		Instance instance = new Instance();
+		List<String[]> featuresString = new ArrayList<String[]>();
+		instance.setFeaturesString(featuresString);
+
+		// get trigger token
+		Token triggerToken = getTriggerToken(jcas, trigger);
+		//System.out.println(annotation.getCoveredText());
+		Token annotationToken = getToken(jcas, annotation);
+		if (annotation instanceof Trigger) {
+			annotationToken = getTriggerToken(jcas, (Trigger) annotation);
 		}
+		// parser : dependency path between trigger-argument
+		//int dependencyPathLength = dependencyExtractor.getDijkstraShortestPathLength(
+		//		triggerToken, annotationToken);
+		//featuresString.add(new String[] {
"dependencyPathLength_" + String.valueOf(dependencyPathLength) }); + String featurePath = dependencyExtractor.getShortestPath( + triggerToken, annotationToken, stage); +/* if ( isTruepositive && null == featurePath && !areSameTokens) { + int i = sentence.getId(); + String s = triggerToken.getCoveredText(); + String s2 = annoToken.getCoveredText(); + return null; + }*/ + boolean areSameTokens = (annotationToken.getId() == triggerToken.getId()); + featurePath = areSameTokens ? "SAMETOKEN" : featurePath; + featurePath = (null == featurePath ? null : "featurePath_".concat(featurePath)); + featuresString.add(null == featurePath ? new String[0] + : new String[] { featurePath }); + //instance.setId(featurePath); + //instance.setFileId(trigger.getEventType()); + /*if (areSameTokens && isTheme) { + System.out.println("theme" + "\t" + trigger.getEventType()); + } else if (areSameTokens && !isTheme && !isCause) { + System.out.println("not" + "\t" + trigger.getEventType()); + } else if (areSameTokens && isCause) { + System.out.println("isCause" + "\t" + trigger.getEventType()); + } */ + // parser refined? - List annoTokens = JCasUtil - .selectCovered(jcas, Token.class, anno); + // parser_simple: grouping of dpendency type; + // amod, nn --> nmod + // anything ending in subj --> subj + // anything ending in subjpass --> subjpass + /*String simplifiedFeaturePath = dependencyExtractor + .getSimplifiedShortestPath(triggerToken, annotationToken, stage); + simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" + : simplifiedFeaturePath; + simplifiedFeaturePath = (null == simplifiedFeaturePath ? null + : "simplifiedFeaturePath_".concat(simplifiedFeaturePath)); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { simplifiedFeaturePath });*/ - // if protein/trigger is within a token - if (annoTokens.size() == 0) { - FSIterator iter = jcas.getAnnotationIndex(Token.type) - .iterator(); - annoTokens = new ArrayList(); - while (iter.hasNext()) { - Token token = (Token) iter.next(); - if (token.getBegin() <= anno.getBegin() - && token.getEnd() >= anno.getEnd()) { - annoTokens.add(token); - break; + // trigger class + String triggerClassString; + if (EventType.isSimpleEvent(trigger.getEventType())) { + triggerClassString = "class_Simple"; + } else if (EventType.isBindingEvent(trigger.getEventType())) { + triggerClassString = "class_Binding"; + } else if (EventType.isRegulatoryEvent(trigger.getEventType())) { + triggerClassString = "class_Regulation"; + } else { + triggerClassString = "class_Complex"; + } + //featuresString.add(new String[] { triggerClassString }); + /*featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + featurePath) });*/ + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString + //.concat("_").concat(featurePath) + }); + + // trigger token & trigger type + /*String triggerText = "text_".concat(trigger.getCoveredText() + .toLowerCase()); + featuresString.add(new String[] { triggerText }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath) });*/ + + /*String eventType = "eventType_".concat(trigger.getEventType()); + //featuresString.add(new String[] { eventType }); + //featuresString.add(null == featurePath ? 
new String[0] + // : new String[] { eventType.concat("_").concat(featurePath) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { eventType + //.concat("_").concat(featurePath) + }); +*/ + // trigger lemma (using the token's POS, which may be inaccurate) + String triggerLemma = "triggerLemma_".concat(triggerToken.getLemma()); + triggerLemma = (null == triggerToken.getSubLemma() ? triggerLemma + : "triggerLemma_".concat(triggerToken.getSubLemma() + .toLowerCase())); + featuresString.add(new String[] { triggerLemma }); + /*featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_") + .concat(featurePath) });*/ + + // trigger POS + /*String triggerPos = "triggerPos_".concat(triggerToken.getPos()); + //featuresString.add(new String[] { triggerPos }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos.concat("_").concat(featurePath) });*/ + String triggerPosShort = "triggerShortPos_".concat(triggerToken + .getPos().substring(0, 1)); + //featuresString.add(new String[] { triggerPosShort }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPosShort.concat("_") + .concat(featurePath) }); + + /*featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPosShort) });*/ + + // argument type + String argClassString, argType, argLemma; + if (annotation instanceof Trigger) { + argType = ((Trigger)annotation).getEventType(); + argLemma = "argLemma_".concat(annotationToken.getLemma()); + argLemma = (null == annotationToken.getSubLemma() ? argLemma + : "argLemma_".concat(annotationToken.getSubLemma() + .toLowerCase())); + if (EventType.isSimpleEvent(((Trigger)annotation).getEventType())) { + argClassString = "arg_class_Simple"; + } else if (EventType.isBindingEvent(((Trigger)annotation).getEventType())) { + argClassString = "arg_class_Binding"; + } else if (EventType.isRegulatoryEvent(((Trigger)annotation).getEventType())) { + argClassString = "arg_class_Regulation"; + } else { + argClassString = "arg_class_Complex"; + } + } else { + argClassString = "arg_class_Protein"; + argType = "arg_class_Protein"; + argLemma = "arg_class_Protein"; + } + //featuresString.add(new String[] { argClassString + triggerClassString}); + //featuresString.add(new String[] { argClassString + triggerLemma}); + //featuresString.add(new String[] { argType }); + //featuresString.add(new String[] { argLemma }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_") + .concat(featurePath).concat("_").concat(argClassString) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_") + .concat(featurePath).concat("_").concat(argClassString) }); + /*featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_") + .concat(simplifiedFeaturePath).concat("_").concat(argLemma) });*/ +/* String argText = "text_Protein"; + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath). + concat("_").concat(argText) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath).concat("_").concat( + argText) }); + + String argLemma = "argLemma_Protein"; + featuresString.add(null == featurePath ? 
new String[0] + : new String[] { triggerLemma.concat("_").concat( + featurePath).concat("_").concat( + argLemma) }); + + String argPos = "argPos_Protein"; + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos + .concat("_").concat(argPos) }); + + String argType = "argType_Protein"; + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat(featurePath) + .concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_") + .concat(featurePath).concat("_").concat(argType) });*/ + + // text string from trigger to theme/cause: compensate when parsing + // fails + String textBetween = "", textAbsBetween = "", textShortBetween = ""; + + List tokensBetween = JCasUtil.selectCovered(jcas, + Token.class, sentence); + List proteinsBetween = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + int start = Math.min(annotationToken.getBegin(), triggerToken.getBegin()); + int end = Math.max(annotationToken.getEnd(), triggerToken.getEnd()); + boolean reversed = (start != triggerToken.getBegin()); + + List tokensTextBetween = new ArrayList(); + List tokensAbsTextBetween = new ArrayList(); + + tokensLoop: for (Token aToken : tokensBetween) { + + if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) { + continue tokensLoop; + } else if (aToken.getEnd() >= end) { + break tokensLoop; + } + + // if it is a protein + for (Protein aProtein : proteinsBetween) { + if (aToken.getBegin() == aProtein.getBegin()) { + tokensTextBetween.add("PROTEIN"); + tokensAbsTextBetween.add("PROTEIN"); + continue tokensLoop; + } else if (aToken.getBegin() > aProtein.getBegin() + && aToken.getEnd() <= aProtein.getEnd()) { + continue tokensLoop; } } + if (aToken.getBegin() == trigger.getBegin()) { + tokensAbsTextBetween.add(trigger.getEventType()); + continue tokensLoop; + } else if (aToken.getBegin() > trigger.getBegin() + && aToken.getEnd() <= trigger.getEnd()) { + continue tokensLoop; + } + + tokensTextBetween.add(aToken.getLemma().toLowerCase()); + tokensAbsTextBetween.add(aToken.getLemma().toLowerCase()); + } - Token annoToken = null; - if (anno instanceof Protein) - // Take the last non-digital token if protein is - // multi-token. - { - annoToken = annoTokens.get(annoTokens.size() - 1); - // for (Token aToken : annoTokens) { - // - // try { - // Double.parseDouble(aToken.getLemma()); - // break; - // } catch (NumberFormatException e) { - // token = aToken; - // } - // - // } - } else if (anno instanceof Trigger) { - annoToken = getTriggerToken(jcas, (Trigger) anno); + /*int tokensTextBetweenLength = tokensTextBetween.size(); + String[] tokensTextBetweenString = new String[tokensTextBetweenLength]; + featuresString.add(new String[] { "tokensTextBetweenLength_" + String.valueOf(tokensTextBetweenLength) }); + int j= 0; + for (String aText : tokensTextBetween) { + tokensTextBetweenString[j] = aText; + j++; + } + featuresString.add(tokensTextBetweenString);*/ + for (String aText : tokensTextBetween) { + if (reversed) { + textBetween = aText.concat(textBetween.equals("") ? "" + : "_".concat(textBetween)); + } else { + textBetween = textBetween.equals("") ? 
aText : textBetween + .concat("_").concat(aText); + } + } + for (String aText : tokensAbsTextBetween) { + if (reversed) { + textAbsBetween = aText + .concat(textAbsBetween.equals("") ? "" : "_" + .concat(textAbsBetween)); + } else { + textAbsBetween = textAbsBetween.equals("") ? aText + : textAbsBetween.concat("_").concat(aText); + } + } + // concatenate text between trigger and theme/cause with the + // previous + // features. + textBetween = textBetween.equals("") ? null : "textString_".concat( + reversed ? "reversed_" : "").concat(textBetween); + textAbsBetween = textAbsBetween.equals("") ? null + : "textStringAbs_".concat(reversed ? "reversed_" : "") + .concat(textAbsBetween); + for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) { + if (reversed) { + textShortBetween = tokensAbsTextBetween.get(i).concat( + textShortBetween.equals("") ? "" : "_" + .concat(textShortBetween)); + } else { + textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween + .get(i) : textShortBetween.concat("_").concat( + tokensAbsTextBetween.get(i)); + } + } + textShortBetween = textShortBetween.equals("") ? null + : "textStringShort_".concat(reversed ? "reversed_" : "") + .concat(textShortBetween); + if (areSameTokens) { + textBetween = "SAMETOKEN"; + textAbsBetween = "SAMETOKEN"; + textShortBetween = "SAMETOKEN"; + } + + featuresString.add(null == textBetween ? new String[0] + : new String[] { textBetween }); + featuresString.add(null == textBetween ? new String[0] + : new String[] { triggerLemma.concat("_").concat(textBetween) }); + featuresString + .add(null != textBetween && null != featurePath ? new String[] { featurePath + .concat("_").concat(textBetween) } : new String[0]); + + /**/featuresString.add(null == textAbsBetween ? new String[0] + : new String[] { textAbsBetween }); + featuresString + .add(null == textAbsBetween ? new String[0] + : new String[] { triggerLemma.concat("_").concat( + textAbsBetween) }); + /*featuresString + .add(null != textAbsBetween && null != featurePath ? new String[] { featurePath + .concat("_").concat(textAbsBetween) } : new String[0]);*/ + + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { textShortBetween }); + /*featuresString.add(null == textShortBetween ? new String[0] + : new String[] { triggerLemma.concat("_").concat( + textShortBetween) });*/ + /*featuresString + .add(null != textShortBetween && null != featurePath ? 
new String[] { featurePath + .concat("_").concat(textShortBetween) } : new String[0]);*/ + + if (isTheme) { + instance.setLabelString("Theme"); + } else if (isCause){ + instance.setLabelString("Cause"); + } else { + instance.setLabelString("Non_Argument"); } + return instance; + } + + protected Instance triggerArgumentToInstance(JCas jcas, Sentence sentence, + Trigger arguTrigger, Trigger trigger, Set pairsOfSentence, + DependencyExtractor dependencyExtractor, boolean isTheme, + boolean isCause, Stage stage) { + Instance instance = new Instance(); List featuresString = new ArrayList(); instance.setFeaturesString(featuresString); // get trigger token Token triggerToken = getTriggerToken(jcas, trigger); - + Token arguToken = getTriggerToken(jcas, arguTrigger); // parser : dependency path between trigger-argument String dependencyPath = dependencyExtractor.getShortestPath( - triggerToken, annoToken, stage); + triggerToken, arguToken, stage); String featurePath = dependencyPath; if (null == dependencyPath) { featurePath = dependencyExtractor.getReversedShortestPath( - triggerToken, annoToken, stage); + triggerToken, arguToken, stage); } - boolean areSameTokens = (annoToken.getBegin() == triggerToken - .getBegin() && annoToken.getEnd() == triggerToken.getEnd()); + + boolean areSameTokens = (arguToken.getBegin() == triggerToken + .getBegin() && arguToken.getEnd() == triggerToken.getEnd()); + +/* if ( isTruepositive && null == featurePath && !areSameTokens) { + int i = sentence.getId(); + String s = triggerToken.getCoveredText(); + String s2 = annoToken.getCoveredText(); + return null; + }*/ featurePath = areSameTokens ? "SAMETOKEN" : featurePath; featurePath = (null == featurePath ? null : "dep_".concat(featurePath)); featuresString.add(null == featurePath ? new String[0] @@ -457,10 +772,10 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, String simplifiedFeaturePath = null; if (null != dependencyPath) { simplifiedFeaturePath = dependencyExtractor - .getSimplifiedShortestPath(triggerToken, annoToken, stage); + .getSimplifiedShortestPath(triggerToken, arguToken, stage); } else { simplifiedFeaturePath = dependencyExtractor - .getSimplifiedReversedShortestPath(triggerToken, annoToken, + .getSimplifiedReversedShortestPath(triggerToken, arguToken, stage); } simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" @@ -481,9 +796,11 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, } else { triggerClassString = "class_Complex"; } + featuresString.add(null == featurePath ? new String[0] : new String[] { triggerClassString.concat("_").concat( featurePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] : new String[] { triggerClassString.concat("_").concat( simplifiedFeaturePath) }); @@ -493,6 +810,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, .toLowerCase()); featuresString.add(null == featurePath ? new String[0] : new String[] { triggerText.concat("_").concat(featurePath) }); + String eventType = "eventType_".concat(trigger.getEventType()); featuresString.add(null == featurePath ? new String[0] : new String[] { eventType.concat("_").concat(featurePath) }); @@ -500,6 +818,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, featuresString.add(null == simplifiedFeaturePath ? new String[0] : new String[] { triggerText.concat("_").concat( simplifiedFeaturePath) }); + featuresString.add(null == simplifiedFeaturePath ? 
new String[0] : new String[] { eventType.concat("_").concat( simplifiedFeaturePath) }); @@ -512,6 +831,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, .add(null == featurePath ? new String[0] : new String[] { triggerLemma.concat("_").concat( featurePath) }); + // trigger sublemma String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma @@ -525,6 +845,7 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, String triggerPos = "triggerPos_".concat(triggerToken.getPos()); featuresString.add(null == featurePath ? new String[0] : new String[] { triggerPos.concat("_").concat(featurePath) }); + String triggerPosShort = "triggerShortPos_".concat(triggerToken .getPos().substring(0, 1)); featuresString.add(null == featurePath ? new String[0] @@ -541,22 +862,60 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, triggerPosShort) }); // argument type - String argType = null; - if (anno instanceof Protein) { - argType = "argType_Protein"; - } else if (anno instanceof Trigger) { - argType = "argType_".concat(((Trigger) anno).getEventType()); - } + String argClassString; + if (EventType.isSimpleEvent(arguTrigger.getEventType())) { + argClassString = "class_Simple"; + } else if (EventType.isBindingEvent(arguTrigger.getEventType())) { + argClassString = "class_Binding"; + } else if (EventType.isRegulatoryEvent(arguTrigger.getEventType())) { + argClassString = "class_Regulation"; + } else { + argClassString = "class_Complex"; + } featuresString.add(null == featurePath ? new String[0] : new String[] { triggerLemma.concat("_").concat(featurePath) - .concat("_").concat(argType) }); + .concat("_").concat(argClassString) }); featuresString.add(null == featurePath ? new String[0] : new String[] { triggerSubLemma.concat("_") - .concat(featurePath).concat("_").concat(argType) }); + .concat(featurePath).concat("_").concat(argClassString) }); + String argType = "argType_".concat(arguTrigger.getEventType()); featuresString.add(null == featurePath ? new String[0] : new String[] { triggerClassString.concat("_") .concat(featurePath).concat("_").concat(argType) }); +/* String argText = "text_".concat(arguTrigger.getCoveredText() + .toLowerCase()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath). + concat("_").concat(argText) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath).concat("_").concat( + argText) }); + + String argLemma = "argLemma_".concat(BioLemmatizerUtil + .lemmatizeWord(arguTrigger.getCoveredText(), arguToken.getPos()) + .toLowerCase()); + featuresString + .add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat( + featurePath).concat("_").concat( + argLemma) }); + + String argPos = "argPos_".concat(arguToken.getPos()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos + .concat("_").concat(argPos) }); + String argType = "argType_".concat(arguTrigger.getEventType()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat(featurePath) + .concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + featuresString.add(null == featurePath ? 
new String[0] + : new String[] { triggerClassString.concat("_") + .concat(featurePath).concat("_").concat(argType) });*/ // text string from trigger to theme/cause: compensate when parsing // fails String textBetween = "", textAbsBetween = "", textShortBetween = ""; @@ -566,8 +925,8 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, Token.class, sentence); List proteinsBetween = JCasUtil.selectCovered(jcas, Protein.class, sentence); - int start = Math.min(annoToken.getBegin(), triggerToken.getBegin()); - int end = Math.max(annoToken.getEnd(), triggerToken.getEnd()); + int start = Math.min(arguToken.getBegin(), triggerToken.getBegin()); + int end = Math.max(arguToken.getEnd(), triggerToken.getEnd()); boolean reversed = (start != triggerToken.getBegin()); List tokensTextBetween = new ArrayList(); @@ -676,118 +1035,787 @@ private Instance themeCauseToInstance(JCas jcas, Sentence sentence, .add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath .concat("_").concat(textShortBetween) } : new String[0]); - if (stage.equals(Stage.CAUSE)) { - String pathToTheme = null; - if (null != themeToken) { - pathToTheme = dependencyExtractor.getShortestPath(annoToken, - themeToken, stage); - if (null == pathToTheme) { - pathToTheme = dependencyExtractor.getReversedShortestPath( - annoToken, themeToken, stage); - } - } - featuresString - .add(null != pathToTheme && themeToken != null ? new String[] { pathToTheme } - : new String[0]); - } - - String label; - switch (stage) { - case THEME: - label = "Theme"; - break; - case CAUSE: - label = "Cause"; - break; - default: - label = null; - } - if (isTruepositive) { - - instance.setLabelString(label); + if (isTheme) { + instance.setLabelString("Theme"); + } else if (isCause){ + instance.setLabelString("Cause"); } else { - instance.setLabelString("Non_".concat(label.toLowerCase())); + instance.setLabelString("Non_Argument"); } return instance; } - protected Instance bindingEventToInstance(JCas jcas, Sentence sentence, - Event bindingEvent, List themes, - DependencyExtractor dependencyExtractor) { - - boolean truepositive = true; - if (null != bindingEvent.getThemes() - && themes.size() == bindingEvent.getThemes().size()) { - themeSearchingLoop: for (Protein protein : themes) { - boolean foundTheProtein = false; - for (int i = 0; i < bindingEvent.getThemes().size(); i++) { - if (protein.getId().equals(bindingEvent.getThemes(i))) { - foundTheProtein = true; - break; - } - } - if (foundTheProtein == false) { - truepositive = false; - break themeSearchingLoop; - } - } - } else { - truepositive = false; - } + protected Instance triggerSpeicalArgumentToInstance(JCas jcas, Sentence sentence, + Trigger arguTrigger, Trigger trigger, Set pairsOfSentence, + DependencyExtractor dependencyExtractor, boolean isTheme, + boolean isCause, Stage stage) { Instance instance = new Instance(); - List featuresString = new ArrayList(); instance.setFeaturesString(featuresString); - Trigger trigger = bindingEvent.getTrigger(); + // get trigger token Token triggerToken = getTriggerToken(jcas, trigger); + Token arguToken = getTriggerToken(jcas, arguTrigger); + // parser : dependency path between trigger-argument + String dependencyPath = dependencyExtractor.getShortestPath( + triggerToken, arguToken, stage); + String featurePath = dependencyPath; - List themeTokens = new ArrayList(); - for (Protein aProtein : themes) { - List annoTokens = JCasUtil.selectCovered(jcas, Token.class, - aProtein); - - // if protein/trigger is within a token - if 
(annoTokens.size() == 0) { - FSIterator iter = jcas.getAnnotationIndex( - Token.type).iterator(); - annoTokens = new ArrayList(); - while (iter.hasNext()) { - Token token = (Token) iter.next(); - if (token.getBegin() <= aProtein.getBegin() - && token.getEnd() >= aProtein.getEnd()) { - annoTokens.add(token); - break; - } - } + if (null == dependencyPath) { + featurePath = dependencyExtractor.getReversedShortestPath( + triggerToken, arguToken, stage); + } + + boolean areSameTokens = (arguToken.getBegin() == triggerToken + .getBegin() && arguToken.getEnd() == triggerToken.getEnd()); + +/* if ( isTruepositive && null == featurePath && !areSameTokens) { + int i = sentence.getId(); + String s = triggerToken.getCoveredText(); + String s2 = annoToken.getCoveredText(); + return null; + }*/ + featurePath = areSameTokens ? "SAMETOKEN" : featurePath; + featurePath = (null == featurePath ? null : "dep_".concat(featurePath)); + featuresString.add(null == featurePath ? new String[0] + : new String[] { featurePath }); + + // parser refined? + + // parser_simple: grouping of dpendency type; + // amod, nn --> nmod + // anything ending in subj --> subj + // anything ending in subjpass --> subjpass + String simplifiedFeaturePath = null; + if (null != dependencyPath) { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedShortestPath(triggerToken, arguToken, stage); + } else { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedReversedShortestPath(triggerToken, arguToken, + stage); + } + simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" + : simplifiedFeaturePath; + simplifiedFeaturePath = (null == simplifiedFeaturePath ? null + : "dep_simple_".concat(simplifiedFeaturePath)); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { simplifiedFeaturePath }); + + // trigger class + String triggerClassString; + if (EventType.isSimpleEvent(trigger.getEventType())) { + triggerClassString = "class_Simple"; + } else if (EventType.isBindingEvent(trigger.getEventType())) { + triggerClassString = "class_Binding"; + } else if (EventType.isRegulatoryEvent(trigger.getEventType())) { + triggerClassString = "class_Regulation"; + } else { + triggerClassString = "class_Complex"; + } + + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + featurePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + simplifiedFeaturePath) }); + + // trigger token & trigger type + String triggerText = "text_".concat(trigger.getCoveredText() + .toLowerCase()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath) }); + + String eventType = "eventType_".concat(trigger.getEventType()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { eventType.concat("_").concat(featurePath) }); + + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath) }); + + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { eventType.concat("_").concat( + simplifiedFeaturePath) }); + + // trigger lemma (using the token's POS, which may be inaccurate) + String triggerLemma = "triggerLemma_".concat(BioLemmatizerUtil + .lemmatizeWord(trigger.getCoveredText(), triggerToken.getPos()) + .toLowerCase()); + featuresString + .add(null == featurePath ? 
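The parser_simple comment above spells out the label grouping: amod and nn collapse to nmod, any label ending in subj becomes subj, and any label ending in subjpass becomes subjpass. A standalone sketch of that rule; the real logic sits inside DependencyExtractor.getSimplifiedShortestPath, and the method name here is made up:

    // Order matters: the subjpass test must precede the subj test,
    // since every label ending in "subjpass" also ends in "subj".
    static String simplifyDependencyLabel(String label) {
        if (label.equals("amod") || label.equals("nn")) {
            return "nmod";                // noun modifiers collapse to nmod
        }
        if (label.endsWith("subjpass")) { // e.g. nsubjpass, csubjpass
            return "subjpass";
        }
        if (label.endsWith("subj")) {     // e.g. nsubj, csubj
            return "subj";
        }
        return label;                     // all other labels pass through
    }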
new String[0] + : new String[] { triggerLemma.concat("_").concat( + featurePath) }); + + // trigger sublemma + String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma + : "triggerSubLemma_".concat(triggerToken.getSubLemma() + .toLowerCase())); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath) }); + + // trigger POS + String triggerPos = "triggerPos_".concat(triggerToken.getPos()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos.concat("_").concat(featurePath) }); + String triggerPosShort = "triggerShortPos_".concat(triggerToken + .getPos().substring(0, 1)); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPosShort.concat("_") + .concat(featurePath) }); + + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPosShort) }); + featuresString.add(new String[] { triggerSubLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerSubLemma.concat("_").concat( + triggerPosShort) }); + + // argument type + String argClassString; + if (EventType.isSimpleEvent(arguTrigger.getEventType())) { + argClassString = "class_Simple"; + } else if (EventType.isBindingEvent(arguTrigger.getEventType())) { + argClassString = "class_Binding"; + } else if (EventType.isRegulatoryEvent(arguTrigger.getEventType())) { + argClassString = "class_Regulation"; + } else { + argClassString = "class_Complex"; + } + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + featurePath).concat("_").concat(argClassString) }); + + String argText = "text_".concat(arguTrigger.getCoveredText() + .toLowerCase()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath). + concat("_").concat(argText) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath).concat("_").concat( + argText) }); + + String argLemma = "argLemma_".concat(BioLemmatizerUtil + .lemmatizeWord(arguTrigger.getCoveredText(), arguToken.getPos()) + .toLowerCase()); + featuresString + .add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat( + featurePath).concat("_").concat( + argLemma) }); + + String argPos = "argPos_".concat(arguToken.getPos()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos + .concat("_").concat(argPos) }); + + String argType = "argType_".concat(arguTrigger.getEventType()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat(featurePath) + .concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + featuresString.add(null == featurePath ? 
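Every feature added above is a conjunction of atoms (trigger lemma x path, lemma x path x argument type, and so on), each guarded so that a missing dependency path yields an empty feature group instead of a null entry. A small helper capturing that pattern, as a hypothetical sketch rather than the project's API:

    // Joins feature atoms with '_'; any missing atom voids the conjunction,
    // mirroring the "null == featurePath ? new String[0] : ..." guards above.
    static String[] conjoin(String... atoms) {
        StringBuilder sb = new StringBuilder();
        for (String atom : atoms) {
            if (atom == null) {
                return new String[0];
            }
            if (sb.length() > 0) {
                sb.append('_');
            }
            sb.append(atom);
        }
        return new String[] { sb.toString() };
    }

For example, conjoin("triggerLemma_bind", "dep_nsubj", "argType_Protein") yields one conjoined feature, while conjoin("triggerLemma_bind", null) yields none.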
new String[0] + : new String[] { triggerClassString.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + + // text string from trigger to theme/cause: compensate when parsing + // fails + String textBetween = "", textAbsBetween = "", textShortBetween = ""; + + if (!areSameTokens) { + List tokensBetween = JCasUtil.selectCovered(jcas, + Token.class, sentence); + List proteinsBetween = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + int start = Math.min(arguToken.getBegin(), triggerToken.getBegin()); + int end = Math.max(arguToken.getEnd(), triggerToken.getEnd()); + boolean reversed = (start != triggerToken.getBegin()); + + List tokensTextBetween = new ArrayList(); + List tokensAbsTextBetween = new ArrayList(); + + tokensLoop: for (Token aToken : tokensBetween) { + + if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) { + continue tokensLoop; + } else if (aToken.getEnd() >= end) { + break tokensLoop; + } + + // if it is a protein + for (Protein aProtein : proteinsBetween) { + if (aToken.getBegin() == aProtein.getBegin()) { + tokensTextBetween.add("PROTEIN"); + tokensAbsTextBetween.add("PROTEIN"); + continue tokensLoop; + } else if (aToken.getBegin() > aProtein.getBegin() + && aToken.getEnd() <= aProtein.getEnd()) { + continue tokensLoop; + } + } + if (aToken.getBegin() == trigger.getBegin()) { + tokensAbsTextBetween.add(trigger.getEventType()); + continue tokensLoop; + } else if (aToken.getBegin() > trigger.getBegin() + && aToken.getEnd() <= trigger.getEnd()) { + continue tokensLoop; + } + + tokensTextBetween.add(aToken.getLemma().toLowerCase()); + tokensAbsTextBetween.add(aToken.getLemma().toLowerCase()); + + } + + for (String aText : tokensTextBetween) { + if (reversed) { + textBetween = aText.concat(textBetween.equals("") ? "" + : "_".concat(textBetween)); + } else { + textBetween = textBetween.equals("") ? aText : textBetween + .concat("_").concat(aText); + } + } + for (String aText : tokensAbsTextBetween) { + if (reversed) { + textAbsBetween = aText + .concat(textAbsBetween.equals("") ? "" : "_" + .concat(textAbsBetween)); + } else { + textAbsBetween = textAbsBetween.equals("") ? aText + : textAbsBetween.concat("_").concat(aText); + } + } + // concatenate text between trigger and theme/cause with the + // previous + // features. + textBetween = textBetween.equals("") ? null : "textString_".concat( + reversed ? "reversed_" : "").concat(textBetween); + textAbsBetween = textAbsBetween.equals("") ? null + : "textStringAbs_".concat(reversed ? "reversed_" : "") + .concat(textAbsBetween); + for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) { + if (reversed) { + textShortBetween = tokensAbsTextBetween.get(i).concat( + textShortBetween.equals("") ? "" : "_" + .concat(textShortBetween)); + } else { + textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween + .get(i) : textShortBetween.concat("_").concat( + tokensAbsTextBetween.get(i)); + } + } + textShortBetween = textShortBetween.equals("") ? null + : "textStringShort_".concat(reversed ? "reversed_" : "") + .concat(textShortBetween); + } else { + textBetween = "SAMETOKEN"; + textAbsBetween = "SAMETOKEN"; + textShortBetween = "SAMETOKEN"; + } + + featuresString.add(null == textBetween ? new String[0] + : new String[] { textBetween }); + featuresString.add(null == textBetween ? new String[0] + : new String[] { triggerText.concat("_").concat(textBetween) }); + featuresString + .add(null != textBetween && null != dependencyPath ? 
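When parsing fails, the loop above falls back to surface text: it walks the tokens strictly between trigger and argument, substitutes PROTEIN for protein mentions (and, in the "abs" variant, the event type for the trigger itself), lowercases the remaining lemmas, and prefixes reversed_ when the argument precedes the trigger. A compact sketch of the core idea, assuming plain (lemma, isProtein) pairs in place of the UIMA annotations (Java 16 record for brevity):

    import java.util.List;

    class TextBetweenSketch {
        record Tok(String lemma, boolean isProtein) {}

        // Builds the "textString_" back-off feature from in-between tokens.
        static String textBetween(List<Tok> between, boolean reversed) {
            StringBuilder sb = new StringBuilder();
            for (Tok t : between) {
                if (sb.length() > 0) sb.append('_');
                sb.append(t.isProtein() ? "PROTEIN" : t.lemma().toLowerCase());
            }
            if (sb.length() == 0) return null; // nothing between the endpoints
            return "textString_" + (reversed ? "reversed_" : "") + sb;
        }
    }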
new String[] { dependencyPath + .concat("_").concat(textBetween) } : new String[0]); + + featuresString.add(null == textAbsBetween ? new String[0] + : new String[] { textAbsBetween }); + featuresString + .add(null == textAbsBetween ? new String[0] + : new String[] { triggerText.concat("_").concat( + textAbsBetween) }); + + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { textShortBetween }); + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { triggerText.concat("_").concat( + textShortBetween) }); + featuresString + .add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath + .concat("_").concat(textShortBetween) } : new String[0]); + + + if (isTheme) { + instance.setLabelString("Theme"); + } else if (isCause){ + instance.setLabelString("Cause"); + } else { + instance.setLabelString("Non_Argument"); + } + + return instance; + } + + private Instance themeCauseToInstance(JCas jcas, Sentence sentence, + Annotation anno, Trigger trigger, Set pairsOfSentence, + DependencyExtractor dependencyExtractor, boolean isTruepositive, + Stage stage, Token themeToken) { + if (!(anno instanceof Trigger) && !(anno instanceof Protein)) { + throw new IllegalArgumentException( + "The theme/cause has to be a protein or trigger."); + } + + List annoTokens = JCasUtil + .selectCovered(jcas, Token.class, anno); + + // if protein/trigger is within a token + if (annoTokens.size() == 0) { + List tokens = JCasUtil.selectCovered(jcas, Token.class, + sentence); + annoTokens = new ArrayList(); + for (Token token : tokens) { + if (token.getBegin() <= anno.getBegin() + && token.getEnd() >= anno.getEnd()) { + annoTokens.add(token); + break; + } + } + } +/* if (annoTokens.size() == 0) { + int i = anno.getBegin(); + int j = anno.getEnd(); + String s = anno.getCoveredText(); + }*/ + Token annoToken = null; + if (anno instanceof Protein) + // Take the last non-digital token if protein is + // multi-token. + { + annoToken = annoTokens.get(annoTokens.size() - 1); + // for (Token aToken : annoTokens) { + // + // try { + // Double.parseDouble(aToken.getLemma()); + // break; + // } catch (NumberFormatException e) { + // token = aToken; + // } + // + // } + } else if (anno instanceof Trigger) { + annoToken = getTriggerToken(jcas, (Trigger) anno); + } + + Instance instance = new Instance(); + List featuresString = new ArrayList(); + instance.setFeaturesString(featuresString); + + // get trigger token + Token triggerToken = getTriggerToken(jcas, trigger); + + // parser : dependency path between trigger-argument + String dependencyPath = dependencyExtractor.getShortestPath( + triggerToken, annoToken, stage); + String featurePath = dependencyPath; + + if (null == dependencyPath) { + featurePath = dependencyExtractor.getReversedShortestPath( + triggerToken, annoToken, stage); + } + + boolean areSameTokens = (annoToken.getBegin() == triggerToken + .getBegin() && annoToken.getEnd() == triggerToken.getEnd()); + +/* if ( isTruepositive && null == featurePath && !areSameTokens) { + int i = sentence.getId(); + String s = triggerToken.getCoveredText(); + String s2 = annoToken.getCoveredText(); + return null; + }*/ + featurePath = areSameTokens ? "SAMETOKEN" : featurePath; + featurePath = (null == featurePath ? null : "dep_".concat(featurePath)); + featuresString.add(null == featurePath ? new String[0] + : new String[] { featurePath }); + + // parser refined? 
+ + // parser_simple: grouping of dpendency type; + // amod, nn --> nmod + // anything ending in subj --> subj + // anything ending in subjpass --> subjpass + String simplifiedFeaturePath = null; + if (null != dependencyPath) { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedShortestPath(triggerToken, annoToken, stage); + } else { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedReversedShortestPath(triggerToken, annoToken, + stage); + } + simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" + : simplifiedFeaturePath; + simplifiedFeaturePath = (null == simplifiedFeaturePath ? null + : "dep_simple_".concat(simplifiedFeaturePath)); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { simplifiedFeaturePath }); + + // trigger class + String triggerClassString; + if (EventType.isSimpleEvent(trigger.getEventType())) { + triggerClassString = "class_Simple"; + } else if (EventType.isBindingEvent(trigger.getEventType())) { + triggerClassString = "class_Binding"; + } else if (EventType.isRegulatoryEvent(trigger.getEventType())) { + triggerClassString = "class_Regulation"; + } else { + triggerClassString = "class_Complex"; + } + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + featurePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + simplifiedFeaturePath) }); + + // trigger token & trigger type + String triggerText = "text_".concat(trigger.getCoveredText() + .toLowerCase()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath) }); + String eventType = "eventType_".concat(trigger.getEventType()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { eventType.concat("_").concat(featurePath) }); + + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { eventType.concat("_").concat( + simplifiedFeaturePath) }); + + // trigger lemma (using the token's POS, which may be inaccurate) + String triggerLemma = "triggerLemma_".concat(BioLemmatizerUtil + .lemmatizeWord(trigger.getCoveredText(), triggerToken.getPos()) + .toLowerCase()); + featuresString + .add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat( + featurePath) }); + + // trigger sublemma + String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma + : "triggerSubLemma_".concat(triggerToken.getSubLemma() + .toLowerCase())); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath) }); + + // trigger POS + String triggerPos = "triggerPos_".concat(triggerToken.getPos()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos.concat("_").concat(featurePath) }); + String triggerPosShort = "triggerShortPos_".concat(triggerToken + .getPos().substring(0, 1)); + featuresString.add(null == featurePath ? 
new String[0] + : new String[] { triggerPosShort.concat("_") + .concat(featurePath) }); + + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPosShort) }); + featuresString.add(new String[] { triggerSubLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerSubLemma.concat("_").concat( + triggerPosShort) }); + + // argument type + String argType = null; + if (anno instanceof Protein) { + argType = "argType_Protein"; + } else if (anno instanceof Trigger) { + argType = "argType_".concat(((Trigger) anno).getEventType()); + } + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat(featurePath) + .concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + + // text string from trigger to theme/cause: compensate when parsing + // fails + String textBetween = "", textAbsBetween = "", textShortBetween = ""; + + if (!areSameTokens) { + List tokensBetween = JCasUtil.selectCovered(jcas, + Token.class, sentence); + List proteinsBetween = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + int start = Math.min(annoToken.getBegin(), triggerToken.getBegin()); + int end = Math.max(annoToken.getEnd(), triggerToken.getEnd()); + boolean reversed = (start != triggerToken.getBegin()); + + List tokensTextBetween = new ArrayList(); + List tokensAbsTextBetween = new ArrayList(); + + tokensLoop: for (Token aToken : tokensBetween) { + + if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) { + continue tokensLoop; + } else if (aToken.getEnd() >= end) { + break tokensLoop; + } + + // if it is a protein + for (Protein aProtein : proteinsBetween) { + if (aToken.getBegin() == aProtein.getBegin()) { + tokensTextBetween.add("PROTEIN"); + tokensAbsTextBetween.add("PROTEIN"); + continue tokensLoop; + } else if (aToken.getBegin() > aProtein.getBegin() + && aToken.getEnd() <= aProtein.getEnd()) { + continue tokensLoop; + } + } + if (aToken.getBegin() == trigger.getBegin()) { + tokensAbsTextBetween.add(trigger.getEventType()); + continue tokensLoop; + } else if (aToken.getBegin() > trigger.getBegin() + && aToken.getEnd() <= trigger.getEnd()) { + continue tokensLoop; + } + + tokensTextBetween.add(aToken.getLemma().toLowerCase()); + tokensAbsTextBetween.add(aToken.getLemma().toLowerCase()); + + } + + for (String aText : tokensTextBetween) { + if (reversed) { + textBetween = aText.concat(textBetween.equals("") ? "" + : "_".concat(textBetween)); + } else { + textBetween = textBetween.equals("") ? aText : textBetween + .concat("_").concat(aText); + } + } + for (String aText : tokensAbsTextBetween) { + if (reversed) { + textAbsBetween = aText + .concat(textAbsBetween.equals("") ? "" : "_" + .concat(textAbsBetween)); + } else { + textAbsBetween = textAbsBetween.equals("") ? aText + : textAbsBetween.concat("_").concat(aText); + } + } + // concatenate text between trigger and theme/cause with the + // previous + // features. + textBetween = textBetween.equals("") ? null : "textString_".concat( + reversed ? "reversed_" : "").concat(textBetween); + textAbsBetween = textAbsBetween.equals("") ? null + : "textStringAbs_".concat(reversed ? 
"reversed_" : "") + .concat(textAbsBetween); + for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) { + if (reversed) { + textShortBetween = tokensAbsTextBetween.get(i).concat( + textShortBetween.equals("") ? "" : "_" + .concat(textShortBetween)); + } else { + textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween + .get(i) : textShortBetween.concat("_").concat( + tokensAbsTextBetween.get(i)); + } } + textShortBetween = textShortBetween.equals("") ? null + : "textStringShort_".concat(reversed ? "reversed_" : "") + .concat(textShortBetween); + } else { + textBetween = "SAMETOKEN"; + textAbsBetween = "SAMETOKEN"; + textShortBetween = "SAMETOKEN"; + } - Token token = null; - token = annoTokens.get(0); - for (Token aToken : annoTokens) { + featuresString.add(null == textBetween ? new String[0] + : new String[] { textBetween }); + featuresString.add(null == textBetween ? new String[0] + : new String[] { triggerText.concat("_").concat(textBetween) }); + featuresString + .add(null != textBetween && null != dependencyPath ? new String[] { dependencyPath + .concat("_").concat(textBetween) } : new String[0]); - try { - Double.parseDouble(aToken.getLemma()); - break; - } catch (NumberFormatException e) { - token = aToken; + featuresString.add(null == textAbsBetween ? new String[0] + : new String[] { textAbsBetween }); + featuresString + .add(null == textAbsBetween ? new String[0] + : new String[] { triggerText.concat("_").concat( + textAbsBetween) }); + + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { textShortBetween }); + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { triggerText.concat("_").concat( + textShortBetween) }); + featuresString + .add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath + .concat("_").concat(textShortBetween) } : new String[0]); + + if (stage.equals(Stage.CAUSE)) { + String pathToTheme = null; + if (null != themeToken) { + pathToTheme = dependencyExtractor.getShortestPath(annoToken, + themeToken, stage); + if (null == pathToTheme) { + pathToTheme = dependencyExtractor.getReversedShortestPath( + annoToken, themeToken, stage); } } - themeTokens.add(token); + featuresString + .add(null != pathToTheme && themeToken != null ? 
new String[] { pathToTheme } + : new String[0]); + } + + String label; + switch (stage) { + case THEME: + label = "Theme"; + break; + case CAUSE: + label = "Cause"; + break; + default: + label = null; + } + if (isTruepositive) { + + instance.setLabelString(label); + + } else { + instance.setLabelString("Non_".concat(label.toLowerCase())); + } + + return instance; + } + + protected Instance bindingEventToInstance(JCas jcas, Sentence sentence, + Trigger trigger, List themes, + DependencyExtractor dependencyExtractor, boolean truepositive) { + + Instance instance = new Instance(); + List featuresString = new ArrayList(); + instance.setFeaturesString(featuresString); + + Token triggerToken = getTriggerToken(jcas, trigger); + List themeTokens = new ArrayList(); + for (Protein aProtein : themes) { + themeTokens.add(getToken(jcas, aProtein)); } + + List tokensBetween = JCasUtil.selectCovered(jcas, + Token.class, sentence); + List proteinsBetween = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + + String[] themePaths = new String[2]; + if (themeTokens.size() == 1) { + themePaths[0] = "themeSize=1"; + themePaths[1] = null; + //themePaths[2] = null; + //themePaths[3] = null; + }else if (themeTokens.size() == 2) { + String themePath0 = dependencyExtractor.getShortestPath( + themeTokens.get(0), themeTokens.get(1), Stage.BINDING); + String themePath1 = dependencyExtractor.getShortestPath( + themeTokens.get(1), themeTokens.get(0), Stage.BINDING); + //String themePath2 = dependencyExtractor + // .getSimplifiedShortestPath(themeTokens.get(0), themeTokens.get(1), Stage.BINDING); + //String themePath3 = dependencyExtractor + // .getSimplifiedShortestPath(themeTokens.get(1), themeTokens.get(0), Stage.BINDING); + themePaths[0] = null == themePath0 ? null : "themePath_" + themePath0; + themePaths[1] = null == themePath1 ? null : "themePath_" + themePath1; + //themePaths[0] = null == themePath2 ? null : "themeSimplifiedPath_" + themePath2; + //themePaths[1] = null == themePath3 ? 
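For binding candidates the code above also encodes how the two themes relate to each other: a single-theme candidate gets the marker feature themeSize=1, while a two-theme candidate gets the dependency path between the theme tokens in both directions. As a sketch over the surrounding types:

    // Theme-pair path features for binding candidates (sketch).
    static String[] themePairPaths(DependencyExtractor extractor, List<Token> themes) {
        String[] paths = new String[2];
        if (themes.size() == 1) {
            paths[0] = "themeSize=1"; // marker: nothing to connect
        } else if (themes.size() == 2) {
            String p0 = extractor.getShortestPath(themes.get(0), themes.get(1), Stage.BINDING);
            String p1 = extractor.getShortestPath(themes.get(1), themes.get(0), Stage.BINDING);
            paths[0] = (p0 == null) ? null : "themePath_" + p0;
            paths[1] = (p1 == null) ? null : "themePath_" + p1;
        }
        return paths;
    }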
null : "themeSimplifiedPath_" + themePath3; + //int dependencyPathLength = dependencyExtractor.getDijkstraShortestPathLength( + // themeTokens.get(1), themeTokens.get(0)); + //themePaths[2] = "dependencyPathLength_" + String.valueOf(dependencyPathLength) ; + } + featuresString.add(themePaths); + +/* String[] themeTextBetween = new String[2]; + if (themeTokens.size() == 1) { + themeTextBetween[0] = "themeSize=1"; + themeTextBetween[1] = null; + //themeTextBetween[2] = null; + }else if (themeTokens.size() == 2) { + int start = Math.min(themeTokens.get(0).getBegin(), + themeTokens.get(1).getBegin()); + int end = Math.max(themeTokens.get(0).getEnd(), themeTokens.get(1).getEnd()); + + List tokensTextBetween = new ArrayList(); + List tokensAbsTextBetween = new ArrayList(); + + tokensLoop: for (Token aToken : tokensBetween) { + + if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) { + continue tokensLoop; + } else if (aToken.getEnd() >= end) { + break tokensLoop; + } + + // if it is a protein + for (Protein aProtein : proteinsBetween) { + if (aToken.getBegin() == aProtein.getBegin()) { + tokensTextBetween.add("PROTEIN"); + tokensAbsTextBetween.add("PROTEIN"); + continue tokensLoop; + } else if (aToken.getBegin() > aProtein.getBegin() + && aToken.getEnd() <= aProtein.getEnd()) { + continue tokensLoop; + } + } + if (aToken.getBegin() == trigger.getBegin()) { + tokensAbsTextBetween.add(trigger.getEventType()); + continue tokensLoop; + } else if (aToken.getBegin() > trigger.getBegin() + && aToken.getEnd() <= trigger.getEnd()) { + continue tokensLoop; + } + + tokensTextBetween.add(aToken.getLemma().toLowerCase()); + tokensAbsTextBetween.add(aToken.getLemma().toLowerCase()); - if (themeTokens.size() == 0) { - throw new RuntimeException("Theme number is zero. Please check."); + } + + String textBetween = "", textAbsBetween = ""; + for (String aText : tokensTextBetween) { + textBetween = textBetween.equals("") ? aText : textBetween + .concat("_").concat(aText); + } + for (String aText : tokensAbsTextBetween) { + textAbsBetween = textAbsBetween.equals("") ? aText + : textAbsBetween.concat("_").concat(aText); + } + themeTextBetween[0] = null == textBetween ? null : "themeTextBetween_" + textBetween; + themeTextBetween[1] = null == textAbsBetween ? null : "themeTextAbsBetween_" + textAbsBetween; + int tokensTextBetweenLength = tokensTextBetween.size(); + //themeTextBetween[2] = "tokensTextBetweenLength_" + String.valueOf(tokensTextBetweenLength); } + featuresString.add(themeTextBetween);*/ + String triggerText = "text_".concat(triggerToken.getCoveredText() .toLowerCase()); String triggerLemma = "triggerLemma_".concat(triggerToken.getLemma() .toLowerCase()); - String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerToken - .getLemma() : "triggerSubLemma_".concat(triggerToken + triggerLemma = (null == triggerToken.getSubLemma() ? 
triggerToken + .getLemma() : "triggerLemma_".concat(triggerToken .getSubLemma().toLowerCase())); String triggerPos = "triggerPos_".concat(triggerToken.getPos()); String triggerPosShort = "triggerShortPos_".concat(triggerToken @@ -796,62 +1824,71 @@ protected Instance bindingEventToInstance(JCas jcas, Sentence sentence, // parser : dependency path between trigger-argument int i = 0; String[] dependencyPaths = new String[themeTokens.size()]; + //String[] pathLength = new String[themeTokens.size()]; String[] simplifiedFeaturePaths = new String[themeTokens.size()]; String[] triggerTextPaths = new String[themeTokens.size()]; - String[] triggerTextSimplifiedPaths = new String[themeTokens.size()]; - String[] triggerLemmaPaths = new String[themeTokens.size()]; - String[] triggerSubLemmaPaths = new String[themeTokens.size()]; + String[] triggerLemmaPaths = new String[themeTokens.size()]; + String[] triggerLemmaSimplifiedPaths = new String[themeTokens.size()]; String[] triggerPosPaths = new String[themeTokens.size()]; String[] triggerPosShortPaths = new String[themeTokens.size()]; String[] textBetweens = new String[themeTokens.size()]; - String[] triggerTextBetweens = new String[themeTokens.size()]; + //String[] textBetweenLength = new String[themeTokens.size()]; + //String[] triggerLemmaBetweens = new String[themeTokens.size()]; String[] textBetweenDependencies = new String[themeTokens.size()]; String[] textAbsBetweenDependencies = new String[themeTokens.size()]; - String[] textShortBetweens = new String[themeTokens.size()]; - String[] textShortBetweenDependencyPaths = new String[themeTokens - .size()]; + //String[] textShortBetweens = new String[themeTokens.size()]; + //String[] textShortBetweenDependencyPaths = new String[themeTokens.size()]; for (Token aThemeToken : themeTokens) { - String dependencyPath = dependencyExtractor.getShortestPath( + /*int triggerPathLength = dependencyExtractor.getDijkstraShortestPathLength( + triggerToken, aThemeToken); + pathLength[i] = "triggerPathLength_" + String.valueOf(triggerPathLength); + if (i==1 && pathLength[1].equals(pathLength[0])) { + pathLength[i] = pathLength[i] + "twice"; + }*/ + String featurePath = dependencyExtractor.getShortestPath( triggerToken, aThemeToken, Stage.BINDING); - String featurePath = dependencyPath; - - if (null == dependencyPath) { - featurePath = dependencyExtractor.getReversedShortestPath( - triggerToken, aThemeToken, Stage.BINDING); + boolean areSameTokens = (aThemeToken.getId() == triggerToken.getId()); + featurePath = areSameTokens ? "SAMETOKEN" : featurePath; + featurePath = (null == featurePath ? null : "featurePath_".concat(featurePath)); + if (null != featurePath) { + for (int m=0; m nmod // anything ending in subj --> subj // anything ending in subjpass --> subjpass - if (null != dependencyPath) { - simplifiedFeaturePath = dependencyExtractor - .getSimplifiedShortestPath(triggerToken, aThemeToken, Stage.BINDING); - } else { - simplifiedFeaturePath = dependencyExtractor - .getSimplifiedReversedShortestPath(triggerToken, - aThemeToken, Stage.BINDING); - } + + simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" : simplifiedFeaturePath; simplifiedFeaturePath = (null == simplifiedFeaturePath ? 
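The per-theme loops above (their bodies partly lost to angle-bracket stripping in this dump) appear to mark repeated feature values: when both themes produce the same path, text, or POS feature, the second occurrence gets a "twice" suffix, as the commented-out pathLength variant shows, so repeated evidence stays distinguishable from a single observation. A self-contained sketch of that marking, under the assumption that it applies to any per-theme string array:

    // Appends "twice" to a value that duplicates an earlier theme's value.
    static void markDuplicates(String[] perThemeValues) {
        for (int i = 1; i < perThemeValues.length; i++) {
            for (int m = 0; m < i; m++) {
                if (perThemeValues[i] != null
                        && perThemeValues[i].equals(perThemeValues[m])) {
                    perThemeValues[i] = perThemeValues[i] + "twice";
                }
            }
        }
    }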
null - : "dep_simple_".concat(simplifiedFeaturePath)); + : "simplifiedFeaturePath_".concat(simplifiedFeaturePath)); + if (null != simplifiedFeaturePath) { + for (int m=0; m tokensBetween = JCasUtil.selectCovered(jcas, - Token.class, sentence); - List proteinsBetween = JCasUtil.selectCovered(jcas, - Protein.class, sentence); int start = Math.min(aThemeToken.getBegin(), triggerToken.getBegin()); int end = Math.max(aThemeToken.getEnd(), triggerToken.getEnd()); @@ -924,19 +1957,38 @@ protected Instance bindingEventToInstance(JCas jcas, Sentence sentence, : textAbsBetween.concat("_").concat(aText); } } - + if (null != textBetween) { + for (int m=0; m iter = jcas.getAnnotationIndex( + Token.type).iterator(); + proteinTokens = new ArrayList(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() < protein.getBegin() + && token.getEnd() > protein.getBegin()) { + proteinTokens.add(token); + break; + } + } + } if (proteinTokens.size() == 0) { logger.warning("No token found for protein."); return null; } - return proteinTokens.get(proteinTokens.size() - 1); + Token token = proteinTokens.get(0); + for (Token aToken : proteinTokens) { + + try { + Double.parseDouble(aToken.getLemma()); + break; + } catch (NumberFormatException e) { + token = aToken; + } + } + return token; + } + + protected Token getToken(JCas jcas, Annotation annotation) { + + List tokens = JCasUtil.selectCovered(jcas, Token.class, + annotation); + // if protein/trigger is within a token + if (tokens.size() == 0) { + FSIterator iter = jcas.getAnnotationIndex( + Token.type).iterator(); + tokens = new ArrayList(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() <= annotation.getBegin() + && token.getEnd() >= annotation.getEnd()) { + tokens.add(token); + break; + } + } + } + if (tokens.size() == 0) { + FSIterator iter = jcas.getAnnotationIndex( + Token.type).iterator(); + tokens = new ArrayList(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() < annotation.getBegin() + && token.getEnd() > annotation.getBegin()) { + tokens.add(token); + break; + } + } + } + if (tokens.size() == 0) { + logger.warning("No token found for annotation."); + return null; + } + + Token token = tokens.get(0); + for (Token aToken : tokens) { + + try { + Double.parseDouble(aToken.getLemma()); + break; + } catch (NumberFormatException e) { + token = aToken; + } + } + return token; } } diff --git a/src/info/chenli/litway/bionlp13/ge/ArgumentInstances.java b/src/info/chenli/litway/bionlp13/ge/ArgumentInstances.java new file mode 100644 index 0000000..f2f3bd9 --- /dev/null +++ b/src/info/chenli/litway/bionlp13/ge/ArgumentInstances.java @@ -0,0 +1,280 @@ +package info.chenli.litway.bionlp13.ge; + +import info.chenli.classifier.Instance; +import info.chenli.classifier.InstanceDictionary; +import info.chenli.litway.corpora.Event; +import info.chenli.litway.corpora.POS; +import info.chenli.litway.corpora.Protein; +import info.chenli.litway.corpora.Sentence; +import info.chenli.litway.corpora.Token; +import info.chenli.litway.corpora.Trigger; +import info.chenli.litway.searn.StructuredInstance; +import info.chenli.litway.util.DependencyExtractor; +import info.chenli.litway.util.FileUtil; +import info.chenli.litway.util.StanfordDependencyReader; +import info.chenli.litway.util.StanfordDependencyReader.Pair; +import info.chenli.litway.util.UimaUtil; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import 
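The getTriggerToken/getToken helpers above resolve an annotation to a single representative token in widening passes: tokens covered by the annotation, then a token that fully contains it, then one merely overlapping its start; from the collected candidates they keep the last token before the first numeric lemma. That selection rule in isolation, over plain strings:

    import java.util.List;

    // Returns the last lemma before the first numeric one (sketch of the
    // try/Double.parseDouble loop above); assumes a non-empty list.
    static String pickRepresentative(List<String> lemmas) {
        String chosen = lemmas.get(0);
        for (String lemma : lemmas) {
            try {
                Double.parseDouble(lemma);
                break;             // stop at the first numeric lemma
            } catch (NumberFormatException e) {
                chosen = lemma;    // remember the latest non-numeric lemma
            }
        }
        return chosen;
    }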
java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.logging.Logger; + +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.uimafit.util.JCasUtil; + +public class ArgumentInstances extends AbstractInstances { + + private final static Logger logger = Logger.getLogger(ArgumentInstances.class + .getName()); + + public ArgumentInstances() { + super(new int[] { Protein.type, Event.type }); + + } + + @Override + protected List getLabelsString() { + + ArrayList argumentTypes = new ArrayList(); + + argumentTypes.add("Theme"); + argumentTypes.add("Cause"); + argumentTypes.add("Non_Argument"); + + return argumentTypes; + + } + + @Override + protected List getStructuredInstances(JCas jcas, + FSIterator tokenIter) { + + List results = new LinkedList(); + + AnnotationIndex sentenceIndex = jcas + .getAnnotationIndex(Sentence.type); + + FSIterator sentenceIter = sentenceIndex.iterator(); + Map> pairsOfArticle = new HashMap>(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + } else { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sd"))); + } + + /*String s = FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas));*/ + // Currently, one sentence is considered as one structured instance. + while (sentenceIter.hasNext()) { + + StructuredInstance si = new StructuredInstance(); + List argumentCandidates = new LinkedList(); + si.setNodes(argumentCandidates); + + Sentence sentence = (Sentence) sentenceIter.next(); + Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + + DependencyExtractor dependencyExtractor = new DependencyExtractor( + JCasUtil.selectCovered(jcas, Token.class, sentence), + pairsOfSentence); + + List events = JCasUtil.selectCovered(jcas, Event.class, + sentence); + List triggers= JCasUtil.selectCovered(jcas, Trigger.class, + sentence); + List proteins = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + if (proteins.size() < 1) { + continue; + } + if (triggers.size() < 1) { + continue; + } + Map> triggerEvevts = new TreeMap>(); + for (Trigger trigger : triggers) { + for (Event event : events) { + if (event.getTrigger().getBegin() == trigger.getBegin()) { + Token themeToken = getThemeToken(jcas, event, sentence); + if (null == themeToken) { + // There are cross sentence themes, which are not considered + // at the moment. + //logger.warning(fileName.concat(": An event must have a theme. 
It may be caused by cross-sentence event.")); + continue; + } + + Set triggerEvevt = new HashSet(); + if (triggerEvevts.containsKey(trigger.getId())) { + triggerEvevt = triggerEvevts.get(trigger.getId()); + } + triggerEvevt.add(event); + triggerEvevts.put(trigger.getId(), triggerEvevt); + } + } + } + + for (Trigger trigger : triggers) { + /*if (!EventType.isBindingEvent(trigger.getEventType())) { + continue; + }*/ + // check protein arguments + for (Protein protein : proteins) { + boolean isTheme = false, isCause = false; + if (triggerEvevts.containsKey(trigger.getId())) { + loop : for (Event event : triggerEvevts.get(trigger.getId())) { + for (int i = 0; i < event.getThemes().size(); i++) { + isTheme = event.getThemes(i).equals( + protein.getId()); + if (isTheme == true) { + break loop; + } + } + } + + if (!isTheme + && EventType.isComplexEvent(trigger.getEventType())) { + + for (Event event : triggerEvevts.get(trigger.getId())) { + if (null != event.getCause()) { + isCause = event.getCause().equals( + protein.getId()); + if (isCause == true) { + break; + } + } + } + } + } +/* Token triggerToken = getToken(jcas, trigger); + Token token = getToken(jcas, protein); + int pathLength = dependencyExtractor.getDijkstraShortestPathLength( + triggerToken, token); + int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId() + : triggerToken.getId() - token.getId(); + if (pathLength > 6) { + if (isTheme || isCause) { + System.out.println("error"); + } + //continue; + } + if (distance > 10) { + //notTheme.add(protein.getId()); + }*/ + + Instance instance = argumentToInstance(jcas, sentence, + protein, trigger, pairsOfSentence, + dependencyExtractor, isTheme, isCause, Stage.THEME); + if ( instance != null) { + argumentCandidates.add(instance); + } + } + + // check event arguments + if (EventType.isComplexEvent(trigger.getEventType())) { + for (Trigger argumentTrigger : triggers) { + if (argumentTrigger.getBegin() == trigger.getBegin()) { + continue; + } + + boolean isTheme =false, isCause =false; + if (triggerEvevts.containsKey(trigger.getId()) + && triggerEvevts.containsKey(argumentTrigger.getId())) { + if (EventType.isRegulatoryEvent(trigger.getEventType())) { + loop : for (Event event : triggerEvevts.get(trigger.getId())) { + for (Event themeEvent : triggerEvevts.get(argumentTrigger.getId())) { + if (event.getThemes(0).equalsIgnoreCase(themeEvent.getId())) { + isTheme = true; + break loop; + } + } + } + } + + if (!isTheme) { + loop : for (Event event : triggerEvevts.get(trigger.getId())) { + for (Event themeEvent : triggerEvevts.get(argumentTrigger.getId())) { + if (null != event.getCause() + && event.getCause().equalsIgnoreCase(themeEvent.getId())) { + isCause = true; + break loop; + } + } + } + } + } + + /*Token triggerToken = getToken(jcas, trigger); + Token token = getToken(jcas, argumentTrigger); + int pathLength = dependencyExtractor.getDijkstraShortestPathLength( + triggerToken, token); + int distance = token.getId() > triggerToken.getId() ? 
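The nested loops above derive gold training labels: a candidate is a Theme if any gold event anchored at the trigger lists its id among the themes, a Cause if (for complex event types only) a gold event's cause slot matches, and Non_Argument otherwise. The same decision, condensed into a sketch with an illustrative EventView record standing in for the UIMA Event type:

    import java.util.List;
    import java.util.Set;

    class GoldLabelSketch {
        record EventView(List<String> themeIds, String causeId) {}

        // Label derivation for one candidate against one trigger's gold events.
        static String labelFor(String argId, Set<EventView> goldEvents,
                boolean triggerIsComplex) {
            for (EventView e : goldEvents) {
                if (e.themeIds().contains(argId)) return "Theme";
            }
            if (triggerIsComplex) {
                for (EventView e : goldEvents) {
                    if (argId.equals(e.causeId())) return "Cause";
                }
            }
            return "Non_Argument";
        }
    }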
token.getId() - triggerToken.getId() + : triggerToken.getId() - token.getId(); + if (pathLength > 6) { + if (isTheme || isCause) { + System.out.println("error"); + } + //continue; + } + if (distance > 10) { + //notTheme.add(protein.getId()); + }*/ + + argumentCandidates.add(argumentToInstance(jcas, sentence, + argumentTrigger, trigger, pairsOfSentence, + dependencyExtractor, isTheme, isCause, Stage.THEME)); + } + } + } + results.add(si); + } + + return results; + } + + public static void main(String[] args) { + + ArgumentInstances ti = new ArgumentInstances(); + ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + + List instances = ti.getInstances(new File(args[0])); + + InstanceDictionary dict = new InstanceDictionary(); + dict.creatNumericDictionary(instances); + String classifierName = "liblinear"; + dict.saveDictionary(new File("./model/arguments.".concat(classifierName) + .concat(".dict"))); + + ti.saveInstances(new File("./model/instances.arguments.txt")); + ti.saveSvmLightInstances(new File("./model/instances.arguments.svm.txt")); + + if (args.length == 2 && args[1].equals("dev")) { + + ArgumentInstances testInstances = new ArgumentInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List tInstances = testInstances.getInstances(new File( + "./data/development/")); + + tInstances = dict.instancesToNumeric(tInstances); + + testInstances.saveInstances(new File( + "./model/instances.arguments.dev.txt")); + testInstances.saveSvmLightInstances(new File( + "./model/instances.arguments.svm.dev.txt")); + } + + } +} diff --git a/src/info/chenli/litway/bionlp13/ge/ArgumentRecogniser.java b/src/info/chenli/litway/bionlp13/ge/ArgumentRecogniser.java new file mode 100644 index 0000000..3b1c43e --- /dev/null +++ b/src/info/chenli/litway/bionlp13/ge/ArgumentRecogniser.java @@ -0,0 +1,245 @@ +package info.chenli.litway.bionlp13.ge; + +import info.chenli.classifier.Accurary; +import info.chenli.classifier.Instance; +import info.chenli.classifier.InstanceDictionary; +import info.chenli.classifier.LibLinearFacade; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.logging.Logger; + +import de.bwaldvogel.liblinear.Feature; +import de.bwaldvogel.liblinear.FeatureNode; +import de.bwaldvogel.liblinear.Linear; + +/** + * + * @author rqsong + * + */ +public class ArgumentRecogniser extends LibLinearFacade { + + private final static Logger logger = Logger.getLogger(ArgumentRecogniser.class + .getName()); + private final String classifierName = "liblinear"; + + public void train(File trainingSet, boolean useSearn) { + + if (useSearn) { + + } else { + + InstanceDictionary dict = new InstanceDictionary(); + + ArgumentInstances trainingInstances = new ArgumentInstances(); + trainingInstances + .setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = trainingInstances + .getInstances(trainingSet); + + dict.creatNumericDictionary(instances); + dict.saveDictionary(new File("./model/arguments.".concat( + classifierName).concat(".dict"))); + /* + trainingInstances.saveInstances(new File( + "./model/instances.theme.txt")); + trainingInstances.saveSvmLightInstances(new File( + "./model/instances.theme.svm.txt")); + */ + train(dict.instancesToNumeric(instances)); + saveModel(new File("./model/arguments.".concat(classifierName) + .concat(".model"))); + // System.out.println(accuracy(instances)); + + + + // System.out.println(accuracy(instances)); + } + + } + + public void train2(File trainingSet, boolean useSearn) { + + if 
(useSearn) { + + } else { + + InstanceDictionary dict = new InstanceDictionary(); + + ArgumentInstances trainingInstances = new ArgumentInstances(); + trainingInstances + .setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = trainingInstances + .getInstances(trainingSet); + + dict.creatNumericDictionary(instances); + dict.saveDictionary(new File("./model/arguments.train.devel.".concat( + classifierName).concat(".dict"))); + /* + trainingInstances.saveInstances(new File( + "./model/instances.theme.txt")); + trainingInstances.saveSvmLightInstances(new File( + "./model/instances.theme.svm.txt")); + */ + train(dict.instancesToNumeric(instances)); + saveModel(new File("./model/arguments.train.devel.".concat(classifierName) + .concat(".model"))); + // System.out.println(accuracy(instances)); + + + + // System.out.println(accuracy(instances)); + } + + } + + public static void main(String[] args) { + + ArgumentRecogniser tr = new ArgumentRecogniser(); + //tr.train2(new File("/media/songrq/soft/litway/数据/BioNLP13/" + // + "BioNLP-ST-2013_GE_train_devel_data_yuanShuJu"), false); + + tr.train2(new File(args[0]), false); + + //tr.train(new File("/media/songrq/soft/litway/数据/BioNLP13/" + // + "BioNLP-ST-2013_GE_train_data_yuanShuJu"), false); + + /*tr.train(new File("/media/songrq/soft/litway/数据/BioNLP11/" + + "BioNLP-ST-2011-2013_GE_train_data"), false); + tr.test(new File("/media/songrq/soft/litway/数据/BioNLP13/" + + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));*/ + + //tr.train(new File("/media/songrq/soft/litway/数据/BioNLP11/" + // + "b"), false); + + //tr.test(new File("/media/songrq/soft/litway/数据/BioNLP13/" + // + "BioNLP-ST-2013_GE_devel_data_yuanShuJu")); + + tr.train2(new File("/media/songrq/soft/litway/数据/BioNLP11/" + + "BioNLP-ST-2011-2013_GE_train_devel_data"), false); + + + + /* + tr.loadModel(new File("./model/themes.liblinear.model".concat(tr.classifierName) + .concat(".model"))); + + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/themes." + .concat(tr.classifierName).concat(".dict"))); + + ThemeInstances ti = new ThemeInstances(); + ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + + List instances = ti.getInstances(new File(args[0])); + + instances = dict.instancesToNumeric(instances); + + int total = 0, correct = 0; + for (Instance instance : instances) { + int prediction = tr.predict(instance); + System.out.print(instance.getLabel() + ":" + prediction); + for (String[] values : instance.getFeaturesString()) { + for (String value : values) { + System.out.print("\t" + value); + } + } + System.out.println(); + for (int value : instance.getFeaturesNumeric()) { + System.out.print("\t" + value); + } + System.out.println(); + if (prediction == instance.getLabel()) { + correct++; + } + total++; + } + System.out.println(new Accurary(correct, total)); + */ + } + + private void test(File file) { + // TODO Auto-generated method stub + ArgumentInstances testInstances = new ArgumentInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = testInstances.getInstances(file); + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/arguments." 
+ .concat(classifierName).concat(".dict"))); + this.loadModel(new File("./model/arguments.".concat( + classifierName).concat(".model"))); + instances = dict.instancesToNumeric(instances); + testInstances.saveSvmLightInstances(new File( + "./model/instances.arguments.svm.dev.txt")); + int total = 0, correct = 0, tp, tn = 0, n = 0, fn, fp; + float p, r, f; + + for (Instance instance : instances) { + int prediction = predict(instance); + if (prediction == instance.getLabel()) { + if (instance.getLabelString().equalsIgnoreCase("Non_Argument")){ + tn++; + }//else if (instance.getFileId().equals("Binding")){ + //System.out.println("TP " + instance.getFileId() + " " + instance.getLabelString() + " " + instance.getId()); + //} + correct++; + }//else if (!instance.getLabelString().equalsIgnoreCase("Non_Argument") + // && prediction == dict.getLabelNumeric("Non_Argument") + // && instance.getFileId().equals("Binding")) { + //System.out.println("FN " + instance.getFileId() + " " + instance.getId()); + //} + + if (instance.getLabelString().equalsIgnoreCase("Non_Argument")){ + n++; + } + total++; + } + + fp = n - tn; + tp = correct - tn; + fn = total - n - tp; + p = (float) tp / (tp + fp); + r = (float) tp / (tp + fn); + f = (float) 2 * p * r / (p + r); + + System.out.println(new Accurary(correct, total)); + System.out.println("tp: " + tp + " fp: " + fp + " fn: " + fn); + System.out.println("p: " + p + " r: " + r + " f: " + f); + } + + public double predict_values(int[] featureSparseVector) { + + if (featureSparseVector == null) { + throw new IllegalArgumentException( + "Empty sparse vector. This probably due to that the dictionary hasn't converted instances to numeric features yet."); + } + + int n; + int nr_feature = this.model.getNrFeature(); + if (this.model.getBias() >= 0) { + n = nr_feature + 1; + } else { + n = nr_feature; + } + + List featureNodes = new ArrayList(); + int previousIndex = 0; + for (int index : featureSparseVector) { + if (index > previousIndex) { + featureNodes.add(new FeatureNode(index, 1)); + } + previousIndex = index; + } + if (model.getBias() >= 0) { + Feature node = new FeatureNode(n, model.getBias()); + featureNodes.add(node); + } + Feature[] instance = new FeatureNode[featureNodes.size()]; + instance = featureNodes.toArray(instance); + double[] dec_values = new double[this.model.getNrClass()]; + int type = (int) Math.round(Linear.predictValues(this.model, instance, dec_values)); + return dec_values[type]; + } +} diff --git a/src/info/chenli/litway/bionlp13/ge/BindingInstances.java b/src/info/chenli/litway/bionlp13/ge/BindingInstances.java index 5491ec8..fb998fe 100644 --- a/src/info/chenli/litway/bionlp13/ge/BindingInstances.java +++ b/src/info/chenli/litway/bionlp13/ge/BindingInstances.java @@ -18,14 +18,18 @@ import java.io.File; import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeMap; import org.apache.uima.cas.FSIterator; import org.apache.uima.cas.text.AnnotationIndex; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; import org.apache.uima.jcas.tcas.Annotation; import org.uimafit.util.JCasUtil; @@ -46,20 +50,38 @@ protected List getLabelsString() { @Override protected List getStructuredInstances(JCas jcas, FSIterator annoIter) { - + final String classifierName = "liblinear"; + + boolean test = true; + ArgumentRecogniser argumentRecogniser = new ArgumentRecogniser(); + 
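predict_values above converts the dictionary's sparse indices into liblinear FeatureNode objects with value 1 (skipping repeated indices, since liblinear requires strictly increasing ones), appends the bias node when the model uses one, and returns the decision value of the predicted class. A usage-style restatement against the de.bwaldvogel.liblinear API, with the same caveat as the original that class labels are assumed to be 0..nrClass-1:

    import de.bwaldvogel.liblinear.*;
    import java.util.ArrayList;
    import java.util.List;

    // Binary sparse vector -> decision value of the predicted class (sketch).
    static double decisionValue(Model model, int[] sortedIndices) {
        int n = model.getNrFeature() + (model.getBias() >= 0 ? 1 : 0);
        List<Feature> nodes = new ArrayList<Feature>();
        int previous = 0;
        for (int index : sortedIndices) {
            if (index > previous) {
                nodes.add(new FeatureNode(index, 1)); // binary feature
            }
            previous = index;
        }
        if (model.getBias() >= 0) {
            nodes.add(new FeatureNode(n, model.getBias()));
        }
        double[] dec = new double[model.getNrClass()];
        double label = Linear.predictValues(model, nodes.toArray(new Feature[0]), dec);
        return dec[(int) Math.round(label)]; // assumes labels 0..nrClass-1
    }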
argumentRecogniser.loadModel(new File("./model/arguments.".concat( + classifierName).concat(".model"))); + InstanceDictionary argumentDict = new InstanceDictionary(); + argumentDict.loadDictionary(new File("./model/arguments.".concat( + classifierName).concat(".dict"))); + List results = new LinkedList(); AnnotationIndex sentenceIndex = jcas .getAnnotationIndex(Sentence.type); FSIterator sentenceIter = sentenceIndex.iterator(); - Map> pairsOfArticle = StanfordDependencyReader - .getPairs(new File(FileUtil.removeFileNameExtension( - UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + Map> pairsOfArticle = new HashMap>(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + } else { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sd"))); + } + int sentenceid = 0; // Currently, one sentence is considered as one structured instance. while (sentenceIter.hasNext()) { - + sentenceid++; StructuredInstance si = new StructuredInstance(); List bindingEventCandidates = new LinkedList(); si.setNodes(bindingEventCandidates); @@ -67,29 +89,196 @@ protected List getStructuredInstances(JCas jcas, Sentence sentence = (Sentence) sentenceIter.next(); Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + List tokens = JCasUtil.selectCovered(jcas, Token.class, sentence); DependencyExtractor dependencyExtractor = new DependencyExtractor( - JCasUtil.selectCovered(jcas, Token.class, sentence), - pairsOfSentence); + tokens, pairsOfSentence); List events = JCasUtil.selectCovered(jcas, Event.class, sentence); + List triggers= JCasUtil.selectCovered(jcas, Trigger.class, + sentence); List proteins = JCasUtil.selectCovered(jcas, Protein.class, sentence); - - for (Event event : events) { - - if (event.getTrigger().getEventType() - .equals(String.valueOf(EventType.Binding))) { - - Combinations combs = new Combinations( - proteins); - for (List themes : combs.getCombinations()) { - bindingEventCandidates.add(bindingEventToInstance(jcas, - sentence, event, themes, dependencyExtractor)); + if (proteins.size() < 1) { + continue; + } + if (triggers.size() < 1) { + continue; + } + //binding trigger's event + Map> triggerEvevts = new TreeMap>(); + for (Trigger trigger : triggers) { + + if (!trigger.getEventType().equals(String.valueOf(EventType.Binding))) { + continue; + } + for (Event event : events) { + if (event.getTrigger().getBegin() == trigger.getBegin()) { + int themeToken = getThemeToken2(jcas, event, sentence); + if (event.getThemes().size() != themeToken) { + // There are cross sentence themes, which are not considered at the moment. 
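Gold binding events are only usable for training when every theme can be grounded in the current sentence; getThemeToken2 (defined near the end of this file) counts the themes that match an in-sentence protein, and any event whose theme count differs is skipped as a cross-sentence event. Sketch of the check:

    import java.util.List;
    import java.util.Set;

    // True if every gold theme id matches a protein found in this sentence.
    static boolean allThemesInSentence(List<String> eventThemeIds,
            Set<String> sentenceProteinIds) {
        for (String id : eventThemeIds) {
            if (!sentenceProteinIds.contains(id)) {
                return false; // cross-sentence theme: drop this event
            }
        }
        return true;
    }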
+ continue; + } + + Set triggerEvevt = new HashSet(); + if (triggerEvevts.containsKey(trigger.getId())) { + triggerEvevt = triggerEvevts.get(trigger.getId()); + } + triggerEvevt.add(event); + triggerEvevts.put(trigger.getId(), triggerEvevt); } } } + String[] bindingLemma = {"assembly", "recruitment", "ligand", "interact", "association", + "ligation", "binding", "interaction", "recover", "recognize", "bind", "recruit", + "dna-binding", "complex", "form", "immunoprecipitate", "heteromultimer"}; + //proteins that relation is and + Set andProtein = getAndProtein(jcas, proteins, dependencyExtractor); + //extract instances + for (Trigger trigger : triggers) { + //int triggerEventSize = 0; + if (!trigger.getEventType().equals(String.valueOf(EventType.Binding))) { + continue; + } + /*boolean bind =false; + Token token = getTriggerToken(jcas, trigger); + for (int i=0; i farProtein = getFarProtein( jcas, trigger, proteins, dependencyExtractor); + Set notTheme = getNotProtein( jcas, trigger, proteins, dependencyExtractor); + + for (Protein protein : proteins) { + Instance proteinInstance = argumentToInstance(jcas, + sentence, protein, trigger, pairsOfSentence, + dependencyExtractor, false, false, Stage.THEME); + if ( proteinInstance != null) { + double prediction = argumentRecogniser.predict(argumentDict + .instanceToNumeric(proteinInstance) + .getFeaturesNumeric(), proteinInstance); + if (prediction != argumentDict.getLabelNumeric("Theme")) { + notTheme.add(protein.getId()); + } + } + } + Set triggerEvevt = triggerEvevts.get(trigger.getId()); + List themeProteins = new LinkedList(); + for (Protein protein : proteins) { + if (!notTheme.contains(protein.getId()) && !farProtein.contains(protein.getId())) { + themeProteins.add(protein); + } + } + Combinations combs = new Combinations( + themeProteins); + + loop2 : for (List themes : combs.getCombinations()) { + boolean truepositive = false; + int equalNum = 0; + if (triggerEvevts.containsKey(trigger.getId())) { + loop : for (Event bindingEvent : triggerEvevt) { + equalNum = 0; + if (null != bindingEvent.getThemes() + && themes.size() == bindingEvent.getThemes().size()) { + for (Protein protein : themes) { + boolean foundTheProtein = false; + for (int i = 0; i < bindingEvent.getThemes().size(); i++) { + if (protein.getId().equals(bindingEvent.getThemes(i))) { + equalNum++; + if (equalNum == themes.size()) { + truepositive = true; + break loop; + } + foundTheProtein = true; + break; + } + } + if (foundTheProtein == false) { + break; + } + } + } + } + } + /*if (truepositive) { + triggerEventSize++; + } */ + + if (themes.size() > 2) { + /*if (truepositive) { + System.out.println("error"); + } */ + continue; + } + + for (Protein p : themes) { + if (test && farProtein.contains(p.getId())) { + /*if (truepositive) { + System.out.println("farerror"); + } */ + continue loop2; + } + if (test && notTheme.contains(p.getId())) { + /*if (truepositive) { + System.out.println("noterror"); + } */ + continue loop2; + } + } + int num = 0; + if (themes.size() > 1) { + for (Protein p : themes) { + if (andProtein.contains(p.getId())) { + num++; + } + } + for(Protein p : themes) { + for(Protein p2 : themes) { + if (p.getId().equalsIgnoreCase(p2.getId())) { + continue; + } + if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText()) + || p.getCoveredText().contains(p2.getCoveredText()) + || p2.getCoveredText().contains(p.getCoveredText())) { + /*if (truepositive) { + System.out.println("sameerror"); + } else { + System.out.println("same"); + }*/ + continue loop2; + } + } + } 
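Before enumerating theme combinations, the pool of proteins is pruned three ways: mentions the argument classifier scores as non-Theme, mentions whose Dijkstra dependency-path length to the trigger exceeds 6 (getNotProtein), and the farther of two same-named mentions (getFarProtein); on top of that, combinations with more than two themes, duplicate-named theme pairs, and pairs containing more than one coordinated ("and") protein are skipped. The surviving pool, as a sketch over the surrounding types:

    // Keep only proteins that pass both the classifier and distance filters.
    static List<Protein> candidateThemes(List<Protein> proteins,
            Set<String> notTheme, Set<String> farProtein) {
        List<Protein> kept = new ArrayList<Protein>();
        for (Protein p : proteins) {
            if (!notTheme.contains(p.getId()) && !farProtein.contains(p.getId())) {
                kept.add(p);
            }
        }
        return kept;
    }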
+        }
+
+        if (test && num > 1) {
+          /*if (truepositive) {
+            System.out.println("anderror");
+          }*/
+          continue;
+        }
+
+        Instance instance = bindingEventToInstance(jcas,
+            sentence, trigger, themes, dependencyExtractor, truepositive);
+        instance.setSentenceId(sentenceid);
+        instance.setFileId(trigger.getCoveredText() + "\t");
+        for (Protein p : themes) {
+          instance.setFileId(instance.getFileId() + p.getCoveredText() + "\t");
+        }
+        bindingEventCandidates.add(instance);
+      }
+      //System.out.println(triggerEventSize);
+    }
 			results.add(si);
 		}
 
@@ -101,17 +290,18 @@ public static void main(String[] args) {
 
 		BindingInstances ti = new BindingInstances();
 		ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
 
-		List<Instance> instances = ti.getInstances(new File(args[0]));
+		List<Instance> instances = ti.getInstances(
+				new File("/media/songrq/soft/litway/数据/BioNLP13/b"));
 
-		InstanceDictionary dict = new InstanceDictionary();
+/*		InstanceDictionary dict = new InstanceDictionary();
 		dict.creatNumericDictionary(instances);
 		String classifierName = "liblinear";
 
 		ti.saveInstances(new File("./model/instances.binding.txt"));
 		ti.saveSvmLightInstances(new File(
 				"./model/instances.binding.svm.no_dum.txt"));
-
-		if (args.length == 2 && args[1].equals("dev")) {
+*/
+		/*if (args.length == 2 && args[1].equals("dev")) {
 
 			dict.saveDictionary(new File("./model/binding.".concat(
 					classifierName).concat(".dict")));
@@ -125,6 +315,150 @@ public static void main(String[] args) {
 			ti.saveInstances(new File("./model/instances.binding.dev.txt"));
 			testInstances.saveSvmLightInstances(new File(
 					"./model/instances.binding.svm.dev.no_dum.txt"));
+		}*/
+	}
+
+	protected int getThemeToken2(JCas jcas, Event event, Sentence sentence) {
+
+		int tokenNum = 0;
+		StringArray themes = event.getThemes();
+
+		for (Protein protein : JCasUtil.selectCovered(jcas, Protein.class,
+				sentence)) {
+			for (int i = 0; i < themes.size(); i++) {
+				if (themes.get(i).equals(protein.getId())) {
+					tokenNum = getToken(jcas, protein).getId();
+				}
+			}
+		}
+		return tokenNum;
+	}
+
+	protected Set<String> getNotProtein(JCas jcas, Trigger trigger, List<Protein> sentenceProteins, DependencyExtractor dependencyExtractor) {
+
+		// exclude proteins whose token is too far from the trigger token
+		Set<String> notTheme = new HashSet<String>();
+		for (Protein protein : sentenceProteins) {
+			Token triggerToken = getTriggerToken(jcas, trigger);
+			Token token = getToken(jcas, protein);
+			int pathLength = dependencyExtractor.getDijkstraShortestPathLength(
+					triggerToken, token);
+			//int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId()
+			//		: triggerToken.getId() - token.getId();
+			if (pathLength > 6) {
+				notTheme.add(protein.getId());
+			}
+			/*if (distance > 10) {
+				notTheme.add(protein.getId());
+			}*/
+		}
+		return notTheme;
+	}
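+
+	// Usage sketch for the path-length filter above (illustrative only; the
+	// names tokens and pairsOfSentence are assumptions taken from the call
+	// sites, not fields of this class):
+	//
+	//   DependencyExtractor extractor =
+	//       new DependencyExtractor(tokens, pairsOfSentence);
+	//   Set<String> excluded =
+	//       getNotProtein(jcas, trigger, sentenceProteins, extractor);
+	//   // any protein more than 6 dependency arcs from the trigger is excluded
+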
+	protected Set<String> getFarProtein(JCas jcas, Trigger trigger, List<Protein> sentenceProteins, DependencyExtractor dependencyExtractor) {
+
+		// among several mentions of the same protein, drop the ones that are
+		// farther from the trigger
+		Set<String> farProtein = new HashSet<String>();
+		if (sentenceProteins.size() > 1) {
+			for (Protein p : sentenceProteins) {
+				for (Protein p2 : sentenceProteins) {
+					if (p.getId().equalsIgnoreCase(p2.getId())) {
+						continue;
+					}
+					if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText())
+							|| p.getCoveredText().contains(p2.getCoveredText())
+							|| p2.getCoveredText().contains(p.getCoveredText())
+							) {
+
+						Token triggerToken = getTriggerToken(jcas, trigger);
+						Token token = getToken(jcas, p);
+						Token token2 = getToken(jcas, p2);
+
+						int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId()
+								: triggerToken.getId() - token.getId();
+						int distance2 = token2.getId() > triggerToken.getId() ? token2.getId() - triggerToken.getId()
+								: triggerToken.getId() - token2.getId();
+						int pathLength = dependencyExtractor.getDijkstraShortestPathLength(
+								triggerToken, token);
+						int pathLength2 = dependencyExtractor.getDijkstraShortestPathLength(
+								triggerToken, token2);
+						if (pathLength > pathLength2) {
+							farProtein.add(p.getId());
+						} else if (pathLength < pathLength2) {
+							farProtein.add(p2.getId());
+						} else if (pathLength == pathLength2 && distance > distance2) {
+							farProtein.add(p.getId());
+						} else if (pathLength == pathLength2 && distance < distance2) {
+							farProtein.add(p2.getId());
+						}
+					}
+				}
+			}
+		}
+		return farProtein;
+	}
+
+	protected Set<String> getAndProtein(JCas jcas, List<Protein> sentenceProteins, DependencyExtractor dependencyExtractor) {
+
+		Set<String> andProtein = new HashSet<String>();
+		if (sentenceProteins.size() > 1) {
+			for (Protein protein : sentenceProteins) {
+				for (Protein protein2 : sentenceProteins) {
+					if (protein.getId().equalsIgnoreCase(protein2.getId())) {
+						continue;
+					}
+					Token token2 = getToken(jcas, protein2);
+					Token token = getToken(jcas, protein);
+					String dependencyPath = dependencyExtractor.getShortestPath(
+							token, token2, Stage.BINDING);
+					if (dependencyPath != null
+							&& (dependencyPath.equalsIgnoreCase("conj_and")
+							|| dependencyPath.equalsIgnoreCase("-conj_and")
+							|| dependencyPath.equalsIgnoreCase("conj_or")
+							|| dependencyPath.equalsIgnoreCase("-conj_or")
+							|| dependencyPath.equalsIgnoreCase("abbrev") // abbreviation, i.e. equivalent mentions
+							|| dependencyPath.equalsIgnoreCase("-abbrev")
+							//|| dependencyPath.equalsIgnoreCase("appos")
+							//|| dependencyPath.equalsIgnoreCase("-appos")
+							)) {
+						andProtein.add(protein.getId());
+						andProtein.add(protein2.getId());
+					}
+					/*if (token2.getId() == token.getId()) {
+						//andProtein.add(protein.getId());
+						//andProtein.add(protein2.getId());
+					}*/
+					/*List<Token> between = new LinkedList<Token>();
+					for (Token token : tokens) {
+						if (protein2.getBegin() >= protein.getEnd()) {
+							if (token.getBegin() >= protein.getEnd()
+									&& token.getEnd() <= protein2.getBegin()) {
+								between.add(token);
+							}
+						} else if (protein.getBegin() >= protein2.getEnd()) {
+							if (token.getBegin() >= protein2.getEnd()
+									&& token.getEnd() <= protein.getBegin()) {
+								between.add(token);
+							}
+						}
+					}
+					boolean isAnd = true;
+					for (Token token : between) {
+						if (!token.getCoveredText().equalsIgnoreCase(",")
+								&& !token.getCoveredText().equalsIgnoreCase("and")
+								&& !token.getCoveredText().equalsIgnoreCase("or")
+								) {
+							isAnd = false;
+							break;
+						}
+					}
+					if (isAnd) {
+						andProtein.add(protein.getId());
+						andProtein.add(protein2.getId());
+					}*/
+				}
+			}
+		}
+		return andProtein;
+	}
+}
diff --git a/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java b/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java
index 9cfd27a..50085d2 100644
--- a/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java
+++ b/src/info/chenli/litway/bionlp13/ge/BindingRecogniser.java
@@ -1,15 +1,21 @@
 package info.chenli.litway.bionlp13.ge;
 
+import info.chenli.classifier.Accurary;
 import info.chenli.classifier.Instance;
 import info.chenli.classifier.InstanceDictionary;
 import info.chenli.classifier.LibLinearFacade;
 import info.chenli.litway.util.Timer;
 
 import java.io.File;
+import java.util.ArrayList;
 import java.util.Collections;
 import java.util.List;
 import java.util.logging.Logger;
 
+import de.bwaldvogel.liblinear.Feature;
+import de.bwaldvogel.liblinear.FeatureNode;
+import de.bwaldvogel.liblinear.Linear;
+
 public class BindingRecogniser extends LibLinearFacade {
 
 	private final static Logger logger = Logger
@@ -34,10 +40,10 @@ public void
train(String trainingDir, int round) { logger.info("Save dictionary."); // save instances - trainingInstances.saveInstances(new File( + /*trainingInstances.saveInstances(new File( "./model/instances.binding.txt")); trainingInstances.saveSvmLightInstances(new File( - "./model/instances.binding.svm.txt")); + "./model/instances.binding.svm.txt"));*/ // shuffle Collections.shuffle(instances); @@ -56,9 +62,152 @@ public void train(String trainingDir, int round) { } + public void train2(String trainingDir, int round) { + // + // collect all instances and fetch syntactical information + // + BindingInstances trainingInstances = new BindingInstances(); + trainingInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = trainingInstances.getInstances(new File( + trainingDir)); + logger.info(String.valueOf(instances.size()).concat( + " instances are collected.")); + + InstanceDictionary dict = new InstanceDictionary(); + dict.creatNumericDictionary(instances); + dict.saveDictionary(new File("./model/bindings.train.devel.".concat(classifierName) + .concat(".dict"))); + logger.info("Save dictionary."); + + // save instances +/* trainingInstances.saveInstances(new File( + "./model/instances.binding.txt")); + trainingInstances.saveSvmLightInstances(new File( + "./model/instances.binding.svm.txt")); +*/ + // shuffle + Collections.shuffle(instances); + logger.info("Shuffle instances."); + + Timer timer = new Timer(); + timer.start(); + + train(instances, round); + timer.stop(); + logger.info("Training takes ".concat(String.valueOf(timer + .getRunningTime()))); + + saveModel(new File("./model/bindings.train.devel.".concat(classifierName).concat( + ".model"))); + + } + + private void test(File file) { + // TODO Auto-generated method stub + BindingInstances testInstances = new BindingInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = testInstances.getInstances(file); + logger.info(String.valueOf(instances.size()).concat( + " instances are collected.")); + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/bindings." 
+ .concat(classifierName).concat(".dict"))); + this.loadModel(new File("./model/bindings.".concat( + classifierName).concat(".model"))); + instances = dict.instancesToNumeric(instances); + testInstances.saveSvmLightInstances(new File( + "./model/instances.bindings.svm.dev.txt")); + int total = 0, correct = 0, tp = 0, tn = 0, fn, fp, pp = 0; + float p, r, f; + + for (Instance instance : instances) { + int prediction = predict(instance); + if (prediction == instance.getLabel()) { + if (instance.getLabelString().equalsIgnoreCase("Binding")){ + tp++; + } + correct++; + }else if (prediction != instance.getLabel() + && instance.getLabelString().equalsIgnoreCase("Non_binding")) { + //System.out.print(instance.getSentenceId()); + //System.out.println("\t" + "fp" + "\t" + instance.getFileId()); + }else if (prediction != instance.getLabel() + && instance.getLabelString().equalsIgnoreCase("Binding")) { + //System.out.print(instance.getSentenceId()); + //System.out.println("\t" + "fn" + "\t" + instance.getFileId()); + } + + if (instance.getLabelString().equalsIgnoreCase("Binding")){ + pp++; + } + total++; + } + + fn = pp - tp; + tn = correct - tp; + fp = total - pp - tn; + p = (float) tp / (tp + fp); + r = (float) tp / (tp + fn); + f = (float) 2 * p * r / (p + r); + + System.out.println(new Accurary(correct, total)); + System.out.println("tp: " + tp + " fp: " + fp + " fn: " + fn); + System.out.println("p: " + p + " r: " + r + " f: " + f); + } public static void main(String[] args) { BindingRecogniser br = new BindingRecogniser(); - br.train(args[0], 1); + //br.train2("/media/songrq/soft/litway/数据/BioNLP13/" + // + "BioNLP-ST-2013_GE_train_devel_data_yuanShuJu", 1); + + br.train2(args[0], 1); + + /*br.train("/media/songrq/soft/litway/数据/BioNLP13/" + + "BioNLP-ST-2013_GE_train_data_yuanShuJu", 1); + br.test(new File("/media/songrq/soft/litway/数据/BioNLP13/" + + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));*/ + + /*br.train("/media/songrq/soft/litway/数据/BioNLP11/" + + "BioNLP-ST-2011-2013_GE_train_data", 1); + br.test(new File("/media/songrq/soft/litway/数据/BioNLP13/" + + "BioNLP-ST-2013_GE_devel_data_yuanShuJu"));*/ + + //br.train2("/media/songrq/soft/litway/数据/BioNLP11/" + // + "BioNLP-ST-2011-2013_GE_train_devel_data", 1); + } + + + public double predict_values(int[] featureSparseVector) { + + if (featureSparseVector == null) { + throw new IllegalArgumentException( + "Empty sparse vector. 
This is probably because the dictionary hasn't converted the instances to numeric features yet.");
+		}
+
+		int n;
+		int nr_feature = this.model.getNrFeature();
+		if (this.model.getBias() >= 0) {
+			n = nr_feature + 1;
+		} else {
+			n = nr_feature;
+		}
+
+		List<Feature> featureNodes = new ArrayList<Feature>();
+		int previousIndex = 0;
+		// feature indices are assumed to be strictly increasing;
+		// non-increasing entries (including leading zeros) are skipped
+		for (int index : featureSparseVector) {
+			if (index > previousIndex) {
+				featureNodes.add(new FeatureNode(index, 1));
+			}
+			previousIndex = index;
+		}
+		if (model.getBias() >= 0) {
+			Feature node = new FeatureNode(n, model.getBias());
+			featureNodes.add(node);
+		}
+		Feature[] instance = new FeatureNode[featureNodes.size()];
+		instance = featureNodes.toArray(instance);
+		double[] dec_values = new double[this.model.getNrClass()];
+		// NOTE: indexing dec_values with the predicted label assumes the
+		// model's class labels are 0..nrClass-1
+		int type = (int) Math.round(Linear.predictValues(this.model, instance, dec_values));
+		return dec_values[type];
+	}
 }
diff --git a/src/info/chenli/litway/bionlp13/ge/CauseInstances.java b/src/info/chenli/litway/bionlp13/ge/CauseInstances.java
index 9b8bcd1..2efea54 100644
--- a/src/info/chenli/litway/bionlp13/ge/CauseInstances.java
+++ b/src/info/chenli/litway/bionlp13/ge/CauseInstances.java
@@ -102,10 +102,13 @@ protected List getStructuredInstances(JCas jcas,
 
 					boolean isCause = event.getCause() == null ? false : event
 							.getCause().equals(protein.getId());
-
-					causeCandidates.add(causeToInstance(jcas, sentence,
+					Instance instance = causeToInstance(jcas, sentence,
 							protein, event.getTrigger(), pairsOfSentence,
-							dependencyExtractor, isCause, themeToken));
+							dependencyExtractor, isCause, themeToken);
+
+					if (instance != null) {
+						causeCandidates.add(instance);
+					}
 				}
 
 				// check event causes
diff --git a/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java b/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java
index d0ed467..85cb0a2 100644
--- a/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java
+++ b/src/info/chenli/litway/bionlp13/ge/CauseRecogniser.java
@@ -34,7 +34,7 @@ public void train(File trainingSet, boolean useSearn) {
 				.getInstances(trainingSet);
 
 		dict.creatNumericDictionary(instances);
-		dict.saveDictionary(new File("./model/causes.dict"));
+		dict.saveDictionary(new File("./model/causes.liblinear.dict"));
 
 		this.train(instances);
diff --git a/src/info/chenli/litway/bionlp13/ge/Classify.java b/src/info/chenli/litway/bionlp13/ge/Classify.java
new file mode 100644
index 0000000..6daa424
--- /dev/null
+++ b/src/info/chenli/litway/bionlp13/ge/Classify.java
@@ -0,0 +1,49 @@
+package info.chenli.litway.bionlp13.ge;
+
+import info.chenli.classifier.Instance;
+import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.UimaUtil;
+
+import java.io.*;
+import java.util.List;
+
+public class Classify {
+
+	public static void main(String[] args) {
+		File trainFile = new File("./model/instances.trigger.txt");
+		File develFile = new File("./model/instances.trigger.dev.txt");
+		List<Instance> instances;
+		Instance instance;
+		try {
+			InputStreamReader trainFileStream = new InputStreamReader(
+					new FileInputStream(trainFile), "UTF8");
+			BufferedReader trainFileBuffer = new BufferedReader(trainFileStream);
+
+			InputStreamReader develFileStream = new InputStreamReader(
+					new FileInputStream(develFile), "UTF8");
+			BufferedReader develFileBuffer = new BufferedReader(develFileStream);
+
+			String trainFileCh;
+			while ((trainFileCh = trainFileBuffer.readLine()) != null) {
+				String[] trainInstance = trainFileCh.split("\t");
+				//instance.setLabel(Integer.parseInt(trainInstance[0]));
+				//instance.getFeaturesNumeric();
+				//instances.add(instance);
+
+			}
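+			// The readers are closed manually below; on Java 7+ the idiomatic
+			// equivalent would be a try-with-resources statement, e.g.:
+			//   try (BufferedReader in = new BufferedReader(new InputStreamReader(
+			//           new FileInputStream(trainFile), "UTF8"))) { ... }
+			// which closes the stream automatically even on exceptions.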
trainFileBuffer.close(); + trainFileStream.close(); + develFileStream.close(); + develFileBuffer.close(); + + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } catch (IOException e) { + e.printStackTrace(); + } + + } + +} diff --git a/src/info/chenli/litway/bionlp13/ge/EventExtractor.java b/src/info/chenli/litway/bionlp13/ge/EventExtractor.java index 872ca24..aa4e726 100644 --- a/src/info/chenli/litway/bionlp13/ge/EventExtractor.java +++ b/src/info/chenli/litway/bionlp13/ge/EventExtractor.java @@ -3,7 +3,6 @@ import info.chenli.classifier.Instance; import info.chenli.classifier.InstanceDictionary; import info.chenli.litway.corpora.Event; -import info.chenli.litway.corpora.POS; import info.chenli.litway.corpora.Protein; import info.chenli.litway.corpora.Sentence; import info.chenli.litway.corpora.Token; @@ -92,6 +91,10 @@ public String extractFromSingleFile(File file) { Map> pairsOfArticle = StanfordDependencyReader .getPairs(new File(FileUtil.removeFileNameExtension( file.getAbsolutePath()).concat(".sdepcc"))); + + File word2vecFile = new File("/home/songrq/word2vec/data/word2vec100"); + Map word2vec = ReadWord2vec.word2vec(word2vecFile); + // // Initialize the classifiers // @@ -136,15 +139,18 @@ public String extractFromSingleFile(File file) { while (sentenceIter.hasNext()) { Sentence sentence = (Sentence) sentenceIter.next(); - Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); - - // The queue where newly generated events are put - LinkedBlockingQueue newEvents = new LinkedBlockingQueue(); - // protein List sentenceProteins = JCasUtil.selectCovered(jcas, Protein.class, sentence); + if (sentenceProteins.size() <= 0) { + continue; + } + + Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + // The queue where newly generated events are put + LinkedBlockingQueue newEvents = new LinkedBlockingQueue(); + // // trigger detection // @@ -157,9 +163,9 @@ public String extractFromSingleFile(File file) { triggers.put(sentence.getId(), new ArrayList()); } - triggerDetectionLoop: for (Token token : tokens) { + for (Token token : tokens) { - if (!POS.isPos(token.getPos())) { + /*if (!POS.isPos(token.getPos())) { continue triggerDetectionLoop; } for (Protein protein : sentenceProteins) { @@ -169,26 +175,28 @@ public String extractFromSingleFile(File file) { .getEnd() <= protein.getEnd())) { continue triggerDetectionLoop; } - } + }*/ Instance tokenInstance = tokenToInstance(jcas, token, null, tokens, sentenceProteins, pairsOfSentence, - dependencyExtractor); + dependencyExtractor, word2vec); // set the token filter here // if (!TriggerRecogniser.isConsidered(tokenInstance // .getFeaturesString().get(2))) { // continue; // } - int prediction = triggerRecogniser.predict(triggerDict - .instanceToNumeric(tokenInstance)); - - if (prediction != triggerDict.getLabelNumeric(String - .valueOf(EventType.Non_trigger))) { - - Trigger trigger = new Trigger(jcas, token.getBegin(), - token.getEnd()); - trigger.setEventType(triggerDict.getLabelString(prediction)); - trigger.setId("T".concat(String.valueOf(++proteinNum))); - triggers.get(sentence.getId()).add(trigger); + if (tokenInstance != null) { + int prediction = triggerRecogniser.predict(triggerDict + .instanceToNumeric(tokenInstance)); + + if (prediction != triggerDict.getLabelNumeric(String + .valueOf(EventType.Non_trigger))) { + + Trigger trigger = new Trigger(jcas, token.getBegin(), + token.getEnd()); + 
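+						// Each predicted trigger gets a fresh "T" id from ++proteinNum,
+						// so trigger ids stay unique alongside the protein annotations
+						// in the same document.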
trigger.setEventType(triggerDict.getLabelString(prediction)); + trigger.setId("T".concat(String.valueOf(++proteinNum))); + triggers.get(sentence.getId()).add(trigger); + } } } @@ -197,7 +205,9 @@ public String extractFromSingleFile(File file) { // // 1. iterate through all proteins - + if (null == triggers.get(sentence.getId())) { + continue; + } if (null == events.get(sentence.getId())) { events.put(sentence.getId(), new ArrayList()); } @@ -213,7 +223,7 @@ public String extractFromSingleFile(File file) { dependencyExtractor, false); double prediction = themeRecogniser.predict(themeDict .instanceToNumeric(proteinInstance) - .getFeaturesNumeric()); + .getFeaturesNumeric(), proteinInstance); // if (trigger.getEventType().equals( // String.valueOf(EventType.Localization)) @@ -264,7 +274,7 @@ public String extractFromSingleFile(File file) { dependencyExtractor, false); double prediction = themeRecogniser.predict(themeDict .instanceToNumeric(proteinInstance) - .getFeaturesNumeric()); + .getFeaturesNumeric(), proteinInstance); if (prediction == themeDict.getLabelNumeric("Theme")) { themes.add(protein); @@ -273,8 +283,6 @@ public String extractFromSingleFile(File file) { } if (themes.size() > 0) { - Event event = new Event(jcas); - event.setTrigger(trigger); // event.setId(String.valueOf(eventIndex++)); List> predictedThemesComb = new ArrayList>(); @@ -285,11 +293,14 @@ public String extractFromSingleFile(File file) { for (List candidateThemes : combs .getCombinations()) { + if (candidateThemes.size() > 3) { + continue; + } Instance bindingInstance = bindingDict .instanceToNumeric(bindingEventToInstance( - jcas, sentence, event, + jcas, sentence, trigger, candidateThemes, - dependencyExtractor)); + dependencyExtractor, false)); if (bindingRecogniser.predict(bindingInstance) == bindingDict .getLabelNumeric("Binding")) { predictedThemesComb.add(candidateThemes); @@ -358,7 +369,7 @@ public String extractFromSingleFile(File file) { false)); double prediction = themeRecogniser - .predict(proteinInstance.getFeaturesNumeric()); + .predict(proteinInstance.getFeaturesNumeric(), proteinInstance); if (prediction == themeDict.getLabelNumeric("Theme")) { @@ -399,7 +410,7 @@ public String extractFromSingleFile(File file) { double prediction = themeRecogniser.predict(themeDict .instanceToNumeric(triggerTokenInstance) - .getFeaturesNumeric()); + .getFeaturesNumeric(), triggerTokenInstance); if (prediction == themeDict.getLabelNumeric("Theme")) { @@ -436,7 +447,7 @@ public String extractFromSingleFile(File file) { getThemeToken(jcas, event, sentence)); double prediction = causeRecogniser.predict(causeDict .instanceToNumeric(proteinInstance) - .getFeaturesNumeric()); + .getFeaturesNumeric(), proteinInstance); if (prediction == causeDict.getLabelNumeric(String .valueOf("Cause"))) { @@ -464,7 +475,7 @@ public String extractFromSingleFile(File file) { dependencyExtractor, false, themeToken); double prediction = causeRecogniser.predict(causeDict .instanceToNumeric(causeEventInstance) - .getFeaturesNumeric()); + .getFeaturesNumeric(), causeEventInstance); // if (event // .getTrigger() // .getEventType() diff --git a/src/info/chenli/litway/bionlp13/ge/EventExtractorBind2.java b/src/info/chenli/litway/bionlp13/ge/EventExtractorBind2.java new file mode 100644 index 0000000..1b23d9d --- /dev/null +++ b/src/info/chenli/litway/bionlp13/ge/EventExtractorBind2.java @@ -0,0 +1,2295 @@ +package info.chenli.litway.bionlp13.ge; + +import info.chenli.classifier.Instance; +import info.chenli.classifier.InstanceDictionary; +import 
info.chenli.litway.corpora.Event; +import info.chenli.litway.corpora.Protein; +import info.chenli.litway.corpora.Sentence; +import info.chenli.litway.corpora.Token; +import info.chenli.litway.corpora.Trigger; +import info.chenli.litway.util.Combinations; +import info.chenli.litway.util.DependencyExtractor; +import info.chenli.litway.util.FileFilterImpl; +import info.chenli.litway.util.FileUtil; +import info.chenli.litway.util.StanfordDependencyReader; +import info.chenli.litway.util.UimaUtil; +import info.chenli.litway.util.StanfordDependencyReader.Pair; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.logging.Logger; + +import libsvm.svm; +import libsvm.svm_model; +import libsvm.svm_node; + +import org.apache.uima.cas.FSIterator; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.cas.StringArray; +import org.apache.uima.jcas.tcas.Annotation; +import org.uimafit.util.JCasUtil; + +import de.bwaldvogel.liblinear.Feature; +import de.bwaldvogel.liblinear.FeatureNode; + + +public class EventExtractorBind2 extends TokenInstances { + + private final static Logger logger = Logger.getLogger(EventExtractor.class + .getName()); + + private String classifierName = "liblinear"; + + public void train(File dir) { + + if (!dir.isDirectory()) { + logger.info(dir.getAbsolutePath().concat(" is not a directory.")); + } + + // + // train trigger + // + + // + // train theme + // + + // + // train cause + // + } + + public void extract(File file) throws IOException { + + int[] perform = {0, 0, 0, 0};//tp, tn, fp, fn + + if (file.isDirectory()) { + + for (File f : file.listFiles(new FileFilterImpl(".txt"))) { + + /*perform = extractFromSingleFile(f, perform); + + for(int i:perform) { + System.out.println(i); + }*/ + + extract(f); + } + + + + } else if (file.isFile()) { + + + logger.info("Extracting from ".concat(file.getName())); + String newFileName = "./result/".concat( + file.getName() + .substring(0, file.getName().lastIndexOf("."))) + .concat(".a2"); + FileUtil.saveFile(extractFromSingleFile(file, perform), + new File(newFileName)); + logger.info("Result saved in ".concat(newFileName)); + } + } + + /** + * Extract events from the given file. + * + * @param file + * @throws IOException + */ + public String extractFromSingleFile(File file, int[] perform) throws IOException { + + boolean test = true; + + File word2vecFile = new File("./word2vec/word2vec100"); + //File word2vecFile = new File("/home/songrq/word2vec/data/word2vec100"); + + Map word2vec = ReadWord2vec.word2vec(word2vecFile); + + Map> triggers = new TreeMap>(); + Map> events = new TreeMap>(); + // Initialize the file + JCas jcas = this.processSingleFile(file); + int proteinNum = 0; + FSIterator proteinIter = jcas.getAnnotationIndex( + Protein.type).iterator(); + while(proteinIter.hasNext()) { + Protein protein = (Protein) proteinIter.next(); + String s = protein.getId().replace('T', '0'); + proteinNum = proteinNum < Integer.valueOf(s) ? 
Integer.valueOf(s) : proteinNum; + } + Map> pairsOfArticle = new HashMap>(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + } else { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sd"))); + } + + + + // + // Initialize the classifiers + // + + // trigger + TriggerRecogniser triggerRecogniser = new TriggerRecogniser(); + InstanceDictionary triggerDict = new InstanceDictionary(); + triggerRecogniser.loadModel(new File("./model/triggers.".concat( + classifierName).concat(".model"))); + //String triggerModel = "./model/triggers.model"; + triggerDict.loadDictionary(new File("./model/triggers.".concat( + classifierName).concat(".dict"))); + if (test) { + triggerRecogniser.loadModel(new File("./model/triggers.train.devel.".concat( + classifierName).concat(".model"))); + triggerDict.loadDictionary(new File("./model/triggers.train.devel.".concat( + classifierName).concat(".dict"))); + + }else { + triggerRecogniser.loadModel(new File("./model/triggers.".concat( + classifierName).concat(".model"))); + triggerDict.loadDictionary(new File("./model/triggers.".concat( + classifierName).concat(".dict"))); + } + // argument + ArgumentRecogniser argumentRecogniser = new ArgumentRecogniser(); + InstanceDictionary argumentDict = new InstanceDictionary(); + if (test) { + argumentRecogniser.loadModel(new File("./model/arguments.train.devel.".concat( + classifierName).concat(".model"))); + argumentDict.loadDictionary(new File("./model/arguments.train.devel.".concat( + classifierName).concat(".dict"))); + }else { + argumentRecogniser.loadModel(new File("./model/arguments.".concat( + classifierName).concat(".model"))); + argumentDict.loadDictionary(new File("./model/arguments.".concat( + classifierName).concat(".dict"))); + } + /*ArgumentRecogniser reguArgumentRecogniser = new ArgumentRecogniser(); + reguArgumentRecogniser.loadModel(new File("./model/reguArguments.".concat( + classifierName).concat(".model"))); + InstanceDictionary reguArgumentDict = new InstanceDictionary(); + reguArgumentDict.loadDictionary(new File("./model/reguArguments.".concat( + classifierName).concat(".dict")));*/ + /*TriggerArgumentRecogniser triggerArgumentRecogniser = new TriggerArgumentRecogniser(); + triggerArgumentRecogniser.loadModel(new File("./model/triggerArguments.".concat( + classifierName).concat(".model"))); + InstanceDictionary triggerArgumentDict = new InstanceDictionary(); + triggerArgumentDict.loadDictionary(new File("./model/triggerArguments.".concat( + classifierName).concat(".dict")));*/ + + // binding + BindingRecogniser bindingRecogniser = new BindingRecogniser(); + InstanceDictionary bindingDict = new InstanceDictionary(); + if (test) { + bindingRecogniser.loadModel(new File("./model/bindings.train.devel.".concat( + classifierName).concat(".model"))); + bindingDict.loadDictionary(new File("./model/bindings.train.devel.".concat( + classifierName).concat(".dict"))); + }else { + bindingRecogniser.loadModel(new File("./model/bindings.".concat( + classifierName).concat(".model"))); + bindingDict.loadDictionary(new File("./model/bindings.".concat( + classifierName).concat(".dict"))); + } + + + // Initialize the iterator and counter + FSIterator sentenceIter = jcas.getAnnotationIndex( + 
Sentence.type).iterator(); + int eventIndex = 1; + + while (sentenceIter.hasNext()) { + + Sentence sentence = (Sentence) sentenceIter.next(); + Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + + // protein + List sentenceProteins = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + + if (sentenceProteins.size() <= 0) { + continue; + } + // + // trigger detection + // + List tokens = JCasUtil.selectCovered(jcas, Token.class, + sentence); + + DependencyExtractor dependencyExtractor = new DependencyExtractor( + tokens, pairsOfSentence); + + if (null == triggers.get(sentence.getId())) { + triggers.put(sentence.getId(), new ArrayList()); + } + + Map triggerId = new HashMap(); + + for (Token token : tokens) { + if(isProtein(token, sentenceProteins)) { + continue; + } + + int tokenBegin = token.getBegin(); + int tokenEnd = token.getEnd(); + token = containsProtein(token, sentenceProteins); + /*if (shouldDelete(jcas, token, sentenceProteins)) { + continue; + }*/ + + Instance tokenInstance = tokenToInstance(jcas, token, null, + tokens, sentenceProteins, pairsOfSentence, + dependencyExtractor, word2vec); + if (tokenInstance != null) { + tokenInstance = triggerDict.instanceToNumeric(tokenInstance); + int prediction = triggerRecogniser.predict(tokenInstance); + //int[] featureSparseVector = tokenInstance.getFeaturesNumeric(); + + //int prediction = this.predict2(featureSparseVector, tokenInstance, triggerModel); + + /*String temp = shouldChange(jcas, token, sentenceProteins); + if (!temp.equals("Non_trigger")) { + prediction = triggerDict.getLabelNumeric(temp); + }*/ + if (prediction != triggerDict.getLabelNumeric(String + .valueOf(EventType.Non_trigger))) { + + Trigger trigger = new Trigger(jcas, token.getBegin(), + token.getEnd()); + trigger.setEventType(triggerDict.getLabelString(prediction)); + trigger.setId("T".concat(String.valueOf(++proteinNum))); + triggers.get(sentence.getId()).add(trigger); + triggerId.put(trigger.getId(), trigger); + } + } + + token.setBegin(tokenBegin); + token.setEnd(tokenEnd); + } + + // + // argument assignment + // + + // 1. 
iterate through all proteins + if (null == events.get(sentence.getId())) { + events.put(sentence.getId(), new LinkedList()); + } + Set sameToken = new HashSet(); + + Map> eventArg = new HashMap>(); + Map> triggerEvents = new HashMap>(); + Map> triggerCauses = new HashMap>(); + + Set andProtein = getAndProtein(jcas, sentenceProteins, dependencyExtractor); + + for (Trigger trigger : triggers.get(sentence.getId())) { + for (Protein protein : sentenceProteins) { + Token triggerToken = getTriggerToken(jcas, trigger); + Token proteinToken = getToken(jcas, protein); + boolean areSameTokens = (proteinToken.getId() == triggerToken.getId()); + if (areSameTokens) { + sameToken.add(protein.getId()); + } + } + } + for (Trigger trigger : triggers.get(sentence.getId())) { + Set triggerEvent = new HashSet(); + Set triggerCause = new HashSet(); + + if (EventType.isSimpleEvent(trigger.getEventType())) { + for (Protein protein : sentenceProteins) { + Token triggerToken = getTriggerToken(jcas, trigger); + Token proteinToken = getToken(jcas, protein); + boolean areSameTokens = (proteinToken.getId() == triggerToken.getId()); + Instance proteinInstance = argumentToInstance(jcas, + sentence, protein, trigger, pairsOfSentence, + dependencyExtractor, false, false, Stage.THEME); + if ( proteinInstance != null) { + double prediction = argumentRecogniser.predict(argumentDict + .instanceToNumeric(proteinInstance) + .getFeaturesNumeric(), proteinInstance); + if (areSameTokens) { + prediction = argumentDict.getLabelNumeric("Theme"); + } + if (prediction == argumentDict.getLabelNumeric("Theme")) { + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(trigger); + StringArray themes = new StringArray(jcas, 1); + themes.set(0, protein.getId()); + event.setThemes(themes); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + + Argument arg = new Argument(); + arg.setId(protein.getId()); + arg.setRelation("Theme"); + List args = new LinkedList(); + args.add(arg); + eventArg.put("E".concat(event.getId()), args); + } + } + } + } else if (EventType.isBindingEvent(trigger.getEventType())) { + List> predictedThemesComb = new ArrayList>(); + Set farProtein = getFarProtein( jcas, trigger, sentenceProteins, dependencyExtractor); + Set notTheme = getNotProtein( jcas, trigger, sentenceProteins, dependencyExtractor); + for (Protein protein : sentenceProteins) { + Token triggerToken = getTriggerToken(jcas, trigger); + Token proteinToken = getToken(jcas, protein); + boolean areSameTokens = (proteinToken.getId() == triggerToken.getId()); + + Instance proteinInstance = argumentToInstance(jcas, + sentence, protein, trigger, pairsOfSentence, + dependencyExtractor, false, false, Stage.THEME); + if ( proteinInstance != null) { + double prediction = argumentRecogniser.predict(argumentDict + .instanceToNumeric(proteinInstance) + .getFeaturesNumeric(), proteinInstance); + if (areSameTokens) { + prediction = argumentDict.getLabelNumeric("Theme"); + } + if (prediction != argumentDict.getLabelNumeric("Theme")) { + notTheme.add(protein.getId()); + } + } + } + List proteins = new LinkedList(); + for (Protein protein : sentenceProteins) { + if (!notTheme.contains(protein.getId()) && !farProtein.contains(protein.getId())) { + proteins.add(protein); + } + } + if (proteins.size() == 1) { + /*Instance bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, proteins, dependencyExtractor, false); + double prediction = bindingRecogniser.predict(bindingDict + 
.instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + Token triggerToken = getTriggerToken(jcas, trigger); + Token proteinToken = getToken(jcas, proteins.get(0)); + boolean areSameTokens = (proteinToken.getId() == triggerToken.getId()); + if (areSameTokens) { + prediction = bindingDict.getLabelNumeric("Binding"); + } + if (prediction == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(proteins); + //} + } else if (proteins.size() > 1) { + Combinations combs = new Combinations( + proteins); + loop : for (List themes : combs.getCombinations()) { + boolean truepositive = false; + if (themes.size() != 2) { + continue; + } + /*for (Protein p : themes) { + if (farProtein.contains(p.getId())) { + continue loop; + } + if (notTheme.contains(p.getId())) { + continue loop; + } + }*/ + int num = 0; + for (Protein p : themes) { + if (andProtein.contains(p.getId())) { + num++; + if (num > 1) { + List theme = new LinkedList(); + theme.add(themes.get(0)); + /*Instance bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, theme, dependencyExtractor, truepositive); + double prediction = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + if (prediction == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(theme); + //} + + theme.remove(0); + theme.add(themes.get(1)); + /*bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, theme, dependencyExtractor, truepositive); + prediction = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + if (prediction == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(theme); + //} + + continue loop; + } + } + } + for(Protein p : themes) { + for(Protein p2 : themes) { + if (p.getId().equalsIgnoreCase(p2.getId())) { + continue; + } + if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText()) + || p.getCoveredText().contains(p2.getCoveredText()) + || p2.getCoveredText().contains(p.getCoveredText())) { + List theme = new LinkedList(); + theme.add(themes.get(0)); + /*Instance bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, theme, dependencyExtractor, truepositive); + double prediction = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + if (prediction == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(theme); + //} + + theme.remove(0); + theme.add(themes.get(1)); + /*bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, theme, dependencyExtractor, truepositive); + prediction = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + if (prediction == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(theme); + //} + continue loop; + } + } + } + Instance instance = bindingEventToInstance(jcas, + sentence, trigger, themes, dependencyExtractor, truepositive); + double prediction = bindingRecogniser.predict(bindingDict + .instanceToNumeric(instance) + .getFeaturesNumeric(), instance); + if (prediction == bindingDict + .getLabelNumeric("Binding")) { + List theme0 = new LinkedList(); + theme0.add(themes.get(0)); + List theme1 = new LinkedList(); + theme1.add(themes.get(1)); + Instance bindInstance0 = bindingEventToInstance(jcas, + sentence, trigger, theme0, dependencyExtractor, truepositive); + Instance bindInstance1 = bindingEventToInstance(jcas, + sentence, trigger, theme1, dependencyExtractor, 
truepositive); + + double prediction0 = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindInstance0) + .getFeaturesNumeric(), bindInstance0); + double prediction1 = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindInstance1) + .getFeaturesNumeric(), bindInstance1); + double predictionValue1 = bindingRecogniser.predict_values(bindingDict + .instanceToNumeric(bindingEventToInstance(jcas, + sentence, trigger, theme1, dependencyExtractor, truepositive)) + .getFeaturesNumeric()); + double predictionValue0 = bindingRecogniser.predict_values(bindingDict + .instanceToNumeric(bindingEventToInstance(jcas, + sentence, trigger, theme0, dependencyExtractor, truepositive)) + .getFeaturesNumeric()); + + if (prediction0 == bindingDict.getLabelNumeric("Binding") + && prediction1 == bindingDict.getLabelNumeric("Binding")) { + prediction = bindingRecogniser.predict_values(bindingDict + .instanceToNumeric(bindingEventToInstance(jcas, + sentence, trigger, themes, dependencyExtractor, truepositive)) + .getFeaturesNumeric()); + if (predictionValue1 > prediction && predictionValue0 > prediction) { + predictedThemesComb.add(theme0); + predictedThemesComb.add(theme1); + } /*else if (prediction0 < prediction && prediction1 < prediction) { + predictedThemesComb.add(themes); + }*/ else { + predictedThemesComb.add(themes); + } + } else { + predictedThemesComb.add(themes); + } + } else { + List theme = new LinkedList(); + theme.add(themes.get(0)); + /*Instance bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, theme, dependencyExtractor, truepositive); + double prediction0 = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + if (prediction0 == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(theme); + //} + + theme.remove(0); + theme.add(themes.get(1)); + /*bindingInstance = bindingEventToInstance(jcas, + sentence, trigger, theme, dependencyExtractor, truepositive); + prediction0 = bindingRecogniser.predict(bindingDict + .instanceToNumeric(bindingInstance) + .getFeaturesNumeric()); + if (prediction0 == bindingDict + .getLabelNumeric("Binding")) {*/ + predictedThemesComb.add(theme); + //} + } + } + } + // clean the themes which are fully covered by another + List> checkedThemesComb = new ArrayList>(); + checkingTheme: for (List beingCheckedThemes : predictedThemesComb) { + if (checkedThemesComb.contains(beingCheckedThemes)) { + continue; + } else { + + List> copy = new ArrayList>( + checkedThemesComb); + for (List checkedThemes : copy) { + + if (checkedThemes + .containsAll(beingCheckedThemes)) { + continue checkingTheme; + } else if (beingCheckedThemes + .containsAll(checkedThemes)) { + checkedThemesComb.remove(checkedThemes); + } + } + checkedThemesComb.add(beingCheckedThemes); + } + } + + for (List predictedThemes : checkedThemesComb) { + if (checkedThemesComb.size() > andProtein.size() + 2 + && predictedThemes.size() == 2 + && checkedThemesComb.size() >= 10) { + break; + } + Event newBindingEvent = new Event(jcas); + newBindingEvent.setTrigger(trigger); + newBindingEvent.setId(String.valueOf(eventIndex++)); + StringArray eventThemes = new StringArray(jcas, + predictedThemes.size()); + List args = new LinkedList(); + for (Protein theme : predictedThemes) { + eventThemes.set(predictedThemes.indexOf(theme), + theme.getId()); + Argument arg = new Argument(); + arg.setId(theme.getId()); + arg.setRelation("Theme"); + args.add(arg); + } + eventArg.put("E".concat(newBindingEvent.getId()), args); + + 
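+						// Each surviving theme combination becomes one Binding event:
+						// the protein ids are written into a StringArray on the Event,
+						// and a parallel Argument list is recorded in eventArg for the
+						// later pruning passes.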
newBindingEvent.setThemes(eventThemes); + events.get(sentence.getId()).add(newBindingEvent); + triggerEvent.add(newBindingEvent); + } + } else if (EventType.isComplexEvent(trigger.getEventType()) + && !EventType.isRegulatoryEvent(trigger.getEventType())) { + + for (Protein protein : sentenceProteins) { + Token triggerToken = getTriggerToken(jcas, trigger); + Token proteinToken = getToken(jcas, protein); + boolean areSameTokens = (proteinToken.getId() == triggerToken.getId()); + + Instance proteinInstance = argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, protein, trigger, + pairsOfSentence, dependencyExtractor, + false, false, Stage.THEME)); + if ( proteinInstance != null) { + double prediction = argumentRecogniser + .predict(proteinInstance.getFeaturesNumeric(), proteinInstance); + if (areSameTokens) { + prediction = argumentDict.getLabelNumeric("Theme"); + } + if (prediction == argumentDict.getLabelNumeric("Theme")) { + + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(trigger); + StringArray themes = new StringArray(jcas, 1); + themes.set(0, protein.getId()); + event.setThemes(themes); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + + Argument arg = new Argument(); + arg.setId(protein.getId()); + arg.setRelation("Theme"); + List args = new LinkedList(); + args.add(arg); + eventArg.put("E".concat(event.getId()), args); + }else if(prediction == argumentDict.getLabelNumeric("Cause")) { + if (sameToken.contains(protein.getId())) { + continue; + } + triggerCause.add(protein.getId()); + } + } + } + } else if (EventType.isRegulatoryEvent(trigger.getEventType())) { + + for (Protein protein : sentenceProteins) { + Token triggerToken = getTriggerToken(jcas, trigger); + Token proteinToken = getToken(jcas, protein); + boolean areSameTokens = (proteinToken.getId() == triggerToken.getId()); + if (!areSameTokens && sameToken.contains(protein.getId())) { + continue; + } + + Instance proteinInstance = argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, protein, trigger, + pairsOfSentence, dependencyExtractor, + false, false, Stage.THEME)); + if ( proteinInstance != null) { + double prediction = argumentRecogniser + .predict(proteinInstance.getFeaturesNumeric(), proteinInstance); + + if (prediction == argumentDict.getLabelNumeric("Theme")) { + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(trigger); + StringArray themes = new StringArray(jcas, 1); + themes.set(0, protein.getId()); + event.setThemes(themes); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + + Argument arg = new Argument(); + arg.setId(protein.getId()); + arg.setRelation("Theme"); + List args = new LinkedList(); + args.add(arg); + eventArg.put("E".concat(event.getId()), args); + + }else if(prediction == argumentDict.getLabelNumeric("Cause")) { + triggerCause.add(protein.getId()); + } + } + } + } + triggerEvents.put(trigger.getId(), triggerEvent); + triggerCauses.put(trigger.getId(), triggerCause); + } + + + // 2. 
check all discovered events whether they can be arguments + Map> arguments = new TreeMap>(); + for (Trigger argumentTrigger : triggers.get(sentence.getId())) { + loop : for (Trigger trigger : triggers.get(sentence.getId())) { + if (!EventType.isComplexEvent(trigger.getEventType())) { + continue; + } + + if (argumentTrigger.getBegin() == trigger.getBegin()) { + continue; + } + + Instance triggerTokenInstance = argumentToInstance(jcas, + sentence, argumentTrigger, trigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME); + + double prediction = argumentRecogniser.predict(argumentDict + .instanceToNumeric(triggerTokenInstance) + .getFeaturesNumeric(), triggerTokenInstance); + + if (prediction == argumentDict.getLabelNumeric("Non_Argument")) { + continue; + } + + if (arguments.containsKey(argumentTrigger.getId())) { + Set removeArg = new HashSet(arguments.get(argumentTrigger.getId())); + for (Argument arg : arguments.get(argumentTrigger.getId())) { + if (arg.getId().equals(trigger.getId())) { + double prediction0 = argumentRecogniser.predict_values(argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, argumentTrigger, trigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME)) + .getFeaturesNumeric()); + double prediction1 = argumentRecogniser.predict_values(argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, trigger, argumentTrigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME)) + .getFeaturesNumeric()); + if (prediction0 < prediction1) { + continue loop; + }else { + removeArg.remove(arg); + } + } + } + arguments.put(argumentTrigger.getId(), removeArg); + } + + Set tris = arguments.keySet(); + if (prediction == argumentDict.getLabelNumeric("Cause")) { + Set args = new HashSet(); + if (tris.contains(trigger.getId())) { + args = arguments.get(trigger.getId()); + } + + Argument arg = new Argument(); + arg.setId(argumentTrigger.getId()); + arg.setRelation("Cause"); + args.add(arg); + arguments.put(trigger.getId(), args) ; + }else if (prediction == argumentDict.getLabelNumeric("Theme")) { + + if (EventType.isRegulatoryEvent(trigger.getEventType())) { + + Set args = new HashSet(); + if (tris.contains(trigger.getId())) { + args = arguments.get(trigger.getId()); + } + + Argument arg = new Argument(); + arg.setId(argumentTrigger.getId()); + arg.setRelation("Theme"); + args.add(arg); + arguments.put(trigger.getId(), args) ; + } + } + } + } + // + //new event + //theme + Map> newtriggerEvents = new HashMap>(); + Set tIds = arguments.keySet(); + for (String tId : tIds) { + Set args = arguments.get(tId); + Set triggerEvent = new HashSet(); + for (Argument arg : args) { + if (arg.getRelation().equalsIgnoreCase("Theme") + && triggerEvents.containsKey(arg.getId())) { + Set evens = triggerEvents.get(arg.getId()); + for (Event eve : evens) { + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(triggerId.get(tId)); + StringArray themes = new StringArray(jcas, 1); + themes.set(0, "E".concat(eve.getId())); + event.setThemes(themes); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + newtriggerEvents.put(tId, triggerEvent); + + Argument argu = new Argument(); + argu.setId("E".concat(eve.getId())); + argu.setRelation("Theme"); + List argus = new LinkedList(); + argus.add(argu); + eventArg.put("E".concat(event.getId()), argus); + + } + } + } + } + Map> newtriggerEvents2 = new HashMap>(); + for (String tId : tIds) { + Set args = arguments.get(tId); + Set 
triggerEvent = new HashSet(); + for (Argument arg : args) { + if (arg.getRelation().equalsIgnoreCase("Theme") + && newtriggerEvents.containsKey(arg.getId())) { + Set evens = newtriggerEvents.get(arg.getId()); + for (Event eve : evens) { + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(triggerId.get(tId)); + StringArray themes = new StringArray(jcas, 1); + themes.set(0, "E".concat(eve.getId())); + event.setThemes(themes); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + newtriggerEvents2.put(tId, triggerEvent); + + Argument argu = new Argument(); + argu.setId("E".concat(eve.getId())); + argu.setRelation("Theme"); + List argus = new LinkedList(); + argus.add(argu); + eventArg.put("E".concat(event.getId()), argus); + + } + } + } + } + //cause + Map> equal = new HashMap>(); + Map> newtriggerEvents3 = new HashMap>(); + for (String tId : tIds) { + Set args = arguments.get(tId); + Set triggerEvent = new HashSet(); + for (Argument arg : args) { + if (arg.getRelation().equalsIgnoreCase("Cause") + && triggerEvents.containsKey(arg.getId())) { + Set evens = triggerEvents.get(tId);//trigger event + if (newtriggerEvents.containsKey(tId)) { + evens.addAll(newtriggerEvents.get(tId)); + } + if (newtriggerEvents2.containsKey(tId)) { + evens.addAll(newtriggerEvents2.get(tId)); + } + Set causeEvens = triggerEvents.get(arg.getId());//cause event + for (Event eve : evens) { + for (Event causeEve : causeEvens) { + if (eve.getCause() != null) { + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(eve.getTrigger()); + event.setThemes(eve.getThemes()); + event.setCause("E".concat(causeEve.getId())); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + newtriggerEvents3.put(tId, triggerEvent); + + List argus = new LinkedList(); + Argument argu = new Argument(); + argu.setId("E".concat(causeEve.getId())); + argu.setRelation("Cause"); + argus.add(argu); + argu.setId(eve.getThemes(0)); + argu.setRelation("Theme"); + argus.add(argu); + eventArg.put("E".concat(event.getId()), argus); + + Set ss = new HashSet(); + if (equal.containsKey("E".concat(eve.getId()))) { + ss = equal.get("E".concat(eve.getId())); + } + ss.add("E".concat(event.getId())); + equal.put("E".concat(eve.getId()), ss); + }else { + + List argus = eventArg.get("E".concat(eve.getId())); + Argument argu = new Argument(); + argu.setId("E".concat(causeEve.getId())); + argu.setRelation("Cause"); + argus.add(argu); + eventArg.put("E".concat(eve.getId()), argus); + + eve.setCause("E".concat(causeEve.getId())); + } + } + } + } + } + } + for (String tId : tIds) { + Set args = arguments.get(tId); + Set triggerEvent = new HashSet(); + for (Argument arg : args) { + if (arg.getRelation().equalsIgnoreCase("Cause") + && newtriggerEvents.containsKey(arg.getId())) { + Set evens = triggerEvents.get(tId);//trigger event + if (newtriggerEvents.containsKey(tId)) { + evens.addAll(newtriggerEvents.get(tId)); + } + if (newtriggerEvents2.containsKey(tId)) { + evens.addAll(newtriggerEvents2.get(tId)); + } + Set causeEvens = newtriggerEvents.get(arg.getId());//new cause event + for (Event eve : evens) { + for (Event causeEve : causeEvens) { + if (eve.getCause() != null) { + Event event = new Event(jcas); + event.setId(String.valueOf(eventIndex++)); + event.setTrigger(eve.getTrigger()); + event.setThemes(eve.getThemes()); + event.setCause("E".concat(causeEve.getId())); + events.get(sentence.getId()).add(event); + triggerEvent.add(event); + 
newtriggerEvents3.put(tId, triggerEvent); + + List argus = new LinkedList(); + Argument argu = new Argument(); + argu.setId("E".concat(causeEve.getId())); + argu.setRelation("Cause"); + argus.add(argu); + argu.setId(eve.getThemes(0)); + argu.setRelation("Theme"); + argus.add(argu); + eventArg.put("E".concat(event.getId()), argus); + + Set ss = new HashSet(); + if (equal.containsKey("E".concat(eve.getId()))) { + ss = equal.get("E".concat(eve.getId())); + } + ss.add("E".concat(event.getId())); + equal.put("E".concat(eve.getId()), ss); + }else { + + List argus = eventArg.get("E".concat(eve.getId())); + Argument argu = new Argument(); + argu.setId("E".concat(causeEve.getId())); + argu.setRelation("Cause"); + argus.add(argu); + eventArg.put("E".concat(eve.getId()), argus); + + eve.setCause("E".concat(causeEve.getId())); + } + } + } + } + } + } + + // protein cause + for (Trigger trigger : triggers.get(sentence.getId())) { + if (!EventType + .isComplexEvent(trigger.getEventType())) { + continue; + } + Set triggerEvent2 = new HashSet(); + + Set triggerEvent = triggerEvents.get(trigger.getId()); + if (newtriggerEvents.containsKey(trigger.getId())) { + triggerEvent.addAll(newtriggerEvents.get(trigger.getId())); + } + if (newtriggerEvents2.containsKey(trigger.getId())) { + triggerEvent.addAll(newtriggerEvents2.get(trigger.getId())); + } + + for (Event event : triggerEvent) { + if (triggerCauses.containsKey(trigger.getId())) { + Set triggerCause = triggerCauses.get(trigger.getId()); + for (String proCause : triggerCause) { + if (event.getCause() != null) { + Event event2 = new Event(jcas); + event2.setId(String.valueOf(eventIndex++)); + event2.setTrigger(event.getTrigger()); + event2.setThemes(event.getThemes()); + event2.setCause(proCause); + events.get(sentence.getId()).add(event2); + triggerEvent2.add(event2); + newtriggerEvents3.put(trigger.getId(), triggerEvent2); + + List argus = new LinkedList(); + Argument argu = new Argument(); + argu.setId(proCause); + argu.setRelation("Cause"); + argus.add(argu); + argu.setId(event.getThemes(0)); + argu.setRelation("Theme"); + argus.add(argu); + eventArg.put("E".concat(event2.getId()), argus); + + Set ss = new HashSet(); + if (equal.containsKey("E".concat(event.getId()))) { + ss = equal.get("E".concat(event.getId())); + } + ss.add("E".concat(event2.getId())); + equal.put("E".concat(event.getId()), ss); + }else { + + List argus = eventArg.get("E".concat(event.getId())); + Argument argu = new Argument(); + argu.setId(proCause); + argu.setRelation("Cause"); + argus.add(argu); + eventArg.put("E".concat(event.getId()), argus); + + event.setCause(proCause); + } + } + } + } + } + //only event cause is different + for (Trigger trigger : triggers.get(sentence.getId())) { + + if (!EventType.isComplexEvent(trigger.getEventType())) { + continue; + } + Set triggerEvent2 = new HashSet(); + Set triggerEvent = triggerEvents.get(trigger.getId()); + if (newtriggerEvents.containsKey(trigger.getId())) { + triggerEvent.addAll(newtriggerEvents.get(trigger.getId())); + } + if (newtriggerEvents2.containsKey(trigger.getId())) { + triggerEvent.addAll(newtriggerEvents2.get(trigger.getId())); + } + for (Event event : triggerEvent) { + if (equal.containsKey(event.getCause())) { + Set ss = equal.get(event.getCause()); + for (String s : ss) { + Event event2 = new Event(jcas); + event2.setId(String.valueOf(eventIndex++)); + event2.setTrigger(event.getTrigger()); + event2.setThemes(event.getThemes()); + event2.setCause(s); + events.get(sentence.getId()).add(event2); + 
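+								// event2 clones the original event but re-points its Cause
+								// at an equivalent event id recorded in the equal map, so
+								// each alternative realisation of the same cause yields
+								// its own output event.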
+								triggerEvent2.add(event2);
+								newtriggerEvents3.put(trigger.getId(), triggerEvent2);
+
+								List<Argument> argus = new LinkedList<Argument>();
+								Argument argu = new Argument();
+								argu.setId(s);
+								argu.setRelation("Cause");
+								argus.add(argu);
+								// use a fresh Argument here; reusing argu would overwrite
+								// the Cause entry already stored in the list
+								Argument argu2 = new Argument();
+								argu2.setId(event.getThemes(0));
+								argu2.setRelation("Theme");
+								argus.add(argu2);
+								eventArg.put("E".concat(event2.getId()), argus);
+
+							}
+						}
+					}
+					if (!EventType.isRegulatoryEvent(trigger.getEventType())) {
+						continue;
+					}
+
+					for (Event event : triggerEvent) {
+						if (equal.containsKey(event.getThemes(0))) {
+							Set<String> ss = equal.get(event.getThemes(0));
+							for (String s : ss) {
+								Event event2 = new Event(jcas);
+								event2.setId(String.valueOf(eventIndex++));
+								event2.setTrigger(event.getTrigger());
+								StringArray themes = new StringArray(jcas, 1);
+								themes.set(0, s);
+								event2.setThemes(themes);
+								event2.setCause(event.getCause());
+								events.get(sentence.getId()).add(event2);
+
+								List<Argument> argus = new LinkedList<Argument>();
+								Argument argu = new Argument();
+								argu.setId(event.getCause());
+								argu.setRelation("Cause");
+								argus.add(argu);
+								Argument argu2 = new Argument();
+								argu2.setId(s);
+								argu2.setRelation("Theme");
+								argus.add(argu2);
+								eventArg.put("E".concat(event2.getId()), argus);
+
+							}
+						}
+					}
+				}
+				// pruning
+				List<String> removeEvent = new LinkedList<String>();
+				for (Trigger trigger : triggers.get(sentence.getId())) {
+					if (!EventType.isComplexEvent(trigger.getEventType())) {
+						continue;
+					}
+					Set<Event> triggerEvent = triggerEvents.get(trigger.getId());
+					if (newtriggerEvents.containsKey(trigger.getId())) {
+						triggerEvent.addAll(newtriggerEvents.get(trigger.getId()));
+					}
+					if (newtriggerEvents2.containsKey(trigger.getId())) {
+						triggerEvent.addAll(newtriggerEvents2.get(trigger.getId()));
+					}
+					if (newtriggerEvents3.containsKey(trigger.getId())) {
+						triggerEvent.addAll(newtriggerEvents3.get(trigger.getId()));
+					}
+					for (Event event : triggerEvent) {
+						if (null != event.getCause() && event.getCause().contains("E")) {
+							List<Argument> args = eventArg.get(event.getCause());
+							for (Argument arg : args) {
+								if (arg.getId().contains("E")) {
+									List<Argument> args2 = eventArg.get(arg.getId());
+									for (Argument arg2 : args2) {
+										for (Event event2 : triggerEvent) {
+											if (event2.getId().equals(event.getId())) {
+												continue;
+											}
+											List<Argument> args3 = eventArg.get("E".concat(event2.getId()));
+											for (Argument arg3 : args3) {
+												if (arg3.getId().contains("T") && arg3.getId().equals(arg2.getId())) {
+													double prediction0 = 0;
+													double prediction1 = 0;
+													for (Event e : events.get(sentence.getId())) {
+														if (e.getId().equals(event.getCause().replace("E", ""))) {
+															prediction0 = argumentRecogniser.predict_values(argumentDict
+																	.instanceToNumeric(argumentToInstance(jcas,
+																			sentence, e.getTrigger(), trigger,
+																			pairsOfSentence, dependencyExtractor, false, false, Stage.THEME))
+																	.getFeaturesNumeric());
+														}
+													}
+
+													if (EventType.isBindingEvent(event2.getTrigger().getEventType())) {
+														List<Protein>
themes = new LinkedList(); + for (Protein protein : sentenceProteins) { + for (int i=0; i args = eventArg.get(event.getThemes(0)); + for (Argument arg : args) { + if (arg.getId().contains("E")) { + List args2 = eventArg.get(arg.getId()); + for (Argument arg2 : args2) { + for (Event event2 : triggerEvent) { + if (event2.getId().equals(event.getId())) { + continue; + } + List args3 = eventArg.get("E".concat(event2.getId())); + for (Argument arg3 : args3) { + if (arg3.getId().contains("T") && arg3.getId().equals(arg2.getId())) { + double prediction0 = 0; + double prediction1 = 0; + for (Event e : events.get(sentence.getId())) { + if (e.getId().equals(event.getThemes(0).replace("E", ""))) { + prediction0 = argumentRecogniser.predict_values(argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, e.getTrigger(), trigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME)) + .getFeaturesNumeric()); + } + } + + if (EventType.isBindingEvent(event2.getTrigger().getEventType())) { + List themes = new LinkedList(); + for (Protein protein : sentenceProteins) { + for (int i=0; i args4 = eventArg.get(arg3.getId()); + for (Argument arg4 : args4) { + if (arg4.getId().contains("T") && arg4.getId().equals(arg2.getId())) { + double prediction0 = 0; + double prediction1 = 0; + for (Event e : events.get(sentence.getId())) { + if (e.getId().equals(event.getThemes(0).replace("E", ""))) { + prediction0 = argumentRecogniser.predict_values(argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, e.getTrigger(), trigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME)) + .getFeaturesNumeric()); + } + } + + if (EventType.isBindingEvent(event2.getTrigger().getEventType())) { + List themes = new LinkedList(); + for (Protein protein : sentenceProteins) { + for (int i=0; i args2 = eventArg.get("E".concat(event2.getId())); + for (Argument arg2 : args2) { + if (arg2.getId().contains("T") && arg2.getId().equals(arg.getId())) { + double prediction0 = 0; + double prediction1 = 0; + for (Event e : events.get(sentence.getId())) { + if (e.getId().equals(event.getThemes(0).replace("E", ""))) { + prediction0 = argumentRecogniser.predict_values(argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, e.getTrigger(), trigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME)) + .getFeaturesNumeric()); + } + } + + if (EventType.isBindingEvent(event2.getTrigger().getEventType())) { + List themes = new LinkedList(); + for (Protein protein : sentenceProteins) { + for (int i=0; i args4 = eventArg.get(arg2.getId()); + for (Argument arg4 : args4) { + if (arg4.getId().contains("T") && arg4.getId().equals(arg.getId())) { + double prediction0 = 0; + double prediction1 = 0; + for (Event e : events.get(sentence.getId())) { + if (e.getId().equals(event.getThemes(0).replace("E", ""))) { + prediction0 = argumentRecogniser.predict_values(argumentDict + .instanceToNumeric(argumentToInstance(jcas, + sentence, e.getTrigger(), trigger, + pairsOfSentence, dependencyExtractor, false, false, Stage.THEME)) + .getFeaturesNumeric()); + } + } + + if (EventType.isBindingEvent(event2.getTrigger().getEventType())) { + List themes = new LinkedList(); + for (Protein protein : sentenceProteins) { + for (int i=0; i removeEvent2 = new LinkedList(); + for (Event event : events.get(sentence.getId())) { + if (!EventType.isComplexEvent(event.getTrigger().getEventType())) { + continue; + } + if (null != event.getCause() && removeEvent.contains(event.getCause().replace("E", ""))) 
{ + event.setCause(""); + } + if (removeEvent.contains(event.getThemes(0).replace("E", ""))) { + removeEvent2.add(event.getId()); + } + } + for (Event event : events.get(sentence.getId())) { + if (!EventType.isComplexEvent(event.getTrigger().getEventType())) { + continue; + } + if (null != event.getCause() && removeEvent2.contains(event.getCause().replace("E", ""))) { + event.setCause(""); + } + if (removeEvent2.contains(event.getThemes(0).replace("E", ""))) { + removeEvent.add(event.getId()); + } + } + List eventscopy = new LinkedList(); + for (Event event : events.get(sentence.getId())) { + if (!removeEvent.contains(event.getId()) && !removeEvent2.contains(event.getId())) { + eventscopy.add(event); + } + } + events.put(sentence.getId(), eventscopy); + } + return resultToString(triggers, events); + //return perform; + } + + private String resultToString(Map> triggers, + Map> events) { + + StringBuffer sb = new StringBuffer(); + + // print triggers + for (List sentenceTriggers : triggers.values()) { + for (Trigger trigger : sentenceTriggers) { + sb.append(trigger.getId().concat("\t") + .concat(trigger.getEventType()).concat(" ") + .concat(String.valueOf(trigger.getBegin())).concat(" ") + .concat(String.valueOf(trigger.getEnd())).concat("\t") + .concat(trigger.getCoveredText()).concat("\n")); + } + } + + for (List sentenceEvents : events.values()) { + for (Event event : sentenceEvents) { + sb.append("E").append(event.getId()).append("\t") + .append(event.getTrigger().getEventType()).append(":") + .append(event.getTrigger().getId()); + + for (int j = 0; j < event.getThemes().size(); j++) { + sb.append(" Theme").append(j == 0 ? "" : j + 1).append(":") + .append(event.getThemes(j)); + } + + if (null != event.getCause() && !event.getCause().isEmpty()) { // value comparison; "" != s only tests reference identity + sb.append(" Cause:").append(event.getCause()); + } + sb.append("\n"); + } + } + + return sb.toString(); + } + + public static void main(String[] args) { + + EventExtractorBind2 ee = new EventExtractorBind2(); + ee.setTaeDescriptor("/desc/TestSetAnnotator.xml"); + //ee.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + //File inputFile = new File(args[0]); + + //File inputFile = new File("/media/songrq/soft/litway/数据/" + // + "BioNLP13/BioNLP-ST-2013_GE_devel_data_yuanShuJu"); + + //File inputFile = new File("/media/songrq/soft/litway/数据/" + // + "BioNLP13/BioNLP-ST-2013_GE_test_data_yuanShuJu"); + + File inputFile = new File(args[0]); + + //File inputFile = new File("/media/songrq/soft/litway/数据/" + // + "BioNLP13/b"); + try { + ee.extract(inputFile); + } catch (IOException e) { + // TODO auto-generated catch block + e.printStackTrace(); + } + } + + + + public static boolean isSimpleEvent(String eventType) { + + if (eventType.equalsIgnoreCase("Gene_expression") + || eventType.equalsIgnoreCase("Transcription") + || eventType.equalsIgnoreCase("Protein_catabolism") + || eventType.equalsIgnoreCase("Localization")) { + return true; + } + + return false; + } + + public static boolean isRegulatoryEvent(String eventType) { + + if (eventType.equalsIgnoreCase("Regulation") + || eventType.equalsIgnoreCase("Positive_regulation") + || eventType.equalsIgnoreCase("Negative_regulation")) { + + return true; + } + + return false; + } + + public static boolean isComplexEvent(String eventType) { + + if (eventType.equalsIgnoreCase("Protein_modification") + || eventType.equalsIgnoreCase("Phosphorylation") + || eventType.equalsIgnoreCase("Ubiquitination") + || eventType.equalsIgnoreCase("Acetylation") + || eventType.equalsIgnoreCase("Deacetylation") + || 
eventType.equalsIgnoreCase("Regulation") + || eventType.equalsIgnoreCase("Positive_regulation") + || eventType.equalsIgnoreCase("Negative_regulation")) { + + return true; + } + + return false; + } + public static final class Argument { + + private String argId; + private String relation; + + public String getId() { + return argId; + } + + public void setId(String argId) { + this.argId = argId; + } + + public String getRelation() { + return relation; + } + + public void setRelation(String relation) { + this.relation = relation; + } + } + + protected Token getToken(JCas jcas, Annotation annotation) { + + List tokens = JCasUtil.selectCovered(jcas, Token.class, + annotation); + // if protein/trigger is within a token + if (tokens.size() == 0) { + FSIterator iter = jcas.getAnnotationIndex( + Token.type).iterator(); + tokens = new ArrayList(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() <= annotation.getBegin() + && token.getEnd() >= annotation.getEnd()) { + tokens.add(token); + break; + } + } + } + /*if (tokens.size() == 0) { + System.out.println(annotation.getCoveredText()); + }*/ + Token token = tokens.get(0); + for (Token aToken : tokens) { + + try { + Double.parseDouble(aToken.getLemma()); + break; + } catch (NumberFormatException e) { + if (aToken.getCoveredText().equals(")")) { + break; + } + token = aToken; + } + } + return token; + } + + protected Set getNotProtein(JCas jcas, Trigger trigger, List sentenceProteins, DependencyExtractor dependencyExtractor) { + + //delete protein whose token is too far from triggertoken + Set notTheme = new HashSet(); + for (Protein protein : sentenceProteins) { + Token triggerToken = getTriggerToken(jcas, trigger); + Token token = getToken(jcas, protein); + int pathLength = dependencyExtractor.getDijkstraShortestPathLength( + triggerToken, token); + //int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId() + // : triggerToken.getId() - token.getId(); + if (pathLength > 6) { + notTheme.add(protein.getId()); + } + /*if (distance > 10) { + notTheme.add(protein.getId()); + }*/ + } + return notTheme; + } + + protected Set getFarProtein(JCas jcas, Trigger trigger, List sentenceProteins, DependencyExtractor dependencyExtractor) { + + //the same protein, delete far protein + Set farProtein = new HashSet(); + if (sentenceProteins.size() > 1) { + for(Protein p : sentenceProteins) { + for(Protein p2 : sentenceProteins) { + if (p.getId().equalsIgnoreCase(p2.getId())) { + continue; + } + if (p.getCoveredText().equalsIgnoreCase(p2.getCoveredText()) + || p.getCoveredText().contains(p2.getCoveredText()) + || p2.getCoveredText().contains(p.getCoveredText()) + ) { + + Token triggerToken = getTriggerToken(jcas, trigger); + Token token = getToken(jcas, p); + Token token2 = getToken(jcas, p2); + + int distance = token.getId() > triggerToken.getId() ? token.getId() - triggerToken.getId() + : triggerToken.getId() - token.getId(); + int distance2 = token2.getId() > triggerToken.getId() ? 
token2.getId() - triggerToken.getId() + : triggerToken.getId() - token2.getId(); + int pathLength = dependencyExtractor.getDijkstraShortestPathLength( + triggerToken, token); + int pathLength2 = dependencyExtractor.getDijkstraShortestPathLength( + triggerToken, token2); + if (pathLength > pathLength2) { + farProtein.add(p.getId()); + }else if (pathLength < pathLength2) { + farProtein.add(p2.getId()); + }else if (pathLength == pathLength2 && distance > distance2) { + farProtein.add(p.getId()); + }else if (pathLength == pathLength2 && distance < distance2) { + farProtein.add(p2.getId()); + } + } + } + } + return farProtein; + } + + protected Set getAndProtein(JCas jcas, List sentenceProteins, DependencyExtractor dependencyExtractor) { + + Set andProtein = new HashSet(); + if (sentenceProteins.size() > 1) { + for (Protein protein : sentenceProteins) { + for (Protein protein2 : sentenceProteins) { + if (protein.getId().equalsIgnoreCase(protein2.getId())) { + continue; + } + Token token2 = getToken(jcas, protein2); + Token token = getToken(jcas, protein); + String dependencyPath = dependencyExtractor.getShortestPath( + token, token2, Stage.BINDING); + if (dependencyPath != null + && (dependencyPath.equalsIgnoreCase("conj_and") + || dependencyPath.equalsIgnoreCase("-conj_and") + ||dependencyPath.equalsIgnoreCase("conj_or") + || dependencyPath.equalsIgnoreCase("-conj_or") + ||dependencyPath.equalsIgnoreCase("abbrev") // abbreviation, i.e. an equivalent mention + || dependencyPath.equalsIgnoreCase("-abbrev") + //||dependencyPath.equalsIgnoreCase("appos") + //|| dependencyPath.equalsIgnoreCase("-appos") + )) { + andProtein.add(protein.getId()); + andProtein.add(protein2.getId()); + } + /*if (token2.getId() == token.getId()) { + //andProtein.add(protein.getId()); + //andProtein.add(protein2.getId()); + }*/ + /*List between = new LinkedList(); + for (Token token : tokens) { + if (protein2.getBegin() >= protein.getEnd()) { + if (token.getBegin() >= protein.getEnd() + && token.getEnd() <= protein2.getBegin()) { + between.add(token); + } + }else if(protein.getBegin() >= protein2.getEnd()) { + if (token.getBegin() >= protein2.getEnd() + && token.getEnd() <= protein.getBegin()) { + between.add(token); + } + } + } + boolean isAnd = true; + for (Token token : between) { + if (!token.getCoveredText().equalsIgnoreCase(",") + && !token.getCoveredText().equalsIgnoreCase("and") + && !token.getCoveredText().equalsIgnoreCase("or") + ) { + isAnd = false; + break; + } + } + if (isAnd) { + andProtein.add(protein.getId()); + andProtein.add(protein2.getId()); + }*/ + } + } + } + return andProtein; + } + + protected boolean shouldDelete(JCas jcas, Token token, List sentenceProteins) { + + //delete entity + if (token.getLemma().equals("mrna") + && token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && (token.getLeftToken().getLeftToken().getLemma().equals("induction") + || token.getLeftToken().getLeftToken().getLemma().equals("express") + || token.getLeftToken().getLeftToken().getLemma().equals("expression"))) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getLeftToken(); + if (token2.getId() == proteinToken.getId()) { + return true;//transcription: induction/express/expression PROTEIN mRNA + } + } + } + if (token.getLemma().equals("mrna") + && token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("of") + && token.getLeftToken().getLeftToken().getLeftToken() != null + && 
token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("level") + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLemma().equals("higher") + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLemma().equals("significantly") + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLemma().equals("express") + ) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getLeftToken(); + if (token2.getId() == proteinToken.getId()) { + return true;//transcriotion: expressed significantly higher levels of PROTEIN mRNA + } + } + } + if (token.getLemma().equals("mrna") + && token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("of") + && token.getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("level") + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLemma().equals("higher") + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLeftToken().getLemma().equals("express") + ) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getLeftToken(); + if (token2.getId() == proteinToken.getId()) { + return true;//transcriotion: expressed higher levels of PROTEIN mRNA + } + } + } + if (token.getLemma().equals("mrna") + && token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("of") + && token.getLeftToken().getLeftToken().getLeftToken() != null + && (token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("induction") + || token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("expression") + || token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("induction"))) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getLeftToken(); + if (token2.getId() == proteinToken.getId()) { + return true;//transcriotion: induction/express/expression of PROTEIN mRNA + } + } + } + if (token.getLemma().equals("transcript") + && token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("of") + && token.getLeftToken().getLeftToken().getLeftToken() != null + && (token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("production") + || token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("detect"))) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getLeftToken(); + if (token2.getId() == proteinToken.getId()) { + return true;//transcriotion: production/detect of PROTEIN transcript + } + } + } + if (token.getLemma().equals("transcript") + && token.getLeftToken() != null + && 
token.getLeftToken().getLeftToken() != null + && (token.getLeftToken().getLeftToken().getLemma().equals("production") + || token.getLeftToken().getLeftToken().getLemma().equals("detect"))) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getLeftToken(); + if (token2.getId() == proteinToken.getId()) { + return true;//transcriotion: production/detect PROTEIN transcript + } + } + } + if (token.getLemma().equals("mrna") + && token.getRightToken() != null + && (token.getRightToken().getLemma().equals("production") + || token.getRightToken().getLemma().equals("synthesis") + || token.getRightToken().getLemma().equals("synthesize"))) { + return true;//transcriotion: mRNA production/synthesis/synthesize + } + if (token.getLemma().equals("mrna") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("be") + && token.getRightToken().getRightToken() != null + && (token.getRightToken().getRightToken().getLemma().equals("express") + || token.getRightToken().getRightToken().getLemma().equals("present"))) { + return true;//transcriotion: mRNA be express/present + } + if (token.getLemma().equals("mrna") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("be") + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getLemma().equals("also") + && token.getRightToken().getRightToken().getRightToken() != null + && (token.getRightToken().getRightToken().getRightToken().getLemma().equals("express") + || token.getRightToken().getRightToken().getRightToken().getLemma().equals("present"))) { + return true;//transcriotion: mRNA be also express/present + } + if (token.getLemma().equals("transcription") + && token.getRightToken() != null + && (token.getRightToken().getLemma().equals("factor") + || token.getRightToken().getLemma().equals("start"))) { + return true;//transcription factor/start: entity,transcription + } + if (token.getLemma().equals("binding") + && token.getRightToken() != null + && (token.getRightToken().getLemma().equals("site") + || token.getRightToken().getLemma().equals("domain") + || token.getRightToken().getLemma().equals("sequence"))) { + return true;//binding site/domain/sequence: entity,binding + } + if (token.getLemma().equals("localisation") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("sequence") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("nuclear")) { + return true;//nuclear localisation sequence: entity,localisation + } + + if (token.getLemma().equals("activation") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("region")) { + return true;//activation region : entity,Positive_regulation + } + if (token.getLemma().equals("located") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("site")) { + return true;//site located ,localisation + } + if (token.getLemma().equals("modification") + && token.getLeftToken() != null + && !token.getLeftToken().getSubLemma().equals("translational")) { + return true;//post-translational modification: Protein_modification + } + return false; + } + + protected String shouldChange(JCas jcas, Token token, List sentenceProteins) { + + //multi-token trigger + if (token.getLemma().equals("level") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("protein")) { + return "Gene-expression";//gene-expression:protein level + } + if 
(token.getLemma().equals("expression") + && token.getLeftToken() != null + && (token.getLeftToken().getLemma().equals("mrna") + || token.getLeftToken().getLemma().equals("rna") + || (token.getLeftToken().getLemma().equals(")") + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("mrna")))) { + return "Transcriotion";//transcriotion: mrna/rna expression + } + if (token.getLemma().equals("transcriptional") + && token.getRightToken() != null + && (token.getRightToken().getLemma().equals("level") + || token.getRightToken().getLemma().equals("activity") + || token.getRightToken().getLemma().equals("activation") + || token.getRightToken().getLemma().equals("regulation") + || token.getRightToken().getLemma().equals("elongation"))) { + return "Transcriotion";//transcriotion: transcriptional level/activity/activation/regulation/elongation + } + /*if (token.getLemma().equals("level") + && token.getLeftToken() != null + && (token.getLeftToken().getLemma().equals("mrna") + || token.getLeftToken().getLemma().equals("transcript") + || token.getLeftToken().getLemma().equals("transcription"))) { + return 3;//transcriotion: mrna/transcript level + }*/ + if ((token.getLemma().equals("express") + || token.getLemma().equals("expression") + || token.getLemma().equals("induction")) + && token.getRightToken() != null + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getLemma().equals("mrna")) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getRightToken(); + if (token2.getId() == proteinToken.getId()) { + return "Transcriotion";//transcriotion: induction/express/expression PROTEIN mRNA + } + } + } + if ((token.getLemma().equals("express") + || token.getLemma().equals("expression") + || token.getLemma().equals("induction")) + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("of") + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getLemma().equals("mrna")) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getRightToken().getRightToken(); + if (token2.getId() == proteinToken.getId()) { + return "Transcriotion";//transcriotion: induction/express/expression of PROTEIN mRNA + } + } + } + if (token.getLemma().equals("express") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("significantly") + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getLemma().equals("higher") + && token.getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getLemma().equals("level") + && token.getRightToken().getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getRightToken().getLemma().equals("of") + && token.getRightToken().getRightToken().getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getRightToken().getRightToken().getRightToken().getLemma().equals("mrna") + ) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = 
token.getRightToken().getRightToken().getRightToken().getRightToken().getRightToken(); + if (token2.getId() == proteinToken.getId()) { + return "Transcriotion";//transcriotion: expressed significantly higher levels of PROTEIN mRNA + } + } + } + if (token.getLemma().equals("express") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("higher") + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getLemma().equals("level") + && token.getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getLemma().equals("of") + && token.getRightToken().getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getRightToken().getRightToken().getLemma().equals("mrna") + ) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getRightToken().getRightToken().getRightToken().getRightToken(); + if (token2.getId() == proteinToken.getId()) { + return "Transcriotion";//transcriotion: expressed higher levels of PROTEIN mRNA + } + } + } + if ((token.getLemma().equals("production") + || token.getLemma().equals("detect")) + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("of") + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getRightToken().getLemma().equals("transcript")) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getRightToken().getRightToken(); + if (token2.getId() == proteinToken.getId()) { + return "Transcriotion";//transcriotion: production/detect of PROTEIN transcript + } + } + } + if ((token.getLemma().equals("production") + || token.getLemma().equals("detect")) + && token.getRightToken() != null + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getLemma().equals("transcript")) { + for (Protein protein : sentenceProteins) { + Token proteinToken = getToken(jcas, protein); + Token token2 = token.getRightToken(); + if (token2.getId() == proteinToken.getId()) { + return "Transcriotion";//transcriotion: production/detect PROTEIN transcript + } + } + } + if ((token.getLemma().equals("production") + || token.getLemma().equals("synthesis") + || token.getLemma().equals("synthesize")) + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("mrna")) { + return "Transcriotion";//transcriotion: mRNA production/synthesis/synthesize + } + if ((token.getLemma().equals("express") + || token.getLemma().equals("present")) + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("be") + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("mrna")) { + return "Transcriotion";//transcriotion: mRNA be express/present + } + if ((token.getLemma().equals("express") + || token.getLemma().equals("present")) + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("also") + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("be") + && token.getLeftToken().getLeftToken().getLeftToken() != null + && 
token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("mrna")) { + return "Transcriotion";//transcriotion: mRNA be also express/present + } + if ((token.getLemma().equals("expression") || token.getLemma().equals("detect") || token.getLemma().equals("exclusion")) + && ((token.getLeftToken() != null + && (token.getLeftToken().getLemma().equals("nuclear") + || token.getLeftToken().getLemma().equals("cytoplasmic"))) + || (token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && (token.getLeftToken().getLeftToken().getLemma().equals("nuclear") + || token.getLeftToken().getLeftToken().getLemma().equals("cytoplasmic"))))) { + return "Localization";//Localization: nuclear/cytoplasmic (PROTEIN) expression/detect/exclusion + } + if (token.getLemma().equals("located") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("in") + && token.getRightToken().getRightToken() != null + && ((token.getRightToken().getRightToken().getLemma().equals("nuclear") + || token.getRightToken().getRightToken().getLemma().equals("cytoplasmic")) + || (token.getRightToken().getRightToken().getRightToken() != null + &&(token.getRightToken().getRightToken().getRightToken().getLemma().equals("nuclear") + || token.getRightToken().getRightToken().getRightToken().getLemma().equals("cytoplasmic"))))) { + return "Localization";//Localization: located in (PROTEIN) nuclear/cytoplasmic + } + if (token.getLemma().equals("detectable") + && token.getRightToken() != null + && ((token.getRightToken().getLemma().equals("nuclear") + || token.getRightToken().getLemma().equals("cytoplasmic")) + || (token.getRightToken().getRightToken() != null + && (token.getRightToken().getLemma().equals("nuclear") + || token.getRightToken().getLemma().equals("cytoplasmic"))))) { + return "Localization";//Localization: detectable (PROTEIN) nuclear/cytoplasmic + } + if (token.getLemma().equals("detectable") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("be") + && token.getLeftToken().getLeftToken() != null + && ((token.getLeftToken().getLeftToken().getLemma().equals("nuclear") + || token.getLeftToken().getLeftToken().getLemma().equals("cytoplasmic")) + || (token.getLeftToken().getLeftToken().getLeftToken() != null + && (token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("nuclear") + || token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("cytoplasmic"))))) { + return "Localization";//Localization: nuclear/cytoplasmic (PROTEIN) be detectable + } + if (token.getLemma().equals("complex") + && ((token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("form")) + ||(token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("form")))) { + return "Binding";//binding: forms a complex; form complexes + } + if (token.getLemma().equals("form") + && ((token.getRightToken() != null + && (token.getRightToken().getLemma().equals("complex") + || token.getRightToken().getLemma().equals("heteromultimer"))) + ||(token.getRightToken() != null + && token.getRightToken().getRightToken() != null + && token.getRightToken().getRightToken().getLemma().equals("complex")))) { + return "Binding";//binding: forms a complex; form complexes; form heteromultimers + } + if (token.getLemma().equals("immunoprecipitate") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("together")) { + return "Binding";//immunoprecipitated together Binding 
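// Editor's sketch (not part of the original patch): every rule in shouldChange
// has the same shape -- match this token's lemma plus a neighbour's lemma and
// return an event-type string. At class level the same left-neighbour bigrams
// could be table-driven, as below (needs java.util.Map/HashMap;
// LEFT_BIGRAM_RULES and lookupLeftBigram are hypothetical names). Note that
// isSimpleEvent above compares against "Transcription" and "Gene_expression",
// so the canonical spellings are assumed here rather than the
// "Transcriotion"/"Gene-expression" literals the hand-written rules return.
private static final Map<String, String> LEFT_BIGRAM_RULES = new HashMap<String, String>();
static {
    LEFT_BIGRAM_RULES.put("form complex", "Binding"); // forms a complex
    LEFT_BIGRAM_RULES.put("up regulated", "Positive_regulation"); // up regulated
    LEFT_BIGRAM_RULES.put("suppressive capacity", "Negative_regulation"); // suppressive capacity
}

private static String lookupLeftBigram(Token token) {
    if (token.getLeftToken() == null) {
        return null; // no left neighbour, so no bigram rule can fire
    }
    String key = token.getLeftToken().getLemma().toLowerCase()
            .concat(" ").concat(token.getLemma().toLowerCase());
    return LEFT_BIGRAM_RULES.get(key); // null when no rule matches
}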
+ } + if (token.getLemma().equals("heteromultimer") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("form")) { + return "Binding";//form heteromultimers Binding + } + if (token.getLemma().equals("mechanism") + && token.getLeftToken() != null + && token.getLeftToken().getSubLemma().equals("translational")) { + return "Protein_modification";//post-translational mechanisms: Protein_modification + } + if (token.getLemma().equals("activity") + && token.getLeftToken() != null + && token.getLeftToken().getCoveredText().contains("ubiquitin")) { + return "Ubiquitination";//linear-ubiquitin-chain-ligase activity : Ubiquitination + } + if (token.getLemma().equals("downstream") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("function")) { + return "Regulation";//regulation: function downstream + } + if (token.getLemma().equals("function") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("downstream")) { + return "Regulation";//regulation: function downstream + } + if (token.getLemma().equals("downstream") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("target")) { + return "Regulation";//regulation: downstream target + } + if (token.getLemma().equals("control") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("the") + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("under")) { + return "Regulation";//regulation: under the control + } + if (token.getLemma().equals("specificity") + && token.getLeftToken() != null + && token.getLeftToken().getCoveredText().contains("substrate")) { + return "Regulation";//substrate specificity : Regulation + } + if (token.getLemma().equals("role") + && ((token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("play")) + || (token.getLeftToken() != null + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLeftToken().getLemma().equals("play")) + || (token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("dual")))) { + return "Regulation";//regulation: dual role; play an important role; plays a role + } + if (token.getLemma().equals("level") + && token.getLeftToken() != null + && (token.getLeftToken().getLemma().equals("high") + || token.getLeftToken().getLemma().equals("higher"))) { + return "Positive_regulation";//positive-regulation:high/higher level + } + if (token.getLemma().equals("presence") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("the") + && token.getLeftToken().getLeftToken() != null + && token.getLeftToken().getLeftToken().getLemma().equals("in") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("of")) { + return "Positive_regulation";//positive-regulation: in the presence of + } + if (token.getLemma().equals("response") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("in") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("to")) { + return "Positive_regulation";//positive-regulation:in response to + } + if (token.getLemma().equals("regulated") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("up")) { + return "Positive_regulation";//positive_regulation: up regulated + } + if 
(token.getLemma().equals("capacity") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("suppressive")) { + return "Negative_regulation";//suppressive capacity Negative_regulation + } + if (token.getLemma().equals("effect") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("inhibitory")) { + return "Negative_regulation";//inhibitory effect Negative_regulation + } + if (token.getLemma().equals("knock") + && token.getRightToken() != null + && token.getRightToken().getLemma().equals("down")) { + return "Negative_regulation";//knocked down Negative_regulation + } + if (token.getLemma().equals("regulate") + && token.getLeftToken() != null + && token.getLeftToken().getLemma().equals("negatively")) { + return "Negative_regulation";//Negative_regulation: negatively regulate + } + return "Non_trigger"; + } + + private int predict2(int[] featureSparseVector, Instance instance,String modelFile) throws IOException + { + svm_model model = svm.svm_load_model(modelFile); + int n = 0; + svm_node[] x = new svm_node[n]; + + double[] fs = instance.getFeaturesNumericWord2vec(); + if (null != fs) { + n = featureSparseVector.length + fs.length; + } else { + n = featureSparseVector.length; + } + int num = 0; + if (null != fs) { + for(int j=0; j previousIndex) { + if (null != fs) { + x[num] = new svm_node(); + x[num].index = fs.length + index; + x[num].value = 1; + }else { + x[num] = new svm_node(); + x[num].index = index; + x[num].value = 1; + } + } + num++; + previousIndex = index; + } + + return (int)svm.svm_predict(model,x); + } + +} diff --git a/src/info/chenli/litway/bionlp13/ge/ReadWord2vec.java b/src/info/chenli/litway/bionlp13/ge/ReadWord2vec.java new file mode 100644 index 0000000..9768ee9 --- /dev/null +++ b/src/info/chenli/litway/bionlp13/ge/ReadWord2vec.java @@ -0,0 +1,56 @@ +package info.chenli.litway.bionlp13.ge; + + + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.util.*; + +public class ReadWord2vec { + + + public static Map word2vec(File word2vecFile) { + Map word2vec = new HashMap(); + //File word2vecFile = new File("./model/proteinKept_skipG.txt"); + //File word2vecFile = new File("./model/ge13train.SkipGram"); + try { + InputStreamReader word2vecFileStream = new InputStreamReader( + new FileInputStream(word2vecFile), "UTF8"); + BufferedReader word2vecFileBuffer = new BufferedReader(word2vecFileStream); + String word2vecTextCh; + //List word = new ArrayList(); + //List vec = new ArrayList(); + word2vecFileBuffer.readLine(); + while ((word2vecTextCh = word2vecFileBuffer.readLine()) != null) { + String[] wordSb = word2vecTextCh.split(" "); + String[] wordSb2 = new String[wordSb.length-1]; + double[] wordSb3 = new double[wordSb.length-1]; + System.arraycopy(wordSb, 1, wordSb2, 0, wordSb.length-1); + + for (int i=0; i getStructuredInstances(JCas jcas, Map> pairsOfArticle = StanfordDependencyReader .getPairs(new File(FileUtil.removeFileNameExtension( UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); - + /*String s = FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas));*/ // Currently, one sentence is considered as one structured instance. 
while (sentenceIter.hasNext()) { @@ -81,28 +82,39 @@ protected List getStructuredInstances(JCas jcas, sentence); List proteins = JCasUtil.selectCovered(jcas, Protein.class, sentence); - + if (proteins.size() < 1) { + continue; + } + if (events.size() < 1) { + continue; + } for (Event event : events) { - - for (int i = 0; i < event.getThemes().size(); i++) { - - // check protein themes - for (Protein protein : proteins) { - - boolean isTheme = event.getThemes(i).equals( - protein.getId()); - - themeCandidates.add(themeToInstance(jcas, sentence, - protein, event.getTrigger(), pairsOfSentence, - dependencyExtractor, isTheme)); + // check protein themes + for (Protein protein : proteins) { + boolean isTheme = false; + for (int i = 0; i < event.getThemes().size(); i++) { + isTheme = event.getThemes(i).equals( + protein.getId()); + if (isTheme == true) { + break; + } } + Instance instance = themeToInstance(jcas, sentence, + protein, event.getTrigger(), pairsOfSentence, + dependencyExtractor, isTheme); + + if ( instance != null) { + themeCandidates.add(instance); + } + } - // check event themes + // check event themes + if (EventType.isComplexEvent(event.getTrigger().getEventType())) { for (Event themeEvent : events) { if (event != themeEvent) { - boolean isTheme = event.getThemes(i).equals( + boolean isTheme = event.getThemes(0).equals( themeEvent.getId()); themeCandidates.add(themeToInstance(jcas, sentence, diff --git a/src/info/chenli/litway/bionlp13/ge/ThemeRecogniser.java b/src/info/chenli/litway/bionlp13/ge/ThemeRecogniser.java index 5003f25..2d1605a 100644 --- a/src/info/chenli/litway/bionlp13/ge/ThemeRecogniser.java +++ b/src/info/chenli/litway/bionlp13/ge/ThemeRecogniser.java @@ -37,24 +37,19 @@ public void train(File trainingSet, boolean useSearn) { dict.creatNumericDictionary(instances); dict.saveDictionary(new File("./model/themes.".concat( classifierName).concat(".dict"))); - + /* trainingInstances.saveInstances(new File( "./model/instances.theme.txt")); trainingInstances.saveSvmLightInstances(new File( "./model/instances.theme.svm.txt")); - + */ train(dict.instancesToNumeric(instances)); - + saveModel(new File("./model/themes.".concat(classifierName) + .concat(".model"))); // System.out.println(accuracy(instances)); - ThemeInstances testInstances = new ThemeInstances(); - testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); - instances = testInstances.getInstances(new File( - "./data/development/")); - instances = dict.instancesToNumeric(instances); - testInstances.saveSvmLightInstances(new File( - "./model/instances.theme.svm.dev.txt")); - + + // System.out.println(accuracy(instances)); } @@ -63,7 +58,12 @@ public void train(File trainingSet, boolean useSearn) { public static void main(String[] args) { ThemeRecogniser tr = new ThemeRecogniser(); - tr.loadModel(new File("./model/themes.".concat(tr.classifierName) + tr.train(new File(args[0]), false); + //tr.test(new File(args[1])); + + + /* + tr.loadModel(new File("./model/themes.liblinear.model".concat(tr.classifierName) .concat(".model"))); InstanceDictionary dict = new InstanceDictionary(); @@ -97,5 +97,47 @@ public static void main(String[] args) { total++; } System.out.println(new Accurary(correct, total)); + */ + } + + private void test(File file) { + // TODO Auto-generated method stub + ThemeInstances testInstances = new ThemeInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = testInstances.getInstances(file); + InstanceDictionary dict = new 
InstanceDictionary(); + dict.loadDictionary(new File("./model/themes." + .concat(classifierName).concat(".dict"))); + instances = dict.instancesToNumeric(instances); + testInstances.saveSvmLightInstances(new File( + "./model/instances.theme.svm.dev.txt")); + int total = 0, correct = 0, tp, tn = 0, n = 0, fn, fp; + float p, r, f; + + for (Instance instance : instances) { + int prediction = predict(instance); + if (prediction == instance.getLabel()) { + if (instance.getLabelString().equalsIgnoreCase("Non_Theme")){ + tn++; + } + correct++; + } + + if (instance.getLabelString().equalsIgnoreCase("Non_Theme")){ + n++; + } + total++; + } + + fp = n - tn; + tp = correct - tn; + fn = total - n - tp; + p = (float) tp / (tp + fp); + r = (float) tp / (tp + fn); + f = (float) 2 * p * r / (p + r); + + System.out.println(new Accurary(correct, total)); + System.out.println("tp: " + tp + " fp: " + fp + " fn: " + fn); + System.out.println("p: " + p + " r: " + r + " f: " + f); } } diff --git a/src/info/chenli/litway/bionlp13/ge/TokenInstances.java b/src/info/chenli/litway/bionlp13/ge/TokenInstances.java index 0fbf788..cbdfcf6 100644 --- a/src/info/chenli/litway/bionlp13/ge/TokenInstances.java +++ b/src/info/chenli/litway/bionlp13/ge/TokenInstances.java @@ -1,6 +1,7 @@ package info.chenli.litway.bionlp13.ge; import info.chenli.classifier.Instance; +import info.chenli.classifier.InstanceDictionary; import info.chenli.litway.corpora.POS; import info.chenli.litway.corpora.Protein; import info.chenli.litway.corpora.Sentence; @@ -16,8 +17,14 @@ import info.chenli.litway.util.UimaUtil; import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.util.ArrayList; import java.util.HashMap; +import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Map; @@ -32,6 +39,8 @@ public class TokenInstances extends AbstractInstances { + private final static String classifierName = "liblinear"; + private final static Logger logger = Logger.getLogger(TokenInstances.class .getName()); @@ -57,16 +66,31 @@ protected List getLabelsString() { protected List getStructuredInstances(JCas jcas, FSIterator tokenIter) { + File word2vecFile = new File("./word2vec/word2vec100"); + //File word2vecFile = new File("/home/songrq/word2vec/data/word2vec100"); + Map word2vec = ReadWord2vec.word2vec(word2vecFile); + List results = new LinkedList(); + AnnotationIndex sentenceIndex = jcas .getAnnotationIndex(Sentence.type); FSIterator sentenceIter = sentenceIndex.iterator(); - Map> pairsOfArticle = StanfordDependencyReader - .getPairs(new File(FileUtil.removeFileNameExtension( - UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); - + Map> pairsOfArticle = new HashMap>(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + } else { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sd"))); + } + + + // Currently, one sentence is considered as one structured instance. 
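// Editor's sketch: the word2vec lookups in tokenToInstance below all repeat
// one convention -- a token covered by a Protein annotation is backed off to
// the literal key "PROTEIN"; otherwise the lemma is tried first, then its
// upper-cased form. A class-level helper capturing that convention
// (lookupVector is a hypothetical name, not in the original patch):
private static double[] lookupVector(Map<String, double[]> word2vec,
        String lemma, boolean isProtein) {
    String key = isProtein ? "PROTEIN" : lemma;
    if (word2vec.containsKey(key)) {
        return word2vec.get(key);
    }
    return word2vec.get(key.toUpperCase()); // null when both forms miss
}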
while (sentenceIter.hasNext()) { @@ -76,7 +100,16 @@ protected List getStructuredInstances(JCas jcas, Sentence sentence = (Sentence) sentenceIter.next(); Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + + List sentenceProteins = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + + if (sentenceProteins.size() <= 0) { + continue; + } + + List triggers = JCasUtil.selectCovered(jcas, Trigger.class, sentence); @@ -89,57 +122,29 @@ protected List getStructuredInstances(JCas jcas, trigger.getEventType()); } - // List originalTokens = JCasUtil.selectCovered(jcas, - // Token.class, sentence); - List sentenceProteins = JCasUtil.selectCovered(jcas, - Protein.class, sentence); - - // // print proteins which are within a token - // for (Token token : originalTokens) { - // - // for (Protein protein : sentenceProteins) { - // if (token.getBegin() == protein.getBegin() - // && token.getEnd() == protein.getEnd()) { - // continue; - // } - // if (token.getBegin() <= protein.getBegin() - // && token.getEnd() >= protein.getEnd()) { - // System.out.println(token.getCoveredText().concat("\t") - // .concat(protein.getCoveredText())); - // } - // } - // } - // - // if (true) { - // continue; - // } - // postProcessSentenceTokens(jcas, originalTokens, sentenceProteins, - // pairsOfSentence); List tokensOfSentence = JCasUtil.selectCovered(jcas, Token.class, sentence); DependencyExtractor dependencyExtractor = new DependencyExtractor( JCasUtil.selectCovered(jcas, Token.class, sentence), pairsOfSentence); + for (Token token : tokensOfSentence) { - creatingInstanceLoop: for (Token token : tokensOfSentence) { - - // the tokens with protein have to be considered, as they may - // have trigger - // for (Protein protein : sentenceProteins) { - // if (token.getBegin() == protein.getBegin() - // && token.getEnd() == protein.getEnd()) { - // continue creatingInstanceLoop; - // } - // } - - nodes.add(tokenToInstance(jcas, token, triggerTokens, + Instance instance = new Instance(); + instance = tokenToInstance(jcas, token, triggerTokens, tokensOfSentence, sentenceProteins, pairsOfSentence, - dependencyExtractor)); + dependencyExtractor, word2vec); + if (instance != null) { + instance.setFileId(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".word2vec")); + nodes.add(instance); + } + } results.add(si); } - + + //System.out.println(posSet.toString()); return results; } @@ -154,25 +159,135 @@ protected List getStructuredInstances(JCas jcas, * @param dependencyExtractor * @return */ + /** + * @param jcas + * @param token + * @param triggerTokens + * @param tokensOfSentence + * @param sentenceProteins + * @param pairsOfSentence + * @param dependencyExtractor + * @return + */ protected Instance tokenToInstance(JCas jcas, Token token, Map triggerTokens, List tokensOfSentence, List sentenceProteins, Set pairsOfSentence, - DependencyExtractor dependencyExtractor) { + DependencyExtractor dependencyExtractor, Map word2vec) { + + if(isProtein(token, sentenceProteins)) { + return null; + } + String pos2 = token.getPos(); + if (!POS.isPos(pos2)) { + return null; + } + if(pos2.equals("EX") | pos2.equals("LS") + | pos2.equals("NNP") | pos2.equals("PRP") + | pos2.equals("MD") | pos2.equals("NNPS") + | pos2.equals("WDT") + | pos2.equals("PRP$") | pos2.equals("PDT") + | pos2.equals("SYM") | pos2.equals("RP") + | pos2.equals("FW") | pos2.equals("POS") + | pos2.equals("WP") | pos2.equals("RBS") + | pos2.equals("VH") | pos2.equals("WP$") + ) { + return null; + } + + 
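// Editor's note: the guard above (and the punctuation guard that follows)
// joins its tests with the non-short-circuit '|'; on booleans that works but
// evaluates every operand, and '||' is the idiomatic choice. The tag list
// itself could live in a class-level set (SKIPPED_POS is a hypothetical name;
// needs java.util.Arrays/HashSet/Set), reducing the chain to a single test:
private static final Set<String> SKIPPED_POS = new HashSet<String>(Arrays.asList(
        "EX", "LS", "NNP", "PRP", "MD", "NNPS", "WDT", "PRP$", "PDT", "SYM",
        "RP", "FW", "POS", "WP", "RBS", "VH", "WP$"));
// ...and then: if (SKIPPED_POS.contains(pos2)) { return null; }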
if(token.getCoveredText().equals("%") | token.getCoveredText().equals("#") + | token.getCoveredText().equals("\"") | token.getCoveredText().equals("\'") + | token.getCoveredText().equals("&") | token.getCoveredText().equals("+") + | token.getCoveredText().equals("-") | token.getCoveredText().equals("/") + | token.getCoveredText().equals("[") | token.getCoveredText().equals("]") + | token.getCoveredText().equals("(") | token.getCoveredText().equals(")") + | token.getCoveredText().equals("<") | token.getCoveredText().equals(":") + | token.getCoveredText().equals(";") + | token.getCoveredText().equals(",") | token.getCoveredText().equals(".") + | token.getCoveredText().equals("?") | token.getCoveredText().equals("\\") + | token.getCoveredText().equals("|") | token.getCoveredText().equals("{") + | token.getCoveredText().equals("}") | token.getCoveredText().equals(">") + | token.getCoveredText().equals("*") | token.getCoveredText().equals("=") + | token.getCoveredText().equals("^") | token.getCoveredText().equals("$") + | token.getCoveredText().equals("@") | token.getCoveredText().equals("!") + | token.getCoveredText().equals("~") | token.getCoveredText().equals("`")) { + return null; + } + + String s = token.getCoveredText(); + String s1 = "+-/*()[]{}?!:;\\\"\'.<>,@#¥%^&_|=$~`"; + boolean b = true; + for (int i=0; i featureString = new ArrayList(); instance.setFeaturesString(featureString); featureString.add(new String[] { "text_".concat(token.getCoveredText() - .toLowerCase()) }); + .toLowerCase()) }); + + //String isPro = isProtein(token, sentenceProteins) ? token.getCoveredText() + // .toLowerCase() : null; + //featureString.add(null == isPro ? new String[0] + // : new String[] { "proText_".concat("_").concat(isPro) }); + String lemma = "lemma_".concat(token.getLemma().toLowerCase()); featureString.add(new String[] { lemma }); String stem = "stem_".concat(token.getStem().toLowerCase()); - featureString.add(new String[] { stem }); + //featureString.add(new String[] { stem }); String pos = "pos_".concat(token.getPos()); - featureString.add(new String[] { lemma.concat("_").concat(pos) }); + //featureString.add(new String[] { lemma.concat("_").concat(pos) }); + featureString.add(new String[] { pos }); + + if (!token.getSubLemma().equals(token.getLemma())) { + String subLemma = "sublemma_" + .concat(token.getSubLemma().toLowerCase()); + featureString.add(new String[] { subLemma }); + }else { + featureString.add(new String[0]); + } + + if (!token.getSubStem().equals(token.getStem())) { + String subStem = "substem_" + .concat(token.getSubStem().toLowerCase()); + featureString.add(new String[] { subStem }); + }else { + featureString.add(new String[0]); + } List modifiers = new ArrayList(); List heads = new ArrayList(); @@ -190,6 +305,26 @@ protected Instance tokenToInstance(JCas jcas, Token token, if (pair.getHead() == token.getId()) { for (Token aToken : tokensOfSentence) { if (aToken.getId() == pair.getModifier()) { + + tokenBegin = aToken.getBegin(); + tokenEnd = aToken.getEnd(); + aToken = containsProtein(aToken, sentenceProteins); + aToken.setBegin(tokenBegin); + aToken.setEnd(tokenEnd); + lemma2 = aToken.getLemma(); + if(isProtein(aToken, sentenceProteins)) { + lemma2 = "PROTEIN"; + } + if(word2vec.containsKey(lemma2) + || word2vec.containsKey(lemma2.toUpperCase())) { + if (word2vec.containsKey(lemma2)) { + fs2 = word2vec.get(lemma2); + }else { + fs2 = word2vec.get(lemma2.toUpperCase()); + } + //instance.setFeaturesNumericWord2vec(fs); + } + String tokenLemma = isProtein(aToken, 
sentenceProteins) ? "PROTEIN" : aToken.getLemma().toLowerCase(); modifiers.add(lemma.concat("_") @@ -197,6 +332,13 @@ protected Instance tokenToInstance(JCas jcas, Token token, .concat(tokenLemma)); noLemmaModifiers.add(pair.getRelation() .concat("_lemma_").concat(tokenLemma)); + + if (!aToken.getSubLemma().equals(aToken.getLemma())) { + String subLemma = isProtein(aToken, sentenceProteins) ? "PROTEIN" + : aToken.getSubLemma().toLowerCase(); + noLemmaModifiers.add(pair.getRelation() + .concat("_sublemma_").concat(subLemma)); + } noDepModifiers.add(lemma.concat("_lemma_").concat( tokenLemma)); @@ -215,13 +357,41 @@ protected Instance tokenToInstance(JCas jcas, Token token, } else if (pair.getModifier() == token.getId()) { for (Token aToken : tokensOfSentence) { if (aToken.getId() == pair.getHead()) { + tokenBegin = aToken.getBegin(); + tokenEnd = aToken.getEnd(); + aToken = containsProtein(aToken, sentenceProteins); + aToken.setBegin(tokenBegin); + aToken.setEnd(tokenEnd); + lemma2 = aToken.getLemma(); + if(isProtein(aToken, sentenceProteins)) { + lemma2 = "PROTEIN"; + } + if(word2vec.containsKey(lemma2) + || word2vec.containsKey(lemma2.toUpperCase())) { + if (word2vec.containsKey(lemma2)) { + fs3 = word2vec.get(lemma2); + }else { + fs3 = word2vec.get(lemma2.toUpperCase()); + } + //instance.setFeaturesNumericWord2vec(fs); + } + + String tokenLemma = isProtein(aToken, sentenceProteins) ? "PROTEIN" + : aToken.getLemma().toLowerCase(); heads.add(lemma.concat("_-").concat(pair.getRelation()) .concat("_lemma_") - .concat(aToken.getLemma().toLowerCase())); + .concat(tokenLemma)); noLemmaHeads.add(pair.getRelation().concat("_lemma_") - .concat(aToken.getLemma().toLowerCase())); + .concat(tokenLemma)); + + if (!aToken.getSubLemma().equals(aToken.getLemma())) { + String subLemma = isProtein(aToken, sentenceProteins) ? "PROTEIN" + : aToken.getSubLemma().toLowerCase(); + noLemmaHeads.add(pair.getRelation().concat("_sublemma_") + .concat(subLemma)); + } noDepHeads.add(lemma.concat("_-").concat("_lemma_") - .concat(aToken.getLemma().toLowerCase())); + .concat(tokenLemma)); } } } @@ -246,34 +416,57 @@ protected Instance tokenToInstance(JCas jcas, Token token, String[] iobjFeature = new String[iobjList.size()]; iobjFeature = iobjList.toArray(iobjFeature); - featureString.add(modifiersFeature); - featureString.add(headsFeature); - // featureString.add(noLemmaModifiersFeature); - // featureString.add(noLemmaHeadsFeature); + //featureString.add(modifiersFeature); + //featureString.add(headsFeature); + featureString.add(noLemmaModifiersFeature); + featureString.add(noLemmaHeadsFeature); // featureString.add(noDepModifiersFeature); // featureString.add(noDepHeadsFeature); // featureString.add(nsubjFeature); // featureString.add(dobjFeature); // featureString.add(iobjFeature); - String subLemma = "sublemma_" - .concat(null == token.getSubLemma() ? token.getLemma() - .toLowerCase() : token.getSubLemma().toLowerCase()); - featureString.add(new String[] { subLemma }); - String subStem = "substem_".concat(null == token.getSubStem() ? token - .getStem().toLowerCase() : token.getSubStem().toLowerCase()); - featureString.add(new String[] { subStem }); - // // ngram // previous word String leftTokenStr = token.getLeftToken() == null ? null : (POS - .isPos(token.getLeftToken().getPos()) ? "previousWord_" - .concat(token.getLeftToken().getLemma()) : null); - // featureString.add(null == leftTokenStr ? new String[0] - // : new String[] { leftTokenStr }); - // featureString.add(null == leftTokenStr ? 
new String[0] - // : new String[] { lemma.concat("_").concat(leftTokenStr) }); + .isPos(token.getLeftToken().getPos()) ? + isProtein(token.getLeftToken(), sentenceProteins) ? "PROTEIN": + "previousWord_".concat(token.getLeftToken().getLemma()) : null); + + String leftTokenSubLemma = null; + if (token.getLeftToken() != null) { + Token leftToken = token.getLeftToken(); + tokenBegin = leftToken.getBegin(); + tokenEnd = leftToken.getEnd(); + leftToken = containsProtein(leftToken, sentenceProteins); + leftToken.setBegin(tokenBegin); + leftToken.setEnd(tokenEnd); + lemma2 = leftToken.getLemma(); + if(isProtein(leftToken, sentenceProteins)) { + lemma2 = "PROTEIN"; + } + if(word2vec.containsKey(lemma2) + || word2vec.containsKey(lemma2.toUpperCase())) { + if (word2vec.containsKey(lemma2)) { + fs4 = word2vec.get(lemma2); + }else { + fs4 = word2vec.get(lemma2.toUpperCase()); + } + //instance.setFeaturesNumericWord2vec(fs); + } + + if (!token.getLeftToken().getSubLemma().equals(token.getLeftToken().getLemma())) { + leftTokenSubLemma = isProtein(token.getLeftToken(), sentenceProteins) ? "PROTEIN" + : token.getLeftToken().getSubLemma().toLowerCase(); + } + } + featureString.add(null == leftTokenStr ? new String[0] + : new String[] { leftTokenStr }); + featureString.add(null == leftTokenSubLemma ? new String[0] + : new String[] { leftTokenSubLemma }); + //featureString.add(null == leftTokenStr ? new String[0] + //: new String[] { lemma.concat("_").concat(leftTokenStr) }); String posLeftTokenStr = token.getLeftToken() == null ? null : ((token .getLeftToken().getPos().indexOf("NN") > -1 || token.getLeftToken().getPos().indexOf("JJ") > -1 || token @@ -284,15 +477,48 @@ protected Instance tokenToInstance(JCas jcas, Token token, // : new String[] { posLeftTokenStr }); // after word String rightTokenStr = token.getRightToken() == null ? null : (POS - .isPos(token.getRightToken().getPos()) ? "afterWord_" - .concat(token.getRightToken().getLemma()) : null); - // featureString.add(null == rightTokenStr ? new String[0] - // : new String[] { rightTokenStr }); - // featureString.add(null == rightTokenStr ? new String[0] - // : new String[] { lemma.concat("_").concat(rightTokenStr) }); - String posRightTokenStr = token.getRightToken() == null ? null : (token + .isPos(token.getRightToken().getPos()) ? + isProtein(token.getRightToken(), sentenceProteins) ? "PROTEIN": + "afterWord_".concat(token.getRightToken().getLemma()) : null); + + String rightTokenSubLemma = null; + if (token.getRightToken() != null) { + Token leftToken = token.getRightToken(); + tokenBegin = leftToken.getBegin(); + tokenEnd = leftToken.getEnd(); + leftToken = containsProtein(leftToken, sentenceProteins); + leftToken.setBegin(tokenBegin); + leftToken.setEnd(tokenEnd); + lemma2 = leftToken.getLemma(); + if(isProtein(leftToken, sentenceProteins)) { + lemma2 = "PROTEIN"; + } + if(word2vec.containsKey(lemma2) + || word2vec.containsKey(lemma2.toUpperCase())) { + if (word2vec.containsKey(lemma2)) { + fs5 = word2vec.get(lemma2); + }else { + fs5 = word2vec.get(lemma2.toUpperCase()); + } + //instance.setFeaturesNumericWord2vec(fs); + } + + if (!token.getRightToken().getSubLemma().equals(token.getRightToken().getLemma())) { + rightTokenSubLemma = isProtein(token.getRightToken(), sentenceProteins) ? "PROTEIN" + : token.getRightToken().getSubLemma().toLowerCase(); + } + } + featureString.add(null == rightTokenStr ? new String[0] + : new String[] { rightTokenStr }); + featureString.add(null == rightTokenSubLemma ? 
new String[0] + : new String[] { rightTokenSubLemma }); + //featureString.add(null == rightTokenStr ? new String[0] + //: new String[] { lemma.concat("_").concat(rightTokenStr) }); + + String posRightTokenStr = token.getRightToken() == null ? null : ((token .getRightToken().getPos().indexOf("NN") > -1) ? lemma - + "_afterWord_".concat(token.getLeftToken().getLemma()) : null; + + "_afterWord_".concat(token.getRightToken().getLemma()) : null); + // featureString.add(null == posRightTokenStr ? new String[0] // : new String[] { posRightTokenStr }); @@ -323,7 +549,7 @@ protected Instance tokenToInstance(JCas jcas, Token token, } } // featureString.add(proteins); - featureString.add(proteinsDummy); + // featureString.add(proteinsDummy); // featureString.add(proteinsLemma); boolean isDepNull = true; for (String dep : proteinsDep) { @@ -333,7 +559,12 @@ protected Instance tokenToInstance(JCas jcas, Token token, } } // featureString.add(isDepNull ? new String[0] : proteinsDep); - + /*System.arraycopy(fs, 0, fs0, 0, 100); + System.arraycopy(fs2, 0, fs0, 100, 100); + System.arraycopy(fs3, 0, fs0, 200, 100); + System.arraycopy(fs4, 0, fs0, 300, 100); + System.arraycopy(fs5, 0, fs0, 400, 100); + instance.setFeaturesNumericWord2vec(fs0);*/ if (null != triggerTokens) { instance.setLabelString(triggerTokens.containsKey(token.getId()) ? triggerTokens @@ -341,120 +572,143 @@ protected Instance tokenToInstance(JCas jcas, Token token, } else { instance.setLabelString(String.valueOf(EventType.Non_trigger)); } - + return instance; } - // public void postProcessSentenceTokens(JCas jcas, List tokens, - // List sentenceProteins, Set pairsOfSentence) { - // - // int i = tokens.size() + 1; - // - // tokenCollectingLoop: for (Token token : tokens) { - // - // if (!POS.isPos(token.getPos())) { - // token.removeFromIndexes(); - // continue; - // } - // - // for (Protein protein : sentenceProteins) { - // // token is a protein - // if (protein.getBegin() == token.getBegin() - // && protein.getEnd() == token.getEnd()) { - // continue tokenCollectingLoop; - // } - // // token is within a protein - // if ((token.getBegin() >= protein.getBegin() && token.getBegin() < protein - // .getEnd()) - // || (token.getBegin() > protein.getBegin() && token - // .getBegin() <= protein.getEnd())) { - // continue tokenCollectingLoop; - // } - // // protein is within a token (tricky part) - // if ((token.getBegin() <= protein.getBegin() && token.getEnd() > protein - // .getEnd()) - // || (token.getBegin() < protein.getBegin() && token - // .getEnd() >= protein.getEnd())) { - // Token proteinToken = createNewToken(jcas, - // protein.getBegin(), protein.getEnd(), protein - // .getCoveredText().toLowerCase(), - // String.valueOf(POS.NN)); - // if (protein.getBegin() != token.getBegin()) { - // Token leftToken = createNewToken( - // jcas, - // token.getBegin(), - // protein.getBegin(), - // token.getCoveredText() - // .substring( - // 0, - // protein.getBegin() - // - token.getBegin()) - // .toLowerCase(), String.valueOf(POS.NN)); - // leftToken.setId(i++); - // leftToken.setLeftToken(token.getLeftToken()); - // leftToken.setRightToken(proteinToken); - // proteinToken.setLeftToken(leftToken); - // } else { - // proteinToken.setLeftToken(token.getLeftToken()); - // } - // - // if (protein.getEnd() != token.getEnd()) { - // Token rightToken = createNewToken( - // jcas, - // protein.getEnd(), - // token.getEnd(), - // token.getCoveredText() - // .substring( - // 0, - // token.getEnd() - // - protein.getEnd()) - // .toLowerCase(), 
String.valueOf(POS.NN)); - // - // // use the original id of the token for the last token - // rightToken.setId(token.getId()); - // proteinToken.setId(i++); - // rightToken.setLeftToken(proteinToken); - // proteinToken.setRightToken(rightToken); - // rightToken.setRightToken(token.getRightToken()); - // - // } else { - // proteinToken.setRightToken(token.getRightToken()); - // proteinToken.setId(token.getId()); - // } - // token.removeFromIndexes(); - // continue tokenCollectingLoop; - // } - // } - // } - // } - - private boolean isProtein(Token token, List proteinsOfSentence) { - for (Protein protein : proteinsOfSentence) { - if ((token.getBegin() >= protein.getBegin() && token.getEnd() <= protein - .getEnd()) - || (protein.getBegin() >= token.getBegin() && protein - .getEnd() <= token.getEnd())) { - return true; - } + private boolean isDigit(Character i) { + if (i >= '0' && i <= '9') { + return true; + }else { + return false; } - return false; } public static void main(String[] args) { TokenInstances ti = new TokenInstances(); ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); - List instances = ti.getInstances(new File(args[0])); - - for (Instance instance : instances) { - System.out.print(instance.getLabelString()); - for (String[] features : instance.getFeaturesString()) { - for (String feature : features) { - System.out.print("\t".concat(feature)); + //List instances = ti.getInstances(new File("/media/songrq/soft/litway/数据/" + // + "BioNLP13/BioNLP-ST-2013_GE_train_data_yuanShuJu")); + List instances = ti.getInstances(new File("/media/songrq/soft/litway/数据/" + + "BioNLP13/BioNLP-ST-2013_GE_devel_data_yuanShuJu")); + InstanceDictionary dict = new InstanceDictionary(); +/* dict.creatNumericDictionary(instances); + dict.saveDictionary(new File("./model/triggers.".concat(classifierName) + .concat(".dict"))); + logger.info("Save dictionary.");*/ + dict.loadDictionary(new File("./model/triggers.".concat(classifierName) + .concat(".dict"))); + dict.instancesToNumeric(instances); + //File f = new File("./model/instances.trigger.svm.txt"); + File f = new File("./model/instances.trigger.svm.dev.txt"); + OutputStreamWriter word2vecFileStream; + try { + word2vecFileStream = new OutputStreamWriter( + new FileOutputStream(f), "UTF8"); + + StringBuffer sb = new StringBuffer(); + for (Instance instance : instances) { + + sb.append(String.valueOf(instance.getLabel())); + double[] fs = instance.getFeaturesNumericWord2vec(); + if (null != fs) { + for (int m=0; m previousIndex) { + if (null != fs) { + sb.append(" ".concat(String.valueOf(fs.length + feature)).concat(":1")); + }else { + sb.append(" ".concat(String.valueOf(feature)).concat(":1")); + } + } + previousIndex = feature; + } + sb.append("\n"); + String instancesStr = sb.toString(); + word2vecFileStream.write(instancesStr); + sb.delete(0,sb.length()); + } + word2vecFileStream.close(); + } catch (UnsupportedEncodingException | FileNotFoundException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + protected boolean isProtein(Token token, List proteinsOfSentence) { + List proteins = new LinkedList(); + for (Protein protein : proteinsOfSentence) { + if (token.getBegin() >= protein.getBegin() && token.getEnd() <= protein + .getEnd()) { + return true; + }else if (((protein.getBegin() >= token.getBegin() && protein + .getEnd() < token.getEnd()) || + (protein.getBegin() > token.getBegin() && protein + .getEnd() <= token.getEnd()))) 
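/* [Sketch, not part of the diff] The branch below collects proteins strictly
   contained inside the token; isProtein then strips their text plus '/' and
   '-' and treats the token as a protein mention if nothing is left. The same
   test as a pure function (hypothetical name):

   static boolean onlyProteinsLeft(String tokenText, java.util.List<String> covered) {
       String rest = tokenText;
       for (String p : covered) {
           rest = rest.replace(p, "");
       }
       rest = rest.replace("/", "").replace("-", "");
       return rest.trim().isEmpty();  // e.g. "p50/p65" with {"p50","p65"} -> true
   }
*/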
{ + proteins.add(protein); } - System.out.println(); } + if (proteins.size() < 1) { + return false; + } + + String tokenText = token.getCoveredText(); + for (Protein protein : proteins) { + tokenText = tokenText.replace(protein.getCoveredText(), ""); + } + tokenText = tokenText.replace("/", ""); + tokenText = tokenText.replace("-", ""); + if (tokenText.trim().equals("")) { + return true; + } + return false; } + protected Token containsProtein(Token token, List proteinsOfSentence) { + List proteins = new LinkedList(); + for (Protein protein : proteinsOfSentence) { + if (((protein.getBegin() >= token.getBegin() && protein + .getEnd() < token.getEnd()) || + (protein.getBegin() > token.getBegin() && protein + .getEnd() <= token.getEnd())) + ) { + proteins.add(protein); + } + } + if (proteins.size() < 1) { + return token; + } + + String tokenText = token.getCoveredText(); + for (Protein protein : proteins) { + tokenText = tokenText.replace(protein.getCoveredText(), " PROTEIN "); + } + tokenText = tokenText.replace("/", " "); + tokenText = tokenText.replace("-", " "); + while(tokenText.contains(" ")) { + tokenText = tokenText.replace(" ", " "); + } + tokenText = tokenText.trim(); + String[] s = tokenText.split(" "); + for (int i=s.length-1; i>=0; i--) { + if (!s[i].equals("PROTEIN")) { + tokenText = s[i]; + break; + } + } + token.setBegin(token.getBegin() + token.getCoveredText().indexOf(tokenText)); + token.setEnd(token.getBegin() + tokenText.length()); + token.setLemma(tokenText); + return token; + } + + } diff --git a/src/info/chenli/litway/bionlp13/ge/TokenRecognizer.java b/src/info/chenli/litway/bionlp13/ge/TokenRecognizer.java new file mode 100644 index 0000000..bfe59aa --- /dev/null +++ b/src/info/chenli/litway/bionlp13/ge/TokenRecognizer.java @@ -0,0 +1,35 @@ +package info.chenli.litway.bionlp13.ge; + +import java.io.IOException; + +import service.svm_train; +import service.svm_predict; + +public class TokenRecognizer { + public static void main(String[] args) { + String []arg ={ "-s", "0", + "-t", "2", + "-c", "32", + "-g", "0.5", + "-h", "1", + "./model/instances.trigger.svm.txt", //存放SVM训练模型用的数据的路径 + "./model/triggers.model"//存放SVM通过训练数据训练出来的模型的路径 + + }; + + String []parg={"./model/instances.trigger.svm.dev.txt", + "./model/triggers.model", + "./model/trigger.out"}; + + try { + System.out.println("........SVM运行开始.........."); + svm_train.main(arg); + System.out.println("........SVM训练结束.........."); + svm_predict.main(parg); + System.out.println("........SVM运行结束.........."); + } catch (IOException e) { + // TODO 自动生成的 catch 块 + e.printStackTrace(); + } + } +} diff --git a/src/info/chenli/litway/bionlp13/ge/TriggerInstances.java b/src/info/chenli/litway/bionlp13/ge/TriggerInstances.java deleted file mode 100644 index ab157dd..0000000 --- a/src/info/chenli/litway/bionlp13/ge/TriggerInstances.java +++ /dev/null @@ -1,115 +0,0 @@ -package info.chenli.litway.bionlp13.ge; - -import info.chenli.litway.corpora.Token; -import info.chenli.litway.corpora.Trigger; -import info.chenli.litway.searn.StructuredInstance; - -import java.io.File; -import java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.logging.Logger; - -import org.apache.uima.cas.FSIterator; -import org.apache.uima.jcas.JCas; -import org.apache.uima.jcas.tcas.Annotation; -import org.uimafit.util.JCasUtil; - -public class TriggerInstances extends AbstractInstances { - - private final static Logger logger = Logger - .getLogger(TriggerInstances.class.getName()); - - public TriggerInstances() { - 
- super("triggers", new int[]{Trigger.type}); - - } - - @Override - protected List getLabelsString() { - - ArrayList tokenTypes = new ArrayList(); - for (EventType eventType : EventType.values()) { - tokenTypes.add(String.valueOf(eventType)); - } - - return tokenTypes; - - } - - @Override - protected List getStructuredInstances(JCas jcas, - FSIterator annoIter) { - - List results = new LinkedList(); - - // set annotations to the instance - while (annoIter.isValid()) { - - Trigger trigger = (Trigger) annoIter.get(); - - // get tokens - List tokens = JCasUtil.selectCovered(jcas, Token.class, - trigger); - Token triggerToken = null; - - if (tokens.size() == 0) - // if trigger is within a token, then take - // the nesting token. It - // happens, e.g. in PMC-2065877-01-Introduction. - { - FSIterator iter = jcas.getAnnotationIndex( - Token.type).iterator(); - while (iter.hasNext()) { - Token token = (Token) iter.next(); - if (token.getBegin() <= trigger.getBegin() - && token.getEnd() >= trigger.getEnd()) { - triggerToken = token; - break; - } - } - - } else - // take one of the nested tokens. - { - - triggerToken = getTriggerToken(tokens); - } - -// double[] values = new double[instances.numAttributes()]; -// -// values[0] = instances.attribute(0).addStringValue( -// triggerToken.getCoveredText()); -// values[1] = instances.attribute(1).addStringValue( -// triggerToken.getLemma()); -// values[2] = instances.attribute(2).addStringValue( -// triggerToken.getPos()); -// values[3] = instances.attribute(3).addStringValue( -// null == triggerToken.getLeftToken() ? "" : triggerToken -// .getLeftToken().getCoveredText()); -// values[4] = instances.attribute(4).addStringValue( -// null == triggerToken.getRightToken() ? "" : triggerToken -// .getRightToken().getCoveredText()); -// values[5] = classes.indexOfValue(trigger.getEventType()); -// -// StructuredInstance si = new StructuredInstance(); -// // TODO this part is wrong and doesn't work properly due to the -// // change in the upper class. need to be updated. 
-// new DenseInstance(1.0, values); -// results.add(si); - - annoIter.moveToNext(); - } - - return results; - } - - public static void main(String[] args) { - - TriggerInstances ti = new TriggerInstances(); - ti.getInstances(new File(args[0])); - System.out.println(ti.getInstances()); - } - -} diff --git a/src/info/chenli/litway/bionlp13/ge/TriggerRecogniser.java b/src/info/chenli/litway/bionlp13/ge/TriggerRecogniser.java index ed1ce7e..0b232cc 100644 --- a/src/info/chenli/litway/bionlp13/ge/TriggerRecogniser.java +++ b/src/info/chenli/litway/bionlp13/ge/TriggerRecogniser.java @@ -14,13 +14,23 @@ import info.chenli.litway.util.DependencyExtractor; import info.chenli.litway.util.FileUtil; import info.chenli.litway.util.StanfordDependencyReader; +import info.chenli.litway.util.UimaUtil; import info.chenli.litway.util.StanfordDependencyReader.Pair; import info.chenli.litway.util.Stemmer; import info.chenli.litway.util.Timer; import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.Array; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; @@ -100,43 +110,57 @@ public void train(String trainingDir, int round) { trainingDir)); logger.info(String.valueOf(instances.size()).concat( " instances are collected.")); - + InstanceDictionary dict = new InstanceDictionary(); dict.creatNumericDictionary(instances); dict.saveDictionary(new File("./model/triggers.".concat(classifierName) .concat(".dict"))); logger.info("Save dictionary."); - trainingInstances.saveInstances(new File( +/* trainingInstances.saveInstances(new File( "./model/instances.trigger.txt")); trainingInstances.saveNumericInstances(new File( "./model/instances.trigger.num.txt")); trainingInstances.saveSvmLightInstances(new File( "./model/instances.trigger.svm.txt")); +*/ + Timer timer = new Timer(); + timer.start(); - // development instances - // TokenInstances devInstances = new TokenInstances(); - // devInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); - // List devInstancesList = devInstances.getInstances(new File( - // "./data/development/")); - // logger.info(String.valueOf(devInstancesList.size()).concat( - // " instances are collected.")); - // - // dict.instancesToNumeric(devInstancesList); + train(instances, round); + timer.stop(); + logger.info("Training takes ".concat(String.valueOf(timer + .getRunningTime()))); + + saveModel(new File("./model/triggers.".concat(classifierName).concat( + ".model"))); + + } + + public void train2(String trainingDir, int round) { // - // devInstances.saveInstances(new File( - // "./model/instances.trigger.dev.txt")); - // devInstances.saveNumericInstances(new File( - // "./model/instances.trigger.num.dev.txt")); - // devInstances.saveSvmLightInstances(new File( - // "./model/instances.trigger.svm.dev.txt")); + // collect all instances and fetch syntactical information // - // System.out.print("Finish collecting events."); - // System.exit(0); - - // Collections.shuffle(instances); - // logger.info("Shuffle instances."); + TokenInstances trainingInstances = new TokenInstances(); + trainingInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = trainingInstances.getInstances(new File( + trainingDir)); + 
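/* [Sketch, not part of the diff] train() and train2() rely on the same
   dictionary round-trip: feature strings are mapped to numeric ids once at
   training time, and the identical mapping must be reloaded before prediction.
   Condensed flow using the types from this diff (file names are illustrative;
   "creatNumericDictionary" is spelled as in the API):

   InstanceDictionary dict = new InstanceDictionary();
   dict.creatNumericDictionary(instances);                  // build from training data
   dict.saveDictionary(new File("./model/triggers.dict"));
   // ... later, at prediction time ...
   InstanceDictionary dict2 = new InstanceDictionary();
   dict2.loadDictionary(new File("./model/triggers.dict"));
   dict2.instancesToNumeric(instances);                     // encode with the same ids
*/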
logger.info(String.valueOf(instances.size()).concat( + " instances are collected.")); + + InstanceDictionary dict = new InstanceDictionary(); + dict.creatNumericDictionary(instances); + dict.saveDictionary(new File("./model/triggers.train.devel.".concat(classifierName) + .concat(".dict"))); + logger.info("Save dictionary."); +/* trainingInstances.saveInstances(new File( + "./model/instances.trigger.txt")); + trainingInstances.saveNumericInstances(new File( + "./model/instances.trigger.num.txt")); + trainingInstances.saveSvmLightInstances(new File( + "./model/instances.trigger.svm.txt")); +*/ Timer timer = new Timer(); timer.start(); @@ -145,11 +169,11 @@ public void train(String trainingDir, int round) { logger.info("Training takes ".concat(String.valueOf(timer .getRunningTime()))); - saveModel(new File("./model/triggers.".concat(classifierName).concat( + saveModel(new File("./model/triggers.train.devel.".concat(classifierName).concat( ".model"))); } - + public List predict(File file, InstanceDictionary dict, boolean printConfusionMatrix) { @@ -186,9 +210,17 @@ public List predict(File file, InstanceDictionary dict, } else { JCas jcas = instancesGetter.processSingleFile(file); - Map> pairsOfArticle = StanfordDependencyReader - .getPairs(new File(FileUtil.removeFileNameExtension( - file.getAbsolutePath()).concat(".sdepcc"))); + Map> pairsOfArticle = new HashMap>(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + } else { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sd"))); + } FSIterator sentenceIter = jcas.getAnnotationIndex( Sentence.type).iterator(); @@ -211,10 +243,13 @@ public List predict(File file, InstanceDictionary dict, DependencyExtractor dependencyExtractor = new DependencyExtractor( JCasUtil.selectCovered(jcas, Token.class, sentence), pairsOfSentence); + File word2vecFile = new File("/home/songrq/word2vec/data/word2vec100"); + Map word2vec = ReadWord2vec.word2vec(word2vecFile); + for (Token token : tokens) { Instance instance = instancesGetter.tokenToInstance(jcas, token, null, tokens, sentenceProteins, - pairsOfSentence, dependencyExtractor); + pairsOfSentence, dependencyExtractor, word2vec); instance = dict.instanceToNumeric(instance); int prediction = instance.getFeaturesNumeric().length == 0 ? 
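/* [Sketch, not part of the diff] Note that the loop above re-reads the
   word2vec table once per sentence; the table is read-only, so it could be
   loaded once and reused. A minimal hoist, assuming ReadWord2vec.word2vec
   returns a lemma-to-vector Map as it is used in this diff:

   private static final java.util.Map<String, double[]> WORD2VEC =
           ReadWord2vec.word2vec(new File("/home/songrq/word2vec/data/word2vec100"));
*/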
dict .getLabelNumeric(String @@ -350,15 +385,12 @@ private Fscore test(List instances, InstanceDictionary dict, int counter) { int tp = 0, fp = 0, tn = 0, fn = 0, correct = 0, total = 0; - StringBuffer tp_instances = new StringBuffer(); +/* StringBuffer tp_instances = new StringBuffer(); StringBuffer fp_nonTrigger_instances = new StringBuffer(); - StringBuffer fp_trigger_instances = new StringBuffer(); - + StringBuffer fp_trigger_instances = new StringBuffer();*/ // Collections.shuffle(instances); for (Instance instance : instances) { - instance = dict.instanceToNumeric(instance); - int prediction = this.predict(instance); if (instance.getLabelString() == String .valueOf(EventType.Non_trigger)) { @@ -366,32 +398,47 @@ private Fscore test(List instances, InstanceDictionary dict, if (prediction != instance.getLabel()) { fp++; total++; - fp_nonTrigger_instances.append(dict - .getLabelString(prediction).concat("\t") - .concat(instance.toString()).concat("\n")); + + /*fp_nonTrigger_instances.append(String.valueOf(instance.getSentenceId()) + .concat("\t").concat(String.valueOf(instance.getTokenId())) + .concat("\t").concat(dict.getLabelString(prediction)) + .concat("\t").concat(instance.getLabelString()) + .concat("\t").concat(instance.getFeaturesString().get(0)[0]) + .concat("\n"));*/ + } else { + tn++; } } else { if (prediction != instance.getLabel()) { - fp++; + //fp++; fn++; - fp_trigger_instances.append(dict.getLabelString(prediction) - .concat("\t").concat(instance.toString()) - .concat("\n")); + /*fp_trigger_instances.append(String.valueOf(instance.getSentenceId()) + .concat("\t").concat(String.valueOf(instance.getTokenId())) + .concat("\t").concat(dict.getLabelString(prediction)) + .concat("\t").concat(instance.getLabelString()) + .concat("\t").concat(instance.getFeaturesString().get(0)[0]) + //.concat("\t").concat(instance.getFeaturesString().get(1)[0]) + .concat("\n"));*/ } else { - correct++; + correct++;// how many of the extracted triggers are correct tp++; - tp_instances.append(dict.getLabelString(prediction) - .concat("\t").concat(instance.toString()) - .concat("\n")); + /*tp_instances.append(String.valueOf(instance.getSentenceId()) + .concat("\t").concat(String.valueOf(instance.getTokenId())) + .concat("\t").concat(dict.getLabelString(prediction)) + .concat("\t").concat(instance.getLabelString()) + .concat("\t").concat(instance.getFeaturesString().get(0)[0]) + //.concat("\t").concat(instance.getFeaturesString().get(1)[0]) + .concat("\n"));*/ } + if (prediction != dict.getLabelNumeric(String .valueOf(EventType.Non_trigger))) { - total++; + total++;// how many triggers were extracted in total } } } - FileUtil.saveFile(fp_nonTrigger_instances.toString(), new File( +/* FileUtil.saveFile(fp_nonTrigger_instances.toString(), new File( "./result/fp_nonTrigger".concat(String.valueOf(counter)) .concat(".txt"))); FileUtil.saveFile(fp_trigger_instances.toString(), @@ -399,92 +446,110 @@ private Fscore test(List instances, InstanceDictionary dict, .concat(".txt"))); FileUtil.saveFile(tp_instances.toString(), new File("./result/tp" .concat(String.valueOf(counter)).concat(".txt"))); - +*/ Fscore fscore = new Fscore(tp, fp, tn, fn); System.out.println(fscore); System.out.println(new Accurary(correct, total)); return fscore; } - + /** + * train: runs on the training data; outputs the model and instances + * predict: runs on the test data; outputs triggers + * test: runs on the development data; outputs instances and Accuracy + * cross: cross-validation on the training data + * @param args + */ public static void main(String[] args) { - if (args.length == 0 || !args[0].equals("predict") - && !args[0].equals("train") && !args[0].equals("test") - &&
!args[0].equals("cross")) { - throw new IllegalArgumentException( - "The first argument has to be \"predict\" or \"train\". "); - } - - File file = new File(args[1]); TriggerRecogniser tr = new TriggerRecogniser(); - - if (args[0].equals("train")) { - if (!file.isDirectory()) { - throw new IllegalArgumentException( - "The second argument has to be the training directory. "); - } - tr.train(args[1], 1); - } else if (args[0].equals("predict")) { - - if (!file.isFile() && !file.isDirectory()) { - throw new IllegalArgumentException( - "The second argument has to be a file."); - } - - tr.loadModel(new File("./model/triggers.".concat(tr.classifierName) - .concat(".model"))); - InstanceDictionary dict = new InstanceDictionary(); - dict.loadDictionary(new File("./model/triggers.".concat( - tr.classifierName).concat(".dict"))); - - List triggers = tr.predict(file, dict, false); - for (Trigger trigger : triggers) { - System.out.println(trigger.getId().concat("\t") - .concat(trigger.getEventType()).concat(" ") - .concat(String.valueOf(trigger.getBegin())).concat(" ") - .concat(String.valueOf(trigger.getEnd())).concat("\t") - .concat(trigger.getCoveredText())); + //tr.train("/media/songrq/soft/litway/数据/" + // + "BioNLP13/BioNLP-ST-2013_GE_train_data_yuanShuJu", 1); + tr.train2(args[0], 1); + + + //tr.train2("/media/songrq/soft/litway/数据/" + // + "BioNLP13/BioNLP-ST-2013_GE_train_devel_data_yuanShuJu", 1); +//////////////////////////////////////////////////////////////////////////////////////// + + + /*tr.loadModel(new File("./model/triggers.".concat(tr.classifierName) + .concat(".model"))); + + TokenInstances testInstances = new TokenInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = testInstances.getInstances(new File("/media/songrq/soft/litway/数据/" + + "BioNLP13/BioNLP-ST-2013_GE_devel_data_yuanShuJu")); + logger.info(String.valueOf(instances.size()).concat( + " instances are collected.")); + + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/triggers.".concat( + tr.classifierName).concat(".dict"))); + tr.test(instances, dict, 1); + dict.instancesToNumeric(instances); + + //testInstances.saveInstances(new File( + // "./model/instances.trigger.dev.txt")); + //testInstances.saveNumericInstances(new File( + // "./model/instances.trigger.num.dev.txt")); + //testInstances.saveSvmLightInstances(new File( + // "./model/instances.trigger.svm.dev.txt")); + + System.out.print("Finish collecting events."); + + int total = 0, correct = 0, tp, tn = 0, n = 0, fn, fp; + float p, r, f; + int total14[] = new int [14]; + int correct14[] = new int [14]; + String ss[] = {"Non_trigger", "Gene_expression", "Transcription", "Protein_catabolism", "Localization", "Binding", "Protein_modification", "Phosphorylation", "Ubiquitination", "Acetylation", "Deacetylation", "Regulation", "Positive_regulation", "Negative_regulation"}; + for (Instance instance : instances) { + for (int i=0; i<14; i++) { + if (instance.getLabelString().equalsIgnoreCase(ss[i])) { + total14[i]++; + } } - - } else if (args[0].equals("test")) { - - tr.loadModel(new File("./model/triggers.".concat(tr.classifierName) - .concat(".model"))); - - TokenInstances testInstances = new TokenInstances(); - testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); - List instances = testInstances.getInstances(new File( - "./data/development/")); - logger.info(String.valueOf(instances.size()).concat( - " instances are collected.")); - - InstanceDictionary dict = new 
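// [Sketch, not part of the diff] How the commented-out evaluation below derives
// trigger-level counts (n = gold non-triggers, tn = non-triggers kept correctly,
// total = all instances, correct = all correct predictions):
//   fp = n - tn            // non-triggers mislabelled as some trigger type
//   tp = correct - tn      // triggers predicted with the correct type
//   fn = (total - n) - tp  // gold triggers missed or mistyped
//   p = tp / (tp + fp);  r = tp / (tp + fn);  f = 2 * p * r / (p + r)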
InstanceDictionary(); - dict.loadDictionary(new File("./model/triggers.".concat( - tr.classifierName).concat(".dict"))); - dict.instancesToNumeric(instances); - - testInstances.saveInstances(new File( - "./model/instances.trigger.dev.txt")); - testInstances.saveNumericInstances(new File( - "./model/instances.trigger.num.dev.txt")); - testInstances.saveSvmLightInstances(new File( - "./model/instances.trigger.svm.dev.txt")); - - System.out.print("Finish collecting events."); - - int total = 0, correct = 0; - for (Instance instance : instances) { - int prediction = tr.predict(instance); - if (prediction == instance.getLabel()) { - correct++; + int prediction = tr.predict(instance); + if (prediction == instance.getLabel()) { + for (int i=0; i<14; i++) { + if (instance.getLabelString().equalsIgnoreCase(ss[i])) { + correct14[i]++; + } } - total++; + if (instance.getLabelString() == String + .valueOf(EventType.Non_trigger)){ + tn++; + } + correct++; + } + + if (instance.getLabelString() == String + .valueOf(EventType.Non_trigger)){ + n++; } - System.out.println(new Accurary(correct, total)); - } else if (args[0].equals("cross")) { - tr.crossValidate(args[1]); + total++; } + + fp = n - tn; + tp = correct - tn; + fn = total - n - tp; + p = (float) tp / (tp + fp); + r = (float) tp / (tp + fn); + f = (float) 2 * p * r / (p + r); + + System.out.println(new Accurary(correct, total)); + System.out.println("tp: " + tp + " fp: " + fp + " fn: " + fn); + System.out.println("p: " + p + " r: " + r + " f: " + f); + for (int i=0; i<14; i++) { + System.out.print(ss[i]); + System.out.print("\t\t"); + System.out.print(total14[i]); + System.out.print("\t\t"); + System.out.print(correct14[i]); + System.out.print("\t\t"); + System.out.print((float)correct14[i]/total14[i]); + System.out.print("\n"); + }*/ } } diff --git a/src/info/chenli/litway/bionlp13/ge/Word2vec.java b/src/info/chenli/litway/bionlp13/ge/Word2vec.java new file mode 100644 index 0000000..a096eb1 --- /dev/null +++ b/src/info/chenli/litway/bionlp13/ge/Word2vec.java @@ -0,0 +1,94 @@ +package info.chenli.litway.bionlp13.ge; + +import info.chenli.classifier.Instance; +import info.chenli.litway.corpora.POS; +import info.chenli.litway.corpora.Protein; +import info.chenli.litway.corpora.Sentence; +import info.chenli.litway.corpora.Token; +import info.chenli.litway.corpora.Trigger; +import info.chenli.litway.searn.StructuredInstance; +import info.chenli.litway.util.BioLemmatizerUtil; +import info.chenli.litway.util.DependencyExtractor; +import info.chenli.litway.util.FileUtil; +import info.chenli.litway.util.StanfordDependencyReader; +import info.chenli.litway.util.StanfordDependencyReader.Pair; +import info.chenli.litway.util.Stemmer; +import info.chenli.litway.util.UimaUtil; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.uimafit.util.JCasUtil; + +public class Word2vec { + + public static void main(String[] args) { + + TokenInstances ti = new TokenInstances(); + ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = ti.getInstances(new File(args[0])); + StringBuffer sb = new StringBuffer(); + StringBuffer sb2 = new StringBuffer(); + int i = 0; + String fileDir = 
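/* [Sketch, not part of the diff] Format of the per-article files Word2vec.main
   writes under ./word2vec/GE13/ (inferred from the StringBuffer logic; ids and
   labels are illustrative): one line of instance ids per sentence, followed by
   "id label" lines for the article's gold triggers, e.g.

     12 13 14 15
     16 17 18
     14 gene_expression
     17 binding
*/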
instances.get(0).getFileId(); + String[] fileNames = instances.get(0).getFileId().split("/"); + String fileName = fileNames[fileNames.length - 1]; + File file = new File("./word2vec/GE13/".concat(fileName)); + + for (Instance instance : instances) { + + if(instance.getSentenceId() == i){ + sb.append(instance.getId()); + sb.append(" "); + }else { + sb.append("\n"); + i = instance.getSentenceId(); + sb.append(instance.getId()); + sb.append(" "); + } + + if(instance.getLabelString() != String.valueOf(EventType.Non_trigger)){ + sb2.append(instance.getId()); + sb2.append(" "); + sb2.append(instance.getLabelString().toLowerCase()); + sb2.append("\n"); + } + + if(!fileDir.equals(instance.getFileId()) ) { + sb.append("\n"); + String instancesStr = sb2.toString(); + sb.append(instancesStr); + instancesStr = sb.toString(); + FileUtil.saveFile(instancesStr, file); + fileDir=instance.getFileId(); + + i = 0; + sb.delete(0, sb.length()-1); + sb2.delete(0, sb2.length()-1); + fileNames = instance.getFileId().split("/"); + fileName = fileNames[fileNames.length - 1]; + file = new File("./word2vec/GE13/".concat(fileName)); + sb.append(instance.getId()); + sb.append(" "); + } + } + + sb.append("\n"); + String instancesStr = sb2.toString(); + sb.append(instancesStr); + instancesStr = sb.toString(); + FileUtil.saveFile(instancesStr, file); + } + +} diff --git a/src/info/chenli/litway/config/Argument.java b/src/info/chenli/litway/config/Argument.java new file mode 100644 index 0000000..8350e4e --- /dev/null +++ b/src/info/chenli/litway/config/Argument.java @@ -0,0 +1,14 @@ +package info.chenli.litway.config; + +public interface Argument { + + /** + * Whether the argument is an event. It is impossible that an argument is + * neither an entity nor an event. + * + * @return If True, it is an event instead of an entity. If + * False, it is an entity instead of an event. 
+ */ + public boolean isEvent(); + +} diff --git a/src/info/chenli/litway/config/Attribute.java b/src/info/chenli/litway/config/Attribute.java new file mode 100644 index 0000000..10d7bdf --- /dev/null +++ b/src/info/chenli/litway/config/Attribute.java @@ -0,0 +1,21 @@ +package info.chenli.litway.config; + +public class Attribute { + + private String name, value; + + public Attribute(String name, String value) { + super(); + this.name = name; + this.value = value; + } + + public String getName() { + return name; + } + + public String getValue() { + return value; + } + +} diff --git a/src/info/chenli/litway/config/Configuration.java b/src/info/chenli/litway/config/Configuration.java new file mode 100644 index 0000000..ac08f53 --- /dev/null +++ b/src/info/chenli/litway/config/Configuration.java @@ -0,0 +1,128 @@ +package info.chenli.litway.config; + +import info.chenli.litway.util.XMLUtil; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import org.w3c.dom.Document; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; + +public class Configuration { + + private Set entityTypes = new HashSet(); + private Set argTypes = new HashSet(); + private Set eventTypes = new HashSet(); + + public Configuration(String file) { + this(new File(file)); + } + + public Configuration(File file) { + + Document document = XMLUtil.getDocument(file); + + // + // Entity types + // + NodeList entities = document.getElementsByTagName("entity"); + + for (int i = 0; i < entities.getLength(); i++) { + Node entityType = entities.item(i); + if (entityType.getNodeType() == Node.ELEMENT_NODE) { + + entityTypes.add(entityType.getNodeValue()); + } + } + + // + // Event types + // + NodeList events = document.getElementsByTagName("event"); + + for (int i = 0; i < events.getLength(); i++) { + + Node event = events.item(i); + EventType et = new EventType(); + + NodeList eventNodes = event.getChildNodes(); + for (int j = 0; j < eventNodes.getLength(); j++) { + + Node eventNode = eventNodes.item(j); + + if (eventNode.getNodeType() == Node.ELEMENT_NODE + && eventNode.getNodeName().equals("type")) { + et.setType(eventNode.getNodeValue()); + if (!eventTypes.contains(et)) { + eventTypes.add(et); + } + } + + if (eventNode.getNodeType() == Node.ELEMENT_NODE + && eventNode.getNodeName().equals("arguments")) { + + Map> argumentsMap = new HashMap>(); + NodeList arguments = eventNode.getChildNodes(); + for (int k = 0; k < arguments.getLength(); k++) { + Node argument = arguments.item(k); + if (argument.getNodeType() == Node.ELEMENT_NODE) { + String argumentName = argument.getNodeName(); + if (!argTypes.contains(argumentName)) { + argTypes.add(argumentName); + } + List valueTypeList = new LinkedList(); + argumentsMap.put(argumentName, valueTypeList); + NodeList valueTypes = argument.getChildNodes(); + for (int l = 0; l < valueTypes.getLength(); l++) { + Node valueType = valueTypes.item(l); + if (valueType.getNodeType() == Node.ELEMENT_NODE) { + valueTypeList.add(valueType.getNodeValue()); + } + } + } + } + + et.setArguments(argumentsMap); + } + } + + } + } + + public List getEntityTypes() { + return entityTypes; + } + + public List getArgTypes() { + return argTypes; + } + + public List getEventTypes() { + return eventTypes; + } + + public List getEventTypeNames() { + + List eventTypeNames = new LinkedList(); + for (EventType et : eventTypes) { + eventTypeNames.add(et.getType()); + 
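/* [Note + sketch, not part of the diff] As written, Configuration will not
   compile: the fields are raw Sets while the getters declare List return
   types. Also, Node.getNodeValue() returns null for ELEMENT_NODEs, so the type
   names should be read with getTextContent(). One possible fix for the getters
   (generics added; see also EventTypes.isSimpleEvent further down, which
   references an undeclared isAnEventType variable):

   public Set<String> getEntityTypes() {
       return entityTypes;                        // field is a Set; return it as one
   }
   public List<String> getArgTypes() {
       return new ArrayList<String>(argTypes);    // or change the getter to Set
   }
*/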
} + + return eventTypeNames; + } + + public void validate() { + throw new UnsupportedOperationException("Unimplemented method."); + } + + public static void main(String[] args) { + new Configuration("../data/ge/config.xml"); + } +} diff --git a/src/info/chenli/litway/config/EntityType.java b/src/info/chenli/litway/config/EntityType.java new file mode 100644 index 0000000..234e71d --- /dev/null +++ b/src/info/chenli/litway/config/EntityType.java @@ -0,0 +1,11 @@ +package info.chenli.litway.config; + +public class EntityType implements Argument { + + @Override + public boolean isEvent() { + + return false; + } + +} diff --git a/src/info/chenli/litway/config/EventType.java b/src/info/chenli/litway/config/EventType.java new file mode 100644 index 0000000..4d35d5a --- /dev/null +++ b/src/info/chenli/litway/config/EventType.java @@ -0,0 +1,33 @@ +package info.chenli.litway.config; + +import java.util.List; +import java.util.Map; + +public class EventType implements Argument { + + private String type; + private Map> arguments; + + public String getType() { + return type; + } + + public void setType(String type) { + this.type = type; + } + + public Map> getArguments() { + return arguments; + } + + public void setArguments(Map> arguments) { + this.arguments = arguments; + } + + @Override + public boolean isEvent() { + + return true; + } + +} diff --git a/src/info/chenli/litway/config/EventTypes.java b/src/info/chenli/litway/config/EventTypes.java new file mode 100644 index 0000000..241b732 --- /dev/null +++ b/src/info/chenli/litway/config/EventTypes.java @@ -0,0 +1,52 @@ +package info.chenli.litway.config; + +import java.util.logging.Logger; + +public enum EventTypes { + + INSTANCE; + + private final static Logger logger = Logger.getLogger(EventTypes.class + .getName()); + + private EventType[] eventTypes; + + public void init(EventType[] eventTypes) { + + if (null != this.eventTypes) { + logger.warning("The event types have been initialised and can't be updated."); + return; + } + + this.eventTypes = eventTypes; + } + + public EventType[] getEventTypes() { + return this.eventTypes; + } + + public static boolean isAnEventType(String eventType) { + + boolean isAnEventType = false; + for (EventType et : INSTANCE.eventTypes) { + if (et.getType().equals(eventType)) { + isAnEventType = true; + break; + } + } + + return isAnEventType; + } + + public static boolean isSimpleEvent(String eventType) { + + for (EventType et : INSTANCE.eventTypes) { + if (et.getType().equals(eventType)) { + isAnEventType = true; + break; + } + } + + return false; + } +} diff --git a/src/info/chenli/litway/corpora/A1Reader.java b/src/info/chenli/litway/corpora/A1Reader.java new file mode 100644 index 0000000..6ca2d80 --- /dev/null +++ b/src/info/chenli/litway/corpora/A1Reader.java @@ -0,0 +1,57 @@ +package info.chenli.litway.corpora; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.StringTokenizer; +import java.util.logging.Logger; + +import org.apache.uima.jcas.JCas; + +public class A1Reader { + + private final static Logger logger = Logger.getLogger(A1Reader.class + .getName()); + + public void fetchEntities(String a1FileName, JCas jcas) { + + BufferedReader br = null; + try { + + br = new BufferedReader(new FileReader(new File(a1FileName))); + + String line; + + while ((line = br.readLine()) != null) { + + StringTokenizer st = new StringTokenizer(line); + + Entity entity = new Entity(jcas); + 
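/* [Sketch, not part of the diff] One BioNLP-ST .a1 standoff line and how its
   whitespace-separated tokens map onto Entity (the example line is
   illustrative; the trailing covered-text column is not read by this loop):

     T1<TAB>Protein 48 53<TAB>BMP-6

   id = "T1", entityType = "Protein", begin = 48, end = 53.

   String[] cols = "T1\tProtein 48 53\tBMP-6".split("\t");
   // cols = ["T1", "Protein 48 53", "BMP-6"]
*/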
entity.setId(st.nextToken()); + entity.setEntityType(st.nextToken()); + entity.setBegin(Integer.parseInt(st.nextToken())); + entity.setEnd(Integer.parseInt(st.nextToken())); + + entity.addToIndexes(); + } + + } catch (FileNotFoundException e) { + logger.severe(e.getMessage()); + throw new RuntimeException(e); + } catch (IOException e) { + logger.severe(e.getMessage()); + throw new RuntimeException(e); + } finally { + + try { + br.close(); + } catch (IOException e) { + logger.severe(e.getMessage()); + throw new RuntimeException(e); + } + } + + } +} diff --git a/src/info/chenli/litway/corpora/Application.java b/src/info/chenli/litway/corpora/Application.java index 9d28f13..f3ec920 100644 --- a/src/info/chenli/litway/corpora/Application.java +++ b/src/info/chenli/litway/corpora/Application.java @@ -114,7 +114,8 @@ private static void processFile(File aFile, AnalysisEngine aAE, CAS aCAS) // set the path of resource file aCAS.createView("FilePath").setSofaDataURI(aFile.getAbsolutePath(), "text"); - + //System.out.println(aFile.getAbsolutePath()); + // process aAE.process(aCAS); diff --git a/src/info/chenli/litway/corpora/BioNLPSyntacticAnnotator.java b/src/info/chenli/litway/corpora/BioNLPSyntacticAnnotator.java index 1a1ba0b..7b9a040 100644 --- a/src/info/chenli/litway/corpora/BioNLPSyntacticAnnotator.java +++ b/src/info/chenli/litway/corpora/BioNLPSyntacticAnnotator.java @@ -13,8 +13,9 @@ import java.io.FileInputStream; import java.io.InputStreamReader; import java.util.ArrayList; -import java.util.Collections; +import java.util.HashMap; import java.util.Iterator; +import java.util.LinkedList; import java.util.List; import java.util.Map; import java.util.Set; @@ -25,33 +26,53 @@ import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.jcas.JCas; -import org.uimafit.util.JCasUtil; public class BioNLPSyntacticAnnotator extends JCasAnnotator_ImplBase { private final static Logger logger = Logger .getLogger(BioNLPSyntacticAnnotator.class.getName()); - + @Override public void process(JCas jcas) throws AnalysisEngineProcessException { File sentencisedFile = new File(FileUtil.removeFileNameExtension( UimaUtil.getJCasFilePath(jcas)).concat(".ss")); + if (!sentencisedFile.exists()) { + sentencisedFile = new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".tok")); + } File tokenisedFile = new File(FileUtil.removeFileNameExtension( UimaUtil.getJCasFilePath(jcas)).concat(".tok")); - List tokens = ConnlxReader - .getTokens(new File(FileUtil.removeFileNameExtension( - UimaUtil.getJCasFilePath(jcas)).concat(".connlx"))); - Map> pairsOfArticle = StanfordDependencyReader - .getPairs(new File(FileUtil.removeFileNameExtension( - UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); - + List tokens = new LinkedList(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".connlx")).exists()) { + tokens = ConnlxReader + .getTokens(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".connlx"))); + } else { + tokens = ConnlxReader + .getTokens(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".conll"))); + } + Map> pairsOfArticle = new HashMap>(); + if (new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc")).exists()) { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + 
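/* [Sketch, not part of the diff] The exists() checks here and in
   BioNLPSyntacticAnnotator repeat the same "first existing variant" pattern
   (.sdepcc/.sd, .connlx/.conll, .ss/.tok). A hypothetical helper:

   static File firstExisting(String baseName, String... extensions) {
       for (String ext : extensions) {
           File f = new File(baseName.concat(ext));
           if (f.exists()) {
               return f;
           }
       }
       // fall back to the last variant even if it is missing
       return new File(baseName.concat(extensions[extensions.length - 1]));
   }
   // usage: firstExisting(FileUtil.removeFileNameExtension(path), ".sdepcc", ".sd")
*/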
UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + } else { + pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sd"))); + } + Token leftToken = null; try { // get the stream of the original text InputStreamReader originalTextStream = new InputStreamReader( new ByteArrayInputStream(jcas.getDocumentText().getBytes())); - + InputStreamReader originalTextStream2 = new InputStreamReader( + new ByteArrayInputStream(jcas.getDocumentText().getBytes())); // get the stream of the sentencised text InputStreamReader sentencisedFileStream = new InputStreamReader( new FileInputStream(sentencisedFile), "UTF8"); @@ -60,19 +81,28 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { InputStreamReader tokenisedFileStream = new InputStreamReader( new FileInputStream(tokenisedFile), "UTF8"); - int originalTextCh, sentencisedTextCh, tokenisedTextCh, offset = 0, sentenceBegin = 0, tokenBegin = 0; + int originalTextCh, originalTextCh2 = 0, sentencisedTextCh, tokenisedTextCh, offset = 0, sentenceBegin = 0, tokenBegin = 0; Iterator tokenItor = tokens.iterator(); - Token leftToken = null; + // token map of each sentence TreeMap tokensOfSentence = new TreeMap(); int sentenceId = 0; + originalTextStream2.read(); while ((originalTextCh = originalTextStream.read()) != -1) { - + originalTextCh2 = originalTextStream2.read(); Character originalTextChar = (char) originalTextCh; - + if (originalTextChar == ' ' && originalTextCh2 != -1) { + Character originalTextChar2 = (char) originalTextCh2; + if (originalTextChar2 == System.getProperty( + "line.separator").charAt(0)) { + offset++; + continue; + } + } + // // Tokens // @@ -83,21 +113,21 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { || tokenisedFileChar == System.getProperty( "line.separator").charAt(0)) { - Token token = fetchToken(jcas, tokenBegin, offset, - tokenItor, leftToken); - + List tokenss = fetchToken(jcas, tokenBegin, offset, + tokenItor, leftToken, UimaUtil.getJCasFilePath(jcas)); + Token token = tokenss.get(0); + leftToken = tokenss.get(1); // put tokens in same sentence into a map. 
tokensOfSentence.put(token.getId(), token); - + tokenBegin = offset; if (originalTextChar == ' ' || originalTextChar == System.getProperty( "line.separator").charAt(0)) { tokenBegin++; - } else { + }else { tokenisedFileStream.read(); } - } } @@ -106,6 +136,11 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { // if ((sentencisedTextCh = sentencisedFileStream.read()) != -1) { Character sentencisedFileChar = (char) sentencisedTextCh; + if (sentencisedFileChar == ' ') { + if (originalTextChar != ' ') { + sentencisedFileStream.read(); + } + } if (sentencisedFileChar == System.getProperty( "line.separator").charAt(0)) { @@ -120,10 +155,9 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { || originalTextChar == System.getProperty( "line.separator").charAt(0)) { sentenceBegin++; - } else { + }else { sentencisedFileStream.read(); } - } } @@ -131,7 +165,7 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { } if (tokenItor.hasNext()) { - fetchToken(jcas, tokenBegin, offset, tokenItor, leftToken); + fetchToken(jcas, tokenBegin, offset, tokenItor, leftToken, UimaUtil.getJCasFilePath(jcas)); } fetchSentence(jcas, sentenceBegin, offset, sentenceId, pairsOfArticle.get(sentenceId)); @@ -147,203 +181,76 @@ public void process(JCas jcas) throws AnalysisEngineProcessException { } } - private Token fetchToken(JCas jcas, int tokenBegin, int offset, - Iterator tokenItor, Token leftToken) { + private List fetchToken(JCas jcas, int tokenBegin, int offset, + Iterator tokenItor, Token leftToken, String fileName) { // the last token is missed due to reaching the end of the file. + /*System.out.println(tokenBegin); + if (null != leftToken) { + System.out.println(leftToken.getCoveredText()); + }*/ ConnlxReader.Token connlxToken = tokenItor.next(); String pos = connlxToken.getPos(); Token token = createNewToken(jcas, tokenBegin, offset, pos); token.setId(connlxToken.getId()); - + //token.setStem(fileName); + token.setLeftToken(leftToken); if (null != leftToken) { leftToken.setRightToken(token); } leftToken = token; token.addToIndexes(); - - return token; + + List tokenss = new ArrayList(); + tokenss.add(token); + tokenss.add(leftToken); + return tokenss; } private void fetchSentence(JCas jcas, int sentenceBegin, int offset, int sentenceId, Set pairsOfSentence) { - // the last sentence is missed due to reaching the end of the file. Sentence sentence = new Sentence(jcas, sentenceBegin, offset); sentence.setId(sentenceId); sentence.addToIndexes(); - // as many protein (1036 in bionlp development data) are within token. - // They will be separated as tokens -// postProcessSentence(jcas, sentence, pairsOfSentence); } - private void postProcessSentence(JCas jcas, Sentence sentence, - Set pairsOfSentence) { - - List sentenceProteins = JCasUtil.selectCovered(jcas, - Protein.class, sentence); - - List originalTokens = JCasUtil.selectCovered(jcas, Token.class, - sentence); - - System.out.println(sentence.getCoveredText()); - int tokenId = originalTokens.size() + 1; - - // process the tokens which may contain protein and/or trigger. 
- for (Token token : originalTokens) { - - List containedProteins = new ArrayList(); - - for (Protein protein : sentenceProteins) { - // if the protein is the token - if (protein.getBegin() == token.getBegin() - && protein.getEnd() == token.getEnd()) { - continue; - } - if (protein.getBegin() >= token.getBegin() - && protein.getEnd() <= token.getEnd()) { - containedProteins.add(protein); - } - } - - if (containedProteins.size() < 1) { - continue; - } - - Collections.sort(containedProteins, new AnnotationSorter()); - - // - // if there is contained protein(s), start breaking the old token - // into new tokens - // - List newTokens = new ArrayList(); - - // collect all candidate new tokens - int tokenBegin = token.getBegin(), tokenEnd; - for (Protein protein : containedProteins) { - tokenEnd = protein.getBegin(); - if (tokenBegin == tokenEnd) { - tokenEnd = protein.getEnd(); - Token proteinToken = createNewToken(jcas, tokenBegin, - tokenEnd, null); - newTokens.add(proteinToken); - } else if (tokenBegin < tokenEnd) { - Token newToken = createNewToken(jcas, tokenBegin, tokenEnd, - null); - newTokens.addAll(furtherBreakToken(jcas, newToken)); - tokenBegin = protein.getBegin(); - tokenEnd = protein.getEnd(); - Token proteinToken = createNewToken(jcas, tokenBegin, - tokenEnd, null); - newTokens.add(proteinToken); - } - tokenBegin = tokenEnd; - } - if (tokenBegin != token.getEnd()) { - newTokens - .addAll(furtherBreakToken( - jcas, - createNewToken(jcas, tokenBegin, - token.getEnd(), null))); - } - - System.out.print(token.getCoveredText() + "\t|"); - for (Protein protein : containedProteins) { - System.out.print("\t" + protein.getCoveredText()); - } - System.out.print("\t|"); - Collections.sort(newTokens, new AnnotationSorter()); - for (Token newToken : newTokens) { - System.out.print("\t" + newToken.getCoveredText()); - } - System.out.println(); - - Token leftToken = token.getLeftToken(); - for (Token newToken : newTokens) { - if (newToken.getBegin() == token.getBegin() - && newToken.getEnd() == token.getEnd()) { - continue; - } + private Token createNewToken(JCas jcas, int begin, int end, String pos) { - newToken.setLeftToken(leftToken); - if (null != leftToken) { - leftToken.setRightToken(newToken); - } - leftToken = newToken; - newToken.setId(tokenId++); + Token token = new Token(jcas, begin, end); - newToken.addToIndexes(); + if (token.getCoveredText().length() > 1) { + if (token.getCoveredText().charAt(0) == '-' || + token.getCoveredText().charAt(0) == '.' 
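/* [Sketch, not part of the diff] The boundary trimming applied in
   createNewToken, as a pure function (hypothetical name, same behaviour): a
   leading '-', '.', '#' or '/' is dropped from tokens longer than one
   character, and a surrounding "(...)" pair is dropped from tokens longer than
   two, e.g. "-p65" -> "p65" and "(IL-2)" -> "IL-2".

   static String trimTokenEdges(String text) {
       if (text.length() > 1 && "-.#/".indexOf(text.charAt(0)) >= 0) {
           text = text.substring(1);
       }
       if (text.length() > 2 && text.charAt(0) == '(' && text.endsWith(")")) {
           text = text.substring(1, text.length() - 1);
       }
       return text;
   }
*/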
|| + token.getCoveredText().charAt(0) == '#' || + token.getCoveredText().charAt(0) == '/') { + token.setBegin(token.getBegin() + 1); } - Token lastToken = newTokens.get(newTokens.size() - 1); - lastToken.setRightToken(token.getRightToken()); - if (null != token.getRightToken()) { - token.getRightToken().setLeftToken(lastToken); - } - - token.removeFromIndexes(); } - - } - - private List furtherBreakToken(JCas jcas, Token token) { - - List result = new ArrayList(); - if (token.getCoveredText().length() == 1) { - result.add(token); - } else { - Token lastToken = token; - if (token.getCoveredText().startsWith("/") - || token.getCoveredText().startsWith("-") - || token.getCoveredText().startsWith("+") - || token.getCoveredText().startsWith(":")) { - Token newToken = createNewToken(jcas, token.getBegin(), - token.getBegin() + 1, null); - result.add(newToken); - result.addAll(furtherBreakToken( - jcas, - createNewToken(jcas, token.getBegin() + 1, - token.getEnd(), null))); - for (Token aNewToken : result) { - if (aNewToken.getEnd() == token.getEnd()) { - lastToken = aNewToken; - break; - } - } - if (lastToken.getCoveredText().length() == 1) { - return result; - } - } else if (lastToken.getCoveredText().endsWith("/") - || lastToken.getCoveredText().endsWith("-") - || lastToken.getCoveredText().endsWith("+") - || lastToken.getCoveredText().endsWith(":")) { - Token newToken = createNewToken(jcas, lastToken.getEnd() - 1, - lastToken.getEnd(), null); - result.add(newToken); - result.addAll(furtherBreakToken(jcas, - new Token(jcas, lastToken.getBegin(), - token.getEnd() - 1))); - } else { - result.add(lastToken); + + if (token.getCoveredText().length() > 2) { + if (token.getCoveredText().charAt(0) == '(' && + token.getCoveredText(). + charAt(token.getCoveredText().length()-1) == ')') { + token.setBegin(token.getBegin() + 1); + token.setEnd(token.getEnd() - 1); } } - return result; - } - private Token createNewToken(JCas jcas, int begin, int end, String pos) { - - Token token = new Token(jcas, begin, end); + token.setPos(pos); String text = token.getCoveredText(); - token.setLemma(BioLemmatizerUtil.lemmatizeWord(text.toLowerCase(), - token.getPos())); + String lemma = BioLemmatizerUtil.lemmatizeWord(text.toLowerCase(), + token.getPos()); + token.setLemma(lemma); Stemmer stem = new Stemmer(); stem.add(text.toCharArray(), text.length()); stem.stem(); token.setStem(stem.toString()); - String subWord = null, subLemma = null, subStem = null; + String subWord = null, subLemma = lemma, subStem = stem.toString(); if (token.getCoveredText().indexOf("-") > -1) { subWord = token.getCoveredText().substring( token.getCoveredText().lastIndexOf("-") + 1); @@ -355,8 +262,8 @@ private Token createNewToken(JCas jcas, int begin, int end, String pos) { } token.setSubLemma(subLemma); token.setSubStem(subStem); - + return token; } -} +} \ No newline at end of file diff --git a/src/info/chenli/litway/corpora/Dependency.java b/src/info/chenli/litway/corpora/Dependency.java index 6b6ac45..5d87b6f 100644 --- a/src/info/chenli/litway/corpora/Dependency.java +++ b/src/info/chenli/litway/corpora/Dependency.java @@ -13,8 +13,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Dependency extends Annotation { /** @generated @@ -56,10 +56,13 @@ public 
Dependency(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} @@ -68,14 +71,18 @@ private void readObject() {/*default - does nothing empty block */} //* Feature: sentenceId /** getter for sentenceId - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getSentenceId() { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_sentenceId == null) jcasType.jcas.throwFeatMissing("sentenceId", "info.chenli.litway.corpora.Dependency"); return jcasType.ll_cas.ll_getStringValue(addr, ((Dependency_Type)jcasType).casFeatCode_sentenceId);} /** setter for sentenceId - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setSentenceId(String v) { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_sentenceId == null) jcasType.jcas.throwFeatMissing("sentenceId", "info.chenli.litway.corpora.Dependency"); @@ -86,14 +93,18 @@ public void setSentenceId(String v) { //* Feature: heads /** getter for heads - gets - * @generated */ + * @generated + * @return value of the feature + */ public IntegerList getHeads() { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_heads == null) jcasType.jcas.throwFeatMissing("heads", "info.chenli.litway.corpora.Dependency"); return (IntegerList)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Dependency_Type)jcasType).casFeatCode_heads)));} /** setter for heads - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setHeads(IntegerList v) { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_heads == null) jcasType.jcas.throwFeatMissing("heads", "info.chenli.litway.corpora.Dependency"); @@ -104,14 +115,18 @@ public void setHeads(IntegerList v) { //* Feature: relations /** getter for relations - gets - * @generated */ + * @generated + * @return value of the feature + */ public StringList getRelations() { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_relations == null) jcasType.jcas.throwFeatMissing("relations", "info.chenli.litway.corpora.Dependency"); return (StringList)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Dependency_Type)jcasType).casFeatCode_relations)));} /** setter for relations - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setRelations(StringList v) { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_relations == null) jcasType.jcas.throwFeatMissing("relations", "info.chenli.litway.corpora.Dependency"); @@ -122,14 +137,18 @@ public void setRelations(StringList v) { //* Feature: modifiers /** getter for modifiers - gets - * @generated */ + * @generated + * @return value of the feature + */ public IntegerList getModifiers() { if (Dependency_Type.featOkTst && ((Dependency_Type)jcasType).casFeat_modifiers == null) jcasType.jcas.throwFeatMissing("modifiers", "info.chenli.litway.corpora.Dependency"); return (IntegerList)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Dependency_Type)jcasType).casFeatCode_modifiers)));} /** setter for modifiers - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setModifiers(IntegerList v) { if (Dependency_Type.featOkTst && 
((Dependency_Type)jcasType).casFeat_modifiers == null) jcasType.jcas.throwFeatMissing("modifiers", "info.chenli.litway.corpora.Dependency"); diff --git a/src/info/chenli/litway/corpora/Dependency_Type.java b/src/info/chenli/litway/corpora/Dependency_Type.java index 302535f..f2bec32 100644 --- a/src/info/chenli/litway/corpora/Dependency_Type.java +++ b/src/info/chenli/litway/corpora/Dependency_Type.java @@ -14,7 +14,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Dependency_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/corpora/Entity.java b/src/info/chenli/litway/corpora/Entity.java new file mode 100644 index 0000000..e948fd4 --- /dev/null +++ b/src/info/chenli/litway/corpora/Entity.java @@ -0,0 +1,101 @@ + + +/* First created by JCasGen Thu Aug 08 16:36:25 BST 2013 */ +package info.chenli.litway.corpora; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.JCasRegistry; +import org.apache.uima.jcas.cas.TOP_Type; + +import org.apache.uima.jcas.tcas.Annotation; + + +/** + * Updated by JCasGen Thu Aug 08 16:36:25 BST 2013 + * XML source: /automount/isilon4_ifs-research/textmining/chenli/projects/bionlp/2013/eventExtractor/desc/typeSystemDescriptor.xml + * @generated */ +public class Entity extends Annotation { + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static int typeIndexID = JCasRegistry.register(Entity.class); + /** @generated + * @ordered + */ + @SuppressWarnings ("hiding") + public final static int type = typeIndexID; + /** @generated */ + @Override + public int getTypeIndexID() {return typeIndexID;} + + /** Never called. 
Disable default constructor + * @generated */ + protected Entity() {/* intentionally empty block */} + + /** Internal - constructor used by generator + * @generated */ + public Entity(int addr, TOP_Type type) { + super(addr, type); + readObject(); + } + + /** @generated */ + public Entity(JCas jcas) { + super(jcas); + readObject(); + } + + /** @generated */ + public Entity(JCas jcas, int begin, int end) { + super(jcas); + setBegin(begin); + setEnd(end); + readObject(); + } + + /** + * Write your own initialization here + * + @generated modifiable */ + private void readObject() {/*default - does nothing empty block */} + + + + //*--------------* + //* Feature: id + + /** getter for id - gets + * @generated */ + public String getId() { + if (Entity_Type.featOkTst && ((Entity_Type)jcasType).casFeat_id == null) + jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Entity"); + return jcasType.ll_cas.ll_getStringValue(addr, ((Entity_Type)jcasType).casFeatCode_id);} + + /** setter for id - sets + * @generated */ + public void setId(String v) { + if (Entity_Type.featOkTst && ((Entity_Type)jcasType).casFeat_id == null) + jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Entity"); + jcasType.ll_cas.ll_setStringValue(addr, ((Entity_Type)jcasType).casFeatCode_id, v);} + + + //*--------------* + //* Feature: entityType + + /** getter for entityType - gets + * @generated */ + public String getEntityType() { + if (Entity_Type.featOkTst && ((Entity_Type)jcasType).casFeat_entityType == null) + jcasType.jcas.throwFeatMissing("entityType", "info.chenli.litway.corpora.Entity"); + return jcasType.ll_cas.ll_getStringValue(addr, ((Entity_Type)jcasType).casFeatCode_entityType);} + + /** setter for entityType - sets + * @generated */ + public void setEntityType(String v) { + if (Entity_Type.featOkTst && ((Entity_Type)jcasType).casFeat_entityType == null) + jcasType.jcas.throwFeatMissing("entityType", "info.chenli.litway.corpora.Entity"); + jcasType.ll_cas.ll_setStringValue(addr, ((Entity_Type)jcasType).casFeatCode_entityType, v);} + } + + \ No newline at end of file diff --git a/src/info/chenli/litway/corpora/EntityAnnotator.java b/src/info/chenli/litway/corpora/EntityAnnotator.java new file mode 100644 index 0000000..e4551b0 --- /dev/null +++ b/src/info/chenli/litway/corpora/EntityAnnotator.java @@ -0,0 +1,23 @@ +package info.chenli.litway.corpora; + +import info.chenli.litway.util.FileUtil; +import info.chenli.litway.util.UimaUtil; + +import org.apache.uima.analysis_component.JCasAnnotator_ImplBase; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.jcas.JCas; + +public class EntityAnnotator extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + + (new A1Reader()) + .fetchEntities( + FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jCas)).concat(".a1"), + jCas); + + } + +} diff --git a/src/info/chenli/litway/corpora/Entity_Type.java b/src/info/chenli/litway/corpora/Entity_Type.java new file mode 100644 index 0000000..ec81472 --- /dev/null +++ b/src/info/chenli/litway/corpora/Entity_Type.java @@ -0,0 +1,104 @@ + +/* First created by JCasGen Thu Aug 08 16:36:25 BST 2013 */ +package info.chenli.litway.corpora; + +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.JCasRegistry; +import org.apache.uima.cas.impl.CASImpl; +import org.apache.uima.cas.impl.FSGenerator; +import org.apache.uima.cas.FeatureStructure; +import org.apache.uima.cas.impl.TypeImpl; 
+import org.apache.uima.cas.Type; +import org.apache.uima.cas.impl.FeatureImpl; +import org.apache.uima.cas.Feature; +import org.apache.uima.jcas.tcas.Annotation_Type; + +/** + * Updated by JCasGen Thu Aug 08 16:36:25 BST 2013 + * @generated */ +public class Entity_Type extends Annotation_Type { + /** @generated */ + @Override + protected FSGenerator getFSGenerator() {return fsGenerator;} + /** @generated */ + private final FSGenerator fsGenerator = + new FSGenerator() { + public FeatureStructure createFS(int addr, CASImpl cas) { + if (Entity_Type.this.useExistingInstance) { + // Return eq fs instance if already created + FeatureStructure fs = Entity_Type.this.jcas.getJfsFromCaddr(addr); + if (null == fs) { + fs = new Entity(addr, Entity_Type.this); + Entity_Type.this.jcas.putJfsFromCaddr(addr, fs); + return fs; + } + return fs; + } else return new Entity(addr, Entity_Type.this); + } + }; + /** @generated */ + @SuppressWarnings ("hiding") + public final static int typeIndexID = Entity.typeIndexID; + /** @generated + @modifiable */ + @SuppressWarnings ("hiding") + public final static boolean featOkTst = JCasRegistry.getFeatOkTst("info.chenli.litway.corpora.Entity"); + + /** @generated */ + final Feature casFeat_id; + /** @generated */ + final int casFeatCode_id; + /** @generated */ + public String getId(int addr) { + if (featOkTst && casFeat_id == null) + jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Entity"); + return ll_cas.ll_getStringValue(addr, casFeatCode_id); + } + /** @generated */ + public void setId(int addr, String v) { + if (featOkTst && casFeat_id == null) + jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Entity"); + ll_cas.ll_setStringValue(addr, casFeatCode_id, v);} + + + + /** @generated */ + final Feature casFeat_entityType; + /** @generated */ + final int casFeatCode_entityType; + /** @generated */ + public String getEntityType(int addr) { + if (featOkTst && casFeat_entityType == null) + jcas.throwFeatMissing("entityType", "info.chenli.litway.corpora.Entity"); + return ll_cas.ll_getStringValue(addr, casFeatCode_entityType); + } + /** @generated */ + public void setEntityType(int addr, String v) { + if (featOkTst && casFeat_entityType == null) + jcas.throwFeatMissing("entityType", "info.chenli.litway.corpora.Entity"); + ll_cas.ll_setStringValue(addr, casFeatCode_entityType, v);} + + + + + + /** initialize variables to correspond with Cas Type and Features + * @generated */ + public Entity_Type(JCas jcas, Type casType) { + super(jcas, casType); + casImpl.getFSClassRegistry().addGeneratorForType((TypeImpl)this.casType, getFSGenerator()); + + + casFeat_id = jcas.getRequiredFeatureDE(casType, "id", "uima.cas.String", featOkTst); + casFeatCode_id = (null == casFeat_id) ? JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_id).getCode(); + + + casFeat_entityType = jcas.getRequiredFeatureDE(casType, "entityType", "uima.cas.String", featOkTst); + casFeatCode_entityType = (null == casFeat_entityType) ? 
JCas.INVALID_FEATURE_CODE : ((FeatureImpl)casFeat_entityType).getCode(); + + } +} + + + + \ No newline at end of file diff --git a/src/info/chenli/litway/corpora/Event.java b/src/info/chenli/litway/corpora/Event.java index 2728bae..7cb0de9 100644 --- a/src/info/chenli/litway/corpora/Event.java +++ b/src/info/chenli/litway/corpora/Event.java @@ -14,8 +14,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Event extends Annotation { /** @generated @@ -57,24 +57,31 @@ public Event(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} //*--------------* - //* Feature: Themes + //* Feature: themes - /** getter for Themes - gets - * @generated */ + /** getter for themes - gets + * @generated + * @return value of the feature + */ public StringArray getThemes() { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_themes == null) jcasType.jcas.throwFeatMissing("themes", "info.chenli.litway.corpora.Event"); return (StringArray)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Event_Type)jcasType).casFeatCode_themes)));} - /** setter for Themes - sets - * @generated */ + /** setter for themes - sets + * @generated + * @param v value to set into the feature + */ public void setThemes(StringArray v) { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_themes == null) jcasType.jcas.throwFeatMissing("themes", "info.chenli.litway.corpora.Event"); @@ -101,14 +108,18 @@ public void setThemes(int i, String v) { //* Feature: cause /** getter for cause - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getCause() { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_cause == null) jcasType.jcas.throwFeatMissing("cause", "info.chenli.litway.corpora.Event"); return jcasType.ll_cas.ll_getStringValue(addr, ((Event_Type)jcasType).casFeatCode_cause);} /** setter for cause - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setCause(String v) { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_cause == null) jcasType.jcas.throwFeatMissing("cause", "info.chenli.litway.corpora.Event"); @@ -119,30 +130,38 @@ public void setCause(String v) { //* Feature: product /** getter for product - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getProduct() { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_product == null) jcasType.jcas.throwFeatMissing("product", "info.chenli.litway.corpora.Event"); return jcasType.ll_cas.ll_getStringValue(addr, ((Event_Type)jcasType).casFeatCode_product);} /** setter for product - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setProduct(String v) { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_product == null) jcasType.jcas.throwFeatMissing("product", "info.chenli.litway.corpora.Event"); jcasType.ll_cas.ll_setStringValue(addr, ((Event_Type)jcasType).casFeatCode_product, v);} - //*--------------* + //*--------------* //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * 
@return value of the feature + */ public String getId() { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Event"); return jcasType.ll_cas.ll_getStringValue(addr, ((Event_Type)jcasType).casFeatCode_id);} /** setter for id - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setId(String v) { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Event"); @@ -153,14 +172,18 @@ public void setId(String v) { //* Feature: trigger /** getter for trigger - gets - * @generated */ + * @generated + * @return value of the feature + */ public Trigger getTrigger() { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_trigger == null) jcasType.jcas.throwFeatMissing("trigger", "info.chenli.litway.corpora.Event"); return (Trigger)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Event_Type)jcasType).casFeatCode_trigger)));} /** setter for trigger - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setTrigger(Trigger v) { if (Event_Type.featOkTst && ((Event_Type)jcasType).casFeat_trigger == null) jcasType.jcas.throwFeatMissing("trigger", "info.chenli.litway.corpora.Event"); diff --git a/src/info/chenli/litway/corpora/Event_Type.java b/src/info/chenli/litway/corpora/Event_Type.java index ab1e89a..0d3c4e5 100644 --- a/src/info/chenli/litway/corpora/Event_Type.java +++ b/src/info/chenli/litway/corpora/Event_Type.java @@ -14,7 +14,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Event_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/corpora/Protein.java b/src/info/chenli/litway/corpora/Protein.java index 51cb0ae..c71a102 100644 --- a/src/info/chenli/litway/corpora/Protein.java +++ b/src/info/chenli/litway/corpora/Protein.java @@ -11,8 +11,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Protein extends Annotation { /** @generated @@ -54,24 +54,31 @@ public Protein(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} //*--------------* //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getId() { if (Protein_Type.featOkTst && ((Protein_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Protein"); return jcasType.ll_cas.ll_getStringValue(addr, ((Protein_Type)jcasType).casFeatCode_id);} /** setter for id - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setId(String v) { if (Protein_Type.featOkTst && ((Protein_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Protein"); diff --git a/src/info/chenli/litway/corpora/Protein_Type.java b/src/info/chenli/litway/corpora/Protein_Type.java 
index c48e6f1..b933e40 100644 --- a/src/info/chenli/litway/corpora/Protein_Type.java +++ b/src/info/chenli/litway/corpora/Protein_Type.java @@ -14,7 +14,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Protein_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/corpora/Sentence.java b/src/info/chenli/litway/corpora/Sentence.java index 6f2b43f..167066e 100644 --- a/src/info/chenli/litway/corpora/Sentence.java +++ b/src/info/chenli/litway/corpora/Sentence.java @@ -12,8 +12,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Sentence extends Annotation { /** @generated @@ -55,10 +55,13 @@ public Sentence(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} @@ -67,14 +70,18 @@ private void readObject() {/*default - does nothing empty block */} //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * @return value of the feature + */ public int getId() { if (Sentence_Type.featOkTst && ((Sentence_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Sentence"); return jcasType.ll_cas.ll_getIntValue(addr, ((Sentence_Type)jcasType).casFeatCode_id);} /** setter for id - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setId(int v) { if (Sentence_Type.featOkTst && ((Sentence_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Sentence"); @@ -85,14 +92,18 @@ public void setId(int v) { //* Feature: dependencies /** getter for dependencies - gets - * @generated */ + * @generated + * @return value of the feature + */ public Dependency getDependencies() { if (Sentence_Type.featOkTst && ((Sentence_Type)jcasType).casFeat_dependencies == null) jcasType.jcas.throwFeatMissing("dependencies", "info.chenli.litway.corpora.Sentence"); return (Dependency)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Sentence_Type)jcasType).casFeatCode_dependencies)));} /** setter for dependencies - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setDependencies(Dependency v) { if (Sentence_Type.featOkTst && ((Sentence_Type)jcasType).casFeat_dependencies == null) jcasType.jcas.throwFeatMissing("dependencies", "info.chenli.litway.corpora.Sentence"); diff --git a/src/info/chenli/litway/corpora/Sentence_Type.java b/src/info/chenli/litway/corpora/Sentence_Type.java index cd66887..b0a6432 100644 --- a/src/info/chenli/litway/corpora/Sentence_Type.java +++ b/src/info/chenli/litway/corpora/Sentence_Type.java @@ -13,7 +13,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Sentence_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/corpora/Token.java b/src/info/chenli/litway/corpora/Token.java index c94f1b8..32d2cba 
100644 --- a/src/info/chenli/litway/corpora/Token.java +++ b/src/info/chenli/litway/corpora/Token.java @@ -11,8 +11,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Token extends Annotation { /** @generated @@ -54,10 +54,13 @@ public Token(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} @@ -66,14 +69,18 @@ private void readObject() {/*default - does nothing empty block */} //* Feature: pos /** getter for pos - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getPos() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_pos == null) jcasType.jcas.throwFeatMissing("pos", "info.chenli.litway.corpora.Token"); return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_pos);} /** setter for pos - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setPos(String v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_pos == null) jcasType.jcas.throwFeatMissing("pos", "info.chenli.litway.corpora.Token"); @@ -84,14 +91,18 @@ public void setPos(String v) { //* Feature: lemma /** getter for lemma - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getLemma() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_lemma == null) jcasType.jcas.throwFeatMissing("lemma", "info.chenli.litway.corpora.Token"); return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_lemma);} /** setter for lemma - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setLemma(String v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_lemma == null) jcasType.jcas.throwFeatMissing("lemma", "info.chenli.litway.corpora.Token"); @@ -102,14 +113,18 @@ public void setLemma(String v) { //* Feature: stem /** getter for stem - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getStem() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_stem == null) jcasType.jcas.throwFeatMissing("stem", "info.chenli.litway.corpora.Token"); return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_stem);} /** setter for stem - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setStem(String v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_stem == null) jcasType.jcas.throwFeatMissing("stem", "info.chenli.litway.corpora.Token"); @@ -120,14 +135,18 @@ public void setStem(String v) { //* Feature: subLemma /** getter for subLemma - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getSubLemma() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_subLemma == null) jcasType.jcas.throwFeatMissing("subLemma", "info.chenli.litway.corpora.Token"); return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_subLemma);} /** setter for subLemma - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void 
setSubLemma(String v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_subLemma == null) jcasType.jcas.throwFeatMissing("subLemma", "info.chenli.litway.corpora.Token"); @@ -138,14 +157,18 @@ public void setSubLemma(String v) { //* Feature: subStem /** getter for subStem - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getSubStem() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_subStem == null) jcasType.jcas.throwFeatMissing("subStem", "info.chenli.litway.corpora.Token"); return jcasType.ll_cas.ll_getStringValue(addr, ((Token_Type)jcasType).casFeatCode_subStem);} /** setter for subStem - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setSubStem(String v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_subStem == null) jcasType.jcas.throwFeatMissing("subStem", "info.chenli.litway.corpora.Token"); @@ -156,14 +179,18 @@ public void setSubStem(String v) { //* Feature: leftToken /** getter for leftToken - gets - * @generated */ + * @generated + * @return value of the feature + */ public Token getLeftToken() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_leftToken == null) jcasType.jcas.throwFeatMissing("leftToken", "info.chenli.litway.corpora.Token"); return (Token)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_leftToken)));} /** setter for leftToken - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setLeftToken(Token v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_leftToken == null) jcasType.jcas.throwFeatMissing("leftToken", "info.chenli.litway.corpora.Token"); @@ -174,30 +201,38 @@ public void setLeftToken(Token v) { //* Feature: rightToken /** getter for rightToken - gets - * @generated */ + * @generated + * @return value of the feature + */ public Token getRightToken() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_rightToken == null) jcasType.jcas.throwFeatMissing("rightToken", "info.chenli.litway.corpora.Token"); return (Token)(jcasType.ll_cas.ll_getFSForRef(jcasType.ll_cas.ll_getRefValue(addr, ((Token_Type)jcasType).casFeatCode_rightToken)));} /** setter for rightToken - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setRightToken(Token v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_rightToken == null) jcasType.jcas.throwFeatMissing("rightToken", "info.chenli.litway.corpora.Token"); jcasType.ll_cas.ll_setRefValue(addr, ((Token_Type)jcasType).casFeatCode_rightToken, jcasType.ll_cas.ll_getFSRef(v));} - //*--------------* + //*--------------* //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * @return value of the feature + */ public int getId() { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Token"); return jcasType.ll_cas.ll_getIntValue(addr, ((Token_Type)jcasType).casFeatCode_id);} /** setter for id - sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setId(int v) { if (Token_Type.featOkTst && ((Token_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Token"); diff --git a/src/info/chenli/litway/corpora/Token_Type.java b/src/info/chenli/litway/corpora/Token_Type.java index 3c97ffb..786c684 100644 --- 
a/src/info/chenli/litway/corpora/Token_Type.java +++ b/src/info/chenli/litway/corpora/Token_Type.java @@ -14,7 +14,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Token_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/corpora/Trigger.java b/src/info/chenli/litway/corpora/Trigger.java index 882f9eb..df198f6 100644 --- a/src/info/chenli/litway/corpora/Trigger.java +++ b/src/info/chenli/litway/corpora/Trigger.java @@ -11,8 +11,8 @@ /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 - * XML source: /Users/chenli/projects/bionlp2013/eventExtractor/desc/typeSystemDescriptor.xml + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 + * XML source: /media/songrq/soft/litway/workspace/LitWay/desc/typeSystemDescriptor.xml * @generated */ public class Trigger extends Annotation { /** @generated @@ -54,24 +54,31 @@ public Trigger(JCas jcas, int begin, int end) { readObject(); } - /** + /** + * * Write your own initialization here * - @generated modifiable */ + * + * @generated modifiable + */ private void readObject() {/*default - does nothing empty block */} //*--------------* - //* Feature: EventType + //* Feature: eventType - /** getter for EventType - gets - * @generated */ + /** getter for eventType - gets + * @generated + * @return value of the feature + */ public String getEventType() { if (Trigger_Type.featOkTst && ((Trigger_Type)jcasType).casFeat_eventType == null) jcasType.jcas.throwFeatMissing("eventType", "info.chenli.litway.corpora.Trigger"); return jcasType.ll_cas.ll_getStringValue(addr, ((Trigger_Type)jcasType).casFeatCode_eventType);} - /** setter for EventType - sets - * @generated */ + /** setter for eventType - sets + * @generated + * @param v value to set into the feature + */ public void setEventType(String v) { if (Trigger_Type.featOkTst && ((Trigger_Type)jcasType).casFeat_eventType == null) jcasType.jcas.throwFeatMissing("eventType", "info.chenli.litway.corpora.Trigger"); @@ -79,33 +86,41 @@ public void setEventType(String v) { //*--------------* - //* Feature: EventType2 + //* Feature: eventType2 - /** getter for EventType2 - gets - * @generated */ + /** getter for eventType2 - gets + * @generated + * @return value of the feature + */ public String getEventType2() { if (Trigger_Type.featOkTst && ((Trigger_Type)jcasType).casFeat_eventType2 == null) jcasType.jcas.throwFeatMissing("eventType2", "info.chenli.litway.corpora.Trigger"); return jcasType.ll_cas.ll_getStringValue(addr, ((Trigger_Type)jcasType).casFeatCode_eventType2);} - /** setter for EventType2 - sets - * @generated */ + /** setter for eventType2 - sets + * @generated + * @param v value to set into the feature + */ public void setEventType2(String v) { if (Trigger_Type.featOkTst && ((Trigger_Type)jcasType).casFeat_eventType2 == null) jcasType.jcas.throwFeatMissing("eventType2", "info.chenli.litway.corpora.Trigger"); jcasType.ll_cas.ll_setStringValue(addr, ((Trigger_Type)jcasType).casFeatCode_eventType2, v);} - //*--------------* + //*--------------* //* Feature: id /** getter for id - gets - * @generated */ + * @generated + * @return value of the feature + */ public String getId() { if (Trigger_Type.featOkTst && ((Trigger_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Trigger"); return jcasType.ll_cas.ll_getStringValue(addr, ((Trigger_Type)jcasType).casFeatCode_id);} /** setter for id - 
sets - * @generated */ + * @generated + * @param v value to set into the feature + */ public void setId(String v) { if (Trigger_Type.featOkTst && ((Trigger_Type)jcasType).casFeat_id == null) jcasType.jcas.throwFeatMissing("id", "info.chenli.litway.corpora.Trigger"); diff --git a/src/info/chenli/litway/corpora/Trigger_Type.java b/src/info/chenli/litway/corpora/Trigger_Type.java index 7d77b76..1e6ec6c 100644 --- a/src/info/chenli/litway/corpora/Trigger_Type.java +++ b/src/info/chenli/litway/corpora/Trigger_Type.java @@ -14,7 +14,7 @@ import org.apache.uima.jcas.tcas.Annotation_Type; /** - * Updated by JCasGen Thu Apr 25 13:17:47 BST 2013 + * Updated by JCasGen Fri Jan 30 10:19:46 CST 2015 * @generated */ public class Trigger_Type extends Annotation_Type { /** @generated */ diff --git a/src/info/chenli/litway/exec/AbstractInstances.java b/src/info/chenli/litway/exec/AbstractInstances.java new file mode 100644 index 0000000..46b8d9d --- /dev/null +++ b/src/info/chenli/litway/exec/AbstractInstances.java @@ -0,0 +1,1075 @@ +package info.chenli.litway.exec; + +import info.chenli.classifier.Instance; +import info.chenli.litway.config.EventType; +import info.chenli.litway.corpora.Event; +import info.chenli.litway.corpora.POS; +import info.chenli.litway.corpora.Protein; +import info.chenli.litway.corpora.Sentence; +import info.chenli.litway.corpora.Token; +import info.chenli.litway.corpora.Trigger; +import info.chenli.litway.searn.StructuredInstance; +import info.chenli.litway.util.BioLemmatizerUtil; +import info.chenli.litway.util.DependencyExtractor; +import info.chenli.litway.util.FileFilterImpl; +import info.chenli.litway.util.FileUtil; +import info.chenli.litway.util.StanfordDependencyReader.Pair; + +import java.io.File; +import java.io.IOException; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Set; +import java.util.TreeMap; +import java.util.logging.Level; +import java.util.logging.Logger; + +import org.apache.uima.UIMAFramework; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.cas.CASRuntimeException; +import org.apache.uima.cas.FSIterator; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.resource.ResourceSpecifier; +import org.apache.uima.util.FileUtils; +import org.apache.uima.util.XMLInputSource; +import org.uimafit.util.JCasUtil; + +public abstract class AbstractInstances { + + private final static Logger logger = Logger + .getLogger(AbstractInstances.class.getName()); + + private int[] annotationTypes; // the annotation types need consideration + private String taeDescriptor; + protected List structuredInstances = new LinkedList(); + protected List instances = null; + protected List labelsString; + + // public static final String aStopWord = "!AStopWord!"; + + protected void setTaeDescriptor(String taeDescriptor) { + + this.taeDescriptor = taeDescriptor; + + } + + protected XMLInputSource getXMLInputSource() throws IOException, + URISyntaxException { + + URL url = this.getClass().getResource(taeDescriptor); + return new XMLInputSource(url); + }; + + private AnalysisEngine ae = null; + + public AbstractInstances(int[] annotationTypes) { + + this.annotationTypes = annotationTypes; + 
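+ // Note (descriptive comment): these UIMA type codes (e.g. Protein.type, + // Event.type in the subclasses) select which annotation indexes + // processSingleFile() iterates when collecting structured instances. +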
+ } + + private void init() { + + labelsString = getLabelsString(); + + instances = new ArrayList(); + + try { + + XMLInputSource in = getXMLInputSource(); + ResourceSpecifier specifier = UIMAFramework.getXMLParser() + .parseResourceSpecifier(in); + + logger.info(specifier.getSourceUrlString()); + + // create Analysis Engine + ae = UIMAFramework.produceAnalysisEngine(specifier); + + } catch (Exception e) { + + logger.log(Level.SEVERE, e.getMessage()); + throw new RuntimeException(e); + + } + } + + protected abstract List getLabelsString(); + + public List getInstances(File dataDir, String argument) { + + if (null == ae) { + init(); + } + + if (dataDir.isFile()) { + + processSingleFile(dataDir, argument); + + } else { + // get all files in the input directory + File[] files = dataDir.listFiles(new FileFilterImpl(".txt")); + if (files == null) { + + logger.log(Level.WARNING, "Empty directory."); + + instances = null; + + } else { + // process documents + for (int i = 0; i < files.length; i++) { + if (!files[i].isDirectory()) { + + processSingleFile(files[i], argument); + } + } + } + } + ae.destroy(); + + return getInstances(); + } + + protected JCas processSingleFile(File aFile, String argument) { + + if (null == ae) { + init(); + } + + String document = null; + + try { + + document = FileUtils.file2String(aFile); + + } catch (IOException e) { + + logger.log(Level.SEVERE, e.getMessage()); + throw new RuntimeException(e); + } + + document = document.trim(); + + try { + // create a CAS + CAS cas = ae.newCAS(); + + // put document text in CAS + cas.setDocumentText(document); + + // set the path of resource file + cas.createView("FilePath").setSofaDataURI(aFile.getAbsolutePath(), + "text"); + + // process + ae.process(cas); + + FSIterator annoIter = null; + JCas jcas = null; + jcas = cas.getJCas(); + for (int annotationType : annotationTypes) { + annoIter = jcas.getAnnotationIndex(annotationType).iterator(); + structuredInstances.addAll(getStructuredInstances(jcas, + annoIter, argument)); + } + + return jcas; + + } catch (AnalysisEngineProcessException e) { + + logger.log(Level.SEVERE, e.getMessage()); + throw new RuntimeException(e); + } catch (CASRuntimeException e) { + + logger.log(Level.SEVERE, e.getMessage()); + throw new RuntimeException(e); + + } catch (CASException e) { + + logger.log(Level.SEVERE, e.getMessage()); + throw new RuntimeException(e); + + } catch (ResourceInitializationException e) { + logger.log(Level.SEVERE, e.getMessage()); + throw new RuntimeException(e); + } + + } + + protected abstract List getStructuredInstances( + JCas jcas, FSIterator annoIter, String argument); + + public List getStructuredInstances() { + + return structuredInstances; + } + + public List getInstances() { + + if (instances.size() == 0) { + for (StructuredInstance si : structuredInstances) { + instances.addAll(si.getNodes()); + } + } + + return instances; + } + + /** + * If the trigger is multi-token, the trigger token is chosen in the + * priority order noun, adjective, adverb, verb. This is based on + * observations of the training data. See details in + * {@link info.chenli.ee.bionlp13.POSPrioritizer} + * + * TODO the multi-token policy could be moved into a configuration file. + * Alternatively, it could be learned dynamically.
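+ * <p> + * Illustrative only: the effect of that priority could be expressed by a + * comparator like the hypothetical sketch below. This is an assumption made + * for illustration, not the shipped POSPrioritizer, and it assumes the POS + * enum uses Penn-style constant names (NN*, JJ*, RB*, VB*): + * + * <pre>{@code + * java.util.Comparator<POS> byPriority = new java.util.Comparator<POS>() { + *     // Hypothetical ranking: lower rank means higher priority. + *     private int rank(POS p) { + *         String n = p.name(); + *         if (n.startsWith("NN")) return 0; // nouns first + *         if (n.startsWith("JJ")) return 1; // then adjectives + *         if (n.startsWith("RB")) return 2; // then adverbs + *         if (n.startsWith("VB")) return 3; // verbs last + *         return 4;                         // anything else + *     } + *     public int compare(POS a, POS b) { return rank(a) - rank(b); } + * }; + * }</pre>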
+ */ + protected Token getTriggerToken(List tokens) { + + if (tokens.size() > 1) { + + TreeMap sortedTokens = new TreeMap( + new POSPrioritizer()); + + for (Token token : tokens) { + + sortedTokens.put(POS.valueOf(token.getPos()), token); + if (TriggerWord.isATriggerWord(token.getCoveredText()) != null) { + return token; + } + } + + return sortedTokens.firstEntry().getValue(); + + } else if (tokens.size() == 1) { + + return tokens.get(0); + } + + return null; + } + + protected Token getTriggerToken(JCas jcas, Trigger trigger) { + // get tokens + List tokens = JCasUtil.selectCovered(jcas, Token.class, trigger); + + if (tokens.size() == 0) + // if the trigger is within a token, take the nesting token. + // This happens, e.g., in PMC-2065877-01-Introduction. + { + FSIterator iter = jcas.getAnnotationIndex(Token.type) + .iterator(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() <= trigger.getBegin() + && token.getEnd() >= trigger.getEnd()) { + return token; + } + } + + } else + // take one of the nested tokens. + { + return getTriggerToken(tokens); + } + return null; + } + + public void saveInstances(File file) { + + StringBuffer sb = new StringBuffer(); + + for (Instance instance : instances) { + + sb.append(instance.getLabelString()); + + for (String[] features : instance.getFeaturesString()) { + for (String feature : features) { + if (null == feature) { + continue; + } + sb.append("\t".concat(feature)); + } + } + + sb.append("\n"); + } + + String instancesStr = sb.toString(); + FileUtil.saveFile(instancesStr, file); + } + + public void saveNumericInstances(File file) { + + StringBuffer sb = new StringBuffer(); + + for (Instance instance : instances) { + + sb.append(String.valueOf(instance.getLabel())); + + for (int feature : instance.getFeaturesNumeric()) { + sb.append("\t".concat(String.valueOf(feature))); + } + + sb.append("\n"); + } + + String instancesStr = sb.toString(); + FileUtil.saveFile(instancesStr, file); + } + + public void saveSvmLightInstances(File file) { + + StringBuffer sb = new StringBuffer(); + + for (Instance instance : instances) { + + sb.append(String.valueOf(instance.getLabel())); + + int previousIndex = 0; + for (int feature : instance.getFeaturesNumeric()) { + if (feature > previousIndex) { + sb.append(" ".concat(String.valueOf(feature)).concat(":1")); + } + previousIndex = feature; + } + + sb.append("\n"); + } + + String instancesStr = sb.toString(); + FileUtil.saveFile(instancesStr, file); + } + + protected Instance themeToInstance(JCas jcas, Sentence sentence, + Annotation anno, Trigger trigger, Set pairsOfSentence, + DependencyExtractor dependencyExtractor, boolean isTruepositive) { + return themeCauseToInstance(jcas, sentence, anno, trigger, + pairsOfSentence, dependencyExtractor, isTruepositive, + Stage.THEME, null); + } + + protected Instance causeToInstance(JCas jcas, Sentence sentence, + Annotation anno, Trigger trigger, Set pairsOfSentence, + DependencyExtractor dependencyExtractor, boolean isTruepositive, + Token themeToken) { + return themeCauseToInstance(jcas, sentence, anno, trigger, + pairsOfSentence, dependencyExtractor, isTruepositive, + Stage.CAUSE, themeToken); + } + + /** + * Convert a protein or event into a theme instance ready for training or + * predicting. + * + * @param jcas + * @param anno + * It could be a protein or an event trigger.
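+ * (Descriptive note: for example, for a Binding trigger and a protein + * argument, the resulting instance carries the trigger-argument + * dependency path, a simplified variant of that path, trigger + * class/lemma/POS features, and the token string between trigger and + * argument, as constructed in the method body below.) + *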
+ * @param event + * @param dependencyExtractor + * @param themeOrCause + * @param themeToken + * @return + */ + private Instance themeCauseToInstance(JCas jcas, Sentence sentence, + Annotation anno, Trigger trigger, Set pairsOfSentence, + DependencyExtractor dependencyExtractor, boolean isTruepositive, + Stage stage, Token themeToken) { + + if (!(anno instanceof Trigger) && !(anno instanceof Protein)) { + throw new IllegalArgumentException( + "The theme/cause has to be a protein or trigger."); + } + + List annoTokens = JCasUtil + .selectCovered(jcas, Token.class, anno); + + // if protein/trigger is within a token + if (annoTokens.size() == 0) { + FSIterator iter = jcas.getAnnotationIndex(Token.type) + .iterator(); + annoTokens = new ArrayList(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() <= anno.getBegin() + && token.getEnd() >= anno.getEnd()) { + annoTokens.add(token); + break; + } + } + } + + Token annoToken = null; + if (anno instanceof Protein) + // Take the last non-numeric token if the protein is + // multi-token. + { + annoToken = annoTokens.get(annoTokens.size() - 1); + // for (Token aToken : annoTokens) { + // + // try { + // Double.parseDouble(aToken.getLemma()); + // break; + // } catch (NumberFormatException e) { + // token = aToken; + // } + // + // } + } else if (anno instanceof Trigger) { + annoToken = getTriggerToken(jcas, (Trigger) anno); + } + + Instance instance = new Instance(); + List featuresString = new ArrayList(); + instance.setFeaturesString(featuresString); + + // get trigger token + Token triggerToken = getTriggerToken(jcas, trigger); + + // parser : dependency path between trigger-argument + String dependencyPath = dependencyExtractor.getShortestPath( + triggerToken, annoToken, stage); + String featurePath = dependencyPath; + + if (null == dependencyPath) { + featurePath = dependencyExtractor.getReversedShortestPath( + triggerToken, annoToken, stage); + } + boolean areSameTokens = (annoToken.getBegin() == triggerToken + .getBegin() && annoToken.getEnd() == triggerToken.getEnd()); + featurePath = areSameTokens ? "SAMETOKEN" : featurePath; + featurePath = (null == featurePath ? null : "dep_".concat(featurePath)); + featuresString.add(null == featurePath ? new String[0] + : new String[] { featurePath }); + + // parser refined? + + // parser_simple: grouping of dependency types; + // amod, nn --> nmod + // anything ending in subj --> subj + // anything ending in subjpass --> subjpass + String simplifiedFeaturePath = null; + if (null != dependencyPath) { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedShortestPath(triggerToken, annoToken, stage); + } else { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedReversedShortestPath(triggerToken, annoToken, + stage); + } + simplifiedFeaturePath = areSameTokens ? "SAMETOKEN" + : simplifiedFeaturePath; + simplifiedFeaturePath = (null == simplifiedFeaturePath ? null + : "dep_simple_".concat(simplifiedFeaturePath)); + featuresString.add(null == simplifiedFeaturePath ?
new String[0] + : new String[] { simplifiedFeaturePath }); + + // trigger class + String triggerClassString; + if (EventTypes.isSimpleEvent(trigger.getEventType())) { + triggerClassString = "class_Simple"; + } else if (EventTypes.isBindingEvent(trigger.getEventType())) { + triggerClassString = "class_Binding"; + } else if (EventTypes.isRegulatoryEvent(trigger.getEventType())) { + triggerClassString = "class_Regulation"; + } else { + triggerClassString = "class_Complex"; + } + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + featurePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerClassString.concat("_").concat( + simplifiedFeaturePath) }); + + // trigger token & trigger type + String triggerText = "text_".concat(trigger.getCoveredText() + .toLowerCase()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerText.concat("_").concat(featurePath) }); + String eventType = "eventType_".concat(trigger.getEventType()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { eventType.concat("_").concat(featurePath) }); + + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { triggerText.concat("_").concat( + simplifiedFeaturePath) }); + featuresString.add(null == simplifiedFeaturePath ? new String[0] + : new String[] { eventType.concat("_").concat( + simplifiedFeaturePath) }); + + // trigger lemma (using the token's POS, which may be inaccurate) + String triggerLemma = "triggerLemma_".concat(BioLemmatizerUtil + .lemmatizeWord(trigger.getCoveredText(), triggerToken.getPos()) + .toLowerCase()); + featuresString + .add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat( + featurePath) }); + + // trigger sublemma + String triggerSubLemma = (null == triggerToken.getSubLemma() ? triggerLemma + : "triggerSubLemma_".concat(triggerToken.getSubLemma() + .toLowerCase())); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath) }); + + // trigger POS + String triggerPos = "triggerPos_".concat(triggerToken.getPos()); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPos.concat("_").concat(featurePath) }); + String triggerPosShort = "triggerShortPos_".concat(triggerToken + .getPos().substring(0, 1)); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerPosShort.concat("_") + .concat(featurePath) }); + + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerLemma.concat("_").concat( + triggerPosShort) }); + featuresString.add(new String[] { triggerSubLemma.concat("_").concat( + triggerPos) }); + featuresString.add(new String[] { triggerSubLemma.concat("_").concat( + triggerPosShort) }); + + // argument type + String argType = null; + if (anno instanceof Protein) { + argType = "argType_Protein"; + } else if (anno instanceof Trigger) { + argType = "argType_".concat(((Trigger) anno).getEventType()); + } + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerLemma.concat("_").concat(featurePath) + .concat("_").concat(argType) }); + featuresString.add(null == featurePath ? new String[0] + : new String[] { triggerSubLemma.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + featuresString.add(null == featurePath ? 
new String[0] + : new String[] { triggerClassString.concat("_") + .concat(featurePath).concat("_").concat(argType) }); + + // text string from trigger to theme/cause: compensate when parsing + // fails + String textBetween = "", textAbsBetween = "", textShortBetween = ""; + + if (!areSameTokens) { + List tokensBetween = JCasUtil.selectCovered(jcas, + Token.class, sentence); + List proteinsBetween = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + int start = Math.min(annoToken.getBegin(), triggerToken.getBegin()); + int end = Math.max(annoToken.getEnd(), triggerToken.getEnd()); + boolean reversed = (start != triggerToken.getBegin()); + + List tokensTextBetween = new ArrayList(); + List tokensAbsTextBetween = new ArrayList(); + + tokensLoop: for (Token aToken : tokensBetween) { + + if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) { + continue tokensLoop; + } else if (aToken.getEnd() >= end) { + break tokensLoop; + } + + // if it is a protein + for (Protein aProtein : proteinsBetween) { + if (aToken.getBegin() == aProtein.getBegin()) { + tokensTextBetween.add("PROTEIN"); + tokensAbsTextBetween.add("PROTEIN"); + continue tokensLoop; + } else if (aToken.getBegin() > aProtein.getBegin() + && aToken.getEnd() <= aProtein.getEnd()) { + continue tokensLoop; + } + } + if (aToken.getBegin() == trigger.getBegin()) { + tokensAbsTextBetween.add(trigger.getEventType()); + continue tokensLoop; + } else if (aToken.getBegin() > trigger.getBegin() + && aToken.getEnd() <= trigger.getEnd()) { + continue tokensLoop; + } + + tokensTextBetween.add(aToken.getLemma().toLowerCase()); + tokensAbsTextBetween.add(aToken.getLemma().toLowerCase()); + + } + + for (String aText : tokensTextBetween) { + if (reversed) { + textBetween = aText.concat(textBetween.equals("") ? "" + : "_".concat(textBetween)); + } else { + textBetween = textBetween.equals("") ? aText : textBetween + .concat("_").concat(aText); + } + } + for (String aText : tokensAbsTextBetween) { + if (reversed) { + textAbsBetween = aText + .concat(textAbsBetween.equals("") ? "" : "_" + .concat(textAbsBetween)); + } else { + textAbsBetween = textAbsBetween.equals("") ? aText + : textAbsBetween.concat("_").concat(aText); + } + } + // concatenate text between trigger and theme/cause with the + // previous + // features. + textBetween = textBetween.equals("") ? null : "textString_".concat( + reversed ? "reversed_" : "").concat(textBetween); + textAbsBetween = textAbsBetween.equals("") ? null + : "textStringAbs_".concat(reversed ? "reversed_" : "") + .concat(textAbsBetween); + for (int i = 1; i < tokensAbsTextBetween.size() - 1; i++) { + if (reversed) { + textShortBetween = tokensAbsTextBetween.get(i).concat( + textShortBetween.equals("") ? "" : "_" + .concat(textShortBetween)); + } else { + textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween + .get(i) : textShortBetween.concat("_").concat( + tokensAbsTextBetween.get(i)); + } + } + textShortBetween = textShortBetween.equals("") ? null + : "textStringShort_".concat(reversed ? "reversed_" : "") + .concat(textShortBetween); + } else { + textBetween = "SAMETOKEN"; + textAbsBetween = "SAMETOKEN"; + textShortBetween = "SAMETOKEN"; + } + + featuresString.add(null == textBetween ? new String[0] + : new String[] { textBetween }); + featuresString.add(null == textBetween ? new String[0] + : new String[] { triggerText.concat("_").concat(textBetween) }); + featuresString + .add(null != textBetween && null != dependencyPath ? 
new String[] { dependencyPath + .concat("_").concat(textBetween) } : new String[0]); + + featuresString.add(null == textAbsBetween ? new String[0] + : new String[] { textAbsBetween }); + featuresString + .add(null == textAbsBetween ? new String[0] + : new String[] { triggerText.concat("_").concat( + textAbsBetween) }); + + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { textShortBetween }); + featuresString.add(null == textShortBetween ? new String[0] + : new String[] { triggerText.concat("_").concat( + textShortBetween) }); + featuresString + .add(null != textShortBetween && null != dependencyPath ? new String[] { dependencyPath + .concat("_").concat(textShortBetween) } : new String[0]); + + if (stage.equals(Stage.CAUSE)) { + String pathToTheme = null; + if (null != themeToken) { + pathToTheme = dependencyExtractor.getShortestPath(annoToken, + themeToken, stage); + if (null == pathToTheme) { + pathToTheme = dependencyExtractor.getReversedShortestPath( + annoToken, themeToken, stage); + } + } + featuresString + .add(null != pathToTheme && themeToken != null ? new String[] { pathToTheme } + : new String[0]); + } + + String label; + switch (stage) { + case THEME: + label = "Theme"; + break; + case CAUSE: + label = "Cause"; + break; + default: + label = null; + } + if (isTruepositive) { + + instance.setLabelString(label); + + } else { + instance.setLabelString("Non_".concat(label.toLowerCase())); + } + + return instance; + } + + protected Instance bindingEventToInstance(JCas jcas, Sentence sentence, + Event bindingEvent, List themes, + DependencyExtractor dependencyExtractor) { + + boolean truepositive = true; + if (null != bindingEvent.getThemes() + && themes.size() == bindingEvent.getThemes().size()) { + themeSearchingLoop: for (Protein protein : themes) { + boolean foundTheProtein = false; + for (int i = 0; i < bindingEvent.getThemes().size(); i++) { + if (protein.getId().equals(bindingEvent.getThemes(i))) { + foundTheProtein = true; + break; + } + } + if (foundTheProtein == false) { + truepositive = false; + break themeSearchingLoop; + } + } + } else { + truepositive = false; + } + + Instance instance = new Instance(); + + List featuresString = new ArrayList(); + instance.setFeaturesString(featuresString); + + Trigger trigger = bindingEvent.getTrigger(); + Token triggerToken = getTriggerToken(jcas, trigger); + + List themeTokens = new ArrayList(); + for (Protein aProtein : themes) { + List annoTokens = JCasUtil.selectCovered(jcas, Token.class, + aProtein); + + // if protein/trigger is within a token + if (annoTokens.size() == 0) { + FSIterator iter = jcas.getAnnotationIndex( + Token.type).iterator(); + annoTokens = new ArrayList(); + while (iter.hasNext()) { + Token token = (Token) iter.next(); + if (token.getBegin() <= aProtein.getBegin() + && token.getEnd() >= aProtein.getEnd()) { + annoTokens.add(token); + break; + } + } + } + + Token token = null; + token = annoTokens.get(0); + for (Token aToken : annoTokens) { + + try { + Double.parseDouble(aToken.getLemma()); + break; + } catch (NumberFormatException e) { + token = aToken; + } + } + themeTokens.add(token); + } + + if (themeTokens.size() == 0) { + throw new RuntimeException("Theme number is zero. Please check."); + } + String triggerText = "text_".concat(triggerToken.getCoveredText() + .toLowerCase()); + String triggerLemma = "triggerLemma_".concat(triggerToken.getLemma() + .toLowerCase()); + String triggerSubLemma = (null == triggerToken.getSubLemma() ? 
triggerToken + .getLemma() : "triggerSubLemma_".concat(triggerToken + .getSubLemma().toLowerCase())); + String triggerPos = "triggerPos_".concat(triggerToken.getPos()); + String triggerPosShort = "triggerShortPos_".concat(triggerToken + .getPos().substring(0, 1)); + + // parser : dependency path between trigger-argument + int i = 0; + String[] dependencyPaths = new String[themeTokens.size()]; + String[] simplifiedFeaturePaths = new String[themeTokens.size()]; + String[] triggerTextPaths = new String[themeTokens.size()]; + String[] triggerTextSimplifiedPaths = new String[themeTokens.size()]; + String[] triggerLemmaPaths = new String[themeTokens.size()]; + String[] triggerSubLemmaPaths = new String[themeTokens.size()]; + String[] triggerPosPaths = new String[themeTokens.size()]; + String[] triggerPosShortPaths = new String[themeTokens.size()]; + String[] textBetweens = new String[themeTokens.size()]; + String[] triggerTextBetweens = new String[themeTokens.size()]; + String[] textBetweenDependencies = new String[themeTokens.size()]; + String[] textAbsBetweenDependencies = new String[themeTokens.size()]; + String[] textShortBetweens = new String[themeTokens.size()]; + String[] textShortBetweenDependencyPaths = new String[themeTokens + .size()]; + for (Token aThemeToken : themeTokens) { + String dependencyPath = dependencyExtractor.getShortestPath( + triggerToken, aThemeToken, null); + String featurePath = dependencyPath; + + if (null == dependencyPath) { + featurePath = dependencyExtractor.getReversedShortestPath( + triggerToken, aThemeToken, null); + } + featurePath = (null == featurePath ? null : "dep_" + .concat(featurePath)); + dependencyPaths[i] = featurePath; + + String simplifiedFeaturePath = null; + // parser refined? + + // parser_simple: grouping of dependency types; + // amod, nn --> nmod + // anything ending in subj --> subj + // anything ending in subjpass --> subjpass + if (null != dependencyPath) { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedShortestPath(triggerToken, aThemeToken, + null); + } else { + simplifiedFeaturePath = dependencyExtractor + .getSimplifiedReversedShortestPath(triggerToken, + aThemeToken, null); + } + simplifiedFeaturePath = (null == simplifiedFeaturePath ? null + : "dep_simple_".concat(simplifiedFeaturePath)); + simplifiedFeaturePaths[i] = simplifiedFeaturePath; + + triggerTextPaths[i] = null == featurePath ? null : triggerText + .concat("_").concat(featurePath); + triggerTextSimplifiedPaths[i] = null == simplifiedFeaturePath ? null + : triggerText.concat("_").concat(simplifiedFeaturePath); + + triggerLemmaPaths[i] = null == featurePath ? null : triggerLemma + .concat("_").concat(featurePath); + + triggerSubLemmaPaths[i] = null == featurePath ? null + : triggerSubLemma.concat("_").concat(featurePath); + + triggerPosPaths[i] = null == featurePath ? null : triggerPos + .concat("_").concat(featurePath); + triggerPosShortPaths[i] = null == featurePath ?
null + : triggerPosShort.concat("_").concat(featurePath); + + // text string from trigger to theme/cause: compensate when parsing + // fails + List tokensBetween = JCasUtil.selectCovered(jcas, + Token.class, sentence); + List proteinsBetween = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + int start = Math.min(aThemeToken.getBegin(), + triggerToken.getBegin()); + int end = Math.max(aThemeToken.getEnd(), triggerToken.getEnd()); + boolean reversed = (start != triggerToken.getBegin()); + + List tokensTextBetween = new ArrayList(); + List tokensAbsTextBetween = new ArrayList(); + + tokensLoop: for (Token aToken : tokensBetween) { + + if (aToken.getBegin() < start || !POS.isPos(aToken.getPos())) { + continue tokensLoop; + } else if (aToken.getEnd() >= end) { + break tokensLoop; + } + + // if it is a protein + for (Protein aProtein : proteinsBetween) { + if (aToken.getBegin() == aProtein.getBegin()) { + tokensTextBetween.add("PROTEIN"); + tokensAbsTextBetween.add("PROTEIN"); + continue tokensLoop; + } else if (aToken.getBegin() > aProtein.getBegin() + && aToken.getEnd() <= aProtein.getEnd()) { + continue tokensLoop; + } + } + if (aToken.getBegin() == trigger.getBegin()) { + tokensAbsTextBetween.add(trigger.getEventType()); + continue tokensLoop; + } else if (aToken.getBegin() > trigger.getBegin() + && aToken.getEnd() <= trigger.getEnd()) { + continue tokensLoop; + } + + tokensTextBetween.add(aToken.getLemma().toLowerCase()); + tokensAbsTextBetween.add(aToken.getLemma().toLowerCase()); + + } + + String textBetween = "", textAbsBetween = ""; + for (String aText : tokensTextBetween) { + if (reversed) { + textBetween = aText.concat(textBetween.equals("") ? "" + : "_".concat(textBetween)); + } else { + textBetween = textBetween.equals("") ? aText : textBetween + .concat("_").concat(aText); + } + } + for (String aText : tokensAbsTextBetween) { + if (reversed) { + textAbsBetween = aText + .concat(textAbsBetween.equals("") ? "" : "_" + .concat(textAbsBetween)); + } else { + textAbsBetween = textAbsBetween.equals("") ? aText + : textAbsBetween.concat("_").concat(aText); + } + } + + textBetweens[i] = textBetween.equals("") ? null : "textString_" + .concat(reversed ? "reversed_" : "").concat(textBetween); + + triggerTextBetweens[i] = null == textBetween ? null : triggerText + .concat("_").concat(textBetween); + textBetweenDependencies[i] = null != textBetween + && null != dependencyPath ? dependencyPath.concat("_") + .concat(textBetween) : null; + textAbsBetweenDependencies[i] = textAbsBetween.equals("") ? null + : "textStringAbs_".concat(reversed ? "reversed_" : "") + .concat(textAbsBetween); + + String textShortBetween = ""; + for (int j = 1; j < tokensAbsTextBetween.size() - 1; j++) { + if (reversed) { + textShortBetween = tokensAbsTextBetween.get(j).concat( + textShortBetween.equals("") ? "" : "_" + .concat(textShortBetween)); + } else { + textShortBetween = textShortBetween.equals("") ? tokensAbsTextBetween + .get(j) : textShortBetween.concat("_").concat( + tokensAbsTextBetween.get(j)); + } + } + + textShortBetweens[i] = textShortBetween.equals("") ? null + : "textStringShort_".concat(reversed ? "reversed_" : "") + .concat(textShortBetween); + textShortBetweenDependencyPaths[i] = null != textShortBetween + && null != dependencyPath ? 
+			i++;
+		}
+
+		featuresString.add(dependencyPaths);
+		featuresString.add(simplifiedFeaturePaths);
+
+		// trigger token & trigger type
+		featuresString.add(triggerTextPaths);
+		featuresString.add(triggerTextSimplifiedPaths);
+
+		// trigger lemma
+		featuresString.add(triggerLemmaPaths);
+
+		// trigger sublemma
+		featuresString.add(triggerSubLemmaPaths);
+
+		// trigger POS
+		featuresString.add(triggerPosPaths);
+		featuresString.add(triggerPosShortPaths);
+
+		featuresString.add(new String[] { triggerLemma.concat("_").concat(
+				triggerPos) });
+		featuresString.add(new String[] { triggerLemma.concat("_").concat(
+				triggerPosShort) });
+		featuresString.add(new String[] { triggerSubLemma.concat("_").concat(
+				triggerPos) });
+		featuresString.add(new String[] { triggerSubLemma.concat("_").concat(
+				triggerPosShort) });
+
+		// concatenate the text between trigger and theme/cause with the
+		// previous features.
+		featuresString.add(textBetweens);
+		featuresString.add(triggerTextBetweens);
+		featuresString.add(textBetweenDependencies);
+		featuresString.add(textAbsBetweenDependencies);
+
+		featuresString.add(textShortBetweens);
+		featuresString.add(textShortBetweenDependencyPaths);
+
+		if (truepositive) {
+			instance.setLabelString("Binding");
+		} else {
+			instance.setLabelString("Non_binding");
+		}
+
+		return instance;
+	}
+
+	protected boolean isWord(String text) {
+		// a word starts with a letter and is at least two characters long
+		return text.matches("^[a-zA-Z].+");
+	}
+
+	/**
+	 * Only Binding events may have more than one theme, and Binding has no
+	 * cause argument. Therefore, a single theme token is returned.
+	 * 
+	 * @param event
+	 * @return
+	 */
+	protected Token getThemeToken(JCas jcas, Event event, Sentence sentence) {
+
+		String themeId = event.getThemes(0);
+
+		for (Protein protein : JCasUtil.selectCovered(jcas, Protein.class,
+				sentence)) {
+			if (themeId.equals(protein.getId())) {
+				return getProteinToken(jcas, protein);
+			}
+		}
+		for (Event anEvent : JCasUtil
+				.selectCovered(jcas, Event.class, sentence)) {
+			// an event can't be the theme of itself
+			if (event.getBegin() == anEvent.getBegin()
+					&& event.getEnd() == anEvent.getEnd()) {
+				continue;
+			}
+			if (themeId.equals(anEvent.getId())) {
+				return getTriggerToken(jcas, anEvent.getTrigger());
+			}
+		}
+		return null;
+	}
+
+	protected Token getProteinToken(JCas jcas, Protein protein) {
+		// find the protein token
+		List<Token> proteinTokens = JCasUtil.selectCovered(jcas, Token.class,
+				protein);
+
+		// if the protein is within a token
+		if (proteinTokens.size() == 0) {
+			FSIterator<Annotation> iter = jcas.getAnnotationIndex(Token.type)
+					.iterator();
+			proteinTokens = new ArrayList<Token>();
+			while (iter.hasNext()) {
+				Token aToken = (Token) iter.next();
+				if (aToken.getBegin() <= protein.getBegin()
+						&& aToken.getEnd() >= protein.getEnd()) {
+					proteinTokens.add(aToken);
+					break;
+				}
+			}
+		}
+
+		if (proteinTokens.size() == 0) {
+			logger.warning("No token found for protein.");
+			return null;
+		}
+
+		return proteinTokens.get(proteinTokens.size() - 1);
+	}
+
+}
diff --git a/src/info/chenli/litway/exec/ArgumentInstances.java b/src/info/chenli/litway/exec/ArgumentInstances.java
new file mode 100644
index 0000000..f89136d
--- /dev/null
+++ b/src/info/chenli/litway/exec/ArgumentInstances.java
@@ -0,0 +1,156 @@
+package info.chenli.litway.exec;
+
+import info.chenli.classifier.Instance;
+import info.chenli.classifier.InstanceDictionary;
+import info.chenli.litway.corpora.Entity;
+import info.chenli.litway.corpora.Event;
+import info.chenli.litway.corpora.Protein;
+import 
info.chenli.litway.corpora.Sentence; +import info.chenli.litway.corpora.Token; +import info.chenli.litway.searn.StructuredInstance; +import info.chenli.litway.util.DependencyExtractor; +import info.chenli.litway.util.FileUtil; +import info.chenli.litway.util.StanfordDependencyReader; +import info.chenli.litway.util.StanfordDependencyReader.Pair; +import info.chenli.litway.util.UimaUtil; + +import java.io.File; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.uimafit.util.JCasUtil; + +public class ArgumentInstances extends AbstractInstances { + + private final static Logger logger = Logger.getLogger(ArgumentInstances.class + .getName()); + + public ArgumentInstances() { + super(new int[] { Protein.type, Event.type }); + + } + + @Override + protected List getLabelsString() { + + ArrayList themeTypes = new ArrayList(); + + themeTypes.add("Theme"); + themeTypes.add("Non_theme"); + + return themeTypes; + + } + + @Override + protected List getStructuredInstances(JCas jcas, + FSIterator tokenIter, String argument) { + + List results = new LinkedList(); + + AnnotationIndex sentenceIndex = jcas + .getAnnotationIndex(Sentence.type); + + FSIterator sentenceIter = sentenceIndex.iterator(); + Map> pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + + // Currently, one sentence is considered as one structured instance. + while (sentenceIter.hasNext()) { + + StructuredInstance si = new StructuredInstance(); + List candidates = new LinkedList(); + si.setNodes(candidates); + + Sentence sentence = (Sentence) sentenceIter.next(); + Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + + DependencyExtractor dependencyExtractor = new DependencyExtractor( + JCasUtil.selectCovered(jcas, Token.class, sentence), + pairsOfSentence); + + List events = JCasUtil.selectCovered(jcas, Event.class, + sentence); + List proteins = JCasUtil.selectCovered(jcas, + Entity.class, sentence); + + for (Event event : events) { + + for (int i = 0; i < event.getArguments(argument).size(); i++) { + + // check protein themes + for (Entity protein : proteins) { + + boolean isTheme = event.getThemes(i).equals( + protein.getId()); + + candidates.add(themeToInstance(jcas, sentence, + protein, event.getTrigger(), pairsOfSentence, + dependencyExtractor, isTheme)); + } + + // check event themes + for (Event themeEvent : events) { + + if (event != themeEvent) { + + boolean isTheme = event.getThemes(i).equals( + themeEvent.getId()); + + candidates.add(themeToInstance(jcas, sentence, + themeEvent.getTrigger(), + event.getTrigger(), pairsOfSentence, + dependencyExtractor, isTheme)); + } + } + } + } + + results.add(si); + } + + return results; + } + + public static void main(String[] args) { + + ArgumentInstances ti = new ArgumentInstances(); + ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + + List instances = ti.getInstances(new File(args[0])); + + InstanceDictionary dict = new InstanceDictionary(); + dict.creatNumericDictionary(instances); + String classifierName = "liblinear"; + dict.saveDictionary(new File("./model/themes.".concat(classifierName) + .concat(".dict"))); + + ti.saveInstances(new File("./model/instances.theme.txt")); + 
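// The SVM-light file written below is assumed to use the standard
+		// "label index:value index:value ..." line format (hypothetical
+		// example: "1 14:1 87:1 203:1"), with indices taken from the
+		// dictionary created above.
+		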
ti.saveSvmLightInstances(new File("./model/instances.theme.svm.txt")); + + if (args.length == 2 && args[1].equals("dev")) { + + ArgumentInstances testInstances = new ArgumentInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List tInstances = testInstances.getInstances(new File( + "./data/development/")); + + tInstances = dict.instancesToNumeric(tInstances); + + testInstances.saveInstances(new File( + "./model/instances.theme.dev.txt")); + testInstances.saveSvmLightInstances(new File( + "./model/instances.theme.svm.dev.txt")); + } + + } +} diff --git a/src/info/chenli/litway/exec/ArgumentRecogniser.java b/src/info/chenli/litway/exec/ArgumentRecogniser.java new file mode 100644 index 0000000..05666b2 --- /dev/null +++ b/src/info/chenli/litway/exec/ArgumentRecogniser.java @@ -0,0 +1,101 @@ +package info.chenli.litway.exec; + +import info.chenli.classifier.Accurary; +import info.chenli.classifier.Instance; +import info.chenli.classifier.InstanceDictionary; +import info.chenli.classifier.LibLinearFacade; + +import java.io.File; +import java.util.List; +import java.util.logging.Logger; + +/** + * + * @author Chen Li + * + */ +public class ArgumentRecogniser extends LibLinearFacade { + + private final static Logger logger = Logger.getLogger(ArgumentRecogniser.class + .getName()); + private final String classifierName = "liblinear"; + + public void train(File trainingSet, boolean useSearn) { + + if (useSearn) { + + } else { + + InstanceDictionary dict = new InstanceDictionary(); + + ArgumentInstances trainingInstances = new ArgumentInstances(); + trainingInstances + .setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = trainingInstances + .getInstances(trainingSet); + + dict.creatNumericDictionary(instances); + dict.saveDictionary(new File("./model/themes.".concat( + classifierName).concat(".dict"))); + + trainingInstances.saveInstances(new File( + "./model/instances.theme.txt")); + trainingInstances.saveSvmLightInstances(new File( + "./model/instances.theme.svm.txt")); + + train(dict.instancesToNumeric(instances)); + + // System.out.println(accuracy(instances)); + + ArgumentInstances testInstances = new ArgumentInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + instances = testInstances.getInstances(new File( + "./data/development/")); + instances = dict.instancesToNumeric(instances); + testInstances.saveSvmLightInstances(new File( + "./model/instances.theme.svm.dev.txt")); + + // System.out.println(accuracy(instances)); + } + + } + + public static void main(String[] args) { + + ArgumentRecogniser tr = new ArgumentRecogniser(); + tr.loadModel(new File("./model/themes.".concat(tr.classifierName) + .concat(".model"))); + + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/themes." 
+				.concat(tr.classifierName).concat(".dict")));
+
+		ArgumentInstances ti = new ArgumentInstances();
+		ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+
+		List<Instance> instances = ti.getInstances(new File(args[0]));
+
+		instances = dict.instancesToNumeric(instances);
+
+		int total = 0, correct = 0;
+		for (Instance instance : instances) {
+			int prediction = tr.predict(instance);
+			System.out.print(instance.getLabel() + ":" + prediction);
+			for (String[] values : instance.getFeaturesString()) {
+				for (String value : values) {
+					System.out.print("\t" + value);
+				}
+			}
+			System.out.println();
+			for (int value : instance.getFeaturesNumeric()) {
+				System.out.print("\t" + value);
+			}
+			System.out.println();
+			if (prediction == instance.getLabel()) {
+				correct++;
+			}
+			total++;
+		}
+		System.out.println(new Accurary(correct, total));
+	}
+}
diff --git a/src/info/chenli/litway/exec/POSPrioritizer.java b/src/info/chenli/litway/exec/POSPrioritizer.java
new file mode 100644
index 0000000..b576921
--- /dev/null
+++ b/src/info/chenli/litway/exec/POSPrioritizer.java
@@ -0,0 +1,34 @@
+package info.chenli.litway.exec;
+
+import info.chenli.litway.corpora.POS;
+
+import java.util.Comparator;
+
+/**
+ * This class prioritizes POS tags, which is useful, for example, for choosing
+ * one token out of a multi-token trigger: sorting tags with this comparator
+ * puts tags that appear earlier in {@link order} first (e.g. NN before VB).
+ *
+ * @author Chen Li
+ *
+ */
+public class POSPrioritizer implements Comparator<POS> {
+
+	/**
+	 * The available tags in this list should be identical to the ones in
+	 * {@link POS}
+	 *
+	 * @author Chen Li
+	 *
+	 */
+	public enum order {
+		NN, NNS, NNP, NNPS, VB, VBD, VBG, VBN, VBP, VBZ, RB, RBR, RBS, JJ, JJR, JJS, CC, CD, DT, EX, FW, IN, LS, MD, PDT, POS, PRP, PRP$, RP, SYM, TO, UH, WDT, WP, WP$, WRB
+	}
+
+	@Override
+	public int compare(POS pos1, POS pos2) {
+
+		return order.valueOf(String.valueOf(pos1)).compareTo(
+				order.valueOf(String.valueOf(pos2)));
+	}
+
+}
diff --git a/src/info/chenli/litway/exec/Trainer.java b/src/info/chenli/litway/exec/Trainer.java
new file mode 100644
index 0000000..5654354
--- /dev/null
+++ b/src/info/chenli/litway/exec/Trainer.java
@@ -0,0 +1,30 @@
+package info.chenli.litway.exec;
+
+import info.chenli.litway.config.Configuration;
+
+import java.io.File;
+import java.util.logging.Logger;
+
+public class Trainer {
+
+	private final static Logger logger = Logger.getLogger(Trainer.class
+			.getName());
+
+	public void collectInstances() {
+
+		Configuration conf = new Configuration("");
+
+		TriggerInstances ti = new TriggerInstances();
+		ti.getInstances(new File(""), "Trigger");
+
+		for (String arg : conf.getArgTypes()) {
+			ArgumentInstances ai = new ArgumentInstances();
+			ai.getInstances(new File(""), arg);
+		}
+
+	}
+
+	private void trainArgument(String arg) {
+	}
+
+}
diff --git a/src/info/chenli/litway/exec/TriggerInstances.java b/src/info/chenli/litway/exec/TriggerInstances.java
new file mode 100644
index 0000000..b665fda
--- /dev/null
+++ b/src/info/chenli/litway/exec/TriggerInstances.java
@@ -0,0 +1,471 @@
+package info.chenli.litway.exec;
+
+import info.chenli.classifier.Instance;
+import info.chenli.litway.config.Configuration;
+import info.chenli.litway.corpora.POS;
+import info.chenli.litway.corpora.Protein;
+import info.chenli.litway.corpora.Sentence;
+import info.chenli.litway.corpora.Token;
+import info.chenli.litway.corpora.Trigger;
+import info.chenli.litway.searn.StructuredInstance;
+import info.chenli.litway.util.DependencyExtractor;
+import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.StanfordDependencyReader;
+import 
info.chenli.litway.util.StanfordDependencyReader.Pair; +import info.chenli.litway.util.UimaUtil; + +import java.io.File; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.logging.Logger; + +import org.apache.uima.cas.FSIterator; +import org.apache.uima.cas.text.AnnotationIndex; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.uimafit.util.JCasUtil; + +public class TriggerInstances extends AbstractInstances { + + private final static Logger logger = Logger + .getLogger(TriggerInstances.class.getName()); + + public TriggerInstances() { + + super(new int[] { Token.type }); + + } + + @Override + protected List getLabelsString() { + + List tokenTypes = new Configuration("").getEventTypeNames(); + + return tokenTypes; + } + + @Override + protected List getStructuredInstances(JCas jcas, + FSIterator tokenIter, String argument) { + + List results = new LinkedList(); + + AnnotationIndex sentenceIndex = jcas + .getAnnotationIndex(Sentence.type); + + FSIterator sentenceIter = sentenceIndex.iterator(); + Map> pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + UimaUtil.getJCasFilePath(jcas)).concat(".sdepcc"))); + + // Currently, one sentence is considered as one structured instance. + while (sentenceIter.hasNext()) { + + StructuredInstance si = new StructuredInstance(); + LinkedList nodes = new LinkedList(); + si.setNodes(nodes); + + Sentence sentence = (Sentence) sentenceIter.next(); + Set pairsOfSentence = pairsOfArticle.get(sentence.getId()); + + List triggers = JCasUtil.selectCovered(jcas, + Trigger.class, sentence); + + // token id and event type + Map triggerTokens = new HashMap(); + + // mark trigger tokens + for (Trigger trigger : triggers) { + triggerTokens.put(getTriggerToken(jcas, trigger).getId(), + trigger.getEventType()); + } + + // List originalTokens = JCasUtil.selectCovered(jcas, + // Token.class, sentence); + List sentenceProteins = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + + // // print proteins which are within a token + // for (Token token : originalTokens) { + // + // for (Protein protein : sentenceProteins) { + // if (token.getBegin() == protein.getBegin() + // && token.getEnd() == protein.getEnd()) { + // continue; + // } + // if (token.getBegin() <= protein.getBegin() + // && token.getEnd() >= protein.getEnd()) { + // System.out.println(token.getCoveredText().concat("\t") + // .concat(protein.getCoveredText())); + // } + // } + // } + // + // if (true) { + // continue; + // } + // postProcessSentenceTokens(jcas, originalTokens, sentenceProteins, + // pairsOfSentence); + List tokensOfSentence = JCasUtil.selectCovered(jcas, + Token.class, sentence); + DependencyExtractor dependencyExtractor = new DependencyExtractor( + JCasUtil.selectCovered(jcas, Token.class, sentence), + pairsOfSentence); + + creatingInstanceLoop: for (Token token : tokensOfSentence) { + + // the tokens with protein have to be considered, as they may + // have trigger + // for (Protein protein : sentenceProteins) { + // if (token.getBegin() == protein.getBegin() + // && token.getEnd() == protein.getEnd()) { + // continue creatingInstanceLoop; + // } + // } + + nodes.add(tokenToInstance(jcas, token, triggerTokens, + tokensOfSentence, sentenceProteins, pairsOfSentence, + dependencyExtractor)); + } + + results.add(si); + } + + return results; + } + + /** + * + * @param jcas + * @param 
token + * @param triggerTokens + * Used for extracting instances from training set. null when + * being used for testing set. + * @param sentenceProteins + * @param dependencyExtractor + * @return + */ + protected Instance tokenToInstance(JCas jcas, Token token, + Map triggerTokens, List tokensOfSentence, + List sentenceProteins, Set pairsOfSentence, + DependencyExtractor dependencyExtractor) { + + // only consider the tokens, which are words. + Instance instance = new Instance(); + + List featureString = new ArrayList(); + instance.setFeaturesString(featureString); + + featureString.add(new String[] { "text_".concat(token.getCoveredText() + .toLowerCase()) }); + String lemma = "lemma_".concat(token.getLemma().toLowerCase()); + featureString.add(new String[] { lemma }); + String stem = "stem_".concat(token.getStem().toLowerCase()); + featureString.add(new String[] { stem }); + String pos = "pos_".concat(token.getPos()); + featureString.add(new String[] { lemma.concat("_").concat(pos) }); + + List modifiers = new ArrayList(); + List heads = new ArrayList(); + List simpleModifiers = new ArrayList(); + List simpleHeads = new ArrayList(); + List noLemmaModifiers = new ArrayList(); + List noLemmaHeads = new ArrayList(); + List noDepModifiers = new ArrayList(); + List noDepHeads = new ArrayList(); + List nsubjList = new ArrayList(); + List dobjList = new ArrayList(); + List iobjList = new ArrayList(); + for (Pair pair : pairsOfSentence) { + if (pair.getRelation().equalsIgnoreCase("punct")) { + continue; + } + if (pair.getHead() == token.getId()) { + for (Token aToken : tokensOfSentence) { + if (aToken.getId() == pair.getModifier()) { + String tokenLemma = isProtein(aToken, sentenceProteins) ? "PROTEIN" + : aToken.getLemma().toLowerCase(); + modifiers.add(lemma.concat("_") + .concat(pair.getRelation()).concat("_lemma_") + .concat(tokenLemma)); + simpleModifiers.add(lemma.concat("_") + .concat(pair.getSimpleRelation()) + .concat("_lemma_").concat(tokenLemma)); + noLemmaModifiers.add(pair.getRelation() + .concat("_lemma_").concat(tokenLemma)); + noDepModifiers.add(lemma.concat("_lemma_").concat( + tokenLemma)); + + // if (pair.getRelation().equalsIgnoreCase("nsubj")) { + // nsubjList.add("nsubj_lemma_".concat(aToken + // .getLemma())); + // } + // if (pair.getRelation().equalsIgnoreCase("dobj")) { + // dobjList.add("dobj_lemma_".concat(aToken.getLemma())); + // } + // if (pair.getRelation().equalsIgnoreCase("iobj")) { + // iobjList.add("iobj_lemma_".concat(aToken.getLemma())); + // } + } + } + } else if (pair.getModifier() == token.getId()) { + for (Token aToken : tokensOfSentence) { + if (aToken.getId() == pair.getHead()) { + String tokenLemma = isProtein(aToken, sentenceProteins) ? 
"PROTEIN" + : aToken.getLemma().toLowerCase(); + heads.add(lemma.concat("_-").concat(pair.getRelation()) + .concat("_lemma_").concat(tokenLemma)); + simpleHeads.add(lemma.concat("_-") + .concat(pair.getSimpleRelation()) + .concat("_lemma_").concat(tokenLemma)); + noLemmaHeads.add(pair.getRelation().concat("_lemma_") + .concat(tokenLemma)); + noDepHeads.add(lemma.concat("_-_lemma_").concat( + tokenLemma)); + } + } + } + } + String[] modifiersFeature = new String[modifiers.size()]; + modifiersFeature = modifiers.toArray(modifiersFeature); + String[] headsFeature = new String[heads.size()]; + headsFeature = heads.toArray(headsFeature); + String[] simpleModifiersFeature = new String[simpleModifiers.size()]; + simpleModifiersFeature = simpleModifiers + .toArray(simpleModifiersFeature); + String[] simpleHeadsFeature = new String[simpleHeads.size()]; + simpleHeadsFeature = simpleHeads.toArray(simpleHeadsFeature); + String[] noLemmaModifiersFeature = new String[noLemmaModifiers.size()]; + noLemmaModifiersFeature = noLemmaModifiers + .toArray(noLemmaModifiersFeature); + String[] noLemmaHeadsFeature = new String[noLemmaHeads.size()]; + noLemmaHeadsFeature = noLemmaHeads.toArray(noLemmaHeadsFeature); + String[] noDepModifiersFeature = new String[noDepModifiers.size()]; + noDepModifiersFeature = noDepModifiers.toArray(noDepModifiersFeature); + String[] noDepHeadsFeature = new String[noDepHeads.size()]; + noDepHeadsFeature = noDepHeads.toArray(noDepHeadsFeature); + String[] nsubjFeature = new String[nsubjList.size()]; + nsubjFeature = nsubjList.toArray(nsubjFeature); + String[] dobjFeature = new String[dobjList.size()]; + dobjFeature = dobjList.toArray(dobjFeature); + String[] iobjFeature = new String[iobjList.size()]; + iobjFeature = iobjList.toArray(iobjFeature); + + featureString.add(modifiersFeature); + featureString.add(headsFeature); + // featureString.add(simpleModifiersFeature); + // featureString.add(simpleHeadsFeature); + // featureString.add(noLemmaModifiersFeature); + // featureString.add(noLemmaHeadsFeature); + featureString.add(noDepModifiersFeature); + featureString.add(noDepHeadsFeature); + // featureString.add(nsubjFeature); + // featureString.add(dobjFeature); + // featureString.add(iobjFeature); + + String subLemma = "sublemma_" + .concat(null == token.getSubLemma() ? token.getLemma() + .toLowerCase() : token.getSubLemma().toLowerCase()); + featureString.add(new String[] { subLemma }); + String subStem = "substem_".concat(null == token.getSubStem() ? token + .getStem().toLowerCase() : token.getSubStem().toLowerCase()); + featureString.add(new String[] { subStem }); + + // + // ngram + // previous word + String leftTokenStr = token.getLeftToken() == null ? null : (POS + .isPos(token.getLeftToken().getPos()) ? "previousWord_" + .concat(token.getLeftToken().getLemma()) : null); + // featureString.add(null == leftTokenStr ? new String[0] + // : new String[] { leftTokenStr }); + // featureString.add(null == leftTokenStr ? new String[0] + // : new String[] { lemma.concat("_").concat(leftTokenStr) }); + String posLeftTokenStr = token.getLeftToken() == null ? null : ((token + .getLeftToken().getPos().indexOf("NN") > -1 + || token.getLeftToken().getPos().indexOf("JJ") > -1 || token + .getLeftToken().getPos().indexOf("V") > -1) ? lemma + + "_previousWord_".concat(token.getLeftToken().getLemma()) + : null); + // featureString.add(null == posLeftTokenStr ? new String[0] + // : new String[] { posLeftTokenStr }); + // after word + String rightTokenStr = token.getRightToken() == null ? 
null : (POS
+				.isPos(token.getRightToken().getPos()) ? "afterWord_"
+				.concat(token.getRightToken().getLemma()) : null);
+		// featureString.add(null == rightTokenStr ? new String[0]
+		// : new String[] { rightTokenStr });
+		// featureString.add(null == rightTokenStr ? new String[0]
+		// : new String[] { lemma.concat("_").concat(rightTokenStr) });
+		String posRightTokenStr = token.getRightToken() == null ? null : (token
+				.getRightToken().getPos().indexOf("NN") > -1) ? lemma
+				+ "_afterWord_".concat(token.getRightToken().getLemma()) : null;
+		// featureString.add(null == posRightTokenStr ? new String[0]
+		// : new String[] { posRightTokenStr });
+
+		// proteins in the sentence
+		String[] proteins = new String[sentenceProteins.size()];
+		String[] proteinsDummy = sentenceProteins.size() > 0 ? new String[] { "PROTEIN" }
+				: new String[0];
+		String[] proteinsLemma = new String[sentenceProteins.size()];
+		String[] proteinsDep = new String[sentenceProteins.size()];
+
+		int i = 0;
+		for (Protein protein : sentenceProteins) {
+
+			Token aProteinToken = getProteinToken(jcas, protein);
+
+			proteins[i] = "protein_"
+					+ protein.getCoveredText().toLowerCase()
+							.replaceAll(" ", "_");
+			proteinsLemma[i] = lemma.concat("_").concat(
+					protein.getCoveredText().toLowerCase());
+
+			proteinsDep[i] = dependencyExtractor.getShortestPath(token,
+					aProteinToken, null);
+
+			if (null == proteinsDep[i]) {
+				proteinsDep[i] = dependencyExtractor.getReversedShortestPath(
+						token, aProteinToken, null);
+			}
+			i++;
+		}
+		// featureString.add(proteins);
+		featureString.add(proteinsDummy);
+		// featureString.add(proteinsLemma);
+		boolean isDepNull = true;
+		for (String dep : proteinsDep) {
+			if (null != dep) {
+				isDepNull = false;
+				break;
+			}
+		}
+		// featureString.add(isDepNull ? new String[0] : proteinsDep);
+
+		if (null != triggerTokens) {
+
+			instance.setLabelString(triggerTokens.containsKey(token.getId()) ? 
triggerTokens + .get(token.getId()) : "Non_trigger"); + } else { + instance.setLabelString("Non_trigger"); + } + + return instance; + } + + // public void postProcessSentenceTokens(JCas jcas, List tokens, + // List sentenceProteins, Set pairsOfSentence) { + // + // int i = tokens.size() + 1; + // + // tokenCollectingLoop: for (Token token : tokens) { + // + // if (!POS.isPos(token.getPos())) { + // token.removeFromIndexes(); + // continue; + // } + // + // for (Protein protein : sentenceProteins) { + // // token is a protein + // if (protein.getBegin() == token.getBegin() + // && protein.getEnd() == token.getEnd()) { + // continue tokenCollectingLoop; + // } + // // token is within a protein + // if ((token.getBegin() >= protein.getBegin() && token.getBegin() < protein + // .getEnd()) + // || (token.getBegin() > protein.getBegin() && token + // .getBegin() <= protein.getEnd())) { + // continue tokenCollectingLoop; + // } + // // protein is within a token (tricky part) + // if ((token.getBegin() <= protein.getBegin() && token.getEnd() > protein + // .getEnd()) + // || (token.getBegin() < protein.getBegin() && token + // .getEnd() >= protein.getEnd())) { + // Token proteinToken = createNewToken(jcas, + // protein.getBegin(), protein.getEnd(), protein + // .getCoveredText().toLowerCase(), + // String.valueOf(POS.NN)); + // if (protein.getBegin() != token.getBegin()) { + // Token leftToken = createNewToken( + // jcas, + // token.getBegin(), + // protein.getBegin(), + // token.getCoveredText() + // .substring( + // 0, + // protein.getBegin() + // - token.getBegin()) + // .toLowerCase(), String.valueOf(POS.NN)); + // leftToken.setId(i++); + // leftToken.setLeftToken(token.getLeftToken()); + // leftToken.setRightToken(proteinToken); + // proteinToken.setLeftToken(leftToken); + // } else { + // proteinToken.setLeftToken(token.getLeftToken()); + // } + // + // if (protein.getEnd() != token.getEnd()) { + // Token rightToken = createNewToken( + // jcas, + // protein.getEnd(), + // token.getEnd(), + // token.getCoveredText() + // .substring( + // 0, + // token.getEnd() + // - protein.getEnd()) + // .toLowerCase(), String.valueOf(POS.NN)); + // + // // use the original id of the token for the last token + // rightToken.setId(token.getId()); + // proteinToken.setId(i++); + // rightToken.setLeftToken(proteinToken); + // proteinToken.setRightToken(rightToken); + // rightToken.setRightToken(token.getRightToken()); + // + // } else { + // proteinToken.setRightToken(token.getRightToken()); + // proteinToken.setId(token.getId()); + // } + // token.removeFromIndexes(); + // continue tokenCollectingLoop; + // } + // } + // } + // } + + private boolean isProtein(Token token, List proteinsOfSentence) { + for (Protein protein : proteinsOfSentence) { + if ((token.getBegin() >= protein.getBegin() && token.getEnd() <= protein + .getEnd()) + || (protein.getBegin() >= token.getBegin() && protein + .getEnd() <= token.getEnd())) { + return true; + } + } + return false; + } + + public static void main(String[] args) { + + TriggerInstances ti = new TriggerInstances(); + ti.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = ti.getInstances(new File(args[0]), "Trigger"); + + for (Instance instance : instances) { + System.out.print(instance.getLabelString()); + for (String[] features : instance.getFeaturesString()) { + for (String feature : features) { + System.out.print("\t".concat(feature)); + } + } + System.out.println(); + } + } + +} diff --git a/src/info/chenli/litway/exec/TriggerRecogniser.java 
b/src/info/chenli/litway/exec/TriggerRecogniser.java
new file mode 100644
index 0000000..c65c378
--- /dev/null
+++ b/src/info/chenli/litway/exec/TriggerRecogniser.java
@@ -0,0 +1,494 @@
+package info.chenli.litway.exec;
+
+import info.chenli.classifier.Accurary;
+import info.chenli.classifier.Fscore;
+import info.chenli.classifier.Instance;
+import info.chenli.classifier.InstanceDictionary;
+import info.chenli.classifier.LibLinearFacade;
+import info.chenli.litway.corpora.Protein;
+import info.chenli.litway.corpora.Sentence;
+import info.chenli.litway.corpora.Token;
+import info.chenli.litway.corpora.Trigger;
+import info.chenli.litway.util.DependencyExtractor;
+import info.chenli.litway.util.FileUtil;
+import info.chenli.litway.util.StanfordDependencyReader;
+import info.chenli.litway.util.StanfordDependencyReader.Pair;
+import info.chenli.litway.util.Timer;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeMap;
+import java.util.logging.Logger;
+
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * Trigger POS statistics on the training set: 1:CC, 1:CD, 2:DT, 8:IN, 290:JJ,
+ * 7:JJR, 2:JJS, 995:NN, 36:NNS, 4:RB, 3:RBR, 97:VB, 109:VBD, 99:VBG, 267:VBN,
+ * 25:VBP, 78:VBZ, 1:WRB
+ *
+ * @author Chen Li
+ *
+ */
+public class TriggerRecogniser extends LibLinearFacade {
+
+	private final static Logger logger = Logger
+			.getLogger(TriggerRecogniser.class.getName());
+
+	private final String classifierName = "liblinear";
+
+//	private static List<POS> consideredPOS = new ArrayList<POS>();
+//
+//	static {
+//		consideredPOS.add(POS.JJ);
+//		consideredPOS.add(POS.JJR);
+//		consideredPOS.add(POS.JJS);
+//		consideredPOS.add(POS.NN);
+//		consideredPOS.add(POS.NNS);
+//		consideredPOS.add(POS.VB);
+//		consideredPOS.add(POS.VBD);
+//		consideredPOS.add(POS.VBG);
+//		consideredPOS.add(POS.VBN);
+//		consideredPOS.add(POS.VBP);
+//		consideredPOS.add(POS.VBZ);
+//	};
+//
+//	public static boolean isConsidered(String pos) {
+//
+//		POS aPos = null;
+//		try {
+//			aPos = POS.valueOf(pos);
+//		} catch (IllegalArgumentException e) {
+//			return false;
+//		}
+//
+//		return isConsidered(aPos);
+//	}
+//
+//	public static boolean isConsidered(POS pos) {
+//
+//		for (POS aPos : consideredPOS) {
+//			if (aPos.equals(pos)) {
+//				return true;
+//			}
+//		}
+//
+//		return false;
+//	}
+
+	public void train(String trainingDir, int round) {
+		//
+		// collect all instances and fetch syntactic information
+		//
+		TriggerInstances trainingInstances = new TriggerInstances();
+		trainingInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+		List<Instance> instances = trainingInstances.getInstances(new File(
+				trainingDir));
+		logger.info(String.valueOf(instances.size()).concat(
+				" instances are collected."));
+
+		InstanceDictionary dict = new InstanceDictionary();
+		dict.creatNumericDictionary(instances);
+		dict.saveDictionary(new File("./model/triggers.".concat(classifierName)
+				.concat(".dict")));
+		logger.info("Save dictionary.");
+
+		trainingInstances.saveInstances(new File(
+				"./model/instances.trigger.txt"));
+		trainingInstances.saveNumericInstances(new File(
+				"./model/instances.trigger.num.txt"));
+		trainingInstances.saveSvmLightInstances(new File(
+				"./model/instances.trigger.svm.txt"));
+
+		// development instances
+		// TokenInstances devInstances = new TokenInstances();
+		// 
devInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + // List devInstancesList = devInstances.getInstances(new File( + // "./data/development/")); + // logger.info(String.valueOf(devInstancesList.size()).concat( + // " instances are collected.")); + // + // dict.instancesToNumeric(devInstancesList); + // + // devInstances.saveInstances(new File( + // "./model/instances.trigger.dev.txt")); + // devInstances.saveNumericInstances(new File( + // "./model/instances.trigger.num.dev.txt")); + // devInstances.saveSvmLightInstances(new File( + // "./model/instances.trigger.svm.dev.txt")); + // + // System.out.print("Finish collecting events."); + // System.exit(0); + + // Collections.shuffle(instances); + // logger.info("Shuffle instances."); + + Timer timer = new Timer(); + timer.start(); + + train(instances, round); + timer.stop(); + logger.info("Training takes ".concat(String.valueOf(timer + .getRunningTime()))); + + saveModel(new File("./model/triggers.".concat(classifierName).concat( + ".model"))); + + } + + public List predict(File file, InstanceDictionary dict, + boolean printConfusionMatrix) { + + List triggers = new ArrayList(); + + TriggerInstances instancesGetter = new TriggerInstances(); + instancesGetter.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + if (printConfusionMatrix) { + List instances = dict.instancesToNumeric(instancesGetter + .getInstances(file)); + Map> confusionMatrix = new TreeMap>(); + for (Instance instance : instances) { + int prediction = instance.getFeaturesNumeric().length == 0 ? dict + .getLabelNumeric("Non_trigger") : this + .predict(instance); + + // confusion matrix + if (!confusionMatrix.containsKey(instance.getLabel())) { + TreeMap values = new TreeMap(); + values.put(prediction, 1); + confusionMatrix.put(instance.getLabel(), values); + } else { + Map values = confusionMatrix.get(instance + .getLabel()); + if (!values.containsKey(prediction)) { + values.put(prediction, 1); + } else { + int count = values.get(prediction) + 1; + values.put(prediction, count); + } + } + } + printConfusionMatrix(confusionMatrix, dict); + } else { + + JCas jcas = instancesGetter.processSingleFile(file); + Map> pairsOfArticle = StanfordDependencyReader + .getPairs(new File(FileUtil.removeFileNameExtension( + file.getAbsolutePath()).concat(".sdepcc"))); + + FSIterator sentenceIter = jcas.getAnnotationIndex( + Sentence.type).iterator(); + int proteinNum = jcas.getAnnotationIndex(Protein.type).size(); + while (sentenceIter.hasNext()) { + + Sentence sentence = (Sentence) sentenceIter.next(); + // List originalTokens = JCasUtil.selectCovered(jcas, + // Token.class, sentence); + List sentenceProteins = JCasUtil.selectCovered(jcas, + Protein.class, sentence); + Set pairsOfSentence = pairsOfArticle + .get(sentence.getId()); + + // instancesGetter.postProcessSentenceTokens(jcas, + // originalTokens, + // sentenceProteins, pairsOfSentence); + List tokens = JCasUtil.selectCovered(jcas, Token.class, + sentence); + DependencyExtractor dependencyExtractor = new DependencyExtractor( + JCasUtil.selectCovered(jcas, Token.class, sentence), + pairsOfSentence); + for (Token token : tokens) { + Instance instance = instancesGetter.tokenToInstance(jcas, + token, null, tokens, sentenceProteins, + pairsOfSentence, dependencyExtractor); + + // constraints + String et = TriggerWord.getEventType(token.getStem()); + int prediction = dict.getLabelNumeric(et); + + if (null == et) { + instance = dict.instanceToNumeric(instance); + prediction = instance.getFeaturesNumeric().length == 0 ? 
dict
+							.getLabelNumeric("Non_trigger") : this
+							.predict(instance);
+					}
+
+					// if (token.getCoveredText().toLowerCase().indexOf("express") > -1
+					// || token.getCoveredText().toLowerCase()
+					// .indexOf("secret") > -1
+					// || token.getCoveredText().toLowerCase()
+					// .indexOf("produc") > -1) {
+					// System.out.println(instance.getLabel() + ":"
+					// + instance.getLabelString() + "\t"
+					// + token.getBegin() + ":" + token.getEnd());
+					// for (String[] feature : instance.getFeaturesString()) {
+					// for (String value : feature) {
+					// System.out.print("\t" + value);
+					// }
+					// }
+					// System.out.println();
+					// System.out.print(prediction);
+					// for (int value : instance.getFeaturesNumeric()) {
+					// System.out.print("\t" + value);
+					// }
+					// System.out.println();
+					// }
+					if (prediction != dict.getLabelNumeric("Non_trigger")) {
+						Trigger trigger = new Trigger(jcas, token.getBegin(),
+								token.getEnd());
+						trigger.setEventType(dict.getLabelString(prediction));
+						trigger.setId("T".concat(String.valueOf(++proteinNum)));
+						triggers.add(trigger);
+					}
+				}
+			}
+		}
+		return triggers;
+	}
+
+	private void printConfusionMatrix(
+			Map<Integer, Map<Integer, Integer>> confusionMatrix,
+			InstanceDictionary dict) {
+		// for (EventType goldType : EventType.values()) {
+		// System.out.print("\t".concat(String.valueOf(goldType)));
+		// }
+		// System.out.println();
+		// for (EventType goldType : EventType.values()) {
+		// System.out.print(String.valueOf(goldType));
+		// Map<Integer, Integer> predictions = confusionMatrix.get(dict
+		// .getLabelNumeric(String.valueOf(goldType)));
+		// for (EventType predictedType : EventType.values()) {
+		// System.out.print("\t");
+		// if (null != predictions) {
+		// System.out.print(String.valueOf(predictions.get(dict
+		// .getLabelNumeric(String.valueOf(predictedType)))));
+		// }
+		// }
+		// System.out.println();
+		// }
+	}
+
+	public void crossValidate(String dir) {
+		//
+		// collect all instances and fetch syntactic information
+		//
+		TriggerInstances trainingInstances = new TriggerInstances();
+		trainingInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml");
+		List<Instance> instances = trainingInstances
+				.getInstances(new File(dir));
+		logger.info(String.valueOf(instances.size()).concat(
+				" instances are collected."));
+
+		Collections.shuffle(instances);
+		trainingInstances.saveInstances(new File("./model/instances.csv"));
+		InstanceDictionary dictAll = new InstanceDictionary();
+		dictAll.creatNumericDictionary(instances);
+		trainingInstances.saveNumericInstances(new File(
+				"./model/instances.num.csv"));
+		trainingInstances.saveSvmLightInstances(new File(
+				"./model/instances.svm.csv"));
+		logger.info("Shuffle instances.");
+
+		//
+		// n-fold cross validation
+		//
+		int fold = 10;
+		int step = instances.size() / fold;
+
+		logger.info(String.valueOf(fold).concat(" fold cross validation."));
+
+		double recallSum = 0, precisionSum = 0;
+		for (int i = 0; i < fold; i++) {
+
+			logger.info(String.valueOf(i).concat(" fold cross validation."));
+
+			List<Instance> subTestingInstances = instances.subList(step * i,
+					step * (i + 1));
+			List<Instance> subTrainingInstances = new ArrayList<Instance>();
+			subTrainingInstances.addAll(instances.subList(0, step * i));
+			subTrainingInstances.addAll(instances.subList(step * (i + 1),
+					instances.size()));
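+			// e.g. with 1000 instances and fold = 10 (step = 100): iteration i
+			// tests on instances [100 * i, 100 * (i + 1)) and trains on the
+			// remaining 900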
+
+			InstanceDictionary dict = new InstanceDictionary();
+			dict.creatNumericDictionary(subTrainingInstances);
+			logger.info("Create dictionary.");
+			dict.saveDictionary(new File("./model/triggers." + i + ".dict"));
+			logger.info("Save dictionary.");
+
+			// Collections.shuffle(subTrainingInstances);
+			Collections.shuffle(subTestingInstances);
+
+			Timer timer = new Timer();
+			timer.start();
+
+			TriggerRecogniser tr = new TriggerRecogniser();
+			// tr.train(subTrainingInstances, 50);
+			tr.train(subTrainingInstances);
+			timer.stop();
+			logger.info(String.valueOf(i).concat(" fold training takes ")
+					.concat(String.valueOf(timer.getRunningTime())));
+
+			Fscore fscore = tr.test(subTestingInstances, dict, i);
+
+			recallSum = recallSum + fscore.getRecall();
+			precisionSum = precisionSum + fscore.getPrecision();
+
+			tr.saveModel(new File("./model/triggers.".concat(classifierName)
+					.concat("." + i + ".model")));
+		}
+
+		System.out.println(new Fscore(recallSum / fold, precisionSum / fold));
+
+	}
+
+	private Fscore test(List<Instance> instances, InstanceDictionary dict,
+			int counter) {
+
+		int tp = 0, fp = 0, tn = 0, fn = 0, correct = 0, total = 0;
+		StringBuffer tp_instances = new StringBuffer();
+		StringBuffer fp_nonTrigger_instances = new StringBuffer();
+		StringBuffer fp_trigger_instances = new StringBuffer();
+
+		// Collections.shuffle(instances);
+		for (Instance instance : instances) {
+
+			instance = dict.instanceToNumeric(instance);
+
+			int prediction = this.predict(instance);
+			if (instance.getLabelString().equals("Non_trigger")) {
+
+				if (prediction != instance.getLabel()) {
+					fp++;
+					total++;
+					fp_nonTrigger_instances.append(dict
+							.getLabelString(prediction).concat("\t")
+							.concat(instance.toString()).concat("\n"));
+				}
+			} else {
+				if (prediction != instance.getLabel()) {
+					fp++;
+					fn++;
+					fp_trigger_instances.append(dict.getLabelString(prediction)
+							.concat("\t").concat(instance.toString())
+							.concat("\n"));
+				} else {
+					correct++;
+					tp++;
+					tp_instances.append(dict.getLabelString(prediction)
+							.concat("\t").concat(instance.toString())
+							.concat("\n"));
+				}
+				if (prediction != dict.getLabelNumeric("Non_trigger")) {
+					total++;
+				}
+			}
+		}
+
+		FileUtil.saveFile(fp_nonTrigger_instances.toString(), new File(
+				"./result/fp_nonTrigger".concat(String.valueOf(counter))
+						.concat(".txt")));
+		FileUtil.saveFile(fp_trigger_instances.toString(),
+				new File("./result/fp_trigger".concat(String.valueOf(counter))
+						.concat(".txt")));
+		FileUtil.saveFile(tp_instances.toString(), new File("./result/tp"
+				.concat(String.valueOf(counter)).concat(".txt")));
+
+		Fscore fscore = new Fscore(tp, fp, tn, fn);
+		System.out.println(fscore);
+		System.out.println(new Accurary(correct, total));
+
+		return fscore;
+	}
+
+	public static void main(String[] args) {
+
+		if (args.length == 0 || (!args[0].equals("predict")
+				&& !args[0].equals("train") && !args[0].equals("test")
+				&& !args[0].equals("cross"))) {
+			throw new IllegalArgumentException(
+					"The first argument has to be \"predict\", \"train\", \"test\" or \"cross\".");
+		}
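+		// Expected invocations (paths below are examples only):
+		//   train   ./data/training/      - trains and saves model + dictionary
+		//   predict ./data/test/file.txt  - tags triggers in a file or directory
+		//   test    <any path>            - evaluates on ./data/development/
+		//   cross   ./data/training/      - 10-fold cross validation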
"); + } + tr.train(args[1], 1); + } else if (args[0].equals("predict")) { + + if (!file.isFile() && !file.isDirectory()) { + throw new IllegalArgumentException( + "The second argument has to be a file."); + } + + tr.loadModel(new File("./model/triggers.".concat(tr.classifierName) + .concat(".model"))); + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/triggers.".concat( + tr.classifierName).concat(".dict"))); + + List triggers = tr.predict(file, dict, false); + for (Trigger trigger : triggers) { + System.out.println(trigger.getId().concat("\t") + .concat(trigger.getEventType()).concat(" ") + .concat(String.valueOf(trigger.getBegin())).concat(" ") + .concat(String.valueOf(trigger.getEnd())).concat("\t") + .concat(trigger.getCoveredText())); + } + + } else if (args[0].equals("test")) { + + tr.loadModel(new File("./model/triggers.".concat(tr.classifierName) + .concat(".model"))); + + TriggerInstances testInstances = new TriggerInstances(); + testInstances.setTaeDescriptor("/desc/GeTrainingSetAnnotator.xml"); + List instances = testInstances.getInstances(new File( + "./data/development/")); + logger.info(String.valueOf(instances.size()).concat( + " instances are collected.")); + + InstanceDictionary dict = new InstanceDictionary(); + dict.loadDictionary(new File("./model/triggers.".concat( + tr.classifierName).concat(".dict"))); + dict.instancesToNumeric(instances); + + testInstances.saveInstances(new File( + "./model/instances.trigger.dev.txt")); + testInstances.saveNumericInstances(new File( + "./model/instances.trigger.num.dev.txt")); + testInstances.saveSvmLightInstances(new File( + "./model/instances.trigger.svm.dev.txt")); + + System.out.print("Finish collecting events."); + + int total = 0, correct = 0; + for (Instance instance : instances) { + int prediction = tr.predict(instance); + if (prediction == instance.getLabel()) { + correct++; + } + total++; + } + System.out.println(new Accurary(correct, total)); + } else if (args[0].equals("cross")) { + tr.crossValidate(args[1]); + } + } +} diff --git a/src/info/chenli/litway/exec/TriggerWord.java b/src/info/chenli/litway/exec/TriggerWord.java new file mode 100644 index 0000000..753c95c --- /dev/null +++ b/src/info/chenli/litway/exec/TriggerWord.java @@ -0,0 +1,48 @@ +package info.chenli.litway.exec; + +/** + * This is the place where trigger words can be pre-provided as constraints. 
+ * + * @author Chen Li + * + */ +public enum TriggerWord { + + absenc, activ, acetyl, affect, alter, assembl, associ, bind, bound, chang, control, correl, deacetyl, depend, deregul, detect, dimer, downstream, effect, engag, express, function, independ, induc, influenc, interact, involv, level, ligand, ligat, link, local, locat, modul, mrna, mutat, phospho, presen, produc, recogn, recruit, regul, releas, respons, role, secret, synthes, target, transcript, transloc, ubiquitin, unaffect; + + public static TriggerWord isATriggerWord(String word) { + + for (TriggerWord tw : TriggerWord.values()) { + + if (word.toLowerCase().startsWith(String.valueOf(tw))) { + return tw; + } + } + + return null; + } + + public static String getEventType(String word) { + + // if (word.startsWith(String.valueOf(TriggerWord.bind)) + // || word.startsWith(String.valueOf(TriggerWord.dimer)) + // || word.startsWith(String.valueOf(TriggerWord.ligat)) + // || word.startsWith(String.valueOf(TriggerWord.ligand)) + // || word.startsWith(String.valueOf(TriggerWord.interact))) { + // return EventType.Binding; + // } else if (word.indexOf(String.valueOf(TriggerWord.phospho)) > -1) { + // return EventType.Phosphorylation; + // } else if (word.indexOf(String.valueOf(TriggerWord.ubiquitin)) > -1) + // { + // return EventType.Ubiquitination; + // } else if (word.startsWith(String.valueOf(TriggerWord.acetyl))) { + // return EventType.Acetylation; + // } else if (word.startsWith(String.valueOf(TriggerWord.deacetyl))) { + // return EventType.Deacetylation; + // } else if (word.startsWith(String.valueOf(TriggerWord.induc))) { + // return EventType.Positive_regulation; + // } + // + return null; + } +} diff --git a/src/info/chenli/litway/util/ConnlxReader.java b/src/info/chenli/litway/util/ConnlxReader.java index 31127b3..2676171 100644 --- a/src/info/chenli/litway/util/ConnlxReader.java +++ b/src/info/chenli/litway/util/ConnlxReader.java @@ -85,6 +85,8 @@ public static List getTokens(File file) { Token token = new Token(); token.setId(Integer.valueOf(st.nextToken()).intValue()); token.setText(st.nextToken()); + //int i = token.getId(); + //String s = token.getText(); st.nextToken(); token.setPos(st.nextToken()); st.nextToken(); diff --git a/src/info/chenli/litway/util/DependencyExtractor.java b/src/info/chenli/litway/util/DependencyExtractor.java index 7eb4704..9d99d9a 100644 --- a/src/info/chenli/litway/util/DependencyExtractor.java +++ b/src/info/chenli/litway/util/DependencyExtractor.java @@ -47,7 +47,9 @@ public DependencyExtractor(List tokens, Set pairsOfSentence) { if (!tokenMap.containsKey(token.getId())) { tokenMap.put(token.getId(), token); + //System.out.println(token.getCoveredText()); } else { + //System.out.println(token.getCoveredText()); logger.warning("Duplicated token error."); } } @@ -64,7 +66,16 @@ public DependencyExtractor(List tokens, Set pairsOfSentence) { network.addEdge(token.getId(), pair.getModifier()); reversedNetwork.addEdge(pair.getModifier(), token.getId()); - + //if (pair.getRelation().equalsIgnoreCase("conj_and") + //|| pair.getRelation().equalsIgnoreCase("nsubj") + //|| pair.getRelation().equalsIgnoreCase("nsubjpass") + //|| pair.getRelation().equalsIgnoreCase("nn") + //|| pair.getRelation().equalsIgnoreCase("amod") + //) { + network.addEdge(pair.getModifier(), + token.getId()); + reversedNetwork.addEdge(token.getId(), pair.getModifier()); + //} } catch (IllegalArgumentException e) { logger.severe("The token couldn't be found."); } @@ -119,60 +130,231 @@ private String getDijkstraShortestPath(Token 
startToken, Token endToken, String dependencyPath = null; List edges = DijkstraShortestPath.findPathBetween( theNetwork, startToken.getId(), endToken.getId()); - + /*if (null != edges && edges.size() > 5) { + return dependencyPath; + }*/ + //int a = startToken.getId(); + //int b = endToken.getId(); if (null != edges && edges.size() > 0) { + //int del = edges.size(); for (DefaultEdge edge : edges) { for (Pair pair : dependencyPairs) { - if (theNetwork.getEdgeSource(edge) == pair.getHead() - && theNetwork.getEdgeTarget(edge) == pair - .getModifier()) { + if ((theNetwork.getEdgeSource(edge) == pair.getHead() + && theNetwork.getEdgeTarget(edge) == pair.getModifier()) + || (theNetwork.getEdgeSource(edge) == pair.getModifier() + && theNetwork.getEdgeTarget(edge) == pair.getHead())) { String relation = pair.getRelation(); - if (simplified) { + /*if (del > 1 && (relation.equals("nn") || relation.equals("amod"))) { + del--; + break; + }*/ + /*if (simplified) { if (relation.equals("nn") - || relation.equals("amond")) { + || relation.equals("amod")) { relation = "nmod"; } else if (relation.endsWith("subj")) { relation = "subj"; - } else if (relation.endsWith("subjpass")) { + } else if (relation.endsWith("subjpass")) { relation = "subjpass"; - // } else if (stage.equals(Stage.CAUSE) - // && relation.startsWith("prep")) { - // relation = "prep"; - } + } //else if (relation.startsWith("prep")) { + //relation = "prep"; + //} + }*/ + if ((((theNetwork.getEdgeSource(edge) == pair.getModifier() + && theNetwork.getEdgeTarget(edge) == pair.getHead()) + //&& (!pair.getRelation().equalsIgnoreCase("conj_and") + // && !pair.getRelation().equalsIgnoreCase("conj_or") + // && !pair.getRelation().equalsIgnoreCase("appos")) + ))) { + relation = "-".concat(relation); } if (null == dependencyPath) { dependencyPath = relation; } else { dependencyPath = dependencyPath.concat("_") - .concat(reversedNetwork ? 
"-" : "") .concat(relation); } } } } + /*if (stage.equals(Stage.THEME)) { + // create equal path for the tokens connected by "or","and" + if (null != dependencyPath) { + dependencyPath = dependencyPath.replaceAll("prep_of_conj_or", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-conj_or", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_conj_and", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-conj_and", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_appos", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-appos", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_nn", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-nn", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_amod", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-amod", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_nmod", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-nmod", "prep_of"); + + dependencyPath = dependencyPath.replaceAll("prep_for_conj_or", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-conj_or", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_conj_and", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-conj_and", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_appos", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-appos", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_nn", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-nn", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_amod", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-amod", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_nmod", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-nmod", "prep_for"); + + dependencyPath = dependencyPath.replaceAll("prep_between_conj_and", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_between_-conj_and", "prep_of"); - if (stage.equals(Stage.THEME)) { + dependencyPath = dependencyPath.replaceAll("prep_with_conj_or", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-conj_or", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_conj_and", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-conj_and", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_appos", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-appos", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_nn", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-nn", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_amod", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-amod", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_nmod", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-nmod", "prep_with"); + + dependencyPath = dependencyPath.replaceAll("prep_through_conj_or", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-conj_or", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_conj_and", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-conj_and", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_appos", 
"prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-appos", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_nn", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-nn", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_amod", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-amod", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_nmod", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-nmod", "prep_through"); + + dependencyPath = dependencyPath.replaceAll("prep_in_conj_or", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-conj_or", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_conj_and", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-conj_and", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_appos", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-appos", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_nn", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-nn", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_amod", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-amod", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_nmod", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-nmod", "prep_in"); + } + }else if (stage.equals(Stage.BINDING)) { // create equal path for the tokens connected by "or" - if (null != dependencyPath - && dependencyPath.indexOf("prep_of") > -1) { - if (dependencyPath.indexOf("conj_or_") > -1) { - dependencyPath = dependencyPath.replaceAll("conj_or_", - ""); - } else if (dependencyPath.indexOf("_conj_or") > -1) { - dependencyPath = dependencyPath.replaceAll("_conj_or", - ""); - } + if (null != dependencyPath) { + dependencyPath = dependencyPath.replaceAll("prep_of_conj_or", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-conj_or", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_conj_and", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-conj_and", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_appos", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-appos", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_nn", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-nn", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_amod", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-amod", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_nmod", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_of_-nmod", "prep_of"); + + dependencyPath = dependencyPath.replaceAll("prep_for_conj_or", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-conj_or", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_conj_and", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-conj_and", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_appos", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-appos", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_nn", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-nn", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_amod", "prep_for"); + dependencyPath = 
dependencyPath.replaceAll("prep_for_-amod", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_nmod", "prep_for"); + dependencyPath = dependencyPath.replaceAll("prep_for_-nmod", "prep_for"); + + dependencyPath = dependencyPath.replaceAll("prep_between_conj_and", "prep_of"); + dependencyPath = dependencyPath.replaceAll("prep_between_-conj_and", "prep_of"); + + dependencyPath = dependencyPath.replaceAll("prep_with_conj_or", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-conj_or", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_conj_and", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-conj_and", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_appos", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-appos", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_nn", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-nn", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_amod", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-amod", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_nmod", "prep_with"); + dependencyPath = dependencyPath.replaceAll("prep_with_-nmod", "prep_with"); + + dependencyPath = dependencyPath.replaceAll("prep_through_conj_or", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-conj_or", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_conj_and", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-conj_and", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_appos", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-appos", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_nn", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-nn", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_amod", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-amod", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_nmod", "prep_through"); + dependencyPath = dependencyPath.replaceAll("prep_through_-nmod", "prep_through"); + + dependencyPath = dependencyPath.replaceAll("prep_in_conj_or", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-conj_or", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_conj_and", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-conj_and", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_appos", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-appos", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_nn", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-nn", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_amod", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-amod", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_nmod", "prep_in"); + dependencyPath = dependencyPath.replaceAll("prep_in_-nmod", "prep_in"); +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + dependencyPath = dependencyPath.replaceAll("conj_or_", ""); + dependencyPath = dependencyPath.replaceAll("_conj_or", ""); + dependencyPath = dependencyPath.replaceAll("_-conj_or", ""); + 
dependencyPath = dependencyPath.replaceAll("-conj_or_", ""); + + dependencyPath = dependencyPath.replaceAll("conj_and_", ""); + dependencyPath = dependencyPath.replaceAll("_conj_and", ""); + dependencyPath = dependencyPath.replaceAll("_-conj_and", ""); + dependencyPath = dependencyPath.replaceAll("-conj_and_", ""); + + dependencyPath = dependencyPath.replaceAll("appos_", ""); + dependencyPath = dependencyPath.replaceAll("_appos", ""); + dependencyPath = dependencyPath.replaceAll("_-appos", ""); + dependencyPath = dependencyPath.replaceAll("-appos_", ""); + + dependencyPath = dependencyPath.replaceAll("nn_", ""); + dependencyPath = dependencyPath.replaceAll("_nn", ""); + dependencyPath = dependencyPath.replaceAll("_-nn", ""); + dependencyPath = dependencyPath.replaceAll("-nn_", ""); + + dependencyPath = dependencyPath.replaceAll("amod_", ""); + dependencyPath = dependencyPath.replaceAll("_amod", ""); + dependencyPath = dependencyPath.replaceAll("_-amod", ""); + dependencyPath = dependencyPath.replaceAll("-amod_", ""); + + dependencyPath = dependencyPath.replaceAll("dep_", ""); + dependencyPath = dependencyPath.replaceAll("_dep", ""); + dependencyPath = dependencyPath.replaceAll("-dep_", ""); + dependencyPath = dependencyPath.replaceAll("_-dep", ""); + + dependencyPath = dependencyPath.replaceAll("nmod_", ""); + dependencyPath = dependencyPath.replaceAll("_nmod", ""); + dependencyPath = dependencyPath.replaceAll("_-nmod", ""); + dependencyPath = dependencyPath.replaceAll("-nmod_", ""); } - // } else if (stage.equals(Stage.CAUSE)) { - // if (null != dependencyPath - // && dependencyPath.indexOf("dep") > -1) { - // if (dependencyPath.indexOf("dep_") > -1) { - // dependencyPath = dependencyPath.replaceAll("dep_", ""); - // } else if (dependencyPath.indexOf("_dep") > -1) { - // dependencyPath = dependencyPath.replaceAll("_dep", ""); - // } - // } - } + }*/ } return dependencyPath; @@ -188,4 +370,23 @@ private String getDijkstraShortestPath(Token startToken, Token endToken, public static void main(String[] args) { } + public int getDijkstraShortestPathLength(Token startToken, Token endToken) { + + if (null == tokenMap || null == startToken + || null == endToken // TODO check later + || !tokenMap.get(startToken.getId()).equals(startToken) + || !tokenMap.get(endToken.getId()).equals(endToken)) { + return 0; + // throw new + // RuntimeException("Tokens are not from the same sentence."); + } + + List edges = DijkstraShortestPath.findPathBetween( + network, startToken.getId(), endToken.getId()); + if (null != edges) { + return edges.size(); + }else { + return 0; + } + } } \ No newline at end of file diff --git a/src/info/chenli/litway/util/StanfordDependencyReader.java b/src/info/chenli/litway/util/StanfordDependencyReader.java index 5dddb54..5d02d3f 100644 --- a/src/info/chenli/litway/util/StanfordDependencyReader.java +++ b/src/info/chenli/litway/util/StanfordDependencyReader.java @@ -96,6 +96,9 @@ public static Map> getPairs(File file) { String pairString = line.substring(line.indexOf("(") + 1, line.lastIndexOf(")")); String head = pairString.substring(0, pairString.indexOf(", ")); + if (!head.contains("-")) { + head = pairString.substring(0, pairString.lastIndexOf(", ")); + } head = head.substring(head.lastIndexOf("-") + 1); if (head.endsWith("'")) { head = head.substring(0, head.indexOf("'")); diff --git a/src/info/chenli/litway/util/XMLUtil.java b/src/info/chenli/litway/util/XMLUtil.java new file mode 100644 index 0000000..cfe6ecc --- /dev/null +++ b/src/info/chenli/litway/util/XMLUtil.java @@ 
-0,0 +1,40 @@ +package info.chenli.litway.util; + +import java.io.File; +import java.io.IOException; +import java.util.logging.Logger; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.xml.sax.SAXException; + +public class XMLUtil { + + private final static Logger logger = Logger.getLogger(XMLUtil.class + .getName()); + + public static Document getDocument(File file) { + + try { + DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory + .newInstance(); + DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder(); + + return docBuilder.parse(file); + } catch (SAXException e) { + logger.severe(e.getMessage()); + throw new RuntimeException(e); + } catch (IOException e) { + logger.severe(e.getMessage()); + throw new RuntimeException(e); + } catch (ParserConfigurationException e) { + logger.severe(e.getMessage()); + throw new RuntimeException(e); + } + + } + +} diff --git a/src/libsvm/svm.java b/src/libsvm/svm.java new file mode 100644 index 0000000..e5f2dd5 --- /dev/null +++ b/src/libsvm/svm.java @@ -0,0 +1,2849 @@ + + + + + +package libsvm; +import java.io.*; +import java.util.*; + +// +// Kernel Cache +// +// l is the number of total data items +// size is the cache size limit in bytes +// +class Cache { + private final int l; + private long size; + private final class head_t + { + head_t prev, next; // a cicular list + float[] data; + int len; // data[0,len) is cached in this entry + } + private final head_t[] head; + private head_t lru_head; + + Cache(int l_, long size_) + { + l = l_; + size = size_; + head = new head_t[l]; + for(int i=0;i= len if nothing needs to be filled) + // java: simulate pointer using single-element array + int get_data(int index, float[][] data, int len) + { + head_t h = head[index]; + if(h.len > 0) lru_delete(h); + int more = len - h.len; + + if(more > 0) + { + // free old space + while(size < more) + { + head_t old = lru_head.next; + lru_delete(old); + size += old.len; + old.data = null; + old.len = 0; + } + + // allocate new space + float[] new_data = new float[len]; + if(h.data != null) System.arraycopy(h.data,0,new_data,0,h.len); + h.data = new_data; + size -= more; + do {int _=h.len; h.len=len; len=_;} while(false); + } + + lru_insert(h); + data[0] = h.data; + return len; + } + + void swap_index(int i, int j) + { + if(i==j) return; + + if(head[i].len > 0) lru_delete(head[i]); + if(head[j].len > 0) lru_delete(head[j]); + do {float[] _=head[i].data; head[i].data=head[j].data; head[j].data=_;} while(false); + do {int _=head[i].len; head[i].len=head[j].len; head[j].len=_;} while(false); + if(head[i].len > 0) lru_insert(head[i]); + if(head[j].len > 0) lru_insert(head[j]); + + if(i>j) do {int _=i; i=j; j=_;} while(false); + for(head_t h = lru_head.next; h!=lru_head; h=h.next) + { + if(h.len > i) + { + if(h.len > j) + do {float _=h.data[i]; h.data[i]=h.data[j]; h.data[j]=_;} while(false); + else + { + // give up + lru_delete(h); + size += h.len; + h.data = null; + h.len = 0; + } + } + } + } +} + +// +// Kernel evaluation +// +// the static method k_function is for doing single kernel evaluation +// the constructor of Kernel prepares to calculate the l*l kernel matrix +// the member function get_Q is for getting one column from the Q Matrix +// +abstract class QMatrix { + abstract float[] get_Q(int column, int len); + abstract double[] get_QD(); + abstract void swap_index(int i, int j); +}; + +abstract 
class Kernel extends QMatrix { + private svm_node[][] x; + private final double[] x_square; + + // svm_parameter + private final int kernel_type; + private final int degree; + private final double gamma; + private final double coef0; + + abstract float[] get_Q(int column, int len); + abstract double[] get_QD(); + + void swap_index(int i, int j) + { + do {svm_node[] _=x[i]; x[i]=x[j]; x[j]=_;} while(false); + if(x_square != null) do {double _=x_square[i]; x_square[i]=x_square[j]; x_square[j]=_;} while(false); + } + + private static double powi(double base, int times) + { + double tmp = base, ret = 1.0; + + for(int t=times; t>0; t/=2) + { + if(t%2==1) ret*=tmp; + tmp = tmp * tmp; + } + return ret; + } + + double kernel_function(int i, int j) + { + switch(kernel_type) + { + case svm_parameter.LINEAR: + return dot(x[i],x[j]); + case svm_parameter.POLY: + return powi(gamma*dot(x[i],x[j])+coef0,degree); + case svm_parameter.RBF: + return Math.exp(-gamma*(x_square[i]+x_square[j]-2*dot(x[i],x[j]))); + case svm_parameter.SIGMOID: + return Math.tanh(gamma*dot(x[i],x[j])+coef0); + case svm_parameter.PRECOMPUTED: + return x[i][(int)(x[j][0].value)].value; + default: + return 0; // java + } + } + + Kernel(int l, svm_node[][] x_, svm_parameter param) + { + this.kernel_type = param.kernel_type; + this.degree = param.degree; + this.gamma = param.gamma; + this.coef0 = param.coef0; + + x = (svm_node[][])x_.clone(); + + if(kernel_type == svm_parameter.RBF) + { + x_square = new double[l]; + for(int i=0;i y[j].index) + ++j; + else + ++i; + } + } + return sum; + } + + static double k_function(svm_node[] x, svm_node[] y, + svm_parameter param) + { + switch(param.kernel_type) + { + case svm_parameter.LINEAR: + return dot(x,y); + case svm_parameter.POLY: + return powi(param.gamma*dot(x,y)+param.coef0,param.degree); + case svm_parameter.RBF: + { + double sum = 0; + int xlen = x.length; + int ylen = y.length; + int i = 0; + int j = 0; + while(i < xlen && j < ylen) + { + if(x[i].index == y[j].index) + { + double d = x[i++].value - y[j++].value; + sum += d*d; + } + else if(x[i].index > y[j].index) + { + sum += y[j].value * y[j].value; + ++j; + } + else + { + sum += x[i].value * x[i].value; + ++i; + } + } + + while(i < xlen) + { + sum += x[i].value * x[i].value; + ++i; + } + + while(j < ylen) + { + sum += y[j].value * y[j].value; + ++j; + } + + return Math.exp(-param.gamma*sum); + } + case svm_parameter.SIGMOID: + return Math.tanh(param.gamma*dot(x,y)+param.coef0); + case svm_parameter.PRECOMPUTED: + return x[(int)(y[0].value)].value; + default: + return 0; // java + } + } +} + +// An SMO algorithm in Fan et al., JMLR 6(2005), p. 
1889--1918 +// Solves: +// +// min 0.5(\alpha^T Q \alpha) + p^T \alpha +// +// y^T \alpha = \delta +// y_i = +1 or -1 +// 0 <= alpha_i <= Cp for y_i = 1 +// 0 <= alpha_i <= Cn for y_i = -1 +// +// Given: +// +// Q, p, y, Cp, Cn, and an initial feasible point \alpha +// l is the size of vectors and matrices +// eps is the stopping tolerance +// +// solution will be put in \alpha, objective value will be put in obj +// +class Solver { + int active_size; + byte[] y; + double[] G; // gradient of objective function + static final byte LOWER_BOUND = 0; + static final byte UPPER_BOUND = 1; + static final byte FREE = 2; + byte[] alpha_status; // LOWER_BOUND, UPPER_BOUND, FREE + double[] alpha; + QMatrix Q; + double[] QD; + double eps; + double Cp,Cn; + double[] p; + int[] active_set; + double[] G_bar; // gradient, if we treat free variables as 0 + int l; + boolean unshrink; // XXX + + static final double INF = java.lang.Double.POSITIVE_INFINITY; + + double get_C(int i) + { + return (y[i] > 0)? Cp : Cn; + } + void update_alpha_status(int i) + { + if(alpha[i] >= get_C(i)) + alpha_status[i] = UPPER_BOUND; + else if(alpha[i] <= 0) + alpha_status[i] = LOWER_BOUND; + else alpha_status[i] = FREE; + } + boolean is_upper_bound(int i) { return alpha_status[i] == UPPER_BOUND; } + boolean is_lower_bound(int i) { return alpha_status[i] == LOWER_BOUND; } + boolean is_free(int i) { return alpha_status[i] == FREE; } + + // java: information about solution except alpha, + // because we cannot return multiple values otherwise... + static class SolutionInfo { + double obj; + double rho; + double upper_bound_p; + double upper_bound_n; + double r; // for Solver_NU + } + + void swap_index(int i, int j) + { + Q.swap_index(i,j); + do {byte _=y[i]; y[i]=y[j]; y[j]=_;} while(false); + do {double _=G[i]; G[i]=G[j]; G[j]=_;} while(false); + do {byte _=alpha_status[i]; alpha_status[i]=alpha_status[j]; alpha_status[j]=_;} while(false); + do {double _=alpha[i]; alpha[i]=alpha[j]; alpha[j]=_;} while(false); + do {double _=p[i]; p[i]=p[j]; p[j]=_;} while(false); + do {int _=active_set[i]; active_set[i]=active_set[j]; active_set[j]=_;} while(false); + do {double _=G_bar[i]; G_bar[i]=G_bar[j]; G_bar[j]=_;} while(false); + } + + void reconstruct_gradient() + { + // reconstruct inactive elements of G from G_bar and free variables + + if(active_size == l) return; + + int i,j; + int nr_free = 0; + + for(j=active_size;j 2*active_size*(l-active_size)) + { + for(i=active_size;iInteger.MAX_VALUE/100 ? 
Integer.MAX_VALUE : 100*l); + int counter = Math.min(l,1000)+1; + int[] working_set = new int[2]; + + while(iter < max_iter) + { + // show progress and do shrinking + + if(--counter == 0) + { + counter = Math.min(l,1000); + if(shrinking!=0) do_shrinking(); + svm.info("."); + } + + if(select_working_set(working_set)!=0) + { + // reconstruct the whole gradient + reconstruct_gradient(); + // reset active set size and check + active_size = l; + svm.info("*"); + if(select_working_set(working_set)!=0) + break; + else + counter = 1; // do shrinking next iteration + } + + int i = working_set[0]; + int j = working_set[1]; + + ++iter; + + // update alpha[i] and alpha[j], handle bounds carefully + + float[] Q_i = Q.get_Q(i,active_size); + float[] Q_j = Q.get_Q(j,active_size); + + double C_i = get_C(i); + double C_j = get_C(j); + + double old_alpha_i = alpha[i]; + double old_alpha_j = alpha[j]; + + if(y[i]!=y[j]) + { + double quad_coef = QD[i]+QD[j]+2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = 1e-12; + double delta = (-G[i]-G[j])/quad_coef; + double diff = alpha[i] - alpha[j]; + alpha[i] += delta; + alpha[j] += delta; + + if(diff > 0) + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = diff; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = -diff; + } + } + if(diff > C_i - C_j) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = C_i - diff; + } + } + else + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = C_j + diff; + } + } + } + else + { + double quad_coef = QD[i]+QD[j]-2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = 1e-12; + double delta = (G[i]-G[j])/quad_coef; + double sum = alpha[i] + alpha[j]; + alpha[i] -= delta; + alpha[j] += delta; + + if(sum > C_i) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = sum - C_i; + } + } + else + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = sum; + } + } + if(sum > C_j) + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = sum - C_j; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = sum; + } + } + } + + // update G + + double delta_alpha_i = alpha[i] - old_alpha_i; + double delta_alpha_j = alpha[j] - old_alpha_j; + + for(int k=0;k= max_iter) + { + if(active_size < l) + { + // reconstruct the whole gradient to calculate objective value + reconstruct_gradient(); + active_size = l; + svm.info("*"); + } + System.err.print("\nWARNING: reaching max number of iterations\n"); + } + + // calculate rho + + si.rho = calculate_rho(); + + // calculate objective value + { + double v = 0; + int i; + for(i=0;i= Gmax) + { + Gmax = -G[t]; + Gmax_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmax) + { + Gmax = G[t]; + Gmax_idx = t; + } + } + + int i = Gmax_idx; + float[] Q_i = null; + if(i != -1) // null Q_i not accessed: Gmax=-INF if i=-1 + Q_i = Q.get_Q(i,active_size); + + for(int j=0;j= Gmax2) + Gmax2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/1e-12; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff= Gmax-G[j]; + if (-G[j] >= Gmax2) + Gmax2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]+2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/1e-12; + + if (obj_diff <= obj_diff_min) + { + 
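// remember the second index j that promises the largest guaranteed
// decrease of the dual objective seen so far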
Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(Gmax+Gmax2 < eps) + return 1; + + working_set[0] = Gmax_idx; + working_set[1] = Gmin_idx; + return 0; + } + + private boolean be_shrunk(int i, double Gmax1, double Gmax2) + { + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax2); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax1); + } + else + return(false); + } + + void do_shrinking() + { + int i; + double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) } + + // find maximal violating pair first + for(i=0;i= Gmax1) + Gmax1 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax2) + Gmax2 = G[i]; + } + } + else + { + if(!is_upper_bound(i)) + { + if(-G[i] >= Gmax2) + Gmax2 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax1) + Gmax1 = G[i]; + } + } + } + + if(unshrink == false && Gmax1 + Gmax2 <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } + } + + double calculate_rho() + { + double r; + int nr_free = 0; + double ub = INF, lb = -INF, sum_free = 0; + for(int i=0;i 0) + ub = Math.min(ub,yG); + else + lb = Math.max(lb,yG); + } + else if(is_upper_bound(i)) + { + if(y[i] < 0) + ub = Math.min(ub,yG); + else + lb = Math.max(lb,yG); + } + else + { + ++nr_free; + sum_free += yG; + } + } + + if(nr_free>0) + r = sum_free/nr_free; + else + r = (ub+lb)/2; + + return r; + } + +} + +// +// Solver for nu-svm classification and regression +// +// additional constraint: e^T \alpha = constant +// +final class Solver_NU extends Solver +{ + private SolutionInfo si; + + void Solve(int l, QMatrix Q, double[] p, byte[] y, + double[] alpha, double Cp, double Cn, double eps, + SolutionInfo si, int shrinking) + { + this.si = si; + super.Solve(l,Q,p,y,alpha,Cp,Cn,eps,si,shrinking); + } + + // return 1 if already optimal, return 0 otherwise + int select_working_set(int[] working_set) + { + // return i,j such that y_i = y_j and + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficeint <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmaxp = -INF; + double Gmaxp2 = -INF; + int Gmaxp_idx = -1; + + double Gmaxn = -INF; + double Gmaxn2 = -INF; + int Gmaxn_idx = -1; + + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmaxp) + { + Gmaxp = -G[t]; + Gmaxp_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmaxn) + { + Gmaxn = G[t]; + Gmaxn_idx = t; + } + } + + int ip = Gmaxp_idx; + int in = Gmaxn_idx; + float[] Q_ip = null; + float[] Q_in = null; + if(ip != -1) // null Q_ip not accessed: Gmaxp=-INF if ip=-1 + Q_ip = Q.get_Q(ip,active_size); + if(in != -1) + Q_in = Q.get_Q(in,active_size); + + for(int j=0;j= Gmaxp2) + Gmaxp2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[ip]+QD[j]-2*Q_ip[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/1e-12; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff=Gmaxn-G[j]; + if (-G[j] >= Gmaxn2) + Gmaxn2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = 
QD[in]+QD[j]-2*Q_in[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/1e-12; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(Math.max(Gmaxp+Gmaxp2,Gmaxn+Gmaxn2) < eps) + return 1; + + if(y[Gmin_idx] == +1) + working_set[0] = Gmaxp_idx; + else + working_set[0] = Gmaxn_idx; + working_set[1] = Gmin_idx; + + return 0; + } + + private boolean be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4) + { + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax4); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax3); + } + else + return(false); + } + + void do_shrinking() + { + double Gmax1 = -INF; // max { -y_i * grad(f)_i | y_i = +1, i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | y_i = +1, i in I_low(\alpha) } + double Gmax3 = -INF; // max { -y_i * grad(f)_i | y_i = -1, i in I_up(\alpha) } + double Gmax4 = -INF; // max { y_i * grad(f)_i | y_i = -1, i in I_low(\alpha) } + + // find maximal violating pair first + int i; + for(i=0;i Gmax1) Gmax1 = -G[i]; + } + else if(-G[i] > Gmax4) Gmax4 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(y[i]==+1) + { + if(G[i] > Gmax2) Gmax2 = G[i]; + } + else if(G[i] > Gmax3) Gmax3 = G[i]; + } + } + + if(unshrink == false && Math.max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2, Gmax3, Gmax4)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } + } + + double calculate_rho() + { + int nr_free1 = 0,nr_free2 = 0; + double ub1 = INF, ub2 = INF; + double lb1 = -INF, lb2 = -INF; + double sum_free1 = 0, sum_free2 = 0; + + for(int i=0;i 0) + r1 = sum_free1/nr_free1; + else + r1 = (ub1+lb1)/2; + + if(nr_free2 > 0) + r2 = sum_free2/nr_free2; + else + r2 = (ub2+lb2)/2; + + si.r = (r1+r2)/2; + return (r1-r2)/2; + } +} + +// +// Q matrices for various formulations +// +class SVC_Q extends Kernel +{ + private final byte[] y; + private final Cache cache; + private final double[] QD; + + SVC_Q(svm_problem prob, svm_parameter param, byte[] y_) + { + super(prob.l, prob.x, param); + y = (byte[])y_.clone(); + cache = new Cache(prob.l,(long)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i 0) y[i] = +1; else y[i] = -1; + } + + Solver s = new Solver(); + s.Solve(l, new SVC_Q(prob,param,y), minus_ones, y, + alpha, Cp, Cn, param.eps, si, param.shrinking); + + double sum_alpha=0; + for(i=0;i0) + y[i] = +1; + else + y[i] = -1; + + double sum_pos = nu*l/2; + double sum_neg = nu*l/2; + + for(i=0;i 0) + { + ++nSV; + if(prob.y[i] > 0) + { + if(Math.abs(alpha[i]) >= si.upper_bound_p) + ++nBSV; + } + else + { + if(Math.abs(alpha[i]) >= si.upper_bound_n) + ++nBSV; + } + } + } + + svm.info("nSV = "+nSV+", nBSV = "+nBSV+"\n"); + + decision_function f = new decision_function(); + f.alpha = alpha; + f.rho = si.rho; + return f; + } + + // Platt's binary SVM Probablistic Output: an improvement from Lin et al. 
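// (The fit below runs Newton's method with a backtracking line search on the
// parameters A and B of P(y=1|f) = 1/(1+exp(A*f+B)); the branches on the
// sign of fApB are the standard overflow-safe evaluations of log(1+exp(.))
// and of the sigmoid itself.)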
+ private static void sigmoid_train(int l, double[] dec_values, double[] labels, + double[] probAB) + { + double A, B; + double prior1=0, prior0 = 0; + int i; + + for (i=0;i 0) prior1+=1; + else prior0+=1; + + int max_iter=100; // Maximal number of iterations + double min_step=1e-10; // Minimal step taken in line search + double sigma=1e-12; // For numerically strict PD of Hessian + double eps=1e-5; + double hiTarget=(prior1+1.0)/(prior1+2.0); + double loTarget=1/(prior0+2.0); + double[] t= new double[l]; + double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize; + double newA,newB,newf,d1,d2; + int iter; + + // Initial Point and Initial Fun Value + A=0.0; B=Math.log((prior0+1.0)/(prior1+1.0)); + double fval = 0.0; + + for (i=0;i0) t[i]=hiTarget; + else t[i]=loTarget; + fApB = dec_values[i]*A+B; + if (fApB>=0) + fval += t[i]*fApB + Math.log(1+Math.exp(-fApB)); + else + fval += (t[i] - 1)*fApB +Math.log(1+Math.exp(fApB)); + } + for (iter=0;iter= 0) + { + p=Math.exp(-fApB)/(1.0+Math.exp(-fApB)); + q=1.0/(1.0+Math.exp(-fApB)); + } + else + { + p=1.0/(1.0+Math.exp(fApB)); + q=Math.exp(fApB)/(1.0+Math.exp(fApB)); + } + d2=p*q; + h11+=dec_values[i]*dec_values[i]*d2; + h22+=d2; + h21+=dec_values[i]*d2; + d1=t[i]-p; + g1+=dec_values[i]*d1; + g2+=d1; + } + + // Stopping Criteria + if (Math.abs(g1)= min_step) + { + newA = A + stepsize * dA; + newB = B + stepsize * dB; + + // New function value + newf = 0.0; + for (i=0;i= 0) + newf += t[i]*fApB + Math.log(1+Math.exp(-fApB)); + else + newf += (t[i] - 1)*fApB +Math.log(1+Math.exp(fApB)); + } + // Check sufficient decrease + if (newf=max_iter) + svm.info("Reaching maximal iterations in two-class probability estimates\n"); + probAB[0]=A;probAB[1]=B; + } + + private static double sigmoid_predict(double decision_value, double A, double B) + { + double fApB = decision_value*A+B; + if (fApB >= 0) + return Math.exp(-fApB)/(1.0+Math.exp(-fApB)); + else + return 1.0/(1+Math.exp(fApB)) ; + } + + // Method 2 from the multiclass_prob paper by Wu, Lin, and Weng + private static void multiclass_probability(int k, double[][] r, double[] p) + { + int t,j; + int iter = 0, max_iter=Math.max(100,k); + double[][] Q=new double[k][k]; + double[] Qp=new double[k]; + double pQp, eps=0.005/k; + + for (t=0;tmax_error) + max_error=error; + } + if (max_error=max_iter) + svm.info("Exceeds max_iter in multiclass_prob\n"); + } + + // Cross-validation decision values for probability estimates + private static void svm_binary_svc_probability(svm_problem prob, svm_parameter param, double Cp, double Cn, double[] probAB) + { + int i; + int nr_fold = 5; + int[] perm = new int[prob.l]; + double[] dec_values = new double[prob.l]; + + // random shuffle + for(i=0;i0) + p_count++; + else + n_count++; + + if(p_count==0 && n_count==0) + for(j=begin;j 0 && n_count == 0) + for(j=begin;j 0) + for(j=begin;j 5*std) + count=count+1; + else + mae+=Math.abs(ymv[i]); + mae /= (prob.l-count); + svm.info("Prob. 
model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma="+mae+"\n"); + return mae; + } + + // label: label name, start: begin of each class, count: #data of classes, perm: indices to the original data + // perm, length l, must be allocated before calling this subroutine + private static void svm_group_classes(svm_problem prob, int[] nr_class_ret, int[][] label_ret, int[][] start_ret, int[][] count_ret, int[] perm) + { + int l = prob.l; + int max_nr_class = 16; + int nr_class = 0; + int[] label = new int[max_nr_class]; + int[] count = new int[max_nr_class]; + int[] data_label = new int[l]; + int i; + + for(i=0;i 0) ++nSV; + model.l = nSV; + model.SV = new svm_node[nSV][]; + model.sv_coef[0] = new double[nSV]; + model.sv_indices = new int[nSV]; + int j = 0; + for(i=0;i 0) + { + model.SV[j] = prob.x[i]; + model.sv_coef[0][j] = f.alpha[i]; + model.sv_indices[j] = i+1; + ++j; + } + } + else + { + // classification + int l = prob.l; + int[] tmp_nr_class = new int[1]; + int[][] tmp_label = new int[1][]; + int[][] tmp_start = new int[1][]; + int[][] tmp_count = new int[1][]; + int[] perm = new int[l]; + + // group training data of the same class + svm_group_classes(prob,tmp_nr_class,tmp_label,tmp_start,tmp_count,perm); + int nr_class = tmp_nr_class[0]; + int[] label = tmp_label[0]; + int[] start = tmp_start[0]; + int[] count = tmp_count[0]; + + if(nr_class == 1) + svm.info("WARNING: training data in only one class. See README for details.\n"); + + svm_node[][] x = new svm_node[l][]; + int i; + for(i=0;i 0) + nonzero[si+k] = true; + for(k=0;k 0) + nonzero[sj+k] = true; + ++p; + } + + // build output + + model.nr_class = nr_class; + + model.label = new int[nr_class]; + for(i=0;i some folds may have zero elements + if((param.svm_type == svm_parameter.C_SVC || + param.svm_type == svm_parameter.NU_SVC) && nr_fold < l) + { + int[] tmp_nr_class = new int[1]; + int[][] tmp_label = new int[1][]; + int[][] tmp_start = new int[1][]; + int[][] tmp_count = new int[1][]; + + svm_group_classes(prob,tmp_nr_class,tmp_label,tmp_start,tmp_count,perm); + + int nr_class = tmp_nr_class[0]; + int[] start = tmp_start[0]; + int[] count = tmp_count[0]; + + // random shuffle and then data grouped by fold using the array perm + int[] fold_count = new int[nr_fold]; + int c; + int[] index = new int[l]; + for(i=0;i0)?1:-1; + else + return sum; + } + else + { + int nr_class = model.nr_class; + int l = model.l; + + double[] kvalue = new double[l]; + for(i=0;i 0) + ++vote[i]; + else + ++vote[j]; + p++; + } + + int vote_max_idx = 0; + for(i=1;i vote[vote_max_idx]) + vote_max_idx = i; + + return model.label[vote_max_idx]; + } + } + + public static double svm_predict(svm_model model, svm_node[] x) + { + int nr_class = model.nr_class; + double[] dec_values; + if(model.param.svm_type == svm_parameter.ONE_CLASS || + model.param.svm_type == svm_parameter.EPSILON_SVR || + model.param.svm_type == svm_parameter.NU_SVR) + dec_values = new double[1]; + else + dec_values = new double[nr_class*(nr_class-1)/2]; + double pred_result = svm_predict_values(model, x, dec_values); + return pred_result; + } + + public static double svm_predict_probability(svm_model model, svm_node[] x, double[] prob_estimates) + { + if ((model.param.svm_type == svm_parameter.C_SVC || model.param.svm_type == svm_parameter.NU_SVC) && + model.probA!=null && model.probB!=null) + { + int i; + int nr_class = model.nr_class; + double[] dec_values = new double[nr_class*(nr_class-1)/2]; + svm_predict_values(model, x, 
dec_values); + + double min_prob=1e-7; + double[][] pairwise_prob=new double[nr_class][nr_class]; + + int k=0; + for(i=0;i prob_estimates[prob_max_idx]) + prob_max_idx = i; + return model.label[prob_max_idx]; + } + else + return svm_predict(model, x); + } + + static final String svm_type_table[] = + { + "c_svc","nu_svc","one_class","epsilon_svr","nu_svr", + }; + + static final String kernel_type_table[]= + { + "linear","polynomial","rbf","sigmoid","precomputed" + }; + + public static void svm_save_model(String model_file_name, svm_model model) throws IOException + { + DataOutputStream fp = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(model_file_name))); + + svm_parameter param = model.param; + + fp.writeBytes("svm_type "+svm_type_table[param.svm_type]+"\n"); + fp.writeBytes("kernel_type "+kernel_type_table[param.kernel_type]+"\n"); + + if(param.kernel_type == svm_parameter.POLY) + fp.writeBytes("degree "+param.degree+"\n"); + + if(param.kernel_type == svm_parameter.POLY || + param.kernel_type == svm_parameter.RBF || + param.kernel_type == svm_parameter.SIGMOID) + fp.writeBytes("gamma "+param.gamma+"\n"); + + if(param.kernel_type == svm_parameter.POLY || + param.kernel_type == svm_parameter.SIGMOID) + fp.writeBytes("coef0 "+param.coef0+"\n"); + + int nr_class = model.nr_class; + int l = model.l; + fp.writeBytes("nr_class "+nr_class+"\n"); + fp.writeBytes("total_sv "+l+"\n"); + + { + fp.writeBytes("rho"); + for(int i=0;i 1) + return "nu <= 0 or nu > 1"; + + if(svm_type == svm_parameter.EPSILON_SVR) + if(param.p < 0) + return "p < 0"; + + if(param.shrinking != 0 && + param.shrinking != 1) + return "shrinking != 0 and shrinking != 1"; + + if(param.probability != 0 && + param.probability != 1) + return "probability != 0 and probability != 1"; + + if(param.probability == 1 && + svm_type == svm_parameter.ONE_CLASS) + return "one-class SVM probability output not supported yet"; + + // check whether nu-svc is feasible + + if(svm_type == svm_parameter.NU_SVC) + { + int l = prob.l; + int max_nr_class = 16; + int nr_class = 0; + int[] label = new int[max_nr_class]; + int[] count = new int[max_nr_class]; + + int i; + for(i=0;i Math.min(n1,n2)) + return "specified nu is infeasible"; + } + } + } + + return null; + } + + public static int svm_check_probability_model(svm_model model) + { + if (((model.param.svm_type == svm_parameter.C_SVC || model.param.svm_type == svm_parameter.NU_SVC) && + model.probA!=null && model.probB!=null) || + ((model.param.svm_type == svm_parameter.EPSILON_SVR || model.param.svm_type == svm_parameter.NU_SVR) && + model.probA!=null)) + return 1; + else + return 0; + } + + public static void svm_set_print_string_function(svm_print_interface print_func) + { + if (print_func == null) + svm_print_string = svm_print_stdout; + else + svm_print_string = print_func; + } +} diff --git a/src/libsvm/svm.m4 b/src/libsvm/svm.m4 new file mode 100644 index 0000000..028b63c --- /dev/null +++ b/src/libsvm/svm.m4 @@ -0,0 +1,2849 @@ +define(`swap',`do {$1 _=$2; $2=$3; $3=_;} while(false)') +define(`Qfloat',`float') +define(`SIZE_OF_QFLOAT',4) +define(`TAU',1e-12) +changecom(`//', ) +package libsvm; +import java.io.*; +import java.util.*; + +// +// Kernel Cache +// +// l is the number of total data items +// size is the cache size limit in bytes +// +class Cache { + private final int l; + private long size; + private final class head_t + { + head_t prev, next; // a cicular list + Qfloat[] data; + int len; // data[0,len) is cached in this entry + } + private final head_t[] head; 
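// (m4 note: svm.java above is generated from this template; the swap(T,a,b),
// Qfloat and TAU macros defined at the top expand to an inline exchange,
// float, and 1e-12 respectively.)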
+ private head_t lru_head; + + Cache(int l_, long size_) + { + l = l_; + size = size_; + head = new head_t[l]; + for(int i=0;i= len if nothing needs to be filled) + // java: simulate pointer using single-element array + int get_data(int index, Qfloat[][] data, int len) + { + head_t h = head[index]; + if(h.len > 0) lru_delete(h); + int more = len - h.len; + + if(more > 0) + { + // free old space + while(size < more) + { + head_t old = lru_head.next; + lru_delete(old); + size += old.len; + old.data = null; + old.len = 0; + } + + // allocate new space + Qfloat[] new_data = new Qfloat[len]; + if(h.data != null) System.arraycopy(h.data,0,new_data,0,h.len); + h.data = new_data; + size -= more; + swap(int,h.len,len); + } + + lru_insert(h); + data[0] = h.data; + return len; + } + + void swap_index(int i, int j) + { + if(i==j) return; + + if(head[i].len > 0) lru_delete(head[i]); + if(head[j].len > 0) lru_delete(head[j]); + swap(Qfloat[],head[i].data,head[j].data); + swap(int,head[i].len,head[j].len); + if(head[i].len > 0) lru_insert(head[i]); + if(head[j].len > 0) lru_insert(head[j]); + + if(i>j) swap(int,i,j); + for(head_t h = lru_head.next; h!=lru_head; h=h.next) + { + if(h.len > i) + { + if(h.len > j) + swap(Qfloat,h.data[i],h.data[j]); + else + { + // give up + lru_delete(h); + size += h.len; + h.data = null; + h.len = 0; + } + } + } + } +} + +// +// Kernel evaluation +// +// the static method k_function is for doing single kernel evaluation +// the constructor of Kernel prepares to calculate the l*l kernel matrix +// the member function get_Q is for getting one column from the Q Matrix +// +abstract class QMatrix { + abstract Qfloat[] get_Q(int column, int len); + abstract double[] get_QD(); + abstract void swap_index(int i, int j); +}; + +abstract class Kernel extends QMatrix { + private svm_node[][] x; + private final double[] x_square; + + // svm_parameter + private final int kernel_type; + private final int degree; + private final double gamma; + private final double coef0; + + abstract Qfloat[] get_Q(int column, int len); + abstract double[] get_QD(); + + void swap_index(int i, int j) + { + swap(svm_node[],x[i],x[j]); + if(x_square != null) swap(double,x_square[i],x_square[j]); + } + + private static double powi(double base, int times) + { + double tmp = base, ret = 1.0; + + for(int t=times; t>0; t/=2) + { + if(t%2==1) ret*=tmp; + tmp = tmp * tmp; + } + return ret; + } + + double kernel_function(int i, int j) + { + switch(kernel_type) + { + case svm_parameter.LINEAR: + return dot(x[i],x[j]); + case svm_parameter.POLY: + return powi(gamma*dot(x[i],x[j])+coef0,degree); + case svm_parameter.RBF: + return Math.exp(-gamma*(x_square[i]+x_square[j]-2*dot(x[i],x[j]))); + case svm_parameter.SIGMOID: + return Math.tanh(gamma*dot(x[i],x[j])+coef0); + case svm_parameter.PRECOMPUTED: + return x[i][(int)(x[j][0].value)].value; + default: + return 0; // java + } + } + + Kernel(int l, svm_node[][] x_, svm_parameter param) + { + this.kernel_type = param.kernel_type; + this.degree = param.degree; + this.gamma = param.gamma; + this.coef0 = param.coef0; + + x = (svm_node[][])x_.clone(); + + if(kernel_type == svm_parameter.RBF) + { + x_square = new double[l]; + for(int i=0;i y[j].index) + ++j; + else + ++i; + } + } + return sum; + } + + static double k_function(svm_node[] x, svm_node[] y, + svm_parameter param) + { + switch(param.kernel_type) + { + case svm_parameter.LINEAR: + return dot(x,y); + case svm_parameter.POLY: + return powi(param.gamma*dot(x,y)+param.coef0,param.degree); + case svm_parameter.RBF: + 
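// RBF: accumulate the squared distance over the two sparse vectors,
// then exponentiate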
{ + double sum = 0; + int xlen = x.length; + int ylen = y.length; + int i = 0; + int j = 0; + while(i < xlen && j < ylen) + { + if(x[i].index == y[j].index) + { + double d = x[i++].value - y[j++].value; + sum += d*d; + } + else if(x[i].index > y[j].index) + { + sum += y[j].value * y[j].value; + ++j; + } + else + { + sum += x[i].value * x[i].value; + ++i; + } + } + + while(i < xlen) + { + sum += x[i].value * x[i].value; + ++i; + } + + while(j < ylen) + { + sum += y[j].value * y[j].value; + ++j; + } + + return Math.exp(-param.gamma*sum); + } + case svm_parameter.SIGMOID: + return Math.tanh(param.gamma*dot(x,y)+param.coef0); + case svm_parameter.PRECOMPUTED: + return x[(int)(y[0].value)].value; + default: + return 0; // java + } + } +} + +// An SMO algorithm in Fan et al., JMLR 6(2005), p. 1889--1918 +// Solves: +// +// min 0.5(\alpha^T Q \alpha) + p^T \alpha +// +// y^T \alpha = \delta +// y_i = +1 or -1 +// 0 <= alpha_i <= Cp for y_i = 1 +// 0 <= alpha_i <= Cn for y_i = -1 +// +// Given: +// +// Q, p, y, Cp, Cn, and an initial feasible point \alpha +// l is the size of vectors and matrices +// eps is the stopping tolerance +// +// solution will be put in \alpha, objective value will be put in obj +// +class Solver { + int active_size; + byte[] y; + double[] G; // gradient of objective function + static final byte LOWER_BOUND = 0; + static final byte UPPER_BOUND = 1; + static final byte FREE = 2; + byte[] alpha_status; // LOWER_BOUND, UPPER_BOUND, FREE + double[] alpha; + QMatrix Q; + double[] QD; + double eps; + double Cp,Cn; + double[] p; + int[] active_set; + double[] G_bar; // gradient, if we treat free variables as 0 + int l; + boolean unshrink; // XXX + + static final double INF = java.lang.Double.POSITIVE_INFINITY; + + double get_C(int i) + { + return (y[i] > 0)? Cp : Cn; + } + void update_alpha_status(int i) + { + if(alpha[i] >= get_C(i)) + alpha_status[i] = UPPER_BOUND; + else if(alpha[i] <= 0) + alpha_status[i] = LOWER_BOUND; + else alpha_status[i] = FREE; + } + boolean is_upper_bound(int i) { return alpha_status[i] == UPPER_BOUND; } + boolean is_lower_bound(int i) { return alpha_status[i] == LOWER_BOUND; } + boolean is_free(int i) { return alpha_status[i] == FREE; } + + // java: information about solution except alpha, + // because we cannot return multiple values otherwise... + static class SolutionInfo { + double obj; + double rho; + double upper_bound_p; + double upper_bound_n; + double r; // for Solver_NU + } + + void swap_index(int i, int j) + { + Q.swap_index(i,j); + swap(byte, y[i],y[j]); + swap(double, G[i],G[j]); + swap(byte, alpha_status[i],alpha_status[j]); + swap(double, alpha[i],alpha[j]); + swap(double, p[i],p[j]); + swap(int, active_set[i],active_set[j]); + swap(double, G_bar[i],G_bar[j]); + } + + void reconstruct_gradient() + { + // reconstruct inactive elements of G from G_bar and free variables + + if(active_size == l) return; + + int i,j; + int nr_free = 0; + + for(j=active_size;j 2*active_size*(l-active_size)) + { + for(i=active_size;iInteger.MAX_VALUE/100 ? 
Integer.MAX_VALUE : 100*l); + int counter = Math.min(l,1000)+1; + int[] working_set = new int[2]; + + while(iter < max_iter) + { + // show progress and do shrinking + + if(--counter == 0) + { + counter = Math.min(l,1000); + if(shrinking!=0) do_shrinking(); + svm.info("."); + } + + if(select_working_set(working_set)!=0) + { + // reconstruct the whole gradient + reconstruct_gradient(); + // reset active set size and check + active_size = l; + svm.info("*"); + if(select_working_set(working_set)!=0) + break; + else + counter = 1; // do shrinking next iteration + } + + int i = working_set[0]; + int j = working_set[1]; + + ++iter; + + // update alpha[i] and alpha[j], handle bounds carefully + + Qfloat[] Q_i = Q.get_Q(i,active_size); + Qfloat[] Q_j = Q.get_Q(j,active_size); + + double C_i = get_C(i); + double C_j = get_C(j); + + double old_alpha_i = alpha[i]; + double old_alpha_j = alpha[j]; + + if(y[i]!=y[j]) + { + double quad_coef = QD[i]+QD[j]+2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (-G[i]-G[j])/quad_coef; + double diff = alpha[i] - alpha[j]; + alpha[i] += delta; + alpha[j] += delta; + + if(diff > 0) + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = diff; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = -diff; + } + } + if(diff > C_i - C_j) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = C_i - diff; + } + } + else + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = C_j + diff; + } + } + } + else + { + double quad_coef = QD[i]+QD[j]-2*Q_i[j]; + if (quad_coef <= 0) + quad_coef = TAU; + double delta = (G[i]-G[j])/quad_coef; + double sum = alpha[i] + alpha[j]; + alpha[i] -= delta; + alpha[j] += delta; + + if(sum > C_i) + { + if(alpha[i] > C_i) + { + alpha[i] = C_i; + alpha[j] = sum - C_i; + } + } + else + { + if(alpha[j] < 0) + { + alpha[j] = 0; + alpha[i] = sum; + } + } + if(sum > C_j) + { + if(alpha[j] > C_j) + { + alpha[j] = C_j; + alpha[i] = sum - C_j; + } + } + else + { + if(alpha[i] < 0) + { + alpha[i] = 0; + alpha[j] = sum; + } + } + } + + // update G + + double delta_alpha_i = alpha[i] - old_alpha_i; + double delta_alpha_j = alpha[j] - old_alpha_j; + + for(int k=0;k= max_iter) + { + if(active_size < l) + { + // reconstruct the whole gradient to calculate objective value + reconstruct_gradient(); + active_size = l; + svm.info("*"); + } + System.err.print("\nWARNING: reaching max number of iterations\n"); + } + + // calculate rho + + si.rho = calculate_rho(); + + // calculate objective value + { + double v = 0; + int i; + for(i=0;i= Gmax) + { + Gmax = -G[t]; + Gmax_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmax) + { + Gmax = G[t]; + Gmax_idx = t; + } + } + + int i = Gmax_idx; + Qfloat[] Q_i = null; + if(i != -1) // null Q_i not accessed: Gmax=-INF if i=-1 + Q_i = Q.get_Q(i,active_size); + + for(int j=0;j= Gmax2) + Gmax2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]-2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff= Gmax-G[j]; + if (-G[j] >= Gmax2) + Gmax2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[i]+QD[j]+2.0*y[i]*Q_i[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; 
+ obj_diff_min = obj_diff; + } + } + } + } + } + + if(Gmax+Gmax2 < eps) + return 1; + + working_set[0] = Gmax_idx; + working_set[1] = Gmin_idx; + return 0; + } + + private boolean be_shrunk(int i, double Gmax1, double Gmax2) + { + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax2); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax1); + } + else + return(false); + } + + void do_shrinking() + { + int i; + double Gmax1 = -INF; // max { -y_i * grad(f)_i | i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | i in I_low(\alpha) } + + // find maximal violating pair first + for(i=0;i= Gmax1) + Gmax1 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax2) + Gmax2 = G[i]; + } + } + else + { + if(!is_upper_bound(i)) + { + if(-G[i] >= Gmax2) + Gmax2 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(G[i] >= Gmax1) + Gmax1 = G[i]; + } + } + } + + if(unshrink == false && Gmax1 + Gmax2 <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } + } + + double calculate_rho() + { + double r; + int nr_free = 0; + double ub = INF, lb = -INF, sum_free = 0; + for(int i=0;i 0) + ub = Math.min(ub,yG); + else + lb = Math.max(lb,yG); + } + else if(is_upper_bound(i)) + { + if(y[i] < 0) + ub = Math.min(ub,yG); + else + lb = Math.max(lb,yG); + } + else + { + ++nr_free; + sum_free += yG; + } + } + + if(nr_free>0) + r = sum_free/nr_free; + else + r = (ub+lb)/2; + + return r; + } + +} + +// +// Solver for nu-svm classification and regression +// +// additional constraint: e^T \alpha = constant +// +final class Solver_NU extends Solver +{ + private SolutionInfo si; + + void Solve(int l, QMatrix Q, double[] p, byte[] y, + double[] alpha, double Cp, double Cn, double eps, + SolutionInfo si, int shrinking) + { + this.si = si; + super.Solve(l,Q,p,y,alpha,Cp,Cn,eps,si,shrinking); + } + + // return 1 if already optimal, return 0 otherwise + int select_working_set(int[] working_set) + { + // return i,j such that y_i = y_j and + // i: maximizes -y_i * grad(f)_i, i in I_up(\alpha) + // j: minimizes the decrease of obj value + // (if quadratic coefficeint <= 0, replace it with tau) + // -y_j*grad(f)_j < -y_i*grad(f)_i, j in I_low(\alpha) + + double Gmaxp = -INF; + double Gmaxp2 = -INF; + int Gmaxp_idx = -1; + + double Gmaxn = -INF; + double Gmaxn2 = -INF; + int Gmaxn_idx = -1; + + int Gmin_idx = -1; + double obj_diff_min = INF; + + for(int t=0;t= Gmaxp) + { + Gmaxp = -G[t]; + Gmaxp_idx = t; + } + } + else + { + if(!is_lower_bound(t)) + if(G[t] >= Gmaxn) + { + Gmaxn = G[t]; + Gmaxn_idx = t; + } + } + + int ip = Gmaxp_idx; + int in = Gmaxn_idx; + Qfloat[] Q_ip = null; + Qfloat[] Q_in = null; + if(ip != -1) // null Q_ip not accessed: Gmaxp=-INF if ip=-1 + Q_ip = Q.get_Q(ip,active_size); + if(in != -1) + Q_in = Q.get_Q(in,active_size); + + for(int j=0;j= Gmaxp2) + Gmaxp2 = G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = QD[ip]+QD[j]-2*Q_ip[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + else + { + if (!is_upper_bound(j)) + { + double grad_diff=Gmaxn-G[j]; + if (-G[j] >= Gmaxn2) + Gmaxn2 = -G[j]; + if (grad_diff > 0) + { + double obj_diff; + double quad_coef = 
QD[in]+QD[j]-2*Q_in[j]; + if (quad_coef > 0) + obj_diff = -(grad_diff*grad_diff)/quad_coef; + else + obj_diff = -(grad_diff*grad_diff)/TAU; + + if (obj_diff <= obj_diff_min) + { + Gmin_idx=j; + obj_diff_min = obj_diff; + } + } + } + } + } + + if(Math.max(Gmaxp+Gmaxp2,Gmaxn+Gmaxn2) < eps) + return 1; + + if(y[Gmin_idx] == +1) + working_set[0] = Gmaxp_idx; + else + working_set[0] = Gmaxn_idx; + working_set[1] = Gmin_idx; + + return 0; + } + + private boolean be_shrunk(int i, double Gmax1, double Gmax2, double Gmax3, double Gmax4) + { + if(is_upper_bound(i)) + { + if(y[i]==+1) + return(-G[i] > Gmax1); + else + return(-G[i] > Gmax4); + } + else if(is_lower_bound(i)) + { + if(y[i]==+1) + return(G[i] > Gmax2); + else + return(G[i] > Gmax3); + } + else + return(false); + } + + void do_shrinking() + { + double Gmax1 = -INF; // max { -y_i * grad(f)_i | y_i = +1, i in I_up(\alpha) } + double Gmax2 = -INF; // max { y_i * grad(f)_i | y_i = +1, i in I_low(\alpha) } + double Gmax3 = -INF; // max { -y_i * grad(f)_i | y_i = -1, i in I_up(\alpha) } + double Gmax4 = -INF; // max { y_i * grad(f)_i | y_i = -1, i in I_low(\alpha) } + + // find maximal violating pair first + int i; + for(i=0;i Gmax1) Gmax1 = -G[i]; + } + else if(-G[i] > Gmax4) Gmax4 = -G[i]; + } + if(!is_lower_bound(i)) + { + if(y[i]==+1) + { + if(G[i] > Gmax2) Gmax2 = G[i]; + } + else if(G[i] > Gmax3) Gmax3 = G[i]; + } + } + + if(unshrink == false && Math.max(Gmax1+Gmax2,Gmax3+Gmax4) <= eps*10) + { + unshrink = true; + reconstruct_gradient(); + active_size = l; + } + + for(i=0;i i) + { + if (!be_shrunk(active_size, Gmax1, Gmax2, Gmax3, Gmax4)) + { + swap_index(i,active_size); + break; + } + active_size--; + } + } + } + + double calculate_rho() + { + int nr_free1 = 0,nr_free2 = 0; + double ub1 = INF, ub2 = INF; + double lb1 = -INF, lb2 = -INF; + double sum_free1 = 0, sum_free2 = 0; + + for(int i=0;i 0) + r1 = sum_free1/nr_free1; + else + r1 = (ub1+lb1)/2; + + if(nr_free2 > 0) + r2 = sum_free2/nr_free2; + else + r2 = (ub2+lb2)/2; + + si.r = (r1+r2)/2; + return (r1-r2)/2; + } +} + +// +// Q matrices for various formulations +// +class SVC_Q extends Kernel +{ + private final byte[] y; + private final Cache cache; + private final double[] QD; + + SVC_Q(svm_problem prob, svm_parameter param, byte[] y_) + { + super(prob.l, prob.x, param); + y = (byte[])y_.clone(); + cache = new Cache(prob.l,(long)(param.cache_size*(1<<20))); + QD = new double[prob.l]; + for(int i=0;i 0) y[i] = +1; else y[i] = -1; + } + + Solver s = new Solver(); + s.Solve(l, new SVC_Q(prob,param,y), minus_ones, y, + alpha, Cp, Cn, param.eps, si, param.shrinking); + + double sum_alpha=0; + for(i=0;i0) + y[i] = +1; + else + y[i] = -1; + + double sum_pos = nu*l/2; + double sum_neg = nu*l/2; + + for(i=0;i 0) + { + ++nSV; + if(prob.y[i] > 0) + { + if(Math.abs(alpha[i]) >= si.upper_bound_p) + ++nBSV; + } + else + { + if(Math.abs(alpha[i]) >= si.upper_bound_n) + ++nBSV; + } + } + } + + svm.info("nSV = "+nSV+", nBSV = "+nBSV+"\n"); + + decision_function f = new decision_function(); + f.alpha = alpha; + f.rho = si.rho; + return f; + } + + // Platt's binary SVM Probablistic Output: an improvement from Lin et al. 
+ private static void sigmoid_train(int l, double[] dec_values, double[] labels, + double[] probAB) + { + double A, B; + double prior1=0, prior0 = 0; + int i; + + for (i=0;i 0) prior1+=1; + else prior0+=1; + + int max_iter=100; // Maximal number of iterations + double min_step=1e-10; // Minimal step taken in line search + double sigma=1e-12; // For numerically strict PD of Hessian + double eps=1e-5; + double hiTarget=(prior1+1.0)/(prior1+2.0); + double loTarget=1/(prior0+2.0); + double[] t= new double[l]; + double fApB,p,q,h11,h22,h21,g1,g2,det,dA,dB,gd,stepsize; + double newA,newB,newf,d1,d2; + int iter; + + // Initial Point and Initial Fun Value + A=0.0; B=Math.log((prior0+1.0)/(prior1+1.0)); + double fval = 0.0; + + for (i=0;i0) t[i]=hiTarget; + else t[i]=loTarget; + fApB = dec_values[i]*A+B; + if (fApB>=0) + fval += t[i]*fApB + Math.log(1+Math.exp(-fApB)); + else + fval += (t[i] - 1)*fApB +Math.log(1+Math.exp(fApB)); + } + for (iter=0;iter= 0) + { + p=Math.exp(-fApB)/(1.0+Math.exp(-fApB)); + q=1.0/(1.0+Math.exp(-fApB)); + } + else + { + p=1.0/(1.0+Math.exp(fApB)); + q=Math.exp(fApB)/(1.0+Math.exp(fApB)); + } + d2=p*q; + h11+=dec_values[i]*dec_values[i]*d2; + h22+=d2; + h21+=dec_values[i]*d2; + d1=t[i]-p; + g1+=dec_values[i]*d1; + g2+=d1; + } + + // Stopping Criteria + if (Math.abs(g1)= min_step) + { + newA = A + stepsize * dA; + newB = B + stepsize * dB; + + // New function value + newf = 0.0; + for (i=0;i= 0) + newf += t[i]*fApB + Math.log(1+Math.exp(-fApB)); + else + newf += (t[i] - 1)*fApB +Math.log(1+Math.exp(fApB)); + } + // Check sufficient decrease + if (newf=max_iter) + svm.info("Reaching maximal iterations in two-class probability estimates\n"); + probAB[0]=A;probAB[1]=B; + } + + private static double sigmoid_predict(double decision_value, double A, double B) + { + double fApB = decision_value*A+B; + if (fApB >= 0) + return Math.exp(-fApB)/(1.0+Math.exp(-fApB)); + else + return 1.0/(1+Math.exp(fApB)) ; + } + + // Method 2 from the multiclass_prob paper by Wu, Lin, and Weng + private static void multiclass_probability(int k, double[][] r, double[] p) + { + int t,j; + int iter = 0, max_iter=Math.max(100,k); + double[][] Q=new double[k][k]; + double[] Qp=new double[k]; + double pQp, eps=0.005/k; + + for (t=0;tmax_error) + max_error=error; + } + if (max_error=max_iter) + svm.info("Exceeds max_iter in multiclass_prob\n"); + } + + // Cross-validation decision values for probability estimates + private static void svm_binary_svc_probability(svm_problem prob, svm_parameter param, double Cp, double Cn, double[] probAB) + { + int i; + int nr_fold = 5; + int[] perm = new int[prob.l]; + double[] dec_values = new double[prob.l]; + + // random shuffle + for(i=0;i0) + p_count++; + else + n_count++; + + if(p_count==0 && n_count==0) + for(j=begin;j 0 && n_count == 0) + for(j=begin;j 0) + for(j=begin;j 5*std) + count=count+1; + else + mae+=Math.abs(ymv[i]); + mae /= (prob.l-count); + svm.info("Prob. 
model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma="+mae+"\n"); + return mae; + } + + // label: label name, start: begin of each class, count: #data of classes, perm: indices to the original data + // perm, length l, must be allocated before calling this subroutine + private static void svm_group_classes(svm_problem prob, int[] nr_class_ret, int[][] label_ret, int[][] start_ret, int[][] count_ret, int[] perm) + { + int l = prob.l; + int max_nr_class = 16; + int nr_class = 0; + int[] label = new int[max_nr_class]; + int[] count = new int[max_nr_class]; + int[] data_label = new int[l]; + int i; + + for(i=0;i 0) ++nSV; + model.l = nSV; + model.SV = new svm_node[nSV][]; + model.sv_coef[0] = new double[nSV]; + model.sv_indices = new int[nSV]; + int j = 0; + for(i=0;i 0) + { + model.SV[j] = prob.x[i]; + model.sv_coef[0][j] = f.alpha[i]; + model.sv_indices[j] = i+1; + ++j; + } + } + else + { + // classification + int l = prob.l; + int[] tmp_nr_class = new int[1]; + int[][] tmp_label = new int[1][]; + int[][] tmp_start = new int[1][]; + int[][] tmp_count = new int[1][]; + int[] perm = new int[l]; + + // group training data of the same class + svm_group_classes(prob,tmp_nr_class,tmp_label,tmp_start,tmp_count,perm); + int nr_class = tmp_nr_class[0]; + int[] label = tmp_label[0]; + int[] start = tmp_start[0]; + int[] count = tmp_count[0]; + + if(nr_class == 1) + svm.info("WARNING: training data in only one class. See README for details.\n"); + + svm_node[][] x = new svm_node[l][]; + int i; + for(i=0;i 0) + nonzero[si+k] = true; + for(k=0;k 0) + nonzero[sj+k] = true; + ++p; + } + + // build output + + model.nr_class = nr_class; + + model.label = new int[nr_class]; + for(i=0;i some folds may have zero elements + if((param.svm_type == svm_parameter.C_SVC || + param.svm_type == svm_parameter.NU_SVC) && nr_fold < l) + { + int[] tmp_nr_class = new int[1]; + int[][] tmp_label = new int[1][]; + int[][] tmp_start = new int[1][]; + int[][] tmp_count = new int[1][]; + + svm_group_classes(prob,tmp_nr_class,tmp_label,tmp_start,tmp_count,perm); + + int nr_class = tmp_nr_class[0]; + int[] start = tmp_start[0]; + int[] count = tmp_count[0]; + + // random shuffle and then data grouped by fold using the array perm + int[] fold_count = new int[nr_fold]; + int c; + int[] index = new int[l]; + for(i=0;i0)?1:-1; + else + return sum; + } + else + { + int nr_class = model.nr_class; + int l = model.l; + + double[] kvalue = new double[l]; + for(i=0;i 0) + ++vote[i]; + else + ++vote[j]; + p++; + } + + int vote_max_idx = 0; + for(i=1;i vote[vote_max_idx]) + vote_max_idx = i; + + return model.label[vote_max_idx]; + } + } + + public static double svm_predict(svm_model model, svm_node[] x) + { + int nr_class = model.nr_class; + double[] dec_values; + if(model.param.svm_type == svm_parameter.ONE_CLASS || + model.param.svm_type == svm_parameter.EPSILON_SVR || + model.param.svm_type == svm_parameter.NU_SVR) + dec_values = new double[1]; + else + dec_values = new double[nr_class*(nr_class-1)/2]; + double pred_result = svm_predict_values(model, x, dec_values); + return pred_result; + } + + public static double svm_predict_probability(svm_model model, svm_node[] x, double[] prob_estimates) + { + if ((model.param.svm_type == svm_parameter.C_SVC || model.param.svm_type == svm_parameter.NU_SVC) && + model.probA!=null && model.probB!=null) + { + int i; + int nr_class = model.nr_class; + double[] dec_values = new double[nr_class*(nr_class-1)/2]; + svm_predict_values(model, x, 
dec_values); + + double min_prob=1e-7; + double[][] pairwise_prob=new double[nr_class][nr_class]; + + int k=0; + for(i=0;i prob_estimates[prob_max_idx]) + prob_max_idx = i; + return model.label[prob_max_idx]; + } + else + return svm_predict(model, x); + } + + static final String svm_type_table[] = + { + "c_svc","nu_svc","one_class","epsilon_svr","nu_svr", + }; + + static final String kernel_type_table[]= + { + "linear","polynomial","rbf","sigmoid","precomputed" + }; + + public static void svm_save_model(String model_file_name, svm_model model) throws IOException + { + DataOutputStream fp = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(model_file_name))); + + svm_parameter param = model.param; + + fp.writeBytes("svm_type "+svm_type_table[param.svm_type]+"\n"); + fp.writeBytes("kernel_type "+kernel_type_table[param.kernel_type]+"\n"); + + if(param.kernel_type == svm_parameter.POLY) + fp.writeBytes("degree "+param.degree+"\n"); + + if(param.kernel_type == svm_parameter.POLY || + param.kernel_type == svm_parameter.RBF || + param.kernel_type == svm_parameter.SIGMOID) + fp.writeBytes("gamma "+param.gamma+"\n"); + + if(param.kernel_type == svm_parameter.POLY || + param.kernel_type == svm_parameter.SIGMOID) + fp.writeBytes("coef0 "+param.coef0+"\n"); + + int nr_class = model.nr_class; + int l = model.l; + fp.writeBytes("nr_class "+nr_class+"\n"); + fp.writeBytes("total_sv "+l+"\n"); + + { + fp.writeBytes("rho"); + for(int i=0;i 1) + return "nu <= 0 or nu > 1"; + + if(svm_type == svm_parameter.EPSILON_SVR) + if(param.p < 0) + return "p < 0"; + + if(param.shrinking != 0 && + param.shrinking != 1) + return "shrinking != 0 and shrinking != 1"; + + if(param.probability != 0 && + param.probability != 1) + return "probability != 0 and probability != 1"; + + if(param.probability == 1 && + svm_type == svm_parameter.ONE_CLASS) + return "one-class SVM probability output not supported yet"; + + // check whether nu-svc is feasible + + if(svm_type == svm_parameter.NU_SVC) + { + int l = prob.l; + int max_nr_class = 16; + int nr_class = 0; + int[] label = new int[max_nr_class]; + int[] count = new int[max_nr_class]; + + int i; + for(i=0;i Math.min(n1,n2)) + return "specified nu is infeasible"; + } + } + } + + return null; + } + + public static int svm_check_probability_model(svm_model model) + { + if (((model.param.svm_type == svm_parameter.C_SVC || model.param.svm_type == svm_parameter.NU_SVC) && + model.probA!=null && model.probB!=null) || + ((model.param.svm_type == svm_parameter.EPSILON_SVR || model.param.svm_type == svm_parameter.NU_SVR) && + model.probA!=null)) + return 1; + else + return 0; + } + + public static void svm_set_print_string_function(svm_print_interface print_func) + { + if (print_func == null) + svm_print_string = svm_print_stdout; + else + svm_print_string = print_func; + } +} diff --git a/src/libsvm/svm_model.java b/src/libsvm/svm_model.java new file mode 100644 index 0000000..a38be3f --- /dev/null +++ b/src/libsvm/svm_model.java @@ -0,0 +1,22 @@ +// +// svm_model +// +package libsvm; +public class svm_model implements java.io.Serializable +{ + public svm_parameter param; // parameter + public int nr_class; // number of classes, = 2 in regression/one class svm + public int l; // total #SV + public svm_node[][] SV; // SVs (SV[l]) + public double[][] sv_coef; // coefficients for SVs in decision functions (sv_coef[k-1][l]) + public double[] rho; // constants in decision functions (rho[k*(k-1)/2]) + public double[] probA; // pariwise probability information + public 
double[] probB;
+	public int[] sv_indices;	// sv_indices[0,...,nSV-1] are values in [1,...,num_training_data] to indicate SVs in the training set
+
+	// for classification only
+
+	public int[] label;	// label of each class (label[k])
+	public int[] nSV;	// number of SVs for each class (nSV[k])
+				// nSV[0] + nSV[1] + ... + nSV[k-1] = l
+};
diff --git a/src/libsvm/svm_node.java b/src/libsvm/svm_node.java
new file mode 100644
index 0000000..9ab0a10
--- /dev/null
+++ b/src/libsvm/svm_node.java
@@ -0,0 +1,6 @@
+package libsvm;
+public class svm_node implements java.io.Serializable
+{
+	public int index;
+	public double value;
+}
diff --git a/src/libsvm/svm_parameter.java b/src/libsvm/svm_parameter.java
new file mode 100644
index 0000000..429f041
--- /dev/null
+++ b/src/libsvm/svm_parameter.java
@@ -0,0 +1,47 @@
+package libsvm;
+public class svm_parameter implements Cloneable,java.io.Serializable
+{
+	/* svm_type */
+	public static final int C_SVC = 0;
+	public static final int NU_SVC = 1;
+	public static final int ONE_CLASS = 2;
+	public static final int EPSILON_SVR = 3;
+	public static final int NU_SVR = 4;
+
+	/* kernel_type */
+	public static final int LINEAR = 0;
+	public static final int POLY = 1;
+	public static final int RBF = 2;
+	public static final int SIGMOID = 3;
+	public static final int PRECOMPUTED = 4;
+
+	public int svm_type;
+	public int kernel_type;
+	public int degree;	// for poly
+	public double gamma;	// for poly/rbf/sigmoid
+	public double coef0;	// for poly/sigmoid
+
+	// these are for training only
+	public double cache_size;	// in MB
+	public double eps;	// stopping criteria
+	public double C;	// for C_SVC, EPSILON_SVR and NU_SVR
+	public int nr_weight;		// for C_SVC
+	public int[] weight_label;	// for C_SVC
+	public double[] weight;		// for C_SVC
+	public double nu;	// for NU_SVC, ONE_CLASS, and NU_SVR
+	public double p;	// for EPSILON_SVR
+	public int shrinking;	// use the shrinking heuristics
+	public int probability;	// do probability estimates
+
+	public Object clone()
+	{
+		try
+		{
+			return super.clone();
+		} catch (CloneNotSupportedException e)
+		{
+			return null;
+		}
+	}
+
+}
diff --git a/src/libsvm/svm_print_interface.java b/src/libsvm/svm_print_interface.java
new file mode 100644
index 0000000..ff4d0e8
--- /dev/null
+++ b/src/libsvm/svm_print_interface.java
@@ -0,0 +1,5 @@
+package libsvm;
+public interface svm_print_interface
+{
+	public void print(String s);
+}
diff --git a/src/libsvm/svm_problem.java b/src/libsvm/svm_problem.java
new file mode 100644
index 0000000..5d74609
--- /dev/null
+++ b/src/libsvm/svm_problem.java
@@ -0,0 +1,7 @@
+package libsvm;
+public class svm_problem implements java.io.Serializable
+{
+	public int l;
+	public double[] y;
+	public svm_node[][] x;
+}
diff --git a/src/service/svm_predict.java b/src/service/svm_predict.java
new file mode 100644
index 0000000..fe14d51
--- /dev/null
+++ b/src/service/svm_predict.java
@@ -0,0 +1,309 @@
+package service;
+import libsvm.svm;
+import libsvm.svm_model;
+import libsvm.svm_node;
+import libsvm.svm_parameter;
+import libsvm.svm_print_interface;
+import info.chenli.classifier.Accurary;
+
+import java.io.*;
+import java.util.*;
+
+import service.*;
+
+public class svm_predict {
+	private static svm_print_interface svm_print_null = new svm_print_interface()
+	{
+		public void print(String s) {}
+	};
+
+	private static svm_print_interface svm_print_stdout = new svm_print_interface()
+	{
+		public void print(String s)
+		{
+			System.out.print(s);
+		}
+	};
+
+	private static svm_print_interface 
svm_print_string = svm_print_stdout; + + static void info(String s) + { + svm_print_string.print(s); + } + + private static double atof(String s) + { + return Double.valueOf(s).doubleValue(); + } + + private static int atoi(String s) + { + return Integer.parseInt(s); + } + + private static void predict(BufferedReader input, DataOutputStream output, svm_model model, int predict_probability) throws IOException + { + int correct = 0, tp = 0, tn = 0, fp = 0, fn = 0; + int total = 0; + double error = 0; + float p = 0, r = 0, f = 0; + double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0; + + int svm_type=svm.svm_get_svm_type(model); + int nr_class=svm.svm_get_nr_class(model); + double[] prob_estimates=null; + + if(predict_probability == 1) + { + if(svm_type == svm_parameter.EPSILON_SVR || + svm_type == svm_parameter.NU_SVR) + { + svm_predict.info("Prob. model for test data: target value = predicted value + z,\nz: Laplace distribution e^(-|z|/sigma)/(2sigma),sigma="+svm.svm_get_svr_probability(model)+"\n"); + } + else + { + int[] labels=new int[nr_class]; + svm.svm_get_labels(model,labels); + prob_estimates = new double[nr_class]; + output.writeBytes("labels"); + for(int j=0;j token = new HashMap(); + while ((triggerTextCh = tokenFileBuffer.readLine()) != null) { + String[] wordSb = triggerTextCh.split(" "); + token.put(wordSb[0], wordSb[2]); + } + tokenFileBuffer.close(); + tokenFileStream.close(); + StringBuffer sbfn = new StringBuffer(); + StringBuffer sbfp = new StringBuffer(); + String ss[] = { "Non_trigger", "Gene_expression", "Transcription", "Protein_catabolism", + "Localization", "Binding", "Protein_modification", "Phosphorylation", "Ubiquitination", "Acetylation" + , "Deacetylation", "Regulation", "Positive_regulation", "Negative_regulation"}; + int total14[] = new int [14]; + int answer14[] = new int [14]; + int correct14[] = new int [14]; + + while(true) + { + String line = input.readLine(); + if(line == null) break; + + StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); + + double target = atof(st.nextToken()); + int m = st.countTokens()/2; + svm_node[] x = new svm_node[m]; + for(int j=0;j=argv.length-2) + exit_with_help(); + try + { + BufferedReader input = new BufferedReader(new FileReader(argv[i])); + DataOutputStream output = new DataOutputStream(new BufferedOutputStream(new FileOutputStream(argv[i+2]))); + svm_model model = svm.svm_load_model(argv[i+1]); + if (model == null) + { + System.err.print("can't open model file "+argv[i+1]+"\n"); + System.exit(1); + } + if(predict_probability == 1) + { + if(svm.svm_check_probability_model(model)==0) + { + System.err.print("Model does not support probabiliy estimates\n"); + System.exit(1); + } + } + else + { + if(svm.svm_check_probability_model(model)!=0) + { + svm_predict.info("Model supports probability estimates, but disabled in prediction.\n"); + } + } + predict(input,output,model,predict_probability); + input.close(); + output.close(); + } + catch(FileNotFoundException e) + { + exit_with_help(); + } + catch(ArrayIndexOutOfBoundsException e) + { + exit_with_help(); + } + } +} diff --git a/src/service/svm_scale.java b/src/service/svm_scale.java new file mode 100644 index 0000000..490e832 --- /dev/null +++ b/src/service/svm_scale.java @@ -0,0 +1,352 @@ +package service; +import java.io.*; +import java.util.*; +import java.text.DecimalFormat; + +import service.*; + +class svm_scale +{ + private String line = null; + private double lower = -1.0; + private double upper = 1.0; + private double y_lower; + private double 
y_upper; + private boolean y_scaling = false; + private double[] feature_max; + private double[] feature_min; + private double y_max = -Double.MAX_VALUE; + private double y_min = Double.MAX_VALUE; + private int max_index; + private long num_nonzeros = 0; + private long new_num_nonzeros = 0; + + private static void exit_with_help() + { + System.out.print( + "Usage: svm-scale [options] data_filename\n" + +"options:\n" + +"-l lower : x scaling lower limit (default -1)\n" + +"-u upper : x scaling upper limit (default +1)\n" + +"-y y_lower y_upper : y scaling limits (default: no y scaling)\n" + +"-s save_filename : save scaling parameters to save_filename\n" + +"-r restore_filename : restore scaling parameters from restore_filename\n" + ); + System.exit(1); + } + + private BufferedReader rewind(BufferedReader fp, String filename) throws IOException + { + fp.close(); + return new BufferedReader(new FileReader(filename)); + } + + private void output_target(double value) + { + if(y_scaling) + { + if(value == y_min) + value = y_lower; + else if(value == y_max) + value = y_upper; + else + value = y_lower + (y_upper-y_lower) * + (value-y_min) / (y_max-y_min); + } + + System.out.print(value + " "); + } + + private void output(int index, double value) + { + /* skip single-valued attribute */ + if(feature_max[index] == feature_min[index]) + return; + + if(value == feature_min[index]) + value = lower; + else if(value == feature_max[index]) + value = upper; + else + value = lower + (upper-lower) * + (value-feature_min[index])/ + (feature_max[index]-feature_min[index]); + + if(value != 0) + { + System.out.print(index + ":" + value + " "); + new_num_nonzeros++; + } + } + + private String readline(BufferedReader fp) throws IOException + { + line = fp.readLine(); + return line; + } + + private void run(String []argv) throws IOException + { + int i,index; + BufferedReader fp = null, fp_restore = null; + String save_filename = null; + String restore_filename = null; + String data_filename = null; + + + for(i=0;i lower) || (y_scaling && !(y_upper > y_lower))) + { + System.err.println("inconsistent lower/upper specification"); + System.exit(1); + } + if(restore_filename != null && save_filename != null) + { + System.err.println("cannot use -r and -s simultaneously"); + System.exit(1); + } + + if(argv.length != i+1) + exit_with_help(); + + data_filename = argv[i]; + try { + fp = new BufferedReader(new FileReader(data_filename)); + } catch (Exception e) { + System.err.println("can't open file " + data_filename); + System.exit(1); + } + + /* assumption: min index of attributes is 1 */ + /* pass 1: find out max index of attributes */ + max_index = 0; + + if(restore_filename != null) + { + int idx, c; + + try { + fp_restore = new BufferedReader(new FileReader(restore_filename)); + } + catch (Exception e) { + System.err.println("can't open file " + restore_filename); + System.exit(1); + } + if((c = fp_restore.read()) == 'y') + { + fp_restore.readLine(); + fp_restore.readLine(); + fp_restore.readLine(); + } + fp_restore.readLine(); + fp_restore.readLine(); + + String restore_line = null; + while((restore_line = fp_restore.readLine())!=null) + { + StringTokenizer st2 = new StringTokenizer(restore_line); + idx = Integer.parseInt(st2.nextToken()); + max_index = Math.max(max_index, idx); + } + fp_restore = rewind(fp_restore, restore_filename); + } + + while (readline(fp) != null) + { + StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); + st.nextToken(); + while(st.hasMoreTokens()) + { + index = 
Integer.parseInt(st.nextToken()); + max_index = Math.max(max_index, index); + st.nextToken(); + num_nonzeros++; + } + } + + try { + feature_max = new double[(max_index+1)]; + feature_min = new double[(max_index+1)]; + } catch(OutOfMemoryError e) { + System.err.println("can't allocate enough memory"); + System.exit(1); + } + + for(i=0;i<=max_index;i++) + { + feature_max[i] = -Double.MAX_VALUE; + feature_min[i] = Double.MAX_VALUE; + } + + fp = rewind(fp, data_filename); + + /* pass 2: find out min/max value */ + while(readline(fp) != null) + { + int next_index = 1; + double target; + double value; + + StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); + target = Double.parseDouble(st.nextToken()); + y_max = Math.max(y_max, target); + y_min = Math.min(y_min, target); + + while (st.hasMoreTokens()) + { + index = Integer.parseInt(st.nextToken()); + value = Double.parseDouble(st.nextToken()); + + for (i = next_index; i num_nonzeros) + System.err.print( + "WARNING: original #nonzeros " + num_nonzeros+"\n" + +" new #nonzeros " + new_num_nonzeros+"\n" + +"Use -l 0 if many original feature values are zeros\n"); + + fp.close(); + } + + public static void main(String argv[]) throws IOException + { + svm_scale s = new svm_scale(); + s.run(argv); + } +} diff --git a/src/service/svm_toy.java b/src/service/svm_toy.java new file mode 100644 index 0000000..227a2c4 --- /dev/null +++ b/src/service/svm_toy.java @@ -0,0 +1,510 @@ +package service; +import libsvm.svm; +import libsvm.svm_model; +import libsvm.svm_node; +import libsvm.svm_parameter; +import libsvm.svm_problem; + +import java.applet.*; +import java.awt.*; +import java.util.*; +import java.awt.event.*; +import java.io.*; + +import service.*; + +public class svm_toy extends Applet { + + static final String DEFAULT_PARAM="-t 2 -c 100"; + int XLEN; + int YLEN; + + // off-screen buffer + + Image buffer; + Graphics buffer_gc; + + // pre-allocated colors + + final static Color colors[] = + { + new Color(0,0,0), + new Color(0,120,120), + new Color(120,120,0), + new Color(120,0,120), + new Color(0,200,200), + new Color(200,200,0), + new Color(200,0,200) + }; + + class point { + point(double x, double y, byte value) + { + this.x = x; + this.y = y; + this.value = value; + } + double x, y; + byte value; + } + + Vector point_list = new Vector(); + byte current_value = 1; + + public void init() + { + setSize(getSize()); + + final Button button_change = new Button("Change"); + Button button_run = new Button("Run"); + Button button_clear = new Button("Clear"); + Button button_save = new Button("Save"); + Button button_load = new Button("Load"); + final TextField input_line = new TextField(DEFAULT_PARAM); + + BorderLayout layout = new BorderLayout(); + this.setLayout(layout); + + Panel p = new Panel(); + GridBagLayout gridbag = new GridBagLayout(); + p.setLayout(gridbag); + + GridBagConstraints c = new GridBagConstraints(); + c.fill = GridBagConstraints.HORIZONTAL; + c.weightx = 1; + c.gridwidth = 1; + gridbag.setConstraints(button_change,c); + gridbag.setConstraints(button_run,c); + gridbag.setConstraints(button_clear,c); + gridbag.setConstraints(button_save,c); + gridbag.setConstraints(button_load,c); + c.weightx = 5; + c.gridwidth = 5; + gridbag.setConstraints(input_line,c); + + button_change.setBackground(colors[current_value]); + + p.add(button_change); + p.add(button_run); + p.add(button_clear); + p.add(button_save); + p.add(button_load); + p.add(input_line); + this.add(p,BorderLayout.SOUTH); + + button_change.addActionListener(new ActionListener() 
+ { public void actionPerformed (ActionEvent e) + { button_change_clicked(); button_change.setBackground(colors[current_value]); }}); + + button_run.addActionListener(new ActionListener() + { public void actionPerformed (ActionEvent e) + { button_run_clicked(input_line.getText()); }}); + + button_clear.addActionListener(new ActionListener() + { public void actionPerformed (ActionEvent e) + { button_clear_clicked(); }}); + + button_save.addActionListener(new ActionListener() + { public void actionPerformed (ActionEvent e) + { button_save_clicked(input_line.getText()); }}); + + button_load.addActionListener(new ActionListener() + { public void actionPerformed (ActionEvent e) + { button_load_clicked(); }}); + + input_line.addActionListener(new ActionListener() + { public void actionPerformed (ActionEvent e) + { button_run_clicked(input_line.getText()); }}); + + this.enableEvents(AWTEvent.MOUSE_EVENT_MASK); + } + + void draw_point(point p) + { + Color c = colors[p.value+3]; + + Graphics window_gc = getGraphics(); + buffer_gc.setColor(c); + buffer_gc.fillRect((int)(p.x*XLEN),(int)(p.y*YLEN),4,4); + window_gc.setColor(c); + window_gc.fillRect((int)(p.x*XLEN),(int)(p.y*YLEN),4,4); + } + + void clear_all() + { + point_list.removeAllElements(); + if(buffer != null) + { + buffer_gc.setColor(colors[0]); + buffer_gc.fillRect(0,0,XLEN,YLEN); + } + repaint(); + } + + void draw_all_points() + { + int n = point_list.size(); + for(int i=0;i 3) current_value = 1; + } + + private static double atof(String s) + { + return Double.valueOf(s).doubleValue(); + } + + private static int atoi(String s) + { + return Integer.parseInt(s); + } + + void button_run_clicked(String args) + { + // guard + if(point_list.isEmpty()) return; + + svm_parameter param = new svm_parameter(); + + // default values + param.svm_type = svm_parameter.C_SVC; + param.kernel_type = svm_parameter.RBF; + param.degree = 3; + param.gamma = 0; + param.coef0 = 0; + param.nu = 0.5; + param.cache_size = 40; + param.C = 1; + param.eps = 1e-3; + param.p = 0.1; + param.shrinking = 1; + param.probability = 0; + param.nr_weight = 0; + param.weight_label = new int[0]; + param.weight = new double[0]; + + // parse options + StringTokenizer st = new StringTokenizer(args); + String[] argv = new String[st.countTokens()]; + for(int i=0;i=argv.length) + { + System.err.print("unknown option\n"); + break; + } + switch(argv[i-1].charAt(1)) + { + case 's': + param.svm_type = atoi(argv[i]); + break; + case 't': + param.kernel_type = atoi(argv[i]); + break; + case 'd': + param.degree = atoi(argv[i]); + break; + case 'g': + param.gamma = atof(argv[i]); + break; + case 'r': + param.coef0 = atof(argv[i]); + break; + case 'n': + param.nu = atof(argv[i]); + break; + case 'm': + param.cache_size = atof(argv[i]); + break; + case 'c': + param.C = atof(argv[i]); + break; + case 'e': + param.eps = atof(argv[i]); + break; + case 'p': + param.p = atof(argv[i]); + break; + case 'h': + param.shrinking = atoi(argv[i]); + break; + case 'b': + param.probability = atoi(argv[i]); + break; + case 'w': + ++param.nr_weight; + { + int[] old = param.weight_label; + param.weight_label = new int[param.nr_weight]; + System.arraycopy(old,0,param.weight_label,0,param.nr_weight-1); + } + + { + double[] old = param.weight; + param.weight = new double[param.nr_weight]; + System.arraycopy(old,0,param.weight,0,param.nr_weight-1); + } + + param.weight_label[param.nr_weight-1] = atoi(argv[i-1].substring(2)); + param.weight[param.nr_weight-1] = atof(argv[i]); + break; + default: + 
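+				// (editor's note) Any flag not matched above falls through to here.
+				// The option string follows svm_train's command-line syntax, e.g. the
+				// default "-t 2 -c 100" selects an RBF kernel (-t 2) with C=100 (-c 100).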
System.err.print("unknown option\n"); + } + } + + // build problem + svm_problem prob = new svm_problem(); + prob.l = point_list.size(); + prob.y = new double[prob.l]; + + if(param.kernel_type == svm_parameter.PRECOMPUTED) + { + } + else if(param.svm_type == svm_parameter.EPSILON_SVR || + param.svm_type == svm_parameter.NU_SVR) + { + if(param.gamma == 0) param.gamma = 1; + prob.x = new svm_node[prob.l][1]; + for(int i=0;i= XLEN || e.getY() >= YLEN) return; + point p = new point((double)e.getX()/XLEN, + (double)e.getY()/YLEN, + current_value); + point_list.addElement(p); + draw_point(p); + } + } + + public void paint(Graphics g) + { + // create buffer first time + if(buffer == null) { + buffer = this.createImage(XLEN,YLEN); + buffer_gc = buffer.getGraphics(); + buffer_gc.setColor(colors[0]); + buffer_gc.fillRect(0,0,XLEN,YLEN); + } + g.drawImage(buffer,0,0,this); + } + + public Dimension getPreferredSize() { return new Dimension(XLEN,YLEN+50); } + + public void setSize(Dimension d) { setSize(d.width,d.height); } + public void setSize(int w,int h) { + super.setSize(w,h); + XLEN = w; + YLEN = h-50; + clear_all(); + } + + public static void main(String[] argv) + { + new AppletFrame("svm_toy",new svm_toy(),500,500+50); + } +} + +class AppletFrame extends Frame { + AppletFrame(String title, Applet applet, int width, int height) + { + super(title); + this.addWindowListener(new WindowAdapter() { + public void windowClosing(WindowEvent e) { + System.exit(0); + } + }); + applet.init(); + applet.setSize(width,height); + applet.start(); + this.add(applet); + this.pack(); + this.setVisible(true); + } +} diff --git a/src/service/svm_train.java b/src/service/svm_train.java new file mode 100644 index 0000000..14f7f6e --- /dev/null +++ b/src/service/svm_train.java @@ -0,0 +1,338 @@ +package service; +import libsvm.svm; +import libsvm.svm_model; +import libsvm.svm_node; +import libsvm.svm_parameter; +import libsvm.svm_print_interface; +import libsvm.svm_problem; + +import java.io.*; +import java.util.*; + +import service.*; + +public class svm_train { + private svm_parameter param; // set by parse_command_line + private svm_problem prob; // set by read_problem + private svm_model model; + private String input_file_name; // set by parse_command_line + private String model_file_name; // set by parse_command_line + private String error_msg; + private int cross_validation; + private int nr_fold; + + private static svm_print_interface svm_print_null = new svm_print_interface() + { + public void print(String s) {} + }; + + private static void exit_with_help() + { + System.out.print( + "Usage: svm_train [options] training_set_file [model_file]\n" + +"options:\n" + +"-s svm_type : set type of SVM (default 0)\n" + +" 0 -- C-SVC (multi-class classification)\n" + +" 1 -- nu-SVC (multi-class classification)\n" + +" 2 -- one-class SVM\n" + +" 3 -- epsilon-SVR (regression)\n" + +" 4 -- nu-SVR (regression)\n" + +"-t kernel_type : set type of kernel function (default 2)\n" + +" 0 -- linear: u'*v\n" + +" 1 -- polynomial: (gamma*u'*v + coef0)^degree\n" + +" 2 -- radial basis function: exp(-gamma*|u-v|^2)\n" + +" 3 -- sigmoid: tanh(gamma*u'*v + coef0)\n" + +" 4 -- precomputed kernel (kernel values in training_set_file)\n" + +"-d degree : set degree in kernel function (default 3)\n" + +"-g gamma : set gamma in kernel function (default 1/num_features)\n" + +"-r coef0 : set coef0 in kernel function (default 0)\n" + +"-c cost : set the parameter C of C-SVC, epsilon-SVR, and nu-SVR (default 1)\n" + +"-n nu : set the parameter nu 
of nu-SVC, one-class SVM, and nu-SVR (default 0.5)\n" + +"-p epsilon : set the epsilon in loss function of epsilon-SVR (default 0.1)\n" + +"-m cachesize : set cache memory size in MB (default 100)\n" + +"-e epsilon : set tolerance of termination criterion (default 0.001)\n" + +"-h shrinking : whether to use the shrinking heuristics, 0 or 1 (default 1)\n" + +"-b probability_estimates : whether to train a SVC or SVR model for probability estimates, 0 or 1 (default 0)\n" + +"-wi weight : set the parameter C of class i to weight*C, for C-SVC (default 1)\n" + +"-v n : n-fold cross validation mode\n" + +"-q : quiet mode (no outputs)\n" + ); + System.exit(1); + } + + private void do_cross_validation() + { + int i; + int total_correct = 0; + double total_error = 0; + double sumv = 0, sumy = 0, sumvv = 0, sumyy = 0, sumvy = 0; + double[] target = new double[prob.l]; + + svm.svm_cross_validation(prob,param,nr_fold,target); + if(param.svm_type == svm_parameter.EPSILON_SVR || + param.svm_type == svm_parameter.NU_SVR) + { + for(i=0;i=argv.length) + exit_with_help(); + switch(argv[i-1].charAt(1)) + { + case 's': + param.svm_type = atoi(argv[i]); + break; + case 't': + param.kernel_type = atoi(argv[i]); + break; + case 'd': + param.degree = atoi(argv[i]); + break; + case 'g': + param.gamma = atof(argv[i]); + break; + case 'r': + param.coef0 = atof(argv[i]); + break; + case 'n': + param.nu = atof(argv[i]); + break; + case 'm': + param.cache_size = atof(argv[i]); + break; + case 'c': + param.C = atof(argv[i]); + break; + case 'e': + param.eps = atof(argv[i]); + break; + case 'p': + param.p = atof(argv[i]); + break; + case 'h': + param.shrinking = atoi(argv[i]); + break; + case 'b': + param.probability = atoi(argv[i]); + break; + case 'q': + print_func = svm_print_null; + i--; + break; + case 'v': + cross_validation = 1; + nr_fold = atoi(argv[i]); + if(nr_fold < 2) + { + System.err.print("n-fold cross validation: n must >= 2\n"); + exit_with_help(); + } + break; + case 'w': + ++param.nr_weight; + { + int[] old = param.weight_label; + param.weight_label = new int[param.nr_weight]; + System.arraycopy(old,0,param.weight_label,0,param.nr_weight-1); + } + + { + double[] old = param.weight; + param.weight = new double[param.nr_weight]; + System.arraycopy(old,0,param.weight,0,param.nr_weight-1); + } + + param.weight_label[param.nr_weight-1] = atoi(argv[i-1].substring(2)); + param.weight[param.nr_weight-1] = atof(argv[i]); + break; + default: + System.err.print("Unknown option: " + argv[i-1] + "\n"); + exit_with_help(); + } + } + + svm.svm_set_print_string_function(print_func); + + // determine filenames + + if(i>=argv.length) + exit_with_help(); + + input_file_name = argv[i]; + + if(i vy = new Vector(); + Vector vx = new Vector(); + int max_index = 0; + + while(true) + { + String line = fp.readLine(); + if(line == null) break; + + StringTokenizer st = new StringTokenizer(line," \t\n\r\f:"); + vy.addElement(atof(st.nextToken())); + int m = st.countTokens()/2; + svm_node[] x = new svm_node[m]; + for(int j=0;j0) max_index = Math.max(max_index, x[m-1].index); + + /*String[] ss = line.split("\t"); + vy.addElement(Double.valueOf(ss[0])); + svm_node[] x = new svm_node[ss.length-1]; + for(int j=0;j 0) + param.gamma = 1.0/max_index; + + if(param.kernel_type == svm_parameter.PRECOMPUTED) + for(int i=0;i max_index) + { + System.err.print("Wrong input format: sample_serial_number out of range\n"); + System.exit(1); + } + } + + fp.close(); + } +}
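
For reference, read_problem above consumes the standard LIBSVM text format: one instance per line, a label followed by ascending index:value pairs, with zero-valued features omitted. A minimal sketch of how one such line maps onto the svm_node/svm_problem structures (the class name FormatSketch and the literal values are illustrative only):

    import libsvm.svm_node;
    import libsvm.svm_problem;

    public class FormatSketch {
        public static void main(String[] args) {
            // The training line "1 3:0.5 7:1.2" (label 1, features 3 and 7) becomes:
            svm_node[] x = new svm_node[2];
            x[0] = new svm_node(); x[0].index = 3; x[0].value = 0.5;
            x[1] = new svm_node(); x[1].index = 7; x[1].value = 1.2; // indices ascending, zeros omitted

            svm_problem prob = new svm_problem();
            prob.l = 1;                    // number of instances
            prob.y = new double[] { 1.0 }; // labels (or regression targets for SVR)
            prob.x = new svm_node[][] { x };
        }
    }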
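svm_scale's output() applies a per-feature linear map from the observed [min,max] of pass 2 onto [lower,upper] (default [-1,1]), pinning the endpoints exactly. A small sketch of that formula (ScaleSketch and the numbers are illustrative):

    public class ScaleSketch {
        static double scale(double v, double min, double max, double lower, double upper) {
            if (min == max) return v;   // single-valued attribute: svm_scale skips it
            if (v == min) return lower; // endpoints are pinned exactly
            if (v == max) return upper;
            return lower + (upper - lower) * (v - min) / (max - min);
        }
        public static void main(String[] args) {
            System.out.println(scale(5.0, 0.0, 10.0, -1.0, 1.0));  // 0.0
            System.out.println(scale(10.0, 0.0, 10.0, -1.0, 1.0)); // 1.0
        }
    }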
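Training and prediction can also be driven programmatically instead of through the svm_train/svm_predict mains. A minimal sketch against the same API (the class name, the output file "model.txt", and the parameter values are assumptions; the values mirror svm_train's documented defaults):

    import java.io.IOException;
    import libsvm.*;

    public class TrainPredictSketch {
        public static double trainAndPredict(svm_problem prob, svm_node[] query) throws IOException {
            svm_parameter param = new svm_parameter();
            param.svm_type = svm_parameter.C_SVC;
            param.kernel_type = svm_parameter.LINEAR;
            param.C = 1; param.eps = 1e-3; param.cache_size = 100; // svm_train defaults
            param.degree = 3; param.gamma = 0; param.coef0 = 0;    // unused by the linear kernel
            param.nu = 0.5; param.p = 0.1;
            param.shrinking = 1; param.probability = 0;
            param.nr_weight = 0; param.weight_label = new int[0]; param.weight = new double[0];

            String err = svm.svm_check_parameter(prob, param); // null when the setup is valid
            if (err != null) throw new IllegalArgumentException(err);

            svm_model model = svm.svm_train(prob, param);
            svm.svm_save_model("model.txt", model); // same text format as svm_save_model above
            return svm.svm_predict(model, query);   // predicted class label
        }
    }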
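Finally, sigmoid_train/sigmoid_predict implement Platt scaling: a decision value f is mapped to P(y=1|f) = 1/(1+exp(A*f+B)), where A and B are fitted by the Newton iteration in sigmoid_train. A small numeric check (PlattSketch is illustrative; A and B normally come from sigmoid_train):

    public class PlattSketch {
        // Same numerically safe form as sigmoid_predict: only exponentiate
        // a non-positive argument.
        static double sigmoid(double f, double A, double B) {
            double fApB = f * A + B;
            if (fApB >= 0) return Math.exp(-fApB) / (1.0 + Math.exp(-fApB));
            else return 1.0 / (1.0 + Math.exp(fApB));
        }
        public static void main(String[] args) {
            // With A = -1, B = 0 this reduces to the standard logistic function:
            System.out.println(sigmoid(0.0, -1.0, 0.0)); // 0.5
            System.out.println(sigmoid(2.0, -1.0, 0.0)); // ~0.88
        }
    }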