diff --git a/.gitignore b/.gitignore
index ab54e98..68fcec4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
/bin
/target
.Rproj.user
+.idea/
+*.iml
+
diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml
new file mode 100644
index 0000000..21f6b75
--- /dev/null
+++ b/.idea/inspectionProfiles/Project_Default.xml
@@ -0,0 +1,418 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
new file mode 100644
index 0000000..8c1e55e
--- /dev/null
+++ b/.idea/inspectionProfiles/profiles_settings.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/libraries/Maven__com_goldmansachs_gs_collections_6_2_0.xml b/.idea/libraries/Maven__com_goldmansachs_gs_collections_6_2_0.xml
new file mode 100644
index 0000000..ef054b6
--- /dev/null
+++ b/.idea/libraries/Maven__com_goldmansachs_gs_collections_6_2_0.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/libraries/Maven__com_goldmansachs_gs_collections_api_6_2_0.xml b/.idea/libraries/Maven__com_goldmansachs_gs_collections_api_6_2_0.xml
new file mode 100644
index 0000000..189b0a3
--- /dev/null
+++ b/.idea/libraries/Maven__com_goldmansachs_gs_collections_api_6_2_0.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/libraries/Maven__joda_time_joda_time_2_8_1.xml b/.idea/libraries/Maven__joda_time_joda_time_2_8_1.xml
new file mode 100644
index 0000000..91ce913
--- /dev/null
+++ b/.idea/libraries/Maven__joda_time_joda_time_2_8_1.xml
@@ -0,0 +1,14 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/pom.xml b/pom.xml
index 5646b37..7ec845e 100644
--- a/pom.xml
+++ b/pom.xml
@@ -55,8 +55,8 @@
maven-compiler-plugin
3.3
- 1.7
- 1.7
+ 1.8
+ 1.8
@@ -190,7 +190,7 @@
maven-assembly-plugin
- 2.5.4
+ 2.5.3
jar-with-dependencies
@@ -219,25 +219,30 @@
+
+ com.goldmansachs
+ gs-collections
+ 6.2.0
+
joda-time
joda-time
- 2.7
+ 2.8.1
org.slf4j
slf4j-api
- 1.7.10
+ 1.7.12
ch.qos.logback
logback-classic
- 1.1.2
+ 1.1.3
ch.qos.logback
logback-core
- 1.1.2
+ 1.1.3
junit
diff --git a/src/main/java/net/seninp/gi/GrammarRuleRecord.java b/src/main/java/net/seninp/gi/GrammarRuleRecord.java
index bc0616c..9ae44d5 100644
--- a/src/main/java/net/seninp/gi/GrammarRuleRecord.java
+++ b/src/main/java/net/seninp/gi/GrammarRuleRecord.java
@@ -1,5 +1,7 @@
package net.seninp.gi;
+import com.gs.collections.impl.set.mutable.primitive.IntHashSet;
+
import java.util.ArrayList;
import java.util.Arrays;
@@ -22,7 +24,7 @@ public class GrammarRuleRecord {
private String expandedRuleString;
/* The indexes at which the rule occurs in the discretized time series. */
- private ArrayList timeSeriesOccurrenceIndexes = new ArrayList();
+ private IntHashSet timeSeriesOccurrenceIndexes = new IntHashSet();
/* This rule intervals on the original time series. */
private ArrayList ruleIntervals;
@@ -40,7 +42,7 @@ public class GrammarRuleRecord {
private int maxLength;
/* The rule mean length - i.e. mean value of all subsequences corresponding to the rule. */
- private Integer meanLength;
+ private int meanLength;
/* The rule mean period - i.e. the mean length of intra-rule intervals. */
private double period;
@@ -58,7 +60,7 @@ public int ruleNumber() {
return ruleNumber;
}
- public Integer getMeanLength() {
+ public int getMeanLength() {
return meanLength;
}
@@ -123,19 +125,21 @@ public void setExpandedRuleString(String expandedRuleString) {
}
public String occurrencesToString() {
- return Arrays.toString(this.timeSeriesOccurrenceIndexes
- .toArray(new Integer[this.timeSeriesOccurrenceIndexes.size()]));
+// return Arrays.toString(this.timeSeriesOccurrenceIndexes
+// .toArray(new Integer[this.timeSeriesOccurrenceIndexes.size()]));
+ return timeSeriesOccurrenceIndexes.toString();
}
- public ArrayList getOccurrences() {
+ public IntHashSet getOccurrences() {
return this.timeSeriesOccurrenceIndexes;
}
- public void setOccurrences(int[] indexes) {
- this.timeSeriesOccurrenceIndexes = new ArrayList();
- for (Integer idx : indexes) {
- this.timeSeriesOccurrenceIndexes.add(idx);
- }
+ public void setOccurrences(IntHashSet indexes) {
+ this.timeSeriesOccurrenceIndexes = indexes;
+// this.timeSeriesOccurrenceIndexes = new ArrayList();
+// for (Integer idx : indexes.) {
+// this.timeSeriesOccurrenceIndexes.add(idx);
+// }
}
public double getPeriod() {
diff --git a/src/main/java/net/seninp/gi/GrammarRules.java b/src/main/java/net/seninp/gi/GrammarRules.java
index c081b32..29db2e3 100644
--- a/src/main/java/net/seninp/gi/GrammarRules.java
+++ b/src/main/java/net/seninp/gi/GrammarRules.java
@@ -1,16 +1,24 @@
package net.seninp.gi;
+import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
+
import java.util.Iterator;
-import java.util.SortedMap;
-import java.util.TreeMap;
+import java.util.stream.Collectors;
public class GrammarRules implements Iterable {
- private SortedMap rules;
+ //private SortedMap rules;
+
+ final IntObjectHashMap rules = new IntObjectHashMap();
public GrammarRules() {
super();
- this.rules = new TreeMap();
+ //this.rules = new TreeMap();
+ }
+
+ @Override
+ public String toString() {
+ return rules.values().stream().map(x -> x.toString()).collect(Collectors.joining(", ")).toString();
}
public void addRule(GrammarRuleRecord arrRule) {
diff --git a/src/main/java/net/seninp/gi/Interval.java b/src/main/java/net/seninp/gi/Interval.java
index 4f3f29e..98d1b1a 100644
--- a/src/main/java/net/seninp/gi/Interval.java
+++ b/src/main/java/net/seninp/gi/Interval.java
@@ -8,9 +8,9 @@
*/
public class Interval {
- private int start;
- private int end;
- private double coverage;
+ final private int start;
+ final private int end;
+ final private double coverage;
/**
* Constructor; start inclusive, end exclusive.
@@ -29,21 +29,22 @@ public double getCoverage() {
return coverage;
}
- public void setCoverage(double coverage) {
+ /*public void setCoverage(double coverage) {
this.coverage = coverage;
}
public void setStart(int start) {
this.start = start;
}
+ public void setEnd(int end) {
+ this.end = end;
+ }
+*/
public int getStart() {
return this.start;
}
- public void setEnd(int end) {
- this.end = end;
- }
public int getEnd() {
return this.end;
diff --git a/src/main/java/net/seninp/gi/repair/DigramFrequencies.java b/src/main/java/net/seninp/gi/repair/DigramFrequencies.java
index e20c160..993de4a 100644
--- a/src/main/java/net/seninp/gi/repair/DigramFrequencies.java
+++ b/src/main/java/net/seninp/gi/repair/DigramFrequencies.java
@@ -1,10 +1,11 @@
package net.seninp.gi.repair;
-import java.util.ArrayList;
-import java.util.Collections;
+import com.gs.collections.impl.list.mutable.FastList;
+import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
+
import java.util.HashMap;
-import java.util.SortedMap;
-import java.util.TreeMap;
+import java.util.LinkedHashMap;
+import java.util.List;
/**
* Implements the digram frequency queue.
@@ -15,18 +16,18 @@
public class DigramFrequencies {
/** A map of strings to digram frequencies. */
- private HashMap digramsToEntries;
+ private final LinkedHashMap digramsToEntries;
/** A map of buckets, each bucket is the frequency number pointing on the collection of entries. */
- private SortedMap> bucketsToEntries;
+ private final IntObjectHashMap> bucketsToEntries;
/**
* Constructor. Inits data structures.
*/
public DigramFrequencies() {
super();
- digramsToEntries = new HashMap();
- bucketsToEntries = new TreeMap>();
+ digramsToEntries = new LinkedHashMap();
+ bucketsToEntries = new IntObjectHashMap();
}
/**
@@ -37,9 +38,9 @@ public DigramFrequencies() {
public void put(DigramFrequencyEntry digramFrequencyEntry) {
this.digramsToEntries.put(digramFrequencyEntry.getDigram(), digramFrequencyEntry);
Integer freq = digramFrequencyEntry.getFrequency();
- ArrayList bucket = this.bucketsToEntries.get(freq);
+ List bucket = this.bucketsToEntries.get(freq);
if (null == bucket) {
- bucket = new ArrayList();
+ bucket = new FastList();
this.bucketsToEntries.put(freq, bucket);
}
bucket.add(digramFrequencyEntry);
@@ -64,20 +65,20 @@ public DigramFrequencyEntry get(String string) {
public void incrementFrequency(DigramFrequencyEntry entry, int increment) {
// findout the old bucket and remove this entry
- ArrayList oldBucket = this.bucketsToEntries.get(entry.getFrequency());
+ List oldBucket = this.bucketsToEntries.get(entry.getFrequency());
oldBucket.remove(entry);
if (oldBucket.isEmpty()) {
this.bucketsToEntries.remove(entry.getFrequency());
}
// get the increment added
- int newFreq = entry.getFrequency() + increment;
- entry.setFrequency(newFreq);
+
+ int newFreq = entry.add(increment);
// put into the new bucket
- ArrayList bucket = this.bucketsToEntries.get(newFreq);
+ List bucket = this.bucketsToEntries.get(newFreq);
if (null == bucket) {
- bucket = new ArrayList();
+ bucket = new FastList(1);
this.bucketsToEntries.put(newFreq, bucket);
}
bucket.add(entry);
@@ -97,7 +98,8 @@ public DigramFrequencyEntry getTop() {
}
else {
// by the default there are no empty buckets
- Integer maxBucket = Collections.max(bucketsToEntries.keySet());
+ int maxBucket = bucketsToEntries.keysView().max();
+ //Integer maxBucket = Collections.max(bucketsToEntries.keySet());
return bucketsToEntries.get(maxBucket).get(0);
}
}
@@ -116,7 +118,7 @@ public void remove(String digramStr) {
else {
// get its frequency and the corresponding bucket
int freq = entry.getFrequency();
- ArrayList bucket = this.bucketsToEntries.get(freq);
+ List bucket = this.bucketsToEntries.get(freq);
if (!bucket.remove(entry)) {
throw (new RuntimeException("There was an error!"));
}
diff --git a/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java b/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java
index a635199..be3252f 100644
--- a/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java
+++ b/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java
@@ -9,7 +9,7 @@
public class DigramFrequencyEntry {
/** The payload - the digram string itself. */
- private String digram;
+ private final String digram;
/** The observed frequency. */
private int frequency;
@@ -45,9 +45,9 @@ public String getDigram() {
*
* @param digram the string.
*/
- public void setDigram(String digram) {
- this.digram = digram;
- }
+// public void setDigram(String digram) {
+// this.digram = digram;
+// }
/**
* Frequency getter.
@@ -63,9 +63,9 @@ public int getFrequency() {
*
* @param frequency the new frequency value.
*/
- public void setFrequency(int frequency) {
- this.frequency = frequency;
- }
+// public void setFrequency(int frequency) {
+// this.frequency = frequency;
+// }
/**
* Get the first occurrence.
@@ -78,7 +78,7 @@ public int getFirstOccurrence() {
/**
* Set the first occurrence.
- *
+ *
* @param firstOccurrence the new value.
*/
public void setFirstOccurrence(int firstOccurrence) {
@@ -91,7 +91,7 @@ public int hashCode() {
int result = 1;
result = prime * result + ((digram == null) ? 0 : digram.hashCode());
result = prime * result + firstOccurrence;
- result = prime * result + frequency;
+ //result = prime * result + frequency;
return result;
}
@@ -99,25 +99,31 @@ public int hashCode() {
public boolean equals(Object obj) {
if (this == obj)
return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
+
+ if (!(obj instanceof DigramFrequencyEntry))
return false;
DigramFrequencyEntry other = (DigramFrequencyEntry) obj;
+
+ if (firstOccurrence != other.firstOccurrence)
+ return false;
if (digram == null) {
if (other.digram != null)
return false;
}
else if (!digram.equals(other.digram))
return false;
- if (firstOccurrence != other.firstOccurrence)
- return false;
- if (frequency != other.frequency)
- return false;
+
+ /*if (frequency != other.frequency)
+ return false;*/
return true;
}
public String toString() {
return this.digram + " " + this.frequency;
}
+
+ public int add(int increment) {
+ frequency += increment;
+ return frequency;
+ }
}
diff --git a/src/main/java/net/seninp/gi/repair/ParallelGrammarKeeper.java b/src/main/java/net/seninp/gi/repair/ParallelGrammarKeeper.java
deleted file mode 100644
index 9c16d3e..0000000
--- a/src/main/java/net/seninp/gi/repair/ParallelGrammarKeeper.java
+++ /dev/null
@@ -1,181 +0,0 @@
-package net.seninp.gi.repair;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Hashtable;
-import java.util.concurrent.atomic.AtomicInteger;
-
-/**
- * This implements a handler for the Re-Pair grammar built in parallel. This data structure is
- * responsible for enumerating rules and for tracking changes in the R0 of the grammar.
- *
- * @author psenin
- *
- */
-public class ParallelGrammarKeeper {
-
- private static final char SPACE = ' ';
- private static final char THE_R = 'R';
-
- // rule 0 gets a separate treatment, so we start from 1
- //
- protected AtomicInteger numRules = new AtomicInteger(1);
-
- // the rules table
- protected Hashtable theRules = new Hashtable();
-
- // the grammar id
- private long id;
-
- // R0 strings
- //
- protected String r0String;
- protected String r0ExpandedString;
-
- // keeps a working string of this grammar
- //
- protected ArrayList workString;
-
- /**
- * Constructor.
- *
- * @param id The handler id.
- */
- public ParallelGrammarKeeper(long id) {
- super();
- this.id = id;
- }
-
- /**
- * The id is used to keep track of parallel chunks.
- *
- * @return the current ID.
- */
- public long getId() {
- return this.id;
- }
-
- /**
- * This is used in parallel.
- *
- * @param string the string we work with in parallel.
- */
- public void setWorkString(ArrayList string) {
- this.workString = string;
- }
-
- /**
- * Set the R0 string.
- *
- * @param string the R0 string value.
- */
- public void setR0String(String string) {
- this.r0String = string;
- }
-
- /**
- * Get the expanded R0 out.
- *
- * @return the expanded R0.
- */
- public String getR0ExpandedString() {
- return this.r0ExpandedString;
- }
-
- /**
- * This adds an existing rule to this grammar. Useful in merging.
- *
- * @param r The rule. It is not yet clear how to treat rules, be careful. This will not set the
- * rule number, but it will increment the internal rule counter.
- */
- public void addExistingRule(ParallelRePairRule r) {
- r.grammarHandler = this;
- if (this.theRules.containsKey(r.ruleNumber)) {
- // we do override an existing rule
- theRules.put(r.ruleNumber, r);
- }
- else {
- // plus 1 because the rule 0 has a special treatment
- theRules.put(r.ruleNumber, r);
- numRules.set(theRules.size() + 1);
- }
- }
-
- /**
- * Expands all rules EXCEPT R0.
- */
- public void expandRules() {
- // iterate over all SAX containers
- ArrayList keys = new ArrayList(theRules.keySet());
- Collections.sort(keys);
- for (Integer key : keys) {
- ParallelRePairRule rr = theRules.get(key);
- String resultString = rr.toRuleString();
-
- int currentSearchStart = resultString.indexOf(THE_R);
- while (currentSearchStart >= 0) {
- int spaceIdx = resultString.indexOf(" ", currentSearchStart);
- // if (spaceIdx < 0) {
- // System.out.println("gotcha!");
- // }
- String ruleName = resultString.substring(currentSearchStart, spaceIdx + 1);
- Integer ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1));
-
- ParallelRePairRule rule = theRules.get(ruleId);
- if (rule != null) {
- if (rule.expandedRuleString.charAt(rule.expandedRuleString.length() - 1) == ' ') {
- resultString = resultString.replaceAll(ruleName, rule.expandedRuleString);
- }
- else {
- resultString = resultString.replaceAll(ruleName, rule.expandedRuleString + SPACE);
- }
- }
-
- currentSearchStart = resultString.indexOf(THE_R, spaceIdx);
- }
-
- rr.setExpandedRule(resultString.trim());
-
- }
- }
-
- /**
- * Expands R0 specifically.
- */
- public void expandR0() {
- // string is immutable it will get copied
- String finalString = this.r0String;
- int currentSearchStart = finalString.indexOf(THE_R);
- while (currentSearchStart >= 0) {
-
- int spaceIdx = finalString.indexOf(" ", currentSearchStart + 1);
-
- String ruleName = finalString.substring(currentSearchStart, spaceIdx + 1);
- Integer ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1));
-
- ParallelRePairRule rr = theRules.get(ruleId);
- if (null == rr.expandedRuleString) {
- finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).toRuleString());
- }
- else {
- finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).expandedRuleString
- + SPACE);
- }
-
- currentSearchStart = finalString.indexOf(THE_R);
- }
- this.r0ExpandedString = finalString;
- }
-
- public String toGrammarString() {
- StringBuffer sb = new StringBuffer();
- System.out.println("R0 -> " + r0String);
- for (int i = 1; i < theRules.size(); i++) {
- ParallelRePairRule r = theRules.get(i);
- sb.append("R").append(r.ruleNumber).append(" -> ").append(r.toRuleString()).append(" : ")
- .append(r.expandedRuleString).append(", ").append(r.positions).append("\n");
- }
- return sb.toString();
- }
-
-}
diff --git a/src/main/java/net/seninp/gi/repair/RePairFactory.java b/src/main/java/net/seninp/gi/repair/RePairFactory.java
index ddc692f..17b525b 100644
--- a/src/main/java/net/seninp/gi/repair/RePairFactory.java
+++ b/src/main/java/net/seninp/gi/repair/RePairFactory.java
@@ -1,337 +1,358 @@
package net.seninp.gi.repair;
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.Logger;
+import net.seninp.jmotif.sax.datastructures.SAXRecord;
+import net.seninp.jmotif.sax.datastructures.SAXRecords;
+import org.slf4j.LoggerFactory;
+
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
-import net.seninp.jmotif.sax.datastructures.SAXRecord;
-import net.seninp.jmotif.sax.datastructures.SAXRecords;
-import org.slf4j.LoggerFactory;
-import ch.qos.logback.classic.Level;
-import ch.qos.logback.classic.Logger;
/**
* Implements RePair.
- *
+ *
* @author psenin
- *
*/
public final class RePairFactory {
- private static final char SPACE = ' ';
-
- // logging stuff
- //
- private static Logger consoleLogger;
- private static Level LOGGING_LEVEL = Level.WARN;
- static {
- consoleLogger = (Logger) LoggerFactory.getLogger(RePairFactory.class);
- consoleLogger.setLevel(LOGGING_LEVEL);
- }
-
- /**
- * Disable constructor.
- */
- private RePairFactory() {
- assert true;
- }
-
- /**
- * Builds a repair grammar given a set of SAX records.
- *
- * @param saxRecords the records to process.
- *
- * @return the grammar.
- */
- public static RePairGrammar buildGrammar(SAXRecords saxRecords) {
-
- consoleLogger.debug("Starting RePair with an input string of " + saxRecords.getIndexes().size()
- + " words.");
-
- RePairGrammar rg = new RePairGrammar();
-
- // get all indexes and sort them
- Set index = saxRecords.getIndexes();
- Integer[] sortedIndexes = index.toArray(new Integer[index.size()]);
- Arrays.sort(sortedIndexes);
-
- // two data structures
- //
- // 1.0. - the string
- ArrayList string = new ArrayList();
- // LinkedList string = new LinkedList();
+ private static final char SPACE = ' ';
+ // logging stuff
//
- // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index
- DigramFrequencies digramFrequencies = new DigramFrequencies();
-
- // build data structures
- int stringPositionCounter = 0;
- for (Integer saxWordPosition : sortedIndexes) {
- // i is the index of a symbol in the input discretized string
- // counter is the index in the grammar rule R0 string
- SAXRecord r = saxRecords.getByIndex(saxWordPosition);
- RePairSymbol symbol = new RePairSymbol(r, stringPositionCounter);
- // put it into the string
- string.add(symbol);
- // and into the index
- // take care about digram frequencies
- if (stringPositionCounter > 0) {
-
- StringBuffer digramStr = new StringBuffer();
- digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE)
- .append(string.get(stringPositionCounter).toString());
-
- DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString());
- if (null == entry) {
- digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1,
- stringPositionCounter - 1));
- }
- else {
- digramFrequencies.incrementFrequency(entry, 1);
- }
- }
- // go on
- stringPositionCounter++;
+ private static Logger consoleLogger;
+ private static Level LOGGING_LEVEL = Level.DEBUG;
+
+ static {
+ consoleLogger = (Logger) LoggerFactory.getLogger(RePairFactory.class);
+ consoleLogger.setLevel(LOGGING_LEVEL);
}
- consoleLogger.debug("String length " + string.size() + " unique digrams "
- + digramFrequencies.size());
+ /**
+ * Disable constructor.
+ */
+ private RePairFactory() {
+ assert true;
+ }
- DigramFrequencyEntry entry;
- while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() >= 2) {
+ /**
+ * Builds a repair grammar given a set of SAX records.
+ *
+ * @param saxRecords the records to process.
+ * @return the grammar.
+ */
+ public static RePairGrammar buildGrammar(SAXRecords saxRecords) {
+
+ consoleLogger.debug("Starting RePair with an input string of " + saxRecords.getIndexes().size()
+ + " words.");
+
+ RePairGrammar rg = new RePairGrammar();
+
+ // get all indexes and sort them
+ Set index = saxRecords.getIndexes();
+ Integer[] sortedIndexes = index.toArray(new Integer[index.size()]);
+ Arrays.sort(sortedIndexes);
+
+ // two data structures
+ //
+ // 1.0. - the string
+ ArrayList string = new ArrayList();
+ // LinkedList string = new LinkedList();
+
+ //
+ // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index
+ DigramFrequencies digramFrequencies = new DigramFrequencies();
+
+ // build data structures
+ int stringPositionCounter = 0;
+ for (Integer saxWordPosition : sortedIndexes) {
+ // saxWordPosition is the index of a symbol in the input discretized string
+ // stringPositionCounter is the index in the grammar rule R0 string
+ SAXRecord r = saxRecords.getByIndex(saxWordPosition);
+ RePairSymbol symbol = new RePairSymbol(r, stringPositionCounter);
+ // put it into the string
+ string.add(symbol);
+ // and into the index
+ // take care about digram frequencies
+ if (stringPositionCounter > 0) {
+
+ StringBuffer digramStr = new StringBuffer();
+ digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE)
+ .append(string.get(stringPositionCounter).toString());
+
+ DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString());
+ if (null == entry) {
+ digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1,
+ stringPositionCounter - 1));
+ } else {
+ digramFrequencies.incrementFrequency(entry, 1);
+ }
+ }
+ // go on
+ stringPositionCounter++;
+ }
- // take the most frequent rule
- //
- // Entry entry = entries.get(0);
- // DigramFrequencyEntry entry = digramFrequencies.getTop();
+ consoleLogger.debug("String length " + string.size() + " unique digrams "
+ + digramFrequencies.size());
+ DigramFrequencyEntry entry;
+ while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() >= 2) {
+
+ // take the most frequent rule
+ //
+ // Entry entry = entries.get(0);
+ // DigramFrequencyEntry entry = digramFrequencies.getTop();
+
+ /*
consoleLogger.info("re-pair iteration, digram \"" + entry.getDigram() + "\", frequency: "
+ entry.getFrequency());
+
consoleLogger.debug("Going to substitute the digram " + entry.getDigram()
+ " first occurring at position " + entry.getFirstOccurrence() + " with frequency "
+ entry.getFrequency() + ", '" + string.get(entry.getFirstOccurrence()) + SPACE
+ string.get(entry.getFirstOccurrence() + 1) + "'");
-
- // create new rule
- //
- RePairRule r = new RePairRule(rg);
- r.setFirst(string.get(entry.getFirstOccurrence()));
- r.setSecond(string.get(entry.getFirstOccurrence() + 1));
- r.assignLevel();
-
- // substitute each digram entry with a rule
- //
- String digramToSubstitute = entry.getDigram();
- int currentIndex = entry.getFirstOccurrence();
- while (currentIndex < string.size() - 1) {
-
- StringBuffer currentDigram = new StringBuffer();
- currentDigram.append(string.get(currentIndex).toString()).append(SPACE)
- .append(string.get(currentIndex + 1).toString());
-
- if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) {
+ */
+
+ // create new rule
+ //
+ RePairRule r = new RePairRule(rg,
+ string.get(entry.getFirstOccurrence()),
+ string.get(entry.getFirstOccurrence() + 1)
+ );
+ r.assignLevel();
+
+ // substitute each digram entry with a rule
+ //
+ String digramToSubstitute = entry.getDigram();
+ int currentIndex = entry.getFirstOccurrence();
+ while (currentIndex < string.size() - 1) {
+
+ StringBuffer currentDigram = new StringBuffer();
+ currentDigram.append(string.get(currentIndex).toString()).append(SPACE)
+ .append(string.get(currentIndex + 1).toString());
+
+ if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) {
+ /*
consoleLogger.debug(" next digram occurrence is at " + currentIndex + ", '"
+ string.get(currentIndex) + SPACE + string.get(currentIndex + 1) + "'");
-
- // correct entries at left and right
- if (currentIndex > 0) {
- // taking care about immediate neighbor
- removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies);
- }
- if (currentIndex < string.size() - 2) {
- removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies);
- }
-
- // create the new guard to insert
- RePairGuard g = new RePairGuard(r);
- g.setStringPosition(string.get(currentIndex).getStringPosition());
- r.addOccurrence(string.get(currentIndex).getStringPosition());
- substituteDigramAt(rg, currentIndex, g, string, digramFrequencies);
-
- }
- currentIndex++;
- }
-
- // // sort the entries of digram table by the size of indexes
- // entries = new ArrayList>();
- // entries.addAll(digramFrequencies.entrySet());
- // Collections.sort(entries, new Comparator>() {
- // @Override
- // public int compare(Entry o1, Entry o2) {
- // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0]));
- // }
- // });
-
+ */
+
+ // correct entries at left and right
+ if (currentIndex > 0) {
+ // taking care about immediate neighbor
+ removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies);
+ }
+ if (currentIndex < string.size() - 2) {
+ removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies);
+ }
+
+ // create the new guard to insert
+ RePairGuard g = new RePairGuard(r);
+ g.setStringPosition(string.get(currentIndex).getStringPosition());
+ r.addOccurrence(string.get(currentIndex).getStringPosition());
+ substituteDigramAt(rg, currentIndex, g, string, digramFrequencies);
+
+ }
+ currentIndex++;
+ }
+
+ // // sort the entries of digram table by the size of indexes
+ // entries = new ArrayList>();
+ // entries.addAll(digramFrequencies.entrySet());
+ // Collections.sort(entries, new Comparator>() {
+ // @Override
+ // public int compare(Entry o1, Entry o2) {
+ // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0]));
+ // }
+ // });
+
+/*
consoleLogger.debug("*** iteration finished, top count "
+ digramFrequencies.getTop().getFrequency());
- }
-
- rg.setR0String(stringToDisplay(string));
-
- return rg;
- }
-
- /**
- * Builds a grammar given a string of terminals delimeted by space.
- *
- * @param inputString the input string.
- * @return the RePair grammar.
- */
- public static RePairGrammar buildGrammar(String inputString) {
-
- // consoleLogger.debug("Starting RePair with an input string of " +
- // saxRecords.getIndexes().size()
- // + " words.");
-
- RePairGrammar rg = new RePairGrammar();
-
- // two data structures
- //
- // 1.0. - the string
- ArrayList string = new ArrayList();
- // LinkedList string = new LinkedList();
-
- //
- // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index
- DigramFrequencies digramFrequencies = new DigramFrequencies();
-
- // build data structures
- // tokenize the input string
- //
- StringTokenizer st = new StringTokenizer(inputString, " ");
-
- int stringPositionCounter = 0;
-
- // while there are tokens
- while (st.hasMoreTokens()) {
-
- String token = st.nextToken();
-
- RePairSymbol symbol = new RePairSymbol(token, stringPositionCounter);
- // put it into the string
- string.add(symbol);
- // and into the index
- // take care about digram frequencies
- if (stringPositionCounter > 0) {
+ */
- StringBuffer digramStr = new StringBuffer();
- digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE)
- .append(string.get(stringPositionCounter).toString());
-
- DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString());
- if (null == entry) {
- digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1,
- stringPositionCounter - 1));
- }
- else {
- digramFrequencies.incrementFrequency(entry, 1);
}
- }
- // go on
- stringPositionCounter++;
- }
-
- consoleLogger.debug("String length " + string.size() + " unique digrams "
- + digramFrequencies.size());
-
- DigramFrequencyEntry entry;
- while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() > 1) {
-
- // take the most frequent rule
- //
- // Entry entry = entries.get(0);
- // DigramFrequencyEntry entry = digramFrequencies.getTop();
- consoleLogger.info("re-pair iteration, digram \"" + entry.getDigram() + "\", frequency: "
- + entry.getFrequency());
-
- consoleLogger.debug("Going to substitute the digram " + entry.getDigram()
- + " first occurring at position " + entry.getFirstOccurrence() + " with frequency "
- + entry.getFrequency() + ", '" + string.get(entry.getFirstOccurrence()) + SPACE
- + string.get(entry.getFirstOccurrence() + 1) + "'");
+ rg.setR0String(stringToDisplay(string));
- // create new rule
- //
- RePairRule r = new RePairRule(rg);
- r.setFirst(string.get(entry.getFirstOccurrence()));
- r.setSecond(string.get(entry.getFirstOccurrence() + 1));
- r.assignLevel();
+ return rg;
+ }
- // substitute each digram entry with a rule
- //
- String digramToSubstitute = entry.getDigram();
- int currentIndex = entry.getFirstOccurrence();
- while (currentIndex < string.size() - 1) {
+ /**
+ * Builds a grammar given a string of terminals delimited by spaces.
+ *
+ * @param inputString the input string.
+ * @return the RePair grammar.
+ */
+ public static RePairGrammar buildGrammar(String inputString) {
+
+ // consoleLogger.debug("Starting RePair with an input string of " +
+ // saxRecords.getIndexes().size()
+ // + " words.");
+
+ RePairGrammar rg = new RePairGrammar();
+
+ // two data structures
+ //
+ // 1.0. - the string
+ ArrayList string = new ArrayList();
+ // LinkedList string = new LinkedList();
+
+ //
+ // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index
+ DigramFrequencies digramFrequencies = new DigramFrequencies();
+
+ // build data structures
+ // tokenize the input string
+ //
+ StringTokenizer st = new StringTokenizer(inputString, " ");
+
+ int stringPositionCounter = 0;
+
+ // while there are tokens
+ while (st.hasMoreTokens()) {
+
+ String token = st.nextToken();
+
+ RePairSymbol symbol = new RePairSymbol(token, stringPositionCounter);
+ // put it into the string
+ string.add(symbol);
+ // and into the index
+ // take care about digram frequencies
+ if (stringPositionCounter > 0) {
+
+ StringBuffer digramStr = new StringBuffer();
+ digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE)
+ .append(string.get(stringPositionCounter).toString());
+
+ DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString());
+ if (null == entry) {
+ digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1,
+ stringPositionCounter - 1));
+ } else {
+ digramFrequencies.incrementFrequency(entry, 1);
+ }
+ }
+ // go on
+ stringPositionCounter++;
+ }
- StringBuffer currentDigram = new StringBuffer();
- currentDigram.append(string.get(currentIndex).toString()).append(SPACE)
- .append(string.get(currentIndex + 1).toString());
+ consoleLogger.debug("String length " + string.size() + " unique digrams "
+ + digramFrequencies.size());
+
+ DigramFrequencyEntry entry;
+ while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() > 1) {
+
+ // take the most frequent rule
+ //
+ // Entry entry = entries.get(0);
+ // DigramFrequencyEntry entry = digramFrequencies.getTop();
+
+ /*
+ consoleLogger.info("re-pair iteration, digram \"" + entry.getDigram() + "\", frequency: "
+ + entry.getFrequency());
+
+ consoleLogger.debug("Going to substitute the digram " + entry.getDigram()
+ + " first occurring at position " + entry.getFirstOccurrence() + " with frequency "
+ + entry.getFrequency() + ", '" + string.get(entry.getFirstOccurrence()) + SPACE
+ + string.get(entry.getFirstOccurrence() + 1) + "'");
+ */
+
+ // create new rule
+ //
+ RePairRule r = new RePairRule(rg, string.get(entry.getFirstOccurrence()), string.get(entry.getFirstOccurrence() + 1));
+ r.setFirst(string.get(entry.getFirstOccurrence()));
+ r.setSecond(string.get(entry.getFirstOccurrence() + 1));
+ r.assignLevel();
+
+ // substitute each digram entry with a rule
+ //
+ String digramToSubstitute = entry.getDigram();
+ int currentIndex = entry.getFirstOccurrence();
+
+ StringBuffer currentDigram = new StringBuffer();
+ while (currentIndex < string.size() - 1) {
+
+
+ currentDigram.setLength(0);
+
+ currentDigram.append(string.get(currentIndex)).append(SPACE)
+ .append(string.get(currentIndex + 1));
+
+ //if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) {
+ if (digramToSubstitute.equals(currentDigram.toString())) {
+ /*consoleLogger.debug(" next digram occurrence is at " + currentIndex + ", '"
+ + string.get(currentIndex) + SPACE + string.get(currentIndex + 1) + "'");*/
+
+ // correct entries at left and right
+ if (currentIndex > 0) {
+ // taking care about immediate neighbor
+ removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies);
+ }
+ if (currentIndex < string.size() - 2) {
+ removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies);
+ }
+
+ // create the new guard to insert
+ RePairGuard g = new RePairGuard(r);
+ g.setStringPosition(string.get(currentIndex).getStringPosition());
+ r.addOccurrence(string.get(currentIndex).getStringPosition());
+ substituteDigramAt(rg, currentIndex, g, string, digramFrequencies);
+
+ }
+ currentIndex++;
+ }
+
+ // // sort the entries of digram table by the size of indexes
+ // entries = new ArrayList>();
+ // entries.addAll(digramFrequencies.entrySet());
+ // Collections.sort(entries, new Comparator>() {
+ // @Override
+ // public int compare(Entry o1, Entry o2) {
+ // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0]));
+ // }
+ // });
+
+ consoleLogger.debug("*** iteration finished, top count "
+ + digramFrequencies.getTop().getFrequency());
+ }
- if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) {
- consoleLogger.debug(" next digram occurrence is at " + currentIndex + ", '"
- + string.get(currentIndex) + SPACE + string.get(currentIndex + 1) + "'");
+ rg.setR0String(stringToDisplay(string));
- // correct entries at left and right
- if (currentIndex > 0) {
- // taking care about immediate neighbor
- removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies);
- }
- if (currentIndex < string.size() - 2) {
- removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies);
- }
-
- // create the new guard to insert
- RePairGuard g = new RePairGuard(r);
- g.setStringPosition(string.get(currentIndex).getStringPosition());
- r.addOccurrence(string.get(currentIndex).getStringPosition());
- substituteDigramAt(rg, currentIndex, g, string, digramFrequencies);
+ rg.expandRules();
- }
- currentIndex++;
- }
-
- // // sort the entries of digram table by the size of indexes
- // entries = new ArrayList>();
- // entries.addAll(digramFrequencies.entrySet());
- // Collections.sort(entries, new Comparator>() {
- // @Override
- // public int compare(Entry o1, Entry o2) {
- // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0]));
- // }
- // });
+ return rg;
- consoleLogger.debug("*** iteration finished, top count "
- + digramFrequencies.getTop().getFrequency());
}
- rg.setR0String(stringToDisplay(string));
+ /**
+ * Substitute the digram by a rule.
+ *
+ * @param currentIndex
+ * @param g
+ * @param string
+ * @param digramFrequencies
+ */
+ private static void substituteDigramAt(RePairGrammar rg, int currentIndex, RePairGuard g,
+ ArrayList string, DigramFrequencies digramFrequencies) {
- rg.expandRules();
+ // create entries for the two new digrams
+ //
- return rg;
+ final RePairSymbol digramL = string.get(currentIndex);
+ final RePairSymbol digramR = string.get(currentIndex + 1);
+ final char[] digramLkey = digramL.key();
+ final char[] digramRkey = digramR.key();
+ final StringBuffer digram = new StringBuffer(digramLkey.length+1+digramRkey.length);
+ digram.append(digramL).append(SPACE).append(digramR);
- }
- /**
- * Substitute the digram by a rule.
- *
- * @param currentIndex
- * @param g
- * @param string
- * @param digramFrequencies
- */
- private static void substituteDigramAt(RePairGrammar rg, Integer currentIndex, RePairGuard g,
- ArrayList string, DigramFrequencies digramFrequencies) {
-
- // create entry for two new digram
- //
- StringBuffer digram = new StringBuffer();
- digram.append(string.get(currentIndex).toString()).append(SPACE)
- .append(string.get(currentIndex + 1));
+ /*
consoleLogger.debug(" substituting the digram " + digram + " at " + currentIndex + " with "
+ g.toString());
@@ -341,136 +362,150 @@ private static void substituteDigramAt(RePairGrammar rg, Integer currentIndex, R
if (currentIndex < string.size() - 2) {
consoleLogger.debug(" next " + string.get(currentIndex + 2).toString());
}
+ */
+
+ // update the new left digram frequency
+ //
+ if (currentIndex > 0) {
+ StringBuffer newDigram = new StringBuffer();
+ newDigram.append(string.get(currentIndex - 1).toString()).append(SPACE).append(g.toString());
+
+ //consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString());
+
+ final String nds = newDigram.toString();
+ DigramFrequencyEntry entry = digramFrequencies.get(nds);
+ if (null == entry) {
+ digramFrequencies.put(new DigramFrequencyEntry(nds, 1, currentIndex - 1));
+ } else {
+ digramFrequencies.incrementFrequency(entry, 1);
+ if (currentIndex - 1 < entry.getFirstOccurrence()) {
+ entry.setFirstOccurrence(currentIndex - 1);
+ }
+ }
+ }
- // update the new left digram frequency
- //
- if (currentIndex > 0) {
- StringBuffer newDigram = new StringBuffer();
- newDigram.append(string.get(currentIndex - 1).toString()).append(SPACE).append(g.toString());
- consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString());
- DigramFrequencyEntry entry = digramFrequencies.get(newDigram.toString());
- if (null == entry) {
- digramFrequencies.put(new DigramFrequencyEntry(newDigram.toString(), 1, currentIndex - 1));
- }
- else {
- digramFrequencies.incrementFrequency(entry, 1);
- if (currentIndex - 1 < entry.getFirstOccurrence()) {
- entry.setFirstOccurrence(currentIndex - 1);
+ // update the new right digram frequency
+ //
+ if (currentIndex < string.size() - 2) {
+ StringBuffer newDigram = new StringBuffer();
+ newDigram.append(g.toString()).append(SPACE).append(string.get(currentIndex + 2));
+
+
+ //consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString());
+ final String nds = newDigram.toString();
+ DigramFrequencyEntry entry = digramFrequencies.get(nds);
+ if (null == entry) {
+ digramFrequencies.put(new DigramFrequencyEntry(nds, 1, currentIndex));
+ } else {
+ digramFrequencies.incrementFrequency(entry, 1);
+ if (currentIndex + 1 < entry.getFirstOccurrence()) {
+ entry.setFirstOccurrence(currentIndex);
+ }
+ }
}
- }
- }
- // update the new right digram frequency
- //
- if (currentIndex < string.size() - 2) {
- StringBuffer newDigram = new StringBuffer();
- newDigram.append(g.toString()).append(SPACE).append(string.get(currentIndex + 2));
- consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString());
- DigramFrequencyEntry entry = digramFrequencies.get(newDigram.toString());
- if (null == entry) {
- digramFrequencies.put(new DigramFrequencyEntry(newDigram.toString(), 1, currentIndex));
- }
- else {
- digramFrequencies.incrementFrequency(entry, 1);
- if (currentIndex + 1 < entry.getFirstOccurrence()) {
- entry.setFirstOccurrence(currentIndex);
+ // remove and substitute
+ //
+ // 1. decrease the frequency of the digram being substituted
+ //
+ //consoleLogger.debug(" updating the frequency entry for digram " + digram.toString());
+ final String ds = digram.toString();
+ DigramFrequencyEntry entry = digramFrequencies.get(ds);
+ if (1 == entry.getFrequency()) {
+ //consoleLogger.debug(" removing the frequency entry");
+ digramFrequencies.remove(ds);
+ } else {
+ /*consoleLogger.debug(" setting the frequency entry to "
+ + Integer.valueOf(entry.getFrequency() - 1));*/
+ digramFrequencies.incrementFrequency(entry, -1);
+ if (currentIndex == entry.getFirstOccurrence()) {
+ //consoleLogger.debug(" this was an index entry, finding another digram index...");
+ repairLRFreqMatch(currentIndex, string, digramLkey, digramRkey, entry);
+ }
+ }
+ // 2. substitute
+ string.set(currentIndex, g);
+ /*consoleLogger.debug(" deleting symbol " + string.get(currentIndex + 1).toString() + " at "
+ + Integer.valueOf(currentIndex + 1));*/
+ // 3. delete
+ string.remove(currentIndex + 1);
+
+ // need to take care about all the indexes
+ // as all the indexes above _currentIndex_ shall be shifted by -1
+ // NO NEED for TLinkedList string = new TLinkedList();
+ // HashMap digramFrequencies = new HashMap();
+ //
+ // traverse the string to the right decreasing indexes
+ for (Entry e : digramFrequencies.getEntries().entrySet()) {
+ final DigramFrequencyEntry eval = e.getValue();
+ int idx = eval.getFirstOccurrence();
+ if (idx >= currentIndex + 2) {
+ // consoleLogger.debug(" shifting entry for " + e.getValue().getDigram() + " from "
+ // + e.getValue().getFirstOccurrence() + " to " + Integer.valueOf(idx - 1));
+ eval.setFirstOccurrence(idx - 1);
+ }
}
- }
- }
- // remove and substitute
- //
- // 1. decrease to be substituted digram frequency
- //
- consoleLogger.debug(" updating the frequency entry for digram " + digram.toString());
- DigramFrequencyEntry entry = digramFrequencies.get(digram.toString());
- if (1 == entry.getFrequency()) {
- consoleLogger.debug(" removing the frequency entry");
- digramFrequencies.remove(digram.toString());
}
- else {
- consoleLogger.debug(" setting the frequency entry to "
- + Integer.valueOf(entry.getFrequency() - 1));
- digramFrequencies.incrementFrequency(entry, -1);
- if (currentIndex == entry.getFirstOccurrence()) {
- consoleLogger.debug(" this was an index entry, finding another digram index...");
+
+ private static void repairLRFreqMatch(int currentIndex, ArrayList string, char[] digramLkey, char[] digramRkey, DigramFrequencyEntry entry) {
for (int i = currentIndex + 1; i < string.size() - 1; i++) {
- StringBuffer cDigram = new StringBuffer();
- cDigram.append(string.get(i).toString()).append(SPACE)
- .append(string.get(i + 1).toString());
- if (digram.toString().equals(cDigram.toString())) {
- consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i);
- entry.setFirstOccurrence(i);
- break;
- }
- }
- }
- }
- // 2. substitute
- string.set(currentIndex, g);
- consoleLogger.debug(" deleting symbol " + string.get(currentIndex + 1).toString() + " at "
- + Integer.valueOf(currentIndex + 1));
- // 3. delete
- string.remove(Integer.valueOf(currentIndex + 1).intValue());
-
- // need to take care about all the indexes
- // as all the indexes above _currentIndex_ shall be shifted by -1
- // NO NEED for TLinkedList string = new TLinkedList();
- // HashMap digramFrequencies = new HashMap();
- //
- // traverse the string to the right decreasing indexes
- for (Entry e : digramFrequencies.getEntries().entrySet()) {
- int idx = e.getValue().getFirstOccurrence();
- if (idx >= currentIndex + 2) {
- // consoleLogger.debug(" shifting entry for " + e.getValue().getDigram() + " from "
- // + e.getValue().getFirstOccurrence() + " to " + Integer.valueOf(idx - 1));
- e.getValue().setFirstOccurrence(idx - 1);
- }
- }
- }
- private static void removeDigramFrequencyEntry(int index, ArrayList string,
- DigramFrequencies digramFrequencies) {
+ if (Arrays.equals(digramLkey, string.get(i).key()) &&
+ Arrays.equals(digramRkey, string.get(i + 1).key())) {
- StringBuffer digramToRemove = new StringBuffer();
- digramToRemove.append(string.get(index).toString()).append(SPACE)
- .append(string.get(index + 1).toString());
+ //cDigram.setLength(0);
+ //cDigram.append(string.get(i)).append(SPACE).append(string.get(i + 1));
- DigramFrequencyEntry digramEntry = digramFrequencies.get(digramToRemove.toString());
+ //consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i);
+ entry.setFirstOccurrence(i);
+ break;
- if (digramEntry.getFrequency() == 1) {
- digramFrequencies.remove(digramToRemove.toString());
- consoleLogger.debug(" completely removing the frequency entry for digram "
- + digramToRemove.toString() + " at position " + index);
- }
- else {
- consoleLogger.debug(" decreasing the frequency entry for digram "
- + digramToRemove.toString() + " at position " + index + " from "
- + digramEntry.getFrequency() + " to " + Integer.valueOf(digramEntry.getFrequency() - 1));
- digramFrequencies.incrementFrequency(digramEntry, -1);
- if (index == digramEntry.getFirstOccurrence()) {
- consoleLogger.debug(" this was an index entry, finding another digram index...");
- for (int i = index + 1; i < string.size() - 1; i++) {
- StringBuffer cDigram = new StringBuffer();
- cDigram.append(string.get(i).toString()).append(SPACE)
- .append(string.get(i + 1).toString());
- if (digramToRemove.toString().equals(cDigram.toString())) {
- consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i);
- digramEntry.setFirstOccurrence(i);
- break;
- }
+ }
}
- }
}
- }
+ private static void removeDigramFrequencyEntry(int index, ArrayList string,
+ DigramFrequencies digramFrequencies) {
+
+ StringBuffer digramToRemove = new StringBuffer();
+ digramToRemove.append(string.get(index)).append(SPACE)
+ .append(string.get(index + 1));
+
+ DigramFrequencyEntry digramEntry = digramFrequencies.get(digramToRemove.toString());
+
+ if (digramEntry.getFrequency() == 1) {
+ digramFrequencies.remove(digramToRemove.toString());
+ /*consoleLogger.debug(" completely removing the frequency entry for digram "
+ + digramToRemove.toString() + " at position " + index);*/
+ } else {
+ /*consoleLogger.debug(" decreasing the frequency entry for digram "
+ + digramToRemove.toString() + " at position " + index + " from "
+ + digramEntry.getFrequency() + " to " + Integer.valueOf(digramEntry.getFrequency() - 1));*/
+ digramFrequencies.incrementFrequency(digramEntry, -1);
+ if (index == digramEntry.getFirstOccurrence()) {
+ //consoleLogger.debug(" this was an index entry, finding another digram index...");
+ for (int i = index + 1; i < string.size() - 1; i++) {
+ StringBuffer cDigram = new StringBuffer();
+ cDigram.append(string.get(i).toString()).append(SPACE)
+ .append(string.get(i + 1).toString());
+ if (digramToRemove.toString().equals(cDigram.toString())) {
+ //consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i);
+ digramEntry.setFirstOccurrence(i);
+ break;
+ }
+ }
+ }
+ }
- private static String stringToDisplay(ArrayList string) {
- StringBuffer sb = new StringBuffer();
- for (int i = 0; i < string.size(); i++) {
- sb.append(string.get(i).toString()).append(SPACE);
}
- return sb.toString();
- }
+
+ private static String stringToDisplay(ArrayList string) {
+ StringBuffer sb = new StringBuffer();
+ for (int i = 0; i < string.size(); i++) {
+ sb.append(string.get(i).toString()).append(SPACE);
+ }
+ return sb.toString();
+ }
}
diff --git a/src/main/java/net/seninp/gi/repair/RePairGrammar.java b/src/main/java/net/seninp/gi/repair/RePairGrammar.java
index 900b87a..3556072 100644
--- a/src/main/java/net/seninp/gi/repair/RePairGrammar.java
+++ b/src/main/java/net/seninp/gi/repair/RePairGrammar.java
@@ -1,13 +1,16 @@
package net.seninp.gi.repair;
-import java.util.ArrayList;
-import java.util.Hashtable;
-import java.util.concurrent.atomic.AtomicInteger;
+import com.gs.collections.api.iterator.MutableIntIterator;
+import com.gs.collections.impl.set.mutable.primitive.IntHashSet;
import net.seninp.gi.GrammarRuleRecord;
import net.seninp.gi.GrammarRules;
import net.seninp.gi.RuleInterval;
import net.seninp.jmotif.sax.datastructures.SAXRecords;
+import java.util.ArrayList;
+import java.util.Hashtable;
+import java.util.concurrent.atomic.AtomicInteger;
+
/**
* A repair grammar container.
*
@@ -141,7 +144,7 @@ public GrammarRules toGrammarRulesData() {
r0.setRuleNumber(0);
r0.setRuleString(this.r0String);
r0.setExpandedRuleString(this.r0ExpandedString);
- r0.setOccurrences(new int[1]);
+ r0.setOccurrences(new IntHashSet());
res.addRule(r0);
for (RePairRule rule : theRules.values()) {
@@ -178,7 +181,11 @@ public void buildIntervals(SAXRecords records, double[] originalTimeSeries, int
// System.out.println("R" + rr.ruleNumber + ", " + rr.toRuleString() + ", "
// + rr.expandedRuleString);
String[] split = rr.expandedRuleString.split(" ");
- for (int pos : rr.getOccurrences()) {
+
+ MutableIntIterator ii = rr.getOccurrences().intIterator();
+ while (ii.hasNext()) {
+ int pos = ii.next();
+
Integer p2 = records.mapStringIndexToTSPosition(pos + split.length - 1);
if (null == p2) {
rr.ruleIntervals.add(new RuleInterval(records.mapStringIndexToTSPosition(pos),
diff --git a/src/main/java/net/seninp/gi/repair/RePairRule.java b/src/main/java/net/seninp/gi/repair/RePairRule.java
index 96b3ebc..21d1213 100644
--- a/src/main/java/net/seninp/gi/repair/RePairRule.java
+++ b/src/main/java/net/seninp/gi/repair/RePairRule.java
@@ -1,8 +1,10 @@
package net.seninp.gi.repair;
-import java.util.ArrayList;
+import com.gs.collections.impl.set.mutable.primitive.IntHashSet;
import net.seninp.gi.RuleInterval;
+import java.util.ArrayList;
+
/**
* The grammar rule.
*
@@ -35,7 +37,7 @@ public class RePairRule {
protected int level;
/** Occurrences. */
- protected ArrayList occurrences;
+ protected IntHashSet occurrences;
/** Which TS interval covered. */
protected ArrayList ruleIntervals;
@@ -46,7 +48,7 @@ public class RePairRule {
/**
* Constructor, assigns a rule ID using the global counter.
*/
- public RePairRule(RePairGrammar rg) {
+ public RePairRule(RePairGrammar rg, RePairSymbol first, RePairSymbol second) {
this.grammar = rg;
@@ -56,9 +58,11 @@ public RePairRule(RePairGrammar rg) {
rg.theRules.put(this.ruleNumber, this);
- this.occurrences = new ArrayList();
+ this.occurrences = new IntHashSet();
this.ruleIntervals = new ArrayList();
+ this.first = first; this.second = second;
+
}
/**
@@ -125,9 +129,9 @@ public String toExpandedRuleString() {
* @param value the new value.
*/
public void addOccurrence(int value) {
- if (!this.occurrences.contains(value)) {
- this.occurrences.add(value);
- }
+
+ this.occurrences.add(value);
+
}
/**
@@ -135,12 +139,13 @@ public void addOccurrence(int value) {
*
* @return all rule's occurrences.
*/
- public int[] getOccurrences() {
- int[] res = new int[this.occurrences.size()];
- for (int i = 0; i < this.occurrences.size(); i++) {
- res[i] = this.occurrences.get(i);
- }
- return res;
+ public IntHashSet getOccurrences() {
+ return occurrences;
+// int[] res = new int[this.occurrences.size()];
+// for (int i = 0; i < this.occurrences.size(); i++) {
+// res[i] = this.occurrences.get(i);
+// }
+// return res;
}
public String toString() {
diff --git a/src/main/java/net/seninp/gi/repair/RePairSymbol.java b/src/main/java/net/seninp/gi/repair/RePairSymbol.java
index c07af57..eed7556 100644
--- a/src/main/java/net/seninp/gi/repair/RePairSymbol.java
+++ b/src/main/java/net/seninp/gi/repair/RePairSymbol.java
@@ -14,19 +14,22 @@ public class RePairSymbol {
/**
* Payload.
*/
- private char[] string;
+ final private char[] string;
/**
* Position of the symbol in the string.
*/
- private Integer stringPosition;
+ private int stringPosition;
+
+ final static char[] blank = new char[0];
/**
* Constructor.
*/
public RePairSymbol() {
super();
- this.stringPosition = null;
+ this.stringPosition = -1;
+ this.string = blank;
}
/**
@@ -89,6 +92,9 @@ public int getLevel() {
return 0;
}
+
+ public char[] key() { return string; }
+
public String toString() {
return String.valueOf(this.string);
}
@@ -98,7 +104,7 @@ public int hashCode() {
final int prime = 31;
int result = 1;
result = prime * result + Arrays.hashCode(string);
- result = prime * result + ((stringPosition == null) ? 0 : stringPosition.hashCode());
+ result = prime * result + ((stringPosition == -1) ? -1 : stringPosition);
return result;
}
@@ -106,19 +112,22 @@ public int hashCode() {
public boolean equals(Object obj) {
if (this == obj)
return true;
- if (obj == null)
- return false;
- if (getClass() != obj.getClass())
+ if (!(obj instanceof RePairSymbol))
return false;
+
RePairSymbol other = (RePairSymbol) obj;
- if (!Arrays.equals(string, other.string))
- return false;
- if (stringPosition == null) {
- if (other.stringPosition != null)
+
+ if (stringPosition == -1) {
+ if (other.stringPosition != -1)
return false;
}
- else if (!stringPosition.equals(other.stringPosition))
+ else if (stringPosition!=other.stringPosition)
+ return false;
+
+ if (!Arrays.equals(string, other.string))
return false;
+
+
return true;
}
diff --git a/src/main/java/net/seninp/gi/repair/parallel/ParallelGrammarKeeper.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelGrammarKeeper.java
new file mode 100644
index 0000000..123d57c
--- /dev/null
+++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelGrammarKeeper.java
@@ -0,0 +1,188 @@
+package net.seninp.gi.repair.parallel;
+
+import com.gs.collections.api.list.MutableList;
+import com.gs.collections.api.tuple.primitive.IntObjectPair;
+import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap;
+import net.seninp.gi.repair.RePairSymbol;
+
+import java.util.ArrayList;
+import java.util.concurrent.atomic.AtomicInteger;
+
+/**
+ * This implements a handler for the Re-Pair grammar built in parallel. This data structure is
+ * responsible for enumerating rules and for tracking changes in the R0 of the grammar.
+ *
+ * @author psenin
+ */
+public class ParallelGrammarKeeper {
+
+ private static final char SPACE = ' ';
+ private static final char THE_R = 'R';
+
+ // rule 0 gets a separate treatment, so we start from 1
+ //
+ protected AtomicInteger numRules = new AtomicInteger(1);
+
+ // the rules table
+ protected IntObjectHashMap theRules = new IntObjectHashMap();
+
+ // the grammar id
+ private long id;
+
+ // R0 strings
+ //
+ protected String r0String;
+ public String r0ExpandedString;
+
+ // keeps a working string of this grammar
+ //
+ protected ArrayList workString;
+ private MutableList> keys;
+
+ /**
+ * Constructor.
+ *
+ * @param id The handler id.
+ */
+ public ParallelGrammarKeeper(long id) {
+ super();
+ this.id = id;
+ }
+
+ /**
+ * The id is used to keep track of parallel chunks.
+ *
+ * @return the current ID.
+ */
+ public long getId() {
+ return this.id;
+ }
+
+ /**
+ * This is used in parallel.
+ *
+ * @param string the string we work with in parallel.
+ */
+ public void setWorkString(ArrayList string) {
+ this.workString = string;
+ }
+
+ /**
+ * Set the R0 string.
+ *
+ * @param string the R0 string value.
+ */
+ public void setR0String(String string) {
+ this.r0String = string;
+ }
+
+ /**
+ * Get the expanded R0 out.
+ *
+ * @return the expanded R0.
+ */
+ public String getR0ExpandedString() {
+ return this.r0ExpandedString;
+ }
+
+ /**
+ * This adds an existing rule to this grammar. Useful in merging.
+ *
+ * @param r The rule. It is not yet clear how to treat rules, be careful. This will not set the
+ * rule number, but it will increment the internal rule counter.
+ */
+ public void addExistingRule(ParallelRePairRule r) {
+ r.grammarHandler = this;
+ if (this.theRules.containsKey(r.ruleNumber)) {
+ // we do override an existing rule
+ theRules.put(r.ruleNumber, r);
+ } else {
+ // plus 1 because the rule 0 has a special treatment
+ theRules.put(r.ruleNumber, r);
+ numRules.set(theRules.size() + 1);
+ }
+ }
+
+ /**
+ * Expands all rules EXCEPT R0.
+ */
+ public void expandRules() {
+ // iterate over all rules, in the sorted order produced by keys()
+ //ArrayList keys = new ArrayList(theRules.keySet());
+ //Collections.sort(keys);
+ keys = keys();
+ for (IntObjectPair key : keys) {
+ ParallelRePairRule rr = key.getTwo();
+
+
+ String resultString = rr.toRuleString();
+
+ int currentSearchStart = resultString.indexOf(THE_R);
+ while (currentSearchStart >= 0) {
+ int spaceIdx = resultString.indexOf(" ", currentSearchStart);
+ // if (spaceIdx < 0) {
+ // System.out.println("gotcha!");
+ // }
+ String ruleName = resultString.substring(currentSearchStart, spaceIdx + 1);
+ int ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1));
+
+ ParallelRePairRule rule = theRules.get(ruleId);
+ if (rule != null) {
+ if (rule.expandedRuleString.charAt(rule.expandedRuleString.length() - 1) == ' ') {
+ resultString = resultString.replaceAll(ruleName, rule.expandedRuleString);
+ } else {
+ resultString = resultString.replaceAll(ruleName, rule.expandedRuleString + SPACE);
+ }
+ }
+
+ currentSearchStart = resultString.indexOf(THE_R, spaceIdx);
+ }
+
+ rr.setExpandedRule(resultString.trim());
+
+ }
+ }
+
+ public MutableList> keys() {
+ return theRules.keyValuesView().toSortedList();
+ }
+
+ /**
+ * Expands R0 specifically.
+ */
+ public void expandR0() {
+ // Strings are immutable, so the replacements below never modify r0String itself
+ String finalString = this.r0String;
+ int currentSearchStart = finalString.indexOf(THE_R);
+ while (currentSearchStart >= 0) {
+
+ int spaceIdx = finalString.indexOf(" ", currentSearchStart + 1);
+
+ String ruleName = finalString.substring(currentSearchStart, spaceIdx + 1);
+ Integer ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1));
+
+ ParallelRePairRule rr = theRules.get(ruleId);
+ if (null == rr.expandedRuleString) {
+ finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).toRuleString());
+ } else {
+ finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).expandedRuleString
+ + SPACE);
+ }
+
+ currentSearchStart = finalString.indexOf(THE_R);
+ }
+ this.r0ExpandedString = finalString;
+ }
+
+ public String toGrammarString() {
+ StringBuffer sb = new StringBuffer();
+ System.out.println("R0 -> " + r0String);
+ for (int i = 1; i < theRules.size(); i++) {
+ ParallelRePairRule r = theRules.get(i);
+ sb.append("R").append(r.ruleNumber).append(" -> ").append(r.toRuleString()).append(" : ")
+ .append(r.expandedRuleString).append(", ").append(r.positions).append("\n");
+ }
+ return sb.toString();
+ }
+
+}
diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairGuard.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairGuard.java
similarity index 85%
rename from src/main/java/net/seninp/gi/repair/ParallelRePairGuard.java
rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairGuard.java
index dc29c97..c5503ef 100644
--- a/src/main/java/net/seninp/gi/repair/ParallelRePairGuard.java
+++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairGuard.java
@@ -1,4 +1,6 @@
-package net.seninp.gi.repair;
+package net.seninp.gi.repair.parallel;
+
+import net.seninp.gi.repair.RePairSymbol;
/**
* The guard used for non-terminals.
diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairImplementation.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairImplementation.java
similarity index 96%
rename from src/main/java/net/seninp/gi/repair/ParallelRePairImplementation.java
rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairImplementation.java
index d4afca4..46395d4 100644
--- a/src/main/java/net/seninp/gi/repair/ParallelRePairImplementation.java
+++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairImplementation.java
@@ -1,20 +1,19 @@
-package net.seninp.gi.repair;
+package net.seninp.gi.repair.parallel;
+
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.Logger;
+import com.gs.collections.api.list.MutableList;
+import com.gs.collections.api.tuple.primitive.IntObjectPair;
+import net.seninp.gi.repair.DigramFrequencies;
+import net.seninp.gi.repair.DigramFrequencyEntry;
+import net.seninp.gi.repair.RePairSymbol;
+import net.seninp.util.StackTrace;
+import org.slf4j.LoggerFactory;
import java.util.ArrayList;
-import java.util.Collections;
import java.util.HashMap;
-import java.util.Hashtable;
import java.util.Map.Entry;
-import java.util.concurrent.CompletionService;
-import java.util.concurrent.ExecutorCompletionService;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeUnit;
-import net.seninp.util.StackTrace;
-import org.slf4j.LoggerFactory;
-import ch.qos.logback.classic.Level;
-import ch.qos.logback.classic.Logger;
+import java.util.concurrent.*;
public class ParallelRePairImplementation {
@@ -64,8 +63,8 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr
// the mapping of rule ID to the rule instance
final HashMap ruleNumToRecord = new HashMap();
if (!(grammar.theRules.isEmpty())) {
- for (Entry e : grammar.theRules.entrySet()) {
- ruleNumToRecord.put(e.getKey(), e.getValue());
+ for (IntObjectPair e : grammar.keys()) {
+ ruleNumToRecord.put(e.getOne(), e.getTwo());
}
}
// the data structure which keeps R0 strings that have been returned from workers
@@ -159,7 +158,7 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr
consoleLogger.debug("job " + chunkRes.getId() + " of chunk " + chunkJobIndex
+ " has finished");
- Hashtable chunkGrammarRulesData = chunkRes.theRules;
+ //IntObjectHashMap chunkGrammarRulesData = chunkRes.theRules;
String R0String = chunkRes.r0String;
chunkStrings.put(chunkJobIndex, R0String);
@@ -178,8 +177,7 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr
// these are the rule keys, they'll be used twice
//
- ArrayList keys = new ArrayList(chunkGrammarRulesData.keySet());
- Collections.sort(keys);
+ MutableList> keys = chunkRes.keys();
// for (int i = 0; i < keys.size(); i++) {
// ParallelRePairRule r = chunkGrammarRulesData.get(keys.get(i));
@@ -193,9 +191,9 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr
// these are guaranteed to come out in order
//
- for (int i = 0; i < keys.size(); i++) {
+ for (IntObjectPair k : keys) {
- ParallelRePairRule r = chunkGrammarRulesData.get(keys.get(i));
+ ParallelRePairRule r = k.getTwo();
consoleLogger.trace("processing rule " + r.getRuleName() + " -> " + r.toRuleString()
+ " : " + r.expandedRuleString);
diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairRule.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairRule.java
similarity index 95%
rename from src/main/java/net/seninp/gi/repair/ParallelRePairRule.java
rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairRule.java
index 08b872f..03797da 100644
--- a/src/main/java/net/seninp/gi/repair/ParallelRePairRule.java
+++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairRule.java
@@ -1,4 +1,6 @@
-package net.seninp.gi.repair;
+package net.seninp.gi.repair.parallel;
+
+import net.seninp.gi.repair.RePairSymbol;
import java.util.ArrayList;
diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairWorkerSingleLevel.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairWorkerSingleLevel.java
similarity index 98%
rename from src/main/java/net/seninp/gi/repair/ParallelRePairWorkerSingleLevel.java
rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairWorkerSingleLevel.java
index 462956b..1f64c97 100644
--- a/src/main/java/net/seninp/gi/repair/ParallelRePairWorkerSingleLevel.java
+++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairWorkerSingleLevel.java
@@ -1,4 +1,8 @@
-package net.seninp.gi.repair;
+package net.seninp.gi.repair.parallel;
+
+import net.seninp.gi.repair.DigramFrequencies;
+import net.seninp.gi.repair.DigramFrequencyEntry;
+import net.seninp.gi.repair.RePairSymbol;
import java.util.ArrayList;
import java.util.Map.Entry;
diff --git a/src/main/java/net/seninp/gi/sequitur/SAXRule.java b/src/main/java/net/seninp/gi/sequitur/SAXRule.java
index a81034b..2f1d232 100644
--- a/src/main/java/net/seninp/gi/sequitur/SAXRule.java
+++ b/src/main/java/net/seninp/gi/sequitur/SAXRule.java
@@ -19,15 +19,13 @@ of the License, or (at your option) any later version.
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.Vector;
-import java.util.concurrent.atomic.AtomicInteger;
+import com.gs.collections.impl.set.mutable.primitive.IntHashSet;
import net.seninp.gi.GrammarRuleRecord;
import net.seninp.gi.GrammarRules;
+import java.util.*;
+import java.util.concurrent.atomic.AtomicInteger;
+
/**
* The Rule. Adaption of Eibe Frank code for JMotif API, see {@link sequitur.info} for original
* version.
@@ -77,7 +75,7 @@ public class SAXRule {
* This keeps rule indexes - once rule created or used, its placement position is extracted from
* the TerminalSymbol position and stored here.
*/
- protected Set indexes = new TreeSet();
+ protected IntHashSet indexes = new IntHashSet();
/**
* Constructor.
@@ -218,15 +216,19 @@ private static void expandRules() {
// }
// });
+ StringBuilder resultString = new StringBuilder(64);
+
// for (SAXMapEntry entry : recs) {
for (GrammarRuleRecord ruleRecord : arrRuleRecords) {
+
if (ruleRecord.getRuleNumber() == 0) {
continue;
}
+ resultString.setLength(0);
+
String curString = ruleRecord.getRuleString();
- StringBuilder resultString = new StringBuilder(8192);
String[] split = curString.split(" ");
@@ -240,12 +242,13 @@ private static void expandRules() {
}
// need to trim space at the very end
- String rr = resultString.delete(0, 1).append(" ").toString();
+ String rr = resultString.delete(0, 1).append(' ').toString();
ruleRecord.setExpandedRuleString(rr);
ruleRecord.setRuleYield(countSpaces(rr));
}
- StringBuilder resultString = new StringBuilder(8192);
+ //StringBuilder resultString = new StringBuilder(8192);
+ resultString.setLength(0);
GrammarRuleRecord ruleRecord = arrRuleRecords.get(0);
resultString.append(ruleRecord.getRuleString());
@@ -324,14 +327,8 @@ public void addIndex(int position) {
*
* @return all the rule occurrences.
*/
- private int[] getIndexes() {
- int[] res = new int[this.indexes.size()];
- int i = 0;
- for (Integer idx : this.indexes) {
- res[i] = idx;
- i++;
- }
- return res;
+ private IntHashSet getIndexes() {
+ return indexes;
}
/**
@@ -481,7 +478,7 @@ public static String printRules() {
}
text.append(TAB).append(arrRuleRecords.get(processedRules).getExpandedRuleString())
.append(TAB);
- text.append(Arrays.toString(currentRule.getIndexes())).append(CR);
+ text.append(currentRule.getIndexes()).append(CR);
processedRules++;
diff --git a/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java b/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java
index 0d08331..82040f6 100644
--- a/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java
+++ b/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java
@@ -19,7 +19,9 @@ of the License, or (at your option) any later version.
Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*/
+import java.util.HashMap;
import java.util.Hashtable;
+import java.util.Map;
import java.util.Map.Entry;
/**
@@ -41,10 +43,10 @@ public abstract class SAXSymbol {
private static final int prime = 2265539;
/** Hashtable to keep track of all digrams. This is static - single instance for all. */
- protected static final Hashtable theDigrams = new Hashtable(
+ protected static final Map theDigrams = new HashMap(
SAXSymbol.prime);
- public static Hashtable> theSubstituteTable = new Hashtable>(
+ public static Map> theSubstituteTable = new HashMap(
SAXSymbol.prime);
/** The symbol value. */
@@ -170,11 +172,12 @@ public boolean check() {
return false;
}
- if (!theDigrams.containsKey(this)) {
+ SAXSymbol found;
+ if ((found = theDigrams.putIfAbsent(this, this))==null) {
// System.out.println("[sequitur debug] *check...* digrams contain this (" + this.value + "~"
// + this.n.value + ")? NO. Checking in.");
// found = theDigrams.put(this, this);
- theDigrams.put(this, this);
+ //theDigrams.put(this, this);
// System.out.println(" *** Digrams now: " + makeDigramsTable());
// System.out.println("[sequitur debug] *digrams* " + hash2String());
return false;
@@ -184,7 +187,7 @@ public boolean check() {
// + this.n.value + ")? Yes. Oh-Oh...");
// well the same hash is in the store, lemme see...
- SAXSymbol found = theDigrams.get(this);
+ //found = theDigrams.get(this);
// if it's not me, then lets call match magic?
if (found.n != this) {
diff --git a/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java b/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java
index 4b4686d..487013f 100644
--- a/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java
+++ b/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java
@@ -1,14 +1,8 @@
package net.seninp.gi.sequitur;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Hashtable;
-import java.util.StringTokenizer;
-import java.util.concurrent.atomic.AtomicInteger;
+import ch.qos.logback.classic.Level;
+import ch.qos.logback.classic.Logger;
+import com.gs.collections.api.iterator.MutableIntIterator;
import net.seninp.gi.GrammarRuleRecord;
import net.seninp.gi.GrammarRules;
import net.seninp.gi.RuleInterval;
@@ -19,8 +13,16 @@
import net.seninp.jmotif.sax.alphabet.NormalAlphabet;
import net.seninp.jmotif.sax.datastructures.SAXRecords;
import org.slf4j.LoggerFactory;
-import ch.qos.logback.classic.Level;
-import ch.qos.logback.classic.Logger;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Hashtable;
+import java.util.StringTokenizer;
+import java.util.concurrent.atomic.AtomicInteger;
/**
* Sort of a stand-alone factory to digesting strings with Sequitur.
@@ -157,7 +159,7 @@ public static SAXRule runSequiturWithEditDistanceThreshold(String string, Intege
normalA.getDistanceMatrix(alphabetSize));
if (dist < threshold) {
merged = true;
- SAXSymbol.theSubstituteTable.get(str).put(token.substring(0), currentPosition);
+ SAXSymbol.theSubstituteTable.get(str).put(token, currentPosition);
token = str;
}
}
@@ -214,13 +216,16 @@ public static ArrayList getRulePositionsByRuleNum(int ruleIdx, SAX
// array of all words of this expanded rule
String[] expandedRuleSplit = ruleContainer.getExpandedRuleString().trim().split(" ");
- for (Integer currentIndex : ruleContainer.getOccurrences()) {
+ MutableIntIterator ii = ruleContainer.getOccurrences().intIterator();
+ while (ii.hasNext()) {
+
+ int currentIndex = ii.next();
// System.out.println("Index: " + currentIndex);
String extractedStr = "";
int[] extractedPositions = new int[expandedRuleSplit.length];
for (int i = 0; i < expandedRuleSplit.length; i++) {
- consoleLogger.trace("currentIndex " + currentIndex + ", i: " + i);
+ //consoleLogger.trace("currentIndex " + currentIndex + ", i: " + i);
extractedStr = extractedStr.concat(" ").concat(
String.valueOf(saxFrequencyData.getByIndex(saxWordsIndexes.get(currentIndex + i))
.getPayload()));
@@ -543,7 +548,10 @@ public static void updateRuleIntervals(GrammarRules rules, SAXRecords saxFrequen
// iterate over all occurrences of this rule
// the currentIndex here is the position of the rule in the input string
//
- for (Integer currentIndex : ruleContainer.getOccurrences()) {
+ MutableIntIterator ii = ruleContainer.getOccurrences().intIterator();
+ while (ii.hasNext()) {
+
+ int currentIndex = ii.next();
// System.out.println("Index: " + currentIndex);
// String extractedStr = "";
diff --git a/src/main/java/net/seninp/gi/performance/EvaluateParallelRePair.java b/src/main/java/net/seninp/gi/util/EvaluateParallelRePair.java
similarity index 96%
rename from src/main/java/net/seninp/gi/performance/EvaluateParallelRePair.java
rename to src/main/java/net/seninp/gi/util/EvaluateParallelRePair.java
index dc71937..09ef101 100644
--- a/src/main/java/net/seninp/gi/performance/EvaluateParallelRePair.java
+++ b/src/main/java/net/seninp/gi/util/EvaluateParallelRePair.java
@@ -1,4 +1,4 @@
-package net.seninp.gi.performance;
+package net.seninp.gi.util;
import java.io.BufferedReader;
import java.io.FileInputStream;
@@ -9,8 +9,8 @@
import java.util.ArrayList;
import java.util.Date;
import java.util.zip.GZIPInputStream;
-import net.seninp.gi.repair.ParallelGrammarKeeper;
-import net.seninp.gi.repair.ParallelRePairImplementation;
+import net.seninp.gi.repair.parallel.ParallelGrammarKeeper;
+import net.seninp.gi.repair.parallel.ParallelRePairImplementation;
import net.seninp.gi.repair.RePairFactory;
import net.seninp.gi.repair.RePairGrammar;
import net.seninp.gi.repair.RePairSymbol;
diff --git a/src/main/java/net/seninp/gi/MemoryLeakTester.java b/src/main/java/net/seninp/gi/util/MemoryLeakTester.java
similarity index 69%
rename from src/main/java/net/seninp/gi/MemoryLeakTester.java
rename to src/main/java/net/seninp/gi/util/MemoryLeakTester.java
index aff60ea..11b231e 100644
--- a/src/main/java/net/seninp/gi/MemoryLeakTester.java
+++ b/src/main/java/net/seninp/gi/util/MemoryLeakTester.java
@@ -1,5 +1,6 @@
-package net.seninp.gi;
+package net.seninp.gi.util;
+import net.seninp.gi.GrammarRules;
import net.seninp.gi.sequitur.SequiturFactory;
import net.seninp.jmotif.sax.NumerosityReductionStrategy;
import net.seninp.jmotif.sax.TSProcessor;
@@ -16,12 +17,12 @@ public class MemoryLeakTester {
public static void main(String[] args) throws Exception {
- try {
- Thread.sleep(10000); // 1000 milliseconds is one second.
- }
- catch (InterruptedException ex) {
- Thread.currentThread().interrupt();
- }
+// try {
+// Thread.sleep(10000); // 1000 milliseconds is one second.
+// }
+// catch (InterruptedException ex) {
+// Thread.currentThread().interrupt();
+// }
double[] ts = TSProcessor.readFileColumn(INPUT_FNAME, 0, 0);
System.out.println("Read " + ts.length + " points from " + INPUT_FNAME);
@@ -33,22 +34,22 @@ public static void main(String[] args) throws Exception {
Thread.currentThread().interrupt();
}
- for (int i = 0; i < 20; i++) {
+ /*for (int i = 0; i < 20; i++) {
System.out.println("Iteration " + i);
- System.gc();
+ System.gc();*/
GrammarRules g = SequiturFactory.series2SequiturRules(ts, SAX_WIN_SIZE, SAX_PAA_SIZE,
SAX_A_SIZE, NumerosityReductionStrategy.EXACT, SAX_NORM_THRESHOLD);
System.out.println("Inferred " + g.size() + " rules.");
- try {
- Thread.sleep(10000); // 1000 milliseconds is one second.
- }
- catch (InterruptedException ex) {
- Thread.currentThread().interrupt();
- }
+// try {
+// Thread.sleep(10000); // 1000 milliseconds is one second.
+// }
+// catch (InterruptedException ex) {
+// Thread.currentThread().interrupt();
+// }
- }
+ //}
}
diff --git a/src/main/java/net/seninp/gi/util/RunLive.java b/src/main/java/net/seninp/gi/util/RunLive.java
new file mode 100644
index 0000000..e7b04c4
--- /dev/null
+++ b/src/main/java/net/seninp/gi/util/RunLive.java
@@ -0,0 +1,21 @@
+package net.seninp.gi.util;
+
+import net.seninp.gi.repair.RePairFactory;
+import net.seninp.gi.repair.RePairGrammar;
+
+import java.util.Scanner;
+
+/**
+ * Interactive driver: reads lines from stdin until EOF and prints the RePair grammar of each.
+ */
+public class RunLive {
+  /** Uses one shared Scanner so input buffered ahead is not lost; returns normally at EOF. */
+  public static void main(String args[]) {
+    Scanner in = new Scanner(System.in);
+    while (in.hasNextLine()) {
+      RePairGrammar x = RePairFactory.buildGrammar(in.nextLine());
+      System.out.println(x.toGrammarRules());
+      System.out.println(x.toGrammarRulesData());
+    }
+  }
+}
diff --git a/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java b/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java
index 065aad0..09c5fdc 100644
--- a/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java
+++ b/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java
@@ -3,6 +3,9 @@
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.util.ArrayList;
+
+import net.seninp.gi.repair.parallel.ParallelGrammarKeeper;
+import net.seninp.gi.repair.parallel.ParallelRePairImplementation;
import net.seninp.jmotif.sax.NumerosityReductionStrategy;
import net.seninp.jmotif.sax.TSProcessor;
import net.seninp.jmotif.sax.datastructures.SAXRecord;
diff --git a/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java b/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java
index 3522bbd..0d2af82 100644
--- a/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java
+++ b/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java
@@ -1,12 +1,11 @@
package net.seninp.gi.sequitur;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
import net.seninp.gi.GrammarRules;
import net.seninp.util.StackTrace;
import org.junit.Test;
+import static org.junit.Assert.*;
+
public class TestSequiturPaperGrammars {
private static final String TEST1_STRING = "a b c d b c";
@@ -28,6 +27,8 @@ public void test3() {
SAXRule r = SequiturFactory.runSequitur(TEST3_STRING);
GrammarRules rules = r.toGrammarRulesData();
+ System.out.println(rules);
+
assertEquals("test hierarchy", 5, rules.size());
assertTrue("test r0", TEST3_R0.equals(rules.get(0).getRuleString().trim()));
@@ -45,7 +46,7 @@ public void test2() {
SAXRule r = SequiturFactory.runSequitur(TEST2_STRING);
GrammarRules rules = r.toGrammarRulesData();
- assertTrue("test r0", TEST2_R0.equals(rules.get(0).getRuleString().trim()));
+ assertEquals(TEST2_R0, (rules.get(0).getRuleString().trim()));
assertTrue("test r1", TEST2_R1.equals(rules.get(1).getRuleString().trim()));
assertTrue("test r1", TEST2_R2.equals(rules.get(2).getRuleString().trim()));
}