diff --git a/.gitignore b/.gitignore index ab54e98..68fcec4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ /bin /target .Rproj.user +.idea/ +*.iml + diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 0000000..21f6b75 --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,418 @@ + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..8c1e55e --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,8 @@ + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_goldmansachs_gs_collections_6_2_0.xml b/.idea/libraries/Maven__com_goldmansachs_gs_collections_6_2_0.xml new file mode 100644 index 0000000..ef054b6 --- /dev/null +++ b/.idea/libraries/Maven__com_goldmansachs_gs_collections_6_2_0.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__com_goldmansachs_gs_collections_api_6_2_0.xml b/.idea/libraries/Maven__com_goldmansachs_gs_collections_api_6_2_0.xml new file mode 100644 index 0000000..189b0a3 --- /dev/null +++ b/.idea/libraries/Maven__com_goldmansachs_gs_collections_api_6_2_0.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/libraries/Maven__joda_time_joda_time_2_8_1.xml b/.idea/libraries/Maven__joda_time_joda_time_2_8_1.xml new file mode 100644 index 0000000..91ce913 --- /dev/null +++ b/.idea/libraries/Maven__joda_time_joda_time_2_8_1.xml @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index 5646b37..7ec845e 100644 --- a/pom.xml +++ b/pom.xml @@ -55,8 +55,8 @@ maven-compiler-plugin 3.3 - 1.7 - 1.7 + 1.8 + 1.8 @@ -190,7 +190,7 @@ maven-assembly-plugin - 2.5.4 + 2.5.3 jar-with-dependencies @@ -219,25 +219,30 @@ + + com.goldmansachs + gs-collections + 6.2.0 + joda-time joda-time - 2.7 + 2.8.1 org.slf4j slf4j-api - 1.7.10 + 1.7.12 ch.qos.logback logback-classic - 1.1.2 + 1.1.3 ch.qos.logback logback-core - 1.1.2 + 1.1.3 junit diff --git a/src/main/java/net/seninp/gi/GrammarRuleRecord.java b/src/main/java/net/seninp/gi/GrammarRuleRecord.java index bc0616c..9ae44d5 100644 --- a/src/main/java/net/seninp/gi/GrammarRuleRecord.java +++ b/src/main/java/net/seninp/gi/GrammarRuleRecord.java @@ -1,5 +1,7 @@ package net.seninp.gi; +import com.gs.collections.impl.set.mutable.primitive.IntHashSet; + import java.util.ArrayList; import java.util.Arrays; @@ -22,7 +24,7 @@ public class GrammarRuleRecord { private String expandedRuleString; /* The indexes at which the rule occurs in the discretized time series. */ - private ArrayList timeSeriesOccurrenceIndexes = new ArrayList(); + private IntHashSet timeSeriesOccurrenceIndexes = new IntHashSet(); /* This rule intervals on the original time series. */ private ArrayList ruleIntervals; @@ -40,7 +42,7 @@ public class GrammarRuleRecord { private int maxLength; /* The rule mean length - i.e. mean value of all subsequences corresponding to the rule. */ - private Integer meanLength; + private int meanLength; /* The rule mean period - i.e. the mean length of intra-rule intervals. */ private double period; @@ -58,7 +60,7 @@ public int ruleNumber() { return ruleNumber; } - public Integer getMeanLength() { + public int getMeanLength() { return meanLength; } @@ -123,19 +125,21 @@ public void setExpandedRuleString(String expandedRuleString) { } public String occurrencesToString() { - return Arrays.toString(this.timeSeriesOccurrenceIndexes - .toArray(new Integer[this.timeSeriesOccurrenceIndexes.size()])); +// return Arrays.toString(this.timeSeriesOccurrenceIndexes +// .toArray(new Integer[this.timeSeriesOccurrenceIndexes.size()])); + return timeSeriesOccurrenceIndexes.toString(); } - public ArrayList getOccurrences() { + public IntHashSet getOccurrences() { return this.timeSeriesOccurrenceIndexes; } - public void setOccurrences(int[] indexes) { - this.timeSeriesOccurrenceIndexes = new ArrayList(); - for (Integer idx : indexes) { - this.timeSeriesOccurrenceIndexes.add(idx); - } + public void setOccurrences(IntHashSet indexes) { + this.timeSeriesOccurrenceIndexes = indexes; +// this.timeSeriesOccurrenceIndexes = new ArrayList(); +// for (Integer idx : indexes.) { +// this.timeSeriesOccurrenceIndexes.add(idx); +// } } public double getPeriod() { diff --git a/src/main/java/net/seninp/gi/GrammarRules.java b/src/main/java/net/seninp/gi/GrammarRules.java index c081b32..29db2e3 100644 --- a/src/main/java/net/seninp/gi/GrammarRules.java +++ b/src/main/java/net/seninp/gi/GrammarRules.java @@ -1,16 +1,24 @@ package net.seninp.gi; +import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap; + import java.util.Iterator; -import java.util.SortedMap; -import java.util.TreeMap; +import java.util.stream.Collectors; public class GrammarRules implements Iterable { - private SortedMap rules; + //private SortedMap rules; + + final IntObjectHashMap rules = new IntObjectHashMap(); public GrammarRules() { super(); - this.rules = new TreeMap(); + //this.rules = new TreeMap(); + } + + @Override + public String toString() { + return rules.values().stream().map(x -> x.toString()).collect(Collectors.joining(", ")).toString(); } public void addRule(GrammarRuleRecord arrRule) { diff --git a/src/main/java/net/seninp/gi/Interval.java b/src/main/java/net/seninp/gi/Interval.java index 4f3f29e..98d1b1a 100644 --- a/src/main/java/net/seninp/gi/Interval.java +++ b/src/main/java/net/seninp/gi/Interval.java @@ -8,9 +8,9 @@ */ public class Interval { - private int start; - private int end; - private double coverage; + final private int start; + final private int end; + final private double coverage; /** * Constructor; start inclusive, end exclusive. @@ -29,21 +29,22 @@ public double getCoverage() { return coverage; } - public void setCoverage(double coverage) { + /*public void setCoverage(double coverage) { this.coverage = coverage; } public void setStart(int start) { this.start = start; } + public void setEnd(int end) { + this.end = end; + } +*/ public int getStart() { return this.start; } - public void setEnd(int end) { - this.end = end; - } public int getEnd() { return this.end; diff --git a/src/main/java/net/seninp/gi/repair/DigramFrequencies.java b/src/main/java/net/seninp/gi/repair/DigramFrequencies.java index e20c160..993de4a 100644 --- a/src/main/java/net/seninp/gi/repair/DigramFrequencies.java +++ b/src/main/java/net/seninp/gi/repair/DigramFrequencies.java @@ -1,10 +1,11 @@ package net.seninp.gi.repair; -import java.util.ArrayList; -import java.util.Collections; +import com.gs.collections.impl.list.mutable.FastList; +import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap; + import java.util.HashMap; -import java.util.SortedMap; -import java.util.TreeMap; +import java.util.LinkedHashMap; +import java.util.List; /** * Implements the digram frequency queue. @@ -15,18 +16,18 @@ public class DigramFrequencies { /** A map of strings to digram frequencies. */ - private HashMap digramsToEntries; + private final LinkedHashMap digramsToEntries; /** A map of buckets, each bucket is the frequency number pointing on the collection of entries. */ - private SortedMap> bucketsToEntries; + private final IntObjectHashMap> bucketsToEntries; /** * Constructor. Inits data structures. */ public DigramFrequencies() { super(); - digramsToEntries = new HashMap(); - bucketsToEntries = new TreeMap>(); + digramsToEntries = new LinkedHashMap(); + bucketsToEntries = new IntObjectHashMap(); } /** @@ -37,9 +38,9 @@ public DigramFrequencies() { public void put(DigramFrequencyEntry digramFrequencyEntry) { this.digramsToEntries.put(digramFrequencyEntry.getDigram(), digramFrequencyEntry); Integer freq = digramFrequencyEntry.getFrequency(); - ArrayList bucket = this.bucketsToEntries.get(freq); + List bucket = this.bucketsToEntries.get(freq); if (null == bucket) { - bucket = new ArrayList(); + bucket = new FastList(); this.bucketsToEntries.put(freq, bucket); } bucket.add(digramFrequencyEntry); @@ -64,20 +65,20 @@ public DigramFrequencyEntry get(String string) { public void incrementFrequency(DigramFrequencyEntry entry, int increment) { // findout the old bucket and remove this entry - ArrayList oldBucket = this.bucketsToEntries.get(entry.getFrequency()); + List oldBucket = this.bucketsToEntries.get(entry.getFrequency()); oldBucket.remove(entry); if (oldBucket.isEmpty()) { this.bucketsToEntries.remove(entry.getFrequency()); } // get the increment added - int newFreq = entry.getFrequency() + increment; - entry.setFrequency(newFreq); + + int newFreq = entry.add(increment); // put into the new bucket - ArrayList bucket = this.bucketsToEntries.get(newFreq); + List bucket = this.bucketsToEntries.get(newFreq); if (null == bucket) { - bucket = new ArrayList(); + bucket = new FastList(1); this.bucketsToEntries.put(newFreq, bucket); } bucket.add(entry); @@ -97,7 +98,8 @@ public DigramFrequencyEntry getTop() { } else { // by the default there are no empty buckets - Integer maxBucket = Collections.max(bucketsToEntries.keySet()); + int maxBucket = bucketsToEntries.keysView().max(); + //Integer maxBucket = Collections.max(bucketsToEntries.keySet()); return bucketsToEntries.get(maxBucket).get(0); } } @@ -116,7 +118,7 @@ public void remove(String digramStr) { else { // get its frequency and the corresponding bucket int freq = entry.getFrequency(); - ArrayList bucket = this.bucketsToEntries.get(freq); + List bucket = this.bucketsToEntries.get(freq); if (!bucket.remove(entry)) { throw (new RuntimeException("There was an error!")); } diff --git a/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java b/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java index a635199..be3252f 100644 --- a/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java +++ b/src/main/java/net/seninp/gi/repair/DigramFrequencyEntry.java @@ -9,7 +9,7 @@ public class DigramFrequencyEntry { /** The payload - the digram string itself. */ - private String digram; + private final String digram; /** The observed frequency. */ private int frequency; @@ -45,9 +45,9 @@ public String getDigram() { * * @param digram the string. */ - public void setDigram(String digram) { - this.digram = digram; - } +// public void setDigram(String digram) { +// this.digram = digram; +// } /** * Frequency getter. @@ -63,9 +63,9 @@ public int getFrequency() { * * @param frequency the new frequency value. */ - public void setFrequency(int frequency) { - this.frequency = frequency; - } +// public void setFrequency(int frequency) { +// this.frequency = frequency; +// } /** * Get the first occurrence. @@ -78,7 +78,7 @@ public int getFirstOccurrence() { /** * Set the first occurrence. - * + * * @param firstOccurrence the new value. */ public void setFirstOccurrence(int firstOccurrence) { @@ -91,7 +91,7 @@ public int hashCode() { int result = 1; result = prime * result + ((digram == null) ? 0 : digram.hashCode()); result = prime * result + firstOccurrence; - result = prime * result + frequency; + //result = prime * result + frequency; return result; } @@ -99,25 +99,31 @@ public int hashCode() { public boolean equals(Object obj) { if (this == obj) return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) + + if (!(obj instanceof DigramFrequencyEntry)) return false; DigramFrequencyEntry other = (DigramFrequencyEntry) obj; + + if (firstOccurrence != other.firstOccurrence) + return false; if (digram == null) { if (other.digram != null) return false; } else if (!digram.equals(other.digram)) return false; - if (firstOccurrence != other.firstOccurrence) - return false; - if (frequency != other.frequency) - return false; + + /*if (frequency != other.frequency) + return false;*/ return true; } public String toString() { return this.digram + " " + this.frequency; } + + public int add(int increment) { + frequency += increment; + return frequency; + } } diff --git a/src/main/java/net/seninp/gi/repair/ParallelGrammarKeeper.java b/src/main/java/net/seninp/gi/repair/ParallelGrammarKeeper.java deleted file mode 100644 index 9c16d3e..0000000 --- a/src/main/java/net/seninp/gi/repair/ParallelGrammarKeeper.java +++ /dev/null @@ -1,181 +0,0 @@ -package net.seninp.gi.repair; - -import java.util.ArrayList; -import java.util.Collections; -import java.util.Hashtable; -import java.util.concurrent.atomic.AtomicInteger; - -/** - * This implements a handler for the Re-Pair grammar built in parallel. This data structure is - * responsible for enumerating rules and for tracking changes in the R0 of the grammar. - * - * @author psenin - * - */ -public class ParallelGrammarKeeper { - - private static final char SPACE = ' '; - private static final char THE_R = 'R'; - - // rule 0 gets a separate treatment, so we start from 1 - // - protected AtomicInteger numRules = new AtomicInteger(1); - - // the rules table - protected Hashtable theRules = new Hashtable(); - - // the grammar id - private long id; - - // R0 strings - // - protected String r0String; - protected String r0ExpandedString; - - // keeps a working string of this grammar - // - protected ArrayList workString; - - /** - * Constructor. - * - * @param id The handler id. - */ - public ParallelGrammarKeeper(long id) { - super(); - this.id = id; - } - - /** - * The id is used to keep track of parallel chunks. - * - * @return the current ID. - */ - public long getId() { - return this.id; - } - - /** - * This is used in parallel. - * - * @param string the string we work with in parallel. - */ - public void setWorkString(ArrayList string) { - this.workString = string; - } - - /** - * Set the R0 string. - * - * @param string the R0 string value. - */ - public void setR0String(String string) { - this.r0String = string; - } - - /** - * Get the expanded R0 out. - * - * @return the expanded R0. - */ - public String getR0ExpandedString() { - return this.r0ExpandedString; - } - - /** - * This adds an existing rule to this grammar. Useful in merging. - * - * @param r The rule. It is not yet clear how to treat rules, be careful. This will not set the - * rule number, but it will increment the internal rule counter. - */ - public void addExistingRule(ParallelRePairRule r) { - r.grammarHandler = this; - if (this.theRules.containsKey(r.ruleNumber)) { - // we do override an existing rule - theRules.put(r.ruleNumber, r); - } - else { - // plus 1 because the rule 0 has a special treatment - theRules.put(r.ruleNumber, r); - numRules.set(theRules.size() + 1); - } - } - - /** - * Expands all rules EXCEPT R0. - */ - public void expandRules() { - // iterate over all SAX containers - ArrayList keys = new ArrayList(theRules.keySet()); - Collections.sort(keys); - for (Integer key : keys) { - ParallelRePairRule rr = theRules.get(key); - String resultString = rr.toRuleString(); - - int currentSearchStart = resultString.indexOf(THE_R); - while (currentSearchStart >= 0) { - int spaceIdx = resultString.indexOf(" ", currentSearchStart); - // if (spaceIdx < 0) { - // System.out.println("gotcha!"); - // } - String ruleName = resultString.substring(currentSearchStart, spaceIdx + 1); - Integer ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1)); - - ParallelRePairRule rule = theRules.get(ruleId); - if (rule != null) { - if (rule.expandedRuleString.charAt(rule.expandedRuleString.length() - 1) == ' ') { - resultString = resultString.replaceAll(ruleName, rule.expandedRuleString); - } - else { - resultString = resultString.replaceAll(ruleName, rule.expandedRuleString + SPACE); - } - } - - currentSearchStart = resultString.indexOf(THE_R, spaceIdx); - } - - rr.setExpandedRule(resultString.trim()); - - } - } - - /** - * Expands R0 specifically. - */ - public void expandR0() { - // string is immutable it will get copied - String finalString = this.r0String; - int currentSearchStart = finalString.indexOf(THE_R); - while (currentSearchStart >= 0) { - - int spaceIdx = finalString.indexOf(" ", currentSearchStart + 1); - - String ruleName = finalString.substring(currentSearchStart, spaceIdx + 1); - Integer ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1)); - - ParallelRePairRule rr = theRules.get(ruleId); - if (null == rr.expandedRuleString) { - finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).toRuleString()); - } - else { - finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).expandedRuleString - + SPACE); - } - - currentSearchStart = finalString.indexOf(THE_R); - } - this.r0ExpandedString = finalString; - } - - public String toGrammarString() { - StringBuffer sb = new StringBuffer(); - System.out.println("R0 -> " + r0String); - for (int i = 1; i < theRules.size(); i++) { - ParallelRePairRule r = theRules.get(i); - sb.append("R").append(r.ruleNumber).append(" -> ").append(r.toRuleString()).append(" : ") - .append(r.expandedRuleString).append(", ").append(r.positions).append("\n"); - } - return sb.toString(); - } - -} diff --git a/src/main/java/net/seninp/gi/repair/RePairFactory.java b/src/main/java/net/seninp/gi/repair/RePairFactory.java index ddc692f..17b525b 100644 --- a/src/main/java/net/seninp/gi/repair/RePairFactory.java +++ b/src/main/java/net/seninp/gi/repair/RePairFactory.java @@ -1,337 +1,358 @@ package net.seninp.gi.repair; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import net.seninp.jmotif.sax.datastructures.SAXRecord; +import net.seninp.jmotif.sax.datastructures.SAXRecords; +import org.slf4j.LoggerFactory; + import java.util.ArrayList; import java.util.Arrays; import java.util.Map.Entry; import java.util.Set; import java.util.StringTokenizer; -import net.seninp.jmotif.sax.datastructures.SAXRecord; -import net.seninp.jmotif.sax.datastructures.SAXRecords; -import org.slf4j.LoggerFactory; -import ch.qos.logback.classic.Level; -import ch.qos.logback.classic.Logger; /** * Implements RePair. - * + * * @author psenin - * */ public final class RePairFactory { - private static final char SPACE = ' '; - - // logging stuff - // - private static Logger consoleLogger; - private static Level LOGGING_LEVEL = Level.WARN; - static { - consoleLogger = (Logger) LoggerFactory.getLogger(RePairFactory.class); - consoleLogger.setLevel(LOGGING_LEVEL); - } - - /** - * Disable constructor. - */ - private RePairFactory() { - assert true; - } - - /** - * Builds a repair grammar given a set of SAX records. - * - * @param saxRecords the records to process. - * - * @return the grammar. - */ - public static RePairGrammar buildGrammar(SAXRecords saxRecords) { - - consoleLogger.debug("Starting RePair with an input string of " + saxRecords.getIndexes().size() - + " words."); - - RePairGrammar rg = new RePairGrammar(); - - // get all indexes and sort them - Set index = saxRecords.getIndexes(); - Integer[] sortedIndexes = index.toArray(new Integer[index.size()]); - Arrays.sort(sortedIndexes); - - // two data structures - // - // 1.0. - the string - ArrayList string = new ArrayList(); - // LinkedList string = new LinkedList(); + private static final char SPACE = ' '; + // logging stuff // - // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index - DigramFrequencies digramFrequencies = new DigramFrequencies(); - - // build data structures - int stringPositionCounter = 0; - for (Integer saxWordPosition : sortedIndexes) { - // i is the index of a symbol in the input discretized string - // counter is the index in the grammar rule R0 string - SAXRecord r = saxRecords.getByIndex(saxWordPosition); - RePairSymbol symbol = new RePairSymbol(r, stringPositionCounter); - // put it into the string - string.add(symbol); - // and into the index - // take care about digram frequencies - if (stringPositionCounter > 0) { - - StringBuffer digramStr = new StringBuffer(); - digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE) - .append(string.get(stringPositionCounter).toString()); - - DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString()); - if (null == entry) { - digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1, - stringPositionCounter - 1)); - } - else { - digramFrequencies.incrementFrequency(entry, 1); - } - } - // go on - stringPositionCounter++; + private static Logger consoleLogger; + private static Level LOGGING_LEVEL = Level.DEBUG; + + static { + consoleLogger = (Logger) LoggerFactory.getLogger(RePairFactory.class); + consoleLogger.setLevel(LOGGING_LEVEL); } - consoleLogger.debug("String length " + string.size() + " unique digrams " - + digramFrequencies.size()); + /** + * Disable constructor. + */ + private RePairFactory() { + assert true; + } - DigramFrequencyEntry entry; - while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() >= 2) { + /** + * Builds a repair grammar given a set of SAX records. + * + * @param saxRecords the records to process. + * @return the grammar. + */ + public static RePairGrammar buildGrammar(SAXRecords saxRecords) { + + consoleLogger.debug("Starting RePair with an input string of " + saxRecords.getIndexes().size() + + " words."); + + RePairGrammar rg = new RePairGrammar(); + + // get all indexes and sort them + Set index = saxRecords.getIndexes(); + Integer[] sortedIndexes = index.toArray(new Integer[index.size()]); + Arrays.sort(sortedIndexes); + + // two data structures + // + // 1.0. - the string + ArrayList string = new ArrayList(); + // LinkedList string = new LinkedList(); + + // + // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index + DigramFrequencies digramFrequencies = new DigramFrequencies(); + + // build data structures + int stringPositionCounter = 0; + for (Integer saxWordPosition : sortedIndexes) { + // i is the index of a symbol in the input discretized string + // counter is the index in the grammar rule R0 string + SAXRecord r = saxRecords.getByIndex(saxWordPosition); + RePairSymbol symbol = new RePairSymbol(r, stringPositionCounter); + // put it into the string + string.add(symbol); + // and into the index + // take care about digram frequencies + if (stringPositionCounter > 0) { + + StringBuffer digramStr = new StringBuffer(); + digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE) + .append(string.get(stringPositionCounter).toString()); + + DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString()); + if (null == entry) { + digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1, + stringPositionCounter - 1)); + } else { + digramFrequencies.incrementFrequency(entry, 1); + } + } + // go on + stringPositionCounter++; + } - // take the most frequent rule - // - // Entry entry = entries.get(0); - // DigramFrequencyEntry entry = digramFrequencies.getTop(); + consoleLogger.debug("String length " + string.size() + " unique digrams " + + digramFrequencies.size()); + DigramFrequencyEntry entry; + while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() >= 2) { + + // take the most frequent rule + // + // Entry entry = entries.get(0); + // DigramFrequencyEntry entry = digramFrequencies.getTop(); + + /* consoleLogger.info("re-pair iteration, digram \"" + entry.getDigram() + "\", frequency: " + entry.getFrequency()); + consoleLogger.debug("Going to substitute the digram " + entry.getDigram() + " first occurring at position " + entry.getFirstOccurrence() + " with frequency " + entry.getFrequency() + ", '" + string.get(entry.getFirstOccurrence()) + SPACE + string.get(entry.getFirstOccurrence() + 1) + "'"); - - // create new rule - // - RePairRule r = new RePairRule(rg); - r.setFirst(string.get(entry.getFirstOccurrence())); - r.setSecond(string.get(entry.getFirstOccurrence() + 1)); - r.assignLevel(); - - // substitute each digram entry with a rule - // - String digramToSubstitute = entry.getDigram(); - int currentIndex = entry.getFirstOccurrence(); - while (currentIndex < string.size() - 1) { - - StringBuffer currentDigram = new StringBuffer(); - currentDigram.append(string.get(currentIndex).toString()).append(SPACE) - .append(string.get(currentIndex + 1).toString()); - - if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) { + */ + + // create new rule + // + RePairRule r = new RePairRule(rg, + string.get(entry.getFirstOccurrence()), + string.get(entry.getFirstOccurrence() + 1) + ); + r.assignLevel(); + + // substitute each digram entry with a rule + // + String digramToSubstitute = entry.getDigram(); + int currentIndex = entry.getFirstOccurrence(); + while (currentIndex < string.size() - 1) { + + StringBuffer currentDigram = new StringBuffer(); + currentDigram.append(string.get(currentIndex).toString()).append(SPACE) + .append(string.get(currentIndex + 1).toString()); + + if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) { + /* consoleLogger.debug(" next digram occurrence is at " + currentIndex + ", '" + string.get(currentIndex) + SPACE + string.get(currentIndex + 1) + "'"); - - // correct entries at left and right - if (currentIndex > 0) { - // taking care about immediate neighbor - removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies); - } - if (currentIndex < string.size() - 2) { - removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies); - } - - // create the new guard to insert - RePairGuard g = new RePairGuard(r); - g.setStringPosition(string.get(currentIndex).getStringPosition()); - r.addOccurrence(string.get(currentIndex).getStringPosition()); - substituteDigramAt(rg, currentIndex, g, string, digramFrequencies); - - } - currentIndex++; - } - - // // sort the entries of digram table by the size of indexes - // entries = new ArrayList>(); - // entries.addAll(digramFrequencies.entrySet()); - // Collections.sort(entries, new Comparator>() { - // @Override - // public int compare(Entry o1, Entry o2) { - // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0])); - // } - // }); - + */ + + // correct entries at left and right + if (currentIndex > 0) { + // taking care about immediate neighbor + removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies); + } + if (currentIndex < string.size() - 2) { + removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies); + } + + // create the new guard to insert + RePairGuard g = new RePairGuard(r); + g.setStringPosition(string.get(currentIndex).getStringPosition()); + r.addOccurrence(string.get(currentIndex).getStringPosition()); + substituteDigramAt(rg, currentIndex, g, string, digramFrequencies); + + } + currentIndex++; + } + + // // sort the entries of digram table by the size of indexes + // entries = new ArrayList>(); + // entries.addAll(digramFrequencies.entrySet()); + // Collections.sort(entries, new Comparator>() { + // @Override + // public int compare(Entry o1, Entry o2) { + // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0])); + // } + // }); + +/* consoleLogger.debug("*** iteration finished, top count " + digramFrequencies.getTop().getFrequency()); - } - - rg.setR0String(stringToDisplay(string)); - - return rg; - } - - /** - * Builds a grammar given a string of terminals delimeted by space. - * - * @param inputString the input string. - * @return the RePair grammar. - */ - public static RePairGrammar buildGrammar(String inputString) { - - // consoleLogger.debug("Starting RePair with an input string of " + - // saxRecords.getIndexes().size() - // + " words."); - - RePairGrammar rg = new RePairGrammar(); - - // two data structures - // - // 1.0. - the string - ArrayList string = new ArrayList(); - // LinkedList string = new LinkedList(); - - // - // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index - DigramFrequencies digramFrequencies = new DigramFrequencies(); - - // build data structures - // tokenize the input string - // - StringTokenizer st = new StringTokenizer(inputString, " "); - - int stringPositionCounter = 0; - - // while there are tokens - while (st.hasMoreTokens()) { - - String token = st.nextToken(); - - RePairSymbol symbol = new RePairSymbol(token, stringPositionCounter); - // put it into the string - string.add(symbol); - // and into the index - // take care about digram frequencies - if (stringPositionCounter > 0) { + */ - StringBuffer digramStr = new StringBuffer(); - digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE) - .append(string.get(stringPositionCounter).toString()); - - DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString()); - if (null == entry) { - digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1, - stringPositionCounter - 1)); - } - else { - digramFrequencies.incrementFrequency(entry, 1); } - } - // go on - stringPositionCounter++; - } - - consoleLogger.debug("String length " + string.size() + " unique digrams " - + digramFrequencies.size()); - - DigramFrequencyEntry entry; - while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() > 1) { - - // take the most frequent rule - // - // Entry entry = entries.get(0); - // DigramFrequencyEntry entry = digramFrequencies.getTop(); - consoleLogger.info("re-pair iteration, digram \"" + entry.getDigram() + "\", frequency: " - + entry.getFrequency()); - - consoleLogger.debug("Going to substitute the digram " + entry.getDigram() - + " first occurring at position " + entry.getFirstOccurrence() + " with frequency " - + entry.getFrequency() + ", '" + string.get(entry.getFirstOccurrence()) + SPACE - + string.get(entry.getFirstOccurrence() + 1) + "'"); + rg.setR0String(stringToDisplay(string)); - // create new rule - // - RePairRule r = new RePairRule(rg); - r.setFirst(string.get(entry.getFirstOccurrence())); - r.setSecond(string.get(entry.getFirstOccurrence() + 1)); - r.assignLevel(); + return rg; + } - // substitute each digram entry with a rule - // - String digramToSubstitute = entry.getDigram(); - int currentIndex = entry.getFirstOccurrence(); - while (currentIndex < string.size() - 1) { + /** + * Builds a grammar given a string of terminals delimeted by space. + * + * @param inputString the input string. + * @return the RePair grammar. + */ + public static RePairGrammar buildGrammar(String inputString) { + + // consoleLogger.debug("Starting RePair with an input string of " + + // saxRecords.getIndexes().size() + // + " words."); + + RePairGrammar rg = new RePairGrammar(); + + // two data structures + // + // 1.0. - the string + ArrayList string = new ArrayList(); + // LinkedList string = new LinkedList(); + + // + // 2.0. - the digram frequency table, digram, frequency, and the first occurrence index + DigramFrequencies digramFrequencies = new DigramFrequencies(); + + // build data structures + // tokenize the input string + // + StringTokenizer st = new StringTokenizer(inputString, " "); + + int stringPositionCounter = 0; + + // while there are tokens + while (st.hasMoreTokens()) { + + String token = st.nextToken(); + + RePairSymbol symbol = new RePairSymbol(token, stringPositionCounter); + // put it into the string + string.add(symbol); + // and into the index + // take care about digram frequencies + if (stringPositionCounter > 0) { + + StringBuffer digramStr = new StringBuffer(); + digramStr.append(string.get(stringPositionCounter - 1).toString()).append(SPACE) + .append(string.get(stringPositionCounter).toString()); + + DigramFrequencyEntry entry = digramFrequencies.get(digramStr.toString()); + if (null == entry) { + digramFrequencies.put(new DigramFrequencyEntry(digramStr.toString(), 1, + stringPositionCounter - 1)); + } else { + digramFrequencies.incrementFrequency(entry, 1); + } + } + // go on + stringPositionCounter++; + } - StringBuffer currentDigram = new StringBuffer(); - currentDigram.append(string.get(currentIndex).toString()).append(SPACE) - .append(string.get(currentIndex + 1).toString()); + consoleLogger.debug("String length " + string.size() + " unique digrams " + + digramFrequencies.size()); + + DigramFrequencyEntry entry; + while ((entry = digramFrequencies.getTop()) != null && entry.getFrequency() > 1) { + + // take the most frequent rule + // + // Entry entry = entries.get(0); + // DigramFrequencyEntry entry = digramFrequencies.getTop(); + + /* + consoleLogger.info("re-pair iteration, digram \"" + entry.getDigram() + "\", frequency: " + + entry.getFrequency()); + + consoleLogger.debug("Going to substitute the digram " + entry.getDigram() + + " first occurring at position " + entry.getFirstOccurrence() + " with frequency " + + entry.getFrequency() + ", '" + string.get(entry.getFirstOccurrence()) + SPACE + + string.get(entry.getFirstOccurrence() + 1) + "'"); + */ + + // create new rule + // + RePairRule r = new RePairRule(rg, string.get(entry.getFirstOccurrence()), string.get(entry.getFirstOccurrence() + 1)); + r.setFirst(string.get(entry.getFirstOccurrence())); + r.setSecond(string.get(entry.getFirstOccurrence() + 1)); + r.assignLevel(); + + // substitute each digram entry with a rule + // + String digramToSubstitute = entry.getDigram(); + int currentIndex = entry.getFirstOccurrence(); + + StringBuffer currentDigram = new StringBuffer(); + while (currentIndex < string.size() - 1) { + + + currentDigram.setLength(0); + + currentDigram.append(string.get(currentIndex)).append(SPACE) + .append(string.get(currentIndex + 1)); + + //if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) { + if (digramToSubstitute.equals(currentDigram.toString())) { + /*consoleLogger.debug(" next digram occurrence is at " + currentIndex + ", '" + + string.get(currentIndex) + SPACE + string.get(currentIndex + 1) + "'");*/ + + // correct entries at left and right + if (currentIndex > 0) { + // taking care about immediate neighbor + removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies); + } + if (currentIndex < string.size() - 2) { + removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies); + } + + // create the new guard to insert + RePairGuard g = new RePairGuard(r); + g.setStringPosition(string.get(currentIndex).getStringPosition()); + r.addOccurrence(string.get(currentIndex).getStringPosition()); + substituteDigramAt(rg, currentIndex, g, string, digramFrequencies); + + } + currentIndex++; + } + + // // sort the entries of digram table by the size of indexes + // entries = new ArrayList>(); + // entries.addAll(digramFrequencies.entrySet()); + // Collections.sort(entries, new Comparator>() { + // @Override + // public int compare(Entry o1, Entry o2) { + // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0])); + // } + // }); + + consoleLogger.debug("*** iteration finished, top count " + + digramFrequencies.getTop().getFrequency()); + } - if (digramToSubstitute.equalsIgnoreCase(currentDigram.toString())) { - consoleLogger.debug(" next digram occurrence is at " + currentIndex + ", '" - + string.get(currentIndex) + SPACE + string.get(currentIndex + 1) + "'"); + rg.setR0String(stringToDisplay(string)); - // correct entries at left and right - if (currentIndex > 0) { - // taking care about immediate neighbor - removeDigramFrequencyEntry(currentIndex - 1, string, digramFrequencies); - } - if (currentIndex < string.size() - 2) { - removeDigramFrequencyEntry(currentIndex + 1, string, digramFrequencies); - } - - // create the new guard to insert - RePairGuard g = new RePairGuard(r); - g.setStringPosition(string.get(currentIndex).getStringPosition()); - r.addOccurrence(string.get(currentIndex).getStringPosition()); - substituteDigramAt(rg, currentIndex, g, string, digramFrequencies); + rg.expandRules(); - } - currentIndex++; - } - - // // sort the entries of digram table by the size of indexes - // entries = new ArrayList>(); - // entries.addAll(digramFrequencies.entrySet()); - // Collections.sort(entries, new Comparator>() { - // @Override - // public int compare(Entry o1, Entry o2) { - // return -Integer.valueOf(o1.getValue()[0]).compareTo(Integer.valueOf(o2.getValue()[0])); - // } - // }); + return rg; - consoleLogger.debug("*** iteration finished, top count " - + digramFrequencies.getTop().getFrequency()); } - rg.setR0String(stringToDisplay(string)); + /** + * Substitute the digram by a rule. + * + * @param currentIndex + * @param g + * @param string + * @param digramFrequencies + */ + private static void substituteDigramAt(RePairGrammar rg, int currentIndex, RePairGuard g, + ArrayList string, DigramFrequencies digramFrequencies) { - rg.expandRules(); + // create entry for two new digram + // - return rg; + final RePairSymbol digramL = string.get(currentIndex); + final RePairSymbol digramR = string.get(currentIndex + 1); + final char[] digramLkey = digramL.key(); + final char[] digramRkey = digramR.key(); + final StringBuffer digram = new StringBuffer(digramLkey.length+1+digramRkey.length); + digram.append(digramL).append(SPACE).append(digramR); - } - /** - * Substitute the digram by a rule. - * - * @param currentIndex - * @param g - * @param string - * @param digramFrequencies - */ - private static void substituteDigramAt(RePairGrammar rg, Integer currentIndex, RePairGuard g, - ArrayList string, DigramFrequencies digramFrequencies) { - - // create entry for two new digram - // - StringBuffer digram = new StringBuffer(); - digram.append(string.get(currentIndex).toString()).append(SPACE) - .append(string.get(currentIndex + 1)); + /* consoleLogger.debug(" substituting the digram " + digram + " at " + currentIndex + " with " + g.toString()); @@ -341,136 +362,150 @@ private static void substituteDigramAt(RePairGrammar rg, Integer currentIndex, R if (currentIndex < string.size() - 2) { consoleLogger.debug(" next " + string.get(currentIndex + 2).toString()); } + */ + + // update the new left digram frequency + // + if (currentIndex > 0) { + StringBuffer newDigram = new StringBuffer(); + newDigram.append(string.get(currentIndex - 1).toString()).append(SPACE).append(g.toString()); + + //consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString()); + + final String nds = newDigram.toString(); + DigramFrequencyEntry entry = digramFrequencies.get(nds); + if (null == entry) { + digramFrequencies.put(new DigramFrequencyEntry(nds, 1, currentIndex - 1)); + } else { + digramFrequencies.incrementFrequency(entry, 1); + if (currentIndex - 1 < entry.getFirstOccurrence()) { + entry.setFirstOccurrence(currentIndex - 1); + } + } + } - // update the new left digram frequency - // - if (currentIndex > 0) { - StringBuffer newDigram = new StringBuffer(); - newDigram.append(string.get(currentIndex - 1).toString()).append(SPACE).append(g.toString()); - consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString()); - DigramFrequencyEntry entry = digramFrequencies.get(newDigram.toString()); - if (null == entry) { - digramFrequencies.put(new DigramFrequencyEntry(newDigram.toString(), 1, currentIndex - 1)); - } - else { - digramFrequencies.incrementFrequency(entry, 1); - if (currentIndex - 1 < entry.getFirstOccurrence()) { - entry.setFirstOccurrence(currentIndex - 1); + // update the new right digram frequency + // + if (currentIndex < string.size() - 2) { + StringBuffer newDigram = new StringBuffer(); + newDigram.append(g.toString()).append(SPACE).append(string.get(currentIndex + 2)); + + + //consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString()); + final String nds = newDigram.toString(); + DigramFrequencyEntry entry = digramFrequencies.get(nds); + if (null == entry) { + digramFrequencies.put(new DigramFrequencyEntry(nds, 1, currentIndex)); + } else { + digramFrequencies.incrementFrequency(entry, 1); + if (currentIndex + 1 < entry.getFirstOccurrence()) { + entry.setFirstOccurrence(currentIndex); + } + } } - } - } - // update the new right digram frequency - // - if (currentIndex < string.size() - 2) { - StringBuffer newDigram = new StringBuffer(); - newDigram.append(g.toString()).append(SPACE).append(string.get(currentIndex + 2)); - consoleLogger.debug(" updating the frequency entry for digram " + newDigram.toString()); - DigramFrequencyEntry entry = digramFrequencies.get(newDigram.toString()); - if (null == entry) { - digramFrequencies.put(new DigramFrequencyEntry(newDigram.toString(), 1, currentIndex)); - } - else { - digramFrequencies.incrementFrequency(entry, 1); - if (currentIndex + 1 < entry.getFirstOccurrence()) { - entry.setFirstOccurrence(currentIndex); + // remove and substitute + // + // 1. decrease to be substituted digram frequency + // + //consoleLogger.debug(" updating the frequency entry for digram " + digram.toString()); + final String ds = digram.toString(); + DigramFrequencyEntry entry = digramFrequencies.get(ds); + if (1 == entry.getFrequency()) { + //consoleLogger.debug(" removing the frequency entry"); + digramFrequencies.remove(ds); + } else { + /*consoleLogger.debug(" setting the frequency entry to " + + Integer.valueOf(entry.getFrequency() - 1));*/ + digramFrequencies.incrementFrequency(entry, -1); + if (currentIndex == entry.getFirstOccurrence()) { + //consoleLogger.debug(" this was an index entry, finding another digram index..."); + repairLRFreqMatch(currentIndex, string, digramLkey, digramRkey, entry); + } + } + // 2. substitute + string.set(currentIndex, g); + /*consoleLogger.debug(" deleting symbol " + string.get(currentIndex + 1).toString() + " at " + + Integer.valueOf(currentIndex + 1));*/ + // 3. delete + string.remove(currentIndex + 1); + + // need to take care about all the indexes + // as all the indexes above _currentIndex_ shall be shifted by -1 + // NO NEED for TLinkedList string = new TLinkedList(); + // HashMap digramFrequencies = new HashMap(); + // + // traverse the string to the right decreasing indexes + for (Entry e : digramFrequencies.getEntries().entrySet()) { + final DigramFrequencyEntry eval = e.getValue(); + int idx = eval.getFirstOccurrence(); + if (idx >= currentIndex + 2) { + // consoleLogger.debug(" shifting entry for " + e.getValue().getDigram() + " from " + // + e.getValue().getFirstOccurrence() + " to " + Integer.valueOf(idx - 1)); + eval.setFirstOccurrence(idx - 1); + } } - } - } - // remove and substitute - // - // 1. decrease to be substituted digram frequency - // - consoleLogger.debug(" updating the frequency entry for digram " + digram.toString()); - DigramFrequencyEntry entry = digramFrequencies.get(digram.toString()); - if (1 == entry.getFrequency()) { - consoleLogger.debug(" removing the frequency entry"); - digramFrequencies.remove(digram.toString()); } - else { - consoleLogger.debug(" setting the frequency entry to " - + Integer.valueOf(entry.getFrequency() - 1)); - digramFrequencies.incrementFrequency(entry, -1); - if (currentIndex == entry.getFirstOccurrence()) { - consoleLogger.debug(" this was an index entry, finding another digram index..."); + + private static void repairLRFreqMatch(int currentIndex, ArrayList string, char[] digramLkey, char[] digramRkey, DigramFrequencyEntry entry) { for (int i = currentIndex + 1; i < string.size() - 1; i++) { - StringBuffer cDigram = new StringBuffer(); - cDigram.append(string.get(i).toString()).append(SPACE) - .append(string.get(i + 1).toString()); - if (digram.toString().equals(cDigram.toString())) { - consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i); - entry.setFirstOccurrence(i); - break; - } - } - } - } - // 2. substitute - string.set(currentIndex, g); - consoleLogger.debug(" deleting symbol " + string.get(currentIndex + 1).toString() + " at " - + Integer.valueOf(currentIndex + 1)); - // 3. delete - string.remove(Integer.valueOf(currentIndex + 1).intValue()); - - // need to take care about all the indexes - // as all the indexes above _currentIndex_ shall be shifted by -1 - // NO NEED for TLinkedList string = new TLinkedList(); - // HashMap digramFrequencies = new HashMap(); - // - // traverse the string to the right decreasing indexes - for (Entry e : digramFrequencies.getEntries().entrySet()) { - int idx = e.getValue().getFirstOccurrence(); - if (idx >= currentIndex + 2) { - // consoleLogger.debug(" shifting entry for " + e.getValue().getDigram() + " from " - // + e.getValue().getFirstOccurrence() + " to " + Integer.valueOf(idx - 1)); - e.getValue().setFirstOccurrence(idx - 1); - } - } - } - private static void removeDigramFrequencyEntry(int index, ArrayList string, - DigramFrequencies digramFrequencies) { + if (Arrays.equals(digramLkey, string.get(i).key()) && + Arrays.equals(digramRkey, string.get(i + 1).key())) { - StringBuffer digramToRemove = new StringBuffer(); - digramToRemove.append(string.get(index).toString()).append(SPACE) - .append(string.get(index + 1).toString()); + //cDigram.setLength(0); + //cDigram.append(string.get(i)).append(SPACE).append(string.get(i + 1)); - DigramFrequencyEntry digramEntry = digramFrequencies.get(digramToRemove.toString()); + //consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i); + entry.setFirstOccurrence(i); + break; - if (digramEntry.getFrequency() == 1) { - digramFrequencies.remove(digramToRemove.toString()); - consoleLogger.debug(" completely removing the frequency entry for digram " - + digramToRemove.toString() + " at position " + index); - } - else { - consoleLogger.debug(" decreasing the frequency entry for digram " - + digramToRemove.toString() + " at position " + index + " from " - + digramEntry.getFrequency() + " to " + Integer.valueOf(digramEntry.getFrequency() - 1)); - digramFrequencies.incrementFrequency(digramEntry, -1); - if (index == digramEntry.getFirstOccurrence()) { - consoleLogger.debug(" this was an index entry, finding another digram index..."); - for (int i = index + 1; i < string.size() - 1; i++) { - StringBuffer cDigram = new StringBuffer(); - cDigram.append(string.get(i).toString()).append(SPACE) - .append(string.get(i + 1).toString()); - if (digramToRemove.toString().equals(cDigram.toString())) { - consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i); - digramEntry.setFirstOccurrence(i); - break; - } + } } - } } - } + private static void removeDigramFrequencyEntry(int index, ArrayList string, + DigramFrequencies digramFrequencies) { + + StringBuffer digramToRemove = new StringBuffer(); + digramToRemove.append(string.get(index)).append(SPACE) + .append(string.get(index + 1)); + + DigramFrequencyEntry digramEntry = digramFrequencies.get(digramToRemove.toString()); + + if (digramEntry.getFrequency() == 1) { + digramFrequencies.remove(digramToRemove.toString()); + /*consoleLogger.debug(" completely removing the frequency entry for digram " + + digramToRemove.toString() + " at position " + index);*/ + } else { + /*consoleLogger.debug(" decreasing the frequency entry for digram " + + digramToRemove.toString() + " at position " + index + " from " + + digramEntry.getFrequency() + " to " + Integer.valueOf(digramEntry.getFrequency() - 1));*/ + digramFrequencies.incrementFrequency(digramEntry, -1); + if (index == digramEntry.getFirstOccurrence()) { + //consoleLogger.debug(" this was an index entry, finding another digram index..."); + for (int i = index + 1; i < string.size() - 1; i++) { + StringBuffer cDigram = new StringBuffer(); + cDigram.append(string.get(i).toString()).append(SPACE) + .append(string.get(i + 1).toString()); + if (digramToRemove.toString().equals(cDigram.toString())) { + //consoleLogger.debug(" for digram " + cDigram.toString() + " new index " + i); + digramEntry.setFirstOccurrence(i); + break; + } + } + } + } - private static String stringToDisplay(ArrayList string) { - StringBuffer sb = new StringBuffer(); - for (int i = 0; i < string.size(); i++) { - sb.append(string.get(i).toString()).append(SPACE); } - return sb.toString(); - } + + private static String stringToDisplay(ArrayList string) { + StringBuffer sb = new StringBuffer(); + for (int i = 0; i < string.size(); i++) { + sb.append(string.get(i).toString()).append(SPACE); + } + return sb.toString(); + } } diff --git a/src/main/java/net/seninp/gi/repair/RePairGrammar.java b/src/main/java/net/seninp/gi/repair/RePairGrammar.java index 900b87a..3556072 100644 --- a/src/main/java/net/seninp/gi/repair/RePairGrammar.java +++ b/src/main/java/net/seninp/gi/repair/RePairGrammar.java @@ -1,13 +1,16 @@ package net.seninp.gi.repair; -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.concurrent.atomic.AtomicInteger; +import com.gs.collections.api.iterator.MutableIntIterator; +import com.gs.collections.impl.set.mutable.primitive.IntHashSet; import net.seninp.gi.GrammarRuleRecord; import net.seninp.gi.GrammarRules; import net.seninp.gi.RuleInterval; import net.seninp.jmotif.sax.datastructures.SAXRecords; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.concurrent.atomic.AtomicInteger; + /** * A repair grammar container. * @@ -141,7 +144,7 @@ public GrammarRules toGrammarRulesData() { r0.setRuleNumber(0); r0.setRuleString(this.r0String); r0.setExpandedRuleString(this.r0ExpandedString); - r0.setOccurrences(new int[1]); + r0.setOccurrences(new IntHashSet()); res.addRule(r0); for (RePairRule rule : theRules.values()) { @@ -178,7 +181,11 @@ public void buildIntervals(SAXRecords records, double[] originalTimeSeries, int // System.out.println("R" + rr.ruleNumber + ", " + rr.toRuleString() + ", " // + rr.expandedRuleString); String[] split = rr.expandedRuleString.split(" "); - for (int pos : rr.getOccurrences()) { + + MutableIntIterator ii = rr.getOccurrences().intIterator(); + while (ii.hasNext()) { + int pos = ii.next(); + Integer p2 = records.mapStringIndexToTSPosition(pos + split.length - 1); if (null == p2) { rr.ruleIntervals.add(new RuleInterval(records.mapStringIndexToTSPosition(pos), diff --git a/src/main/java/net/seninp/gi/repair/RePairRule.java b/src/main/java/net/seninp/gi/repair/RePairRule.java index 96b3ebc..21d1213 100644 --- a/src/main/java/net/seninp/gi/repair/RePairRule.java +++ b/src/main/java/net/seninp/gi/repair/RePairRule.java @@ -1,8 +1,10 @@ package net.seninp.gi.repair; -import java.util.ArrayList; +import com.gs.collections.impl.set.mutable.primitive.IntHashSet; import net.seninp.gi.RuleInterval; +import java.util.ArrayList; + /** * The grammar rule. * @@ -35,7 +37,7 @@ public class RePairRule { protected int level; /** Occurrences. */ - protected ArrayList occurrences; + protected IntHashSet occurrences; /** Which TS interval covered. */ protected ArrayList ruleIntervals; @@ -46,7 +48,7 @@ public class RePairRule { /** * Constructor, assigns a rule ID using the global counter. */ - public RePairRule(RePairGrammar rg) { + public RePairRule(RePairGrammar rg, RePairSymbol first, RePairSymbol second) { this.grammar = rg; @@ -56,9 +58,11 @@ public RePairRule(RePairGrammar rg) { rg.theRules.put(this.ruleNumber, this); - this.occurrences = new ArrayList(); + this.occurrences = new IntHashSet(); this.ruleIntervals = new ArrayList(); + this.first = first; this.second = second; + } /** @@ -125,9 +129,9 @@ public String toExpandedRuleString() { * @param value the new value. */ public void addOccurrence(int value) { - if (!this.occurrences.contains(value)) { - this.occurrences.add(value); - } + + this.occurrences.add(value); + } /** @@ -135,12 +139,13 @@ public void addOccurrence(int value) { * * @return all rule's occurrences. */ - public int[] getOccurrences() { - int[] res = new int[this.occurrences.size()]; - for (int i = 0; i < this.occurrences.size(); i++) { - res[i] = this.occurrences.get(i); - } - return res; + public IntHashSet getOccurrences() { + return occurrences; +// int[] res = new int[this.occurrences.size()]; +// for (int i = 0; i < this.occurrences.size(); i++) { +// res[i] = this.occurrences.get(i); +// } +// return res; } public String toString() { diff --git a/src/main/java/net/seninp/gi/repair/RePairSymbol.java b/src/main/java/net/seninp/gi/repair/RePairSymbol.java index c07af57..eed7556 100644 --- a/src/main/java/net/seninp/gi/repair/RePairSymbol.java +++ b/src/main/java/net/seninp/gi/repair/RePairSymbol.java @@ -14,19 +14,22 @@ public class RePairSymbol { /** * Payload. */ - private char[] string; + final private char[] string; /** * Position of the symbol in the string. */ - private Integer stringPosition; + private int stringPosition; + + final static char[] blank = new char[0]; /** * Constructor. */ public RePairSymbol() { super(); - this.stringPosition = null; + this.stringPosition = -1; + this.string = blank; } /** @@ -89,6 +92,9 @@ public int getLevel() { return 0; } + + public char[] key() { return string; } + public String toString() { return String.valueOf(this.string); } @@ -98,7 +104,7 @@ public int hashCode() { final int prime = 31; int result = 1; result = prime * result + Arrays.hashCode(string); - result = prime * result + ((stringPosition == null) ? 0 : stringPosition.hashCode()); + result = prime * result + ((stringPosition == -1) ? -1 : stringPosition); return result; } @@ -106,19 +112,22 @@ public int hashCode() { public boolean equals(Object obj) { if (this == obj) return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) + if (!(obj instanceof RePairSymbol)) return false; + RePairSymbol other = (RePairSymbol) obj; - if (!Arrays.equals(string, other.string)) - return false; - if (stringPosition == null) { - if (other.stringPosition != null) + + if (stringPosition == -1) { + if (other.stringPosition != -1) return false; } - else if (!stringPosition.equals(other.stringPosition)) + else if (stringPosition!=other.stringPosition) + return false; + + if (!Arrays.equals(string, other.string)) return false; + + return true; } diff --git a/src/main/java/net/seninp/gi/repair/parallel/ParallelGrammarKeeper.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelGrammarKeeper.java new file mode 100644 index 0000000..123d57c --- /dev/null +++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelGrammarKeeper.java @@ -0,0 +1,188 @@ +package net.seninp.gi.repair.parallel; + +import com.gs.collections.api.list.MutableList; +import com.gs.collections.api.tuple.primitive.IntObjectPair; +import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap; +import net.seninp.gi.repair.RePairSymbol; + +import java.util.ArrayList; +import java.util.concurrent.atomic.AtomicInteger; + +/** + * This implements a handler for the Re-Pair grammar built in parallel. This data structure is + * responsible for enumerating rules and for tracking changes in the R0 of the grammar. + * + * @author psenin + */ +public class ParallelGrammarKeeper { + + private static final char SPACE = ' '; + private static final char THE_R = 'R'; + + // rule 0 gets a separate treatment, so we start from 1 + // + protected AtomicInteger numRules = new AtomicInteger(1); + + // the rules table + protected IntObjectHashMap theRules = new IntObjectHashMap(); + + // the grammar id + private long id; + + // R0 strings + // + protected String r0String; + public String r0ExpandedString; + + // keeps a working string of this grammar + // + protected ArrayList workString; + private MutableList> keys; + + /** + * Constructor. + * + * @param id The handler id. + */ + public ParallelGrammarKeeper(long id) { + super(); + this.id = id; + } + + /** + * The id is used to keep track of parallel chunks. + * + * @return the current ID. + */ + public long getId() { + return this.id; + } + + /** + * This is used in parallel. + * + * @param string the string we work with in parallel. + */ + public void setWorkString(ArrayList string) { + this.workString = string; + } + + /** + * Set the R0 string. + * + * @param string the R0 string value. + */ + public void setR0String(String string) { + this.r0String = string; + } + + /** + * Get the expanded R0 out. + * + * @return the expanded R0. + */ + public String getR0ExpandedString() { + return this.r0ExpandedString; + } + + /** + * This adds an existing rule to this grammar. Useful in merging. + * + * @param r The rule. It is not yet clear how to treat rules, be careful. This will not set the + * rule number, but it will increment the internal rule counter. + */ + public void addExistingRule(ParallelRePairRule r) { + r.grammarHandler = this; + if (this.theRules.containsKey(r.ruleNumber)) { + // we do override an existing rule + theRules.put(r.ruleNumber, r); + } else { + // plus 1 because the rule 0 has a special treatment + theRules.put(r.ruleNumber, r); + numRules.set(theRules.size() + 1); + } + } + + /** + * Expands all rules EXCEPT R0. + */ + public void expandRules() { + // iterate over all SAX containers + //ArrayList keys = new ArrayList(theRules.keySet()); + //Collections.sort(keys); + keys = keys(); + for (IntObjectPair key : keys) { + ParallelRePairRule rr = key.getTwo(); + + + String resultString = rr.toRuleString(); + + int currentSearchStart = resultString.indexOf(THE_R); + while (currentSearchStart >= 0) { + int spaceIdx = resultString.indexOf(" ", currentSearchStart); + // if (spaceIdx < 0) { + // System.out.println("gotcha!"); + // } + String ruleName = resultString.substring(currentSearchStart, spaceIdx + 1); + int ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1)); + + ParallelRePairRule rule = theRules.get(ruleId); + if (rule != null) { + if (rule.expandedRuleString.charAt(rule.expandedRuleString.length() - 1) == ' ') { + resultString = resultString.replaceAll(ruleName, rule.expandedRuleString); + } else { + resultString = resultString.replaceAll(ruleName, rule.expandedRuleString + SPACE); + } + } + + currentSearchStart = resultString.indexOf(THE_R, spaceIdx); + } + + rr.setExpandedRule(resultString.trim()); + + } + } + + public MutableList> keys() { + return theRules.keyValuesView().toSortedList(); + } + + /** + * Expands R0 specifically. + */ + public void expandR0() { + // string is immutable it will get copied + String finalString = this.r0String; + int currentSearchStart = finalString.indexOf(THE_R); + while (currentSearchStart >= 0) { + + int spaceIdx = finalString.indexOf(" ", currentSearchStart + 1); + + String ruleName = finalString.substring(currentSearchStart, spaceIdx + 1); + Integer ruleId = Integer.valueOf(ruleName.substring(1, ruleName.length() - 1)); + + ParallelRePairRule rr = theRules.get(ruleId); + if (null == rr.expandedRuleString) { + finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).toRuleString()); + } else { + finalString = finalString.replaceAll(ruleName, theRules.get(ruleId).expandedRuleString + + SPACE); + } + + currentSearchStart = finalString.indexOf(THE_R); + } + this.r0ExpandedString = finalString; + } + + public String toGrammarString() { + StringBuffer sb = new StringBuffer(); + System.out.println("R0 -> " + r0String); + for (int i = 1; i < theRules.size(); i++) { + ParallelRePairRule r = theRules.get(i); + sb.append("R").append(r.ruleNumber).append(" -> ").append(r.toRuleString()).append(" : ") + .append(r.expandedRuleString).append(", ").append(r.positions).append("\n"); + } + return sb.toString(); + } + +} diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairGuard.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairGuard.java similarity index 85% rename from src/main/java/net/seninp/gi/repair/ParallelRePairGuard.java rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairGuard.java index dc29c97..c5503ef 100644 --- a/src/main/java/net/seninp/gi/repair/ParallelRePairGuard.java +++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairGuard.java @@ -1,4 +1,6 @@ -package net.seninp.gi.repair; +package net.seninp.gi.repair.parallel; + +import net.seninp.gi.repair.RePairSymbol; /** * The guard used for non-terminals. diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairImplementation.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairImplementation.java similarity index 96% rename from src/main/java/net/seninp/gi/repair/ParallelRePairImplementation.java rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairImplementation.java index d4afca4..46395d4 100644 --- a/src/main/java/net/seninp/gi/repair/ParallelRePairImplementation.java +++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairImplementation.java @@ -1,20 +1,19 @@ -package net.seninp.gi.repair; +package net.seninp.gi.repair.parallel; + +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import com.gs.collections.api.list.MutableList; +import com.gs.collections.api.tuple.primitive.IntObjectPair; +import net.seninp.gi.repair.DigramFrequencies; +import net.seninp.gi.repair.DigramFrequencyEntry; +import net.seninp.gi.repair.RePairSymbol; +import net.seninp.util.StackTrace; +import org.slf4j.LoggerFactory; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; -import java.util.Hashtable; import java.util.Map.Entry; -import java.util.concurrent.CompletionService; -import java.util.concurrent.ExecutorCompletionService; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.Future; -import java.util.concurrent.TimeUnit; -import net.seninp.util.StackTrace; -import org.slf4j.LoggerFactory; -import ch.qos.logback.classic.Level; -import ch.qos.logback.classic.Logger; +import java.util.concurrent.*; public class ParallelRePairImplementation { @@ -64,8 +63,8 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr // the mapping of rule ID to the rule instance final HashMap ruleNumToRecord = new HashMap(); if (!(grammar.theRules.isEmpty())) { - for (Entry e : grammar.theRules.entrySet()) { - ruleNumToRecord.put(e.getKey(), e.getValue()); + for (IntObjectPair e : grammar.keys()) { + ruleNumToRecord.put(e.getOne(), e.getTwo()); } } // the data structure which keeps R0 strings that have been returned from workers @@ -159,7 +158,7 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr consoleLogger.debug("job " + chunkRes.getId() + " of chunk " + chunkJobIndex + " has finished"); - Hashtable chunkGrammarRulesData = chunkRes.theRules; + //IntObjectHashMap chunkGrammarRulesData = chunkRes.theRules; String R0String = chunkRes.r0String; chunkStrings.put(chunkJobIndex, R0String); @@ -178,8 +177,7 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr // these are the rule keys, they'll be used twice // - ArrayList keys = new ArrayList(chunkGrammarRulesData.keySet()); - Collections.sort(keys); + MutableList> keys = chunkRes.keys(); // for (int i = 0; i < keys.size(); i++) { // ParallelRePairRule r = chunkGrammarRulesData.get(keys.get(i)); @@ -193,9 +191,9 @@ public ParallelGrammarKeeper buildGrammar(ParallelGrammarKeeper grammar, int thr // these are guaranteed to come out in order // - for (int i = 0; i < keys.size(); i++) { + for (IntObjectPair k : keys) { - ParallelRePairRule r = chunkGrammarRulesData.get(keys.get(i)); + ParallelRePairRule r = k.getTwo(); consoleLogger.trace("processing rule " + r.getRuleName() + " -> " + r.toRuleString() + " : " + r.expandedRuleString); diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairRule.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairRule.java similarity index 95% rename from src/main/java/net/seninp/gi/repair/ParallelRePairRule.java rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairRule.java index 08b872f..03797da 100644 --- a/src/main/java/net/seninp/gi/repair/ParallelRePairRule.java +++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairRule.java @@ -1,4 +1,6 @@ -package net.seninp.gi.repair; +package net.seninp.gi.repair.parallel; + +import net.seninp.gi.repair.RePairSymbol; import java.util.ArrayList; diff --git a/src/main/java/net/seninp/gi/repair/ParallelRePairWorkerSingleLevel.java b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairWorkerSingleLevel.java similarity index 98% rename from src/main/java/net/seninp/gi/repair/ParallelRePairWorkerSingleLevel.java rename to src/main/java/net/seninp/gi/repair/parallel/ParallelRePairWorkerSingleLevel.java index 462956b..1f64c97 100644 --- a/src/main/java/net/seninp/gi/repair/ParallelRePairWorkerSingleLevel.java +++ b/src/main/java/net/seninp/gi/repair/parallel/ParallelRePairWorkerSingleLevel.java @@ -1,4 +1,8 @@ -package net.seninp.gi.repair; +package net.seninp.gi.repair.parallel; + +import net.seninp.gi.repair.DigramFrequencies; +import net.seninp.gi.repair.DigramFrequencyEntry; +import net.seninp.gi.repair.RePairSymbol; import java.util.ArrayList; import java.util.Map.Entry; diff --git a/src/main/java/net/seninp/gi/sequitur/SAXRule.java b/src/main/java/net/seninp/gi/sequitur/SAXRule.java index a81034b..2f1d232 100644 --- a/src/main/java/net/seninp/gi/sequitur/SAXRule.java +++ b/src/main/java/net/seninp/gi/sequitur/SAXRule.java @@ -19,15 +19,13 @@ of the License, or (at your option) any later version. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Set; -import java.util.TreeSet; -import java.util.Vector; -import java.util.concurrent.atomic.AtomicInteger; +import com.gs.collections.impl.set.mutable.primitive.IntHashSet; import net.seninp.gi.GrammarRuleRecord; import net.seninp.gi.GrammarRules; +import java.util.*; +import java.util.concurrent.atomic.AtomicInteger; + /** * The Rule. Adaption of Eibe Frank code for JMotif API, see {@link sequitur.info} for original * version. @@ -77,7 +75,7 @@ public class SAXRule { * This keeps rule indexes - once rule created or used, its placement position is extracted from * the TerminalSymbol position and stored here. */ - protected Set indexes = new TreeSet(); + protected IntHashSet indexes = new IntHashSet(); /** * Constructor. @@ -218,15 +216,19 @@ private static void expandRules() { // } // }); + StringBuilder resultString = new StringBuilder(64); + // for (SAXMapEntry entry : recs) { for (GrammarRuleRecord ruleRecord : arrRuleRecords) { + if (ruleRecord.getRuleNumber() == 0) { continue; } + resultString.setLength(0); + String curString = ruleRecord.getRuleString(); - StringBuilder resultString = new StringBuilder(8192); String[] split = curString.split(" "); @@ -240,12 +242,13 @@ private static void expandRules() { } // need to trim space at the very end - String rr = resultString.delete(0, 1).append(" ").toString(); + String rr = resultString.delete(0, 1).append(' ').toString(); ruleRecord.setExpandedRuleString(rr); ruleRecord.setRuleYield(countSpaces(rr)); } - StringBuilder resultString = new StringBuilder(8192); + //StringBuilder resultString = new StringBuilder(8192); + resultString.setLength(0); GrammarRuleRecord ruleRecord = arrRuleRecords.get(0); resultString.append(ruleRecord.getRuleString()); @@ -324,14 +327,8 @@ public void addIndex(int position) { * * @return all the rule occurrences. */ - private int[] getIndexes() { - int[] res = new int[this.indexes.size()]; - int i = 0; - for (Integer idx : this.indexes) { - res[i] = idx; - i++; - } - return res; + private IntHashSet getIndexes() { + return indexes; } /** @@ -481,7 +478,7 @@ public static String printRules() { } text.append(TAB).append(arrRuleRecords.get(processedRules).getExpandedRuleString()) .append(TAB); - text.append(Arrays.toString(currentRule.getIndexes())).append(CR); + text.append(currentRule.getIndexes()).append(CR); processedRules++; diff --git a/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java b/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java index 0d08331..82040f6 100644 --- a/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java +++ b/src/main/java/net/seninp/gi/sequitur/SAXSymbol.java @@ -19,7 +19,9 @@ of the License, or (at your option) any later version. Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ +import java.util.HashMap; import java.util.Hashtable; +import java.util.Map; import java.util.Map.Entry; /** @@ -41,10 +43,10 @@ public abstract class SAXSymbol { private static final int prime = 2265539; /** Hashtable to keep track of all digrams. This is static - single instance for all. */ - protected static final Hashtable theDigrams = new Hashtable( + protected static final Map theDigrams = new HashMap( SAXSymbol.prime); - public static Hashtable> theSubstituteTable = new Hashtable>( + public static Map> theSubstituteTable = new HashMap( SAXSymbol.prime); /** The symbol value. */ @@ -170,11 +172,12 @@ public boolean check() { return false; } - if (!theDigrams.containsKey(this)) { + SAXSymbol found; + if ((found = theDigrams.putIfAbsent(this, this))==null) { // System.out.println("[sequitur debug] *check...* digrams contain this (" + this.value + "~" // + this.n.value + ")? NO. Checking in."); // found = theDigrams.put(this, this); - theDigrams.put(this, this); + //theDigrams.put(this, this); // System.out.println(" *** Digrams now: " + makeDigramsTable()); // System.out.println("[sequitur debug] *digrams* " + hash2String()); return false; @@ -184,7 +187,7 @@ public boolean check() { // + this.n.value + ")? Yes. Oh-Oh..."); // well the same hash is in the store, lemme see... - SAXSymbol found = theDigrams.get(this); + //found = theDigrams.get(this); // if it's not me, then lets call match magic? if (found.n != this) { diff --git a/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java b/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java index 4b4686d..487013f 100644 --- a/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java +++ b/src/main/java/net/seninp/gi/sequitur/SequiturFactory.java @@ -1,14 +1,8 @@ package net.seninp.gi.sequitur; -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Hashtable; -import java.util.StringTokenizer; -import java.util.concurrent.atomic.AtomicInteger; +import ch.qos.logback.classic.Level; +import ch.qos.logback.classic.Logger; +import com.gs.collections.api.iterator.MutableIntIterator; import net.seninp.gi.GrammarRuleRecord; import net.seninp.gi.GrammarRules; import net.seninp.gi.RuleInterval; @@ -19,8 +13,16 @@ import net.seninp.jmotif.sax.alphabet.NormalAlphabet; import net.seninp.jmotif.sax.datastructures.SAXRecords; import org.slf4j.LoggerFactory; -import ch.qos.logback.classic.Level; -import ch.qos.logback.classic.Logger; + +import java.io.BufferedWriter; +import java.io.File; +import java.io.FileWriter; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Hashtable; +import java.util.StringTokenizer; +import java.util.concurrent.atomic.AtomicInteger; /** * Sort of a stand-alone factory to digesting strings with Sequitur. @@ -157,7 +159,7 @@ public static SAXRule runSequiturWithEditDistanceThreshold(String string, Intege normalA.getDistanceMatrix(alphabetSize)); if (dist < threshold) { merged = true; - SAXSymbol.theSubstituteTable.get(str).put(token.substring(0), currentPosition); + SAXSymbol.theSubstituteTable.get(str).put(token, currentPosition); token = str; } } @@ -214,13 +216,16 @@ public static ArrayList getRulePositionsByRuleNum(int ruleIdx, SAX // array of all words of this expanded rule String[] expandedRuleSplit = ruleContainer.getExpandedRuleString().trim().split(" "); - for (Integer currentIndex : ruleContainer.getOccurrences()) { + MutableIntIterator ii = ruleContainer.getOccurrences().intIterator(); + while (ii.hasNext()) { + + int currentIndex = ii.next(); // System.out.println("Index: " + currentIndex); String extractedStr = ""; int[] extractedPositions = new int[expandedRuleSplit.length]; for (int i = 0; i < expandedRuleSplit.length; i++) { - consoleLogger.trace("currentIndex " + currentIndex + ", i: " + i); + //consoleLogger.trace("currentIndex " + currentIndex + ", i: " + i); extractedStr = extractedStr.concat(" ").concat( String.valueOf(saxFrequencyData.getByIndex(saxWordsIndexes.get(currentIndex + i)) .getPayload())); @@ -543,7 +548,10 @@ public static void updateRuleIntervals(GrammarRules rules, SAXRecords saxFrequen // iterate over all occurrences of this rule // the currentIndex here is the position of the rule in the input string // - for (Integer currentIndex : ruleContainer.getOccurrences()) { + MutableIntIterator ii = ruleContainer.getOccurrences().intIterator(); + while (ii.hasNext()) { + + int currentIndex = ii.next(); // System.out.println("Index: " + currentIndex); // String extractedStr = ""; diff --git a/src/main/java/net/seninp/gi/performance/EvaluateParallelRePair.java b/src/main/java/net/seninp/gi/util/EvaluateParallelRePair.java similarity index 96% rename from src/main/java/net/seninp/gi/performance/EvaluateParallelRePair.java rename to src/main/java/net/seninp/gi/util/EvaluateParallelRePair.java index dc71937..09ef101 100644 --- a/src/main/java/net/seninp/gi/performance/EvaluateParallelRePair.java +++ b/src/main/java/net/seninp/gi/util/EvaluateParallelRePair.java @@ -1,4 +1,4 @@ -package net.seninp.gi.performance; +package net.seninp.gi.util; import java.io.BufferedReader; import java.io.FileInputStream; @@ -9,8 +9,8 @@ import java.util.ArrayList; import java.util.Date; import java.util.zip.GZIPInputStream; -import net.seninp.gi.repair.ParallelGrammarKeeper; -import net.seninp.gi.repair.ParallelRePairImplementation; +import net.seninp.gi.repair.parallel.ParallelGrammarKeeper; +import net.seninp.gi.repair.parallel.ParallelRePairImplementation; import net.seninp.gi.repair.RePairFactory; import net.seninp.gi.repair.RePairGrammar; import net.seninp.gi.repair.RePairSymbol; diff --git a/src/main/java/net/seninp/gi/MemoryLeakTester.java b/src/main/java/net/seninp/gi/util/MemoryLeakTester.java similarity index 69% rename from src/main/java/net/seninp/gi/MemoryLeakTester.java rename to src/main/java/net/seninp/gi/util/MemoryLeakTester.java index aff60ea..11b231e 100644 --- a/src/main/java/net/seninp/gi/MemoryLeakTester.java +++ b/src/main/java/net/seninp/gi/util/MemoryLeakTester.java @@ -1,5 +1,6 @@ -package net.seninp.gi; +package net.seninp.gi.util; +import net.seninp.gi.GrammarRules; import net.seninp.gi.sequitur.SequiturFactory; import net.seninp.jmotif.sax.NumerosityReductionStrategy; import net.seninp.jmotif.sax.TSProcessor; @@ -16,12 +17,12 @@ public class MemoryLeakTester { public static void main(String[] args) throws Exception { - try { - Thread.sleep(10000); // 1000 milliseconds is one second. - } - catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - } +// try { +// Thread.sleep(10000); // 1000 milliseconds is one second. +// } +// catch (InterruptedException ex) { +// Thread.currentThread().interrupt(); +// } double[] ts = TSProcessor.readFileColumn(INPUT_FNAME, 0, 0); System.out.println("Read " + ts.length + " points from " + INPUT_FNAME); @@ -33,22 +34,22 @@ public static void main(String[] args) throws Exception { Thread.currentThread().interrupt(); } - for (int i = 0; i < 20; i++) { + /*for (int i = 0; i < 20; i++) { System.out.println("Iteration " + i); - System.gc(); + System.gc();*/ GrammarRules g = SequiturFactory.series2SequiturRules(ts, SAX_WIN_SIZE, SAX_PAA_SIZE, SAX_A_SIZE, NumerosityReductionStrategy.EXACT, SAX_NORM_THRESHOLD); System.out.println("Inferred " + g.size() + " rules."); - try { - Thread.sleep(10000); // 1000 milliseconds is one second. - } - catch (InterruptedException ex) { - Thread.currentThread().interrupt(); - } +// try { +// Thread.sleep(10000); // 1000 milliseconds is one second. +// } +// catch (InterruptedException ex) { +// Thread.currentThread().interrupt(); +// } - } + //} } diff --git a/src/main/java/net/seninp/gi/util/RunLive.java b/src/main/java/net/seninp/gi/util/RunLive.java new file mode 100644 index 0000000..e7b04c4 --- /dev/null +++ b/src/main/java/net/seninp/gi/util/RunLive.java @@ -0,0 +1,21 @@ +package net.seninp.gi.util; + +import net.seninp.gi.repair.RePairFactory; +import net.seninp.gi.repair.RePairGrammar; + +import java.util.Scanner; + +/** + * Created by me on 7/11/15. + */ +public class RunLive { + + public static void main(String args[]) { + while (true) { + String l = new Scanner(System.in).nextLine(); + RePairGrammar x = RePairFactory.buildGrammar(l); + System.out.println(x.toGrammarRules()); + System.out.println(x.toGrammarRulesData()); + } + } +} diff --git a/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java b/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java index 065aad0..09c5fdc 100644 --- a/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java +++ b/src/test/java/net/seninp/gi/repair/TestParallelRePairImplementation.java @@ -3,6 +3,9 @@ import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; import java.util.ArrayList; + +import net.seninp.gi.repair.parallel.ParallelGrammarKeeper; +import net.seninp.gi.repair.parallel.ParallelRePairImplementation; import net.seninp.jmotif.sax.NumerosityReductionStrategy; import net.seninp.jmotif.sax.TSProcessor; import net.seninp.jmotif.sax.datastructures.SAXRecord; diff --git a/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java b/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java index 3522bbd..0d2af82 100644 --- a/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java +++ b/src/test/java/net/seninp/gi/sequitur/TestSequiturPaperGrammars.java @@ -1,12 +1,11 @@ package net.seninp.gi.sequitur; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; -import static org.junit.Assert.fail; import net.seninp.gi.GrammarRules; import net.seninp.util.StackTrace; import org.junit.Test; +import static org.junit.Assert.*; + public class TestSequiturPaperGrammars { private static final String TEST1_STRING = "a b c d b c"; @@ -28,6 +27,8 @@ public void test3() { SAXRule r = SequiturFactory.runSequitur(TEST3_STRING); GrammarRules rules = r.toGrammarRulesData(); + System.out.println(rules); + assertEquals("test hierarchy", 5, rules.size()); assertTrue("test r0", TEST3_R0.equals(rules.get(0).getRuleString().trim())); @@ -45,7 +46,7 @@ public void test2() { SAXRule r = SequiturFactory.runSequitur(TEST2_STRING); GrammarRules rules = r.toGrammarRulesData(); - assertTrue("test r0", TEST2_R0.equals(rules.get(0).getRuleString().trim())); + assertEquals(TEST2_R0, (rules.get(0).getRuleString().trim())); assertTrue("test r1", TEST2_R1.equals(rules.get(1).getRuleString().trim())); assertTrue("test r1", TEST2_R2.equals(rules.get(2).getRuleString().trim())); }