diff --git a/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java b/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java
index adb6b7a23..555198a67 100644
--- a/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java
+++ b/src/cc/mallet/pipe/TokenSequenceRemoveStopwords.java
@@ -12,7 +12,6 @@
package cc.mallet.pipe;
-import java.util.HashSet;
import java.util.ArrayList;
import java.io.*;
@@ -20,6 +19,8 @@
import cc.mallet.types.Instance;
import cc.mallet.types.Token;
import cc.mallet.types.TokenSequence;
+import gnu.trove.THashSet;
+
/**
* Remove tokens from the token sequence in the data field whose text is in the stopword list.
@author Andrew McCallum mccallum@cs.umass.edu
@@ -28,13 +29,13 @@
public class TokenSequenceRemoveStopwords extends Pipe implements Serializable
{
- // xxx Use a gnu.trove collection instead
+ // Stopword set, backed by gnu.trove.THashSet.
- HashSet stoplist = null;
+ THashSet stoplist = null;
boolean caseSensitive = true;
boolean markDeletions = false;
- private HashSet newDefaultStopList ()
+ private THashSet newDefaultStopList ()
{
- HashSet sl = new HashSet();
+ THashSet sl = new THashSet<>();
for (int i = 0; i < stopwords.length; i++)
sl.add (stopwords[i]);
return sl;
@@ -81,7 +82,7 @@ private static InputStream fileToInputStream(File file)
public TokenSequenceRemoveStopwords(InputStream stoplistStream, String encoding, boolean includeDefault,
boolean caseSensitive, boolean markDeletions) {
- if (! includeDefault) { stoplist = new HashSet(); }
+ if (! includeDefault) { stoplist = new THashSet(); }
else { stoplist = newDefaultStopList(); }
try {
@@ -164,7 +165,6 @@ private String[] streamToStringArray(InputStream stream, String encoding) throws
return wordarray.toArray(new String[]{});
}
-
public Instance pipe (Instance carrier)
{
TokenSequence ts = (TokenSequence) carrier.getData();
@@ -186,30 +186,30 @@ public Instance pipe (Instance carrier)
return carrier;
}
- // Serialization
-
+ // Serialization
+
private static final long serialVersionUID = 1;
private static final int CURRENT_SERIAL_VERSION = 2;
-
+
+ /**
+  * Custom serialization hook: writes CURRENT_SERIAL_VERSION, then the
+  * caseSensitive and markDeletions flags, then the stoplist object.
+  * readObject consumes the fields in this same order.
+  *
+  * NOTE(review): this patch changes the stoplist's declared type from
+  * java.util.HashSet to gnu.trove.THashSet while CURRENT_SERIAL_VERSION
+  * stays 2 -- confirm that streams written before the change are still
+  * expected to load.
+  *
+  * @param out stream the state is written to
+  * @throws IOException if any of the writes fails
+  */
private void writeObject (ObjectOutputStream out) throws IOException {
out.writeInt (CURRENT_SERIAL_VERSION);
out.writeBoolean(caseSensitive);
out.writeBoolean(markDeletions);
out.writeObject(stoplist); // New as of CURRENT_SERIAL_VERSION 2
}
-
+
+ /**
+  * Custom deserialization hook: reads the serial version, then caseSensitive,
+  * then markDeletions (only when version > 0), then the stoplist (only when
+  * version > 1). Fields that the stream's version does not carry are not
+  * assigned here.
+  *
+  * @param in stream the state is read from
+  * @throws IOException if any of the reads fails
+  * @throws ClassNotFoundException if the class of the stored stoplist cannot be resolved
+  */
private void readObject (ObjectInputStream in) throws IOException, ClassNotFoundException {
int version = in.readInt ();
caseSensitive = in.readBoolean();
if (version > 0)
markDeletions = in.readBoolean();
if (version > 1) {
- stoplist = (HashSet) in.readObject();
+ // Version-2 streams written while the field was a java.util.HashSet carry
+ // a HashSet here, so a direct (THashSet) cast would throw
+ // ClassCastException. Keep a THashSet (or null) as-is and copy any other
+ // collection into a new THashSet.
+ Object stored = in.readObject();
+ if (stored == null || stored instanceof THashSet) {
+     stoplist = (THashSet) stored;
+ } else {
+     stoplist = new THashSet((java.util.Collection) stored);
+ }
}
}
-
+
static final String[] stopwords =
{
"a",
@@ -746,7 +746,7 @@ private void readObject (ObjectInputStream in) throws IOException, ClassNotFound
//"concludes",
//"based",
//"approach"
- };
+ };
//stopwords for french, added by Limin Yao
static final String[] stopwordsFrench = {
"fut",