Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions TXTfpbsupervised/TXTfpbsupervised.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# svm
from sklearn.linear_model import SGDClassifier

# grid search
from sklearn.grid_search import GridSearchCV
from sklearn.model_selection import GridSearchCV
from pprint import pprint
from time import time

Expand Down Expand Up @@ -115,18 +115,18 @@ def indexer(index1, index2):
return [index1[i] for i in index2]


def cver(y, x, folds, seed):
def cver(y, x, splits, seed):
"""Shuffled (non-stratified) k-fold cross-validation with upsampling."""
skf = StratifiedKFold(y, n_folds = folds, shuffle = True,
random_state = seed)
kf = KFold(n_splits = splits, shuffle = True,
random_state = seed)

ind = np.array(list(range(0, len(x))))

ind_train, ind_test = [], []
x_train, x_test = [], []
y_train, y_test = [], []

for train_set, test_set in skf:
for train_set, test_set in kf.split(x):
x_train.append(x[train_set])
x_test.append(x[test_set])
y_train.append(y[train_set])
Expand Down Expand Up @@ -209,7 +209,7 @@ def cv_pred(x, y, custom_cv, piper, unique_y = True):

# setup
seed = 123
folds = 5
splits = 5

# CountVectorizer
stop_words = (None, 'english')
Expand All @@ -231,12 +231,12 @@ def cv_pred(x, y, custom_cv, piper, unique_y = True):
lem = df["lemma"]
sco = df["sentiment"]

custom_cv = cver(sco, lem, folds, seed)
custom_cv = cver(sco, lem, splits, seed)

piper = Pipeline([("vect", CountVectorizer(tokenizer = tokenize)),
("tfidf", TfidfTransformer()),
("clf", SGDClassifier(shuffle = True,
n_iter = 80,
max_iter = 80,
random_state = seed)),
])

Expand Down