From 7ca0fd00f6a71c8d7c188ac43eb50a439fe8edf2 Mon Sep 17 00:00:00 2001 From: Boyuan Ning <73546421+ningboyuan@users.noreply.github.com> Date: Sun, 13 Mar 2022 02:51:59 +0100 Subject: [PATCH] Update TXTfpbsupervised.py --- TXTfpbsupervised/TXTfpbsupervised.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/TXTfpbsupervised/TXTfpbsupervised.py b/TXTfpbsupervised/TXTfpbsupervised.py index 53e3217..777c600 100644 --- a/TXTfpbsupervised/TXTfpbsupervised.py +++ b/TXTfpbsupervised/TXTfpbsupervised.py @@ -10,14 +10,14 @@ from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import TfidfTransformer -from sklearn.cross_validation import StratifiedKFold -from sklearn.cross_validation import cross_val_score +from sklearn.model_selection import KFold +from sklearn.model_selection import cross_val_score # svm from sklearn.linear_model import SGDClassifier # grid search -from sklearn.grid_search import GridSearchCV +from sklearn.model_selection import GridSearchCV from pprint import pprint from time import time @@ -115,10 +115,10 @@ def indexer(index1, index2): return [index1[i] for i in index2] -def cver(y, x, folds, seed): +def cver(y, x, splits, seed): """Stratified k-fold crossvalidation with upsampling.""" - skf = StratifiedKFold(y, n_folds = folds, shuffle = True, - random_state = seed) + kf = KFold(n_splits = splits, shuffle = True, + random_state = seed) ind = np.array(list(range(0, len(x)))) @@ -126,7 +126,7 @@ def cver(y, x, folds, seed): x_train, x_test = [], [] y_train, y_test = [], [] - for train_set, test_set in skf: + for train_set, test_set in kf.split(x): x_train.append(x[train_set]) x_test.append(x[test_set]) y_train.append(y[train_set]) @@ -209,7 +209,7 @@ def cv_pred(x, y, custom_cv, piper, unique_y = True): # setup seed = 123 -folds = 5 +splits = 5 # CountVectorizer stop_words = (None, 'english') @@ -231,12 +231,12 @@ def cv_pred(x, y, custom_cv, piper, unique_y = True): lem = df["lemma"] sco = df["sentiment"] -custom_cv = cver(sco, lem, folds, seed) +custom_cv = cver(sco, lem, splits, seed) piper = Pipeline([("vect", CountVectorizer(tokenizer = tokenize)), ("tfidf", TfidfTransformer()), ("clf", SGDClassifier(shuffle = True, - n_iter = 80, + max_iter = 80, random_state = seed)), ])