In my sklearn classification model, when I manually set the TfidfVectorizer parameter ngram_range=(4, 4), I get 0.58 for f1_macro, whereas for unigrams (1, 1), for example, the result is 0.49.
The problem is that when I use GridSearchCV to pick the best parameters, it does not give me the optimal ones but instead returns the result for the first element of the parameter set. Have a look at my code to make this clearer:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, average_precision_score
import re
from os import walk
import csv
import operator
from sklearn.model_selection import train_test_split, GridSearchCV
# variables --
co = dict()
lex = []
def tokenizeManu(txt):
    txt = clean_str(txt)
    return txt.split()
def tokenizeTfidf(txt):
    return txt.split()  # It just splits the text without any other processing
def repAllTxt(txt):
    out = re.sub("[a-z]|[A-Z]", '*', txt)
    out = re.sub("[0-9]", '#', out)
    return out
def corpDict(x):
    count = CountVectorizer(ngram_range=(1, 1), tokenizer=tokenizeManu, lowercase=False)
    countFit = count.fit_transform(x)
    vocab = count.get_feature_names()
    dist = np.sum(countFit.toarray(), axis=0)
    for tag, count in zip(vocab, dist):
        co[tag] = count
    # print(len(co))
def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9()\-\":.$,!?\'\`]", r" ", string)
    string = re.sub(r"([()\-\":.,!?\'\`])", r" \1 ", string)
    string = re.sub(r"\'s", r" \'s", string)
    string = re.sub(r"\'m", r" \'m", string)
    string = re.sub(r"\'ve", r" \'ve", string)
    string = re.sub(r"n\'t", r" n\'t", string)
    string = re.sub(r"\'re", r" \'re", string)
    string = re.sub(r"\'d", r" \'d", string)
    string = re.sub(r"\'ll", r" \'ll", string)
    string = re.sub(r"\s{2,}", r" ", string)
    return string.strip()
def readLexicons():
    path = 'lexicons'
    # Load data from files
    f = []
    for (dirpath, dirnames, filenames) in walk(path):
        for i in filenames:
            f.append(str(dirpath+'\\'+i))
    lexList = []
    for pa in f:
        if pa.endswith('txt') == True:
            with open(pa, encoding="utf8") as inf:
                reader = csv.reader(inf, delimiter='\n', quoting=csv.QUOTE_NONE)
                col = list(zip(*reader))
                lexList.extend(col[0])
        else:
            file_object = open(pa, "r")
            file_object = file_object.read()
            file_object = re.findall(r'((?<=word1=)\w+)', file_object)
            lexList.extend(file_object)
    lex.extend(lexList)
def prepTxtStar(X, kValue, maintainLex):
    sorted_co = sorted(co.items(), key=operator.itemgetter(1), reverse=True)[:kValue]
    sorted_co = list([i[0] for i in sorted_co])
    for row in range(len(X)):
        c = str(X[row]).split()
        for i in range(len(c)):
            if c[i] in co.keys():
                if not sorted_co.__contains__(c[i]):
                    if maintainLex == 0:
                        c[i] = repAllTxt(c[i])
                    else:
                        if not lex.__contains__(c[i]):
                            c[i] = repAllTxt(c[i])
        X[row] = ' '.join(c)
    for x in X[:3]:
        print(x)
    return X
def readFiles():
    path = 'datasetpaaaaaaaaaaath/ds.txt'
    f = []
    for (dirpath, dirnames, filenames) in walk(path):
        for i in filenames:
            f.append(str(dirpath+'\\'+i))
    x = []
    y = []
    lexList = []
    for pa in f:
        if pa.endswith('txt') == True:
            with open(pa, encoding="utf8") as inf:
                reader = csv.reader(inf, delimiter='\t', quoting=csv.QUOTE_NONE)
                col = list(zip(*reader))
                x.extend(col[2])
                y.extend(col[3])
    return x, y
if __name__ == "__main__":
    xOri, yOri = readFiles()
    xOri = [clean_str(i) for i in xOri]
    readLexicons()
    corpDict(xOri)
    xOri = prepTxtStar(xOri, kValue=10000000, maintainLex=0)
    x, xTest, y, yTest = train_test_split(xOri, yOri, test_size=0.32, random_state=42)
    model = Pipeline([
        ('tfidf', TfidfVectorizer(analyzer='char_wb', min_df=0.0007, lowercase=False, tokenizer=tokenizeTfidf)),
        ('clf', SGDClassifier(tol=None, loss='hinge', random_state=38, max_iter=5))
    ])
    # Grid search
    parameters = {
        'tfidf__ngram_range': [(1,1), (2,2), (3,3), (4,4), (5,5), (6,6)]
    }
    gs_clf = GridSearchCV(model, parameters, n_jobs=-1, scoring='f1_macro')
    gs_clf = gs_clf.fit(x, y)
    predicted = gs_clf.predict(xTest)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    print('F1 Macro: ', f1_score(yTest, predicted, average='macro'))
In this example, I get the following results:
tfidf__ngram_range: (1, 1)
F1 Macro: 0.4927875243664717
So it picks (1, 1), the first element of the parameter set, while the best one according to f1_score is (4, 4)!
What is the problem, what am I missing?
Edit: the full source code is added together with the dataset: Dataset
Answer 0: (score: 0)
GridSearchCV chooses the best hyperparameters based on the mean score, computed over all cross-validation folds for a given hyperparameter set. In other words, you can manually pick a particular train/test split on which some other parameter set performs better than the fold-averaged score that GridSearchCV compares.
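To see exactly what GridSearchCV is comparing, you can inspect its cv_results_ attribute after fitting. A minimal sketch, assuming gs_clf is the fitted GridSearchCV from the question (pandas is used here only for readable printing):

import pandas as pd

cv = pd.DataFrame(gs_clf.cv_results_)  # fold-averaged scores per candidate
print(cv[['param_tfidf__ngram_range', 'mean_test_score', 'std_test_score', 'rank_test_score']])

The row with rank_test_score == 1 is the candidate reported by best_params_; its mean_test_score can legitimately differ from the f1_macro you measure on your own held-out split.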
I finally found time to play with your dataset. I got a (cross-validated) score of 0.559 on the training set and 0.633 on the test set.
Results:
In [116]: grid.best_score_
Out[116]: 0.5598812206142373
In [117]: grid.score(X_test, y_test)
Out[117]: 0.6330340557275542
In [118]: joblib.dump(grid, 'grid_SGD_stats_speaker.pkl')
Out[118]: ['grid_SGD_stats_speaker.pkl']
In [119]: grid.best_params_
Out[119]:
{'clf': SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1,
eta0=0.0, fit_intercept=True, l1_ratio=0.15,
learning_rate='optimal', loss='hinge', max_iter=500, n_iter=None,
n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
shuffle=True, tol=None, verbose=0, warm_start=False),
'clf__alpha': 1e-05,
'clf__max_iter': 500,
'union__text__vect': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
lowercase=True, max_df=1.0, max_features=None, min_df=1,
ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
stop_words=None, strip_accents=None, sublinear_tf=False,
token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
vocabulary=None),
'union__text__vect__ngram_range': (1, 1)}
Code:
# https://stackoverflow.com/questions/49438764/gridsearchcv-not-choose-the-optimal-result
# data set: http://www.mediafire.com/file/8tdb7p9hjfom7x1/ds.txt
try:
    from pathlib import Path
except ImportError:  # Python 2
    from pathlib2 import Path
import os
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack
def get_data(path):
    cols = ['num','speaker','text','label']
    df = (pd.read_csv(path, sep='\t', header=None, dtype={'speaker':'category'},
                      names=cols, usecols=['speaker','text','label'])
            .query("speaker != 'SYSTEM'"))
    #df['speaker_code'], speaker_names = pd.factorize(df['speaker'])
    return df
class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, name=None, position=None,
                 as_cat_codes=False, sparse=False):
        self.name = name
        self.position = position
        self.as_cat_codes = as_cat_codes
        self.sparse = sparse

    def fit(self, X, y=None):
        return self

    def transform(self, X, **kwargs):
        if self.name is not None:
            col_pos = X.columns.get_loc(self.name)
        elif self.position is not None:
            col_pos = self.position
        else:
            raise Exception('either [name] or [position] parameter must be not-None')
        if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
            ret = X.iloc[:, col_pos].cat.codes
        else:
            ret = X.iloc[:, col_pos]
        if self.sparse:
            ret = csr_matrix(ret.values.reshape(-1,1))
        return ret
class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{#'length': len(text),
                 'num_sentences': text.count('. '),
                 #'has_dollar_sign': '$' in text,
                }
                for text in posts]
def train_models(X_train, y_train):
    union = FeatureUnion([
        ('text',
         Pipeline([
             ('select', FeatureSelector('text')),
             ('vect', TfidfVectorizer(ngram_range=(1, 1))),
         ])),
        ('stats',
         Pipeline([
             ('select', FeatureSelector('text')),
             ('stats', TextStats()),
             ('dict_vect', DictVectorizer()),
         ])),
        ('speaker',
         Pipeline([
             ('select', FeatureSelector('speaker', sparse=True,
                                        as_cat_codes=True)),
             #('scale', StandardScaler(with_mean=False)),
         ])),
    ])

    pipe = Pipeline([
        ('union', union),
        ('clf', MLPClassifier(hidden_layer_sizes=(100,)))
    ])

    param_grid = [
        #{
        #    #'union__text__vect': [TfidfVectorizer()],
        #    'clf': [MLPClassifier()],
        #    #'union__text__vect__ngram_range': [(1,1), (2,5)],
        #    #'union__text__vect__analyzer': ['word','char_wb'],
        #    'clf__alpha': np.logspace(-5, -2, 4),
        #},
        {
            'union__text__vect': [TfidfVectorizer(ngram_range=(1, 1))],
            'clf': [SGDClassifier()],
            'union__text__vect__ngram_range': [(1,1), (2,5)],
            #'vect__analyzer': ['word','char_wb'],
            'clf__alpha': np.logspace(-5, 0, 6),
            'clf__max_iter': [500],
        },
        #{
        #    'union__text__vect': [TfidfVectorizer(ngram_range=(1, 1))],
        #    'clf': [MultinomialNB()],
        #    'union__text__vect__ngram_range': [(1,1), (2,5)],
        #    #'vect__analyzer': ['word','char_wb'],
        #    'clf__alpha': np.logspace(-3, 1, 5),
        #},
    ]

    grid = (GridSearchCV(pipe, param_grid=param_grid,
                         scoring='f1_macro', cv=3,
                         n_jobs=1, verbose=2)
            .fit(X_train, y_train))

    return grid
###########
os.chdir(r'/path/to/your/dataset/dir')
path = './ds.txt'
df = get_data(path)
X_train, X_test, y_train, y_test = train_test_split(df[['speaker','text']], df['label'], test_size=0.2)
grid = train_models(X_train, y_train)
joblib.dump(grid, 'grid_SGD_stats_speaker.pkl')
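As a usage note (a sketch, not part of the original run): the persisted grid can later be reloaded with the same joblib import used above and scored on the held-out split.

grid = joblib.load('grid_SGD_stats_speaker.pkl')  # reload the fitted search
print(grid.best_params_)
print(grid.score(X_test, y_test))                 # f1_macro, the scoring passed to GridSearchCV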