GridSearchCV not choosing the optimal result

时间:2018-03-22 21:26:40

标签: python scikit-learn grid-search tfidfvectorizer

In my sklearn classification model, when I set the TfidfVectorizer parameter ngram_range=(4,4) manually, I get 0.58 for f1_macro; for unigrams (1,1), for example, the result is 0.49.

The problem is that when I use GridSearchCV to pick the best parameters, it does not return the best ones; it returns the result for the first element of the parameter set. Have a look at my code to make this clearer:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, average_precision_score
from sklearn.model_selection import train_test_split, GridSearchCV
import re
from os import walk
import csv
import operator

# variables --
co = dict()
lex = []

def tokenizeManu(txt):
    txt = clean_str(txt)
    return txt.split()

def tokenizeTfidf(txt):
    return txt.split()  # It just splits the text without any further operation

def repAllTxt(txt):
    out = re.sub("[a-z]|[A-Z]", '*', txt)
    out = re.sub("[0-9]", '#', out)
    return out

def corpDict(x):
    count = CountVectorizer(ngram_range=(1, 1), tokenizer=tokenizeManu, lowercase=False)
    countFit = count.fit_transform(x)
    vocab = count.get_feature_names()
    dist = np.sum(countFit.toarray(), axis=0)
    for tag, cnt in zip(vocab, dist):
        co[tag] = cnt

    # print(len(co))

def clean_str(string):
    string = re.sub(r"[^A-Za-z0-9()\-\":.$,!?\'\`]", r" ", string)
    string = re.sub(r"([()\-\":.,!?\'\`])", r" \1 ", string)
    string = re.sub(r"\'s", r" \'s", string)
    string = re.sub(r"\'m", r" \'m", string)
    string = re.sub(r"\'ve", r" \'ve", string)
    string = re.sub(r"n\'t", r" n\'t", string)
    string = re.sub(r"\'re", r" \'re", string)
    string = re.sub(r"\'d", r" \'d", string)
    string = re.sub(r"\'ll", r" \'ll", string)
    string = re.sub(r"\s{2,}", r" ", string)
    return string.strip()

def readLexicons():
    path = 'lexicons'
    # Load data from files
    f = []
    for (dirpath, dirnames, filenames) in walk(path):
        for i in filenames:
            f.append(str(dirpath+'\\'+i))

    lexList = []
    for pa in f:
        if pa.endswith('txt'):
            with open(pa, encoding="utf8") as inf:
                reader = csv.reader(inf, delimiter='\n',quoting=csv.QUOTE_NONE)
                col = list(zip(*reader))
                lexList.extend(col[0])
        else:
            with open(pa, "r") as file_object:
                content = file_object.read()
            lexList.extend(re.findall(r'((?<=word1=)\w+)', content))
    lex.extend(lexList)

def prepTxtStar(X, kValue, maintainLex):
    sorted_co = sorted(co.items(), key=operator.itemgetter(1), reverse=True)[:kValue]
    sorted_co = list([i[0] for i in sorted_co])

    for row in range(len(X)):
        c = str(X[row]).split()
        for i in range(len(c)):
            if c[i] in co:
                if c[i] not in sorted_co:
                    if maintainLex == 0:
                        c[i] = repAllTxt(c[i])
                    else:
                        if c[i] not in lex:
                            c[i] = repAllTxt(c[i])
        X[row] = ' '.join(c)
    for x in X[:3]:
        print(x)
    return X

def readFiles():
    path = 'datasetpaaaaaaaaaaath/ds.txt'

    f = []
    for (dirpath, dirnames, filenames) in walk(path):
        for i in filenames:
            f.append(str(dirpath+'\\'+i))
    x = []
    y = []
    lexList = []
    for pa in f:
        if pa.endswith('txt'):
            with open(pa, encoding="utf8") as inf:
                reader = csv.reader(inf, delimiter='\t',quoting=csv.QUOTE_NONE)
                col = list(zip(*reader))
                x.extend(col[2])
                y.extend(col[3])
    return x,y





if __name__ == "__main__":
    xOri, yOri = readFiles()
    xOri = [clean_str(i) for i in xOri]
    readLexicons()
    corpDict(xOri)
    xOri = prepTxtStar(xOri, kValue=10000000, maintainLex=0)


    x, xTest, y, yTest = train_test_split(xOri, yOri, test_size=0.32, random_state=42)
    model = Pipeline([
                        ('tfidf', TfidfVectorizer( analyzer='char_wb', min_df=0.0007,lowercase=False,tokenizer=tokenizeTfidf)),
                        ('clf', SGDClassifier(tol=None, loss='hinge', random_state=38, max_iter=5))
                     ])

    # Grid search
    parameters = {
                    'tfidf__ngram_range': [(1,1),(2,2),(3,3),(4,4),(5,5),(6,6)]
                 }
    gs_clf = GridSearchCV(model, parameters, n_jobs=-1, scoring='f1_macro')
    gs_clf = gs_clf.fit(x, y)

    predicted = gs_clf.predict(xTest)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    print('F1 Macro: ', f1_score(yTest, predicted, average='macro'))

With this example, I get the following results:

tfidf__ngram_range: (1, 1)
F1 Macro:  0.4927875243664717

So it picks the first element of the parameter set, (1, 1), whereas the best one according to f1_score is (4, 4)!

What is the problem, what am I missing?
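One way to see what the search actually ranked (a diagnostic sketch, assuming the gs_clf object fitted above; pandas is used here only for display) is to print the mean cross-validated score of every candidate and compare it with the single held-out f1 computed on xTest:

import pandas as pd

# Mean cross-validated f1_macro per candidate, as GridSearchCV ranks them.
cv_results = pd.DataFrame(gs_clf.cv_results_)
print(cv_results[['param_tfidf__ngram_range', 'mean_test_score', 'rank_test_score']])
print('Best CV score:', gs_clf.best_score_)   # what GridSearchCV optimizes
print('Held-out F1:  ', f1_score(yTest, predicted, average='macro'))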

EDIT: Full source code added, together with the dataset: Dataset

1 Answer:

Answer 0 (score: 0)

GridSearchCV chooses the best hyperparameters based on the mean score, calculated over all folds for a particular hyperparameter set. That is, you might manually pick a subset of folds that shows a better score than the mean score GridSearchCV computes.
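To make that concrete (a minimal sketch, assuming a fitted GridSearchCV object such as the grid built below), the per-fold test scores of a single candidate can differ quite a bit from the mean that the search ranks on:

# Per-fold test scores next to the mean that GridSearchCV ranks on.
res = grid.cv_results_
for i, params in enumerate(res['params']):
    folds = [res['split%d_test_score' % k][i] for k in range(grid.n_splits_)]
    print(params, 'folds:', folds, 'mean:', res['mean_test_score'][i])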

I've finally found time to play with your dataset. I got a score of 0.559 on the training set and 0.633 on the test set.

Results:

In [116]: grid.best_score_
Out[116]: 0.5598812206142373

In [117]: grid.score(X_test, y_test)
Out[117]: 0.6330340557275542

In [118]: joblib.dump(grid, 'grid_SGD_stats_speaker.pkl')
Out[118]: ['grid_SGD_stats_speaker.pkl']

In [119]: grid.best_params_
Out[119]:
{'clf': SGDClassifier(alpha=1e-05, average=False, class_weight=None, epsilon=0.1,
        eta0=0.0, fit_intercept=True, l1_ratio=0.15,
        learning_rate='optimal', loss='hinge', max_iter=500, n_iter=None,
        n_jobs=1, penalty='l2', power_t=0.5, random_state=None,
        shuffle=True, tol=None, verbose=0, warm_start=False),
 'clf__alpha': 1e-05,
 'clf__max_iter': 500,
 'union__text__vect': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=1.0, max_features=None, min_df=1,
         ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
         vocabulary=None),
 'union__text__vect__ngram_range': (1, 1)}

Code:

# https://stackoverflow.com/questions/49438764/gridsearchcv-not-choose-the-optimal-result
# data set: http://www.mediafire.com/file/8tdb7p9hjfom7x1/ds.txt

try:
    from pathlib import Path
except ImportError:             # Python 2
    from pathlib2 import Path
import os
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, LabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.externals import joblib
from scipy.sparse import csr_matrix, hstack


def get_data(path):
    cols = ['num','speaker','text','label']
    df = (pd.read_csv(path, sep='\t', header=None, dtype={'speaker':'category'},
                      names=cols, usecols=['speaker','text','label'])
            .query("speaker != 'SYSTEM'"))

    #df['speaker_code'], speaker_names = pd.factorize(df['speaker'])
    return df

class FeatureSelector(BaseEstimator, TransformerMixin):

    def __init__(self, name=None, position=None,
                 as_cat_codes=False, sparse=False):
        self.name = name
        self.position = position
        self.as_cat_codes = as_cat_codes
        self.sparse = sparse

    def fit(self, X, y=None):
        return self

    def transform(self, X, **kwargs):
        if self.name is not None:
            col_pos = X.columns.get_loc(self.name)
        elif self.position is not None:
            col_pos = self.position
        else:
            raise Exception('either [name] or [position] parameter must be not-None')
        if self.as_cat_codes and X.dtypes.iloc[col_pos] == 'category':
            ret = X.iloc[:, col_pos].cat.codes
        else:
            ret = X.iloc[:, col_pos]
        if self.sparse:
            ret = csr_matrix(ret.values.reshape(-1,1))
        return ret

class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, posts):
        return [{#'length': len(text),
                 'num_sentences': text.count('. '),
                 #'has_dollar_sign': '$' in text,
                }
                for text in posts]


def train_models(X_train, y_train):
    union = FeatureUnion([
                ('text', 
                 Pipeline([
                    ('select', FeatureSelector('text')),
                    ('vect', TfidfVectorizer(ngram_range=(1, 1))),
                 ]) ),
                ('stats', 
                 Pipeline([
                    ('select', FeatureSelector('text')),
                    ('stats', TextStats()),
                    ('dict_vect', DictVectorizer()),
                 ]) ),
                ('speaker',
                 Pipeline([
                    ('select', FeatureSelector('speaker', sparse=True,
                                               as_cat_codes=True)),
                    #('scale', StandardScaler(with_mean=False)),
                 ]) )
            ])

    pipe = Pipeline([
        ('union', union),
        ('clf', MLPClassifier(hidden_layer_sizes=(100,)))
    ])

    param_grid = [
        #{
        #    #'union__text__vect': [TfidfVectorizer()],
        #    'clf': [MLPClassifier()],
        #    #'union__text__vect__ngram_range': [(1,1), (2,5)],
        #    #'union__text__vect__analyzer': ['word','char_wb'],
        #    'clf__alpha': np.logspace(-5, -2, 4),
        #},
        {
            'union__text__vect': [TfidfVectorizer(ngram_range=(1, 1))],
            'clf': [SGDClassifier()],
            'union__text__vect__ngram_range': [(1,1), (2,5)],
            #'vect__analyzer': ['word','char_wb'],
            'clf__alpha': np.logspace(-5, 0, 6),
            'clf__max_iter': [500],
        },
        #{
        #    'union__text__vect': [TfidfVectorizer(ngram_range=(1, 1))],
        #    'clf': [MultinomialNB()],
        #    'union__text__vect__ngram_range': [(1,1), (2,5)],
        #    #'vect__analyzer': ['word','char_wb'],
        #    'clf__alpha': np.logspace(-3, 1, 5),
        #},
    ]

    grid = (GridSearchCV(pipe, param_grid=param_grid,
                         scoring='f1_macro', cv=3,
                         n_jobs=1, verbose=2)
            .fit(X_train, y_train))
    return grid

###########

os.chdir(r'/path/to/your/dataset/dir')
path = './ds.txt'
df = get_data(path)
X_train, X_test, y_train, y_test = train_test_split(df[['speaker','text']], df['label'], test_size=0.2)
grid = train_models(X_train, y_train)
joblib.dump(grid, 'grid_SGD_stats_speaker.pkl')
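
To reuse the fitted search later, the pickled object written above can be loaded back (a small usage sketch, assuming grid_SGD_stats_speaker.pkl exists in the working directory):

# Reload the persisted grid search and evaluate it on the held-out split.
grid = joblib.load('grid_SGD_stats_speaker.pkl')
print(grid.best_params_)
print(grid.score(X_test, y_test))   # scored with the f1_macro scorer passed to GridSearchCV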