Manual split versus Scikit grid search

Date: 2015-07-13 15:43:11

Tags: python machine-learning scikit-learn

在依赖"手册"时,看到非常不同的结果让我感到困惑。在训练集和测试集之间划分数据并使用scikit-learn网格搜索功能。我正在使用来自两个运行的kaggle竞赛的评估函数,并且网格搜索超过单个值(与手动拆分相同的值)。由此产生的基尼值是如此不同,某处有错误,但我没有看到它,我想知道我在比较中是否存在疏忽?

Running the first code block prints "Validation Sample Score: 0.0033997889 (normalized gini)."

The second block (using scikit) produces much higher values:

Fitting 2 folds for each of 1 candidates, totalling 2 fits
0.334467621189
0.339421569449
[Parallel(n_jobs=-1)]: Done   3 out of   2 | elapsed:  9.9min remaining:  -198.0s
[Parallel(n_jobs=-1)]: Done   2 out of   2 | elapsed:  9.9min finished
{'n_estimators': 1000}
0.336944643888
[mean: 0.33694, std: 0.00248, params: {'n_estimators': 1000}]

The evaluation function:

def gini(solution, submission):
    # pair each true value with its prediction, then sort by prediction (descending)
    df = zip(solution, submission)
    df = sorted(df, key=lambda x: (x[1],x[0]), reverse=True)
    # cumulative share of the population at each rank (the random-model baseline)
    rand = [float(i+1)/float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    # cumulative positives found while walking down the ranking
    cumPosFound = [df[0][0]]
    for i in range(1,len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound)-1] + df[i][0])
    # area between the Lorentz curve and the diagonal
    Lorentz = [float(x)/totalPos for x in cumPosFound]
    Gini = [Lorentz[i]-rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    # scale by the gini of a perfect ranking, so a perfect model scores 1.0
    normalized_gini = gini(solution, submission)/gini(solution, solution)
    print normalized_gini
    return normalized_gini


gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better = True)
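
As a quick sanity check of the metric (a minimal sketch, not part of the original code): a perfect ranking should give a normalized gini of 1.0 and a fully reversed one -1.0:

solution = [1, 0, 1, 0, 1]
submission = [0.9, 0.1, 0.8, 0.2, 0.7]                  # ranks all positives first
normalized_gini(solution, submission)                   # prints 1.0
normalized_gini(solution, [1 - s for s in submission])  # prints -1.0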

Block 1:

if __name__ == '__main__':

    dat=pd.read_table('train.csv',sep=",")

    y=dat[['Hazard']].values.ravel()
    dat=dat.drop(['Hazard','Id'],axis=1)

    #sample out 30% for validation
    folds=train_test_split(range(len(y)),test_size=0.3) #30% test
    train_X=dat.iloc[folds[0],:]
    train_y=y[folds[0]]
    test_X=dat.iloc[folds[1],:]
    test_y=y[folds[1]]


    #one-hot encode: fit the vectorizer on the training split only (assume no leakage)
    dat_dict=train_X.T.to_dict().values()
    vectorizer = DV( sparse = False )
    vectorizer.fit( dat_dict )
    train_X = vectorizer.transform( dat_dict )

    del dat_dict

    dat_dict=test_X.T.to_dict().values()
    test_X = vectorizer.transform( dat_dict )

    del dat_dict



    rf=RandomForestRegressor(n_estimators=1000, n_jobs=-1)
    rf.fit(train_X,train_y)
    y_submission=rf.predict(test_X)
    print "Validation Sample Score: %.10f (normalized gini)." % normalized_gini(test_y,y_submission)

Block 2:

dat_dict=dat.T.to_dict().values()
vectorizer = DV( sparse = False )
vectorizer.fit( dat_dict )
X = vectorizer.transform( dat_dict )

parameters= {'n_estimators': [1000]}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters,cv=2, verbose=1, scoring=gini_scorer,n_jobs=-1)
grid_search.fit(X,y)

print grid_search.best_params_
print grid_search.best_score_
print grid_search.grid_scores_

EDIT

Here is a self-contained example where I get the same kind of difference.

from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit,train_test_split
from sklearn.ensemble import RandomForestRegressor , ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV,RandomizedSearchCV
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston




if __name__ == '__main__':

    b=load_boston()
    X = pd.DataFrame(b.data)
    y = b.target

    #sample out 50% for validation
    folds=train_test_split(range(len(y)),test_size=0.5) #50% test
    train_X=X.iloc[folds[0],:]
    train_y=y[folds[0]]
    test_X=X.iloc[folds[1],:]
    test_y=y[folds[1]]


    rf=RandomForestRegressor(n_estimators=1000, n_jobs=-1)
    rf.fit(train_X,train_y)
    y_submission=rf.predict(test_X)

    print "Validation Sample Score: %.10f (mean squared)." % mean_squared_error(test_y,y_submission)


    parameters= {'n_estimators': [1000]}
    grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters,cv=2, verbose=1, scoring='mean_squared_error',n_jobs=-1)
    grid_search.fit(X,y)

    print grid_search.best_params_
    print grid_search.best_score_
    print grid_search.grid_scores_

4 answers:

Answer 0 (score: 5):

I'm not sure I can give you a complete solution, but here are some pointers:

  1. When debugging this sort of issue, use the random_state parameter of the scikit-learn objects, since it makes your results truly reproducible. The following will always return exactly the same numbers:

    rf=RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
    rf.fit(train_X,train_y)
    y_submission=rf.predict(test_X)
    mean_squared_error(test_y,y_submission)
    
  2. It resets the random number generator to make sure that you always get the same "randomness". You should use it on train_test_split and on GridSearchCV.
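
     For example, a minimal sketch seeding both the split and the pieces inside the grid search (GridSearchCV itself takes no random_state; the randomness comes from the estimator and the cv iterator, so those are what you seed):

    folds = train_test_split(range(len(y)), test_size=0.5, random_state=0)
    gs = GridSearchCV(RandomForestRegressor(random_state=0),
                      param_grid={'n_estimators': [1000]},
                      cv=KFold(len(y), n_folds=2, shuffle=True, random_state=0),
                      scoring='mean_squared_error')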

  3. The results you obtain on your self-contained example are normal. Typically I got:

      Validation Sample Score: 9.8136434847 (mean squared).
      [mean: -22.38918, std: 11.56372, params: {'n_estimators': 1000}]
      
  4. First, note that the mean squared error returned by GridSearchCV is a negated mean squared error. I believe this is by design, to keep with the spirit of score functions (for a score, greater is better).

     Now this is still 9.81 versus 22.38. However, the standard deviation here is huge, which can explain why the scores look so different. If you want to check that GridSearchCV is not doing something fishy, you can force it to use a single split, the same one as the manual split:

      from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit,train_test_split, PredefinedSplit
      from sklearn.ensemble import RandomForestRegressor , ExtraTreesRegressor, GradientBoostingRegressor
      from sklearn.linear_model import LogisticRegression
      import numpy as np
      import pandas as pd
      from sklearn.feature_extraction import DictVectorizer as DV
      from sklearn import metrics
      from sklearn.preprocessing import StandardScaler
      from sklearn.grid_search import GridSearchCV,RandomizedSearchCV
      from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
      from scipy.stats import randint, uniform
      from sklearn.metrics import mean_squared_error
      from sklearn.datasets import load_boston
      
      if __name__ == '__main__':
          b=load_boston()
          X = pd.DataFrame(b.data)
          y = b.target
          folds=train_test_split(range(len(y)),test_size=0.5, random_state=15) #50% test
          folds_split = np.ones_like(y)
          folds_split[folds[0]] = -1
          ps = PredefinedSplit(folds_split)
      
          for tr, te in ps:
              train_X=X.iloc[tr,:]
              train_y=y[tr]
              test_X=X.iloc[te,:]
              test_y=y[te]
              rf=RandomForestRegressor(n_estimators=1000, n_jobs=1, random_state=15)
              rf.fit(train_X,train_y)
              y_submission=rf.predict(test_X)
              print("Validation Sample Score: {:.10f} (mean squared).".format(mean_squared_error(test_y, y_submission)))
      
          parameters= {'n_estimators': [1000], 'n_jobs': [1], 'random_state': [15]}
          grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters,cv=ps, verbose=2, scoring='mean_squared_error', n_jobs=1)
          grid_search.fit(X,y)
      
          print("best_params: ", grid_search.best_params_)
          print("best_score", grid_search.best_score_)
          print("grid_scores", grid_search.grid_scores_)
      

     Hope this helps.

     Sorry, I can't figure out what's going on with your gini scorer. I'd say that 0.0033xxx seems like a really low value (almost no model at all?) for a normalized gini score, though.

Answer 1 (score: 5):

Following the minimal example from user3914041 and the answer from Andreus, this works as intended. Indeed, I got:

Validation Sample Score: 10.176958 (mean squared).
Fitting 1 folds for each of 1 candidates, totalling 1 fits
mean: 10.19074, std: 0.00000, params: {'n_estimators': 1000}

In this case we have the same result with both approaches (up to some rounding). Here is the code to reproduce the identical scores:

from sklearn.cross_validation import train_test_split, PredefinedSplit
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.datasets import load_boston

b=load_boston()
X = b.data
y = b.target

folds=train_test_split(range(len(y)),test_size=0.5, random_state=10)
train_X=X[folds[0],:]
train_y=y[folds[0]]
test_X=X[folds[1],:]
test_y=y[folds[1]]

folds_split = np.zeros_like(y)
folds_split[folds[0]] = -1
ps = PredefinedSplit(folds_split)

rf=RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_X,train_y)
y_submission=rf.predict(test_X)

print "Validation Sample Score: %f (mean squared)." % mean_squared_error(test_y,y_submission)

mse_scorer = make_scorer(mean_squared_error)
parameters= {'n_estimators': [1000]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), cv=ps,
                           param_grid=parameters, verbose=1, scoring=mse_scorer)
grid_search.fit(X,y)

print grid_search.grid_scores_[0]

In your first example, try setting greater_is_better=False instead of True in the scorer (note that True is already the default, so merely removing the argument changes nothing). Indeed, the gini coefficient here is supposed to be minimized, not maximized.

Try it and see if this solves the issue. You can also add a random seed to make sure the splits are done in exactly the same way.
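
For reference, this is how a loss-like metric is usually wrapped (a minimal sketch of make_scorer's sign convention, not code from the question):

from sklearn.metrics import make_scorer

# with greater_is_better=False, make_scorer negates the metric's value,
# so GridSearchCV's maximization then picks the smallest raw gini
gini_loss_scorer = make_scorer(normalized_gini, greater_is_better=False)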

Answer 2 (score: 4):

There is one difference I can tell between the two code blocks: by using cv=2, you split the data into two 50%-sized chunks, and the resulting gini is then averaged between them.
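
You can reproduce that averaging directly with cross_val_score (a sketch, assuming the X, y and gini_scorer objects from your post):

from sklearn.cross_validation import cross_val_score

# cv=2 -> one gini per 50% chunk; GridSearchCV's grid_scores_ reports their mean
scores = cross_val_score(RandomForestRegressor(n_estimators=1000),
                         X, y, cv=2, scoring=gini_scorer)
print scores         # the two per-fold values
print scores.mean()  # the mean that appears in grid_scores_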

As a side note, are you sure you want greater_is_better=True in your scorer? From your post, you imply that you expect this score to be lower. Be extra careful on this point, since GridSearchCV maximizes the score.

From the GridSearchCV documentation:

  The parameters selected are those that maximize the score of the left out data, unless an explicit score is passed in which case it is used instead.

Answer 3 (score: 0):

This thread is quite old now, so I assume everybody has figured this out, but for clarity: there were at least 3 issues in the original 2 blocks that caused them to produce different results. In short, a pair of random seeds was never set (for train_test_split and for the RandomForestRegressor), and the folds returned by train_test_split were used directly instead of through a PredefinedSplit, the way GridSearchCV uses them (plain indexing of the raw folds can end up reordering the split). The standalone code below illustrates this, using a different gini implementation:

import sys
import numpy as np
import pandas as pd

from sklearn.cross_validation import train_test_split, PredefinedSplit
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

def gini(expected, predicted):
    assert expected.shape[0] == predicted.shape[0], 'unequal number of rows: [ %d vs %d ]' \
        % ( expected.shape[0], predicted.shape[0] )

    _all = np.asarray(np.c_[
        expected,
        predicted,
        np.arange(expected.shape[0])], dtype=np.float)

    _EXPECTED = 0
    _PREDICTED = 1
    _INDEX = 2

    # sort by predicted descending, then by index ascending
    sort_order = np.lexsort((_all[:, _INDEX], -1 * _all[:, _PREDICTED]))
    _all = _all[sort_order]

    total_losses = _all[:, _EXPECTED].sum()
    gini_sum = _all[:, _EXPECTED].cumsum().sum() / total_losses
    gini_sum -= (expected.shape[0] + 1.0) / 2.0
    return gini_sum / expected.shape[0]

def gini_normalized(solution, submission, gini=gini):
    solution = np.array(solution)
    submission = np.array(submission)
    return gini(solution, submission) / gini(solution, solution)


gini_scorer = metrics.make_scorer( gini_normalized, greater_is_better=True )


dat=pd.read_table('train.csv',sep=',')
y=dat[['Hazard']].values.ravel()
dat=dat.drop(['Hazard','Id'],axis=1)

# 1. set seed for train_test_split()
folds = train_test_split( range(len(y)), test_size=0.7, random_state=15 ) # 70% test

dat_dict=dat.T.to_dict().values()
vectorizer=DV( sparse = False )
vectorizer.fit( dat_dict )
dat=vectorizer.transform( dat_dict )
dat=pd.DataFrame(dat)


# 2. instead of using the raw folds returned by train_test_split,
#   use the PredefinedSplit iterator, just like GridSearchCV does
if 0:
    train_X=dat.iloc[folds[0]]
    train_y=y[folds[0]]
    test_X=dat.iloc[folds[1]]
    test_y=y[folds[1]]
else:
    folds_split = np.zeros_like(y)
    folds_split[folds[0]] = -1
    ps = PredefinedSplit(folds_split)

    # in this example, there's only one iteration here
    for train_index, test_index in ps:
        train_X, test_X = dat.iloc[train_index], dat.iloc[test_index]
        train_y, test_y = y[train_index], y[test_index]


n_estimators = [ 100, 200 ]

# 3. also set seed for RFR
rfr_params = { 'n_jobs':7, 'random_state':15 }


######################################################################
# manual grid search ( block 1 )

for n_est in n_estimators:

    print 'n_estimators = %d:' % n_est; sys.stdout.flush()

    rfr = RandomForestRegressor( n_estimators=n_est, **rfr_params )
    rfr.fit( train_X, train_y )
    y_pred = rfr.predict( test_X )

    gscore = gini_normalized( test_y, y_pred )

    print ' validation score: %.5f (normalized gini)' % gscore


######################################################################
# GridSearchCV grid search ( block 2 )

ps = PredefinedSplit(folds_split)      
rfr = RandomForestRegressor( **rfr_params )

grid_params = { 'n_estimators':n_estimators }
gcv = GridSearchCV( rfr, grid_params, scoring=gini_scorer, cv=ps )

gcv.fit( dat, y )

print gcv.grid_scores_