I'm perplexed at seeing very different results when relying on a "manual" split of the data into training and test sets versus using scikit-learn's grid search functionality. I am using the evaluation function from a kaggle competition for both runs, and the grid search is over a single value (the same value used in the manual split). The resulting gini values are so different that there must be an error somewhere, but I don't see it, and I'm wondering whether I'm making an oversight in the comparison.
The first code block, run for me, results in:
Validation Sample Score: 0.0033997889 (normalized gini).
The second block (using scikit) produces much higher values:
Fitting 2 folds for each of 1 candidates, totalling 2 fits
0.334467621189
0.339421569449
[Parallel(n_jobs=-1)]: Done 3 out of 2 | elapsed: 9.9min remaining: -198.0s
[Parallel(n_jobs=-1)]: Done 2 out of 2 | elapsed: 9.9min finished
{'n_estimators': 1000}
0.336944643888
[mean: 0.33694, std: 0.00248, params: {'n_estimators': 1000}]
The evaluation function:
def gini(solution, submission):
    df = zip(solution, submission)
    df = sorted(df, key=lambda x: (x[1], x[0]), reverse=True)
    rand = [float(i + 1) / float(len(df)) for i in range(len(df))]
    totalPos = float(sum([x[0] for x in df]))
    cumPosFound = [df[0][0]]
    for i in range(1, len(df)):
        cumPosFound.append(cumPosFound[len(cumPosFound) - 1] + df[i][0])
    Lorentz = [float(x) / totalPos for x in cumPosFound]
    Gini = [Lorentz[i] - rand[i] for i in range(len(df))]
    return sum(Gini)

def normalized_gini(solution, submission):
    normalized_gini = gini(solution, submission) / gini(solution, solution)
    print normalized_gini
    return normalized_gini

gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=True)
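As a quick sanity check of this scorer (a minimal sketch with made-up toy values, not taken from the competition data), scoring a solution against itself should print a normalized gini of 1.0, while an imperfect ranking prints something smaller:

# hypothetical toy values, only to exercise the functions above
sol = [1, 0, 1, 1, 0]             # true targets
sub = [0.2, 0.9, 0.8, 0.4, 0.3]   # an imperfect ranking of them
normalized_gini(sol, sol)   # prints 1.0 by construction
normalized_gini(sol, sub)   # prints a value below 1.0 (negative here)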
Block 1:
if __name__ == '__main__':
    dat = pd.read_table('train.csv', sep=",")
    y = dat[['Hazard']].values.ravel()
    dat = dat.drop(['Hazard', 'Id'], axis=1)

    # sample out 30% for validation
    folds = train_test_split(range(len(y)), test_size=0.3)  # 30% test
    train_X = dat.iloc[folds[0], :]
    train_y = y[folds[0]]
    test_X = dat.iloc[folds[1], :]
    test_y = y[folds[1]]

    # assume no leakage by one-hot encoding (OH) the whole data
    dat_dict = train_X.T.to_dict().values()
    vectorizer = DV(sparse=False)
    vectorizer.fit(dat_dict)
    train_X = vectorizer.transform(dat_dict)
    del dat_dict

    dat_dict = test_X.T.to_dict().values()
    test_X = vectorizer.transform(dat_dict)
    del dat_dict

    rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
    rf.fit(train_X, train_y)
    y_submission = rf.predict(test_X)
    print "Validation Sample Score: %.10f (normalized gini)." % normalized_gini(test_y, y_submission)
Block 2:
dat_dict=dat.T.to_dict().values()
vectorizer = DV( sparse = False )
vectorizer.fit( dat_dict )
X = vectorizer.transform( dat_dict )
parameters= {'n_estimators': [1000]}
grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters,cv=2, verbose=1, scoring=gini_scorer,n_jobs=-1)
grid_search.fit(X,y)
print grid_search.best_params_
print grid_search.best_score_
print grid_search.grid_scores_
Edit:
Here is a self-contained example where I get the same kind of difference.
from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit,train_test_split
from sklearn.ensemble import RandomForestRegressor , ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
if __name__ == '__main__':
    b = load_boston()
    X = pd.DataFrame(b.data)
    y = b.target

    # sample out 50% for validation
    folds = train_test_split(range(len(y)), test_size=0.5)  # 50% test
    train_X = X.iloc[folds[0], :]
    train_y = y[folds[0]]
    test_X = X.iloc[folds[1], :]
    test_y = y[folds[1]]

    rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1)
    rf.fit(train_X, train_y)
    y_submission = rf.predict(test_X)
    print "Validation Sample Score: %.10f (mean squared)." % mean_squared_error(test_y, y_submission)

    parameters = {'n_estimators': [1000]}
    grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters, cv=2,
                               verbose=1, scoring='mean_squared_error', n_jobs=-1)
    grid_search.fit(X, y)
    print grid_search.best_params_
    print grid_search.best_score_
    print grid_search.grid_scores_
Answer 0 (score: 5)
I'm not sure I can give you a complete solution, but here are a few pointers:
When debugging this kind of issue, use the random_state parameter of the scikit-learn objects, as it makes your results truly reproducible. The following will always return exactly the same number:
rf=RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=0)
rf.fit(train_X,train_y)
y_submission=rf.predict(test_X)
mean_squared_error(test_y,y_submission)
It resets the random number generator to make sure that you always get the same "randomness". You should use it on train_test_split and GridSearchCV as well.
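For instance (just a sketch of the idea, with arbitrary seed values and reusing the names from the question), pinning the seed on the split and on the forest makes the manual run repeatable, and seeding the estimator plus a CV iterator does the same for the grid search:

seed = 0  # arbitrary, it just has to be the same everywhere

folds = train_test_split(range(len(y)), test_size=0.3, random_state=seed)
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=seed)

# GridSearchCV has no random_state of its own; seed the estimator and the CV iterator instead
cv = KFold(len(y), n_folds=2, shuffle=True, random_state=seed)
grid_search = GridSearchCV(RandomForestRegressor(random_state=seed),
                           param_grid={'n_estimators': [1000]},
                           cv=cv, scoring=gini_scorer, n_jobs=-1)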
The results you get with your self-contained example are normal. Typically I got:
Validation Sample Score: 9.8136434847 (mean squared).
[mean: -22.38918, std: 11.56372, params: {'n_estimators': 1000}]
First, note that the mean squared error returned from GridSearchCV is a negated mean squared error. I think this is by design, to keep with the spirit of a score function (for a score, greater is better). Now this is still 9.81 versus 22.38. However, the standard deviation here is huge. That can explain why the scores look so different.
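If you want to see the sign convention directly, here is a small sketch (it assumes the train_X/test_X variables from the example above): make_scorer with greater_is_better=False wraps the metric and flips its sign, which is, as far as I can tell, what the built-in 'mean_squared_error' scoring does:

from sklearn.metrics import make_scorer, mean_squared_error

neg_mse = make_scorer(mean_squared_error, greater_is_better=False)

rf = RandomForestRegressor(n_estimators=100, random_state=0).fit(train_X, train_y)
print(mean_squared_error(test_y, rf.predict(test_X)))   # positive MSE
print(neg_mse(rf, test_X, test_y))                      # same magnitude, negative sign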
If you want to check that GridSearchCV is not doing anything suspicious, you can force it to use a single split, the same one as the manual split:
from sklearn.cross_validation import StratifiedKFold, KFold, ShuffleSplit,train_test_split, PredefinedSplit
from sklearn.ensemble import RandomForestRegressor , ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LogisticRegression
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV,RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error
from sklearn.datasets import load_boston
if __name__ == '__main__':
    b = load_boston()
    X = pd.DataFrame(b.data)
    y = b.target

    folds = train_test_split(range(len(y)), test_size=0.5, random_state=15)  # 50% test
    folds_split = np.ones_like(y)
    folds_split[folds[0]] = -1
    ps = PredefinedSplit(folds_split)

    # PredefinedSplit yields exactly one train/test split here
    for tr, te in ps:
        train_X = X.iloc[tr, :]
        train_y = y[tr]
        test_X = X.iloc[te, :]
        test_y = y[te]

    rf = RandomForestRegressor(n_estimators=1000, n_jobs=1, random_state=15)
    rf.fit(train_X, train_y)
    y_submission = rf.predict(test_X)
    print("Validation Sample Score: {:.10f} (mean squared).".format(mean_squared_error(test_y, y_submission)))

    parameters = {'n_estimators': [1000], 'n_jobs': [1], 'random_state': [15]}
    grid_search = GridSearchCV(RandomForestRegressor(), param_grid=parameters, cv=ps,
                               verbose=2, scoring='mean_squared_error', n_jobs=1)
    grid_search.fit(X, y)
    print("best_params: ", grid_search.best_params_)
    print("best_score", grid_search.best_score_)
    print("grid_scores", grid_search.grid_scores_)
Hope this helps.
Sorry I can't figure out what's going on with your gini scorer. I'd say 0.0033xxx seems like a really low value (almost no model at all?) for a normalized gini score.
Answer 1 (score: 5)
Following the minimal example and the responses from user3914041 and Andreus, this works as intended. Indeed, I got:
Validation Sample Score: 10.176958 (mean squared).
Fitting 1 folds for each of 1 candidates, totalling 1 fits
mean: 10.19074, std: 0.00000, params: {'n_estimators': 1000}
In this case we get the same result with both approaches (omitting some rounding). Here is the code to reproduce the same scores:
from sklearn.cross_validation import train_test_split, PredefinedSplit
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn import metrics
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.datasets import load_boston
b=load_boston()
X = b.data
y = b.target
folds=train_test_split(range(len(y)),test_size=0.5, random_state=10)
train_X=X[folds[0],:]
train_y=y[folds[0]]
test_X=X[folds[1],:]
test_y=y[folds[1]]
folds_split = np.zeros_like(y)
folds_split[folds[0]] = -1
ps = PredefinedSplit(folds_split)
rf=RandomForestRegressor(n_estimators=1000, random_state=42)
rf.fit(train_X,train_y)
y_submission=rf.predict(test_X)
print "Validation Sample Score: %f (mean squared)." % mean_squared_error(test_y,y_submission)
mse_scorer = make_scorer(mean_squared_error)
parameters= {'n_estimators': [1000]}
grid_search = GridSearchCV(RandomForestRegressor(random_state=42), cv=ps,
param_grid=parameters, verbose=1, scoring=mse_scorer)
grid_search.fit(X,y)
print grid_search.grid_scores_[0]
In your first example, try removing greater_is_better=True. Indeed, the Gini coefficient is supposed to be minimized, not maximized. Try that and see if it fixes the problem. You can also add some random seeds to make sure the split is done in exactly the same way.
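In code, that suggestion would look roughly like the sketch below (the seed values are arbitrary and the names are reused from the question):

# scorer with the sign flipped, per the suggestion above
gini_scorer = metrics.make_scorer(normalized_gini, greater_is_better=False)

# same seed for the split and for both forests so the comparison is like for like
folds = train_test_split(range(len(y)), test_size=0.3, random_state=42)
rf = RandomForestRegressor(n_estimators=1000, n_jobs=-1, random_state=42)
grid_search = GridSearchCV(RandomForestRegressor(random_state=42),
                           param_grid={'n_estimators': [1000]},
                           cv=2, scoring=gini_scorer, n_jobs=-1)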
Answer 2 (score: 4)
One difference I can tell between the two code blocks: by using cv=2, you split the data into two 50%-sized chunks, and the resulting gini is then averaged between them.
As a side note, are you sure you want greater_is_better=True in your scorer? From your post you imply that you want this score to be lower. Be especially careful about this, since GridSearchCV maximizes the score: the parameters selected are those that maximize the score on the left-out data, unless an explicit score is passed, in which case it is used instead.
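To see the averaging point in isolation, here is a small sketch (assuming X, y and gini_scorer from the question): cross_val_score with the same cv=2 returns the two per-fold ginis, and their mean is roughly the single number GridSearchCV reports, so it is not directly comparable to one 70/30 hold-out score:

from sklearn.cross_validation import cross_val_score

scores = cross_val_score(RandomForestRegressor(n_estimators=1000, n_jobs=-1),
                         X, y, scoring=gini_scorer, cv=2)
print(scores)         # one normalized gini per 50% fold
print(scores.mean())  # approximately the mean score GridSearchCV reports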
Answer 3 (score: 0)
This thread is quite old now, so I assume everyone has figured this out, but for clarity, the original two blocks had at least 3 issues causing them to produce different results: in short, failing to set a couple of random seeds, and failing to use a PredefinedSplit over the folds returned by train_test_split (iterating over the raw folds can end up reordering the split). Below is self-contained code illustrating this with a different gini implementation:
import sys
import numpy as np
import pandas as pd
from sklearn.cross_validation import train_test_split, PredefinedSplit
from sklearn.feature_extraction import DictVectorizer as DV
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
def gini(expected, predicted):
    assert expected.shape[0] == predicted.shape[0], 'unequal number of rows: [ %d vs %d ]' \
        % (expected.shape[0], predicted.shape[0])

    _all = np.asarray(np.c_[
        expected,
        predicted,
        np.arange(expected.shape[0])], dtype=np.float)

    _EXPECTED = 0
    _PREDICTED = 1
    _INDEX = 2

    # sort by predicted descending, then by index ascending
    sort_order = np.lexsort((_all[:, _INDEX], -1 * _all[:, _PREDICTED]))
    _all = _all[sort_order]

    total_losses = _all[:, _EXPECTED].sum()
    gini_sum = _all[:, _EXPECTED].cumsum().sum() / total_losses
    gini_sum -= (expected.shape[0] + 1.0) / 2.0
    return gini_sum / expected.shape[0]

def gini_normalized(solution, submission, gini=gini):
    solution = np.array(solution)
    submission = np.array(submission)
    return gini(solution, submission) / gini(solution, solution)

gini_scorer = metrics.make_scorer(gini_normalized, greater_is_better=True)
dat = pd.read_table('train.csv', sep=',')
y = dat[['Hazard']].values.ravel()
dat = dat.drop(['Hazard', 'Id'], axis=1)

# 1. set seed for train_test_split()
folds = train_test_split(range(len(y)), test_size=0.7, random_state=15)  # 70% test

dat_dict = dat.T.to_dict().values()
vectorizer = DV(sparse=False)
vectorizer.fit(dat_dict)
dat = vectorizer.transform(dat_dict)
dat = pd.DataFrame(dat)

# 2. instead of using the raw folds returned by train_test_split,
#    use the PredefinedSplit iterator, just like GridSearchCV does
if 0:
    train_X = dat.iloc[folds[0]]
    train_y = y[folds[0]]
    test_X = dat.iloc[folds[1]]
    test_y = y[folds[1]]
else:
    folds_split = np.zeros_like(y)
    folds_split[folds[0]] = -1
    ps = PredefinedSplit(folds_split)
    # in this example, there's only one iteration here
    for train_index, test_index in ps:
        train_X, test_X = dat.iloc[train_index], dat.iloc[test_index]
        train_y, test_y = y[train_index], y[test_index]

n_estimators = [100, 200]

# 3. also set seed for RFR
rfr_params = {'n_jobs': 7, 'random_state': 15}

######################################################################
# manual grid search ( block 1 )

for n_est in n_estimators:
    print 'n_estimators = %d:' % n_est; sys.stdout.flush()
    rfr = RandomForestRegressor(n_estimators=n_est, **rfr_params)
    rfr.fit(train_X, train_y)
    y_pred = rfr.predict(test_X)
    gscore = gini_normalized(test_y, y_pred)
    print '  validation score: %.5f (normalized gini)' % gscore

######################################################################
# GridSearchCV grid search ( block 2 )

ps = PredefinedSplit(folds_split)
rfr = RandomForestRegressor(**rfr_params)
grid_params = {'n_estimators': n_estimators}
gcv = GridSearchCV(rfr, grid_params, scoring=gini_scorer, cv=ps)
gcv.fit(dat, y)
print gcv.grid_scores_