每个样本都有不同权重的数据。在我的应用中,重要的是在估算模型和比较替代模型时考虑这些权重。
我正在使用sklearn
估算模型并比较其他超参数选项。但是,此单元测试显示GridSearchCV
不会应用sample_weights
来估算分数。
有没有办法让sklearn
使用sample_weight
对模型进行评分?
单元测试:
from __future__ import division
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold
def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
out_results = dict()
for k in max_features_grid:
clf = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=-1,
random_state=RANDOM_STATE,
max_features=k)
for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
X_train = X_in[train_ndx, :]
y_train = y_in[train_ndx]
w_train = w_in[train_ndx]
y_test = y[test_ndx]
clf.fit(X=X_train, y=y_train, sample_weight=w_train)
y_hat = clf.predict_proba(X=X_in[test_ndx, :])
if use_weighting:
w_test = w_in[test_ndx]
w_i_sum = w_test.sum()
score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
else:
score = log_loss(y_true=y_test, y_pred=y_hat)
results = out_results.get(k, [])
results.append(score)
out_results.update({k: results})
for k, v in out_results.items():
if use_weighting:
mean_score = sum(v)
else:
mean_score = np.mean(v)
out_results.update({k: mean_score})
best_score = min(out_results.values())
best_param = min(out_results, key=out_results.get)
return best_score, best_param
if __name__ == "__main__":
RANDOM_STATE = 1337
X, y = load_iris(return_X_y=True)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
# sample_weight = np.array([1 for _ in range(len(X))])
inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=-1,
random_state=RANDOM_STATE)
search_params = {"max_features": [1, 2, 3, 4]}
fit_params = {"sample_weight": sample_weight}
my_scorer = make_scorer(log_loss,
greater_is_better=False,
needs_proba=True,
needs_threshold=False)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y, **fit_params)
print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)
msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
score_with_weights, param_with_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=True)
print(msg % ("WITH", score_with_weights))
score_without_weights, param_without_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=False)
print(msg % ("WITHOUT", score_without_weights))
产生输出:
This is the best out-of-sample score using GridSearchCV: 0.135692.
This is the best out-of-sample score WITH weighting using grid_cv: 0.099367.
This is the best out-of-sample score WITHOUT weighting using grid_cv: 0.135692.
说明:由于在没有加权的情况下手动计算损失会产生与GridSearchCV
相同的得分,我们知道未使用样本权重。
答案 0 :(得分:6)
GridSearchCV
将scoring
作为输入,可以调用。您可以查看有关如何更改评分功能的详细信息,以及如何传递自己的评分函数here。为了完整起见,这是该页面的相关代码:
编辑: fit_params 仅传递给fit函数,而不是score函数。如果有应该传递给scorer
的参数,则应将它们传递给make_scorer
。但是这仍然没有解决这里的问题,因为这意味着整个sample_weight
参数将传递给log_loss
,而只有与y_test
对应的部分传递给sklearn
计算损失应该通过。
padas.DataFrame
不支持此类内容,但您可以使用sklearn
来破解。好消息是,DataFrame
了解index
,并保持这种方式。这意味着您可以在此处的代码中使用DataFrame
的{{1}}:
# more code
X, y = load_iris(return_X_y=True)
index = ['r%d' % x for x in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
sample_weight_frame = pd.DataFrame(sample_weight, index=index)
# more code
def score_f(y_true, y_pred, sample_weight):
return log_loss(y_true.values, y_pred,
sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
normalize=True)
score_params = {"sample_weight": sample_weight_frame}
my_scorer = make_scorer(score_f,
greater_is_better=False,
needs_proba=True,
needs_threshold=False,
**score_params)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y_frame)
# more code
如您所见,score_f
使用index
的{{1}}来查找要使用的y_true
部分。为了完整起见,这里是整个代码:
sample_weight
代码的输出是:
from __future__ import division
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.metrics import make_scorer
import pandas as pd
def grid_cv(X_in, y_in, w_in, cv, max_features_grid, use_weighting):
out_results = dict()
for k in max_features_grid:
clf = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=1,
random_state=RANDOM_STATE,
max_features=k)
for train_ndx, test_ndx in cv.split(X=X_in, y=y_in):
X_train = X_in[train_ndx, :]
y_train = y_in[train_ndx]
w_train = w_in[train_ndx]
y_test = y_in[test_ndx]
clf.fit(X=X_train, y=y_train, sample_weight=w_train)
y_hat = clf.predict_proba(X=X_in[test_ndx, :])
if use_weighting:
w_test = w_in[test_ndx]
w_i_sum = w_test.sum()
score = w_i_sum / w_in.sum() * log_loss(y_true=y_test, y_pred=y_hat, sample_weight=w_test)
else:
score = log_loss(y_true=y_test, y_pred=y_hat)
results = out_results.get(k, [])
results.append(score)
out_results.update({k: results})
for k, v in out_results.items():
if use_weighting:
mean_score = sum(v)
else:
mean_score = np.mean(v)
out_results.update({k: mean_score})
best_score = min(out_results.values())
best_param = min(out_results, key=out_results.get)
return best_score, best_param
#if __name__ == "__main__":
if True:
RANDOM_STATE = 1337
X, y = load_iris(return_X_y=True)
index = ['r%d' % x for x in range(len(y))]
y_frame = pd.DataFrame(y, index=index)
sample_weight = np.array([1 + 100 * (i % 25) for i in range(len(X))])
sample_weight_frame = pd.DataFrame(sample_weight, index=index)
# sample_weight = np.array([1 for _ in range(len(X))])
inner_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
outer_cv = RepeatedKFold(n_splits=3, n_repeats=1, random_state=RANDOM_STATE)
rfc = RandomForestClassifier(n_estimators=256,
criterion="entropy",
warm_start=False,
n_jobs=1,
random_state=RANDOM_STATE)
search_params = {"max_features": [1, 2, 3, 4]}
def score_f(y_true, y_pred, sample_weight):
return log_loss(y_true.values, y_pred,
sample_weight=sample_weight.loc[y_true.index.values].values.reshape(-1),
normalize=True)
score_params = {"sample_weight": sample_weight_frame}
my_scorer = make_scorer(score_f,
greater_is_better=False,
needs_proba=True,
needs_threshold=False,
**score_params)
grid_clf = GridSearchCV(estimator=rfc,
scoring=my_scorer,
cv=inner_cv,
param_grid=search_params,
refit=True,
return_train_score=False,
iid=False) # in this usage, the results are the same for `iid=True` and `iid=False`
grid_clf.fit(X, y_frame)
print("This is the best out-of-sample score using GridSearchCV: %.6f." % -grid_clf.best_score_)
msg = """This is the best out-of-sample score %s weighting using grid_cv: %.6f."""
score_with_weights, param_with_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=True)
print(msg % ("WITH", score_with_weights))
score_without_weights, param_without_weights = grid_cv(X_in=X,
y_in=y,
w_in=sample_weight,
cv=inner_cv,
max_features_grid=search_params.get(
"max_features"),
use_weighting=False)
print(msg % ("WITHOUT", score_without_weights))
编辑2 :正如评论所说:
使用此解决方案得分和sklearn得分的差异 起源于我计算加权平均值的方式 分数。如果省略代码的加权平均部分,则两者 输出与机器精度匹配。
答案 1 :(得分:1)
目前在 sklearn 中,GridSearchCV
(以及任何继承 BaseSearchCV
的类)只允许 sample_weight
中的 **fit_params
但不允许在评分中使用它,这是不正确的,因为 CV通过未加权的分数选择“最佳估计器”。注意,当您 grid.fit(X, y, sample_weight=w)
仅在 fit
中使用样本权重,而不是 score
时。
有两种方法可以解决这个问题:
from sklearn.base import BaseEstimator, TransformerMixin
# customized scorer
def weight_remover_scorer(estimator, X, y):
y_pred = estimator.predict(X)
w = X[:,0]
return your_scorer(y, y_pred, sample_weight=w)
# customized transformer
class WeightRemover(TransformerMixin, BaseEstimator):
def fit(self, X, y=None, **fit_params):
return self
def transform(self, X, y=None, **fit_params):
return X[:,1:]
# in your main function
if __name__=='__main__':
pipe = Pipeline([('remove_weight', WeightRemover()),('model',model)])
params_grid = {'model__'+k:v for k,v in params_grid.items()}
X = np.c_[train_w, X]
X_test = np.c_[test_w, X_test]
grid = GridSearchCV(pipe, params_grid, cv=5, scoring=weight_remover_scorer)
grid.fit(X, y)
sklearn
类中添加功能(等待新的升级)。只需在 sample_weight
中添加参数 BaseSearchCV
(默认为 None
),以与 fit_params = _check_fit_params(X, fit_params)
相同的方式对其进行更安全的索引。答案 2 :(得分:0)
只需指出,我们正在努力支持这一重要功能:https://github.com/scikit-learn/scikit-learn/pull/13432
但是,由于后向兼容性问题以及解决传递任意样本相关信息的更普遍问题的愿望,似乎花费了太长时间。上一次尝试似乎是:https://github.com/scikit-learn/scikit-learn/pull/16079
以下是对该问题的很好的回顾:http://deaktator.github.io/2019/03/10/the-error-in-the-comparator/