我刚开始接触机器学习,当我使用模型融合时遇到了一些问题,以下代码是我在研究其他人的代码时遇到的一些问题。我想与Xgboost
堆叠, ExtraTreesRegressor
,RandomForestRegressor
,岭,套索;
这是代码:
# coding=utf-8
import pandas as pd
import numpy as np
from scipy.stats import skew
import xgboost as xgb
from sklearn.cross_validation import KFold
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, LassoCV, Lasso
from math import sqrt
NTRAIN = 100
NTEST = 0
X_train = []
Y_train = []
X_test = []
NFOLDS = 3
SEED = 0
kf = KFold(NTRAIN, n_folds=NFOLDS, shuffle=True, random_state=SEED)
class SklearnWrapper(object):
def __init__(self, clf, seed=0, params=None):
params['random_state'] = seed
self.clf = clf(**params)
def train(self, x_train, y_train):
self.clf.fit(x_train, y_train)
def predict(self, x):
return self.clf.predict(x)
class XgbWrapper(object):
def __init__(self, seed=0, params=None):
self.param = params
self.param['seed'] = seed
self.nrounds = params.pop('nrounds', 250)
def train(self, x_train, y_train):
dtrain = xgb.DMatrix(x_train, label=y_train)
self.gbdt = xgb.train(self.param, dtrain, self.nrounds)
def predict(self, x):
return self.gbdt.predict(xgb.DMatrix(x))
def get_oof(clf):
oof_train = np.zeros((NTRAIN,))
oof_test = np.zeros((NTEST,))
oof_test_skf = np.empty((NFOLDS, NTEST))
for i, (train_index, test_index) in enumerate(kf):
print("TRAIN:", train_index, "TEST:", test_index)
x_tr = X_train[train_index]
y_tr = Y_train[train_index]
x_te = X_train[test_index]
clf.train(x_tr, y_tr)
oof_train[test_index] = clf.predict(x_te)
oof_test_skf[i, :] = clf.predict(X_test)
oof_test[:] = oof_test_skf.mean(axis=0)
return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
def stacking(nfolds, seed, et_params, rf_params, xgb_params, rd_params, ls_params, x_train, y_train, x_test):
NTRAIN = x_train.shape[0]
NTEST = len(x_test)
NFOLDS = nfolds
SEED = seed
X_train = x_train
Y_train = y_train
X_test = x_test
kf = KFold(NTRAIN, n_folds=NFOLDS, shuffle=True, random_state=SEED)
xg = XgbWrapper(seed=SEED, params=xgb_params)
et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rd = SklearnWrapper(clf=Ridge, seed=SEED, params=rd_params)
ls = SklearnWrapper(clf=Lasso, seed=SEED, params=ls_params)
xg_oof_train, xg_oof_test = get_oof(xg)
et_oof_train, et_oof_test = get_oof(et)
rf_oof_train, rf_oof_test = get_oof(rf)
rd_oof_train, rd_oof_test = get_oof(rd)
ls_oof_train, ls_oof_test = get_oof(ls)
print("XG-CV: {}".format(sqrt(mean_squared_error(Y_train, xg_oof_train))))
print("ET-CV: {}".format(sqrt(mean_squared_error(Y_train, et_oof_train))))
print("RF-CV: {}".format(sqrt(mean_squared_error(Y_train, rf_oof_train))))
print("RD-CV: {}".format(sqrt(mean_squared_error(Y_train, rd_oof_train))))
print("LS-CV: {}".format(sqrt(mean_squared_error(Y_train, ls_oof_train))))
dtrain = xgb.DMatrix(X_train, label=Y_train)
dtest = xgb.DMatrix(X_test)
res = xgb.cv(xgb_params, dtrain, num_boost_round=1000, nfold=4, seed=SEED, stratified=False,
early_stopping_rounds=25, verbose_eval=10, show_stdv=True)
best_nrounds = res.shape[0] - 1
cv_mean = res.iloc[-1, 0]
cv_std = res.iloc[-1, 1]
print('Ensemble-CV: {0}+{1}'.format(cv_mean, cv_std))
gbdt = xgb.train(xgb_params, dtrain, best_nrounds)
submission = pd.read_csv("../house_price/data/submission_xgboosting2.csv")
submission.iloc[:, 1] = gbdt.predict(dtest)
print(submission['SalePrice'])
saleprice = np.exp(submission['SalePrice']) - 1
print(saleprice)
submission['SalePrice'] = saleprice
submission.to_csv('xgstacker_starter.sub.csv', index=None)
尝试在我的数据上运行stacking
方法时出现此错误:
TypeError:只能将整数标量数组转换为标量索引
错误来自 get_oof 方法中的第x_tr = X_train[train_index]
行。
很多人在使用TensorFlow
时遇到过它,但我觉得它看起来像Numpy中的一个错误?
答案 0 :(得分:2)
在文件的开头,您可以定义一些全局变量:
NTRAIN = 100
NTEST = 0
X_train = []
Y_train = []
X_test = []
NFOLDS = 3
SEED = 0
kf = KFold(NTRAIN, n_folds=NFOLDS, shuffle=True, random_state=SEED)
然后在方法 get_oof 中使用这些全局变量,尤其是X_train
,这是一个空列表而不是您认为的那样。
在 get_oof 中添加print(X_train)
即可查看。
因为X_train
是一个列表而不是一个numpy.ndarray,你不能对你使用a numpy.ndarray做同样的索引操作,这就是你得到这个TypeError
的原因。 / p>
**如何修复它:**
在脚本开头删除/注释这些全局变量 然后,您必须将所有这些变量传递给 get_oof 方法:
def get_oof(clf, X_train, Y_train, kf, NTRAIN, NTEST, NFOLDS, X_test):
oof_train = np.zeros((NTRAIN,))
oof_test = np.zeros((NTEST,))
oof_test_skf = np.empty((NFOLDS, NTEST))
#...
然后在堆叠方法中调整代码:
xg_oof_train, xg_oof_test = get_oof(xg,X_train, Y_train, kf, NTRAIN, NTEST, NFOLDS, X_test)
et_oof_train, et_oof_test = get_oof(et,X_train, Y_train, kf, NTRAIN, NTEST, NFOLDS, X_test)
rf_oof_train, rf_oof_test = get_oof(rf,X_train, Y_train, kf, NTRAIN, NTEST, NFOLDS, X_test)
rd_oof_train, rd_oof_test = get_oof(rd,X_train, Y_train, kf, NTRAIN, NTEST, NFOLDS, X_test)
ls_oof_train, ls_oof_test = get_oof(ls,X_train, Y_train, kf, NTRAIN, NTEST, NFOLDS, X_test)