当我直接进行简单线性回归而不是可以接受任何建模参数并进行交叉验证的函数时,我会得到不同的结果。这似乎是对我的唯一区别,但为什么结果如此不同。
x_train,x_test,y_train,y_test = train_test_split(features.values,target.values)
regr = linear_model.LinearRegression()
regr.fit(x_train, y_train)
print(np.mean((regr.predict(x_test) - y_test) ** 2))
395.68
from sklearn.model_selection import KFold
from scipy.sparse import csr_matrix
from sklearn.metrics import r2_score
def cv_predict_report(func,params,features,target,fold,verbose=False):
model_details,rmse = [],[]
kf = KFold(n_splits=fold)
for train_ix, test_ix in kf.split(features):
x_train_poly, x_test_poly = features.iloc[train_ix].values,features.iloc[test_ix].values
y_train,y_test = target.iloc[train_ix],target.iloc[test_ix]
#model
model = func(**params)
model.fit(csr_matrix(x_train_poly),y_train)
#save model details
model_details.append(model)
#predictions
preds = model.predict(x_test_poly)
model_rmse = np.mean((preds - y_test) ** 2)
rmse.append(model_rmse)
return model_details,rmse
from sklearn import linear_model
lm_model,lm_rmse = cv_predict_report(linear_model.LinearRegression,{},features,target,fold = 2)
print(lm_rmse)
[440.2067193330667, 437.6263639733618]
答案 0 :(得分:0)
我刚才意识到问题所在。通用函数调用' csr_matrix'这表现得很奇怪并导致更高的错误。我将阅读csr_matrix及其导致这种差异的原因