I'm new to machine learning in Python, and I've come across the concept of model stacking and would like to try it out. The problem is that I don't fully understand how machine learning is implemented in Python, so I don't know how to make predictions on new data. The code I've managed to cobble together looks like this:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from vecstack import stacking
import pandas as pd
X = pd.read_csv('db/file_name3.csv')
y = pd.read_csv('db/train_labels(1).csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
models = [
    CatBoostRegressor(iterations=200,
                      learning_rate=0.03,
                      depth=4,
                      loss_function='RMSE',
                      eval_metric='RMSE',
                      random_seed=99,
                      od_type='Iter',
                      od_wait=50,
                      logging_level='Silent'),
    CatBoostRegressor(iterations=500,
                      learning_rate=0.06,
                      depth=3,
                      loss_function='RMSE',
                      eval_metric='RMSE',
                      random_seed=99,
                      od_type='Iter',
                      od_wait=50,
                      logging_level='Silent'),
    ExtraTreesRegressor(random_state=0, n_jobs=-1,
                        n_estimators=100, max_depth=3),
    RandomForestRegressor(random_state=0, n_jobs=-1,
                          n_estimators=300, max_depth=3),
    XGBRegressor(eta=0.02, reg_lambda=5, reg_alpha=1),
    XGBRegressor(eta=0.1, reg_lambda=1, reg_alpha=10),
    XGBRegressor(eta=0.02, reg_lambda=1, reg_alpha=10, n_estimators=300),
    XGBRegressor(eta=0.012, max_depth=3, n_estimators=200),
    GradientBoostingRegressor(),
    BaggingRegressor(),
]
test1 = pd.read_csv('db/Cleaned Data.csv')

# First-level (base) models: the fourth argument is the data the stacked
# features are generated for (here X_train again, i.e. the training data itself)
S_train, S_test = stacking(models, X_train, y_train, X_train,
                           regression=True, metric=mean_absolute_error, n_folds=10,
                           shuffle=True, random_state=0, verbose=2)

# Second-level (meta) model trained on the stacked features
model = XGBRegressor()
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
print(y_pred.shape)
As you can see, test1 is the data I want to predict on, but I can't figure out how. I can make predictions for data from my training set, but not for a new data set. I haven't changed any model parameters from the documentation.
Answer 0 (score: 0)
# The pattern below skips the stacking layer: cross-validate a single model,
# refit it on the full training data, then call predict() on the new data.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# load your X, y and test1 data here
RF = RandomForestRegressor(random_state=0, n_jobs=-1,
                           n_estimators=300, max_depth=3)
# Validation function: cross-validated RMSE for a given model
n_folds = 5

def rmsle_cv(model):
    # Pass the KFold object itself so shuffle and random_state are actually used
    kf = KFold(n_folds, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X.values, y.values.ravel(),
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse

score = rmsle_cv(RF)
print("Random Forest score: {:.3f} ({:.3f})\n".format(score.mean(),
                                                      score.std()))
# Refit on the full training data, then predict directly on the new data
RF.fit(X, y.values.ravel())
RF_train_pred = RF.predict(X)
RF_pred = RF.predict(test1)
print(RF_pred.shape)
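For completeness, the same fit-then-predict pattern also answers the original stacking question. With vecstack's functional API, the fourth argument to stacking() is the data you want stacked features for, so passing test1 there (with the base models trained on the full training data) yields the features the second-level model can predict on. Below is a minimal sketch, assuming X, y, test1 and the models list from the question are already defined and that test1 has the same feature columns as X; the choice of XGBRegressor as the second-level model here is an arbitrary illustration, not something from the question:

from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from vecstack import stacking

# Base-model features: S_train for the training data, S_test1 for the new data.
# The fourth argument (test1) is the set we ultimately want predictions for.
S_train, S_test1 = stacking(models, X, y.values.ravel(), test1,
                            regression=True, metric=mean_absolute_error,
                            n_folds=10, shuffle=True, random_state=0, verbose=2)

# Second-level (meta) model trained on the stacked features (arbitrary choice)
meta_model = XGBRegressor(n_estimators=200)
meta_model.fit(S_train, y.values.ravel())

# Predictions for the new data
test1_pred = meta_model.predict(S_test1)
print(test1_pred.shape)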