我想在改变所用特征数量(最多 10 个)的同时,使用 5 折交叉验证来计算训练误差和验证误差。我的思路是:对每一种特征数量,先分别计算各折上的误差,再对各折的误差取平均。
我找不到计算这些平均误差值的正确方法。
# Estimate the training and validation mean squared error of a linear
# regression as a function of how many leading feature columns are used
# (1..n), averaging each error over the K cross-validation folds.
#
# NOTE(review): assumes `y` is the full target array, aligned row-for-row
# with `X`, and that both support NumPy-style index arrays. The earlier
# version referenced `y_train` / `y_val` without ever deriving them from the
# fold indices -- confirm the name of the target variable.
n = 10  # Number of features
K = 5  # Number of folds to split the data into

err_train = np.zeros(n)  # Mean training error per feature count
err_val = np.zeros(n)  # Mean validation error per feature count

# The splitter does not depend on the feature count, so build it once.
kf = KFold(n_splits=K, shuffle=False)

for i in range(n):
    # Per-fold errors for the model using the first (i + 1) features.
    train_errors_per_cv_iteration = []
    test_errors_per_cv_iteration = []

    for train_indices, test_indices in kf.split(X):
        # Slice features AND targets with the same fold indices so the
        # rows stay matched.
        X_train = X[train_indices, :]
        X_val = X[test_indices, :]
        y_train = y[train_indices]
        y_val = y[test_indices]

        # Fit on the training part of this fold only.
        reg = LinearRegression(fit_intercept=True)
        reg = reg.fit(X_train[:, :(i + 1)], y_train)

        # Training error of this fold.
        pred = reg.predict(X_train[:, :(i + 1)])
        train_errors_per_cv_iteration.append(mean_squared_error(y_train, pred))

        # Validation error of this fold.
        pred = reg.predict(X_val[:, :(i + 1)])
        test_errors_per_cv_iteration.append(mean_squared_error(y_val, pred))

    # Average over the K folds instead of keeping only the last fold's value.
    err_train[i] = np.mean(train_errors_per_cv_iteration)
    err_val[i] = np.mean(test_errors_per_cv_iteration)