我创建了一个tidymodel管道,该管道工作得很好,直到将创建rsplit对象的方法从initial_split()
更改为initial_time_split()
为止。当我这样做时,last_fit()
函数会中断并显示错误:
x : internal: Error in data.frame(..., check.names = FALSE): arguments imply differing number of rows: 2, 0 Warning message: All
models failed in [fit_resamples()]. See the .notes column.
我需要使用initial_time_split()
,因为我有一组特定的训练集和另一组用于测试(它们是不同的时间段),所以随机抽取行是行不通的。
有人知道如何解决此问题,或者知道更好的方法吗?
非常感谢您的光临!
这是产生错误的代码。请注意,如果您在df_split
注释后更改ATTENTION
行,它将可以正常工作。
library(tidymodels)
library(dplyr)
library(fastDummies)
#download the data
data = read.csv("https://uc4f84ae07955bebed2c3804d381.dl.dropboxusercontent.com/cd/0/get/A-PcNiWAKII0M8OlwmYxE1fYXFhtTUPnLw2x_AvL3IlUR_HE8_IPdTVPaYj1mtQwByPgcq2qpj-bfb4O8-wgW4rqgAnff4cLbNSSGe44FewPUmxenJZBpvxXikDQUyVVXXY/file?_download_id=89899030331414668534005769335371927551009313297349360886651029534&_notify_domain=www.dropbox.com&dl=1")
data = data %>%
select(-X) %>%
dummy_cols(remove_selected_columns = TRUE) %>%
mutate(label = as.factor(label))
#Creation of train and test splits
set.seed(123)
proportion = sum(data$train)/nrow(data) #proportion of train observations in the data
# ATTENTION #
df_split = initial_time_split(data, prop = proportion) #IT SEEMS THAT THIS CREATES THE PROBLEM
#df_split = initial_split(data, prop = 3/4, strata = label) #changing this line with the previous fixes the problem
df_train <- training(df_split)
df_test <- testing(df_split)
#Recipe
recipe <-
recipe(label ~ ., data = df_train) %>%
update_role(x, y, train, new_role = "ID")
#model specificaction
cores = parallel::detectCores()
xgb_spec <- boost_tree(
trees = tune(),
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
mtry = tune(),
learn_rate = tune(),
) %>%
set_engine("xgboost", nthread = cores) %>%
set_mode("classification")
#Hiperparameters
set.seed(123)
xgb_grid <- grid_max_entropy(
trees(),
tree_depth(),
min_n(),
loss_reduction(),
sample_size = sample_prop(),
finalize(mtry(), df_train),
learn_rate(),
size = 7
)
#Cross validation
set.seed(123)
folds <- vfold_cv(df_train, v = 2, strata = label)
#Model workflow
xgb_workflow <-
workflow() %>%
add_model(xgb_spec) %>%
add_recipe(recipe)
### Training###
set.seed(123)
xgb_res = xgb_workflow %>%
tune_grid(resamples = folds,
grid = xgb_grid,
metrics = metric_set(roc_auc))
#selection of best model
best_auc <- select_best(xgb_res, "roc_auc")
#adding best model to workflow
final_xgb <- finalize_workflow(
xgb_workflow,
best_auc)
#Last fit
final_res <- last_fit(object = final_xgb, split = df_split)
x : internal: Error in data.frame(..., check.names = FALSE): arguments
imply differing number of rows: 2, 0 Warning message: All models
failed in [fit_resamples()]. See the `.notes` column.
会话信息
R version 4.0.2 (2020-06-22) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 10 x64 (build 18362)
Matrix products: default
locale: [1] LC_COLLATE=English_United States.1252
LC_CTYPE=English_United States.1252 LC_MONETARY=English_United
States.1252 [4] LC_NUMERIC=C LC_TIME=English_United States.1252
attached base packages: [1] stats graphics grDevices utils datasets
methods base
other attached packages: [1] fastDummies_1.6.1 xgboost_1.1.1.1
vip_0.2.2 themis_0.1.2 yardstick_0.0.7 workflows_0.1.3 tune_0.1.1
tidyr_1.1.1 tibble_3.0.3 rsample_0.0.7 recipes_0.1.13 purrr_0.3.4
parsnip_0.1.3 modeldata_0.0.2 infer_0.5.3 ggplot2_3.3.2 dplyr_1.0.2
dials_0.0.8 scales_1.1.1 broom_0.7.0 tidymodels_0.1.1