Question

我创建了一个 tidymodels 管道，它一直运行正常，直到我把创建 rsplit 对象的函数从 initial_split() 换成 initial_time_split()。换了之后，last_fit() 函数就会失败并报出以下错误：

x : internal: Error in data.frame(..., check.names = FALSE): arguments imply differing number of rows: 2, 0 Warning message: All
models failed in [fit_resamples()]. See the .notes column.

我需要使用initial_time_split()，因为我有一组特定的训练集和另一组用于测试（它们是不同的时间段），所以随机抽取行是行不通的。

有人知道如何解决此问题，或者知道更好的方法吗？

非常感谢您的关注！

下面是产生该错误的代码。请注意，如果把 ATTENTION 注释后面的 df_split 那一行换成其下方被注释掉的那一行，代码就可以正常运行。

library(tidymodels)
library(dplyr)
library(fastDummies)

# Load and prepare the data ----
# Read the CSV from the remote URL, drop column `X`, dummy-encode with
# fastDummies::dummy_cols() (discarding the columns the dummies were built
# from), and store the outcome `label` as a factor.
data <- read.csv("https://uc4f84ae07955bebed2c3804d381.dl.dropboxusercontent.com/cd/0/get/A-PcNiWAKII0M8OlwmYxE1fYXFhtTUPnLw2x_AvL3IlUR_HE8_IPdTVPaYj1mtQwByPgcq2qpj-bfb4O8-wgW4rqgAnff4cLbNSSGe44FewPUmxenJZBpvxXikDQUyVVXXY/file?_download_id=89899030331414668534005769335371927551009313297349360886651029534&_notify_domain=www.dropbox.com&dl=1") %>%
  select(-X) %>%
  dummy_cols(remove_selected_columns = TRUE) %>%
  mutate(label = as.factor(label))
  

# Train / test split ----
set.seed(123)

# Share of rows flagged as training observations in the `train` column.
proportion <- sum(data$train) / nrow(data)

# ATTENTION #
# initial_time_split() assigns the leading `prop` share of rows (in row
# order) to the training set and the remainder to the test set.
# NOTE(review): this presumes the rows with train == 1 come first in
# `data` -- confirm the ordering of the file.
df_split <- initial_time_split(data, prop = proportion) # this call seems to trigger the problem
# df_split <- initial_split(data, prop = 3/4, strata = label) # using this line instead of the previous one avoids the error

# Materialise both partitions.
df_train <- training(df_split)
df_test <- testing(df_split)

# Recipe ----
# Model `label` on every other column of the training set, then give
# `x`, `y` and `train` the "ID" role instead of their default role.
recipe <- update_role(
  recipe(label ~ ., data = df_train),
  x, y, train,
  new_role = "ID"
)

# Model specification ----
# Number of cores reported by the machine; handed to xgboost as `nthread`.
cores <- parallel::detectCores()

# Boosted-tree classifier on the xgboost engine. Every hyperparameter is
# marked with tune() so it can be filled in by the grid search.
xgb_spec <-
  boost_tree(
    trees = tune(),
    tree_depth = tune(),
    min_n = tune(),
    loss_reduction = tune(),
    sample_size = tune(),
    mtry = tune(),
    learn_rate = tune()
  ) %>%
  set_engine("xgboost", nthread = cores) %>%
  set_mode("classification")

# Hyperparameter grid ----
# Max-entropy grid with 7 candidate combinations. The upper bound of
# `mtry` is unknown up front, so it is finalized against the training data.
set.seed(123)
xgb_grid <-
  grid_max_entropy(
    trees(),
    tree_depth(),
    min_n(),
    loss_reduction(),
    sample_size = sample_prop(),
    finalize(mtry(), df_train),
    learn_rate(),
    size = 7
  )

# Cross-validation resamples ----
# Two folds over the training set, stratified on the outcome `label`.
set.seed(123)
folds <-
  vfold_cv(
    df_train,
    v = 2,
    strata = label
  )

# Workflow ----
# Bundle the model specification and the recipe into a single workflow.
xgb_workflow <- workflow() %>%
  add_model(xgb_spec) %>%
  add_recipe(recipe)


# Tuning ----
# Evaluate every grid candidate on the cross-validation folds, scoring
# each one by ROC AUC.
set.seed(123)
xgb_res <- xgb_workflow %>%
  tune_grid(
    resamples = folds,
    grid = xgb_grid,
    metrics = metric_set(roc_auc)
  )

# Pick the candidate with the best ROC AUC. `metric` is passed by name:
# newer tune releases no longer accept it positionally, and the named form
# also works with the version used here.
best_auc <- select_best(xgb_res, metric = "roc_auc")

# Plug the winning hyperparameters into the workflow.
final_xgb <- finalize_workflow(xgb_workflow, best_auc)

# Final fit ----
# Fit the finalized workflow on the training portion of `df_split` and
# evaluate it on the test portion.
final_res <- final_xgb %>%
  last_fit(split = df_split)

x : internal: Error in data.frame(..., check.names = FALSE): arguments
imply differing number of rows: 2, 0 Warning message: All models
failed in [fit_resamples()]. See the `.notes` column.

会话信息

R version 4.0.2 (2020-06-22) Platform: x86_64-w64-mingw32/x64 (64-bit) Running under: Windows 10 x64 (build 18362)

Matrix products: default

locale: [1] LC_COLLATE=English_United States.1252
LC_CTYPE=English_United States.1252 LC_MONETARY=English_United
States.1252 [4] LC_NUMERIC=C LC_TIME=English_United States.1252

attached base packages: [1] stats graphics grDevices utils datasets
methods base

other attached packages: [1] fastDummies_1.6.1 xgboost_1.1.1.1
vip_0.2.2 themis_0.1.2 yardstick_0.0.7 workflows_0.1.3 tune_0.1.1
tidyr_1.1.1 tibble_3.0.3 rsample_0.0.7 recipes_0.1.13 purrr_0.3.4
parsnip_0.1.3 modeldata_0.0.2 infer_0.5.3 ggplot2_3.3.2 dplyr_1.0.2
dials_0.0.8 scales_1.1.1 broom_0.7.0 tidymodels_0.1.1

Tidymodels：使用 rsample::initial_time_split() 时 last_fit() 出错

0 个答案: