Question

我想对数据框中的数据拟合阶跃函数，并确定多少个切点能得到最低的 MSE，但一直遇到同一条错误消息：

Error in data.frame(..., check.names = FALSE) : arguments imply differing number of rows: 149, 1332

我的代码和模拟数据框如下：

# Packages used by the example
library(tidyverse)
library(rsample)
library(broom)
library(rcfss)

# Reproducible dummy data: 1481 rows, two uniformly distributed columns
set.seed(666)
df <- tibble(egalit_scale = runif(1481, 1, 35), income06 = runif(1481, 1, 25))
# Split the data into 10 cross-validation folds
training_df <- vfold_cv(df, 10)
# Fit a step-function model for one resample and return its MSE.
#
# splits: a single rsample split object (one element of training_df$splits)
# cc:     value passed to cut() as its `breaks` argument for income06
#
# Returns the `.estimate` value of the mse() result computed on the
# assessment rows of `splits`.
mse_df <- function(splits, cc){
  # Model is fitted on the analysis rows only
  model <- glm(egalit_scale ~ cut(income06, cc),
              data = analysis(splits))
  # NOTE(review): when `cc` is a single number, cut() derives its intervals
  # from the data it is applied to, so the bins built for the assessment rows
  # may not match those of the analysis rows -- possible cause of the
  # reported row-count error; confirm.
  # Predictions for the assessment rows, rounded before scoring
  model_mse <- augment(model, newdata = assessment(splits)) %>%
    mse(truth = egalit_scale, estimate = round(.fitted))
  model_mse$.estimate
}

# Build every (fold id, number of cut points) combination, attach the matching
# split object to each row, then score each combination with mse_df().
tidyr::expand(training_df, id, cc = 2:15) %>%
  # Explicit key: `id` is the only column shared with training_df
  left_join(training_df, by = "id") %>%
  mutate(mse = map2(splits, cc, mse_df))

错误发生在使用 map2 的步骤中。我尝试用固定数量的切点分别在 10 折交叉验证的每一折上运行，结果发现该函数在 10 折中的 9 折上可以正常运行，但在剩下的 1 折上不行。有人可以帮我吗？

Answer 1

问题来自

augment(model, newdata = assessment(splits))

因为在上一步中

model <- glm(egalit_scale ~ cut(income06, cc),
          data = analysis(splits))

我们是在 analysis(splits) 而不是 assessment(splits) 上拟合模型的，二者的行数不同，例如：

# Grid of (fold id, cut-point count) rows with the split object attached;
# the join key is stated explicitly rather than inferred.
out <- tidyr::expand(training_df, id, cc = 2:15) %>%
  left_join(training_df, by = "id")

# Inspect the split object stored in the first row
tmp <- out$splits[[1]]



# Rows used to fit the model for this split
analysis(tmp)
# A tibble: 1,332 x 2
#   egalit_scale income06
#          <dbl>    <dbl>
# 1        27.3      9.69
# 2         7.71     8.48
# 3        34.3     21.3 
# 4         7.85    15.8 
# 5        13.3     24.6 
# 6        26.2      8.67
# 7        34.3      4.78
# 8        17.9     16.8 
# 9         1.45    21.2 
#10         9.84    15.7 
# … with 1,322 more rows

# Rows passed as `newdata` when scoring this split
assessment(tmp)
# A tibble: 149 x 2
#   egalit_scale income06
#          <dbl>    <dbl>
# 1        28.6     14.8 
# 2        17.8      2.47
# 3         5.03    24.3 
# 4        31.5      5.79
# 5        18.4     18.0 
# 6         4.05     8.06
# 7         2.28     8.16
# 8        28.6     16.8 
# 9        21.1      7.03
#10         3.67    14.2 
# … with 139 more rows

因此，如果我们使用assessment更改模型语句

# Variant of mse_df() that fits and scores on the same rows.
#
# splits: a single rsample split object
# cc:     value passed to cut() as its `breaks` argument for income06
#
# Returns the `.estimate` value of the mse() result.
mse_df <- function(splits, cc){
  # Fit on the assessment rows so the model and `newdata` have matching rows
  model <- glm(egalit_scale ~ cut(income06, cc),
              data = assessment(splits))
  # NOTE(review): the model is evaluated on the very rows it was fitted on,
  # so this value is an in-sample error rather than a held-out estimate --
  # confirm that is acceptable for choosing the number of cut points.
  model_mse <- augment(model, newdata = assessment(splits)) %>%
    mse(truth = egalit_scale, estimate = round(.fitted))
  model_mse$.estimate
}



library(yardstick)
# Score every (fold id, cut-point count) pair; map2_dbl() returns a plain
# numeric column instead of a list column.
out1 <- tidyr::expand(training_df, id, cc = 2:15) %>%
  left_join(training_df, by = "id") %>%
  mutate(mse = map2_dbl(splits, cc, mse_df))
out1
# A tibble: 140 x 4
#   id        cc splits               mse
#   <chr>  <int> <named list>       <dbl>
# 1 Fold01     2 <split [1.3K/149]>  94.9
# 2 Fold01     3 <split [1.3K/149]>  94.6
# 3 Fold01     4 <split [1.3K/149]>  93.8
# 4 Fold01     5 <split [1.3K/149]>  94.5
# 5 Fold01     6 <split [1.3K/149]>  94.0
# 6 Fold01     7 <split [1.3K/149]>  92.0
# 7 Fold01     8 <split [1.3K/149]>  88.9
# 8 Fold01     9 <split [1.3K/149]>  91.2
# 9 Fold01    10 <split [1.3K/149]>  92.8
#10 Fold01    11 <split [1.3K/149]>  86.0
# … with 130 more rows

map2错误“参数暗示行数不同”

1 个答案: