用dplyr拟合几个回归模型

时间:2014-03-28 12:47:58

标签: r dplyr

我想使用dplyr为每小时(因子变量)拟合一个模型,我得到一个错误,并且我不太确定是什么错误。

df.h <- data.frame( 
  hour     = factor(rep(1:24, each = 21)),
  price    = runif(504, min = -10, max = 125),
  wind     = runif(504, min = 0, max = 2500),
  temp     = runif(504, min = - 10, max = 25)  
)

df.h <- tbl_df(df.h)
df.h <- group_by(df.h, hour)

group_size(df.h) # checks out, 21 obs. for each factor variable

# different attempts:
reg.models <- do(df.h, formula = price ~ wind + temp)

reg.models <- do(df.h, .f = lm(price ~ wind + temp, data = df.h))

我尝试了各种各样的变化,但我无法让它发挥作用。

6 个答案:

答案 0 :(得分:25)

在dplyr 0.4中,你可以这样做:

df.h %>% do(model = lm(price ~ wind + temp, data = .))

答案 1 :(得分:9)

来自do的文档:

.f:适用于每件作品的功能。提供给.f的第一个未命名参数将是一个数据框。

所以:

reg.models <- do(df.h, 
                 .f=function(data){
                     lm(price ~ wind + temp, data=data)
                 })

可能还可以保存模型适合的小时:

reg.models <- do(df.h, 
                 .f=function(data){
                     m <- lm(price ~ wind + temp, data=data)
                     m$hour <- unique(data$hour)
                     m
                 })

答案 2 :(得分:8)

我认为你可以以更合适的方式使用dplyr,而不需要在@fabians anwser中定义函数。

results<-df.h %.% 
group_by(hour) %.% 
do(failwith(NULL, lm), formula = price ~ wind + temp)

results<-do(group_by(tbl_df(df.h), hour),
failwith(NULL, lm), formula = price ~ wind + temp)

修改 当然,它也可以在没有failwith

的情况下运行
results<-df.h %.% 
    group_by(hour) %.% 
    do(lm, formula = price ~ wind + temp)


results<-do(group_by(tbl_df(df.h), hour),
lm, formula = price ~ wind + temp)

答案 3 :(得分:5)

从2020年中开始tchakravarty's answer将失败。为了绕开broomdpylr似乎相互影响的新方法,可以使用以下broom::tidybroom::augmentbroom::glance的用法。我们只需要在do()内使用它们,然后在unnest()内使用小工具即可。

library(dplyr)
library(broom)

df.h = data.frame( 
  hour     = factor(rep(1:24, each = 21)),
  price    = runif(504, min = -10, max = 125),
  wind     = runif(504, min = 0, max = 2500),
  temp     = runif(504, min = - 10, max = 25)  
)

df.h %>% group_by(hour) %>%
  do(fitHour = tidy(lm(price ~ wind + temp, data = .))) %>% 
  unnest(fitHour)
# # A tibble: 72 x 6
#    hour  term        estimate std.error statistic   p.value
#    <fct> <chr>          <dbl>     <dbl>     <dbl>     <dbl>
#  1 1     (Intercept)   82.4     18.1         4.55  0.000248 
#  2 1     wind         -0.0212   0.0108      -1.96  0.0655   
#  3 1     temp         -1.01     0.792       -1.28  0.218    
#  4 2     (Intercept)   25.9     19.7         1.31  0.206    
#  5 2     wind          0.0204   0.0131       1.57  0.135    
#  6 2     temp          0.680    1.01         0.670 0.511    
#  7 3     (Intercept)   88.3     15.5         5.69  0.0000214
#  8 3     wind         -0.0188   0.00998     -1.89  0.0754   
#  9 3     temp         -0.669    0.653       -1.02  0.319    
# 10 4     (Intercept)   73.4     14.2         5.17  0.0000639

df.h %>% group_by(hour) %>%
  do(fitHour = augment(lm(price ~ wind + temp, data = .))) %>% 
  unnest(fitHour)
# # A tibble: 24 x 13
#    hour  r.squared adj.r.squared sigma statistic p.value    df logLik   AIC   BIC deviance
#    <fct>     <dbl>         <dbl> <dbl>     <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>
#  1 1        0.246        0.162    39.0     2.93   0.0790     2  -105.  218.  222.   27334.
#  2 2        0.161        0.0674   43.5     1.72   0.207      2  -107.  223.  227.   34029.
#  3 3        0.192        0.102    33.9     2.14   0.147      2  -102.  212.  217.   20739.
#  4 4        0.0960      -0.00445  34.3     0.956  0.403      2  -102.  213.  217.   21169.
#  5 5        0.230        0.144    31.7     2.68   0.0955     2  -101.  210.  214.   18088.
#  6 6        0.0190      -0.0900   39.8     0.174  0.842      2  -106.  219.  223.   28507.
#  7 7        0.0129      -0.0967   37.1     0.118  0.889      2  -104.  216.  220.   24801.
#  8 8        0.197        0.108    35.3     2.21   0.139      2  -103.  214.  218.   22438.
#  9 9        0.0429      -0.0634   39.4     0.403  0.674      2  -105.  219.  223.   27918.
# 10 10       0.0943      -0.00633  35.6     0.937  0.410      2  -103.  214.  219.   22854.
# # … with 14 more rows, and 2 more variables: df.residual <int>, nobs <int>

df.h %>% group_by(hour) %>%
  do(fitHour = glance(lm(price ~ wind + temp, data = .))) %>% 
  unnest(fitHour)
# # A tibble: 504 x 10
#    hour   price  wind   temp .fitted .resid .std.resid   .hat .sigma  .cooksd
#    <fct>  <dbl> <dbl>  <dbl>   <dbl>  <dbl>      <dbl>  <dbl>  <dbl>    <dbl>
#  1 1      94.2   883. -6.64     70.4  23.7       0.652 0.129    39.6 0.0209  
#  2 1      19.3  2107.  2.40     35.4 -16.0      -0.431 0.0864   39.9 0.00584 
#  3 1      60.5  2161. 18.3      18.1  42.5       1.18  0.146    38.5 0.0795  
#  4 1     116.   1244. 12.0      44.0  71.9       1.91  0.0690   35.8 0.0902  
#  5 1     117.   1624. -8.05     56.1  60.6       1.67  0.128    36.9 0.136   
#  6 1      75.0   220. -0.838    78.6  -3.58     -0.101 0.175    40.1 0.000724
#  7 1     106.    765.  6.15     60.0  45.7       1.22  0.0845   38.4 0.0461  
#  8 1      -9.89 2055. 12.3      26.5 -36.4      -0.979 0.0909   39.0 0.0319  
#  9 1      96.1   215. -8.36     86.3   9.82      0.287 0.232    40.0 0.00830 
# 10 1      27.2   323. 22.4      52.9 -25.7      -0.777 0.278    39.4 0.0774  
# # … with 494 more rows

以此为灵感向Bob Muenchen's Blog致谢。

答案 4 :(得分:4)

我相信有比 loki's answer 更简洁的答案,它放弃了因为已替换/superseded do()

library(dplyr)
library(broom)
library(tidyr)

h.lm <- df.h %>%
      nest_by(hour) %>%
      mutate(fitHour = list(lm(price ~ wind + temp, data = data))) %>%
      summarise(tidy_out = list(tidy(fitHour)),
                glance_out = list(glance(fitHour)),
                augment_out = list(augment(fitHour))) %>%
      ungroup()

h.lm
# # A tibble: 24 x 4
#    hour  tidy_out         glance_out        augment_out
#    <fct> <list>           <list>            <list>
#  1 1     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  2 2     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  3 3     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  4 4     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  5 5     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  6 6     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  7 7     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  8 8     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
#  9 9     <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# 10 10    <tibble [3 × 5]> <tibble [1 × 12]> <tibble [21 × 9]>
# # … with 14 more rows

类似于他们的回答,为了访问,只需取消嵌套所需的任何组件:

unnest(select(h.lm, hour, tidy_out))
# # A tibble: 72 x 6
#    hour  term        estimate std.error statistic p.value
#    <fct> <chr>          <dbl>     <dbl>     <dbl>   <dbl>
#  1 1     (Intercept) 63.2       20.9        3.02  0.00728
#  2 1     wind        -0.00237    0.0139    -0.171 0.866
#  3 1     temp        -0.266      0.950     -0.280 0.783
#  4 2     (Intercept) 65.1       23.0        2.83  0.0111
#  5 2     wind         0.00691    0.0129     0.535 0.599
#  6 2     temp        -0.448      0.877     -0.510 0.616
#  7 3     (Intercept) 65.2       17.8        3.67  0.00175
#  8 3     wind         0.00515    0.0112     0.458 0.652
#  9 3     temp        -1.87       0.695     -2.69  0.0148
# 10 4     (Intercept) 49.7       17.6        2.83  0.0111
# # … with 62 more rows

答案 5 :(得分:2)

从 dplyr 1.0.0 开始,group_split 为这个操作提供了一个方便的快捷方式:

library(dplyr)
library(broom)
library(purrr)
df.h <- data.frame( 
  hour     = factor(rep(1:24, each = 21)),
  price    = runif(504, min = -10, max = 125),
  wind     = runif(504, min = 0, max = 2500),
  temp     = runif(504, min = - 10, max = 25)  
)

df.g <- group_split(df.h, hour)
map_dfr(df.g, function(x) tidy(lm(price ~ wind + temp, data=x)))
#> # A tibble: 72 x 5
#>    term        estimate std.error statistic p.value
#>    <chr>          <dbl>     <dbl>     <dbl>   <dbl>
#>  1 (Intercept) -10.4      20.3       -0.512 0.615  
#>  2 wind          0.0377    0.0117     3.23  0.00467
#>  3 temp          1.34      0.890      1.50  0.150  
#>  4 (Intercept)  34.6      18.6        1.86  0.0799 
#>  5 wind          0.0214    0.0125     1.71  0.104  
#>  6 temp          0.332     0.865      0.384 0.706  
#>  7 (Intercept)  42.5      15.3        2.79  0.0122 
#>  8 wind          0.0103    0.0116     0.888 0.386  
#>  9 temp         -0.542     0.736     -0.736 0.471  
#> 10 (Intercept)  64.1      18.8        3.41  0.00312
#> # … with 62 more rows

reprex package (v1.0.0) 于 2021 年 3 月 4 日创建