无法使用dplyr进行分组

时间:2016-05-20 19:37:46

标签: r dplyr reshape

我有下面这张表,想计算 tf_diff 按 cnrd_marsh 和 timefact_hour 分组后的平均值。但是,当我使用 dplyr 时,出现了下面的错误,我不清楚原因。我还尝试过删除 NA 值,并把第一列从因子转换为字符,但都没有解决问题。有什么想法吗?下面附上了一部分数据样本。

library(dplyr)
library(tidyr)

# Mean of tf_diff for each (cnrd_marsh, timefact_hour) group, then spread the
# cnrd_marsh values out into columns. On the sample data shown below, this
# pipeline stops with the "Key column 'cnrd_marsh' does not exist" error.
output <- foo %>%
  group_by(cnrd_marsh, timefact_hour) %>%
  summarise(tf_diff = mean(tf_diff)) %>%
  spread(cnrd_marsh, tf_diff)

Error: Key column 'cnrd_marsh' does not exist in input.



> foo
   cnrd_marsh timefact_hour tf_diff
1        <NA>            NA      NA
2          БЧ            14      19
3          БЧ            14       5
4        <NA>            NA      NA
5          БЧ            13       1
6          БЧ            13      18
7          БЧ            13      31
8          БЧ            13       2
9        <NA>            NA      NA
10         БЧ            12       5
11         БЧ            12      10
12         БЧ            12       1
13         БЧ            12      17
14       <NA>            NA      NA
15         БЧ            11       2
16         БЧ            11      18
17         БЧ            11       4
18         БЧ            11       7
19         БЧ            11      16
20         БЧ            11       3
21       <NA>            NA      NA
22         БЧ            10      11
23         БЧ            10       6
24         БЧ            10      10
25       <NA>            NA      NA
26         БЧ             9      17
27         БЧ             9       6
28         БЧ             9      15
29         БЧ             9       4
30         БЧ             9       9
31         БЧ             9       1
32       <NA>            NA      NA
33         БЧ             8      16
34         БЧ             8       8
35         БЧ             8      14
36         БЧ             8       3
37         БЧ             8      11
38         БЧ             8       1
39       <NA>            NA      NA
40         БЧ             7      14
41         БЧ             7       6
42         БЧ             7      14
43         БЧ             7       5
44       <NA>            NA      NA
45         БЧ             6       0
46         БЧ             6       9
47         БЧ             6      10
48         БЧ             6      15
49       <NA>            NA      NA
50       <NA>            NA      NA

输出应该像这样

> head(output)
  cnrd_marsh timefact_hour           tf_diff
1         БЧ            14 12.00000000000000
2         БЧ            13 13.00000000000000
3         БЧ            12  8.25000000000000
4         БЧ            11  8.33333330000000
5         БЧ            10  9.00000000000000
6         БЧ             9  8.66666666666667

以下是数据样本。

> dput(foo)
structure(list(cnrd_marsh = c(NA, "БЧ", "БЧ", NA, "БЧ", 
"БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, 
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", 
"БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, 
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", 
"БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, NA), 
    timefact_hour = c(NA, 14L, 14L, NA, 13L, 13L, 13L, 13L, NA, 
    12L, 12L, 12L, 12L, NA, 11L, 11L, 11L, 11L, 11L, 11L, NA, 
    10L, 10L, 10L, NA, 9L, 9L, 9L, 9L, 9L, 9L, NA, 8L, 8L, 8L, 
    8L, 8L, 8L, NA, 7L, 7L, 7L, 7L, NA, 6L, 6L, 6L, 6L, NA, NA
    ), tf_diff = c(NA, 19, 5, NA, 1, 18, 31, 2, NA, 5, 10, 1, 
    17, NA, 2, 18, 4, 7, 16, 3, NA, 11, 6, 10, NA, 17, 6, 15, 
    4, 9, 1, NA, 16, 8, 14, 3, 11, 1, NA, 14, 6, 14, 5, NA, 0, 
    9, 10, 15, NA, NA)), .Names = c("cnrd_marsh", "timefact_hour", 
"tf_diff"), vars = list(timefact_hour), row.names = c(NA, 50L
), class = "data.frame")

4 个答案:

答案 0 :(得分:2)

如果把该输入中 vars 的值改成带引号的字符串(即 'timefact_hour'),您的代码就可以运行并生成输出:

# Sample data rebuilt from the question's dput() output. The only difference
# is that the `vars` attribute holds the quoted string 'timefact_hour'
# instead of the bare symbol timefact_hour.
foo <-
structure(list(cnrd_marsh = c(NA, "БЧ", "БЧ", NA, "БЧ", 
"БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, 
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", 
"БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, 
"БЧ", "БЧ", "БЧ", "БЧ", "БЧ", "БЧ", NA, "БЧ", "БЧ", 
"БЧ", "БЧ", NA, "БЧ", "БЧ", "БЧ", "БЧ", NA, NA), 
    timefact_hour = c(NA, 14L, 14L, NA, 13L, 13L, 13L, 13L, NA, 
    12L, 12L, 12L, 12L, NA, 11L, 11L, 11L, 11L, 11L, 11L, NA, 
    10L, 10L, 10L, NA, 9L, 9L, 9L, 9L, 9L, 9L, NA, 8L, 8L, 8L, 
    8L, 8L, 8L, NA, 7L, 7L, 7L, 7L, NA, 6L, 6L, 6L, 6L, NA, NA
    ), tf_diff = c(NA, 19, 5, NA, 1, 18, 31, 2, NA, 5, 10, 1, 
    17, NA, 2, 18, 4, 7, 16, 3, NA, 11, 6, 10, NA, 17, 6, 15, 
    4, 9, 1, NA, 16, 8, 14, 3, 11, 1, NA, 14, 6, 14, 5, NA, 0, 
    9, 10, 15, NA, NA)), .Names = c("cnrd_marsh", "timefact_hour", 
"tf_diff"), vars = list('timefact_hour'), row.names = c(NA, 50L
), class = "data.frame")


 # Average tf_diff within every (cnrd_marsh, timefact_hour) pair, then turn
 # each distinct cnrd_marsh value into its own column keyed by timefact_hour.
 # NA rows are not removed here, so they form a group of their own.
 output <- foo %>%
   group_by(cnrd_marsh, timefact_hour) %>%
   summarize(tf_diff = mean(x = tf_diff)) %>%
   spread(key = cnrd_marsh, value = tf_diff)
 # Auto-print the reshaped table.
 output
#-----------------------
Source: local data frame [10 x 3]

   timefact_hour         БЧ    NA
*          <int>      <dbl> <dbl>
1              6  8.5000000    NA
2              7  9.7500000    NA
3              8  8.8333333    NA
4              9  8.6666667    NA
5             10  9.0000000    NA
6             11  8.3333333    NA
7             12  8.2500000    NA
8             13 13.0000000    NA
9             14 12.0000000    NA
10            NA         NA    NA

这印证了 @joran 提出的担忧。下面是一个在我看来更合理的 base R(基础 R)解决方案:

> with(foo, tapply(tf_diff, list(timefact_hour, cnrd_marsh), mean, na.rm=TRUE))
          БЧ
6   8.500000
7   9.750000
8   8.833333
9   8.666667
10  9.000000
11  8.333333
12  8.250000
13 13.000000
14 12.000000

答案 1 :(得分:1)

您遇到的错误 "Error: Key column 'cnrd_marsh' does not exist in input." 是因为在 group_by 之后使用 summarise 时,它只会返回 "tf_diff" 列。因此在执行 spread 时,它无法同时找到 'cnrd_marsh' 和 'tf_diff' 这两列。可以改用下面的写法,它应该能满足您的需求:

 # Keep only rows with no missing values (original author's note: otherwise
 # spread() will fail). This overwrites foo in place.
 foo <- foo[complete.cases(foo), ]
 output <- foo %>%
   group_by(cnrd_marsh, timefact_hour) %>%
   # mutate() keeps every row, so each row of a group now carries the group
   # mean. Spell out TRUE: `T` is an ordinary variable that can be reassigned.
   mutate(tf_diff = mean(tf_diff, na.rm = TRUE)) %>%
   # Collapse the now-identical rows so one row per group remains.
   unique() %>%
   # One column per distinct cnrd_marsh value, one row per timefact_hour.
   spread(cnrd_marsh, tf_diff)

答案 2 :(得分:0)

这段代码似乎可以解决问题:

library(plyr)

# Split foo by the (cnrd_marsh, timefact_hour) combinations and compute the
# mean of tf_diff for each piece; the result column is named `mean`.
cdata <- ddply(
  .data = foo,
  .variables = c("cnrd_marsh", "timefact_hour"),
  .fun = summarise,
  mean = mean(tf_diff)
)
# Auto-print the summary table.
cdata

答案 3 :(得分:0)

只需先清除数据集中的 NA,就可以正常运行了。

# Drop every row that contains an NA, average tf_diff within each
# (cnrd_marsh, timefact_hour) group, then spread the cnrd_marsh values
# out into columns.
output <- foo %>%
  na.omit() %>%
  group_by(cnrd_marsh, timefact_hour) %>%
  summarize(tf_diff = mean(x = tf_diff)) %>%
  spread(key = cnrd_marsh, value = tf_diff)