在R中,使用dplyr包中的summarise来创建一系列列

时间:2015-03-30 17:23:08

标签: r dplyr

有没有办法一次创建一系列列,而不必逐一手写?例如,我想对某个国家的各个州(State)分别进行汇总。假设我的df如下:

 head(df)
     Time   day  State  Vote
     d1-h1   1    state1  5
     d1-h2   1    state1  3
     d1-h3   1    state1  8
     d2-h1   2    state1  4
     d2-h2   2    state1  5
     d2-h3   2    state1  1
     d1-h1   1    state2  5
     d1-h2   1    state2  3
     d1-h3   1    state2  8
     d2-h1   2    state2  3
     d2-h2   2    state2  1
     d2-h3   2    state2  4

逐个手写每一列的做法如下

 # Hand-written version: one summarise() argument per state.
 # Each column sums Vote only over the rows that belong to that state.
 df2 <- df %>%
   group_by(Time) %>%
   summarise(state1 = sum(Vote[State == "state1"]),
             state2 = sum(Vote[State == "state2"]),
             # ... state3 through state26 are written out the same way ...
             state27 = sum(Vote[State == "state27"]))

我尝试用循环来解决这个问题,但它不起作用

   # Attempt: loop over the distinct states and build one summary column per state.
   states<- unique(df$State)
   for(i in 1:length(states))
   { 
       # This line does not parse: inside a call, the left-hand side of `=` must be
       # a name or a string constant, not a call such as paste0(...).
       # In addition, df2 is reassigned from df on every iteration, so even with a
       # valid column name only the last state's column would remain.
       df2<- df %>% group_by(Time) %>% summarise( paste0("Vote_",states[i]) = sum(Vote[State == states[i]]))
   }

1 个答案:

答案 0 :(得分:3)

我认为这是一个数据重塑(reshaping)问题。您可以尝试任何把“长”格式转换为“宽”格式的函数(dcast、spread,或base R中的reshape)来获得预期的输出。

library(reshape2)
# Cast from long to wide: one row per Time, one column per State,
# with the cells filled from Vote.
dcast(
  data = df,
  formula = Time ~ State,
  value.var = "Vote"
)
#     Time state1 state2
#1 d1-h1      5      5
#2 d1-h2      3      3
#3 d1-h3      8      8
#4 d2-h1      4      3
#5 d2-h2      5      1
#6 d2-h3      1      4

或者

library(tidyr)
# pivot_wider() replaces the superseded spread() (requires tidyr >= 1.0.0).
# id_cols = Time keeps `day` out of the result, so the output contains only
# Time, state1 and state2.
pivot_wider(df, id_cols = Time, names_from = State, values_from = Vote)

要用dplyr得到相同的输出(基于所提供的示例):

library(dplyr)
# For every Time value, total the votes of each state into its own column.
summarise(
  group_by(df, Time),
  state1 = sum(Vote[State == "state1"]),
  state2 = sum(Vote[State == "state2"])
)
#   Time state1 state2
#1 d1-h1      5      5
#2 d1-h2      3      3
#3 d1-h3      8      8
#4 d2-h1      4      3
#5 d2-h2      5      1
#6 d2-h3      1      4

更新

假设有多个'value'列,

# Fix the RNG seed so the extra value column is reproducible.
set.seed(22)
# Draw one value per row of df instead of hard-coding the row count (12).
df$Vote2 <- sample(1:10, nrow(df), replace = TRUE)

然后,我们可以使用data.table的开发版(即'v1.9.5'),其dcast支持同时指定多个'value.var'列。该开发版可以从此处(原文为链接)安装。
 library(data.table)
 # Cast a data.table copy of df. setDT(df) would convert df itself by
 # reference, after which data.frame-style column indexing such as df[-2]
 # would drop a row instead of a column.
 dcast(as.data.table(df), Time ~ State, value.var = c("Vote", "Vote2"))
 # NOTE(review): the output below is from the devel version mentioned in the
 # text; released data.table versions appear to name these columns
 # <value.var>_<State> (e.g. Vote_state1) — confirm against your version.
 #    Time state1_Vote state2_Vote state1_Vote2 state2_Vote2
 #1: d1-h1           5           5            4            7
 #2: d1-h2           3           3            5            8
 #3: d1-h3           8           8           10            5
 #4: d2-h1           4           3            6            4
 #5: d2-h2           5           1            9           10
 #6: d2-h3           1           4            8            7

或使用base R中的reshape
 # Drop `day` by name rather than by position. as.data.frame() keeps the
 # single-bracket column subsetting valid even if df was previously turned
 # into a data.table (where df[-2] would remove row 2 instead of column 2).
 reshape(as.data.frame(df)[setdiff(names(df), "day")],
         idvar = "Time", timevar = "State", direction = "wide")
 #    Time Vote.state1 Vote2.state1 Vote.state2 Vote2.state2
 #1 d1-h1           5            4           5            7
 #2 d1-h2           3            5           3            8
 #3 d1-h3           8           10           8            5
 #4 d2-h1           4            6           3            4
 #5 d2-h2           5            9           1           10
 #6 d2-h3           1            8           4            7

数据

# Example data: six time slots (three hours on each of two days) recorded
# for two states, in long format.
time_slots <- c("d1-h1", "d1-h2", "d1-h3", "d2-h1", "d2-h2", "d2-h3")
df <- data.frame(
  Time = rep(time_slots, times = 2),
  day = rep(rep(1:2, each = 3), times = 2),
  State = rep(c("state1", "state2"), each = 6),
  Vote = c(5L, 3L, 8L, 4L, 5L, 1L, 5L, 3L, 8L, 3L, 1L, 4L),
  stringsAsFactors = FALSE
)
rm(time_slots)