修改所选变量的dplyr输出

时间:2015-07-23 07:36:22

标签: r dataframe dplyr

我需要按cat1分组,计算varb中每个变量的频数、均值和标准差(sd)。以下代码可以按预期正常工作。

# Simulated data set: three categorical and four continuous columns.
# The seed is fixed so the draws below are reproducible.
set.seed(33)
df <- data.frame(
  cat1 = sample(LETTERS[1:2], 100, replace = TRUE),
  cat2 = sample(LETTERS[3:5], 100, replace = TRUE),
  cat3 = sample(LETTERS[2:4], 100, replace = TRUE),
  con1 = runif(100, 0, 100),
  con2 = runif(100, 23, 45),
  con3 = runif(100, 55, 100),
  con4 = runif(100, 12, 49)
)

# Introduce missing values at fixed row positions
# (con2 and con4 share the same rows).
df[c(23, 53, 92), "con1"] <- NA
df[c(33, 46), c("con2", "con4")] <- NA
df[c(59, 72), "con3"] <- NA

# Names of the continuous columns to summarise
varb <- paste0("con", 1:4)

# Calculating the stats
# dplyr supplies %>%, group_by(), summarise_each(), funs() and one_of();
# it has to be attached before the pipeline below can run.
library(dplyr)

# Within each level of cat1, compute for every column named in varb:
#   count - number of non-missing values
#   mean  - mean, ignoring NA
#   sd    - standard deviation, ignoring NA
# Output columns are named <column>_<function>, e.g. con1_count.
results <- df %>%
  group_by(cat1) %>%
  summarise_each(
    funs(
      count = sum(!is.na(.)),
      mean(., na.rm = TRUE),
      sd(., na.rm = TRUE)
    ),
    one_of(varb)
  )
#Output    
> results
    Source: local data frame [2 x 13]

      cat1 con1_count con2_count con3_count con4_count con1_mean con2_mean con3_mean con4_mean  con1_sd  con2_sd  con3_sd  con4_sd
    1    A         50         51         50         51   45.0175  34.24199  77.95732  30.84059 29.41333 6.327222 13.13494 11.53573
    2    B         47         47         48         47   46.6868  33.30931  81.14928  28.14640 25.80322 6.909720 11.93321 10.04867

除了上面的汇总之外,我还想对子集数据 df[which(df$cat2 == 'D'),] 中 varb2 的每个变量,按 cat1 分组应用相同的函数。下面的代码就是这样做的。

# Selecting the variables of interest
varb2 <- c("con2","con4")

# Calculating the stats
results2 <- df[which(df$cat2 == 'D'),] %>% group_by(cat1) %>% summarise_each(funs(count=sum(!is.na(.)),
                                                              mean(., na.rm = TRUE),
                                                              sd(., na.rm = TRUE)),
                                                         one_of(varb2))
# Output
results2
    Source: local data frame [2 x 7]

      cat1 con2_count con4_count con2_mean con4_mean  con2_sd   con4_sd
    1    A         13         13  36.08892  30.28429 7.172574 14.308223
    2    B         13         13  31.83272  28.24502 6.497423  8.763573

我想用新输出 results2 中 varb2 各变量的结果,来修改第一个输出 results 中对应的列。能否给出修改输出的建议,而不是把 varb2 从 results 的计算任务中删除?

1 个答案:

答案 0 :(得分:1)

创建一个新的数据框,并把其中您不希望参与汇总的值替换为NA。

# Work on a copy so the original df stays untouched.
df2 <- df
# In rows where cat2 is not "D", blank out the varb2 columns so those
# observations drop out of the NA-aware summaries below.
df2[df2$cat2 != "D", varb2] <- NA
# Same per-cat1 summary (count / mean / sd) over all varb columns,
# written as nested calls instead of a pipe.
results3 <- summarise_each(
  group_by(df2, cat1),
  funs(
    count = sum(!is.na(.)),
    mean(., na.rm = TRUE),
    sd(., na.rm = TRUE)
  ),
  one_of(varb)
)

您也可以先把数据重塑为长格式,再进行汇总。

library(reshape2)

# Long-format approach: melt the con* columns into variable/value pairs,
# drop the rows that must not contribute, then summarise per
# (cat1, variable).
results4 <- df %>%
  # id.vars is spelled out in full rather than relying on partial
  # matching of `id`.
  melt(id.vars = c("cat1", "cat2", "cat3")) %>%
  # con1 and con3 can have any value in cat2.
  # For con2 and con4 (varb2) only the rows with cat2 == "D" are kept.
  filter(!(variable %in% varb2) | cat2 == "D") %>%
  group_by(cat1, variable) %>%
  summarise(
    count = sum(!is.na(value)),
    mean = mean(value, na.rm = TRUE),
    sd = sd(value, na.rm = TRUE)
  )

# Which you could always reshape again: back to one row per cat1 with
# one column per <con>_<statistic> combination.
results4 %>%
  # Free the name "variable", because melt() creates a new column with
  # that name for the statistic labels (count / mean / sd).
  rename(con = variable) %>%
  melt(id.vars = c("cat1", "con")) %>%
  # Name the value column explicitly instead of letting dcast() guess it.
  dcast(cat1 ~ con + variable, value.var = "value")

tidyr包也可以重塑数据(我还没有习惯),请参阅R Studio data wrangling cheatsheet