使用空因子聚合但保留行

时间:2016-08-11 13:49:26

标签: r aggregate

我之前在使用by()时遇到过类似的问题,当时我接受了必须手动替换结果中NA的事实。现在我想对我的data.frame进行聚合并保留其结构。例如,我较大的数据集包含100个国家 × 10年 × 5个细分段的因子组合,因此聚合后应得到5000行。但有时某些细分段的因子水平没有对应的数据,结果就少于5000行。我一直想不明白该怎么处理……

我的MWE仍然适用:

# All 3 categories are used.
# Note: the original calls also passed `ordered_results = True`. That name
# matches no argument of cut() (the real one is `ordered_result`), so it was
# silently ignored and the factors were unordered. It is dropped here, which
# leaves the result unchanged; pass `ordered_result = TRUE` if an ordered
# factor is actually wanted.
df1 <- data.frame(
  val = rep(1:4, 3),
  factor = cut(rep(1:4, 3), breaks = c(1, 2, 3, 4), include.lowest = TRUE,
               labels = LETTERS[1:3])
)
# Third category is not used: level "C" exists but no value falls into it.
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(rep(1:3, 4), breaks = c(1, 2, 3, 4), include.lowest = TRUE,
               labels = LETTERS[1:3])
)

# df1 reduces to 3 rows as each category is used
aggregate(df1$val, list(df1$factor), sum)
# df2 reduces to 2 rows because C is empty
aggregate(df2$val, list(df2$factor), sum)
# I would like
data.frame(Group.1 = LETTERS[1:3], x = c(12, 12, 0))

  Group.1  x
1       A 12
2       B 12
3       C  0

3 个答案:

答案 0 :(得分:2)

# create dataset: level "C" of `factor` exists but is never used.
# (The original call also passed `ordered_results = True`; that name matches
# no argument of cut() and was silently ignored, so it is dropped here. Use
# `ordered_result = TRUE` if an ordered factor is wanted.)
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(rep(1:3, 4), breaks = c(1, 2, 3, 4), include.lowest = TRUE,
               labels = LETTERS[1:3])
)

library(dplyr)

levels(df2$factor) %>%              # every level of the factor, used or not
  data.frame(factor = .) %>%        # one row per level
  left_join(
    df2 %>%
      group_by(factor) %>%          # only the levels that occur in the data
      summarise(x = sum(val)),      # sum column val within each of them
    by = "factor"
  ) %>%
  mutate(x = coalesce(x, 0L))       # levels without any rows get 0, not NA

#   factor  x
# 1      A 12
# 2      B 12
# 3      C  0

或者不使用任何包:

# Base R only: left-join the aggregate onto the full list of factor levels so
# that levels which never occur in df2 still get a row (x is NA after merge).
dd <- merge(
  data.frame(Group.1 = levels(df2$factor)),
  aggregate(df2$val, list(df2$factor), sum),
  by = "Group.1",
  all.x = TRUE
)
dd$x[is.na(dd$x)] <- 0  # levels absent from the data sum to 0
dd

#   Group.1  x
# 1       A 12
# 2       B 12
# 3       C  0

或使用data.table包来检查它是否更快

library(data.table)

# assuming you start with a data frame
# (The original call also passed `ordered_results = True`; that name matches
# no argument of cut() and was silently ignored, so it is dropped here. Use
# `ordered_result = TRUE` if an ordered factor is wanted.)
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(rep(1:3, 4), breaks = c(1, 2, 3, 4), include.lowest = TRUE,
               labels = LETTERS[1:3])
)

# create a data table with all levels of the variable "factor" (used or not)
# and an index (key) on that variable
dt_levels <- data.table(factor = levels(df2$factor), key = "factor")

# make df2 a data table with an index on column "factor" and aggregate
dt_sum <- setDT(df2, key = "factor")[, list(Sum = sum(val)), by = "factor"]

# left join the sums onto the full list of levels; `on` is spelled out so the
# join does not depend on which keys happen to be set on either table
dt_result <- dt_sum[dt_levels, on = "factor"]

# replace NA values (levels with no rows) with 0s, updating only those rows
dt_result[is.na(Sum), Sum := 0L]

dt_result[]

#    factor Sum
# 1:      A  12
# 2:      B  12
# 3:      C   0         

答案 1 :(得分:1)

您可以使用tidyr中的complete函数,使缺失值在结果中显式显示:

library(dplyr)
library(tidyr)

# Sum `val` per level of `factor` that occurs in df2, then let complete() add
# a row for every level missing from that summary, with x filled in as 0.
complete(
  summarise(group_by(df2, factor), x = sum(val)),
  factor,
  fill = list(x = 0)
)

# Source: local data frame [3 x 2]

#   factor     x
#   <fctr> <dbl>
# 1      A    12
# 2      B    12
# 3      C     0

使用aggregate函数:

# Same idea starting from aggregate(): complete() adds a row for every level of
# Group.1 that the aggregate left out, with x filled in as 0.
tidyr::complete(
  aggregate(x = df2$val, by = list(df2$factor), FUN = sum),
  Group.1,
  fill = list(x = 0)
)

# Source: local data frame [3 x 2]

#  Group.1     x
#   <fctr> <dbl>
#1       A    12
#2       B    12
#3       C     0

答案 2 :(得分:1)

这个方法非常基础:我新建了一个包含2列的数据框,一列是每个因子水平,另一列全为0。然后我用rbind把这个新数据框和df2合并在一起,再使用aggregate进行聚合。

# Example data: level "C" of `factor` exists but is never used.
# (The original call also passed `ordered_results = True`; that name matches
# no argument of cut() and was silently ignored, so it is dropped here. Use
# `ordered_result = TRUE` if an ordered factor is wanted.)
df2 <- data.frame(
  val = rep(1:3, 4),
  factor = cut(rep(1:3, 4), breaks = c(1, 2, 3, 4), include.lowest = TRUE,
               labels = LETTERS[1:3])
)

# One padding row per factor level with val = 0: it guarantees every level
# shows up in the aggregate without changing any of the sums.
dat <- data.frame(val = 0, factor = levels(df2$factor))

df3 <- rbind(df2, dat)

aggregate(. ~ factor, data = df3, FUN = sum)

#  factor val
#1      A  12
#2      B  12
#3      C   0