按多个分组计算数据框中某一列的差值

时间:2016-05-04 14:41:35

标签: r dataframe aggregate plyr

在 R 中,我想用 value 列的总和(按 't2' 列中的某个字母分组)减去同一 value 列的总和(按 't1' 列中的相同字母分组)。对每个字母和每个 age 组重复此过程。

考虑;

# Build the reproducible example data: 3 age groups, each holding every
# ordered pair of the letters A-E (25 pairs) with a random value in 1..10.
set.seed(3)
# All 25 (t1, t2) letter combinations; t1 varies fastest.
letter_pairs <- expand.grid(t1 = LETTERS[1:5], t2 = LETTERS[1:5])
df <- data.frame(
  age = rep(1:3, each = 25),
  t1 = rep(letter_pairs$t1, 3),
  t2 = rep(letter_pairs$t2, 3),
  # Spell out TRUE: the shorthand T is an ordinary variable that can be
  # reassigned.
  value = sample(1:10, 75, replace = TRUE)
)

此数据框的 'age' 列中有 3 个取值,另有 2 个类别列(t1 和 t2)以及一个关联的数值列(value)。

例如,以下是针对 'A' 的做法:

library(plyr)

# Worked example for the letter "A": per age group, the sum of `value` where
# t2 is "A" minus the sum of `value` where t1 is "A".

# Keep only rows where "A" appears in either category column.
df2 <- df[df$t1 == "A" | df$t2 == "A", ]
# Drop rows where t1 and t2 are the same letter (not needed: such a row would
# add the same amount to both sums).
df2 <- df2[df2$t1 != df2$t2, ]
# Collapse to exactly one row per age. Using `summarise` here (rather than
# `transform` followed by de-duplicating on `change`) keeps every age group
# even when two ages happen to share the same change value.
df2 <- ddply(df2, .(age), summarise,
             change = sum(value[t2 == "A"]) - sum(value[t1 == "A"]))
# Label the summary with the letter it describes.
df2$cat <- "A"
# Keep the summary columns, selected by name rather than by position.
df2 <- df2[, c("age", "cat", "change")]

现在我需要对 t1 和 t2 中出现的所有值(在本例中为 A、B、C、D 和 E)执行此操作,生成一个 15 行的摘要(5 个字母 × 3 个 age 组)。

我尝试了一个循环;

for (c in as.character(unique(df$t1)))

但没有任何进展。

非常感谢

2 个答案:

答案 0 :(得分:2)

这是一个涉及聚合和合并的基本R解决方案:

# Total `value` per (letter, age), once keyed on t1 and once keyed on t2.
t1Agg <- aggregate(value ~ t1 + age, data = df, FUN = sum)
t2Agg <- aggregate(value ~ t2 + age, data = df, FUN = sum)

# Join the two summaries on age and letter. The suffixes name the two `value`
# columns after the category column each one was summed over.
aggData <- merge(
  t1Agg, t2Agg,
  by.x = c("age", "t1"),
  by.y = c("age", "t2"),
  suffixes = c(".t1", ".t2")
)
# The key column now holds the shared letter, so give it a neutral name.
names(aggData)[names(aggData) == "t1"] <- "t"

# Difference per letter and age: t1 total minus t2 total.
aggData$diff <- with(aggData, value.t1 - value.t2)

答案 1 :(得分:1)

我建议您先将数据整理为整洁(长)格式,然后在 summarise 之后使用 spread 展开,最后添加新列:

# Make reproducible: 3 age groups, each holding every ordered pair of the
# letters A-E (25 pairs) with a random value in 1..10.
set.seed(4)
# All 25 (t1, t2) letter combinations; t1 varies fastest.
letter_pairs <- expand.grid(t1 = LETTERS[1:5], t2 = LETTERS[1:5])
df <- data.frame(
  age = rep(1:3, each = 25),
  t1 = rep(letter_pairs$t1, 3),
  t2 = rep(letter_pairs$t2, 3),
  # Spell out TRUE: the shorthand T is an ordinary variable that can be
  # reassigned.
  value = sample(1:10, 75, replace = TRUE)
)

library(tidyr)
library(dplyr)

# Reshape to long form: each original row becomes two rows, one per category
# column, with the column name in `t_var` and its letter in `t_val`.
# `pivot_longer()` replaces the superseded `gather()`.
df_tidy <- pivot_longer(
  df,
  c(t1, t2),
  names_to = "t_var",
  values_to = "t_val"
)
# Peek at three random rows. The listing below is the example draw shown in
# the original post; a fresh run will pick different rows.
slice_sample(df_tidy, n = 3)
#      age value t_var t_val
#  104   2     6    t2     A
#  48    2     9    t1     C
#  66    3     7    t1     A

# Sum `value` per age, category column and letter, spread the two category
# columns side by side, then take t1 minus t2 for each letter and age.
# `.groups = "drop"` returns an ungrouped result so no grouping leaks into
# later steps; `pivot_wider()` replaces the superseded `spread()`.
df_tidy %>%
  group_by(age, t_var, t_val) %>%
  summarise(val_sum = sum(value), .groups = "drop") %>%
  pivot_wider(names_from = t_var, values_from = val_sum) %>%
  mutate(diff = t1 - t2)

#      age t_val    t1    t2  diff
#    (int) (chr) (int) (int) (int)
# 1      1     A    30    22     8
# 2      1     B    32    32     0
# 3      1     C    27    28    -1
# 4      1     D    38    39    -1
# 5      1     E    30    36    -6
# 6      2     A    36    35     1
# 7      2     B    26    30    -4
# 8      2     C    40    27    13
# 9      2     D    27    31    -4
# 10     2     E    28    34    -6
# 11     3     A    26    39   -13
# 12     3     B    19    26    -7
# 13     3     C    31    29     2
# 14     3     D    41    33     8
# 15     3     E    39    29    10