我想按组计算差异。虽然我参考了SO上的帖子《R: Function “diff” over various groups》,但由于未知原因,我仍然无法求出差异。我尝试了三种方法:a)spread;b)dplyr::mutate 配合 base::diff();c)data.table 配合 base::diff()。虽然a)有效,但我不确定如何使用b)和c)来解决这个问题。
有关数据的说明:
我有各产品按年份统计的收入数据。我将年份 >= 2013 的数据归为第2期(称为P2),年份 < 2013 的数据归为第1期(称为P1)。
示例数据:
# Sample input, shown as the output of dput(): a 14-row data.frame with the
# columns Ship_Date (year), Name (product) and Revenue.
dput(Test_File)
structure(list(Ship_Date = c(2010, 2010, 2012, 2012, 2012, 2012,
2017, 2017, 2017, 2016, 2016, 2016, 2011, 2017), Name = c("Apple",
"Apple", "Banana", "Banana", "Banana", "Banana", "Apple", "Apple",
"Apple", "Banana", "Banana", "Banana", "Mango", "Pineapple"),
Revenue = c(5, 10, 13, 14, 15, 16, 25, 25, 25, 1, 2, 4, 5,
7)), .Names = c("Ship_Date", "Name", "Revenue"), row.names = c(NA,
14L), class = "data.frame")
预期输出
# Expected result, shown as the output of dput(): one row per product with the
# P1 total, the P2 total and Diff. In this data Diff equals P2 - P1 and is NA
# whenever either period is missing (Mango has no P2, Pineapple has no P1).
dput(Diff_Table)
structure(list(Name = c("Apple", "Banana", "Mango", "Pineapple"
), P1 = c(15, 58, 5, NA), P2 = c(75, 7, NA, 7), Diff = c(60,
-51, NA, NA)), .Names = c("Name", "P1", "P2", "Diff"), class = "data.frame", row.names = c(NA,
-4L))
这是我的代码:
方法1:使用spread [有效]
# Method 1: label every row with its period, total the revenue per
# (period, product), spread the two periods into columns and subtract.
# NOTE: `%>%` must be on the search path (e.g. via library(dplyr)).
data.table::setDT(Test_File)
cutoff <- 2013
# Years at or after the cutoff belong to "P2", earlier years to "P1".
# `:=` adds the column by reference, which is the data.table idiom once
# Test_File has been converted with setDT().
Test_File[, Ship_Period := ifelse(Ship_Date >= cutoff, "P2", "P1")]
Diff_Table <- Test_File %>%
  dplyr::group_by(Ship_Period, Name) %>%
  # summarise() collapses each group to a single row; it replaces the earlier
  # mutate() + select() + distinct() sequence that produced the same rows.
  dplyr::summarise(Revenue = sum(Revenue)) %>%
  dplyr::ungroup() %>%
  # One column per period; a product missing from a period gets NA there.
  tidyr::spread(key = Ship_Period, value = Revenue) %>%
  dplyr::mutate(Diff = P2 - P1)
方法2:使用dplyr
[不起作用:在Diff
列中生成新内容。]
# Method 2 (reported as not working): total the revenue per
# (Ship_Period, Name), reduce to one row per group, then call diff() per group.
Diff_Table<- Test_File %>%
dplyr::group_by(Ship_Period,Name) %>%
dplyr::mutate(Revenue = sum(Revenue)) %>%
dplyr::select(Ship_Period, Name,Revenue) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
dplyr::arrange(Name,Ship_Period, Revenue) %>%
# NOTE(review): after distinct() there is a single row per (Ship_Period, Name),
# so regrouping by both keys hands diff() a length-1 vector, for which it
# returns a zero-length result. Grouping by Name alone would place the P1 and
# P2 totals of a product in the same group.
dplyr::group_by(Ship_Period,Name) %>%
dplyr::mutate(Diff = diff(Revenue))
方法3:使用data.table [不起作用:它会在Diff列中生成全部为零的结果。]
# Method 3 (reported as producing all zeros): Revenue1 holds the group total,
# repeated on every row of its (Ship_Period, Name) group.
Test_File[,Revenue1 := sum(Revenue),by=c("Ship_Period","Name")]
# NOTE(review): within one (Ship_Period, Name) group every Revenue1 value is
# the same total, so consecutive differences are zero; the P1 and P2 totals of
# a product never share a group here.
Diff_Table<-Test_File[,.(Diff = diff(Revenue1)),by=c("Ship_Period","Name")]
问题:有人可以帮我修正上面的方法2和方法3吗?我刚接触R不久,如果我的问题显得太基础,还请见谅。我还在学习这门语言。
答案 0 :(得分:3)
我们可以使用data.table来完成此操作。将'data.frame'转换为'data.table'(setDT(Test_File)),按“Name”以及“Name”的游程长度ID(rleid)分组,计算“Revenue”的sum,使用dcast将其重塑为“宽”格式,求出“P2”与“P1”之间的差值,并将其赋值(:=)给“Diff”列。
library(data.table)
# Total revenue per product and period, then one column per period.
# grp is the period index: 1 for years before 2013 (P1), 2 for 2013 onwards
# (P2). It is derived from Ship_Date rather than from the order in which the
# rows appear, so a product that occurs in only one period (e.g. Pineapple,
# 2017 only) lands in the correct column instead of always being labelled P1.
dcast(setDT(Test_File)[, .(Revenue = sum(Revenue)),
    .(grp = ifelse(Ship_Date >= 2013, 2L, 1L), Name)], Name ~ paste0("P", grp),
    value.var = "Revenue")[, Diff := P2 - P1][]
#         Name P1 P2 Diff
#1:     Apple 15 75   60
#2:    Banana 58  7  -51
#3:     Mango  5 NA   NA
#4: Pineapple NA  7   NA
或者采用第三种方式,即base R:我们根据“Name”中相邻元素是否相同创建分组列('grp'),然后按'Name'和'grp'对'Revenue'执行aggregate以求出sum,创建一个序列列,用reshape将其转换为'wide'格式,再用transform在数据集中创建'Diff'列。
# grp is the period index: 1 for years before 2013 (P1), 2 for 2013 onwards
# (P2). Taking it from Ship_Date instead of from runs of equal names keeps a
# product that occurs in only one period in the correct column.
Test_File$grp <- with(Test_File, ifelse(Ship_Date >= 2013, 2L, 1L))
# Total revenue per product and period.
d1 <- aggregate(Revenue ~ Name + grp, Test_File, sum)
# Seq becomes the suffix of the wide columns (Revenue.1 = P1, Revenue.2 = P2).
d1$Seq <- d1$grp
# Drop grp (column 2), go wide with one row per product, and subtract; a
# product missing from a period has NA there and therefore an NA Diff.
transform(reshape(d1[-2], idvar = "Name", timevar = "Seq",
    direction = "wide"), Diff = Revenue.2 - Revenue.1)
也可以使用tidyverse方法
library(dplyr)
library(tidyr)
Test_File %>%
  # grp is the period index: 1 for years before 2013 (P1), 2 for 2013 onwards
  # (P2). It comes from Ship_Date, not from row order, so a product present in
  # only one period is not mislabelled as P1.
  group_by(grp = ifelse(Ship_Date >= 2013, 2L, 1L), Name) %>%
  summarise(Revenue = sum(Revenue)) %>%
  group_by(Name) %>%
  # Column label for the wide layout.
  mutate(Seq = paste0("P", grp)) %>%
  select(-grp) %>%
  spread(Seq, Revenue) %>%
  mutate(Diff = P2 - P1)
#Source: local data frame [4 x 4]
#Groups: Name [4]
#       Name    P1    P2  Diff
#      <chr> <dbl> <dbl> <dbl>
#1     Apple    15    75    60
#2    Banana    58     7   -51
#3     Mango     5    NA    NA
#4 Pineapple    NA     7    NA
根据OP的评论,仅使用diff函数
library(data.table)
# Total revenue per product and period (grp: 1 = before 2013, 2 = 2013
# onwards), then pick both period totals per product and take their diff().
# Looking the totals up by grp, not by position, keeps a product that occurs
# in only one period in the correct column.
setDT(Test_File)[, .(Revenue = sum(Revenue)),
    .(Name, grp = ifelse(Ship_Date >= 2013, 2L, 1L))
  ][, {
    # Totals for period 1 and period 2; NA where the product has no rows.
    by_period <- Revenue[match(1:2, grp)]
    .(P1 = by_period[1L], P2 = by_period[2L], Diff = diff(by_period))
  }, Name]
#         Name P1 P2 Diff
#1:     Apple 15 75   60
#2:    Banana 58  7  -51
#3:     Mango  5 NA   NA
#4: Pineapple NA  7   NA
或dplyr
Test_File %>%
  # grp is the period index: 1 for years before 2013 (P1), 2 for 2013 onwards.
  group_by(grp = ifelse(Ship_Date >= 2013, 2L, 1L), Name) %>%
  summarise(Revenue = sum(Revenue)) %>%
  group_by(Name) %>%
  # Look each period up by grp instead of using first()/last(): for a product
  # with a single period those two return the same value (Diff would be 0),
  # whereas match() yields NA for the missing period.
  summarise(
    P1 = Revenue[match(1L, grp)],
    P2 = Revenue[match(2L, grp)]
  ) %>%
  mutate(Diff = P2 - P1)
答案 1 :(得分:2)
这样做:
library("data.table")
setDT(Test_File)
# Revenue per (year, product). NOTE(review): the name `T` masks R's `T`
# shorthand for TRUE; it is kept because the later snippets refer to it.
T <- Test_File[, .(P=sum(Revenue)),by=.(Ship_Date, Name)]
# Collapse the yearly sums to one row per product within each period, so a
# product with several years in the same period still yields a single row
# in the join below.
p2 <- T[Ship_Date >= 2013, .(P = sum(P)), by = Name]
p1 <- T[Ship_Date < 2013, .(P = sum(P)), by = Name]
# Join both periods onto the full list of product names so that a product
# missing from one period shows up with NA instead of being dropped.
all_names <- CJ(Name = T$Name, unique = TRUE)
p2[p1[all_names, on = "Name"], on = "Name"][, `:=`(P1 = i.P, P2 = P, Diff = P - i.P)][]
#         Name  P i.P P1 P2 Diff
# 1:     Apple 75  15 15 75   60
# 2:    Banana  7  58 58  7  -51
# 3:     Mango NA   5  5 NA   NA
# 4: Pineapple  7  NA NA  7   NA
或只有想要的列:
# Per-period totals per product; summing the yearly rows of `T` first keeps
# one row per product even when a period spans several years.
p2 <- T[Ship_Date >= 2013, .(P = sum(P)), by = Name]
p1 <- T[Ship_Date < 2013, .(P = sum(P)), by = Name]
# Join onto every product name (missing periods become NA) and keep only the
# wanted columns.
p2[p1[CJ(Name = T$Name, unique = TRUE), on = "Name"], on = "Name"][
  , .(Name, P1 = i.P, P2 = P, Diff = P - i.P)]
#         Name P1 P2 Diff
# 1:     Apple 15 75   60
# 2:    Banana 58  7  -51
# 3:     Mango  5 NA   NA
# 4: Pineapple NA  7   NA
以下是使用setnames()的变体:
# Per-period totals per product; summing the yearly rows of `T` first keeps
# one row per product even when a period spans several years.
p2 <- T[Ship_Date >= 2013, .(P = sum(P)), by = Name]
p1 <- T[Ship_Date < 2013, .(P = sum(P)), by = Name]
# Join onto every product name, rename the joined totals to P2/P1 and add Diff.
setnames(
  p2[p1[CJ(Name = T$Name, unique = TRUE), on = "Name"], on = "Name"],
  c("P", "i.P"), c("P2", "P1")
)[, Diff := P2 - P1][]