我想将数据帧从长格式转换为宽格式,并且我放弃了一些我想要保留的数据。 对于以下示例:
df <- data.frame(Par1 = unlist(strsplit("AABBCCC","")),
Par2 = unlist(strsplit("DDEEFFF","")),
ParD = unlist(strsplit("foo,bar,baz,qux,bla,xyz,meh",",")),
Type = unlist(strsplit("pre,post,pre,post,pre,post,post",",")),
Val = c(10,20,30,40,50,60,70))
# Par1 Par2 ParD Type Val
# 1 A D foo pre 10
# 2 A D bar post 20
# 3 B E baz pre 30
# 4 B E qux post 40
# 5 C F bla pre 50
# 6 C F xyz post 60
# 7 C F meh post 70
dfw <- dcast(df,
formula = Par1 + Par2 ~ Type,
value.var = "Val",
fun.aggregate = mean)
# Par1 Par2 post pre
# 1 A D 20 10
# 2 B E 40 30
# 3 C F 65 50
这几乎是我的需要,但我想拥有
ParD
字段的数据(例如,作为单个合并字符串),即。我希望得到的data.frame如下:
# Par1 Par2 post pre Num.pre Num.post ParD
# 1 A D 20 10 1 1 foo_bar
# 2 B E 40 30 1 1 baz_qux
# 3 C F 65 50 1 2 bla_xyz_meh
我会感激任何想法。例如,我尝试通过写入dcast来解决第二个任务:fun.aggregate=function(x) c(Val=mean(x),Num=length(x))
- 但这会导致错误。
答案 0 :(得分:14)
晚会,但这是使用data.table
的另一种选择:
require(data.table)
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
pre.num=length(Val[Type == "pre"]),
post.num=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
[来自Matthew] +1一些小的改进,以保存重复相同的==
,并在j
内展示局部变量。
dt[, list(pre=mean(Val[.pre <- Type=="pre"]), # save .pre
post=mean(Val[.post <- Type=="post"]), # save .post
pre.num=sum(.pre), # reuse .pre
post.num=sum(.post), # reuse .post
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
dt[, { .pre <- Type=="pre" # or save .pre and .post up front
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo_bar
# 2: B E 30 40 1 1 baz_qux
# 3: C F 50 65 1 2 bla_xyz_meh
如果list
列可以而不是paste
,那么这应该更快:
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) } # list() faster than paste()
, by=list(Par1, Par2)]
# Par1 Par2 pre post pre.num post.num ParD
# 1: A D 10 20 1 1 foo,bar
# 2: B E 30 40 1 1 baz,qux
# 3: C F 50 65 1 2 bla,xyz,meh
答案 1 :(得分:13)
使用ddply
分两步解决(我不满意,但我得到了结果)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse='_'),
Num.pre =length(x$Type[x$Type =='pre']),
Num.post = length(x$Type[x$Type =='post']))
})
merge(dfw,dat)
Par1 Par2 post pre ParD Num.pre Num.post
1 A D 2.0 1 foo_bar 1 1
2 B E 4.0 3 baz_qux 1 1
3 C F 6.5 5 bla_xyz_meh 1 2
答案 2 :(得分:6)
我会发帖但是agstudy让我感到羞耻:
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
产量:
Par1 Par2 post pre Num.post Num.pre ParD
1 A D 2.0 1 1 1 foo_bar
2 B E 4.0 3 1 1 baz_qux
3 C F 6.5 5 2 1 bla_xyz_meh
答案 3 :(得分:6)
你可以合并两个dcast和一个聚合,这里所有包装成一个大的表达式,主要是为了避免中间的物体随后徘徊:
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
答案 4 :(得分:6)
基准测试真是太棒了!
以下是plyr
方法(由@agstudy建议)与data.table
方法(由@Arun建议)的一些运行
使用不同的样本量(N = 900,2700,10800)
<强>要点:
data.table
方法优于plyr
方法7.5倍
#-------------------#
# M E T H O D S #
#-------------------#
# additional methods below, in the updates
# Method 1 -- suggested by @agstudy
plyrMethod <- quote({
dfw<-dcast(df,
formula = Par1+Par2~Type,
value.var="Val",
fun.aggregate=mean)
dat <- ddply(df,.(Par1,Par2),function(x){
data.frame(ParD=paste(paste(x$ParD),collapse='_'),
Num.pre =length(x$Type[x$Type =='pre']),
Num.post = length(x$Type[x$Type =='post']))
})
merge(dfw,dat)
})
# Method 2 -- suggested by @Arun
dtMethod <- quote(
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
# Method 3 -- suggested by @regetz
reduceMethod <- quote(
Reduce(merge, list(
dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=mean),
setNames(dcast(df, formula = Par1+Par2~Type, value.var="Val",
fun.aggregate=length), c("Par1", "Par2", "Num.post",
"Num.pre")),
aggregate(df["ParD"], df[c("Par1", "Par2")], paste, collapse="_")
))
)
# Method 4 -- suggested by @Ramnath
castddplyMethod <- quote(
reshape::cast(Par1 + Par2 + ParD ~ Type,
data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")),
fun = c(mean, length)
)
)
# SAMPLE DATA #
#-------------#
library(data.table)
library(plyr)
library(reshape2)
library(rbenchmark)
# for Par1, ParD
LLL <- apply(expand.grid(LETTERS, LETTERS, LETTERS, stringsAsFactors=FALSE), 1, paste0, collapse="")
lll <- apply(expand.grid(letters, letters, letters, stringsAsFactors=FALSE), 1, paste0, collapse="")
# max size is 17568 with current sample data setup, ie: floor(length(LLL) / 18) * 18
size <- 17568
size <- 10800
size <- 900
set.seed(1)
df<-data.frame(Par1=rep(LLL[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, Par2=rep(lll[1:(size/2)], times=rep(c(2,2,3), size)[1:(size/2)])[1:(size)]
, ParD=sample(unlist(lapply(c("f", "b"), paste0, lll)), size, FALSE)
, Type=rep(c("pre","post"), size/2)
, Val =sample(seq(10,100,10), size, TRUE)
)
dt <- data.table(df, key=c("Par1", "Par2"))
# Confirming Same Results #
#-------------------------#
# Evaluate
DF1 <- eval(plyrMethod)
DF2 <- eval(dtMethod)
# Convert to DF and sort columns and sort ParD levels, for use in identical
colOrder <- sort(names(DF1))
DF1 <- DF1[, colOrder]
DF2 <- as.data.frame(DF2)[, colOrder]
DF2$ParD <- factor(DF2$ParD, levels=levels(DF1$ParD))
identical((DF1), (DF2))
# [1] TRUE
#-------------------------#
#--------------------#
# BENCHMARK #
#--------------------#
benchmark(plyr=eval(plyrMethod), dt=eval(dtMethod), reduce=eval(reduceMethod), castddply=eval(castddplyMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 reduce 0.392 0.375 0.018 5
1.003 dt 0.393 0.377 0.016 5
7.064 plyr 2.769 2.721 0.047 5
8.003 castddply 3.137 3.030 0.106 5
# SAMPLE SIZE = 2,700
relative test elapsed user.self sys.self replications
1.000 dt 1.371 1.327 0.090 5
2.205 reduce 3.023 2.927 0.102 5
7.291 plyr 9.996 9.644 0.377 5
# SAMPLE SIZE = 10,800
relative test elapsed user.self sys.self replications
1.000 dt 8.678 7.168 1.507 5
2.769 reduce 24.029 23.231 0.786 5
6.946 plyr 60.277 52.298 7.947 5
13.796 castddply 119.719 113.333 10.816 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt 27.421 13.042 14.470 5
4.030 reduce 110.498 75.853 34.922 5
5.414 plyr 148.452 105.776 43.156 5
# Used only sample size of 90, as it was taking long
relative test elapsed user.self sys.self replications
1.000 dt 0.044 0.043 0.001 5
7.773 plyr 0.342 0.339 0.003 5
65.614 base1 2.887 2.866 0.028 5
Where
baseMethod1 <- quote({
step1 <- with(df, split(df, list(Par1, Par2)))
step2 <- step1[sapply(step1, nrow) > 0]
step3 <- lapply(step2, function(x) {
piece1 <- tapply(x$Val, x$Type, mean)
piece2 <- tapply(x$Type, x$Type, length)
names(piece2) <- paste0("Num.", names(piece2))
out <- x[1, 1:2]
out[, 3:6] <- c(piece1, piece2)
names(out)[3:6] <- names(c(piece1, piece2))
out$ParD <- paste(unique(x$ParD), collapse="_")
out
})
data.frame(do.call(rbind, step3), row.names=NULL)
})
根据@MatthewDowle的评论,将索引步骤添加到公平基准中。
但是,假设使用data.table,它将取代data.frame和
因此,索引将发生一次,而不仅仅是针对此过程
dtMethod.withkey <- quote({
dt <- data.table(df, key=c("Par1", "Par2"))
dt[, list(pre=mean(Val[Type == "pre"]),
post=mean(Val[Type == "post"]),
Num.pre=length(Val[Type == "pre"]),
Num.post=length(Val[Type == "post"]),
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
})
# SAMPLE SIZE = 10,800
relative test elapsed user.self sys.self replications
1.000 dt 9.155 7.055 2.137 5
1.043 dt.withkey 9.553 7.245 2.353 5
3.567 reduce 32.659 31.196 1.586 5
6.703 plyr 61.364 54.080 7.600 5
dtMethod.MD1 <- quote(
dt[, list(pre=mean(Val[.pre <- Type=="pre"]), # save .pre
post=mean(Val[.post <- Type=="post"]), # save .post
pre.num=sum(.pre), # reuse .pre
post.num=sum(.post), # reuse .post
ParD = paste(ParD, collapse="_")),
by=list(Par1, Par2)]
)
dtMethod.MD2 <- quote(
dt[, { .pre <- Type=="pre" # or save .pre and .post up front
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = paste(ParD, collapse="_")) }
, by=list(Par1, Par2)]
)
dtMethod.MD3 <- quote(
dt[, { .pre <- Type=="pre"
.post <- Type=="post"
list(pre=mean(Val[.pre]),
post=mean(Val[.post]),
pre.num=sum(.pre),
post.num=sum(.post),
ParD = list(ParD)) } # list() faster than paste()
, by=list(Par1, Par2)]
)
benchmark(dt.M1=eval(dtMethod.MD1), dt.M2=eval(dtMethod.MD2), dt.M3=eval(dtMethod.MD3), dt=eval(dtMethod),
replications=5, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"),
order="relative")
#--------------------#
Comparing the different data.table methods amongst themselves
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 dt.M3 0.198 0.197 0.001 5 <~~~ "list()" Method
1.242 dt.M1 0.246 0.243 0.004 5
1.253 dt.M2 0.248 0.242 0.007 5
1.884 dt 0.373 0.367 0.007 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt.M3 33.492 24.487 9.122 5 <~~~ "list()" Method
1.086 dt.M1 36.388 11.442 25.086 5
1.086 dt.M2 36.388 10.845 25.660 5
1.126 dt 37.701 13.256 24.535 5
Comparing MD3 ("list" method) with MD1 (best of DT non-list methods)
Using a clean session (ie, removing string cache)
_Note: Ran the following twice, fresh session each time, with practically identical results
Then re-ran in the *same* session, with reps=5. Results very different._
benchmark(dt.M1=eval(dtMethod.MD1), dt.M3=eval(dtMethod.MD3), replications=1, columns=c("relative", "test", "elapsed", "user.self", "sys.self", "replications"), order="relative")
# SAMPLE SIZE=17,568; CLEAN SESSION
relative test elapsed user.self sys.self replications
1.000 dt.M1 8.885 4.260 4.617 1
1.633 dt.M3 14.506 12.821 1.677 1
# SAMPLE SIZE=17,568; *SAME* SESSION
relative test elapsed user.self sys.self replications
1.000 dt.M1 33.443 10.200 23.226 5
1.048 dt.M3 35.060 26.127 8.915 5
#--------------------#
New benchmarks against previous methods
_Note: Not using the "list method" as results are not the same as other methods_
# SAMPLE SIZE = 900
relative test elapsed user.self sys.self replications
1.000 dt.M1 0.254 0.247 0.008 5
1.705 reduce 0.433 0.425 0.010 5
11.280 plyr 2.865 2.842 0.031 5
# SAMPLE SIZE = 17,568
relative test elapsed user.self sys.self replications
1.000 dt.M1 24.826 10.427 14.458 5
4.348 reduce 107.935 70.107 38.314 5
5.942 plyr 147.508 106.958 41.083 5
答案 5 :(得分:2)
将reshape::cast
与plyr::ddply
cast(Par1 + Par2 + ParD ~ Type, data = ddply(df, .(Par1, Par2), transform,
ParD = paste(ParD, collapse = "_")), fun = c(mean, length))
请注意,dcast
中的reshape2
函数不允许传递多个聚合函数,而cast
中的reshape
函数则会传递。{/ p>
答案 6 :(得分:2)
我相信这个基础R解决方案可与@ Arun的数据表解决方案相媲美。 (这并不是说我更喜欢它;那个代码更简单!)
baseMethod2 <- quote({
is <- unname(split(1:nrow(df), with(df, paste(Par1, Par2, sep="\b"))))
i1 <- sapply(is, `[`, 1)
out <- with(df, data.frame(Par1=Par1[i1], Par2=Par2[i1]))
js <- lapply(is, function(i) split(i, df$Type[i]))
out$post <- sapply(js, function(j) mean(df$Val[j$post]))
out$pre <- sapply(js, function(j) mean(df$Val[j$pre]))
out$Num.pre <- sapply(js, function(j) length(j$pre))
out$Num.post <- sapply(js, function(j) length(j$post))
out$ParD <- sapply(is, function(x) paste(df$ParD[x], collapse="_"))
out
})
分别使用@ RicardoSaporta的时序码900,2700和10,800:
> relative test elapsed user.self sys.self replications
3 1.000 baseMethod2 0.230 0.229 0 5
1 1.130 dt 0.260 0.257 0 5
2 8.752 plyr 2.013 2.006 0 5
> relative test elapsed user.self sys.self replications
3 1.000 baseMethod2 0.877 0.872 0 5
1 1.068 dt 0.937 0.934 0 5
2 8.060 plyr 7.069 7.043 0 5
> relative test elapsed user.self sys.self replications
1 1.000 dt 6.232 6.178 0.031 5
3 1.085 baseMethod2 6.763 6.683 0.054 5
2 7.263 plyr 45.261 44.983 0.104 5
答案 7 :(得分:0)
尝试将不同的聚合表达式包装到一个自包含的函数中(表达式应该产生原子值)...
multi.by <- function(X, INDEX,...) {
expressions <- substitute(...())
duplicates <- duplicated(INDEX)
res <- do.call(rbind,sapply(split(X,cumsum(!duplicates),drop=T), function(part)
sapply(expressions,eval,part,simplify=F),simplify=F))
if (is.data.frame(INDEX)) res <- cbind(INDEX[!duplicates,],res)
else rownames(res) <- INDEX[!duplicates]
res
}
multi.by(df,df[,1:2],
pre=mean(Val[Type=="pre"]),
post=mean(Val[Type=="post"]),
Num.pre=sum(Type=="pre"),
Num.post=sum(Type=="post"),
ParD=paste(ParD, collapse="_"))