我有以下输入数据框:
df <- data.frame(x=c('a','b','c'),y=c(4,5,6),from=c(1,2,3),to=c(2,4,6))
df
x y from to
1 a 4 1 2
2 b 5 2 4
3 c 6 3 6
现在,我希望将每行的时间扩展为from和to之间的值,即(&#39; a&#39; 4)跨越两行,即1,2
。预期结果如下:
exp <- data.frame(x=c('a','a','b','b','b','c','c','c','c'),
y=c(4,4,5,5,5,6,6,6,6),
z=c(1,2,2,3,4,3,4,5,6))
exp
x y z
1 a 4 1
2 a 4 2
3 b 5 2
4 b 5 3
5 b 5 4
6 c 6 3
7 c 6 4
8 c 6 5
9 c 6 6
在没有循环的情况下,最常用的方法是什么?
答案 0 :(得分:5)
一种“非tidyverse”方式:
data.frame(
x = c('a', 'b', 'c'),
y = c(4, 5, 6),
from = c(1, 2, 3),
to = c(2, 4, 6),
stringsAsFactors = FALSE
) -> xdf
do.call(rbind.data.frame, lapply(1:nrow(xdf), function(i) {
data.frame(x = xdf$x[i], y=xdf$y[i], z=xdf$from[i]:xdf$to[i], stringsAsFactors=FALSE)
}))
一种“整齐”的方式:
library(tidyverse)
data_frame(
x = c('a', 'b', 'c'),
y = c(4, 5, 6),
from = c(1, 2, 3),
to = c(2, 4, 6)
) -> xdf
rowwise(xdf) %>%
do(data_frame(x = .$x, y=.$y, z=.$from:.$to))
不的另一种“tidyverse”方式已在下面进行了基准测试:
xdf %>%
rowwise() %>%
do( merge( as_tibble(.), tibble(z=.$from:.$to), by=NULL) ) %>%
select( -from, -to ) # Omit this line if you want to keep all original columns.
因为你问了abt表现:
library(microbenchmark)
data.table::data.table(
x = c('a','b','c'),
y = c(4,5,6),
from = c(1,2,3),
to = c(2,4,6)
) -> xdt1
data.frame(
x = c('a', 'b', 'c'),
y = c(4, 5, 6),
from = c(1, 2, 3),
to = c(2, 4, 6),
stringsAsFactors = FALSE
) -> xdf1
data.table
操作通常会就地修改,因此在执行操作之前,请保持一个公平的竞争环境并制作每个数据框/表的副本。
在大多数现代系统中,时间损失约为100 纳秒。
microbenchmark(
data.table = {
xdt2 <- xdt1
xdt2[, diff:= (to - from) + 1]
xdt2 <- xdt2[rep(1:.N, diff)]
xdt2[,z := seq(from,to), by=.(x,y,from,to)]
xdt2[,c("x", "y", "z")]
},
base = {
xdf2 <- xdf1
do.call(rbind.data.frame, lapply(1:nrow(xdf2), function(i) {
data.frame(x = xdf2$x[i], y=xdf2$y[i], z=xdf2$from[i]:xdf2$to[i], stringsAsFactors=FALSE)
}))
},
tidyverse = {
xdf2 <- xdf1
dplyr::rowwise(xdf2) %>%
dplyr::do(dplyr::data_frame(x = .$x, y=.$y, z=.$from:.$to))
},
plyr = {
xdf2 <- xdf1
plyr::mdply(xdf2, function(x,y,from,to) data.frame(x,y,z=seq(from,to)))[c("x","y","z")]
},
times = 1000
)
## Unit: microseconds
## expr min lq mean median uq max neval
## data.table 920.361 1072.9265 1257.2321 1178.832 1280.2660 10628.552 1000
## base 677.069 761.3145 884.4136 825.472 915.8985 5366.515 1000
## tidyverse 15926.127 17231.5015 19201.4798 17994.919 20014.4140 166901.570 1000
## plyr 1938.838 2196.4205 2448.5314 2322.949 2501.5075 5735.255 1000
答案 1 :(得分:1)
我意识到这已经得到了解答,但是单行data.table
解决方案是:
library(data.table)
setDT(df)[,.(z = from:to), by = .(x,y)]
# x y z
#1: a 4 1
#2: a 4 2
#3: b 5 2
#4: b 5 3
#5: b 5 4
#6: c 6 3
#7: c 6 4
#8: c 6 5
#9: c 6 6
答案 2 :(得分:0)
您可以使用data.table
library(data.table)
df <- data.table(x=c('a','b','c'),y=c(4,5,6),from=c(1,2,3),to=c(2,4,6))
df <- df[, diff:= (to - from) + 1]
df <- df[rep(1:.N,diff)]
df <- df[,z := seq(from,to) , by=.(x,y,from,to)]
df
> df
x y from to diff z
1: a 4 1 2 2 1
2: a 4 1 2 2 2
3: b 5 2 4 3 2
4: b 5 2 4 3 3
5: b 5 2 4 3 4
6: c 6 3 6 4 3
7: c 6 3 6 4 4
8: c 6 3 6 4 5
9: c 6 3 6 4 6
答案 3 :(得分:0)
使用plyr
包可以使用mdply
:
library(plyr)
df <- data.frame(x=c('a','b','c'),y=c(4,5,6),from=c(1,2,3),to=c(2,4,6))
res <- mdply(df, function(x,y,from,to) data.frame(x,y,z=seq(from,to)))[c("x","y","z")]
res
x y z
1 a 4 1
2 a 4 2
3 b 5 2
4 b 5 3
5 b 5 4
6 c 6 3
7 c 6 4
8 c 6 5
9 c 6 6
因为它为每一行创建一个数据框,所以它不是超级高效的......或者?