我有一张表格如下
# Example input: one row per product with its lower and upper bound.
# The column vectors are named `min` and `max` (shadowing the base functions
# of the same name) because later code refers to dd$min and dd$max.
product <- c("a", "b", "c")
min <- c(1, 5, 3)
max <- c(1, 7, 7)
dd <- data.frame(product, min, max)
> dd
product min max
1 a 1 1
2 b 5 7
3 c 3 7
我想创建一个如下所示的表格:对每个产品,为其最小值到最大值之间(含最小值和最大值)的每个整数各生成一行
product mm
a 1
b 5
b 6
b 7
c 3
c 4
c 5
c 6
c 7
如何用R实现?有没有能快速得出结果的方法?
答案 0 :(得分:5)
尝试
# Expand every product's min..max range into one row per value.
library(data.table)
# setDT() converts dd to a data.table by reference; .() is data.table's
# shorthand for list() inside j, and `by` may be given as a column name string.
setDT(dd)[, .(mm = min:max), by = "product"]
# product mm
#1: a 1
#2: b 5
#3: b 6
#4: b 7
#5: c 3
#6: c 4
#7: c 5
#8: c 6
#9: c 7
或者,按照 @David Arenburg 的建议,改用更快的 seq.int(min, max, 1L):
# Same expansion as min:max, but built with seq.int() and an explicit
# integer step of 1.
setDT(dd)[, .(mm = seq.int(min, max, by = 1L)), by = "product"]
# Benchmark setup: build a larger data set for timing the answers below.
library(stringi)
# Fix the RNG seed so the generated data are reproducible.
set.seed(24)
# Random product codes; unique() drops duplicates so each product labels
# exactly one row.
# NOTE(review): presumably stri_rand_strings(1e5, 4) yields 1e5 random
# 4-character strings -- confirm against the stringi documentation.
product <- unique(stri_rand_strings(1e5,4))
# Lower bounds are drawn from 1..10 and upper bounds from 11..15, so
# min1 < max1 holds for every row.
min1 <- sample(1:10, length(product), replace=TRUE)
max1 <- sample(11:15, length(product), replace=TRUE)
dd <- data.frame(product, min1, max1)
# dd2 is the input read by akrun(); dd is read by the other benchmark functions.
# NOTE(review): copy() is presumably data.table::copy (data.table is loaded
# earlier in this text) -- confirm.
dd2 <- copy(dd)
# Base R approach (mapply + rep), wrapped in a function for benchmarking.
#
# Reads the global data frame `dd` (columns product, min1, max1) and expands
# each row into one row per value of seq(min1, max1).
#
# Returns a data frame with columns `product` and `mm`. The value is returned
# invisibly because the last expression is an assignment.
josilber <- function() {
  # SIMPLIFY = FALSE guarantees a list. With the default, mapply() collapses
  # equal-length sequences into a matrix, unlist() leaves that matrix
  # untouched, and data.frame() then fails on mismatched row counts.
  ranges <- mapply(seq, dd$min1, dd$max1, SIMPLIFY = FALSE)
  # Repeat each product once per generated value so both columns always
  # have matching lengths.
  res1 <- data.frame(product = rep(dd$product, lengths(ranges)),
                     mm = unlist(ranges))
}
# data.table approach, wrapped in a function for benchmarking.
#
# Converts the global `dd2` to a data.table and, for each product, produces
# the sequence from min1 to max1 in integer steps of 1 as column `mm`.
akrun <- function() {
  dt <- as.data.table(dd2)
  dt[, .(mm = seq.int(min1, max1, by = 1L)), by = "product"]
}
# split + lapply + stack approach, wrapped in a function for benchmarking.
#
# Reads the global data frame `dd`: splits the two bound columns by the first
# (product) column, builds one sequence per product, and stacks the sequences
# into a data frame with columns `values` and `ind`.
Ananda <- function() {
  bounds_by_product <- split(dd[-1], dd[1])
  expand_bounds <- function(bounds) seq(bounds[[1]], bounds[[2]])
  stack(lapply(bounds_by_product, expand_bounds))
}
# by()-based base R approach, wrapped in a function for benchmarking.
#
# Reads the global data frame `dd` (columns product, min1, max1), builds
# seq(min1, max1) for each product, and returns a data frame with columns
# `product` and `mm`. Rows are ordered by product group, as returned by by().
jiber <- function() {
  # simplify = FALSE keeps the result a list even when every sequence has
  # length one.
  seqs <- by(dd[, -1], dd[, 1], function(x) seq(x$min1, x$max1),
             simplify = FALSE)
  counts <- vapply(seqs, length, integer(1))
  # Build the product column from the group names rather than by deleting
  # digits from the unlist()-generated row names: stripping "[0-9]" also
  # removes digits that belong to the product name itself.
  data.frame(product = rep(names(seqs), counts),
             mm = unlist(seqs, use.names = FALSE))
}
system.time(akrun())
# user system elapsed
# 0.129 0.001 0.129
system.time(josilber())
# user system elapsed
# 0.762 0.002 0.764
system.time(Ananda())
# user system elapsed
#45.449 0.191 45.636
system.time(jiber())
# user system elapsed
# 48.013 8.218 56.291
library(microbenchmark)
microbenchmark(josilber(), akrun(), times=20L, unit='relative')
#Unit: relative
# expr min lq mean median uq max neval cld
#josilber() 6.39757 6.713236 5.570836 5.901037 5.603639 3.970663 20 b
# akrun() 1.00000 1.000000 1.000000 1.000000 1.000000 1.000000 20 a
答案 1 :(得分:3)
使用基础R(base R),您可以执行以下操作:
# Expand each product into one row per value between its min and max.
# SIMPLIFY = FALSE keeps mapply() from collapsing equal-length sequences into
# a matrix, which unlist() would leave unchanged and data.frame() would then
# reject because the column lengths no longer match.
data.frame(product = rep(dd$product, dd$max - dd$min + 1),
           mm = unlist(mapply(seq, dd$min, dd$max, SIMPLIFY = FALSE)))
# product mm
# 1 a 1
# 2 b 5
# 3 b 6
# 4 b 7
# 5 c 3
# 6 c 4
# 7 c 5
# 8 c 6
# 9 c 7
答案 2 :(得分:3)
您还可以考虑 split + lapply + stack:
# Split the bound columns by product, build one sequence per product, and
# stack the named list into a two-column data frame (values, ind).
stack(
  lapply(
    split(dd[-1], dd[1]),
    function(bounds) seq(bounds[[1]], bounds[[2]])
  )
)
## values ind
## 1 1 a
## 2 5 b
## 3 6 b
## 4 7 b
## 5 3 c
## 6 4 c
## 7 5 c
## 8 6 c
## 9 7 c
答案 3 :(得分:1)
使用R基本函数的另一种方法
> res <- by(dd[,-1], dd[,1], function(x) seq(x$min, x$max) )
> res <- as.data.frame(unlist(res))
> data.frame(product=gsub("[0-9]", "", rownames(res)), mm=res[,1])
product mm
1 a 1
2 b 5
3 b 6
4 b 7
5 c 3
6 c 4
7 c 5
8 c 6
9 c 7