我有一个名为 main 的数据集,如下所示:
# Lookup table: one row per name with the values to copy into s1.
main <- data.frame(
  name = factor(c("AA", "BB", "CC"), levels = c("AA", "BB", "CC")),
  val1 = c(11L, 22L, 33L),
  val2 = c(111L, 222L, 333L)
)
我还有另一个数据集 s1,如下所示:
# Table to be filled in: the key column cname is known, val1/val2 are still NA.
s1 <- data.frame(
  cname = factor(
    c("AA", "AA", "AA", "BB", "CC", "BB", "BB", "CC", "AA"),
    levels = c("AA", "BB", "CC")
  ),
  val1 = rep(NA, 9L),
  val2 = rep(NA, 9L)
)
我需要将 s1 的 cname 列与 main 数据集的 name 列进行匹配,从而填充 s1 中的 val1 和 val2 列。期望的输出如 s2 所示:
# Expected result: s1 with val1/val2 filled in from main, original row order kept.
s2 <- data.frame(
  cname = factor(
    c("AA", "AA", "AA", "BB", "CC", "BB", "BB", "CC", "AA"),
    levels = c("AA", "BB", "CC")
  ),
  val1 = c(11L, 11L, 11L, 22L, 33L, 22L, 22L, 33L, 11L),
  val2 = c(111L, 111L, 111L, 222L, 333L, 222L, 222L, 333L, 111L)
)
我可以用 for 循环来完成,但如果有数百万条记录,开销就太大了。有没有其他替代方法可以做到这一点?请帮忙。
答案 0 :(得分:1)
对于大数据集,您可以使用 data.table:
library(data.table)
# Keep only the key column of s1: these are the names to look up.
d1 <- as.data.table(s1[1])
# Turn main into a data.table and key it by name so it can be joined on.
setDT(main)
setkey(main, name)
# Keyed join: one output row per row of d1, in d1's order (see output below).
main[d1]
# name val1 val2
#1: AA 11 111
#2: AA 11 111
#3: AA 11 111
#4: BB 22 222
#5: CC 33 333
#6: BB 22 222
#7: BB 22 222
#8: CC 33 333
#9: AA 11 111
在稍大一些的数据集上进行基准测试:
# Benchmark lookup table: 5200 names (A1 ... Z200) with random integer values.
set.seed(24)
main <- data.frame(
  name = as.vector(outer(LETTERS, seq_len(200L), FUN = paste0)),
  val1 = sample(seq_len(100L), size = 5200L, replace = TRUE),
  val2 = sample(seq(20L, 150L), size = 5200L, replace = TRUE)
)
# Benchmark query table: one million names drawn with replacement from main.
set.seed(36)
s1 <- data.frame(name = sample(main$name, size = 1e6, replace = TRUE))
# Base-R join that preserves the row order of `s1`.
#
# Reads the data frames `main` and `s1` from the enclosing environment and
# joins them with merge() on their shared column name(s). merge() returns its
# rows sorted by the join key, so a helper `id` column records each row's
# original position and the merged result is re-sorted by it afterwards.
# Only a local copy of `s1` receives the `id` column.
#
# Returns: the merged data frame (including the `id` column), ordered by `id`.
f1 <- function() {
  # seq_len() yields an empty index for a zero-row s1, where 1:nrow(s1)
  # would produce c(1, 0) and make the assignment fail.
  s1$id <- seq_len(nrow(s1))
  s2 <- merge(main, s1)
  # Last expression is the value itself (not an assignment), so the result is
  # returned visibly.
  s2[order(s2$id), ]
}
# data.table variant of the lookup.
#
# Reads `main` and `s1` from the enclosing environment, copies both into
# data.tables, keys the lookup table by `name`, and returns the keyed join of
# the lookup table with the query table.
f2 <- function() {
  lookup <- as.data.table(main)
  setkey(lookup, name)
  queries <- as.data.table(s1)
  lookup[queries]
}
# merge() on its own, without restoring the original row order of `s1`.
#
# Reads `main` and `s1` from the enclosing environment and joins them on their
# shared column name(s). Serves as the baseline showing how much of f1()'s
# cost comes from merge() itself.
#
# Returns: the merged data frame, returned visibly.
f3 <- function() {
  merge(main, s1)
}
library(microbenchmark)
# Time the three approaches 20 times each; timings are reported relative to
# the fastest expression.
microbenchmark(
  f1(),
  f2(),
  f3(),
  times = 20L,
  unit = "relative"
)
# Unit: relative
# expr min lq mean median uq max neval cld
#f1() 281.9077 239.3932 133.6460 213.5693 88.62224 66.80722 20 b
#f2() 1.0000 1.0000 1.0000 1.0000 1.00000 1.00000 20 a
#f3() 272.2257 226.3176 127.0197 204.0454 85.22211 63.79950 20 b
答案 1 :(得分:0)
尝试使用 merge 函数:
# Rename the first column of s1 to "name" so it matches the key column of main.
> names(s1)[1]='name'
# Join main onto the key column of s1; the printed result below is sorted by
# name, so the original row order of s1 is not kept.
> merge(main, s1[1])
name val1 val2
1 AA 11 111
2 AA 11 111
3 AA 11 111
4 AA 11 111
5 BB 22 222
6 BB 22 222
7 BB 22 222
8 CC 33 333
9 CC 33 333
若要保持 s1 原有的行顺序:
# Record each row's original position in a helper column id.
> s1$id = 1:nrow(s1)
# Merge on name, carrying only the name and id columns (1 and 4) of s1 along.
> s2 = merge(main, s1[c(1,4)])
# Re-sort the merged rows by id to restore the original order of s1.
> s2 = s2[order(s2$id),]
# Print the first three columns only, leaving out the helper id column.
> s2[1:3]
name val1 val2
1 AA 11 111
2 AA 11 111
3 AA 11 111
5 BB 22 222
9 CC 33 333
6 BB 22 222
7 BB 22 222
8 CC 33 333
4 AA 11 111