如何按名称匹配两个数据集

时间:2014-11-13 14:39:16

标签: r

我有一个像main这样的数据集。

# Lookup table: one row per name, holding the values to copy into s1.
main <- data.frame(
  name = factor(c("AA", "BB", "CC")),
  val1 = c(11L, 22L, 33L),
  val2 = c(111L, 222L, 333L)
)

我还有一个像s1这样的数据集。

# Table to be filled in: names are given, val1/val2 start out as (logical) NA.
s1 <- data.frame(
  cname = factor(
    c("AA", "AA", "AA", "BB", "CC", "BB", "BB", "CC", "AA"),
    levels = c("AA", "BB", "CC")
  ),
  val1 = rep(NA, 9L),
  val2 = rep(NA, 9L)
)

我需要按名称与 main 数据集进行匹配,从而填充 s1 数据集中的 val1 和 val2 列。期望得到的输出如下面的 s2 所示。

# Expected result: s1 with val1/val2 filled in from main, in s1's row order.
s2 <- data.frame(
  cname = factor(
    c("AA", "AA", "AA", "BB", "CC", "BB", "BB", "CC", "AA"),
    levels = c("AA", "BB", "CC")
  ),
  val1 = c(11L, 11L, 11L, 22L, 33L, 22L, 22L, 33L, 11L),
  val2 = c(111L, 111L, 111L, 222L, 333L, 222L, 222L, 333L, 111L)
)

我可以用 for 循环来完成,但如果有数百万条记录,这样做的开销就太大了。有没有其他替代方法可以做到这一点?请帮忙。

2 个答案:

答案 0 :(得分:1)

您可以将data.table用于大数据集

 library(data.table)
 # Keep only the first column of s1 (the names to look up); drop=FALSE keeps
 # it as a one-column table instead of collapsing it to a vector.
 d1 <- as.data.table(s1[,1, drop=FALSE])
 # setDT() converts main to a data.table by reference (main itself is changed);
 # setkey() then keys it on the name column.
 setkey(setDT(main), name) 
 # Keyed join: one output row per row of d1, in d1's order, with val1/val2
 # taken from the matching row of main.
 main[d1]
 #    name val1 val2
 #1:   AA   11  111
 #2:   AA   11  111
 #3:   AA   11  111
 #4:   BB   22  222
 #5:   CC   33  333
 #6:   BB   22  222
 #7:   BB   22  222
 #8:   CC   33  333
 #9:   AA   11  111

基准测试

稍微大一点的数据集

# Benchmark lookup table: 26 letters x 200 suffixes = 5200 distinct names,
# each with random val1/val2. The seed fixes the generated values.
set.seed(24)
main <- data.frame(
  name = as.vector(outer(LETTERS, 1:200, FUN = paste0)),
  val1 = sample(1:100, 5200, replace = TRUE),
  val2 = sample(20:150, 5200, replace = TRUE)
)

# Benchmark query table: one million names drawn (with replacement) from main.
set.seed(36)
s1 <- data.frame(name = sample(main$name, 1e6, replace = TRUE))

# Benchmark candidate: base-R merge() followed by restoring s1's row order.
#
# Reads the data frames `main` and `s1` from the enclosing environment; the
# `id` column is added to a local copy of `s1` only, so the caller's `s1` is
# not modified.
#
# Returns (invisibly) the merged data frame sorted back into the original
# row order of `s1`, including the helper `id` column.
f1 <- function() {
  # seq_len() stays correct for a zero-row s1; 1:nrow(s1) would give c(1, 0)
  # and the column assignment would fail.
  s1$id <- seq_len(nrow(s1))
  # merge() joins on the shared column name(s) and sorts by the key, which is
  # why the original order has to be restored through `id` afterwards.
  merged <- merge(main, s1)
  invisible(merged[order(merged$id), ])
}

# Benchmark candidate: data.table keyed join.
#
# Reads `main` and `s1` from the enclosing environment, works on data.table
# copies of both, and returns the join result with one row per row of `s1`.
f2 <- function() {
  lookup <- as.data.table(main)
  setkey(lookup, name)
  queries <- as.data.table(s1)
  lookup[queries]
}
# Benchmark candidate: plain merge() only, without restoring s1's row order.
# Reads `main` and `s1` from the enclosing environment and returns the merged
# data frame invisibly.
f3 <- function() {
  invisible(merge(main, s1))
}




library(microbenchmark)
# Run each candidate 20 times; unit='relative' reports timings as multiples
# of the fastest expression rather than in absolute time.
microbenchmark(f1(), f2(),f3(), unit='relative', times=20L)
# Unit: relative
#    expr      min       lq     mean   median       uq      max neval cld
#f1() 281.9077 239.3932 133.6460 213.5693 88.62224 66.80722    20   b
#f2()   1.0000   1.0000   1.0000   1.0000  1.00000  1.00000    20  a 
#f3() 272.2257 226.3176 127.0197 204.0454 85.22211 63.79950    20   b

答案 1 :(得分:0)

尝试使用 merge 函数:

> names(s1)[1]='name'
> merge(main, s1[1])
  name val1 val2
1   AA   11  111
2   AA   11  111
3   AA   11  111
4   AA   11  111
5   BB   22  222
6   BB   22  222
7   BB   22  222
8   CC   33  333
9   CC   33  333

如需保持原有的行顺序:

> s1$id = 1:nrow(s1)
> s2 = merge(main, s1[c(1,4)])
> s2 = s2[order(s2$id),]
> s2[1:3]
  name val1 val2
1   AA   11  111
2   AA   11  111
3   AA   11  111
5   BB   22  222
9   CC   33  333
6   BB   22  222
7   BB   22  222
8   CC   33  333
4   AA   11  111