根据这篇维基百科文章SQL join,我希望能够清楚地了解如何与data.table建立联接。 在这个过程中,我们可能在加入NAs时发现了一个错误。 以wiki为例:
R) X = data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),depID=c(31,33,33,34,34,NA),key="depID")
R) Y = data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID")
R) X
name depID
1: Joh NA
2: Raf 31
3: Jon 33
4: Ste 33
5: Rob 34
6: Smi 34
R) Y
depID depName
1: 31 Sal
2: 33 Eng
3: 34 Cle
4: 35 Mar
LEFT OUTER JOIN
R) merge.data.frame(X,Y,all.x=TRUE)
depID name depName
1 31 Raf Sal
2 33 Jon Eng
3 33 Ste Eng
4 34 Rob Cle
5 34 Smi Cle
6 NA Joh <NA>
merge.data.table
不会输出相同的结果,并显示我认为lign 2上的错误。
R) merge(X,Y,all.x=TRUE)
depID name depName
1: NA Joh Eng
2: 31 Raf NA
3: 33 Jon Eng
4: 33 Ste Eng
5: 34 Rob Cle
6: 34 Smi Cle
R) Y[X] #same -> :(
depID depName name
1: NA Eng Joh
2: 31 NA Raf
3: 33 Eng Jon
4: 33 Eng Ste
5: 34 Cle Rob
6: 34 Cle Smi
RIGHT OUTER JOIN 看起来像是一样的
R) merge.data.frame(X,Y,all.y=TRUE)
depID name depName
1 31 Raf Sal
2 33 Jon Eng
3 33 Ste Eng
4 34 Rob Cle
5 34 Smi Cle
6 35 <NA> Mar
R) merge(X,Y,all.y=TRUE)
depID name depName
1: NA Joh Eng
2: 31 NA Sal
3: 33 Jon Eng
4: 33 Ste Eng
5: 34 Rob Cle
6: 34 Smi Cle
7: 35 NA Mar
INNER(NATURAL)JOIN
R) merge.data.frame(X,Y)
depID name depName
1 31 Raf Sal
2 33 Jon Eng
3 33 Ste Eng
4 34 Rob Cle
5 34 Smi Cle
R) merge(X,Y)
depID name depName
1: NA Joh Eng
2: 33 Jon Eng
3: 33 Ste Eng
4: 34 Rob Cle
5: 34 Smi Cle
答案 0 :(得分:8)
是的,它看起来像一个(令人尴尬的)与关键字NA相关的新bug。在关键不可能的情况下,还有其他关于NA的讨论,但我没有意识到它会以这种方式搞砸。将调查。谢谢......
#2453 NA in double key column messes up joins (NA in integer and character ok)
现已修复1.8.7(提交780),来自NEWS:
类型为double的连接列中的NA可能导致X [Y]和合并(X,Y)返回不正确的结果,#2453。由于C源中的错误x == NA_REAL应该是ISNA(x)。对键控连接的双重支持是对data.table的一个相对较新的补充,但同样令人尴尬。固定和测试添加。非常感谢有关彻底和可重复报告的统计数据。
答案 1 :(得分:2)
对其他答案中的评论进行跟进,是的,以下证明它只影响double
类型列(integer
中的NA和character
列都可以。)
X = data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),
depID=as.integer(c(31,33,33,34,34,NA)),key="depID")
Y = data.table(depID=as.integer(c(31,33,34,35)),
depName=c("Sal","Eng","Cle","Mar"),key="depID")
Y[X]
depID depName name
1: NA NA Joh
2: 31 Sal Raf
3: 33 Eng Jon
4: 33 Eng Ste
5: 34 Cle Rob
6: 34 Cle Smi
merge.data.frame(X,Y,all.x=T)
depID name depName
1 31 Raf Sal
2 33 Jon Eng
3 33 Ste Eng
4 34 Rob Cle
5 34 Smi Cle
6 NA Joh <NA>
Y = data.table(depID=as.character(c(31,33,34,35)),
depName=c("Sal","Eng","Cle","Mar"),key="depID")
X = data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),
depID=as.character(c(31,33,33,34,34,NA)),key="depID")
X
name depID
1: Raf 31
2: Jon 33
3: Ste 33
4: Rob 34
5: Smi 34
6: Joh NA
Y
depID depName
1: 31 Sal
2: 33 Eng
3: 34 Cle
4: 35 Mar
str(X)
Classes ‘data.table’ and 'data.frame': 6 obs. of 2 variables:
$ name : chr "Raf" "Jon" "Ste" "Rob" ...
$ depID: chr "31" "33" "33" "34" ...
- attr(*, "sorted")= chr "depID"
- attr(*, ".internal.selfref")=<externalptr>
merge.data.frame(X,Y,all.x=T)
depID name depName
1 31 Raf Sal
2 33 Jon Eng
3 33 Ste Eng
4 34 Rob Cle
5 34 Smi Cle
6 <NA> Joh <NA>
Y[X]
depID depName name
1: 31 Sal Raf
2: 33 Eng Jon
3: 33 Eng Ste
4: 34 Cle Rob
5: 34 Cle Smi
6: NA NA Joh
答案 2 :(得分:1)
一些有用的信息:
library(data.table);
X <- data.table(name=c("Raf","Jon","Ste","Rob","Smi","Joh"),depID=c(31,33,33,34,34,NA),key="depID")
#R) X
#name depID
#1: Joh NA
#2: Raf 31
#3: Jon 33
#4: Ste 33
#5: Rob 34
#6: Smi 34
Y <- data.table(depID=c(31,33,34,35),depName=c("Sal","Eng","Cle","Mar"),key="depID")
#R) Y
#depID depName
#1: 31 Sal
#2: 33 Eng
#3: 34 Cle
#4: 35 Mar
#################
#LEFT OUTER JOIN#
#################
LJ <- merge.data.frame(X,Y,by="depID",all.x=TRUE); #by is implicit (see ?merge.data.frame)
#R) LJ
#depID name depName
#1 31 Raf Sal
#2 33 Jon Eng
#3 33 Ste Eng
#4 34 Rob Cle
#5 34 Smi Cle
#6 NA Joh <NA>
LJ2 <- Y[X];
#R) LJ2
#depID depName name
#1: NA NA Joh
#2: 31 Sal Raf
#3: 33 Eng Jon
#4: 33 Eng Ste
#5: 34 Cle Rob
#6: 34 Cle Smi
##################
#RIGHT OUTER JOIN#
##################
RJ <- merge.data.frame(X,Y,by="depID",all.y=TRUE); #by is implicit (see ?merge.data.frame)
#R) RJ
#depID name depName
#1 31 Raf Sal
#2 33 Jon Eng
#3 33 Ste Eng
#4 34 Rob Cle
#5 34 Smi Cle
#6 35 <NA> Mar
RJ2 <- X[Y];
#R) RJ2
#depID name depName
#1: 31 Raf Sal
#2: 33 Jon Eng
#3: 33 Ste Eng
#4: 34 Rob Cle
#5: 34 Smi Cle
#6: 35 NA Mar
#################
#FULL OUTER JOIN#
#################
FJ <- merge.data.frame(X,Y,all=T)
#R) FJ
#depID name depName
#1 31 Raf Sal
#2 33 Jon Eng
#3 33 Ste Eng
#4 34 Rob Cle
#5 34 Smi Cle
#6 35 <NA> Mar
#7 NA Joh <NA>
FJ2 <- merge(X,Y,all=T)
#R) FJ2
#depID name depName
#1: NA Joh NA
#2: 31 Raf Sal
#3: 33 Jon Eng
#4: 33 Ste Eng
#5: 34 Rob Cle
#6: 34 Smi Cle
#7: 35 NA Mar
####################
#NATURAL INNER JOIN#
####################
IJ <- merge.data.frame(X,Y)
#R) IJ
#depID name depName
#1 31 Raf Sal
#2 33 Jon Eng
#3 33 Ste Eng
#4 34 Rob Cle
#5 34 Smi Cle
IJ2 <- merge(X,Y)
#R) IJ2
#depID name depName
#1: 31 Raf Sal
#2: 33 Jon Eng
#3: 33 Ste Eng
#4: 34 Rob Cle
#5: 34 Smi Cle
A <- data.table(time=as.POSIXct(c("10:01:01","10:01:02","10:01:04","10:01:05","10:01:02","10:01:01","10:01:01"),format="%H:%M:%S"),
b=c("a","a","a","a","b","c","c"),
d=c(1,1.9,2,1.8,5,4.1,4.2));
B <- data.table(time=as.POSIXct(c("10:01:01","10:01:03","10:01:00","10:01:01"),format="%H:%M:%S"),b=c("a","a","c","d"), e=c(1L,2L,3L,4L));
setkey(A,b,time)
setkey(B,b,time)
###########
#ASOF JOIN#
###########
AOJ <- B[A,roll=T]
#R) AOJ
#b time e d
#1: a 2013-01-11 10:01:01 1 1.0
#2: a 2013-01-11 10:01:02 1 1.9
#3: a 2013-01-11 10:01:04 2 2.0
#4: a 2013-01-11 10:01:05 2 1.8
#5: b 2013-01-11 10:01:02 NA 5.0
#6: c 2013-01-11 10:01:01 3 4.1
#7: c 2013-01-11 10:01:01 3 4.2