使用nomatch和allow.cartesian时data.table连接中出现意外行为

时间:2015-07-09 16:17:40

标签: r data.table

我有2个数据表,我正试图把 cortable 中的一列(cor)添加到 finaltable 中。

cortable

cor,tickerkey
0.7539,AAL_AAN
0.573,AAL_ABB
0.6384,AAL_ACM
0.7193,AAL_ACXM
0.8386,AAL_ADP
0.7392,AAL_ADT
0.732,AAL_AER
0.4805,AAL_AGCO
0.9363,AAL_AL
0.9064,AAL_ALK
0.7545,AAL_ALSN
0.8586,AAL_AME
0.3356,AAL_AMT
0.8239,AAL_AN
0.8637,AAL_AOS
0.7638,AAL_APD
0.7915,AAL_APH
0.8785,AAL_APOL
0.8073,AAL_ARMH
0.7744,AAL_ASH
0.4179,AAL_ATLS
0.8282,AAL_AWI
-0.2539,AAL_AWK
0.8213,AAL_AXLL
0.827,AAL_BA
0.8642,AAL_BC
0.7982,AAL_BCO
0.2002,AAL_BEAV
0.7079,AAL_BERY
0.858,AAL_BGC
0.5943,AAL_BRK.B
0.1522,AAL_BWC
0.2793,AAL_CAR
0.8537,AAL_CAT
0.9115,AAL_CBI

dput

# Reproducible copy of `cortable`: 35 correlation values (stored as character
# strings) and their ticker-pair keys. The column names come from the list
# itself; the "sorted" attribute marks the table as keyed on tickerkey.
cortable <- structure(
  list(
    cor = c(
      "0.7539", "0.573", "0.6384", "0.7193", "0.8386",
      "0.7392", "0.732", "0.4805", "0.9363", "0.9064",
      "0.7545", "0.8586", "0.3356", "0.8239", "0.8637",
      "0.7638", "0.7915", "0.8785", "0.8073", "0.7744",
      "0.4179", "0.8282", "-0.2539", "0.8213", "0.827",
      "0.8642", "0.7982", "0.2002", "0.7079", "0.858",
      "0.5943", "0.1522", "0.2793", "0.8537", "0.9115"
    ),
    tickerkey = c(
      "AAL_AAN", "AAL_ABB", "AAL_ACM", "AAL_ACXM", "AAL_ADP",
      "AAL_ADT", "AAL_AER", "AAL_AGCO", "AAL_AL", "AAL_ALK",
      "AAL_ALSN", "AAL_AME", "AAL_AMT", "AAL_AN", "AAL_AOS",
      "AAL_APD", "AAL_APH", "AAL_APOL", "AAL_ARMH", "AAL_ASH",
      "AAL_ATLS", "AAL_AWI", "AAL_AWK", "AAL_AXLL", "AAL_BA",
      "AAL_BC", "AAL_BCO", "AAL_BEAV", "AAL_BERY", "AAL_BGC",
      "AAL_BRK.B", "AAL_BWC", "AAL_CAR", "AAL_CAT", "AAL_CBI"
    )
  ),
  row.names = c(NA, -35L),
  class = c("data.table", "data.frame"),
  sorted = "tickerkey"
)

finaltable

tickerkey,ticker1,ticker2
AAL_ALK,AAL,ALK
AAL_CAR,AAL,CAR
AAL_CHRW,AAL,CHRW
AAL_CNW,AAL,CNW
AAL_CSX,AAL,CSX
AAL_DAL,AAL,DAL
AAL_EXPD,AAL,EXPD
AAL_FDX,AAL,FDX
AAL_HTZ,AAL,HTZ
AAL_JBHT,AAL,JBHT

dput

    # Reproducible copy of `finaltable`: ten AAL ticker pairs, keyed on
    # tickerkey (see the "sorted" attribute). Column names come from the list.
    finaltable <- structure(
      list(
        tickerkey = c(
          "AAL_ALK", "AAL_CAR", "AAL_CHRW", "AAL_CNW", "AAL_CSX",
          "AAL_DAL", "AAL_EXPD", "AAL_FDX", "AAL_HTZ", "AAL_JBHT"
        ),
        ticker1 = rep("AAL", 10L),
        ticker2 = c(
          "ALK", "CAR", "CHRW", "CNW", "CSX",
          "DAL", "EXPD", "FDX", "HTZ", "JBHT"
        )
      ),
      row.names = c(NA, -10L),
      class = c("data.table", "data.frame"),
      sorted = "tickerkey"
    )

我正在尝试使用以下代码实现这一目标。

# Key both tables on the shared join column.
setkey(cortable, "tickerkey")
setkey(finaltable, "tickerkey")

# Update join under discussion: joins cortable (as i) onto finaltable and
# assigns a `cor` column to finaltable by reference via `:=`.
# NOTE(review): presumably the right-hand `cor` resolves to cortable's column,
# since finaltable has no `cor` yet. The question reports that combining `:=`
# with nomatch=0 misaligns the assigned values and raises a recycling
# warning; confirm against the data.table version in use.
finaltable[cortable,cor:=cor,allow.cartesian=TRUE,nomatch=0]

正确的预期输出应该是如下的 finaltable:

tickerkey,ticker1,ticker2,cor
AAL_ALK,AAL,ALK,0.9064
AAL_CAR,AAL,CAR,0.2793

其余行的值为NA

但它提供输出

finaltable

tickerkey,ticker1,ticker2,cor
AAL_ALK,AAL,ALK,0.2793
AAL_CAR,AAL,CAR,0.9064

其余的行为NA,并且执行时出现警告: In `[.data.table`(finaltable, cortable, `:=`(cor, cor), allow.cartesian = TRUE, : Supplied 2 items to be assigned to 35 items of column 'cor' (recycled leaving remainder of 1 items).

如果我删除nomatch参数,就不会出现这种错配。我想找出导致这种行为的确切原因,因为我有很多代码/数据分析都用了这种写法;不清楚这种意外行为的确切原因,让我对迄今为止生成的所有数据都失去了信心。

我试着研究nomatch的定义/行为,在上述用法的上下文中找不到多少。如果有人能给出一些解释,将会非常有帮助。

1 个答案:

答案 0 :(得分:1)

这应该有效:

# Merge the two tables on tickerkey, then keep the four output columns.
merge(cortable, finaltable, by = "tickerkey")[
  , list(tickerkey, ticker1, ticker2, cor)
]

或者你可以做到

# Keyed join of finaltable into cortable, drop the rows whose cor is NA,
# then keep the four output columns.
cortable[finaltable][!is.na(cor)][, list(tickerkey, ticker1, ticker2, cor)]

后一种方法假设您已经为两个表设置了键(key),而第一种方法与是否设置键无关。