dt1 包含我感兴趣的、顺序正确的 val 序列:
library(data.table)
# Reference table: for each group in `var` ("a", "b") the first three rows
# carry the value sequence of interest; the remaining three rows are NA.
dt1 <- data.frame(
  id = 1,
  key = paste0(rep(c("a_", "b_"), each = 6), 1:6),
  val = c(122, 128, 134, NA, NA, NA, 110, 112, 114, NA, NA, NA),
  var = rep(c("a", "b"), each = 6)
)
id key val var
1 1 a_1 122 a
2 1 a_2 128 a
3 1 a_3 134 a
4 1 a_4 NA a
5 1 a_5 NA a
6 1 a_6 NA a
7 1 b_1 110 b
8 1 b_2 112 b
9 1 b_3 114 b
10 1 b_4 NA b
11 1 b_5 NA b
12 1 b_6 NA b
dt2 同样包含这些顺序正确的 val,但其中还夹杂着一些多余的 val:
# Observed table: contains the values from dt1 in the right order, mixed
# with extra values (e.g. 127, or an early duplicate of 122 / 110).
dt2 <- data.frame(
  id = 1,
  key = paste0(
    rep(c("a_", "b_"), times = c(5, 4)),
    c(1, 3, 4, 5, 6, 2, 4, 5, 6)
  ),
  val = c(
    122, 127, 122, 128, 134,
    110, 110, 112, 114
  ),
  var = rep(c("a", "b"), times = c(5, 4))
)
id key val var
1 1 a_1 122 a
2 1 a_3 127 a
3 1 a_4 122 a
4 1 a_5 128 a
5 1 a_6 134 a
6 1 b_2 110 b
7 1 b_4 110 b
8 1 b_5 112 b
9 1 b_6 114 b
我想将 dt2 中的值序列与 dt1 中的值序列进行匹配,并忽略 dt2 中的多余值。
我尝试过向后滚动连接(roll = -Inf),因为 dt2 中感兴趣的值集中在序列的末尾。
# Convert both tables to data.table by reference and key them on
# (id, var, val); setting a key also sorts the rows by those columns.
setDT(dt1)
setkeyv(dt1, c("id", "var", "val"))
setDT(dt2)
setkeyv(dt2, c("id", "var", "val"))
# Keyed join of dt2 into dt1. roll = -Inf rolls backwards on the last key
# column (val), so a dt2 value with no exact match in dt1 is paired with
# the next larger dt1 value in the same (id, var) group.
dt1[dt2, roll = -Inf]
id key val var i.key
1: 1 a_1 122 a a_1 # wrong
2: 1 a_1 122 a a_4
3: 1 a_2 127 a a_3 # wrong
4: 1 a_2 128 a a_5
5: 1 a_3 134 a a_6
6: 1 b_1 110 b b_2 # wrong
7: 1 b_1 110 b b_4
8: 1 b_2 112 b b_5
9: 1 b_3 114 b b_6
问题似乎出在:dt2 中存在与 dt1 重复的值,但这些值并不处于我要找的序列位置上。另外,我希望结果中保留 i.key,以便知道 dt2 中的原始键,因为后续处理还会用到它。我也尝试过:merge(dt1,dt2)
id key val var i.key
1 a_1 122 a a_4
1 a_2 128 a a_5
1 a_3 134 a a_6
1 b_1 110 b b_4
1 b_2 112 b b_5
1 b_3 114 b b_6
我将感谢您的指导
答案 0 :(得分:3)
这是一种方法:它首先将 dt1 连接回 dt2,以过滤掉两边 key 相同(key == i.key)的配对,因为在 data.table 中无法同时进行非等值连接(non-equi join)和滚动连接(rolling join)。
另外请记住,只有 on = 参数中的最后一列才会被滚动,因此我建议始终显式地写出 on =。
library(data.table)
# Chained data.table expression; each `[...]` operates on the result of the
# previous one:
#   1. dt2[dt1, on = .(id, var, val), nomatch = 0]
#        inner join of dt1 (as i) into dt2 on id/var/val; dt1 rows without a
#        match in dt2 are dropped. dt1's clashing `key` column shows up in
#        the result as `i.key`, alongside dt2's `key`.
#   2. [key != i.key]
#        keep only the pairs whose dt2 key differs from the dt1 key.
#   3. [dt1, on = .(id, var, val, i.key = key), roll = -Inf, nomatch = 0]
#        join dt1 again, matching the intermediate column `i.key` against
#        dt1's `key`; roll = -Inf rolls backwards on that last `on` column
#        only, and unmatched rows are dropped.
#        NOTE(review): the names `key` / `i.key` after this second join rely
#        on data.table's automatic prefixing -- confirm against real output.
#   4. [, .SD[.N], by = .(id, val, var)]
#        keep the last row of every (id, val, var) group.
#   5. [, .(id, val, var, key1 = i.key, key2 = key)]
#        select the final columns, exposing the two keys as key1 and key2.
dt2[dt1, on = .(id,var,val),nomatch = 0][
key != i.key][dt1,on = .(id,var,val,i.key=key), roll = -Inf, nomatch = 0][
,.SD[.N],by = .(id,val,var)][,.(id,val,var,key1 = i.key,key2 = key)]
id val var key1 key2
1: 1 122 a a_1 a_4
2: 1 128 a a_2 a_5
3: 1 134 a a_3 a_6
4: 1 110 b b_1 b_4
5: 1 112 b b_2 b_5
6: 1 114 b b_3 b_6
答案 1 :(得分:3)
假设我们需要在 dt2 中找到 dt1 的完整序列,这是另一种做法:
# Find, for every (id, var) group, the run of dt2 rows that reproduces the
# complete non-NA value sequence of dt1, and report the dt2 key (`i.key`)
# next to the matching dt1 row.
setDT(dt1)
setDT(dt2)
cols <- c("sid", "cnt")
# Keep the non-NA reference rows and add, per (id, var) group,
#   sid = position of the row inside the reference sequence
#   cnt = length of the reference sequence.
# The `[!is.na(val)]` subset is a copy, so `:=` does not modify dt1 itself.
DT1 <- dt1[!is.na(val)][, (cols) := .(seq_len(.N), .N), by = .(id, var)]
# Inner join: keep the dt2 rows (in dt2 order) whose (id, var, val) exists
# in the reference; `i.key` is dt2's key column.
DT2 <- DT1[dt2, on = .(id, var, val), nomatch = 0L,
           .(id, var, val, sid, cnt, i.key)]
# Split the matched rows into runs of consecutive sid (a new run starts
# whenever sid does not increase by exactly 1) and keep only runs as long
# as the full reference sequence. Grouping by id and var as well as the run
# id `cs` prevents a run from spanning two different groups.
consec <- DT2[, if (.N == cnt[1L]) .SD,
              by = .(id, var, cs = cumsum(c(0L, diff(sid) != 1L)))]
# Join back to the reference rows to recover dt1's key, then drop the helper
# columns. Both `sid` and `cnt` exist in DT1 and in consec, so the join
# result carries `i.sid` as well as `i.cnt`; both must be removed to leave
# only id, key, val, var and i.key.
matched <- DT1[consec, on = .(id, var, val)]
helper_cols <- intersect(c(cols, "cs", "i.sid", "i.cnt"), names(matched))
matched[, (helper_cols) := NULL][]
输出:
id key val var i.key
1: 1 a_1 122 a A_4
2: 1 a_2 128 a A_5
3: 1 a_3 134 a A_6
4: 1 b_1 110 b B_4
5: 1 b_2 112 b B_5
6: 1 b_3 114 b B_6
7: 1 c_1 110 c C_3
8: 1 c_2 112 c C_5
9: 1 c_3 114 c C_6
根据评论补充了额外分组(c 组)之后的数据:
library(data.table)
# Reference table with three groups ("a", "b", "c"): each group has three
# values of interest followed by three NA rows. Groups "b" and "c" share the
# same value sequence.
dt1 <- data.frame(
  id = 1,
  key = paste0(rep(c("a_", "b_", "c_"), each = 6), 1:6),
  val = c(
    122, 128, 134, NA, NA, NA,
    110, 112, 114, NA, NA, NA,
    110, 112, 114, NA, NA, NA
  ),
  var = rep(c("a", "b", "c"), each = 6)
)
# Observed table with three groups; keys use upper-case prefixes so they can
# be told apart from dt1's keys. Each group contains the reference values in
# order plus extra values.
dt2 <- data.frame(
  id = 1,
  key = paste0(
    rep(c("A_", "B_", "C_"), times = c(5, 4, 5)),
    c(1, 3, 4, 5, 6, 2, 4, 5, 6, 2, 3, 4, 5, 6)
  ),
  val = c(
    122, 127, 122, 128, 134,
    110, 110, 112, 114,
    134, 110, 200, 112, 114
  ),
  var = rep(c("a", "b", "c"), times = c(5, 4, 5))
)
答案 2 :(得分:0)
可能有更好的方法,但是我认为这可以工作:
# Inner-join dt1 and dt2 on (val, var); the clashing id/key columns come back
# as id.x/key.x (from dt1) and id.y/key.y (from dt2). Rows are ordered by the
# dt1 key, pairs with identical keys are dropped, and the remaining columns
# are picked and renamed in a single select().
merge(dt1, dt2, by = c("val", "var")) %>%
  arrange(key.x) %>%
  filter(key.x != key.y) %>%
  select(id = id.x, key = key.x, val, var, i.key = key.y)
结果:
id key val var i.key
1 1 a_1 122 a a_4
2 1 a_2 128 a a_5
3 1 a_3 134 a a_6
4 1 b_1 110 b b_2
5 1 b_1 110 b b_4
6 1 b_2 112 b b_5
7 1 b_3 114 b b_6