我有这样的数据:
# Reproducible toy data: 10 adjacent intervals partitioning [0, 1],
# then randomly drop 4 of them so the remaining intervals have gaps.
set.seed(4597)
lower <- sort(runif(10L))
upper <- c(lower[-1], 1)
# randomly drop some intervals from the "complete" data
# (which is in practice "unknown")
obs <- cbind(lower, upper)[-sample(10, 4), ]
library(data.table)
# augment with a value column associated to each interval
DT <- data.table(obs)[ , v := rpois(.N, 10)]
DT[]
#         lower     upper  v
# 1: 0.08810018 0.1026903  7
# 2: 0.31929301 0.4530299  6
# 3: 0.45302992 0.5248329  6
# 4: 0.58620724 0.8027425  9
# 5: 0.80274248 0.9054854 10
# 6: 0.98218176 1.0000000 10
除第 1-2、3-4 和 5-6 行外,大多数间隔在相邻行之间是“首尾相接”的。我想为每个缺口添加一行,例如 lower = .102、upper = .319,并将 v 设为 0。更复杂的是,lower 和 upper 存在数值误差,因此像 .453 == .453 这样的精确相等测试(第 3 行和第 4 行)可能会失败:
# Perturb `upper` with tiny noise (a few machine epsilons, random sign)
# so exact equality between adjacent bounds can fail.
DT[ , upper := upper +
      sample(-1:1, .N, TRUE)*10^sample(0:2, .N, TRUE)*.Machine$double.eps]
我当前的方法似乎很混乱;我想知道还有哪些其他选项可能更有效(我必须执行该操作数万次(如果不是数百万次))
DT[ , {
  # next interval's lower bound, aligned with the current row
  nxt <- shift(lower, type = 'lead', fill = upper[.N])
  # a gap exists wherever the led lower doesn't match upper
  # (to numerical precision)
  gap <- abs(nxt - upper) > .0001
  # start points of all intervals: the originals plus each gap's start
  starts <- c(lower, upper[gap])
  # sort, but keep the permutation so v can be aligned too
  o <- order(starts)
  starts <- starts[o]
  # indices beyond length(v) yield NA -- those are the new gap rows
  vals <- v[o]
  vals[is.na(vals)] <- 0
  # each interval's upper bound is simply the next start point
  ends <- shift(starts, type = 'lead', fill = upper[.N])
  .(lower = starts, upper = ends, v = vals)
}]
#         lower     upper  v
# 1: 0.08810018 0.1026903  7
# 2: 0.10269026 0.3192930  0
# 3: 0.31929301 0.4530299  6
# 4: 0.45302992 0.5248329  6
# 5: 0.52483292 0.5862072  0
# 6: 0.58620724 0.8027425  9
# 7: 0.80274248 0.9054854 10
# 8: 0.90548543 0.9821818  0
# 9: 0.98218176 1.0000000 10
有更好的方法吗?
这是一个更大范围的测试用例:
# A larger benchmark: 1e5 groups, each built the same way as the toy example.
set.seed(4597)
KK <- 1e5
DT <- data.table(ID = 1:KK)
DT <- DT[ , {
  lower <- sort(runif(10L))
  upper <- c(lower[-1], 1)
  idx <- sample(10, 4)
  .(lower = lower[-idx], upper = upper[-idx])
}, by = ID]
DT[ , v := rpois(.N, 10)]
DT[]
# same tiny-noise perturbation as before
DT[ , upper := upper +
      sample(-1:1, .N, TRUE)*10^sample(0:2, .N, TRUE)*.Machine$double.eps]
system.time({
  DT[ , {
    # next interval's lower bound, aligned with the current row
    nxt <- shift(lower, type = 'lead', fill = upper[.N])
    # a gap exists wherever the led lower doesn't match upper
    # (to numerical precision)
    gap <- abs(nxt - upper) > .0001
    # start points of all intervals: the originals plus each gap's start
    starts <- c(lower, upper[gap])
    # sort, keeping the permutation so v can be aligned too
    o <- order(starts)
    starts <- starts[o]
    vals <- v[o]
    vals[is.na(vals)] <- 0
    # each interval's upper bound is the next start point
    ends <- shift(starts, type = 'lead', fill = upper[.N])
    .(lower = starts, upper = ends, v = vals)
  }, by = ID][]
})
# user system elapsed
# 4.592 0.018 4.609
答案 0(得分:2):对于“x 不等于按组 shift(y)”这类条件,可以用一个技巧来避免 by=:
system.time({
  # tolerance for "these two bounds are the same point"
  # BUG FIX: `eps` was used below but never defined in this chunk
  eps <- 1e-4
  # w are positions of a "lower" that is above the preceding "upper",
  # i.e. rows whose interval does not start where the previous one ended;
  # the rowid(ID) != 1L guard skips each group's first row, where
  # shift(upper) would cross a group boundary
  w <- DT[(abs(lower - shift(upper)) > eps) & (rowid(ID) != 1L), which=TRUE]
  # build the gap-filling rows: one per detected gap, with v = 0
  new <- DT[, .(ID = ID[w], lower = upper[w-1L], upper = lower[w], v = 0L)]
  fres <- rbind(DT, new)
  setkey(fres, ID, lower)
})
# user system elapsed
# 0.050 0.012 0.061
system.time({
  mres <- DT[ , {
    # next interval's lower bound, aligned with the current row
    nxt <- shift(lower, type = 'lead', fill = upper[.N])
    # a gap exists wherever the led lower doesn't match upper
    # (to numerical precision)
    gap <- abs(nxt - upper) > .0001
    # start points of all intervals: the originals plus each gap's start
    starts <- c(lower, upper[gap])
    # sort, keeping the permutation so v can be aligned too
    o <- order(starts)
    starts <- starts[o]
    vals <- v[o]
    vals[is.na(vals)] <- 0L
    # each interval's upper bound is the next start point
    ends <- shift(starts, type = 'lead', fill = upper[.N])
    .(lower = starts, upper = ends, v = vals)
  }, by = ID][]
})
# user system elapsed
# 4.447 0.025 4.471
它不完全匹配,但是我认为这是由于mres
中引入的数字错误引起的。
# the two results are not exactly equal (numerical noise in the bounds)
fsetequal(fres, mres)
# FALSE
# count rows of each result whose (ID, upper) pair also appears in DT:
# fres preserves every original upper, mres does not
DT[fres, on=.(ID, upper), .N, nomatch=0]
# [1] 600000
DT[mres, on=.(ID, upper), .N, nomatch=0]
# [1] 400092
也就是说,mres
包含来自DT
的199908行,其值已更改(也许使用下一行的lower
而不是原始的upper
?)。
答案 1(得分:1):我的方法是由给定的 lower 和 upper 构造一个断点向量,并从这些断点导出所有可能的相邻区间。给定数据中的缺口以 NA 表示(随后设为 0)。此处还需要一步修改:删除长度小于给定 epsilon 的区间。
# minimum interval length: anything shorter is treated as numerical noise
eps <- 0.0001
# all breakpoints, sorted, turned into consecutive candidate intervals
candidates <- DT[, {
  pts <- sort(c(lower, upper))
  .(lower = head(pts, -1L), upper = tail(pts, -1L))
}]
# drop near-degenerate intervals, join the original data back on the
# interval bounds, and mark gaps (unmatched rows, v is NA) with v = 0
DT[candidates[upper - lower > eps], on = .(lower, upper)][is.na(v), v := 0][]
#       lower     upper  v
# 1: 0.1026903 0.2634059 14
# 2: 0.2634059 0.3192930  0
# 3: 0.3192930 0.4530299 11
# 4: 0.4530299 0.5248329 12
# 5: 0.5248329 0.5862072  5
# 6: 0.5862072 0.8027425  5
# 7: 0.8027425 0.9054854 15
library(data.table)
# Rebuild the small example table from its printed form; drop = 1:2
# discards the comment marker and the row-index columns on read.
DT <- fread(
  "# i lower upper v
# 1: 0.1026903 0.2634059 14
# 2: 0.3192930 0.4530299 11
# 3: 0.4530299 0.5248329 12
# 4: 0.5248329 0.5862072 5
# 5: 0.5862072 0.8027425 5
# 6: 0.8027425 0.9054854 15", drop = 1:2
)
set.seed(1L)
# perturb upper with tiny noise so exact-equality tests can fail
DT[ , upper := upper +
      sample(-1:1, .N, TRUE)*10^sample(0:2, .N, TRUE)*.Machine$double.eps]
答案 2(得分:0):这与我最初对该问题的构想非常相似:把 upper 和 lower_lead 两个向量“碰撞”在一起并去除重复项。(旁注:要是 unique 有一个 tol 参数就好了。)它避开了我认为原方法中最大的低效之处——排序,但我猜想在构造 new_v 时的重新对齐会带来如下开销:
DT[ , {
  # "Collide" upper against the led lower: wherever they disagree beyond
  # tolerance, the row contributes two upper bounds (its own plus the
  # start of the gap that follows it).
  lower_lead <- shift(lower, type = 'lead', fill = upper[.N])
  new_upper <- lapply(seq_len(.N), function(ii) {
    if (abs(upper[ii] - lower_lead[ii]) > .0001)
      c(upper[ii], lower_lead[ii])
    else upper[ii]
  })
  # BUG FIX: the original assigned `idx` inside which() on the right-hand
  # side of `new_v[idx] <- ...` while also using it as the subscript on
  # the left-hand side, relying on complex-assignment evaluation order;
  # compute it explicitly before use.
  idx <- lengths(new_upper) > 1L
  new_v <- as.list(v)
  # rows followed by a gap carry their own v plus a 0 for the gap interval
  new_v[idx] <- lapply(which(idx), function(ii) c(v[[ii]], 0L))
  new_upper <- unlist(new_upper)
  # each interval's lower bound is the previous interval's upper bound
  .(lower = shift(new_upper, fill = lower[1L]),
    upper = new_upper, v = unlist(new_v))
}, by = ID]