我有类似以下内容的东西:
ID start value want
1 1 1.4 1.5 3
2 1 1.4 3.3 3
3 1 1.4 4.2 3
4 2 3.4 5.5 2
5 2 3.4 6.5 2
6 2 3.4 6.7 2
7 2 3.4 6.9 2
我想计算连续value
观察是否发生在ID范围内的一个接一个的间隔中,间隔是开始+ 1。本质上,我只关心观察是否在观察之后立即出现,而不是顺序观察,但是在NEXT间隔内,无论观察发生在什么间隔内。
例如,我能够通过以下方式解决此问题:创建间隔列(启动后间隔一,启动后间隔二,等等),然后使用ifelse语句确定value
是否落在间隔间隔之一({{1} } vars为1/0),然后返回任意可能的从左到右对角线的最大和(within_
;给出观察到的最大时间一个接一个的间隔),如下所示:
want
但是问题是我有大量数据,并且它根本无法运行。我全都没主意,将不胜感激。
要玩的数据:
ID start one_after two_after three_after four_after value want within_start_one within_one_two within_two_three within_three_four
1 1 1.4 2.4 3.4 4.4 5.4 1.5 3 1 0 0 0
2 1 1.4 2.4 3.4 4.4 5.4 3.3 3 0 1 0 0
3 1 1.4 2.4 3.4 4.4 5.4 4.2 3 0 0 1 0
4 2 3.4 4.4 5.4 6.4 7.4 5.5 2 0 0 1 0
5 2 3.4 4.4 5.4 6.4 7.4 6.5 2 0 0 0 1
6 2 3.4 4.4 5.4 6.4 7.4 6.7 2 0 0 0 1
7 2 3.4 4.4 5.4 6.4 7.4 6.9 2 0 0 0 1
我尝试过的方法并且对于小数据有些有用(但是不幸的是,对角线的总和而不是返回对角线的最大长度):
df<-data.frame(ID=c(1, 1, 1,2,2,2,2),
start=c(1.4, 1.4, 1.4, 3.4,3.4,3.4,3.4),
value=c(1.5, 3.3, 4.2, 5.5, 6.5, 6.7, 6.9),
want=c(3,3,3,2,2,2,2))
答案 0 :(得分:3)
一个选项是将值重置为起始值(以使所有ID的新起始值均为0),然后计算连续间隔的数量。这是使用data.table
的想法的实现:
DT[, want := {
d <- trunc(value - start)
r <- rle(cumsum(c(0L, diff(d)!=1L)))
max(r$lengths)
}, ID][
want==1L, want:=0L]
上述的另一种更快的实现方式:
DT[, rr := rowid(rleid(ID, cumsum(c(0L, diff(trunc(value - start))!=1L))))][,
want := max(rr), ID][
want==1L, want:=0L]
输出:
ID start value want
1: 1 1.4 1.5 3
2: 1 1.4 3.3 3
3: 1 1.4 4.2 3
4: 2 3.4 5.5 2
5: 2 3.4 6.5 2
6: 2 3.4 6.7 2
7: 2 3.4 6.9 2
8: 3 1.0 1.5 2
9: 3 1.0 2.5 2
10: 3 1.0 6.5 2
11: 3 1.0 7.5 2
12: 4 1.0 1.5 0
数据:
library(data.table)
DT <- data.table(ID=c(1,1,1, 2,2,2,2, 3,3,3,3, 4),
start=c(1.4,1.4,1.4, 3.4,3.4,3.4,3.4, 1,1,1,1, 1),
value=c(1.5,3.3,4.2, 5.5,6.5,6.7,6.9, 1.5,2.5,6.5,7.5, 1.5))
时间:
set.seed(0L)
nr <- 1e6
nid <- nr/4
DT <- data.table(ID=sample(nid, nr, TRUE))[,
c("start", "value") := .(runif(1L, 0, 5), runif(.N, 5, 10)),
ID]
setorder(DT, ID, start, value)
system.time({
DT[, d := trunc(value - start)][, want := {
r <- rle(cumsum(c(0L, diff(d)!=1L)))
max(r$lengths)
}, ID][
want==1L, want:=0L]
})
# user system elapsed
# 6.80 0.03 6.85
system.time({
DT2[, rr := rowid(rleid(ID, cumsum(c(0L, diff(trunc(value - start))!=1L))))][,
want := max(rr), ID][
want==1L, want:=0L]
})
# user system elapsed
# 0.22 0.03 0.24
答案 1 :(得分:2)
这是您想要的吗? 怎么样?
library(tidyverse)
df <- tibble(ID = c(1,1,1,2,2,2,2),
start = c(1.4,1.4,1.4,3.4,3.4,3.4,3.4),
value = c(1.5,3.3,4.2,5.5,6.5,6.7,6.9),
want = c(3,3,3,2,2,2,2))
df %>%
group_by(ID) %>%
mutate(
interval = floor(value - start) + 1,
consecutive = interval == lag(interval) + 1,
consecutive = if_else(is.na(consecutive), lead(consecutive), consecutive),
cumulated = sum(consecutive)
)
#> # A tibble: 7 x 7
#> # Groups: ID [2]
#> ID start value want interval consecutive cumulated
#> <dbl> <dbl> <dbl> <dbl> <dbl> <lgl> <int>
#> 1 1 1.4 1.5 3 1 TRUE 3
#> 2 1 1.4 3.3 3 2 TRUE 3
#> 3 1 1.4 4.2 3 3 TRUE 3
#> 4 2 3.4 5.5 2 3 TRUE 2
#> 5 2 3.4 6.5 2 4 TRUE 2
#> 6 2 3.4 6.7 2 4 FALSE 2
#> 7 2 3.4 6.9 2 4 FALSE 2
由reprex package(v0.3.0)于2020-01-08创建
答案 2 :(得分:1)
也许是这样
library(tidyverse)
df_example <- data.table::fread("ID start value want
1 1 1.4 1.5 3
2 1 1.4 3.3 3
3 1 1.4 4.2 3
4 2 3.4 5.5 2
5 2 3.4 6.5 2
6 2 3.4 6.7 2
7 2 3.4 6.9 2")
#> Warning in data.table::fread("ID start value want\n1 1 1.4 1.5 3\n2 1 1.4 3.3
#> 3\n3 1 1.4 4.2 3\n4 2 3.4 5.5 2\n5 2 3.4 6.5 2\n6 2 3.4 6.7 2\n7 2 3.4 6.9 2"):
#> Detected 4 column names but the data has 5 columns (i.e. invalid file). Added 1
#> extra default column name for the first column which is guessed to be row names
#> or an index. Use setnames() afterwards if this guess is not correct, or fix the
#> file write command that created the file to create a valid file.
df_example %>%
select(-V1) %>%
as.data.frame() %>%
dput()
#> structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 2L), start = c(1.4,
#> 1.4, 1.4, 3.4, 3.4, 3.4, 3.4), value = c(1.5, 3.3, 4.2, 5.5,
#> 6.5, 6.7, 6.9), want = c(3L, 3L, 3L, 2L, 2L, 2L, 2L)), row.names = c(NA,
#> -7L), class = "data.frame")
df_example <- structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 2L), start = c(1.4,
1.4, 1.4, 3.4, 3.4, 3.4, 3.4), value = c(1.5, 3.3, 4.2, 5.5,
6.5, 6.7, 6.9), want = c(3L, 3L, 3L, 2L, 2L, 2L, 2L)), row.names = c(NA,
-7L), class = "data.frame")
df_example %>%
group_by(ID) %>%
mutate(row_numb = row_number(),
current = value - start - row_numb,
sum_if = if_else(current <1 & current > -1,1,0)) %>%
mutate(want2 = sum(sum_if)) %>%
select(-sum_if,-current,-row_numb)
#> # A tibble: 7 x 5
#> # Groups: ID [2]
#> ID start value want want2
#> <int> <dbl> <dbl> <int> <dbl>
#> 1 1 1.4 1.5 3 3
#> 2 1 1.4 3.3 3 3
#> 3 1 1.4 4.2 3 3
#> 4 2 3.4 5.5 2 2
#> 5 2 3.4 6.5 2 2
#> 6 2 3.4 6.7 2 2
#> 7 2 3.4 6.9 2 2
由reprex package(v0.3.0)于2020-01-07创建