以前似乎没有提出这个问题。
我想找到连续6个小时得1分的科目数。 每小时没有对受试者进行评分,因此如果缺少一小时,则小时不连续,并且该6小时的输出应为NA。 分配NA的原因是我们不知道主题在失踪时间内如何得分。此问题可用于计算连续命中数,但仅在主题参与时才计算。
我的数据框如下所示:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
df<-data.frame(ID,hour,A)
我试过使用rle函数(我确信它可能)但我无法在小时和ID上进行调整。 输出将是这样的:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six<-c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
提前谢谢。
我认为我提供的原始数据集太小,无法使解决方案更具普遍性 我刚用这个数据集尝试了代码,发现这会导致错误的结果。
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
df<-data.frame(ID,hour,A)
对于新数据集,输出应为:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
答案 0 :(得分:7)
要获得分组,您可以将当前小时与滞后小时进行比较,以查看它们是“连续”还是相差1个整数,然后取出cumsum
。获得分组后,您可以使用简单的ave
来获得所需的输出。
library(data.table)
df$six <- ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))}
)
df
# ID hour A six
#1 1 1 0 NA
#2 1 2 1 NA
#3 1 3 0 NA
#4 1 7 1 0
#5 1 8 1 0
#6 1 9 1 0
#7 1 10 1 0
#8 1 11 1 0
#9 1 12 1 1
#10 1 17 0 NA
#11 1 18 0 NA
#12 1 19 0 NA
#13 2 1 1 0
#14 2 2 1 0
#15 2 3 1 0
#16 2 4 1 0
#17 2 5 1 0
#18 2 6 1 1
#19 2 8 1 NA
#20 2 9 1 NA
#21 2 15 1 NA
如果您只想选择一次患者,则会选择最后一个时间段:
df$six_adj <- ave(df$six, df$ID, df$six, FUN = function(x) {
if(all(x==1)) return(c(rep(0, length(x) - 1), 1))
else return(x)}
)
答案 1 :(得分:7)
以下是使用更新数据集的tidyverse方法:
library(tidyverse)
df %>%
group_by(ID) %>%
expand(hour = seq(min(hour), max(hour))) %>%
left_join(df) %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
filter(!is.na(A)) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2
检查请求的输出:
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4)
hour<-c(1,2,3,7,8,9,10,12,13,17,18,19,1,2,3,4,5,6,8,9,15,1:23,27,28,29,30,31)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1,rep(1,28))
six<-c(NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA)
df<-data.frame(ID,hour,A,six)
all.equal(df, df_out2)
#output
TRUE
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six) %>%
as.data.frame() -> df_out2
让我们检查结果是否符合要求:
ID <- c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour <- c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A <- c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
six <- c(NA,NA,NA,0,0,0,0,0,1,NA,NA,NA,0,0,0,0,0,1,NA,NA,NA)
df1 <- data.frame(ID, hour, A, six)
请求输出df1
all.equal(df1, df_out2)
#output
TRUE
一些基准测试:
library(microbenchmark)
library(data.table)
akrun <- function(df){
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
missuse <- function(df){
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six)
}
Mike <- function(df){
ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))})
}
microbenchmark(Mike(df),
akrun(df),
missuse(df))
#output
Unit: microseconds
expr min lq mean median uq max neval
Mike(df) 491.291 575.7115 704.2213 597.7155 629.0295 9578.684 100
akrun(df) 6568.313 6725.5175 7867.4059 6843.5790 7279.2240 69790.755 100
missuse(df) 11042.822 11321.0505 12434.8671 11512.3200 12616.3485 43170.935 100
迈克H的方式!
答案 2 :(得分:5)
我们可以使用rleid
中的data.table
。创建一个运行长度为&#39; A&#39;的分组列。 (&#39; GRP&#39)。取一个小时的差异&#39;使用&#39;小时&#39;的下一个值,检查它是否等于1并乘以&#39; A&#39;创造'Anew&#39;。按照“Anew”的游程ID进行分组。和&#39;&#39;,if
sum
&#39; A&#39;大于特定值,用0和1复制,否则返回NA。在最后阶段,通过创建索引(&#39; i1&#39;)将一些溢出NAs指定为0
library(data.table)
setDT(df)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1)) else NA_real_,.(rleid(Anew), grp)]
i1 <- df[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
# ID hour A six sixnew
# 1: 1 1 0 NA NA
# 2: 1 2 1 NA NA
# 3: 1 3 0 NA NA
# 4: 1 7 1 0 0
# 5: 1 8 1 0 0
# 6: 1 9 1 0 0
# 7: 1 10 1 0 0
# 8: 1 11 1 0 0
# 9: 1 12 1 1 1
#10: 1 17 0 NA NA
#11: 1 18 0 NA NA
#12: 1 19 0 NA NA
#13: 2 1 1 0 0
#14: 2 2 1 0 0
#15: 2 3 1 0 0
#16: 2 4 1 0 0
#17: 2 5 1 0 0
#18: 2 6 1 1 1
#19: 2 8 1 NA NA
#20: 2 9 1 NA NA
#21: 2 15 1 NA NA
或稍微更紧凑的选项
i1 <- setDT(df)[, if(sum(A)>= 5) .I[.N] , rleid(c(TRUE, diff(hour)==1), A)]$V1
df[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
df
# ID hour A six sixnew
# 1: 1 1 0 NA NA
# 2: 1 2 1 NA NA
# 3: 1 3 0 NA NA
# 4: 1 7 1 0 0
# 5: 1 8 1 0 0
# 6: 1 9 1 0 0
# 7: 1 10 1 0 0
# 8: 1 11 1 0 0
# 9: 1 12 1 1 1
#10: 1 17 0 NA NA
#11: 1 18 0 NA NA
#12: 1 19 0 NA NA
#13: 2 1 1 0 0
#14: 2 2 1 0 0
#15: 2 3 1 0 0
#16: 2 4 1 0 0
#17: 2 5 1 0 0
#18: 2 6 1 1 1
#19: 2 8 1 NA NA
#20: 2 9 1 NA NA
#21: 2 15 1 NA NA
创建了稍大的数据集并测试了解决方案
-data
ID<-c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2)
hour<-c(1,2,3,7,8,9,10,11,12,17,18,19,1,2,3,4,5,6,8,9,15)
A<-c(0,1,0,1,1,1,1,1,1,0,0,0,1,1,1,1,1,1,1,1,1)
ID <- rep(1:5000, rep(c(12, 9), 2500))
A <- rep(A, 2500)
hour <- rep(hour, 2500)
dftest <- data.frame(ID, hour, A)
-functions
akrun <- function(df){
df1 <- copy(df)
setDT(df1)[, grp := rleid(A)][, Anew := A *((hour - shift(hour, fill = hour[1])) ==1), grp
][, sixnew :=if(sum(A)>=5) rep(c(0, 1), c(.N-1, 1))
else NA_real_,.(rleid(Anew), grp)]
i1 <- df1[, .I[which(is.na(sixnew) & shift(sixnew == 0, type = 'lead'))], grp]$V1
df1[i1, sixnew := 0][, c("Anew", "grp") := NULL][]
}
akrun2 <- function(df) {
df1 <- copy(df)
i1 <- setDT(df1)[, if(sum(A)>= 5) .I[.N] , rleid(c(TRUE, diff(hour)==1), A)]$V1
df1[i1, sixnew := 1][unlist(Map(`:`, i1-5, i1-1)), sixnew := 0]
}
missuse <- function(df){
df %>%
mutate(rle = rep(rle(A)$lengths, times = rle(A)$lengths)) %>%
group_by(ID, rle) %>%
mutate(sum = cumsum(A),
six = ifelse(rle >= 6 & A == 1, 0, NA),
six = ifelse(sum == 6, 1, ifelse(sum > 6, NA, six))) %>%
ungroup() %>%
select(ID, hour, A, six)
}
Mike <- function(df){
ave(df$A,
cumsum(!(df$hour == shift(df$hour, fill = 0) + 1)),
FUN = function(x) {
if(all(x==1) & length(x) >= 6) return(c(rep(0, length(x) - 1), 1))
else return(rep(NA, length(x)))})
}
-benchmark
microbenchmark(Mike(dftest),
akrun(dftest),
akrun2(dftest),
missuse(dftest),
times = 10L, unit = 'relative')
-output
#Unit: relative
# expr min lq mean median uq max neval cld
# Mike(dftest) 1.682794 1.754494 1.673811 1.68806 1.632765 1.640221 10 a
# akrun(dftest) 13.159245 12.950117 12.176965 12.33716 11.856271 11.095228 10 b
# akrun2(dftest) 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 10 a
# missuse(dftest) 37.899905 36.773837 34.726845 34.87672 33.155939 30.665840 10 c