我有一个数据框,其中每一行都有一个ID和活动值。下面是一个示例数据框:
# Sample data: consecutive [start, end] windows for two ids, each window
# carrying a sub-id label and an activity value.
test <- data.frame(
  start = c(1, seq(50, 500, by = 50), seq(100, 200, by = 50)),
  end = c(seq(50, 550, by = 50), seq(150, 250, by = 50)),
  sub_id = c(paste0("subid_1_", 1:11), paste0("subid_2_", 1:3)),
  id = rep(c("id_1", "id_2"), times = c(11, 3)),
  activity = c(
    -0.2, -0.6, -1, -1.2, -1.6, -1.6, -0.5, 0.2, -1.2, -0.8, 0.1,
    0.1, 1.2, 0.5
  )
)
print(test)
start end sub_id id activity
1 1 50 subid_1_1 id_1 -0.2
2 50 100 subid_1_2 id_1 -0.6
3 100 150 subid_1_3 id_1 -1.0
4 150 200 subid_1_4 id_1 -1.2
5 200 250 subid_1_5 id_1 -1.6
6 250 300 subid_1_6 id_1 -1.6
7 300 350 subid_1_7 id_1 -0.5
8 350 400 subid_1_8 id_1 0.2
9 400 450 subid_1_9 id_1 -1.2
10 450 500 subid_1_10 id_1 -0.8
11 500 550 subid_1_11 id_1 0.1
12 100 150 subid_2_1 id_2 0.1
13 150 200 subid_2_2 id_2 1.2
14 200 250 subid_2_3 id_2 0.5
对于具有相同 ID 的各行:
我把活动值小于等于 -1 且彼此相邻(中间没有被活动值大于 -1 的行隔开)的行合并为一个区域。合并后保留该区域最小的 start 和最大的 end,并对所选行的活动值求平均;同时记录区域内活动值最小的那一行的 start、end 和活动值。
我的做法如下:
library(dplyr)
# Rows with activity at or below `threshold` are the ones that get merged.
threshold <- -1

# `grp` increases by one at every row whose activity is above the threshold,
# so consecutive rows at or below the threshold share the same `grp` value.
test.group <- test %>%
  mutate(grp = cumsum(activity > threshold))
print(test.group, row.names = FALSE)

# One output row per (id, grp) region of rows at or below the threshold:
#   start / end      - full extent of the region
#   activity         - mean activity over the region
#   *.min            - start, end and activity of the FIRST row that reaches
#                      the region's minimum activity
# NOTE: when several rows tie for the minimum, only the first of them is
# reported in start.min / end.min; the tied rows are not spanned.
test.result <- test.group %>%
  # Use `threshold` here as well so grouping and filtering cannot diverge.
  filter(activity <= threshold) %>%
  group_by(id, grp) %>%
  summarise(
    # The *.min columns come first: they must read the original `start`,
    # `end` and `activity` columns before those names are overwritten below.
    activity.min = min(activity),
    start.min = start[which.min(activity)],
    end.min = end[which.min(activity)],
    start = min(start),
    end = max(end),
    activity = mean(activity)
  ) %>%
  ungroup() %>%
  select(start, end, id, activity, start.min, end.min, activity.min)
我的问题是:当区域内有多个连续的行同时取到最小活动值时,我无法把这些行的 min(start) 和 max(end) 作为 start.min 和 end.min。目前我只能显示区域中第一个取到最小活动值的行。如果有 2 行的活动值同为最小值,该怎么办?
预期输出:
start end id activity start.min end.min activity.min
<dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
1 100 300 id_1 -1.2 200 300 -1.6
2 400 450 id_1 -1.2 400 450 -1.2
答案 0 :(得分:1)
我们可以使用data.table::rleid
创建组,只保留activity <= -1
的行(即删除activity > -1 的行),并找到每个组中的相关数字。
library(dplyr)
# Collapse every uninterrupted run of rows with activity <= -1 into one
# summary row holding the run's extent, its mean activity, and the extent of
# the rows that reach the run's minimum activity.
test %>%
  # rleid() opens a new run whenever `id` OR the threshold test changes, so a
  # run at the end of one id is never merged with a run at the start of the
  # next id.
  group_by(gr = data.table::rleid(id, activity <= -1)) %>%
  filter(activity <= -1) %>%
  summarise(
    start_1 = first(start),
    end_1 = last(end),
    id = first(id),
    activity_1 = mean(activity),
    activity.min = min(activity),
    # All rows tied at the minimum are spanned: smallest start, largest end.
    start.min = min(start[activity == activity.min]),
    end.min = max(end[activity == activity.min])
  ) %>%
  select(-gr)
# start_1 end_1 id activity_1 activity.min start.min end.min
# <dbl> <dbl> <fct> <dbl> <dbl> <dbl> <dbl>
#1 100 300 id_1 -1.35 -1.6 200 300
#2 400 450 id_1 -1.2 -1.2 400 450
答案 1 :(得分:0)
library(dplyr)
# Summarise each run of consecutive rows with activity <= -1, per id:
# the run's first start / last end, its mean activity, and the start/end span
# of the rows that hit the run's minimum activity.
test %>%
  mutate(
    separated = activity <= -1,
    # Run counter: increases by one each time `separated` flips.
    group = cumsum(c(1, abs(diff(separated))))
  ) %>%
  filter(separated) %>%
  group_by(id, group) %>%
  mutate(
    minact = min(activity),
    # Blank out start/end on rows that are not at the run's minimum.
    start0 = replace(start, activity != minact, NA),
    end0 = replace(end, activity != minact, NA)
  ) %>%
  summarise(
    start = start[1L],
    end = end[n()],
    activity = mean(activity),
    start.min = min(start0, na.rm = TRUE),
    end.min = max(end0, na.rm = TRUE),
    # `activity` now holds the mean, so take the minimum from `minact`.
    activity.min = first(minact)
  )
# A tibble: 2 x 8
# Groups: id [1]
# id group start end activity start.min end.min activity.min
# <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 id_1 2 100 300 -1.35 200 300 -1.6
# 2 id_1 4 400 450 -1.2 400 450 -1.2