我的日期值范围很宽,我试图仅在开始日期和结束日期内计算日期值与基线的比率。
例如:
ID Start Date End Date Baseline 1/18 2/18 3/18 4/18 5/18 6/18 7/18 8/18
A 1/1/2018 5/1/2018 5 2 4 1 3 5 2 4 5
B 6/1/2018 8/1/2018 2 4 2 4 3 6 6 2 1
C 2/1/2018 3/1/2018 8 3 5 5 3 2 7 8 2
D 5/1/2015 7/1/2018 9 1 3 5 7 4 8 9 1
我想输出为:
ID Start Date End Date Baseline 1/18 2/18 3/18 4/18 5/18 6/18 7/18 8/18
A 1/1/2018 5/1/2018 5 0.4 0.8 0.2 0.6 1
B 6/1/2018 8/1/2018 2 3 1 0.5
C 2/1/2018 3/1/2018 8 0.625 0.625
D 5/1/2015 7/1/2018 9 0.44 0.88 1
谢谢!
答案 0 :(得分:0)
使用dplyr
和tidyr
的非常精致的解决方案,有人可能会基于此解决方案:
library(dplyr)
library(tidyr)
sample <- sample %>% mutate_at(vars(5:12), funs(round(./Baseline, digits = 3))) ## perform the initial simple proportion calculation
sample <- sample %>% gather(5:12, key = "day", value = "value") %>%
rowwise() %>% ## allow for rowwise operations
mutate(value_temp = case_when(any(grepl(as.numeric(str_extract(day, "^[:digit:]{1,2}(?=/)")),
as.numeric(str_extract(StartDate, "^[:digit:]{1,2}(?=/)")):as.numeric(str_extract(EndDate, "^[:digit:]{1,2}(?=/)")))) == T ~ T,
TRUE ~ NA)) ## create a logical vector which indicates TRUE if the "day" is included in the range of days of StartDate and EndDate
sample$value[is.na(sample$value_temp)] <- NA ## sets values which aren't included in the vector of days to NA
sample$value_temp <- NULL ## remove the temp variable
sample <- sample %>% spread(day, value) ## spread to original df
> sample
# A tibble: 4 x 12
ID StartDate EndDate Baseline `1/18` `2/18` `3/18` `4/18` `5/18` `6/18` `7/18` `8/18`
<chr> <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 A 1/1/2018 5/1/2018 5 0.4 0.8 0.2 0.6 1 NA NA NA
2 B 6/1/2018 8/1/2018 2 NA NA NA NA NA 3 1 0.5
3 C 2/1/2018 3/1/2018 8 NA 0.625 0.625 NA NA NA NA NA
4 D 5/1/2015 7/1/2018 9 NA NA NA NA 0.444 0.889 1 NA
更新:
sample <- sample %>% mutate_at(vars(5:12), funs(round(./Baseline, digits = 3)))
sample <- sample %>% gather(5:12, key = "day", value = "value") %>%
rowwise() %>%
mutate(value_temp = case_when(any(grepl(as.numeric(str_extract(day, "^[:digit:]{1,2}(?=/)")),
as.numeric(str_extract(Start_Date, "^[:digit:]{1,2}(?=/)")):as.numeric(str_extract(End_Date, "^[:digit:]{1,2}(?=/)")))) == T &
any(grepl(as.numeric(str_extract(day, "[:digit:]{2}$")),
as.numeric(str_extract(Start_Date, "[:digit:]{2}$")):as.numeric(str_extract(End_Date, "[:digit:]{2}$")))) ~ T,
TRUE ~ NA))
sample$value[is.na(sample$value_temp)] <- NA
sample$value_temp <- NULL
sample$day <- sample$day %>% as_factor()
sample <- sample %>% spread(day, value)