根据另一列中设置的参数从两列数据计算比率

时间:2018-11-28 16:00:05

标签: r

我的日期值范围很宽,我试图仅在开始日期和结束日期内计算日期值与基线的比率。

例如:

ID   Start Date  End Date  Baseline 1/18 2/18 3/18 4/18 5/18 6/18 7/18 8/18     
A    1/1/2018    5/1/2018     5      2    4     1    3    5    2    4    5
B    6/1/2018    8/1/2018     2      4    2     4    3    6    6    2    1  
C    2/1/2018    3/1/2018     8      3    5     5    3    2    7    8    2
D    5/1/2015    7/1/2018     9      1    3     5    7    4    8    9    1

我想输出为:

ID   Start Date  End Date  Baseline 1/18 2/18 3/18 4/18 5/18 6/18 7/18 8/18     
A    1/1/2018    5/1/2018     5     0.4  0.8   0.2  0.6   1            
B    6/1/2018    8/1/2018     2                                3    1  0.5
C    2/1/2018    3/1/2018     8         0.625 0.625    
D    5/1/2015    7/1/2018     9                         0.44  0.88  1    

谢谢!

1 个答案:

答案 0 :(得分:0)

使用dplyrtidyr的非常精致的解决方案,有人可能会基于此解决方案:

library(dplyr)
library(tidyr)

sample <- sample %>% mutate_at(vars(5:12), funs(round(./Baseline, digits = 3))) ## perform the initial simple proportion calculation

sample <- sample %>% gather(5:12, key = "day", value = "value") %>% 
  rowwise() %>% ## allow for rowwise operations
  mutate(value_temp = case_when(any(grepl(as.numeric(str_extract(day, "^[:digit:]{1,2}(?=/)")), 
                                                        as.numeric(str_extract(StartDate, "^[:digit:]{1,2}(?=/)")):as.numeric(str_extract(EndDate, "^[:digit:]{1,2}(?=/)")))) == T ~ T, 
                                TRUE ~ NA)) ## create a logical vector which indicates TRUE if the "day" is included in the range of days of StartDate and EndDate

sample$value[is.na(sample$value_temp)] <- NA ## sets values which aren't included in the vector of days to NA
sample$value_temp <- NULL ## remove the temp variable
sample <- sample %>%  spread(day, value) ## spread to original df


> sample
# A tibble: 4 x 12
  ID    StartDate EndDate  Baseline `1/18` `2/18` `3/18` `4/18` `5/18` `6/18` `7/18` `8/18`
  <chr> <chr>     <chr>       <int>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 A     1/1/2018  5/1/2018        5    0.4  0.8    0.2      0.6  1     NA         NA   NA  
2 B     6/1/2018  8/1/2018        2   NA   NA     NA       NA   NA      3          1    0.5
3 C     2/1/2018  3/1/2018        8   NA    0.625  0.625   NA   NA     NA         NA   NA  
4 D     5/1/2015  7/1/2018        9   NA   NA     NA       NA    0.444  0.889      1   NA  

更新

sample <- sample %>% mutate_at(vars(5:12), funs(round(./Baseline, digits = 3)))

sample <- sample %>% gather(5:12, key = "day", value = "value") %>% 
  rowwise() %>%
  mutate(value_temp = case_when(any(grepl(as.numeric(str_extract(day, "^[:digit:]{1,2}(?=/)")), 
                                          as.numeric(str_extract(Start_Date, "^[:digit:]{1,2}(?=/)")):as.numeric(str_extract(End_Date, "^[:digit:]{1,2}(?=/)")))) == T & 
                                  any(grepl(as.numeric(str_extract(day, "[:digit:]{2}$")), 
                                        as.numeric(str_extract(Start_Date, "[:digit:]{2}$")):as.numeric(str_extract(End_Date, "[:digit:]{2}$")))) ~ T, 
                                TRUE ~ NA))

sample$value[is.na(sample$value_temp)] <- NA
sample$value_temp <- NULL
sample$day <- sample$day %>% as_factor()
sample <- sample %>% spread(day, value)