使用日期范围进行匹配

时间:2018-12-07 13:27:08

标签: r bioinformatics

我有一个病例(cases)表和一个对照(controls)表。我想按年龄和性别进行精确匹配,为病例创建一组匹配的对照。我还想要求对照在病例死亡日期(dod)之前至少有一年的数据。

数据如下:

# Simulate example data: a large pool of controls and a smaller set of cases.
# No seed is set, so every run draws a different random sample.
nControls <- 1e5  # 100,000 controls (same value as the original 10e4)
nCases <- 1e3     # 1,000 cases (same value as the original 10e2)
start_date <- as.Date("2011-04-01")
end_date <- as.Date("2016-04-01")

# Five-year age bands: "0-4", "5-9", ..., "75-79".
ages <- paste0(seq(0, 75, 5), "-", seq(4, 79, 5))
nAges <- length(ages)

# Controls: an observation window that starts on one of the yearly anniversaries
# between start_date and end_date and lasts 1 to 5 years (in days).
# Age bands are drawn with weights 1:nAges, so later bands are more likely;
# sex is drawn 70% "m" / 30% "f".
controls <- data.frame(
  id = seq_len(nControls),
  start = sample(seq(start_date, end_date, by = "year"),
                 size = nControls, replace = TRUE),
  dur = sample(1:5, nControls, replace = TRUE) * 365.25,
  age = sample(ages, nControls, replace = TRUE, prob = 1:nAges / sum(1:nAges)),
  sex = sample(c("m", "f"), nControls, replace = TRUE, prob = c(0.7, 0.3)))
# dur is a multiple of 365.25 days, so end may carry a fractional day.
controls$end <- controls$start + controls$dur

# Cases: date of death (dod) drawn uniformly from every day in the study
# period; age band and sex drawn uniformly.
cases <- data.frame(
  id = seq_len(nCases),
  dod = sample(seq(start_date, end_date, by = "day"),
               size = nCases, replace = TRUE),
  age = sample(ages, nCases, replace = TRUE),
  sex = sample(c("m", "f"), nCases, replace = TRUE))

只需手动或使用MatchIt软件包即可轻松完成年龄和性别的匹配:

# Exact matching of cases to controls on age band and sex with MatchIt.
# matchit() lives in the MatchIt package, which must be attached first.
library(MatchIt)

# Treatment indicator: 0 = control, 1 = case.
controls$treat <- 0
cases$treat <- 1
# Stack both tables, keeping only the columns used for matching.
mt <- rbind(controls[, c("treat", "age", "sex")],
            cases[, c("treat", "age", "sex")])
# Nearest-neighbour matching constrained to exact agreement on age and sex;
# ratio = 2 requests two controls per case.
m.out <- matchit(treat ~ age + sex, data = mt, exact = c("age", "sex"),
                 method = "nearest", ratio = 2)

但是我不知道如何包含cases$dod应该在controls$end之前以及controls$start之后至少1年的标准。

1 个答案:

答案 0 :(得分:0)

不确定这是您要执行的操作(缺少期望的输出)..但这是第一步:

下面的代码对病例(cases)和对照(controls)执行 data.table 的左非等值联接(left non-equi join),联接条件为:sex 相同、age 相同、start.control < dod - 1 year,并且 end.control > dod。

代码:

library(data.table)
library(lubridate)

# Work on data.table copies of the inputs (setDT() would instead convert the
# original data.frames in place).
dt.controls <- as.data.table(controls)
dt.cases <- as.data.table(cases)

# Suffix every column so its origin can be identified after the join.
# setnames() renames by reference, which is the data.table idiom and avoids
# the copy made by `names<-`.
setnames(dt.controls, paste0(names(dt.controls), ".control"))
setnames(dt.cases, paste0(names(dt.cases), ".case"))

# Remember the final column order (cases first, then controls) before any
# helper columns are added.
colorder <- c(names(dt.cases), names(dt.controls))

# Helper columns used only for joining, so the suffixed originals come through
# the join untouched; the helpers are dropped from the result afterwards.
dt.controls[, `:=`(age.join = age.control,
                   sex.join = sex.control,
                   start.join = start.control,
                   end.join = end.control)]

# %m-% subtracts a calendar year and rolls back to the last valid day of the
# month when the target date does not exist (e.g. Feb 29).
dt.cases[, `:=`(age.join = age.case,
                sex.join = sex.case,
                dod.join.start = dod.case %m-% lubridate::years(1),
                dod.join.end = dod.case)]

# Left non-equi join: for every case, look for a control with the same age
# band and sex whose window started more than one year before dod and ends
# after dod.
#   mult = "first" keeps only the first matching control per case.
#   nomatch = NA   keeps cases without a match (control columns are NA).
# NOTE(review): nothing here prevents the same control from being selected for
# several cases -- confirm whether matching without replacement is required.
result <- dt.controls[dt.cases,
                      on = .(age.join,
                             sex.join,
                             start.join < dod.join.start,
                             end.join > dod.join.end),
                      mult = "first",
                      nomatch = NA]

# Drop the helper *.join columns; the dot is escaped so only a literal
# ".join" suffix matches.
join_cols <- grep("\\.join$", names(result), value = TRUE)
result[, (join_cols) := NULL]

# Put the case columns first and the control columns after them.
setcolorder(result, colorder)

结果的前几行如下:

head(result)
#    id.case   dod.case age.case sex.case id.control start.control dur.control age.control sex.control end.control
# 1:       1 2012-12-26      5-9        m        318    2011-04-01     1826.25         5-9           m  2016-03-31
# 2:       2 2015-09-19    75-79        f         26    2012-04-01     1461.00       75-79           f  2016-04-01
# 3:       3 2011-08-17    25-29        m         NA          <NA>          NA        <NA>        <NA>        <NA>
# 4:       4 2011-07-23    35-39        f         NA          <NA>          NA        <NA>        <NA>        <NA>
# 5:       5 2013-11-16    30-34        f        112    2012-04-01      730.50       30-34           f  2014-04-01
# 6:       6 2014-09-17      5-9        f        784    2013-04-01     1826.25         5-9           f  2018-04-01

您可以调整联接的 mult 和 nomatch 参数,以“微调”输出...