我一直在学习R并找到了几种几乎可以满足我想要的资源,但并不完全(或者至少不是我所理解的那样!)
我的代码可以很好地将我的预期结果用于R(我通常使用Stata),但它非常慢,我知道这是因为我强行要求我确信有更聪明的方法做的!
我有一系列指标需要按群组设定并查看每组中的先前值。
以下是我正在使用的代码(有效),其中的一个例子将(希望)显示我的意思。使用这个非常小的样本运行速度足够快,但是当我有很多组,许多观察结果和许多指标时,运行速度非常慢!
提前感谢您的专业知识! 干杯, 西蒙。
# would like to find out several things:
# 1. the year in which an observation is missing
# 2. the last year in which an observation is not missing
# 3. whether someone is lost to followup
# (ie. all remaining observations are missing)
# 4. whether someone is STILL lost to followup
# (ie. was lost to followup in previous year as well as current year)
# problem: this is very quick and simple in Stata
# but takes a VERY long time using this method in R
# which makes me sure there's a better way!
# read in data
missingness <- read.table(text=
"Var2001 Var2002 Var2003
1 1 1
1 NA NA
1 NA 1
NA NA 1
NA 1 NA", header=TRUE)
vartouse_list <- c(colnames(missingness)[grep("Var",colnames(missingness))])
number_list <- sapply(strsplit(vartouse_list,split="Var", fixed=TRUE), function(x) (x[2]))
missingness_subset <- subset(missingness[, vartouse_list])
# now create an id
# reshape to long
long_missingness <- reshape(missingness_subset,
varying = vartouse_list,
v.names = "Var",
timevar = "time_period",
times = number_list,
direction = "long")
# sort to looking by id number
long_missingness$time_period <- as.numeric(long_missingness$time_period)
long_missingness <- long_missingness[order(long_missingness$id, long_missingness$time_period),]
# find if missing this year
criteria <- paste0("long_missingness","$","Var")
long_missingness$missing_this_year <- ifelse(is.na(long_missingness$Var),1,0)
# list of non-missing time periods
long_missingness$time_period_not_missing <- ifelse(long_missingness$missing_this_year==0,
long_missingness$time_period,
NA)
# find last observed data
long_missingness$last_non_missing <- min(long_missingness$time_period)
for (current_id in unique(long_missingness$id)) {
current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
indicator = max(current_long_missingness$time_period_not_missing,na.rm=TRUE)
long_missingness$last_non_missing <- ifelse(long_missingness$id==current_id,
indicator,
long_missingness$last_non_missing)
}
# year first lost to followup
long_missingness$lost_to_followup_year <- long_missingness$last_non_missing + 1
# generate an indicator for lost to followup
# for each individual, they're lost to followup if:
# (data is missing this year AND the current year is >= the year indicated as lost to followup)
# OR
# they were lost to followup in the previous year (by definition)
long_missingness$lost_to_followup = 0
long_missingness$lost_to_followup = ifelse(long_missingness$missing_this_year==1 &
long_missingness$time_period >=
long_missingness$lost_to_followup_year,
1,
0)
# now will work out if an observation is still lost to followup
long_missingness$still_lost_to_followup <- 0
for (current_id in unique(long_missingness$id)) {
current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
numyears <- nrow(current_long_missingness)
if (numyears > 1) for(current_year in 2:numyears) {
current_time_period <- current_long_missingness$time_period[current_year]
#// generate an indicator if an observation is still lost to followup
#// ie. was lost to followup in the previous year and still (obviously) lost to followup now
# Stata code:
#gen still_lost_to_followup = 0
#by `idvar': replace still_lost_to_followup = 1 if lost_to_followup & lost_to_followup[_n-1]
indicator <- ifelse(current_long_missingness$lost_to_followup[current_year]==1
& current_long_missingness$lost_to_followup[current_year-1]==1,
1,
0)
long_missingness$still_lost_to_followup <- ifelse(long_missingness$id==current_id &
long_missingness$time_period==current_time_period,
indicator,
long_missingness$still_lost_to_followup)
}
}
答案 0 :(得分:1)
我认为这将为您提供更快,更简单的解决方案。这是使用tidyverse
完成的,它应该非常快,直到一定数量的观察。
missingness <- read.table(text =
"Var2001 Var2002 Var2003
1 1 1
1 NA NA
1 NA 1
NA NA 1
NA 1 NA", header = TRUE)
library(tidyverse)
library(stringr)
missingness %>%
rownames_to_column('id') %>%
gather(year, value,-id) %>%
mutate(year = str_extract(year, '[0-9]{4}')) %>%
group_by(id) %>%
mutate(
missing_this_year = as.integer(is.na(value)),
last_non_missing = coalesce(max(year[!is.na(value)]), max(year)),
lost_to_followup = as.integer(year > last_non_missing),
still_lost_to_followup = as.integer(lost_to_followup &
lag(lost_to_followup))
) %>%
arrange(id, year)
#> # A tibble: 15 x 7
#> # Groups: id [5]
#> id year value missing_this_year last_non_missing lost_to_followup still_lost_to_followup
#> <chr> <chr> <int> <int> <chr> <int> <int>
#> 1 1 2001 1 0 2003 0 0
#> 2 1 2002 1 0 2003 0 0
#> 3 1 2003 1 0 2003 0 0
#> 4 2 2001 1 0 2001 0 0
#> 5 2 2002 NA 1 2001 1 0
#> 6 2 2003 NA 1 2001 1 1
#> 7 3 2001 1 0 2003 0 0
#> 8 3 2002 NA 1 2003 0 0
#> 9 3 2003 1 0 2003 0 0
#> 10 4 2001 NA 1 2003 0 0
#> 11 4 2002 NA 1 2003 0 0
#> 12 4 2003 1 0 2003 0 0
#> 13 5 2001 NA 1 2002 0 0
#> 14 5 2002 1 0 2002 0 0
#> 15 5 2003 NA 1 2002 1 0