按R中的组创建指标变量,类似于Stata的“by”

时间:2017-07-27 02:28:16

标签: r stata

我一直在学习R并找到了几种几乎可以满足我想要的资源,但并不完全(或者至少不是我所理解的那样!)

我的代码可以很好地将我的预期结果用于R(我通常使用Stata),但它非常慢,我知道这是因为我强行要求我确信有更聪明的方法做的!

我有一系列指标需要按群组设定并查看每组中的先前值。

以下是我正在使用的代码(有效),其中的一个例子将(希望)显示我的意思。使用这个非常小的样本运行速度足够快,但是当我有很多组,许多观察结果和许多指标时,运行速度非常慢!

提前感谢您的专业知识! 干杯, 西蒙。

# would like to find out several things:
# 1. the year in which an observation is missing
# 2. the last year in which an observation is not missing
# 3. whether someone is lost to followup 
#    (ie. all remaining observations are missing)
# 4. whether someone is STILL lost to followup 
#    (ie. was lost to followup in previous year as well as current year)

# problem: this is very quick and simple in Stata 
#          but takes a VERY long time using this method in R
#          which makes me sure there's a better way!

# read in data
missingness <- read.table(text=
           "Var2001 Var2002 Var2003
           1 1 1
           1 NA NA
           1 NA 1
           NA NA 1
           NA 1 NA", header=TRUE)

vartouse_list <- c(colnames(missingness)[grep("Var",colnames(missingness))])
number_list <- sapply(strsplit(vartouse_list,split="Var", fixed=TRUE), function(x) (x[2]))
missingness_subset <- subset(missingness[, vartouse_list])

# now create an id

# reshape to long

long_missingness <- reshape(missingness_subset,
                            varying = vartouse_list,
                            v.names = "Var",
                            timevar = "time_period",
                            times = number_list,
                            direction = "long")

# sort to looking by id number
long_missingness$time_period <- as.numeric(long_missingness$time_period)
long_missingness <- long_missingness[order(long_missingness$id, long_missingness$time_period),]

# find if missing this year
criteria <- paste0("long_missingness","$","Var")
long_missingness$missing_this_year <- ifelse(is.na(long_missingness$Var),1,0)

# list of non-missing time periods
long_missingness$time_period_not_missing <- ifelse(long_missingness$missing_this_year==0,
                                                   long_missingness$time_period,
                                                   NA)

# find last observed data
long_missingness$last_non_missing <- min(long_missingness$time_period)
for (current_id in unique(long_missingness$id)) {
  current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
  indicator = max(current_long_missingness$time_period_not_missing,na.rm=TRUE)
  long_missingness$last_non_missing <- ifelse(long_missingness$id==current_id,
                                              indicator,
                                              long_missingness$last_non_missing)
}

# year first lost to followup
long_missingness$lost_to_followup_year <- long_missingness$last_non_missing + 1 

# generate an indicator for lost to followup
# for each individual, they're lost to followup if:
# (data is missing this year AND the current year is >= the year indicated as lost to followup)
# OR
# they were lost to followup in the previous year (by definition)
long_missingness$lost_to_followup = 0
long_missingness$lost_to_followup = ifelse(long_missingness$missing_this_year==1 & 
                                             long_missingness$time_period >= 
                                             long_missingness$lost_to_followup_year,
                                           1,
                                           0)

# now will work out if an observation is still lost to followup
long_missingness$still_lost_to_followup <- 0

for (current_id in unique(long_missingness$id)) {
  current_long_missingness <- long_missingness[which(long_missingness$id==current_id),]
  numyears <- nrow(current_long_missingness)
  if (numyears > 1) for(current_year in 2:numyears) {
    current_time_period <- current_long_missingness$time_period[current_year]

    #// generate an indicator if an observation is still lost to followup
    #// ie. was lost to followup in the previous year and still (obviously) lost to followup now
    # Stata code:
        #gen still_lost_to_followup = 0
        #by `idvar': replace still_lost_to_followup = 1 if lost_to_followup & lost_to_followup[_n-1]
    indicator <- ifelse(current_long_missingness$lost_to_followup[current_year]==1 
                        & current_long_missingness$lost_to_followup[current_year-1]==1,
                        1,
                        0)
    long_missingness$still_lost_to_followup <- ifelse(long_missingness$id==current_id & 
                                                        long_missingness$time_period==current_time_period,
                                                      indicator,
                                                      long_missingness$still_lost_to_followup)

  }

}

1 个答案:

答案 0 :(得分:1)

我认为这将为您提供更快,更简单的解决方案。这是使用tidyverse完成的,它应该非常快,直到一定数量的观察。

missingness <- read.table(text =
                              "Var2001 Var2002 Var2003
                          1 1 1
                          1 NA NA
                          1 NA 1
                          NA NA 1
                          NA 1 NA", header = TRUE)

library(tidyverse)   
library(stringr)

missingness %>%
    rownames_to_column('id') %>%
    gather(year, value,-id) %>%
    mutate(year = str_extract(year, '[0-9]{4}')) %>%
    group_by(id) %>%
    mutate(
        missing_this_year      = as.integer(is.na(value)),
        last_non_missing       = coalesce(max(year[!is.na(value)]), max(year)),
        lost_to_followup       = as.integer(year > last_non_missing),
        still_lost_to_followup = as.integer(lost_to_followup &
                                                lag(lost_to_followup))
    ) %>% 
    arrange(id, year)

#> # A tibble: 15 x 7
#> # Groups:   id [5]
#>       id  year value missing_this_year last_non_missing lost_to_followup still_lost_to_followup
#>    <chr> <chr> <int>             <int>            <chr>            <int>                  <int>
#>  1     1  2001     1                 0             2003                0                      0
#>  2     1  2002     1                 0             2003                0                      0
#>  3     1  2003     1                 0             2003                0                      0
#>  4     2  2001     1                 0             2001                0                      0
#>  5     2  2002    NA                 1             2001                1                      0
#>  6     2  2003    NA                 1             2001                1                      1
#>  7     3  2001     1                 0             2003                0                      0
#>  8     3  2002    NA                 1             2003                0                      0
#>  9     3  2003     1                 0             2003                0                      0
#> 10     4  2001    NA                 1             2003                0                      0
#> 11     4  2002    NA                 1             2003                0                      0
#> 12     4  2003     1                 0             2003                0                      0
#> 13     5  2001    NA                 1             2002                0                      0
#> 14     5  2002     1                 0             2002                0                      0
#> 15     5  2003    NA                 1             2002                1                      0