是否有一种计算上更快的方式来进行此数据转换?

时间:2019-09-30 15:31:56

标签: r for-loop lapply

我真的很努力地在不借助循环的情况下实现这种转换。我的数据框采用长格式,因此每年,年级和学校组合都有自己的行。

我想将每年+年级+学校的值除以前一年和年级的值。下面的代码应该为我完成此操作,但是如果有50万行,则需要几天的时间才能完成。

关于如何更快地执行此操作的任何想法?

我曾尝试使用dplyr,但没有成功。与标准基础R转换方法相同。

for (i in 1:NROW(df)) {
  for (j in 1:NROW(df)) {
    if(df$COUNTY[i] == df$COUNTY[j] & 
       df$YEAR[i] == (df$YEAR[j] + 1) & 
       df$Grade[i] == (df$Grade[j] + 1)){

      df$RATE[i] <- df$value[i] / df$value[j]

    } else{

      next

    }
  }
  if(i %% 10 == 0){print(i)}
}

数据:

structure(list(YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 
2012, 2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 
2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 0, 
1, 2, 3, -1, 0, 1, 2, 3), COUNTY = structure(c(1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), .Label = c("001", "002", "003", "004", "005", "006", "007", 
"008", "009", "010", "011", "012", "013", "014", "015", "016", 
"017", "018", "019", "020", "021", "022", "023", "024", "025", 
"026", "027", "028", "029", "030", "031", "032", "033", "034", 
"035", "036", "037", "038", "039", "040", "041", "042", "043", 
"044", "045", "046", "047", "048", "049", "050", "051", "052", 
"053", "054", "055", "056", "057", "058", "059", "060", "061", 
"062", "063", "064", "065", "066", "067", "068", "069", "070", 
"071", "072", "073", "074", "075", "076", "077", "078", "079", 
"080", "081", "082", "083", "084", "085", "086", "087", "088", 
"089", "090", "091", "092", "093", "094", "095", "096", "097", 
"098", "099", "100", "101", "102", "103", "104", "105", "106", 
"107", "108", "109", "110", "111", "112", "113", "114", "115", 
"126", "145", "166", "201", "347", "401", "640", "KCS"), class = "factor"), 
    value = c(178, 212, 208, 208, 242, 199, 230, 227, 208, 208, 
    187, 245, 235, 216, 204, 189, 235, 250, 226, 217)), row.names = c(NA, 
-20L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), .internal.selfref = <pointer: 0x000001d7929a1ef0>, groups = structure(list(
    YEAR = c(2011, 2011, 2011, 2011, 2011, 2012, 2012, 2012, 
    2012, 2012, 2013, 2013, 2013, 2013, 2013, 2014, 2014, 2014, 
    2014, 2014), Grade = c(-1, 0, 1, 2, 3, -1, 0, 1, 2, 3, -1, 
    0, 1, 2, 3, -1, 0, 1, 2, 3), .rows = list(1L, 2L, 3L, 4L, 
        5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 
        17L, 18L, 19L, 20L)), row.names = c(NA, -20L), class = c("tbl_df", 
"tbl", "data.frame")))

4 个答案:

答案 0 :(得分:2)

使用data.table

library(data.table)
setDT(df)[order(YEAR),  Ratio := value/shift(value) , .(COUNTY, Grade)]

答案 1 :(得分:1)

最简单的方法是消除内部for-loop并使用向量化函数。

示例:

df$RATE2 <- 0
for(i in seq(nrow(df))){
    indx <- which(df$COUNTY[i] == df$COUNTY & 
                  df$YEAR[i] == (df$YEAR + 1) & 
                  df$Grade[i] == (df$Grade + 1))
    if((n <- length(indx)) > 1)
        stop("Error, rowcount too great!")
    else if(n == 1)
        df$RATE2[i] <- df$value[i] / df$value[indx]
}
all.equal(df$RATE, df$RATE2)
[1] TRUE

请注意,&将对向量中的所有元素执行比较,因此逻辑语句将为数据帧中的每一行返回TRUEFALSE。为了方便起见(通常不浪费太多时间),我使用which将其转换为索引向量,并且如果长度仅为1(不覆盖多个元素),则覆盖{{1} }和适当的索引。

比较:

RATE2

从中值时间可以看到,我们现在仅使用原始时间的microbenchmark:::microbenchmark(original = { df$RATE <- 0 for (i in 1:NROW(df)) { for (j in 1:NROW(df)) { if(df$COUNTY[i] == df$COUNTY[j] && df$YEAR[i] == (df$YEAR[j] + 1) && df$Grade[i] == (df$Grade[j] + 1)){ df$RATE[i] <- df$value[i] / df$value[j] } } } }, improved = { df$RATE2 <- 0 for(i in seq(nrow(df))){ indx <- which(df$COUNTY[i] == df$COUNTY & df$YEAR[i] == (df$YEAR + 1) & df$Grade[i] == (df$Grade + 1)) if((n <- length(indx)) > 1) stop("Error, rowcount too great!") else if(n == 1) df$RATE2[i] <- df$value[i] / df$value[indx] } }) #output: Unit: milliseconds expr min lq mean median uq max neval original 15.452877 19.751258 26.944155 23.750028 33.886566 70.93348 100 improved 1.020224 1.221664 2.121345 1.730265 2.311173 17.56658 100 进行计算。

请注意,使用1.73/23.75 * 100 = 7.3 %不会不能加快此过程,这是通过使用矢量化函数来完成的。还要注意,我稍微更改了原始功能的代码以使用apply,并删除了多余的&&部分。这样可以稍微加快此代码版本的速度。

答案 2 :(得分:1)

您可以在dplyr中做到这一点,这应该很快……

library(dplyr)
df <- df %>% group_by(COUNTY, Grade) %>%      #for your df above, but replace with SCHOOL or whatever
             arrange(YEAR) %>%                #sort by increasing year
             mutate(Ratio = value/lag(value)) #value for year / value for previous year

df
    YEAR Grade COUNTY value  Ratio
   <dbl> <dbl> <fct>  <dbl>  <dbl>
 1  2011    -1 001      178 NA    
 2  2011     0 001      212 NA    
 3  2011     1 001      208 NA    
 4  2011     2 001      208 NA    
 5  2011     3 001      242 NA    
 6  2012    -1 001      199  1.12 
 7  2012     0 001      230  1.08 
 8  2012     1 001      227  1.09 
 9  2012     2 001      208  1    
10  2012     3 001      208  0.860
11  2013    -1 001      187  0.940
12  2013     0 001      245  1.07 
13  2013     1 001      235  1.04 
14  2013     2 001      216  1.04 
15  2013     3 001      204  0.981
16  2014    -1 001      189  1.01 
17  2014     0 001      235  0.959
18  2014     1 001      250  1.06 
19  2014     2 001      226  1.05 
20  2014     3 001      217  1.06 

答案 3 :(得分:1)

只需使用向量化的ifelse移列即可。下面假设 COUNTY 是一个因子变量(不是字符):

# SHIFT COLUMNS FORWARD
df$COUNTY_SHIFT <- factor(levels(df$COUNTY)[c(0, df$COUNTY[1:(nrow(df)-1)])]
df$YEAR_SHIFT <- c(NA, df$YEAR[1:(nrow(df)-1)])
df$Grade_SHIFT <- c(NA, df$Grade[1:(nrow(df)-1)])
df$value_SHIFT <- c(NA, df$value[1:(nrow(df)-1)])

# CONDITIONALLY ASSIGN
df$RATE <- ifelse(df$COUNTY == df$COUNTY_SHIFT & 
                  df$YEAR == df$YEAR_SHIFT & 
                  df$Grade == df$Grade_SHIFT,
                  df$value / df$value_SHIFT,
                  NA)

或全部在within上下文中:

df <- within(df, {
         # SHIFT COLUMNS FORWARD
         COUNTY_SHIFT <- factor(levels(COUNTY)[c(0, COUNTY[1:(nrow(df)-1)])]
         YEAR_SHIFT <- c(NA, YEAR[1:(nrow(df)-1)])
         Grade_SHIFT <- c(NA, Grade[1:(nrow(df)-1)])
         value_SHIFT <- c(NA, value[1:(nrow(df)-1)])

         # CONDITIONALLY ASSIGN
         RATE <- ifelse(COUNTY == COUNTY_SHIFT & 
                        YEAR == YEAR_SHIFT & 
                        Grade == Grade_SHIFT,
                        value / value_SHIFT,
                        NA)
         # REMOVE HELPER COLUMNS
         rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})

或者,在移位的数据帧上合并:

df$ID <- 1:nrow(df)
shifted_df <- merge(transform(df, ID=ID-1), df[-1,], by="ID", suffixes=c("", "_SHIFT"))

final_df <- within(shifted_df , {   
         # CONDITIONALLY ASSIGN
         RATE <- ifelse(COUNTY == COUNTY_SHIFT & 
                        YEAR == YEAR_SHIFT & 
                        Grade == Grade_SHIFT,
                        value / value_SHIFT,
                        NA)
         # REMOVE HELPER COLUMNS
         rm(COUNTY_SHIFT, YEAR_SHIFT, Grade_SHIFT, value_SHIFT)
})