R-简化代码以遍历记录和样本因子

时间:2019-01-22 18:54:28

标签: r loops random

我正在寻找一种简化代码(并使之更有效)的方法。我的代码遍历表中的记录。如果记录的年龄不为4,则会从池中随机抽取具有相同年龄的记录,并应用因子。另外,年龄会增加1。当年龄达到4岁时,它就会停止。

我给定的数据

set.seed(777)

pool <- data.frame(ID = 1:10,
                   Age = sample(1:4, 10, replace = TRUE),
                   Amt = round(runif(10, 0, 10)*100,0),
                   Factor = round(runif(10, 0.5, 2), 2))

tgt <- pool[sample(nrow(pool), 2, TRUE), 1:3]

代码循环遍历tgt的记录,并应用一个随机因子直到年龄达到4。

repeat{
  for (i in 1:nrow(tgt)) {
    age.i <- tgt[i, 'Age']
    if(age.i < 4) {
      pool.i <- subset(pool, Age == age.i)
      factor.i <- pool.i[sample(nrow(pool.i), 1), 'Factor']
      tgt <- tgt %>%
        mutate(Age = ifelse(ID == tgt[i, 'ID'], Age + 1, Age),
               Amt = ifelse(ID == tgt[i, 'ID'], Amt * factor.i, Amt))
    }
  }
  if(min(tgt$Age) == 4) {
    break
  }
}

在此循环中,它:(1)选择一条记录,(2)从pool中选择一条具有相同年龄的记录,(3)将因子应用于金额并将年龄增加1。这继续直到tgt中的所有记录的年龄都为4。

使用我给定的代码,看看,结果是

ID Age      Amt
 9   4  352.000
 8   4 2101.784

2 个答案:

答案 0 :(得分:1)

对于while循环中的每一行,您可以将代码缩减为for循环:

for(i in 1:nrow(tgt)){
  while(tgt[i, 'Age'] < 4){

    rows_same_age = which(pool[,'Age'] == tgt[i,'Age'])            # sample a row with the same age
    factor_to_multiply = pool[sample(which_same_age, 1), "Factor"] # find the factor value for that row

    tgt[i, 'Amt'] = tgt[i, 'Amt'] * factor_to_multiply # multiply amount by factor
    tgt[i, 'Age'] = tgt[i, 'Age'] + 1                  # add 1 to age

    }
}

使用while循环意味着只要满足条件,就不必直接指定break语句。

答案 1 :(得分:1)

为了提高效率,我在这里和那里使用for重写了.subset2循环:

repeat{
    for (i in 1:nrow(tgt)) {
      age.i <- .subset2(tgt,2L)[i]
      if(age.i < 4) {
        ID <- .subset2(tgt,1L)
        id.i <- ID[i]
        index.i <- .subset2(pool, 2L) == age.i
        factor.i <- .subset2(pool, 4L)[index.i][sample(sum(index.i), 1)]
        tgt[ID == id.i,] <- transform(tgt, Age = Age + 1, Amt = Amt * factor.i)[ID == id.i,]
        next
      }
  } 
  if(min(tgt$Age) == 4) break
}
tgt
#   ID Age      Amt
# 9  9   4  352.000
# 8  8   4 2101.784

在更大的数据帧(pool <-> 100 rowstgt <-> 75 rows)上,我获得了大约60%的循环速度。以下是基准测试数据:

基准测试结果

# 100 times
# Unit: milliseconds
#     expr      min       lq      mean   median        uq       max neval cld
# old_loop 89.40558 93.69668 101.68928 96.73567 102.45847 166.89514   100   b
# new_loop 30.32833 32.99900  34.37742 33.96648  35.39198  56.01109   100  a

# 1000 times
# Unit: milliseconds
#     expr      min       lq      mean    median        uq      max neval cld
# old_loop 88.21493 96.23644 106.43853 100.00970 110.21998 228.6108  1000   b
# new_loop 29.79882 33.39595  36.97823  35.36317  37.98608 104.7572  1000  a  

基准代码

n <- 100L
m <- 75L
microbenchmark::microbenchmark(
  'old_loop' = {
    repeat{
      for (i in 1:nrow(tgt)) {
        age.i <- tgt[i, 'Age']
        if(age.i < 4) {
          pool.i <- subset(pool, Age == age.i)
          factor.i <- pool.i[sample(nrow(pool.i), 1), 'Factor']
          tgt <- tgt %>%
            mutate(Age = ifelse(ID == tgt[i, 'ID'], Age + 1, Age),
                   Amt = ifelse(ID == tgt[i, 'ID'], Amt * factor.i, Amt))
        }
      }
      if(min(tgt$Age) == 4) {
        break
      }
    }
  }, 
  'new_loop' = {
    repeat{
      for (i in 1:nrow(tgt)) {
        age.i <- .subset2(tgt,2L)[i]
        if(age.i < 4) {
          ID <- .subset2(tgt,1L)
          id.i <- ID[i]
          index.i <- .subset2(pool, 2L) == age.i
          factor.i <- .subset2(pool, 4L)[index.i][sample(sum(index.i), 1)]
          tgt[ID == id.i,] <- transform(tgt, Age = Age + 1, Amt = Amt * factor.i)[ID == id.i,]
          next
        }
      } 
      if(min(tgt$Age) == 4) break
    }
  }, 
  setup = {
    set.seed(777)
    pool <- data.frame(ID = 1:n,
                       Age = sample(1:4, n, replace = TRUE),
                       Amt = round(runif(n, 0, 10)*100,0),
                       Factor = round(runif(n, 0.5, 2), 2))
    tgt <- pool[sample(nrow(pool), m, TRUE), 1:3]
  }, times = 10^2)