我正在寻找一种简化代码(并使之更有效)的方法。我的代码遍历表中的记录。如果记录的年龄不为4,则会从池中随机抽取具有相同年龄的记录,并应用因子。另外,年龄会增加1。当年龄达到4岁时,它就会停止。
我给定的数据
set.seed(777)
pool <- data.frame(ID = 1:10,
Age = sample(1:4, 10, replace = TRUE),
Amt = round(runif(10, 0, 10)*100,0),
Factor = round(runif(10, 0.5, 2), 2))
tgt <- pool[sample(nrow(pool), 2, TRUE), 1:3]
代码循环遍历tgt
的记录,并应用一个随机因子直到年龄达到4。
repeat{
for (i in 1:nrow(tgt)) {
age.i <- tgt[i, 'Age']
if(age.i < 4) {
pool.i <- subset(pool, Age == age.i)
factor.i <- pool.i[sample(nrow(pool.i), 1), 'Factor']
tgt <- tgt %>%
mutate(Age = ifelse(ID == tgt[i, 'ID'], Age + 1, Age),
Amt = ifelse(ID == tgt[i, 'ID'], Amt * factor.i, Amt))
}
}
if(min(tgt$Age) == 4) {
break
}
}
在此循环中,它:(1)选择一条记录,(2)从pool
中选择一条具有相同年龄的记录,(3)将因子应用于金额并将年龄增加1。这继续直到tgt
中的所有记录的年龄都为4。
使用我给定的代码,看看,结果是
ID Age Amt
9 4 352.000
8 4 2101.784
答案 0 :(得分:1)
对于while
循环中的每一行,您可以将代码缩减为for
循环:
for(i in 1:nrow(tgt)){
while(tgt[i, 'Age'] < 4){
rows_same_age = which(pool[,'Age'] == tgt[i,'Age']) # sample a row with the same age
factor_to_multiply = pool[sample(which_same_age, 1), "Factor"] # find the factor value for that row
tgt[i, 'Amt'] = tgt[i, 'Amt'] * factor_to_multiply # multiply amount by factor
tgt[i, 'Age'] = tgt[i, 'Age'] + 1 # add 1 to age
}
}
使用while
循环意味着只要满足条件,就不必直接指定break语句。
答案 1 :(得分:1)
为了提高效率,我在这里和那里使用for
重写了.subset2
循环:
repeat{
for (i in 1:nrow(tgt)) {
age.i <- .subset2(tgt,2L)[i]
if(age.i < 4) {
ID <- .subset2(tgt,1L)
id.i <- ID[i]
index.i <- .subset2(pool, 2L) == age.i
factor.i <- .subset2(pool, 4L)[index.i][sample(sum(index.i), 1)]
tgt[ID == id.i,] <- transform(tgt, Age = Age + 1, Amt = Amt * factor.i)[ID == id.i,]
next
}
}
if(min(tgt$Age) == 4) break
}
tgt
# ID Age Amt
# 9 9 4 352.000
# 8 8 4 2101.784
在更大的数据帧(pool <-> 100 rows
和tgt <-> 75 rows
)上,我获得了大约60%的循环速度。以下是基准测试数据:
基准测试结果
# 100 times
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# old_loop 89.40558 93.69668 101.68928 96.73567 102.45847 166.89514 100 b
# new_loop 30.32833 32.99900 34.37742 33.96648 35.39198 56.01109 100 a
# 1000 times
# Unit: milliseconds
# expr min lq mean median uq max neval cld
# old_loop 88.21493 96.23644 106.43853 100.00970 110.21998 228.6108 1000 b
# new_loop 29.79882 33.39595 36.97823 35.36317 37.98608 104.7572 1000 a
基准代码
n <- 100L
m <- 75L
microbenchmark::microbenchmark(
'old_loop' = {
repeat{
for (i in 1:nrow(tgt)) {
age.i <- tgt[i, 'Age']
if(age.i < 4) {
pool.i <- subset(pool, Age == age.i)
factor.i <- pool.i[sample(nrow(pool.i), 1), 'Factor']
tgt <- tgt %>%
mutate(Age = ifelse(ID == tgt[i, 'ID'], Age + 1, Age),
Amt = ifelse(ID == tgt[i, 'ID'], Amt * factor.i, Amt))
}
}
if(min(tgt$Age) == 4) {
break
}
}
},
'new_loop' = {
repeat{
for (i in 1:nrow(tgt)) {
age.i <- .subset2(tgt,2L)[i]
if(age.i < 4) {
ID <- .subset2(tgt,1L)
id.i <- ID[i]
index.i <- .subset2(pool, 2L) == age.i
factor.i <- .subset2(pool, 4L)[index.i][sample(sum(index.i), 1)]
tgt[ID == id.i,] <- transform(tgt, Age = Age + 1, Amt = Amt * factor.i)[ID == id.i,]
next
}
}
if(min(tgt$Age) == 4) break
}
},
setup = {
set.seed(777)
pool <- data.frame(ID = 1:n,
Age = sample(1:4, n, replace = TRUE),
Amt = round(runif(n, 0, 10)*100,0),
Factor = round(runif(n, 0.5, 2), 2))
tgt <- pool[sample(nrow(pool), m, TRUE), 1:3]
}, times = 10^2)