数据看起来像
cum_ft source
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
123.1018 imds
125.4585 imds
125.4585 imds
125.4585 imds
0.0000 maint
0.0000 maint
0.0000 maint
126.7622 imds
126.7622 imds
126.7622 imds
目标是将maint的值设置为imds的最后一个值
cum_ft source
123.1018 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 imds
125.4585 maint
125.4585 maint
125.4585 maint
126.7622 imds
126.7622 imds
126.7622 imds
我正在尝试,但没有成功,像是
maint_rows_to_change = which(temp_df$source=="maint")
diff_maint_row_to_change = diff(maint_rows_to_change)
imds_rows_with_data = which(temp_df$source=="imds")
diff_imds_row_to_change = diff(imds_rows_with_data)
rows_to_change_increment = which(diff_update_row > 1)
此时,当存在要跳过的imsl数据时,diff_maint_row_to_change的数字大于1,而当存在必须调整的连续维护行时,diff_maint_row_to_change的值为1。调整是将维护行的cum_ft值设置为imsl数据的最后一个值。
我想写的是类似下面的表达式,但我不清楚如何提出last_imds_row。在这个例子中,maint_rows_to_change = c(11,12,13)和last_imds_row = c(10,10,10)。
temp_df$cum_ft[maint_rows_to_change] = temp_df$cum_ft[last_imds_row]
我也尝试了一个循环,取得了一些成功,但需要很长时间
fun1 <- function(z) {
z$cum_ft_cor = z$cum_ft
rows_to_fix = which(z$source=="maint")
z$cum_ft_cor[rows_to_fix]=-1
for(i in rows_to_fix) {
z$cum_ft_cor[i] <- z$cum_ft_cor[i-1]
}
z
}
temp_df_2 = fun1(temp_df)
答案 0 :(得分:2)
一种选择是使用Rcpp包更快地制作循环解决方案:
library(Rcpp)
copyDat <- cppFunction(
'void copyDat(NumericVector x, std::vector<std::string> y) {
for (int i=1; i < y.size(); ++i) {
if (y[i] == "maint") x[i] = x[i-1];
}
}')
然后你可以这样做:
copyDat(temp_df$cum_ft, as.character(temp_df$source))
temp_df
# cum_ft source
# 1 125.4585 imds
# 2 125.4585 imds
# 3 125.4585 imds
# 4 125.4585 imds
# 5 125.4585 imds
# 6 125.4585 imds
# 7 123.1018 imds
# 8 125.4585 imds
# 9 125.4585 imds
# 10 125.4585 imds
# 11 125.4585 maint
# 12 125.4585 maint
# 13 125.4585 maint
# 14 126.7622 imds
# 15 126.7622 imds
# 16 126.7622 imds
在一个拥有130万行的示例中,Rcpp解决方案比评论中发布的动物园解决方案快6倍(尽管两者都非常快):
# Functions to benchmark
josilber <- function(temp_df) {
copyDat(temp_df$cum_ft, as.character(temp_df$source))
temp_df
}
library(zoo)
darenburg <- function(temp_df) {
temp_df[temp_df$source == "maint", "cum_ft"] <- NA
temp_df$cum_ft <- na.locf(temp_df$cum_ft)
temp_df
}
# Do the test
library(microbenchmark)
temp_df <- data.frame(cum_ft=rnorm(1300000),
source=rep(c(rep("imds", 10), rep("maint", 3)), 100000))
all.equal(josilber(temp_df), darenburg(temp_df))
# [1] TRUE
microbenchmark(josilber(temp_df), darenburg(temp_df))
# Unit: milliseconds
# expr min lq median uq max neval
# josilber(temp_df) 78.05012 83.80206 86.96831 92.56959 122.5809 100
# darenburg(temp_df) 464.33525 492.76668 510.65864 541.43435 703.6944 100