我想在循环中根据另一个数据集的值来筛选数据。我有两个数据框,分别名为 `x` 和 `mx_long2`。我需要根据数据框 `x` 中 `age` 和 `region` 的值来筛选另一个数据框 `mx_long2`。目前我尝试了 dplyr 的 `filter` 函数和 fuzzyjoin 两种方法,但两者都很慢,因为实际循环的迭代次数接近 2000 万次。有没有更高效的做法?
# Example accounts: one row per account, holding the balance in each arrears
# state (Ars_0..Ars_2), the account age, and its region code.
x <- data.frame(
  Ars_0 = c(1308.56, 5728.84, 2177.82),
  Ars_1 = rep(0, 3),
  Ars_2 = rep(0, 3),
  age = c(13, 31, 43),
  region = c("A", "A", "B"),
  stringsAsFactors = FALSE
)
# Lookup table in long form: each (Seasoning, Region) pair contributes three
# rows (one per `Arrears` value) whose Ars_0..Ars_2 columns hold the numeric
# weights used later in the matrix product.
mx_long2 <- read.table(header = TRUE, text = "
Arrears Ars_0 Ars_1 Ars_2 Seasoning Region
Ars_0 0.985 0.0148 0.0002 mths:36-47 A
Ars_1 0.3816 0.286 0.3317 mths:36-47 A
Ars_2 0.2959 0.0057 0.2524 mths:36-47 A
Ars_0 0.9822 0.0176 0.0002 mths:24-35 A
Ars_1 0.389 0.2753 0.3347 mths:24-35 A
Ars_2 0.3026 0.0334 0.2399 mths:24-35 A
Ars_0 0.9753 0.0243 0.0004 mths:12-23 A
Ars_1 0.4002 0.2592 0.3394 mths:12-23 A
Ars_2 0.3032 0.0208 0.2387 mths:12-23 A
Ars_0 0.8865 0.01332 0.00018 mths:36-47 B
Ars_1 0.34344 0.2574 0.29853 mths:36-47 B
Ars_2 0.26631 0.00513 0.22716 mths:36-47 B
Ars_0 0.88398 0.01584 0.00018 mths:24-35 B
Ars_1 0.3501 0.24777 0.30123 mths:24-35 B
Ars_2 0.27234 0.03006 0.21591 mths:24-35 B
Ars_0 0.87777 0.02187 0.00036 mths:12-23 B
Ars_1 0.36018 0.23328 0.30546 mths:12-23 B
Ars_2 0.27288 0.01872 0.21483 mths:12-23 B
")

# Split the "<label>:<min>-<max>" seasoning band into numeric bounds.
# A regex is used instead of fixed character positions so that bounds with
# one or three digits (e.g. "mths:0-11", "mths:108-119") parse correctly.
# Base R is used so this step does not depend on dplyr being attached.
seasoning <- as.character(mx_long2$Seasoning)
mx_long2$minage <- as.numeric(
  sub("^[^:]*:([0-9]+)-[0-9]+$", "\\1", seasoning)
)
mx_long2$maxage <- as.numeric(
  sub("^[^:]*:[0-9]+-([0-9]+)$", "\\1", seasoning)
)
第一种方法:dplyr 的 `filter`
# For every row of `x`, pick the rows of `mx_long2` whose region matches and
# whose [minage, maxage] band contains the shifted age, then multiply the
# row's balances by that block of weights. Results are collected in `l`,
# one 1 x k matrix per row of `x`.
#
# Invariant work is hoisted out of the loop: the Ars_* columns of both data
# frames are converted to numeric matrices once, and each iteration only
# builds a logical mask instead of running a dplyr pipeline.
state_cols <- grep("^Ars_", names(mx_long2), value = TRUE)
transitions <- data.matrix(mx_long2[state_cols])
balances <- data.matrix(x[state_cols])

l <- vector("list", nrow(x))
for (i in seq_len(nrow(x))) {
  m <- x[["age"]][i]
  r <- x[["region"]][i]

  # NOTE(review): the loop index is added to the age exactly as in the
  # original filter condition -- confirm this offset is intended.
  shifted_age <- i + m

  # which() drops NA comparisons, mirroring how dplyr::filter() treats them.
  keep <- which(
    mx_long2$Region == r &
      mx_long2$minage <= shifted_age &
      mx_long2$maxage >= shifted_age
  )
  Bx <- transitions[keep, , drop = FALSE]
  if (nrow(Bx) == 0L) {
    stop("no rows of mx_long2 match row ", i, " of x", call. = FALSE)
  }

  # Matrix multiplication: (1 x k balance row) %*% (k x k weights).
  # The original used the scalar age as the left operand, which is
  # non-conformable with a k x k matrix; the balance row is assumed to be
  # the intended operand -- TODO confirm. Rows of `Bx` are taken in file
  # order, as before.
  l[[i]] <- balances[i, , drop = FALSE] %*% Bx
}
第二种方法:Fuzzyjoin
library(fuzzyjoin)
# Second approach: a non-equi join that looks up the mx_long2 rows for row
# `i` of `x` by region and age band, then keeps the Ars_* columns as a
# numeric matrix.
# NOTE(review): `i` is not defined in this snippet; presumably it is the
# index of an enclosing loop -- confirm.
# NOTE(review): this matches on `age` itself, whereas the filter-based
# version matches on `i + age` -- confirm which one is intended.
Bx <- fuzzy_inner_join(
  x[i, c("age", "region")],
  mx_long2,
  by = c(
    "age" = "minage",
    "age" = "maxage",
    "region" = "Region"
  ),
  # The upper bound is inclusive: bands are "12-23", "24-35", ..., so an age
  # equal to maxage must still match (a strict `<` would leave it unmatched).
  match_fun = list(`>=`, `<=`, `==`)
) %>%
  select(starts_with("Ars_")) %>%
  data.matrix()