给定一个包含3个变量的矩阵:一个标识符和2个数值变量“A”和“B”。我为变量“A”的值定义了若干区间,为“B”定义了另一些区间。我想提取并存储“A”和“B”的值均落在指定区间内的那些标识符。
例如考虑以下示例:
# Example data: 18 identifiers with two numeric variables, A and B.
DF <- data.frame(
  identifier = 1:18,
  A = rep(c(1, 2, 3, 4, 5, 6), each = 3),
  B = rep(c(11, 12, 13, 14, 15, 16), 3)
)
# Breakpoints for A and B; consecutive values delimit one range each.
interval_a <- c(1, 3, 6)
interval_b <- c(11, 13, 16)
我有一个数据框,想提取满足A在1到3之间且B在11到13之间的标识符;然后提取满足A在1到3之间且B在13到16之间的标识符,依此类推。
我知道这可以通过嵌套的for循环来完成:
# Collect, for every (A range, B range) pair, the identifiers of the rows of DF
# whose A and B values fall inside the half-open ranges [lower, upper).
# The result is a list ordered with A ranges varying slowest and B ranges
# varying fastest.

# Number of ranges defined by each breakpoint vector (0 when fewer than two
# breakpoints are supplied).
n_int_a <- max(length(interval_a) - 1L, 0L)
n_int_b <- max(length(interval_b) - 1L, 0L)

# Preallocate the result instead of growing the list on every iteration.
identifier_list <- vector("list", n_int_a * n_int_b)

# seq_len() yields an empty sequence for zero ranges, whereas 1:(n - 1) would
# count down (1, 0) and append spurious empty entries.
for (i in seq_len(n_int_a)) {
  in_a <- DF$A >= interval_a[i] & DF$A < interval_a[i + 1]
  df_tmp <- DF[which(in_a), ]
  for (j in seq_len(n_int_b)) {
    in_b <- df_tmp$B >= interval_b[j] & df_tmp$B < interval_b[j + 1]
    identifier_list[[(i - 1L) * n_int_b + j]] <-
      df_tmp[which(in_b), "identifier"]
  }
}
但是,如果A和B中要考虑的间隔数量很大,我发现这是不切实际的。有更好的方法吗?
答案 0 :(得分:2)
这是一种data.table方法,该方法依赖于1)创建查找表和2)执行非等值联接(non-equi join)。
library(data.table)
# Example data: 18 identifiers with two numeric variables, A and B.
DF <- data.frame(
  identifier = 1:18,
  A = rep(c(1, 2, 3, 4, 5, 6), each = 3),
  B = rep(c(11, 12, 13, 14, 15, 16), 3)
)
# Breakpoints for A and B; consecutive values delimit one range each.
interval_a <- c(1, 3, 6)
interval_b <- c(11, 13, 16)

# Lookup tables: one row per pair of consecutive breakpoints (start, end).
int_a <- data.frame(
  A_start = interval_a[-length(interval_a)],
  A_end = interval_a[-1L]
)
int_b <- data.frame(
  B_start = interval_b[-length(interval_b)],
  B_end = interval_b[-1L]
)

# Cross join (by = NULL) pairs every A range with every B range; ID then
# numbers the combinations by row.
int_lookup <- merge(int_a, int_b, by = NULL)
setDT(int_lookup)
int_lookup[, ID := .I]
int_lookup
#> A_start A_end B_start B_end ID
#> <num> <num> <num> <num> <int>
#> 1: 1 3 11 13 1
#> 2: 3 6 11 13 2
#> 3: 1 3 13 16 3
#> 4: 3 6 13 16 4
# Turn DF into a data.table, then tag each row with the ID of the lookup row
# whose ranges contain it: start <= value < end for both A and B
# (non-equi update join). Rows matching no range are left with comb_ID = NA.
setDT(DF)
DF[
  int_lookup,
  comb_ID := ID,
  on = .(A >= A_start, A < A_end, B >= B_start, B < B_end)
]
DF
#> identifier A B comb_ID
#> <int> <num> <num> <int>
#> 1: 1 1 11 1
#> 2: 2 1 12 1
#> 3: 3 1 13 3
#> 4: 4 2 14 3
#> 5: 5 2 15 3
#> 6: 6 2 16 NA
#> 7: 7 3 11 2
#> 8: 8 3 12 2
#> 9: 9 3 13 4
#> 10: 10 4 14 4
#> 11: 11 4 15 4
#> 12: 12 4 16 NA
#> 13: 13 5 11 2
#> 14: 14 5 12 2
#> 15: 15 5 13 4
#> 16: 16 6 14 NA
#> 17: 17 6 15 NA
#> 18: 18 6 16 NA
如果最后按comb_ID拆分(split),我们将得到与您的输出类似的列表:
# One data.table per comb_ID value; unmatched rows end up in the `NA` group.
split(DF, by = "comb_ID")
> split(DF, by = 'comb_ID')
$`1`
identifier A B comb_ID
<int> <num> <num> <int>
1: 1 1 11 1
2: 2 1 12 1
$`3`
identifier A B comb_ID
<int> <num> <num> <int>
1: 3 1 13 3
2: 4 2 14 3
3: 5 2 15 3
$`NA`
identifier A B comb_ID
<int> <num> <num> <int>
1: 6 2 16 NA
2: 12 4 16 NA
3: 16 6 14 NA
4: 17 6 15 NA
5: 18 6 16 NA
$`2`
identifier A B comb_ID
<int> <num> <num> <int>
1: 7 3 11 2
2: 8 3 12 2
3: 13 5 11 2
4: 14 5 12 2
$`4`
identifier A B comb_ID
<int> <num> <num> <int>
1: 9 3 13 4
2: 10 4 14 4
3: 11 4 15 4
4: 15 5 13 4
答案 1 :(得分:0)
这是执行此操作的一种方法。我们首先扩展网格以获取A和B范围的每种组合,然后将数据嵌套到条件数据框中,再逐行映射(map)筛选出符合条件的行,最后提取标识符并将其合并为一个变量。
library(tidyverse)
# Build every (A range, B range) combination, then collect, for each
# combination, the identifiers of the rows of DF that fall inside it.
expand_grid(
  # Every ordered pair of breakpoints with lower < upper is one candidate range.
  expand_grid(a1 = interval_a, a2 = interval_a) %>%
    filter(a1 < a2),
  expand_grid(b1 = interval_b, b2 = interval_b) %>%
    filter(b1 < b2)
) %>%
  mutate(
    # Nest a copy of DF in every row so pmap_chr() can filter it row by row.
    data = list(DF),
    identifiers = pmap_chr(
      list(data, a1, a2, b1, b2),
      # Half-open ranges [lower, upper): the lower bound is inclusive, so rows
      # sitting exactly on a lower breakpoint are kept, matching the
      # `>=` / `<` convention used by the loop in the question.
      ~ filter(..1, A >= ..2 & A < ..3 & B >= ..4 & B < ..5) %>%
        pull(identifier) %>%
        paste(collapse = ",")
    )
  ) %>%
  select(-data)
#> # A tibble: 9 x 5
#>      a1    a2    b1    b2 identifiers
#>   <dbl> <dbl> <dbl> <dbl> <chr>
#> 1     1     3    11    13 1,2
#> 2     1     3    11    16 1,2,3,4,5
#> 3     1     3    13    16 3,4,5
#> 4     1     6    11    13 1,2,7,8,13,14
#> 5     1     6    11    16 1,2,3,4,5,7,8,9,10,11,13,14,15
#> 6     1     6    13    16 3,4,5,9,10,11,15
#> 7     3     6    11    13 7,8,13,14
#> 8     3     6    11    16 7,8,9,10,11,13,14,15
#> 9     3     6    13    16 9,10,11,15