我想通过范围或虚拟列加入两个元组。但似乎by
- 参数只允许处理现有列名的chr
或vector(chr)
。
在我的示例中,我有一个包含列d
的t value
和一个包含r
和from
列的t to
。
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
现在,我想加入value
d
范围内from
以及to
r
范围内的d %>% inner_join(r, by = "value between from and to") # >= and <
列:
floor
我找不到办法执行此操作,因此决定加入value
中的d
from
r
列d %>% inner_join(r, by = c("floor(value)" = "from"))
列d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
}
append
当然我可以创建第二列来解决这个问题:
insert
但是有没有更舒适的方式?
由于
答案 0 :(得分:4)
我不认为不等式连接已在dplyr
中实现,或者它将永远实现(请参阅Join on inequality constraints上的讨论),但这是一个使用SQL连接的好方法:
library(tibble)
library(sqldf)
as.tibble(sqldf("select d.value, r.class from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
或者,如果您想将联接集成到dplyr
链中,可以使用fuzzyjoin::fuzzy_join
:
library(dplyr)
library(fuzzyjoin)
d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"),
match_fun = list(`>=`, `<`)) %>%
select(value, class)
<强>结果:强>
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
注意我在from
和to
周围添加了单引号,因为它们是SQL语言的保留字。
答案 1 :(得分:2)
好的感谢您的建议,这非常有趣。我最后编写了一个函数range_join
(受@ ycw代码启发),并根据运行时对所有描述的解决方案进行了比较。
我喜欢fuzzy_join,但d
只有50k行,需要超过40秒。那太慢了。
此处结果为d
library(dplyr)
library(fuzzyjoin)
library(sqldf)
#join by range by @WiWeber
range_join <- function(x, y, value, left, right){
x_result <- tibble()
for (y_ in split(y, 1:nrow(y)))
x_result <- x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
return(x_result)
}
#dynamic join by @ycw
dynamic_join <- function(d, r){
d$type <- NA_character_
for (r_ in split(r, r$type))
d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
return(d)
}
d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
# @useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Sys.time() - start
# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
select(value, type)
Sys.time() - start
# @jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start
# @WiWeber floor
start <- Sys.time()
d2 <- d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, type)
Sys.time() - start
# @WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
inner_join(r, by = "join") %>%
filter(value >= from, value < to) %>%
select(value, type)
Sys.time() - start
# @hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
mutate(
type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
) %>%
select(value, type)
Sys.time() - start
# @ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start
# @WiWeber range_join
start <- Sys.time()
d2 <- d %>%
range_join(r, "value", "from", "to") %>%
select(value, type)
Sys.time() - start
结果:
# @useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# @jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# @WiWeber floor
Time difference of 0.02223396 secs
# @WiWeber cross join - filter
Time difference of 0.0201931 secs
# @hardik-gupta sapply
Time difference of 5.166633 secs
# @ycw dynamic join
Time difference of 0.03124094 secs
# @WiWeber range_join
Time difference of 0.02691698 secs
greez WiWeber
答案 2 :(得分:1)
您可以使用剪切功能创建一个&#34;类&#34;在对象d中然后使用左连接。
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
要获得正确的存储桶,您只需使用剪切功能即可获得所需内容。
答案 3 :(得分:1)
我们可以使用mutate
中的case_when
和dplyr
。
library(dplyr)
d2 <- d %>%
mutate(class = case_when(
value >= 1 & value < 2 ~ "A",
value >= 2 & value < 3 ~ "B",
value >= 3 & value < 4 ~ "C",
value >= 4 & value < 5 ~ "D",
value >= 5 & value < 6 ~ "E",
value >= 6 ~ "F"
))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
<强>更新强>
以下是为此任务定义函数的解决方法。
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
library(dplyr)
# Define a function for dynamic join
dynamic_join <- function(d, r){
if (!("class" %in% colnames(d))){
d[["class"]] <- NA_character_
}
d <- d %>%
mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
return(d)
}
re_dynamic_join <- function(d, r){
r_list <- split(r, r$class)
for (i in 1:length(r_list)){
d <- dynamic_join(d, r_list[[i]])
}
return(d)
}
# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
答案 4 :(得分:1)
我们可以将sapply
用于此
library(tibble)
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = (unlist(sapply(d$value, function (x) r[which(x >= r$from & x < r$to), "class"]))) ) )
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F
答案 5 :(得分:0)
我真的很喜欢@WiWeber的range_join函数,但是如果记录不在范围内,它将给出一个错误。这是一个修改
library(dplyr)
d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
range_join <- function(x, y, value, left, right){
all_matches <- tibble()
x = as.data.frame(x)
y = as.data.frame(y)
x$index=x[,value]
for (i in 1:nrow(y)){
matches = x %>% filter(index>=y[i,left] & index<= y[i,right])
if (nrow(matches)>0){
all_matches = all_matches %>% bind_rows(matches %>% cbind(y[i,]))
}
}
all_matches = all_matches %>% select(-index)
return(all_matches)
}
data <- d %>%
range_join(r, "value", "from", "to")
data