R dplyr按范围或虚拟列连接

时间:2017-10-17 17:14:25

标签: r dplyr

我想通过范围或虚拟列加入两个元组。但似乎by - 参数只允许处理现有列名的chrvector(chr)

在我的示例中,我有一个包含列d的t value和一个包含rfrom列的t to

d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])

> d
# A tibble: 26 x 1
   value
   <dbl>
 1   1.0
 2   1.2
 3   1.4
 4   1.6
 5   1.8
 6   2.0
 7   2.2
 8   2.4
 9   2.6
10   2.8
# ... with 16 more rows

> r
# A tibble: 6 x 3
   from    to class
  <int> <dbl> <chr>
1     1     2     A
2     2     3     B
3     3     4     C
4     4     5     D
5     5     6     E
6     6   Inf     F

现在,我想加入value d范围内from以及to r范围内的d %>% inner_join(r, by = "value between from and to") # >= and < 列:

floor

我找不到办法执行此操作,因此决定加入value中的d from rd %>% inner_join(r, by = c("floor(value)" = "from")) d %>% mutate(join_value = floor(value)) %>% inner_join(r, by = c("join_value" = "from")) %>% select(value, class) # A tibble: 26 x 2 value class <dbl> <chr> 1 1.0 A 2 1.2 A 3 1.4 A 4 1.6 A 5 1.8 A 6 2.0 B 7 2.2 B 8 2.4 B 9 2.6 B 10 2.8 B # ... with 16 more rows }

append

当然我可以创建第二列来解决这个问题:

insert

但是有没有更舒适的方式?

由于

6 个答案:

答案 0 :(得分:4)

我不认为不等式连接已在dplyr中实现,或者它将永远实现(请参阅Join on inequality constraints上的讨论),但这是一个使用SQL连接的好方法:

library(tibble)
library(sqldf)

as.tibble(sqldf("select d.value, r.class from d
                join r on d.value >= r.'from' and 
                          d.value < r.'to'"))

或者,如果您想将联接集成到dplyr链中,可以使用fuzzyjoin::fuzzy_join

library(dplyr)
library(fuzzyjoin)

d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"), 
             match_fun = list(`>=`, `<`)) %>%
  select(value, class)

<强>结果:

# A tibble: 31 x 2
   value class
   <dbl> <chr>
 1   1.0     A
 2   1.2     A
 3   1.4     A
 4   1.6     A
 5   1.8     A
 6   2.0     A
 7   2.0     B
 8   2.2     B
 9   2.4     B
10   2.6     B
# ... with 21 more rows

注意我在fromto周围添加了单引号,因为它们是SQL语言的保留字。

答案 1 :(得分:2)

好的感谢您的建议,这非常有趣。我最后编写了一个函数range_join(受@ ycw代码启发),并根据运行时对所有描述的解决方案进行了比较。

我喜欢fuzzy_join,但d只有50k行,需要超过40秒。那太慢了。

此处结果为d

中的5k行
library(dplyr)
library(fuzzyjoin)
library(sqldf)

#join by range by @WiWeber
range_join <- function(x, y, value, left, right){
  x_result <- tibble()
  for (y_ in split(y, 1:nrow(y)))
    x_result <-  x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
  return(x_result)
}

#dynamic join by @ycw
dynamic_join <- function(d, r){
  d$type <- NA_character_
  for (r_ in split(r, r$type))
    d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
  return(d)
}

d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)

# @useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
                join r on d.value >= r.'from' and 
                d.value < r.'to'"))
Sys.time() - start

# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
  fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
  select(value, type)
Sys.time() - start


# @jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start

# @WiWeber floor
start <- Sys.time()
d2 <- d %>% 
  mutate(join_value = floor(value)) %>% 
  inner_join(r, by = c("join_value" = "from")) %>% 
  select(value, type)
Sys.time() - start

#  @WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
  inner_join(r, by = "join") %>% 
  filter(value >= from, value < to) %>%
  select(value, type)
Sys.time() - start

# @hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
  mutate(
    type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
  ) %>% 
  select(value, type)
Sys.time() - start

# @ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start

# @WiWeber range_join
start <- Sys.time()
d2 <- d %>% 
  range_join(r, "value", "from", "to") %>%
  select(value, type)
Sys.time() - start

结果:

# @useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs

# @useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs

# @jonathande4 cut pretty fast
Time difference of 0.004637003 secs

# @WiWeber floor
Time difference of 0.02223396 secs

# @WiWeber cross join - filter
Time difference of 0.0201931 secs

# @hardik-gupta sapply
Time difference of 5.166633 secs

# @ycw dynamic join
Time difference of 0.03124094 secs

# @WiWeber range_join
Time difference of 0.02691698 secs

greez WiWeber

答案 2 :(得分:1)

您可以使用剪切功能创建一个&#34;类&#34;在对象d中然后使用左连接。

d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])

d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)

要获得正确的存储桶,您只需使用剪切功能即可获得所需内容。

答案 3 :(得分:1)

我们可以使用mutate中的case_whendplyr

library(dplyr)

d2 <- d %>%
  mutate(class = case_when(
    value >= 1 & value < 2 ~ "A",
    value >= 2 & value < 3 ~ "B",
    value >= 3 & value < 4 ~ "C",
    value >= 4 & value < 5 ~ "D",
    value >= 5 & value < 6 ~ "E",
    value >= 6             ~ "F"
  ))
d2
# A tibble: 26 x 2
   value class
   <dbl> <chr>
 1   1.0     A
 2   1.2     A
 3   1.4     A
 4   1.6     A
 5   1.8     A
 6   2.0     B
 7   2.2     B
 8   2.4     B
 9   2.6     B
10   2.8     B
# ... with 16 more rows

<强>更新

以下是为此任务定义函数的解决方法。

d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])

library(dplyr)

# Define a function for dynamic join
dynamic_join <- function(d, r){

  if (!("class" %in% colnames(d))){
    d[["class"]] <- NA_character_
  }

  d <- d %>%
    mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
  return(d)
}

re_dynamic_join <- function(d, r){
  r_list <- split(r, r$class)
  for (i in 1:length(r_list)){
    d <- dynamic_join(d, r_list[[i]])
  }
  return(d)
}

# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
   value class
   <dbl> <chr>
 1   1.0     A
 2   1.2     A
 3   1.4     A
 4   1.6     A
 5   1.8     A
 6   2.0     B
 7   2.2     B
 8   2.4     B
 9   2.6     B
10   2.8     B
# ... with 16 more rows

答案 4 :(得分:1)

我们可以将sapply用于此

library(tibble)

d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = (unlist(sapply(d$value, function (x) r[which(x >= r$from & x < r$to), "class"]))) ) )

d
   value class
1    1.0     A
2    1.2     A
3    1.4     A
4    1.6     A
5    1.8     A
6    2.0     B
7    2.2     B
8    2.4     B
9    2.6     B
10   2.8     B
11   3.0     C
12   3.2     C
13   3.4     C
14   3.6     C
15   3.8     C
16   4.0     D
17   4.2     D
18   4.4     D
19   4.6     D
20   4.8     D
21   5.0     E
22   5.2     E
23   5.4     E
24   5.6     E
25   5.8     E
26   6.0     F 

答案 5 :(得分:0)

我真的很喜欢@WiWeber的range_join函数,但是如果记录不在范围内,它将给出一个错误。这是一个修改

library(dplyr)

d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])


range_join <- function(x, y, value, left, right){
all_matches <- tibble()
x = as.data.frame(x)
y = as.data.frame(y)
x$index=x[,value]
for (i in 1:nrow(y)){
    matches = x %>% filter(index>=y[i,left] & index<= y[i,right])
    if (nrow(matches)>0){
        all_matches = all_matches %>% bind_rows(matches %>% cbind(y[i,]))
    }
}
all_matches = all_matches %>% select(-index)
return(all_matches)
}


data <- d %>% 
range_join(r, "value", "from", "to")

data