使用2个表创建新特征

时间:2017-08-08 01:59:19

标签: r dplyr sqldf

# Per-user product ranges: the first and last order number recorded for each
# product.
table1 <- data.frame(
  user_id     = rep(c(1, 2), times = c(10, 2)),
  product_id  = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
  first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
  last_order  = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5)
)

# Per-user order history with the cumulative day count at each order number.
# as.numeric() keeps order_number a double, like the other columns.
table2 <- data.frame(
  user_id      = rep(c(1, 2), times = c(12, 6)),
  order_number = as.numeric(c(1:12, 1:6)),
  days_cumsum  = c(
    0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91,
    5, 11, 17, 24, 29, 35
  )
)

我想使用table2为table1添加新特征。新特征是每个用户的每个产品的订单间隔(天数)。

例如,让我们看一下table1。第一行有(user_id == 1)、(product_id == 14)、(first_order == 1)和(last_order == 4)。这意味着产品14在订单1、2、3、4中被订购。我们可以在table2中找到这些订单号。新特征是第一个订单和最后一个订单之间的间隔天数。我们可以使用table2中的"days_cumsum",它是自先前订单以来的累计天数。第一行的新特征值是26(= 26 - 0)。

我认为这可以通过连接(join)来完成,但是我无法使用连接,因为每个表实际上都非常大。

所以我在for循环中使用下面这个函数:

# Compute the day gap between the first and last order of the product in
# row `i` of the global `table1`, using the cumulative days in the global
# `table2`.
#
# Args:
#   i: Row index into `table1`.
#
# Returns:
#   When exactly two rows of `table2` match the user and either order number,
#   the `days_cumsum` of the second match minus that of the first (in table2
#   row order). Otherwise the sentinel 999999, e.g. when
#   first_order == last_order and only one row matches.
f <- function(i) {
  user <- table1$user_id[i]
  first <- table1$first_order[i]
  last <- table1$last_order[i]

  # which() ignores NA comparisons instead of producing NA rows.
  hits <- which(
    table2$user_id == user &
      (table2$order_number == first | table2$order_number == last)
  )
  days <- table2$days_cumsum[hits]

  # The condition is a scalar, so plain if/else is used rather than ifelse().
  if (length(days) == 2) {
    days[2] - days[1]
  } else {
    999999
  }
}

它逐行计算每个新的特征值,但速度非常慢,计算量也很大。我经常遇到这类问题(使用两个表创建新特征),而每次都会像这样陷入困境。

有更好的写法吗?期待您的帮助。

2 个答案:

答案 0 :(得分:3)

您能分享一下join版本与循环版本的运行时间/计算时间对比吗?

以下是使用连接的解决方案。

library(tidyverse)

# Coerce both inputs to plain data frames.
df1 <- as.data.frame(table1)
df2 <- as.data.frame(table2)

# Look up days_cumsum twice -- once keyed on first_order, once on last_order --
# naming the looked-up column before each join, then keep the original
# columns plus the difference between the two lookups.
df1 %>%
  left_join(
    rename(df2, dayMin = days_cumsum),
    by = c("user_id" = "user_id", "first_order" = "order_number")
  ) %>%
  left_join(
    rename(df2, dayMax = days_cumsum),
    by = c("user_id" = "user_id", "last_order" = "order_number")
  ) %>%
  mutate(newVar = dayMax - dayMin) %>%
  select(user_id, product_id, first_order, last_order, newVar)

给出:

   user_id product_id first_order last_order newVar
     <dbl>      <dbl>       <dbl>      <dbl>  <dbl>
 1       1         14           1          4     26
 2       1         24           2          7     46
 3       1         38           1          5     34
 4       1         40           4          8     33
 5       1         66           5          8     25
 6       1          2           3          3      0
 7       1         19           2          4     19
 8       1         30           4          7     27
 9       1         71           2          5     27
10       1         98           4          9     40
11       2          7           2          4     13
12       2         16           3          5     12

答案 1 :(得分:2)

作为比较,下面是一些使用data.table的解决方案。

# Sample input: one row per (user, product) with its first and last order
# number.
table1 <- data.frame(
  user_id     = rep(c(1, 2), times = c(10, 2)),
  product_id  = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
  first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
  last_order  = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5)
)

# Sample input: one row per (user, order) with the cumulative day count.
# as.numeric() keeps order_number a double, like the other columns.
table2 <- data.frame(
  user_id      = rep(c(1, 2), times = c(12, 6)),
  order_number = as.numeric(c(1:12, 1:6)),
  days_cumsum  = c(
    0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91,
    5, 11, 17, 24, 29, 35
  )
)

library(data.table)

# Convert both data frames to data.tables; setDT() is called without
# reassignment, and the code below then uses data.table join syntax on them.
setDT(table1)
setDT(table2)

# Print table1; the "#>" lines that follow show the output.
table1
#>     user_id product_id first_order last_order
#>  1:       1         14           1          4
#>  2:       1         24           2          7
#>  3:       1         38           1          5
#>  4:       1         40           4          8
#>  5:       1         66           5          8
#>  6:       1          2           3          3
#>  7:       1         19           2          4
#>  8:       1         30           4          7
#>  9:       1         71           2          5
#> 10:       1         98           4          9
#> 11:       2          7           2          4
#> 12:       2         16           3          5
# Print table2; the "#>" lines that follow show the output.
table2
#>     user_id order_number days_cumsum
#>  1:       1            1           0
#>  2:       1            2           7
#>  3:       1            3          15
#>  4:       1            4          26
#>  5:       1            5          34
#>  6:       1            6          43
#>  7:       1            7          53
#>  8:       1            8          59
#>  9:       1            9          66
#> 10:       1           10          74
#> 11:       1           11          82
#> 12:       1           12          91
#> 13:       2            1           5
#> 14:       2            2          11
#> 15:       2            3          17
#> 16:       2            4          24
#> 17:       2            5          29
#> 18:       2            6          35

# Inner join keyed on (user_id, first_order): the matched days_cumsum becomes
# dayMin.
DayMin <- table1[
  table2,
  on = c("user_id", first_order = "order_number"),
  nomatch = 0
]
setnames(DayMin, old = "days_cumsum", new = "dayMin")

# Same join keyed on (user_id, last_order): the matched days_cumsum becomes
# dayMax.
DayMax <- table1[
  table2,
  on = c("user_id", last_order = "order_number"),
  nomatch = 0
]
setnames(DayMax, old = "days_cumsum", new = "dayMax")

# Bring dayMin and dayMax together on the full product key.
res <- DayMin[
  DayMax,
  on = c("user_id", "product_id", "first_order", "last_order")
]

# Add the difference first, then drop the two helper columns.
res[, diff := dayMax - dayMin]
res[, c("dayMax", "dayMin") := NULL]
res[]
#>     user_id product_id first_order last_order diff
#>  1:       1          2           3          3    0
#>  2:       1         14           1          4   26
#>  3:       1         19           2          4   19
#>  4:       1         38           1          5   34
#>  5:       1         71           2          5   27
#>  6:       1         24           2          7   46
#>  7:       1         30           4          7   27
#>  8:       1         40           4          8   33
#>  9:       1         66           5          8   25
#> 10:       1         98           4          9   40
#> 11:       2          7           2          4   13
#> 12:       2         16           3          5   12

“链式”(管道)版本,无需重命名

# One chained expression: join on first_order, join again on last_order (the
# second lookup arrives as i.days_cumsum because days_cumsum already exists),
# then add the difference and drop both lookup columns.
table1[
  table2,
  on = c("user_id", first_order = "order_number"),
  nomatch = 0
][
  table2,
  on = c("user_id", last_order = "order_number"),
  nomatch = 0
][
  ,
  c("diff", "days_cumsum", "i.days_cumsum") :=
    list(i.days_cumsum - days_cumsum, NULL, NULL)
][]
#>     user_id product_id first_order last_order diff
#>  1:       1          2           3          3    0
#>  2:       1         14           1          4   26
#>  3:       1         19           2          4   19
#>  4:       1         38           1          5   34
#>  5:       1         71           2          5   27
#>  6:       1         24           2          7   46
#>  7:       1         30           4          7   27
#>  8:       1         40           4          8   33
#>  9:       1         66           5          8   25
#> 10:       1         98           4          9   40
#> 11:       2          7           2          4   13
#> 12:       2         16           3          5   12

使用数据重塑(melt/dcast),只需进行一次合并

# Reshape table1 to long form (one row per product and order endpoint) so a
# single inner join against table2 looks up the days for both endpoints.
# id.vars / measure.vars are spelled out by name rather than relying on the
# partially matched `id` argument and positional column indices.
tab <- melt(
  table1,
  id.vars = c("user_id", "product_id"),
  measure.vars = c("first_order", "last_order"),
  value.name = "order_number"
)[table2, on = .(user_id, order_number), nomatch = 0]

# Back to wide form: one order_number and one days_cumsum column per endpoint.
res <- dcast(
  tab,
  user_id + product_id ~ variable,
  value.var = c("order_number", "days_cumsum"),
  sep = "#"
)
setnames(
  res,
  c("user_id", "product_id", "first_order", "last_order", "dayMin", "dayMax")
)

# Add the day difference and drop the two helper columns.
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res
#>     user_id product_id first_order last_order diff
#>  1:       1          2           3          3    0
#>  2:       1         14           1          4   26
#>  3:       1         19           2          4   19
#>  4:       1         24           2          7   46
#>  5:       1         30           4          7   27
#>  6:       1         38           1          5   34
#>  7:       1         40           4          8   33
#>  8:       1         66           5          8   25
#>  9:       1         71           2          5   27
#> 10:       1         98           4          9   40
#> 11:       2          7           2          4   13
#> 12:       2         16           3          5   12