# Sample data: one row per (user, product) with the first and last order
# number in which that product appears.
table1 <- data.frame(
  user_id = rep(c(1, 2), times = c(10, 2)),
  product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
  first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
  last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5)
)

# Sample data: one row per (user, order) with the cumulative day count at
# that order. order_number is kept as double to match the other columns.
table2 <- data.frame(
  user_id = rep(c(1, 2), times = c(12, 6)),
  order_number = as.numeric(c(seq_len(12), seq_len(6))),
  days_cumsum = c(
    0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91,
    5, 11, 17, 24, 29, 35
  )
)
我想使用table2为table1添加一个新特征(feature)。这个新特征是每个用户的每个产品的订单间隔(天数)。
例如,让我们看一下table1。第一行有(user_id == 1),(product_id == 14),(first_order == 1)和(last_order == 4)。这意味着产品14在订单1、2、3、4中被订购。我们可以在table2中找到这些订单号。新特征是第一个订单和最后一个订单之间的间隔天数。我们可以使用table2中的 "days_cumsum",它是自先前订单以来的累计天数。第一行的新特征值是26(= 26 - 0)。
我认为可以通过连接(join)完成,但是我无法使用连接,因为每个表实际上非常大。
所以我在for循环中使用下面这个函数:
#' Days between the first and last order for row `i` of `table1`
#'
#' Looks up, in the global `table2`, the rows belonging to the user of
#' `table1` row `i` whose `order_number` equals that row's `first_order` or
#' `last_order`, and returns the difference of their `days_cumsum` values
#' (last minus first).
#'
#' Columns are accessed by name rather than by position so the function works
#' the same whether the tables are data.frames, tibbles or data.tables.
#'
#' @param i Row index into the global `table1`.
#' @return A numeric scalar: the day difference, or the sentinel `999999`
#'   when exactly two matching rows are not found (e.g. when
#'   `first_order == last_order`, or when an order number is missing).
f <- function(i) {
  first <- table1$first_order[i]
  last <- table1$last_order[i]

  # which() drops NA comparisons, so an NA key simply yields no match.
  idx <- which(
    table2$user_id == table1$user_id[i] &
      table2$order_number %in% c(first, last)
  )

  # Plain if/else: the condition is a scalar, so ifelse() is unnecessary.
  if (length(idx) != 2L) {
    return(999999) # first_order==last_order
  }

  # Order the two hits by order_number so the result does not depend on the
  # physical row order of table2.
  idx <- idx[order(table2$order_number[idx])]
  table2$days_cumsum[idx[2L]] - table2$days_cumsum[idx[1L]]
}
它逐行计算每个新的特征值,但非常慢,而且计算量很大。我经常遇到这类问题(使用两个表创建新特征),但每次都会像这样卡住。
有更好的代码吗?我在等你的帮忙。
答案 0 :(得分:3)
您能分享一下循环版本与 join 版本的运行时间/计算时间比较吗?
以下是使用连接的解决方案。
library(tidyverse)
# Work on plain data.frame versions of the two inputs.
df1 <- as.data.frame(table1)
df2 <- as.data.frame(table2)

# Look up days_cumsum twice in df2: once keyed on first_order (dayMin) and
# once keyed on last_order (dayMax). Renaming the lookup column up front
# keeps the two joined columns from clashing.
df1 %>%
  left_join(
    rename(df2, dayMin = days_cumsum),
    by = c("user_id", "first_order" = "order_number")
  ) %>%
  left_join(
    rename(df2, dayMax = days_cumsum),
    by = c("user_id", "last_order" = "order_number")
  ) %>%
  # Interval in days between the first and the last order of the product.
  mutate(newVar = dayMax - dayMin) %>%
  select(user_id, product_id, first_order, last_order, newVar)
给出:
user_id product_id first_order last_order newVar
<dbl> <dbl> <dbl> <dbl> <dbl>
1 1 14 1 4 26
2 1 24 2 7 46
3 1 38 1 5 34
4 1 40 4 8 33
5 1 66 5 8 25
6 1 2 3 3 0
7 1 19 2 4 19
8 1 30 4 7 27
9 1 71 2 5 27
10 1 98 4 9 40
11 2 7 2 4 13
12 2 16 3 5 12
答案 1 :(得分:2)
作为比较,下面是使用 data.table 的一些解决方案。
# Sample data: one row per (user, product) with the first and last order
# number in which that product appears.
table1 <- data.frame(
  user_id = rep(c(1, 2), times = c(10, 2)),
  product_id = c(14, 24, 38, 40, 66, 2, 19, 30, 71, 98, 7, 16),
  first_order = c(1, 2, 1, 4, 5, 3, 2, 4, 2, 4, 2, 3),
  last_order = c(4, 7, 5, 8, 8, 3, 4, 7, 5, 9, 4, 5)
)

# Sample data: one row per (user, order) with the cumulative day count at
# that order. order_number is kept as double to match the other columns.
table2 <- data.frame(
  user_id = rep(c(1, 2), times = c(12, 6)),
  order_number = as.numeric(c(seq_len(12), seq_len(6))),
  days_cumsum = c(
    0, 7, 15, 26, 34, 43, 53, 59, 66, 74, 82, 91,
    5, 11, 17, 24, 29, 35
  )
)
library(data.table)
# setDT() converts each data.frame to a data.table in place (by reference,
# without copying), so no reassignment is needed.
setDT(table1)
setDT(table2)
# Auto-print table1 to inspect it after the conversion.
table1
#> user_id product_id first_order last_order
#> 1: 1 14 1 4
#> 2: 1 24 2 7
#> 3: 1 38 1 5
#> 4: 1 40 4 8
#> 5: 1 66 5 8
#> 6: 1 2 3 3
#> 7: 1 19 2 4
#> 8: 1 30 4 7
#> 9: 1 71 2 5
#> 10: 1 98 4 9
#> 11: 2 7 2 4
#> 12: 2 16 3 5
# Auto-print table2 to inspect the per-user order history.
table2
#> user_id order_number days_cumsum
#> 1: 1 1 0
#> 2: 1 2 7
#> 3: 1 3 15
#> 4: 1 4 26
#> 5: 1 5 34
#> 6: 1 6 43
#> 7: 1 7 53
#> 8: 1 8 59
#> 9: 1 9 66
#> 10: 1 10 74
#> 11: 1 11 82
#> 12: 1 12 91
#> 13: 2 1 5
#> 14: 2 2 11
#> 15: 2 3 17
#> 16: 2 4 24
#> 17: 2 5 29
#> 18: 2 6 35
# Inner join (nomatch = 0) of table1 with table2 on the first order: each
# table1 row picks up the days_cumsum of its first_order, renamed to dayMin.
DayMin <- table1[
  table2,
  on = c("user_id", first_order = "order_number"),
  nomatch = 0
]
setnames(DayMin, old = "days_cumsum", new = "dayMin")

# Same lookup keyed on the last order, renamed to dayMax.
DayMax <- table1[
  table2,
  on = c("user_id", last_order = "order_number"),
  nomatch = 0
]
setnames(DayMax, old = "days_cumsum", new = "dayMax")

# Bring dayMin and dayMax together on the full table1 key.
res <- DayMin[
  DayMax,
  on = c("user_id", "product_id", "first_order", "last_order")
]

# calculate diff and delete column
res[, diff := dayMax - dayMin]
res[, c("dayMax", "dayMin") := NULL]
res[]
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 38 1 5 34
#> 5: 1 71 2 5 27
#> 6: 1 24 2 7 46
#> 7: 1 30 4 7 27
#> 8: 1 40 4 8 33
#> 9: 1 66 5 8 25
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
“链式”版本,无需重命名
# Chained form of the same computation: join on first_order, then join the
# result on last_order. After the second join the first lookup is in
# days_cumsum and the second one (from table2 as `i`) is in i.days_cumsum.
table1[
  table2, on = .(user_id, first_order = order_number), nomatch = 0
][
  table2, on = .(user_id, last_order = order_number), nomatch = 0
][
  ,
  # Add diff and drop both helper columns in a single assignment.
  c("diff", "days_cumsum", "i.days_cumsum") :=
    list(i.days_cumsum - days_cumsum, NULL, NULL)
][]
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 38 1 5 34
#> 5: 1 71 2 5 27
#> 6: 1 24 2 7 46
#> 7: 1 30 4 7 27
#> 8: 1 40 4 8 33
#> 9: 1 66 5 8 25
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12
使用重塑(melt/dcast),只进行一次合并
# Reshape table1 to long form (one row per user/product/order endpoint) so a
# single inner join against table2 fetches days_cumsum for both endpoints.
# id.vars is spelled out in full and by name: the abbreviated `id =` only
# works through partial argument matching, and positional columns break if
# table1's column order changes.
tab <- melt(
  table1,
  id.vars = c("user_id", "product_id"),
  value.name = "order_number"
)[table2, on = .(user_id, order_number), nomatch = 0]

# Back to wide form: one order_number and one days_cumsum column per
# endpoint (first_order / last_order).
res <- dcast(
  tab,
  user_id + product_id ~ variable,
  value.var = c("order_number", "days_cumsum"),
  sep = "#"
)

# Positional rename of all six columns; this relies on dcast() emitting the
# order_number pair before the days_cumsum pair, in value.var order.
setnames(
  res,
  c("user_id", "product_id", "first_order", "last_order", "dayMin", "dayMax")
)

# Compute the interval and drop the two helper columns.
res[, c("diff", "dayMax", "dayMin") := list(dayMax - dayMin, NULL, NULL)]
res
#> user_id product_id first_order last_order diff
#> 1: 1 2 3 3 0
#> 2: 1 14 1 4 26
#> 3: 1 19 2 4 19
#> 4: 1 24 2 7 46
#> 5: 1 30 4 7 27
#> 6: 1 38 1 5 34
#> 7: 1 40 4 8 33
#> 8: 1 66 5 8 25
#> 9: 1 71 2 5 27
#> 10: 1 98 4 9 40
#> 11: 2 7 2 4 13
#> 12: 2 16 3 5 12