我有以下数据框
user_id orderdate cart
8 2012-01-01 produce
8 2012-01-31 produce
8 2012-03-01 produce
8 2012-03-11 produce
10 2012-01-01 produce
10 2012-01-31 produce
10 2012-02-12 meat seafood
10 2012-02-26 deli
17 2012-01-01 beverages
17 2012-01-04 beverages
并希望将其格式化为
user_id orderdate cart to ord
8 2012-01-01 produce 2012-01-30 ord1
8 2012-01-31 produce 2012-02-29 ord2
8 2012-03-01 produce 2012-03-10 ord3
8 2012-03-11 produce 2012-03-31 ord4
8 2012-04-01 nopurch 2012-04-01 ord5
10 2012-01-01 produce 2012-01-30 ord1
10 2012-01-31 produce 2012-02-11 ord2
10 2012-02-12 meat seafood 2012-02-25 ord3
10 2012-02-26 deli 2012-03-24 ord4
10 2012-03-25 nopurch 2012-04-01 ord5
17 2012-01-01 beverages 2012-01-03 ord1
17 2012-01-04 beverages 2012-01-08 ord2
17 2012-01-09 beverages 2012-01-13 ord3
17 2012-01-14 beverages 2012-01-19 ord4
17 2012-01-20 nopurch 2012-04-01 ord5
我发现以下代码可以实现此操作,但由于我的数据框非常大(140000 行),运行一次需要一个多小时。有没有办法修改它,使其运行得更快?
# Build a per-user order timeline in df.new: for every user, append a synthetic
# 'nopurch' row after the last order, derive an end date (`to`) for each row,
# and label the rows ord1..ordN.
# NOTE(review): assumes df$orderdate is a Date column, so that +1 / -1 mean
# "one day" -- confirm against how df is loaded.
# Global cap for all dates: the latest orderdate in df plus 1.
max.date <- max(df$orderdate)+1
ids <- unique(df$user_id)
# Result accumulator; extended with rbind() once per user at the end of the loop body.
df.new <- data.frame()
for (i in 1:length(ids)) {
# Keep only the rows of the current user.
df.cache <- df %>%
filter(user_id==ids[i])
# Offset for the synthetic row: 30 when the user has a single order, otherwise
# 1.5 * (last orderdate - first orderdate) / (number of orders - 1), rounded to
# 0 decimals. ifelse() is used only for the side effect of assigning av.dur.
ifelse(nrow(df.cache)==1,
av.dur <- 30,
av.dur <- round(((max(df.cache$orderdate) - min(df.cache$orderdate))/(nrow(df.cache)-1))*1.5, 0))
# Append the synthetic 'nopurch' row, dated av.dur after the user's last order.
df.cache <- rbind(df.cache, data.frame(user_id=df.cache$user_id[nrow(df.cache)],
orderdate=max(df.cache$orderdate)+av.dur,
cart='nopurch'))
# If the user's latest orderdate now exceeds max.date, overwrite that entry
# with max.date; otherwise do nothing (the NA branch is a no-op).
ifelse(max(df.cache$orderdate) > max.date,
df.cache$orderdate[which.max(df.cache$orderdate)] <- max.date,
NA)
# `to` of each row is the next row's orderdate minus 1; the last row gets max.date.
df.cache$to <- c(df.cache$orderdate[2:nrow(df.cache)]-1, max.date)
# Sequential label per row (ord1, ord2, ...), used for the Sankey diagram.
df.cache <- df.cache %>%
mutate(ord = paste('ord', c(1:nrow(df.cache)), sep=''))
# Append this user's rows to the accumulated result.
# NOTE(review): growing df.new with rbind() on every iteration re-copies the
# accumulated rows each time -- a likely contributor to the long run time.
df.new <- rbind(df.new, df.cache)
}
答案 0 :(得分:0)
我用一些简单的 dplyr 操作实现了您需要的大部分功能。必须承认,我没能完全理解最后一个订单及其之后日期的计算逻辑,所以这里只用了一个占位行(最后一次订单日期加一天)来演示基本做法。不过,这种方式应该比您的 for 循环方法高效得多:
library(tidyverse)
library(lubridate)
# Recreate the sample orders from inline text; columns are kept as plain
# character/integer vectors rather than factors.
df <- read.table(
  text = " user_id orderdate cart
8 2012-01-01 produce
8 2012-01-31 produce
8 2012-03-01 produce
8 2012-03-11 produce
10 2012-01-01 produce
10 2012-01-31 produce
10 2012-02-12 meat_seafood
10 2012-02-26 deli
17 2012-01-01 beverages
17 2012-01-04 beverages",
  stringsAsFactors = FALSE,
  header = TRUE
)

# Parse the order dates so that date arithmetic works below.
df[["orderdate"]] <- as.Date(df[["orderdate"]])

# Per user: append a placeholder "nopurch" row dated one day after the last
# order, sort chronologically, then derive the interval end (`to`) and a
# running order label (`ord`). The result stays grouped by user_id.
df <- df %>%
  group_by(user_id) %>%
  do(
    add_row(
      .,
      user_id = .$user_id[1],
      orderdate = max(.$orderdate) + days(1),
      cart = "nopurch"
    )
  ) %>%
  arrange(user_id, orderdate) %>%
  mutate(
    # Day before the user's next row; NA on each user's final row.
    to = lead(orderdate - days(1)),
    ord = paste0("ord", row_number())
  )
输出:
> df
# A tibble: 13 x 5
# Groups: user_id [3]
user_id orderdate cart to ord
<int> <date> <chr> <date> <chr>
1 8 2012-01-01 produce 2012-01-30 ord1
2 8 2012-01-31 produce 2012-02-29 ord2
3 8 2012-03-01 produce 2012-03-10 ord3
4 8 2012-03-11 produce 2012-03-11 ord4
5 8 2012-03-12 nopurch NA ord5
6 10 2012-01-01 produce 2012-01-30 ord1
7 10 2012-01-31 produce 2012-02-11 ord2
8 10 2012-02-12 meat_seafood 2012-02-25 ord3
9 10 2012-02-26 deli 2012-02-26 ord4
10 10 2012-02-27 nopurch NA ord5
11 17 2012-01-01 beverages 2012-01-03 ord1
12 17 2012-01-04 beverages 2012-01-04 ord2
13 17 2012-01-05 nopurch NA ord3