数据:
# Sample order data: one row per ordered item.
# orderDate is given as day.month.year text.
DB <- data.frame(
  orderID = c(1, 2, 3, 4, 4, 5, 6, 6, 7, 8),
  orderDate = c(
    "1.1.12", "1.1.12", "1.1.12", "13.1.12", "13.1.12",
    "12.1.12", "10.1.12", "10.1.12", "21.1.12", "24.1.12"
  ),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  itemPrice = c(
    9.99, 14.99, 9.99, 19.99, 29.99,
    4.99, 9.99, 14.99, 49.99, 19.99
  )
)
预期结果:
# Desired output: the order data plus, on every row, the date of the
# first order placed by that row's customer.
DB <- data.frame(
  orderID = c(1, 2, 3, 4, 4, 5, 6, 6, 7, 8),
  orderDate = c(
    "1.1.12", "2.1.12", "3.1.12", "13.1.12", "13.1.12",
    "12.1.12", "10.1.12", "10.1.12", "21.1.12", "24.1.12"
  ),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  itemPrice = c(
    9.99, 14.99, 9.99, 19.99, 29.99,
    4.99, 9.99, 14.99, 49.99, 19.99
  ),
  DateOfFirstOrderofCustomer = c(
    "1.1.12", "2.1.12", "3.1.12", "1.1.12", "1.1.12",
    "3.1.12", "2.1.12", "2.1.12", "1.1.12", "1.1.12"
  )
)
说明:
orderID 是连续的。同一 customerID 在同一天订购的产品获得相同的 orderID。当同一客户在另一天订购产品时,他/她会获得一个新的 orderID。
我想为每一行/条目添加一个额外的列,其中包含该客户第一个订单的日期(例如,客户 1(customerID 为 1)在 1.1.12 下了他的第一个订单,所以该客户的所有订单都填入这个日期)。我们怎样才能做到这一点?
原始数据大约有 500k 行:所以请提供性能开销尽可能小的解决方案。
答案 0 :(得分:0)
我更改了您的输入数据,因为在您提供的数据中,每位客户的首次购买日期都相同,因此无法判断代码是否有效。该示例使用 dplyr;您也可以使用 tapply,但那样需要对返回的命名向量重新整理格式。
library(dplyr)

# Dummy data: each customer's earliest order falls on a different date,
# so the computed column can be checked by eye.
DB <- data.frame(
  orderID = c(1, 2, 3, 4, 4, 5, 6, 6, 7, 8),
  orderDate = c(
    "1.1.12", "4.1.12", "6.1.12", "13.1.12", "13.1.12",
    "12.1.12", "10.1.12", "10.1.12", "21.1.12", "24.1.12"
  ),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  itemPrice = c(
    9.99, 14.99, 9.99, 19.99, 29.99,
    4.99, 9.99, 14.99, 49.99, 19.99
  )
)
# -------------------------------------------
# Parse the day.month.year text into Date objects so that min() compares
# the dates chronologically rather than as strings.
DB$orderDate <- as.Date(DB$orderDate, format = "%d.%m.%y")
# -------------------------------------------
# For every row, attach the earliest order date of that row's customer.
# ungroup() removes the grouping again; without it DB stays grouped by
# customerID and later dplyr verbs would silently operate per customer.
DB <- DB %>%
  group_by(customerID) %>%
  mutate(DateOfFirstOrderofCustomer = min(orderDate)) %>%
  ungroup()
答案 1 :(得分:0)
只使用基本R函数:
# Convert the date column from day.month.year text to Date.
# The pattern is passed through the named `format` argument of as.Date()
# rather than relying on positional matching.
DB$orderDate <- as.Date(DB$orderDate, format = "%d.%m.%y")
# For each row, get the earliest order date among all rows that share
# the row's customerID; ave() returns a vector as long as orderDate.
DB$DateFirstOrder <- with(DB, ave(orderDate, customerID, FUN = min))
结果是(使用Mike Spencer的数据):
> DB
orderID orderDate itemID customerID itemPrice DateFirstOrder
1 1 2012-01-01 2 1 9.99 2012-01-01
2 2 2012-01-04 3 2 14.99 2012-01-04
3 3 2012-01-06 2 3 9.99 2012-01-06
4 4 2012-01-13 5 1 19.99 2012-01-01
5 4 2012-01-13 12 1 29.99 2012-01-01
6 5 2012-01-12 4 3 4.99 2012-01-06
7 6 2012-01-10 2 2 9.99 2012-01-04
8 6 2012-01-10 3 2 14.99 2012-01-04
9 7 2012-01-21 1 1 49.99 2012-01-01
10 8 2012-01-24 5 1 19.99 2012-01-01
对于最快的解决方案,我建议使用 data.table 包。要使用此包获得所需的结果,您需要执行以下操作:
library(data.table)
# setDT() turns DB into a data.table by reference and `:=` adds/overwrites
# columns in place, so DB does not need to be reassigned.
# Step 1: parse orderDate (pattern passed via the named `format` argument).
# Step 2: per customerID, store the earliest order date on every row.
setDT(DB)[, orderDate := as.Date(orderDate, format = "%d.%m.%y")
          ][, DateFirstOrder := min(orderDate), by = customerID]
答案 2 :(得分:0)
我使用的是 plyr 包。其余一切都一样。
# Install plyr only when it is missing instead of reinstalling it on
# every run of the script.
if (!requireNamespace("plyr", quietly = TRUE)) {
  install.packages("plyr")
}
library(plyr)

DB <- data.frame(
  orderID = c(1, 2, 3, 4, 4, 5, 6, 6, 7, 8),
  orderDate = c(
    "1.1.12", "4.1.12", "6.1.12", "13.1.12", "13.1.12",
    "12.1.12", "10.1.12", "10.1.12", "21.1.12", "24.1.12"
  ),
  itemID = c(2, 3, 2, 5, 12, 4, 2, 3, 1, 5),
  customerID = c(1, 2, 3, 1, 1, 3, 2, 2, 1, 1),
  itemPrice = c(
    9.99, 14.99, 9.99, 19.99, 29.99,
    4.99, 9.99, 14.99, 49.99, 19.99
  )
)
# Parse the day.month.year text into Date objects so min() works on dates.
DB$orderDate <- as.Date(DB$orderDate, format = "%d.%m.%y")
# Split DB by customerID, add each customer's earliest order date to all of
# that customer's rows, and recombine the pieces.
# NOTE: ddply() returns the rows ordered by customerID, not in the
# original row order.
DB <- ddply(
  DB, .(customerID), mutate,
  DateOfFirstOrderofCustomer = min(orderDate)
)