我有两个数据集:
competitor_data - 包含特定产品的竞争对手,以及采集到的竞争对手价格和采集日期。
product_price - 每个产品每次价格变动的日期及变动后的价格。
# Competitor observations: one row per crawl of a competitor's price for a
# product. `crawl_date` is stored as a Date.
competitor_data <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  crawl_date = as.Date(c(
    "2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22", "2014-06-05",
    "2014-06-22", "2014-05-08", "2014-06-17", "2014-06-09", "2014-06-14",
    "2014-07-01", "2014-08-04"
  )),
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)
#
# Our own price history: one row per price change. `date` is stored as a Date.
product_price <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  date = as.Date(c(
    "2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31", "2014-05-03",
    "2014-02-22", "2014-05-21", "2014-06-19", "2014-03-09", "2014-06-22",
    "2014-07-03", "2014-09-08"
  )),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)
目标:对每一次价格变动,找到该日期之前最近一次采集到的竞争对手价格(如果之前没有采集记录,则使用最早的一次采集),并标记我方价格是否不高于竞争对手价格。
我的脚本使用嵌套for循环,但处理5000个唯一的product_id需要24小时:
# For every price change in `product_price`, look up the competitor price that
# applied on that date and flag whether our price was at or below it.
#
# Lookup rule per (product, date):
#   * use the most recent crawl strictly before the price-change date;
#   * if no crawl precedes the date, fall back to the product's earliest crawl;
#   * if the product has no usable crawl at all, the competitor price is NA.
#
# Results: `mid_step_data` holds every matched row; `all_competitive_data`
# holds the same rows with exact duplicates removed.
unique_skus <- unique(product_price$productId)
start_time <- Sys.time()

# One small data frame per (product, date), bound together once after the
# loops. Growing a data frame with rbind() inside a loop copies it on every
# iteration, and the accumulator must not carry rows over between products.
matched <- list()
for (i in seq_along(unique_skus)) {
  sku <- unique_skus[i]
  step1 <- product_price[which(product_price$productId == sku), ,
                         drop = FALSE]
  crawls <- competitor_data[which(competitor_data$productId == sku), ,
                            drop = FALSE]
  # Plain numbers so which.min()/which.max() apply directly to the dates.
  crawl_day <- as.numeric(crawls$crawl_date)
  transact_dates <- unique(step1$date[!is.na(step1$date)])

  for (a in seq_along(transact_dates)) {
    step2 <- step1[which(step1$date == transact_dates[a]), , drop = FALSE]
    earlier <- which(crawl_day < as.numeric(transact_dates[a]))
    # Row of `crawls` to take the price from; empty when nothing is usable.
    pick <- if (length(earlier) > 0L) {
      earlier[which.max(crawl_day[earlier])]
    } else {
      which.min(crawl_day)
    }
    step2$competitor_price <- if (length(pick) == 1L) {
      crawls$competitor_price[pick]
    } else {
      NA_real_
    }
    step2$price_leader <- ifelse(step2$price <= step2$competitor_price, 1, 0)
    matched[[length(matched) + 1L]] <- step2
  }
}
Sys.time() - start_time

mid_step_data <- if (length(matched) > 0L) {
  do.call(rbind, matched)
} else {
  data.frame()
}
all_competitive_data <- unique(mid_step_data)
有没有办法快速使用dplyr来实现这个目标?
答案 0 :(得分:3)
# Sample data, identical to the question's: competitor crawls per product.
competitor_data <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  crawl_date = c(
    "2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22",
    "2014-06-05", "2014-06-22", "2014-05-08", "2014-06-17",
    "2014-06-09", "2014-06-14", "2014-07-01", "2014-08-04"
  ),
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)
# Parse the ISO date strings into Date values.
competitor_data[["crawl_date"]] <- as.Date(competitor_data[["crawl_date"]])
#
# Sample data, identical to the question's: our price changes per product.
product_price <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  date = c(
    "2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31",
    "2014-05-03", "2014-02-22", "2014-05-21", "2014-06-19",
    "2014-03-09", "2014-06-22", "2014-07-03", "2014-09-08"
  ),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)
product_price[["date"]] <- as.Date(product_price[["date"]])
使用下面这个函数对向量中的NA先向前、再向后进行填充
## fill in NAs
# Fill NAs in a vector: carry the last non-NA value forward, then fill any
# remaining leading NAs backward from the first non-NA value.
#
# Arguments:
#   ...   values to fill (individual values or a single vector).
#   lead  values prepended before filling and compensated for by dropping the
#         same number of elements from the END of the result. With the default
#         single NA, each element therefore receives the filled value of the
#         element before it (a lag of one); the first element receives the
#         first non-NA value. Use `lead = NULL` for a plain fill with no shift.
#
# Returns a vector as long as the input values. An all-NA input comes back as
# all NA instead of being shortened, so per-group results always fit back
# into the column they came from.
f <- function(..., lead = NA) {
  # f(NA, 1, NA, 2, NA, NA, lead = NULL)
  x <- c(lead, c(...))

  # Last observation carried forward; positions before the first non-NA value
  # stay NA.
  carry_forward <- function(v) {
    last_seen <- cummax(replace(seq_along(v), is.na(v), 0L))
    v[replace(last_seen, last_seen == 0L, NA)]
  }

  filled <- rev(carry_forward(rev(carry_forward(x))))
  head(filled, if (is.null(lead)) length(x) else -length(lead))
}
按产品和日期合并两个数据集。我们在每个产品的竞争对手价格前面额外补一个NA,这样在填充NA时,每一行实际取到的是上一条记录(即该日期之前)的竞争对手价格
然后比较价格和竞争对手价格。最后一步只是做一些整理,以证明结果与原脚本的结果相同
# Full outer merge on (product, date): price-change dates and crawl dates end
# up in one table. merge() sorts on the key columns by default, so the rows of
# each product are in chronological order, which the fill below relies on.
dd <- merge(product_price, competitor_data,
            by.x = c("productId", "date"),
            by.y = c("productId", "crawl_date"), all = TRUE)
# Fill per product. ave() writes each group's result back into that group's
# own rows and always yields a plain vector. sapply() would simplify to a
# matrix whenever all products have the same number of rows, and the column
# assignment would then fail.
dd$competitor_price <- ave(dd$competitor_price, dd$productId, FUN = f)
dd$price_leader <- +(dd$price <= dd$competitor_price)
# Keep only real price-change rows (crawl-only rows have an NA price, hence an
# NA flag) and drop the competitor name by column name, not by position.
(res1 <- `rownames<-`(
  dd[!is.na(dd$price_leader), names(dd) != "competitor"], NULL
))
# productId date price competitor_price price_leader
# 1 banana 2014-02-22 2.09 2.50 1
# 2 banana 2014-05-03 2.04 2.35 1
# 3 banana 2014-05-05 2.12 2.35 1
# 4 banana 2014-06-22 2.31 2.22 0
# 5 banana 2014-07-05 2.29 2.52 1
# 6 banana 2014-08-31 2.01 2.52 1
# 7 fig 2014-03-09 5.21 5.32 1
# 8 fig 2014-05-21 5.22 5.32 1
# 9 fig 2014-06-19 5.36 5.56 1
# 10 fig 2014-06-22 5.91 5.56 0
# 11 fig 2014-07-03 5.36 5.86 1
# 12 fig 2014-09-08 5.56 5.96 1
# Put the loop-based result in the same (product, date) order as `res1` and
# reset its row names so the two data frames can be compared directly.
res0 <- all_competitive_data[
  with(all_competitive_data, order(productId, date)),
]
rownames(res0) <- NULL
all.equal(res0, res1)
# [1] TRUE
您可以将任何这些步骤更改为dplyr或data.table语法;我没有使用任何一个,但它应该是直截了当的:
library("dplyr")

# Full join keeps both the price-change dates and the crawl dates; sorting by
# product and date puts each product's rows in chronological order for the
# fill below.
dd <- product_price %>%
  full_join(
    competitor_data,
    by = c("productId" = "productId", "date" = "crawl_date")
  ) %>%
  arrange(productId, date)

# Per product: fill the competitor price with f(), flag rows where our price
# is at or below it, then keep only real price-change rows (crawl-only rows
# have an NA flag) and drop the competitor name.
dd %>%
  group_by(productId) %>%
  mutate(
    competitor_price = f(competitor_price),
    price_leader = as.integer(price <= competitor_price)
  ) %>%
  filter(!is.na(price_leader)) %>%
  select(-competitor)
# Source: local data frame [12 x 5]
# Groups: productId [2]
#
# productId date price competitor_price price_leader
# <chr> <date> <dbl> <dbl> <int>
# 1 banana 2014-02-22 2.09 2.50 1
# 2 banana 2014-05-03 2.04 2.35 1
# 3 banana 2014-05-05 2.12 2.35 1
# 4 banana 2014-06-22 2.31 2.22 0
# 5 banana 2014-07-05 2.29 2.52 1
# 6 banana 2014-08-31 2.01 2.52 1
# 7 fig 2014-03-09 5.21 5.32 1
# 8 fig 2014-05-21 5.22 5.32 1
# 9 fig 2014-06-19 5.36 5.56 1
# 10 fig 2014-06-22 5.91 5.56 0
# 11 fig 2014-07-03 5.36 5.86 1
# 12 fig 2014-09-08 5.56 5.96 1
答案 1 :(得分:0)
以下解决方案使用dplyr join进行匹配。(注意:我将“crawl_date”更改为“date”,以便dplyr join自动选择匹配的列。也可以把类似
by=c('productId'='productId', 'date'='crawl_date')
的内容作为join的参数,来明确指定匹配的列。)
结果数据框是
# Competitor crawls. The date column is named `date` here (not `crawl_date`)
# so that it shares its name with the date column of `product_price`.
competitor_data <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  date = c(
    "2014-04-05", "2014-04-22", "2014-05-05", "2014-05-22",
    "2014-06-05", "2014-06-22", "2014-05-08", "2014-06-17",
    "2014-06-09", "2014-06-14", "2014-07-01", "2014-08-04"
  ),
  # "gamespot" had a stray markdown placeholder pasted into the middle of it;
  # restored to match the other copies of this data set.
  competitor = c(
    "amazon", "apple", "google", "facebook", "alibaba", "tencent",
    "ebay", "bestbuy", "gamespot", "louis vuitton", "gucci", "tesla"
  ),
  competitor_price = c(
    2.5, 2.35, 1.99, 2.01, 2.22, 2.52,
    5.32, 5.56, 5.01, 6.01, 5.86, 5.96
  ),
  stringsAsFactors = FALSE
)
competitor_data$date <- as.Date(competitor_data$date)
# Our own price changes, one row per change.
product_price <- data.frame(
  productId = rep(c("banana", "fig"), each = 6),
  date = c(
    "2014-05-05", "2014-06-22", "2014-07-05", "2014-08-31",
    "2014-05-03", "2014-02-22", "2014-05-21", "2014-06-19",
    "2014-03-09", "2014-06-22", "2014-07-03", "2014-09-08"
  ),
  price = c(
    2.12, 2.31, 2.29, 2.01, 2.04, 2.09,
    5.22, 5.36, 5.21, 5.91, 5.36, 5.56
  ),
  stringsAsFactors = FALSE
)
product_price$date <- as.Date(product_price$date)
# library() stops with an error if dplyr is not installed; require() would
# only return FALSE and let the script fail later at the join.
library(dplyr)
# Match each price change to a crawl of the same product on the same date.
# The keys are spelled out rather than inferred from the shared column names.
# A left join keeps every price change; rows with no crawl on that exact date
# get NA in the competitor columns.
joined <- product_price %>%
  left_join(competitor_data, by = c("productId", "date"))
# 1 when our price is at or below the competitor's, 0 otherwise, NA when there
# was no matching crawl.
joined$leader <- as.integer(joined$price <= joined$competitor_price)
joined