I have this piece of code, where I'm looping through 250,000 items. Here are the steps within the code: 1. Subset the data for a given product. 2. Merge (left) the data with a month dataframe. 3. Replace the null product names with that particular product name. 4. Replace NA values for sales with 0.
Here is a sample data set for two products. Data:
# Sample data: two products with sales recorded in only some months.
data2 <- data.frame(product_no = c("A", "A", "A", "B", "B", "B"),
sales = c(200, 130, 221, 310, 109, 98), month = c(1, 4, 5, 8, 1, 12), stringsAsFactors = FALSE)
# Lookup table of all 12 months. Name the column at construction time instead
# of renaming it afterwards by matching the auto-generated column name string
# "seq(1, 12, by = 1)", which is brittle (it breaks if the expression changes).
month_unique <- data.frame(month = seq(1, 12, by = 1))
Code:
# For each product: complete its sales to all 12 months, filling gaps with 0.
# Collect each product's completed frame in a preallocated list and bind once
# at the end -- rbind()-ing onto data3 inside the loop copies the accumulated
# result every iteration (O(n^2) with 250,000 products).
unique_product <- unique(data2$product_no)
pieces <- vector("list", length(unique_product))
process_time <- Sys.time()
# seq_along() is safe when unique_product is empty (1:length(x) would yield 1:0).
for (i in seq_along(unique_product)) {
  # 1. Subset the rows for this product.
  step1 <- subset(data2, product_no == unique_product[i])
  # 2. Left-join onto the full month table; missing months come back as NA.
  step2 <- merge(month_unique, step1, by = "month", all.x = TRUE)
  # 3. Fill the product name on the padded rows.
  step2$product_no <- unique_product[i]
  # 4. Replace remaining NAs (the sales of padded months) with 0.
  step2[is.na(step2)] <- 0
  pieces[[i]] <- step2
}
data3 <- do.call(rbind, pieces)
Sys.time() - process_time
Expected Result:
data3
Is there a faster way to do this?
Thank you.
Answer 0 (score: 6)
You can do this with expand.grid
to create all combinations of month and product_no
, then replace the NAs with 0.
library(tidyr)

# Every (month, product_no) pair -- including months with no sales rows.
combinations <- expand.grid(
  month = seq_len(12),
  product_no = unique(data2$product_no),
  stringsAsFactors = FALSE
)

# Left-join the observed sales onto the full grid; unmatched rows get NA sales.
result <- merge(combinations, data2, all.x = TRUE)

# Turn those NAs into explicit zeros.
result <- replace_na(result, list(sales = 0))
Note that I'm using the replace_na
function from the tidyr package, but you could also just do
result$sales[is.na(result$sales)] <- 0
You can also use the left_join
function in dplyr, which is often faster than merge
. In dplyr functions are often (though not necessarily) chained together with %>%
:
library(dplyr)

# Full grid -> join observed sales -> fill missing sales with 0.
# Spell out the join keys rather than relying on dplyr's natural join: an
# implicit join would silently pick up any additional shared columns added
# later, and it prints a "Joining, by = ..." message on every call.
result <- combinations %>%
  left_join(data2, by = c("product_no", "month")) %>%
  replace_na(list(sales = 0))
Answer 1 (score: 1)
Just because I was curious, and because you said you have a lot of products to iterate over, I ran this with the for loop, with lapply, with David's code, and then in parallel (on 4 cores). Here is what I came up with:
> library(dplyr)
> library(tidyr)
> library(parallel)
>
> data2 <- data.frame(productId = c("A", "A", "A", "B","B","B"),
+ sales = c(200, 130, 221, 310,109, 98),
+ month = c(1, 4, 5, 8,1, 12),
+ stringsAsFactors=FALSE)
> data2 <- do.call("rbind", lapply(1:1000, function(i) data2))
> data2$productId <- rep(1:2000, each = 3)
>
> month_unique <- as.data.frame(seq(1,12, by=1))
> colnames(month_unique)[colnames(month_unique)=="seq(1, 12, by = 1)"] <- "month"
>
>
> #* For running the original code
> unique_product <- unique(data2$productId)
> data3 <- data.frame()
>
>
> system.time({
+ for (i in 1:length(unique_product)){
+ step1 <- subset(data2, productId==unique_product[i])
+ step2 <- merge(month_unique,step1, by="month", all.x = TRUE)
+ step2$productId <- unique_product[i]
+ step2[is.na(step2)] <- 0
+ data3 <- rbind(data3, step2)
+ }
+ })
user system elapsed
4.79 0.01 4.81
>
>
> #* Function that is equivalent to the for loop
> dataFn <- function(up, data2, month_unique){
+ step1 <- subset(data2, productId==up)
+ step2 <- merge(month_unique,step1, by="month", all.x = TRUE)
+ step2$productId <- up
+ step2[is.na(step2)] <- 0
+ step2
+ }
>
> system.time({
+ data3 <- do.call("rbind",
+ lapply(unique_product, dataFn, data2, month_unique))
+ })
user system elapsed
2.1 0.0 2.1
>
> #David's code
> system.time({
+ combinations <- expand.grid(month = 1:12,
+ productId = unique(data2$productId),
+ stringsAsFactors = FALSE)
+ result <- left_join(combinations, data2,
+ by = c("month" = "month",
+ "productId" = "productId"))
+ result <- replace_na(result, list(sales = 0))
+ })
user system elapsed
0 0 0
>
> # run in parallel
> system.time({
+ cl <- makeCluster(4)
+ clusterExport(cl, "dataFn")
+ clusterExport(cl, "data2")
+ clusterExport(cl, "month_unique")
+ data3_parallel <-
+ do.call("rbind",
+ parLapply(cl, unique_product, dataFn, data2, month_unique))
+ stopCluster(cl)
+ })
user system elapsed
0.27 0.03 1.99
>
So using an apply-family function does seem to improve speed; parallelization doesn't seem to help here (perhaps the per-task size is too small?), and David's code is lightning fast.