请参见下面的示例数据。
我想将季度销售数据(带有开始日期和结束日期)转换为每月销售数据。
例如:
是否有一种有效的执行方法,实际数据是一个csv文件,数据大小为100K x 15,将被拆分为大约300K x 15的新数据集以进行每月分析。
样本问题数据中的一些关键特征包括:
示例问题:
Customer.ID Country Type Sale Start..Date End.Date Days
1 1 US Commercial 91 7/06/2017 5/09/2017 91
2 1 US Commerical 92 6/09/2017 6/12/2017 92
3 2 US Casual 25 10/07/2017 3/08/2017 25
4 3 UK Commercial 64 7/06/2017 9/08/2017 64
示例答案:
Customer.ID Country Type Sale Start.Date End.Date Days
1 1 US Commercial 24 7/06/2017 30/06/2017 24
2 1 US Commercial 31 1/07/2017 31/07/2017 31
3 1 US Commercial 31 1/08/2017 31/08/2017 31
4 1 US Commercial 30 1/09/2017 30/09/2017 30
5 1 US Commercial 31 1/10/2017 31/10/2017 31
6 1 US Commercial 30 1/11/2017 30/11/2017 30
7 1 US Commercial 6 1/12/2017 6/12/2017 6
8 2 US Casual 22 10/07/2017 31/07/2017 22
9 2 US Casual 3 1/08/2017 3/08/2017 3
10 3 UK Commercial 24 7/06/2017 30/06/2017 24
11 3 UK Commercial 31 1/07/2017 31/07/2017 31
12 3 UK Commercial 9 1/08/2017 9/08/2017 9
答案 0 :(得分:0)
由于使用了多个函数和循环,因此不美观,因为它包含多个操作:
# Creating the dataset
library(tidyr)
customer <- c(1,1,2,3)
country <- c("US","US","US","UK")
type <- c("Commercial","Commercial","Casual","Commercial")
sale <- c(91,92,25,64)
Start <- as.Date(c("7/06/2017","6/09/2017","10/07/2017","7/06/2017"),"%d/%m/%Y")
Finish <- as.Date(c("5/09/2017","6/12/2017","3/08/2017","9/08/2017"),"%d/%m/%Y")
days <- c(91,92,25,64)
df <- data.frame(customer,country, type,sale, Start,Finish,days)
# Function to split per month
library(zoo)
addrowFun <- function(y){
temp <- do.call("rbind", by(y, 1:nrow(y), function(x) with(x, {
eom <- as.Date(as.yearmon(Start), frac = 1)
if (eom < Finish)
data.frame(customer, country, type, Start = c(Start, eom+1), Finish = c(eom, Finish))
else x
})))
return(temp)
}
loop <- df
for(i in 1:10){ #not all months are split up at once
loop <- addrowFun(loop)
}
# Calculating the days per month
loop$days <- as.numeric(difftime(loop$Finish,loop$Start, units="days"))
# Creating the function to get the monthly sales pro rata
sumFun <- function(x){
tempSum <- df[x$Start >= df$Start & x$Finish <= df$Finish & df$customer == x$customer,]
totalSale <- sum(tempSum$sale)
totalDays <- sum(tempSum$days)
return(x$days / totalDays * totalSale)
}
for(i in 1:length(loop$customer)){
loop$sale[i] <- sumFun(loop[i,])
}
loop
答案 1 :(得分:0)
CiAndrews,
感谢您的帮助和耐心。我已经设法通过小小的改动得到答案。我已经用“ plyr”包中的“ rbind.fill”替换了“ rbind”,此后一切运行正常。
请参见下面的sample2.csv头
customer country type sale Start Finish days
1 43108181108 US Commercial 3330 17/11/2016 24/02/2017 99
2 43108181108 US Commercial 2753 24/02/2017 23/05/2017 88
3 43108181108 US Commercial 3043 13/02/2018 18/05/2018 94
4 43108181108 US Commercial 4261 23/05/2017 18/08/2017 87
5 43103703637 UK Casual 881 4/11/2016 15/02/2017 103
6 43103703637 UK Casual 1172 26/07/2018 1/11/2018 98
请参见以下代码:
library(tidyr)
#read data and change the start and finish to data type
data <- read.csv("Sample2.csv")
data$Start <- as.Date(data$Start, "%d/%m/%Y")
data$Finish <- as.Date(data$Finish, "%d/%m/%Y")
customer <- data$customer
country <- data$country
days <- data$days
Finish <- data$Finish
Start <- data$Start
sale <- data$sale
type <- data$type
df <- data.frame(customer, country, type, sale, Start, Finish, days)
# Function to split per month
library(zoo)
library(plyr)
addrowFun <- function(y){
temp <- do.call("rbind.fill", by(y, 1:nrow(y), function(x) with(x, {
eom <- as.Date(as.yearmon(Start), frac = 1)
if (eom < Finish)
data.frame(customer, country, type, Start = c(Start, eom+1), Finish = c(eom, Finish))
else x
})))
return(temp)
}
loop <- df
for(i in 1:10){ #not all months are split up at once
loop <- addrowFun(loop)
}
# Calculating the days per month
loop$days <- as.numeric(difftime(loop$Finish,loop$Start, units="days"))
# Creating the function to get the monthly sales pro rata
sumFun <- function(x){
tempSum <- df[x$Start >= df$Start & x$Finish <= df$Finish & df$customer == x$customer,]
totalSale <- sum(tempSum$sale)
totalDays <- sum(tempSum$days)
return(x$days / totalDays * totalSale)
}
for(i in 1:length(loop$customer)){
loop$sale[i] <- sumFun(loop[i,])
}
loop
答案 2 :(得分:0)
我刚刚运行了CIAndrews的代码。它似乎在大多数情况下都有效,但是在具有10,000行的数据集上运行时,它的运行速度非常慢。经过几分钟的等待,我最终取消了执行。天数也有问题:例如,七月有31天,但是days
变量仅显示30天。确实31-1 = 30,但是第一天也应该算在内。
下面的代码在我的2015 MacBook Pro上仅花费大约21秒(不包括数据生成),并且还解决了另一个问题。
library(tidyverse)
library(lubridate)
# generate data -------------------------------------------------------------
set.seed(666)
# assign variables
customer <- sample.int(n = 2000, size = 10000, replace = T)
country <- sample(c("US", "UK", "DE", "FR", "IS"), 10000, replace = T)
type <- sample(c("commercial", "casual", "other"), 10000, replace = T)
start <- sample(seq(dmy("7/06/2011"), today(), by = "day"), 10000, replace = T)
days <- sample(85:105, 10000, replace = T)
end <- start + days
sale <- sample(500:3000, 10000, replace = T)
# generate dataframe of artificial data
df_quarterly <- tibble(customer, country, type, sale, start, end, days)
# split quarters into months ----------------------------------------------
# initialize empty list with length == nrow(dataframe)
list_date_dfs <- vector(mode = "list", length = nrow(df_quarterly))
# for-loop generates new dates and adds as dataframe to list
for (i in 1:length(list_date_dfs)) {
# transfer dataframe row to variable `row`
row <- df_quarterly[i,]
# correct end date so split successful when interval doesn't cover full month
end_corr <- row$end + day(row$start) - day(row$end)
# use lubridate to compute first and last days of relevant months
m_start <- seq(row$start, end_corr, by = "month") %>%
floor_date(unit = "month")
m_end <- m_start + days_in_month(m_start) - 1
# replace first and last elements with original dates
m_start[1] <- row$start
m_end[length(m_end)] <- row$end
# compute the number of days per month as well as sales per month
# correct difference by adding 1
m_days <- as.integer(m_end - m_start) + 1
m_sale <- (row$sale / sum(m_days)) * m_days
# add tibble to list
list_date_dfs[[i]] <- tibble(customer = row$customer,
country = row$country,
type = row$type,
sale = m_sale,
start = m_start,
end = m_end,
days = m_days
)
}
# bind dataframe list elements into single dataframe
df_monthly <- bind_rows(list_date_dfs)