Question

请参见下面的示例数据。

我想将季度销售数据（带有开始日期和结束日期）转换为每月销售数据。

例如：

数据集A行1将分别分为数据集B行6月，7月和8月的第1行，第2行和第3行，并且销售将根据该月的天数按比例分配，所有其他列将按比例分配一样；
数据集A行2将提取第1行（结束于5/9/2017）中剩下的内容，并构成一个完整的9月。

是否有一种有效的执行方法，实际数据是一个csv文件，数据大小为100K x 15，将被拆分为大约300K x 15的新数据集以进行每月分析。

样本问题数据中的一些关键特征包括：

第一个季度销售数据的开始日期是客户加入的日期，因此可以是任何一天；
所有销售将按季度进行，但在90、91或92天之间的各个天中，但随着季度客户休假，季度销售数据也可能不完整。

示例问题：

  Customer.ID Country       Type Sale Start..Date  End.Date Days
1           1      US Commercial   91   7/06/2017 5/09/2017   91
2           1      US Commerical   92   6/09/2017 6/12/2017   92
3           2      US     Casual   25  10/07/2017 3/08/2017   25
4           3      UK Commercial   64   7/06/2017 9/08/2017   64

示例答案：

   Customer.ID Country       Type Sale Start.Date   End.Date Days
1           1      US Commercial   24  7/06/2017 30/06/2017   24
2           1      US Commercial   31  1/07/2017 31/07/2017   31
3           1      US Commercial   31  1/08/2017 31/08/2017   31
4           1      US Commercial   30  1/09/2017 30/09/2017   30
5           1      US Commercial   31  1/10/2017 31/10/2017   31
6           1      US Commercial   30  1/11/2017 30/11/2017   30
7           1      US Commercial    6  1/12/2017  6/12/2017    6
8           2      US     Casual   22 10/07/2017 31/07/2017   22
9           2      US     Casual    3  1/08/2017  3/08/2017    3
10          3      UK Commercial   24  7/06/2017 30/06/2017   24
11          3      UK Commercial   31  1/07/2017 31/07/2017   31
12          3      UK Commercial    9  1/08/2017  9/08/2017    9

Answer 1

由于使用了多个函数和循环，因此不美观，因为它包含多个操作：

# Creating the dataset
library(tidyr)
customer <- c(1,1,2,3)
country <- c("US","US","US","UK")
type <- c("Commercial","Commercial","Casual","Commercial")
sale <- c(91,92,25,64)
Start <- as.Date(c("7/06/2017","6/09/2017","10/07/2017","7/06/2017"),"%d/%m/%Y")
Finish <- as.Date(c("5/09/2017","6/12/2017","3/08/2017","9/08/2017"),"%d/%m/%Y")
days <- c(91,92,25,64)
df <- data.frame(customer,country, type,sale, Start,Finish,days)

# Function to split per month
library(zoo)
addrowFun <- function(y){
    temp <- do.call("rbind", by(y, 1:nrow(y), function(x) with(x, {
    eom <- as.Date(as.yearmon(Start), frac = 1)
    if (eom < Finish)
       data.frame(customer, country, type, Start = c(Start, eom+1), Finish = c(eom, Finish))
    else x
    })))
    return(temp)
 }
loop <- df
for(i in 1:10){ #not all months are split up at once
   loop <- addrowFun(loop)
}
# Calculating the days per month
loop$days <- as.numeric(difftime(loop$Finish,loop$Start, units="days"))

# Creating the function to get the monthly sales pro rata
sumFun <- function(x){
   tempSum <- df[x$Start >= df$Start & x$Finish <= df$Finish & df$customer == x$customer,]
   totalSale <- sum(tempSum$sale)
   totalDays <- sum(tempSum$days)
   return(x$days / totalDays * totalSale)
 }

for(i in 1:length(loop$customer)){
   loop$sale[i] <- sumFun(loop[i,])
}  

loop

Answer 2

CiAndrews，

感谢您的帮助和耐心。我已经设法通过小小的改动得到答案。我已经用“ plyr”包中的“ rbind.fill”替换了“ rbind”，此后一切运行正常。

请参见下面的sample2.csv头

    customer   country    type     sale      Start      Finish     days
1 43108181108    US    Commercial  3330    17/11/2016  24/02/2017   99
2 43108181108    US    Commercial  2753    24/02/2017  23/05/2017   88
3 43108181108    US    Commercial  3043    13/02/2018  18/05/2018   94
4 43108181108    US    Commercial  4261    23/05/2017  18/08/2017   87
5 43103703637    UK    Casual      881     4/11/2016   15/02/2017   103
6 43103703637    UK    Casual      1172    26/07/2018  1/11/2018    98

请参见以下代码：

library(tidyr)

#read data and change the start and finish to data type

data <- read.csv("Sample2.csv")
data$Start <- as.Date(data$Start, "%d/%m/%Y")
data$Finish <- as.Date(data$Finish, "%d/%m/%Y")
customer <- data$customer
country <- data$country
days <- data$days
Finish <- data$Finish
Start <- data$Start
sale <- data$sale
type <- data$type
df <- data.frame(customer, country, type, sale, Start, Finish, days)

# Function to split per month

library(zoo)
library(plyr)
addrowFun <- function(y){
    temp <- do.call("rbind.fill", by(y, 1:nrow(y), function(x) with(x, {
        eom <- as.Date(as.yearmon(Start), frac = 1)
        if (eom < Finish)
            data.frame(customer, country, type, Start = c(Start, eom+1), Finish = c(eom, Finish))
        else x
    })))
    return(temp)
}
loop <- df
for(i in 1:10){ #not all months are split up at once
    loop <- addrowFun(loop)
}

# Calculating the days per month

loop$days <- as.numeric(difftime(loop$Finish,loop$Start, units="days"))

# Creating the function to get the monthly sales pro rata

sumFun <- function(x){
    tempSum <- df[x$Start >= df$Start & x$Finish <= df$Finish & df$customer == x$customer,]
    totalSale <- sum(tempSum$sale)
    totalDays <- sum(tempSum$days)
    return(x$days / totalDays * totalSale)
}

for(i in 1:length(loop$customer)){
    loop$sale[i] <- sumFun(loop[i,])
}  

loop

Answer 3

我刚刚运行了CIAndrews的代码。它似乎在大多数情况下都有效，但是在具有10,000行的数据集上运行时，它的运行速度非常慢。经过几分钟的等待，我最终取消了执行。天数也有问题：例如，七月有31天，但是days变量仅显示30天。确实31-1 = 30，但是第一天也应该算在内。

下面的代码在我的2015 MacBook Pro上仅花费大约21秒（不包括数据生成），并且还解决了另一个问题。

library(tidyverse)
library(lubridate)


# generate data -------------------------------------------------------------

set.seed(666)

# assign variables
customer <- sample.int(n = 2000, size = 10000, replace = T)
country <- sample(c("US", "UK", "DE", "FR", "IS"), 10000, replace = T)
type <- sample(c("commercial", "casual", "other"), 10000, replace = T)
start <- sample(seq(dmy("7/06/2011"), today(), by = "day"), 10000, replace = T)
days <- sample(85:105, 10000, replace = T)
end <- start + days
sale <- sample(500:3000, 10000, replace = T)

# generate dataframe of artificial data
df_quarterly <- tibble(customer, country, type, sale, start, end, days)



# split quarters into months ----------------------------------------------

# initialize empty list with length == nrow(dataframe)
list_date_dfs <- vector(mode = "list", length = nrow(df_quarterly))

# for-loop generates new dates and adds as dataframe to list
for (i in 1:length(list_date_dfs)) {

    # transfer dataframe row to variable `row`
    row <- df_quarterly[i,]

    # correct end date so split successful when interval doesn't cover full month
    end_corr <- row$end + day(row$start) - day(row$end)

    # use lubridate to compute first and last days of relevant months
    m_start <- seq(row$start, end_corr, by = "month") %>% 
        floor_date(unit = "month")
    m_end <- m_start + days_in_month(m_start) - 1

    # replace first and last elements with original dates
    m_start[1] <- row$start
    m_end[length(m_end)] <- row$end

    # compute the number of days per month as well as sales per month
    # correct difference by adding 1
    m_days <- as.integer(m_end - m_start) + 1
    m_sale <- (row$sale / sum(m_days)) * m_days

    # add tibble to list
    list_date_dfs[[i]] <- tibble(customer = row$customer,
                                 country = row$country,
                                 type = row$type,
                                 sale = m_sale,
                                 start = m_start,
                                 end = m_end,
                                 days = m_days
    )
}

# bind dataframe list elements into single dataframe
df_monthly <- bind_rows(list_date_dfs)

#R-使用R将季度数据拆分为每月数据

3 个答案: