sum values between start and end dates, for categories, for every day in a period of time in R

时间:2018-02-01 18:30:50

标签: r

I have a set of tasks that have start and end dates. Each task has a category, too. I'd like to specify a particular date range, and sum all of the values within that date range, for each category. I'd be ok with the results ending up in a wide format (results1) or in a long format (results2). If either of those makes this easier, that's fine with me.

I tried to make my example below reproducible.

require(lubridate)
require(dplyr)
require(ggplot2)

dates <- seq(from = ymd("2018-01-01"), to = ymd("2018-01-31"), by = "day") %>% 
  as_data_frame() %>% 
  rename(Date = value) %>%
  arrange(Date)


tasks <- data.frame(
  task = c("task 1", "task 2", "task 3", "task 4"),
  category = c("cat1", "cat1", "cat2", "cat2"),
  start.date = c(ymd("2018-01-01"), ymd("2018-01-15"), ymd("2018-01-18"), ymd("2018-01-25")),
  end.date = c(ymd("2018-01-07"), ymd("2018-01-27"), ymd("2018-02-15"), ymd("2018-01-31")),
  value = c(1,3,5,7)
)

# desired results example 1: sums in wide format
results1 <- bind_cols(
  dates,
  cat1 = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0),
  cat2 = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 12, 12, 12, 12, 12, 12, 12)
)


# desired results example 2: sums in long format
results2 <- bind_cols(
  bind_rows(dates, dates),
  category = c("cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat1", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2", "cat2"),
  value = c(1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 12, 12, 12, 12, 12, 12, 12)
)

#graph the results
ggplot(results2, aes(Date, value, color = category)) + geom_line()

3 个答案:

答案 0 :(得分:3)

DF1 = do.call(rbind, lapply(split(tasks, tasks$category), function(df1){
    do.call(rbind, lapply(dates$Date, function(d){
        data.frame(Date = d,
                   category = df1$category[1],
                   value = sum(df1$value[d >= df1$start.date & d <= df1$end.date]),
                   stringsAsFactors = FALSE)
    }))
}))
head(DF1)
#             Date category value
#cat1.1 2018-01-01     cat1     1
#cat1.2 2018-01-02     cat1     1
#cat1.3 2018-01-03     cat1     1
#cat1.4 2018-01-04     cat1     1
#cat1.5 2018-01-05     cat1     1
#cat1.6 2018-01-06     cat1     1

graphics.off()
ggplot(DF1, aes(Date, value, color = category)) + geom_line()

enter image description here

答案 1 :(得分:2)

使用sqldf的解决方案可能是:

 # Data
 require(lubridate)
 require(dplyr)
 require(ggplot2)

 dates <- seq(from = ymd("2018-01-01"), to = ymd("2018-01-31"), by = "day") %>% 
   as_data_frame() %>% 
   rename(Date = value) %>%
   arrange(Date)


 tasks <- data.frame(
   task = c("task 1", "task 2", "task 3", "task 4"),
   category = c("cat1", "cat1", "cat2", "cat2"),
   start_date = c(ymd("2018-01-01"), ymd("2018-01-15"), ymd("2018-01-18"), ymd("2018-01-25")),
   end_date = c(ymd("2018-01-07"), ymd("2018-01-27"), ymd("2018-02-15"), ymd("2018-01-31")),
   value = c(1,3,5,7)
 )


 library(sqldf)
 # Dates with valid values 
 result <- sqldf("SELECT dates.Date, tasks.category,  sum(tasks.value) as value
                  FROM dates, tasks 
                  WHERE dates.Date <= tasks.end_date AND
                  dates.Date >= tasks.start_date 
                  GROUP BY dates.Date, tasks.category")

  #Dates with no values for each category is found and joined with result
  final_result <- dates %>% merge(unique(result$category)) %>%
                           mutate(category = y) %>%
                           anti_join(result, by = c("Date","category")) %>% 
                           mutate(value = 0) %>%
                           select(-y) %>%
                           union_all(result) %>% 
                            arrange(category, Date)

final_result
#         Date category value
#1  2018-01-01     cat1     1
#2  2018-01-02     cat1     1
#3  2018-01-03     cat1     1
#4  2018-01-04     cat1     1
#5  2018-01-05     cat1     1
#6  2018-01-06     cat1     1
#7  2018-01-07     cat1     1
#8  2018-01-08     cat1     0
#......so on 
#57 2018-01-26     cat2    12
#58 2018-01-27     cat2    12
#59 2018-01-28     cat2    12
#60 2018-01-29     cat2    12
#61 2018-01-30     cat2    12
#62 2018-01-31     cat2    12
#plot
ggplot(final_result, aes(Date, value, color = category)) + geom_line()

enter image description here

答案 2 :(得分:0)

您的结果集不清楚您追求的目标是什么......但是从您的描述中可以看出:

  

我想指定一个特定的日期范围,并对每个类别中该日期范围内的所有值求和

然后,您只需按日期范围(即按开始日期和结束日期)和类别进行分组,并汇总值:

tasks %>% 
group_by(category, start.date, end.date) %>% 
summarise(value = sum(value))

# A tibble: 4 x 4
# Groups: category, start.date [?]
category start.date end.date   value
<fct>    <date>     <date>     <dbl>
1 cat1     2018-01-01 2018-01-07  1.00
2 cat1     2018-01-15 2018-01-27  3.00
3 cat2     2018-01-18 2018-02-15  5.00
4 cat2     2018-01-25 2018-01-31  7.00