I have a set of tasks, each with a start date, an end date, a category, and a value. I'd like to specify a particular date range and, for every day in that range, sum the values of all tasks that are active on that day, separately for each category. I'd be OK with the results ending up in either a wide format (results1) or a long format (results2); whichever of those is easier is fine with me.
I tried to make my example below reproducible.
require(lubridate)
require(dplyr)
require(ggplot2)
# Calendar of every day in the reporting window, stored as a one-column
# tibble whose column is named `Date`.
# The column is created directly with tibble() instead of converting a bare
# vector with the deprecated as_data_frame() and renaming `value` afterwards.
# seq() already yields the days in ascending order, so no sort is needed.
dates <- tibble(
  Date = seq(from = ymd("2018-01-01"), to = ymd("2018-01-31"), by = "day")
)
# Example tasks: one row per task, giving its category, the first and last
# day on which it is active, and the value it contributes.
tasks <- data.frame(
  task = c("task 1", "task 2", "task 3", "task 4"),
  category = c("cat1", "cat1", "cat2", "cat2"),
  # ymd() is vectorised, so each date column is parsed in a single call.
  start.date = ymd(c("2018-01-01", "2018-01-15", "2018-01-18", "2018-01-25")),
  end.date = ymd(c("2018-01-07", "2018-01-27", "2018-02-15", "2018-01-31")),
  value = c(1, 3, 5, 7)
)
# Desired results, example 1: daily sums in wide format (one column per
# category). Each column is written as runs of equal values; the run lengths
# add up to the 31 days in `dates`.
results1 <- bind_cols(
  dates,
  cat1 = rep(c(1, 0, 3, 0), times = c(7, 7, 13, 4)),
  cat2 = rep(c(0, 5, 12), times = c(17, 7, 7))
)
# Desired results, example 2: daily sums in long format. The calendar is
# stacked twice, first for cat1 and then for cat2, with 31 rows each.
results2 <- bind_cols(
  bind_rows(dates, dates),
  category = rep(c("cat1", "cat2"), each = 31),
  # Runs of equal values: the cat1 series followed by the cat2 series. The
  # run of 21 zeros spans the end of cat1 (4 days) and the start of cat2 (17).
  value = rep(c(1, 0, 3, 0, 5, 12), times = c(7, 7, 13, 21, 7, 7))
)
# Graph the long-format results: one line per category over the date range.
ggplot(results2, aes(x = Date, y = value, colour = category)) +
  geom_line()
答案 0 :(得分:3)
# Long-format daily sums. For each category, and for each day in `dates`, add
# up the `value` of every task whose start.date..end.date span (both ends
# inclusive) contains that day. Days with no active task get 0, because sum()
# over an empty selection is 0.
#
# Each category's rows are built as whole columns in one data.frame() call
# instead of creating a single-row data frame per day and row-binding them.
# The per-category pieces are then stacked; because split() names them by
# category, the row names come out as "cat1.1", "cat1.2", ...
DF1 <- do.call(rbind, lapply(split(tasks, tasks$category), function(df1) {
  data.frame(
    Date = dates$Date,
    category = df1$category[1],
    value = vapply(
      dates$Date,
      function(d) sum(df1$value[d >= df1$start.date & d <= df1$end.date]),
      numeric(1)
    ),
    stringsAsFactors = FALSE
  )
}))
# Preview the first six rows of the long-format result.
head(DF1, n = 6L)
# Date category value
#cat1.1 2018-01-01 cat1 1
#cat1.2 2018-01-02 cat1 1
#cat1.3 2018-01-03 cat1 1
#cat1.4 2018-01-04 cat1 1
#cat1.5 2018-01-05 cat1 1
#cat1.6 2018-01-06 cat1 1
# Close any open graphics devices, then draw one line per category.
graphics.off()
ggplot(DF1, aes(x = Date, y = value, colour = category)) +
  geom_line()
答案 1 :(得分:2)
使用 sqldf 的解决方案可能是:
# Data
require(lubridate)
require(dplyr)
require(ggplot2)
# Calendar of every day in the reporting window, stored as a one-column
# tibble whose column is named `Date`.
# The column is created directly with tibble() instead of converting a bare
# vector with the deprecated as_data_frame() and renaming `value` afterwards.
# seq() already yields the days in ascending order, so no sort is needed.
dates <- tibble(
  Date = seq(from = ymd("2018-01-01"), to = ymd("2018-01-31"), by = "day")
)
# Example tasks: one row per task, giving its category, the first and last
# day on which it is active, and the value it contributes. The date columns
# use underscores rather than dots in their names.
tasks <- data.frame(
  task = c("task 1", "task 2", "task 3", "task 4"),
  category = c("cat1", "cat1", "cat2", "cat2"),
  # ymd() is vectorised, so each date column is parsed in a single call.
  start_date = ymd(c("2018-01-01", "2018-01-15", "2018-01-18", "2018-01-25")),
  end_date = ymd(c("2018-01-07", "2018-01-27", "2018-02-15", "2018-01-31")),
  value = c(1, 3, 5, 7)
)
library(sqldf)
# Dates with valid values: pair every calendar day with every task, keep the
# pairs where the day lies inside the task's start_date..end_date span (both
# ends inclusive), and sum `value` per day and category.
# Day/category combinations with no matching task produce no row in `result`.
result <- sqldf("SELECT dates.Date, tasks.category, sum(tasks.value) as value
FROM dates, tasks
WHERE dates.Date <= tasks.end_date AND
dates.Date >= tasks.start_date
GROUP BY dates.Date, tasks.category")
# Build the complete Date x category grid and fill it in from `result`, so
# that every category has a row for every day and combinations with no active
# task get a value of 0.
# The categories come from `tasks`, not from `result`: a category whose tasks
# all fall outside the date range never appears in `result`, and would
# otherwise be missing from the output instead of showing zeros.
final_result <- merge(dates, unique(tasks["category"]), by = NULL) %>%
  # by = NULL makes merge() return the Cartesian product of days and categories.
  left_join(result, by = c("Date", "category")) %>%
  mutate(value = coalesce(value, 0)) %>%
  arrange(category, Date)
# Show the completed long-format table (abridged output below).
final_result
# Date category value
#1 2018-01-01 cat1 1
#2 2018-01-02 cat1 1
#3 2018-01-03 cat1 1
#4 2018-01-04 cat1 1
#5 2018-01-05 cat1 1
#6 2018-01-06 cat1 1
#7 2018-01-07 cat1 1
#8 2018-01-08 cat1 0
#......so on
#57 2018-01-26 cat2 12
#58 2018-01-27 cat2 12
#59 2018-01-28 cat2 12
#60 2018-01-29 cat2 12
#61 2018-01-30 cat2 12
#62 2018-01-31 cat2 12
# Plot: one line per category over the date range.
ggplot(final_result, aes(x = Date, y = value, colour = category)) +
  geom_line()
答案 2 :(得分:0)
从您给出的预期结果中看不出您想要达到的目标……不过根据您的描述:
我想指定一个特定的日期范围,并对每个类别中该日期范围内的所有值求和
然后,您只需按日期范围(即按开始日期和结束日期)和类别进行分组,并汇总值:
# Sum `value` for each category and each exact (start.date, end.date) span.
# summarise() removes only the last grouping variable, so without ungroup()
# the result would stay grouped by category and start.date and later verbs
# would silently operate per group.
tasks %>%
  group_by(category, start.date, end.date) %>%
  summarise(value = sum(value)) %>%
  ungroup()
# # A tibble: 4 x 4
# category start.date end.date value
# <fct> <date> <date> <dbl>
# 1 cat1 2018-01-01 2018-01-07 1.00
# 2 cat1 2018-01-15 2018-01-27 3.00
# 3 cat2 2018-01-18 2018-02-15 5.00
# 4 cat2 2018-01-25 2018-01-31 7.00