注意:我已经针对 SQL 提过一个类似的问题:SQL - How to use a window function to determine when to perform different tasks in Hive or Postgres?
数据
我有一些数据显示每人不同的优先排序任务的开始日期和结束日期:
# Example input: one row per (person, task) with the day the task becomes
# available (start_day) and the common final day (end_day).
input_df <- data.frame(
  person = rep(c("Kate", "Adam", "Eve", "Jason"), times = c(2, 2, 2, 5)),
  task_key = c(rep(c("A", "B"), times = 3), LETTERS[1:5]),
  start_day = c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 4L, 3L, 5L, 4L),
  end_day = 5L
)
   person task_key start_day end_day
1    Kate        A         1       5
2    Kate        B         1       5
3    Adam        A         1       5
4    Adam        B         2       5
5     Eve        A         2       5
6     Eve        B         1       5
7   Jason        A         1       5
8   Jason        B         4       5
9   Jason        C         3       5
10  Jason        D         5       5
11  Jason        E         4       5
注意:任务键是有序的,字母越靠后的任务优先级越高。
问题
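我需要弄清楚每个人每天应该做哪项任务,条件是(原文的条件列表在此处缺失,以下两条是根据下文的示例输出和循环代码推断的,仅供参考):
- 优先级更高的任务一旦开始,就会取代当前优先级较低的任务;
- 如果存在某个优先级更高的任务,其开始日不晚于当前任务的开始日,则当前任务永远不会被执行(valid_from 和 valid_to 均为 NA)。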
简化
在实际数据中,end_day在原始表中始终为5,即只有start_day变化但end_day是常量。这意味着我所需的输出将与原始表具有相同的行数:)
输出
这是我需要的那种输出(杰森更能代表我拥有的数据,可以覆盖90天的超过100个任务):
# Expected result: the input columns plus the interval (valid_from, valid_to)
# during which each task is actually worked on; NA means the task is never
# worked on.
output_df <- data.frame(
  person = rep(c("Kate", "Adam", "Eve", "Jason"), times = c(2, 2, 2, 5)),
  task_key = c(rep(c("A", "B"), times = 3), LETTERS[1:5]),
  start_day = c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 4L, 3L, 5L, 4L),
  end_day = 5L,
  valid_from = c(NA, 1L, 1L, 2L, NA, 1L, 1L, NA, 3L, NA, 4L),
  valid_to = c(NA, 5L, 2L, 5L, NA, 5L, 3L, NA, 4L, NA, 5L)
)
   person task_key start_day end_day valid_from valid_to
1    Kate        A         1       5         NA       NA
2    Kate        B         1       5          1        5
3    Adam        A         1       5          1        2
4    Adam        B         2       5          2        5
5     Eve        A         2       5         NA       NA
6     Eve        B         1       5          1        5
7   Jason        A         1       5          1        3
8   Jason        B         4       5         NA       NA
9   Jason        C         3       5          3        4
10  Jason        D         5       5         NA       NA
11  Jason        E         4       5          4        5
初步想法
下面的代码可以工作,但我想要一个使用dbplyr包函数、并且比它更通用的解决方案:
# Reference implementation for a single person.
# Assumes the rows are ordered by ascending priority, so every row after row i
# has a higher priority than row i.
tmp <- input_df %>% filter(person == "Jason")
num_rows <- nrow(tmp)
# Typed NA so both result columns are integer from the start.
tmp$valid_from <- NA_integer_
tmp$valid_to <- NA_integer_
# seq_len() iterates zero times for an empty table, unlike 1:num_rows.
for (i in seq_len(num_rows)) {
  # Current value
  current_value <- tmp$start_day[i]
  # Values to test against: the start days of the rows after i.
  # lead() pads the tail with NA, hence the na.rm = TRUE below.
  # The full column name is used; `tmp$start` only worked through partial
  # matching of `$` on data frames.
  vec <- lead(tmp$start_day, i)
  # The task is superseded when any later row starts on or before it.
  superseded <- any(current_value >= vec, na.rm = TRUE)
  is_last <- i == num_rows
  if (superseded && !is_last) {
    tmp$valid_from[i] <- NA_integer_
    tmp$valid_to[i] <- NA_integer_
  } else if (!is_last) {
    # Runs from its own start until the first later task takes over.
    tmp$valid_from[i] <- current_value
    tmp$valid_to[i] <- min(vec, na.rm = TRUE)
  } else {
    # The last row has nothing after it and runs to the final day.
    tmp$valid_from[i] <- current_value
    tmp$valid_to[i] <- max(tmp$end_day, na.rm = TRUE)
  }
}
tmp
  person task_key start_day end_day valid_from valid_to
1  Jason        A         1       5          1        3
2  Jason        B         4       5         NA       NA
3  Jason        C         3       5          3        4
4  Jason        D         5       5         NA       NA
5  Jason        E         4       5          4        5
跟进问题
最终我需要在SQL中执行此操作,但这似乎太难了。我听说'dbplyr'包可以帮助我:如果我能用dplyr函数解决这个问题,它是否就能以某种方式将其转换为有效的SQL查询?
答案 0 :(得分:1)
使用tidyverse包的解决方案。`map2` 和 `unnest` 用于扩展数据集。`arrange(person, desc(task_key))` 和 `distinct(person, Days, .keep_all = TRUE)` 将根据 `task_key` 的顺序删除重复项。之后,我们可以使用 `slice` 选择最后一行并调整开始和结束日期。
library(tidyverse)
# Expand every task to one row per day it could run, keep only the
# highest-priority task for each person/day, then collapse back to a single
# validity interval per task and attach it to the original rows.
output_df <- input_df %>%
  # One list element per task holding start_day:end_day.
  mutate(Days = map2(start_day, end_day, `:`)) %>%
  # Name the list-column explicitly: calling unnest() without columns is
  # deprecated since tidyr 1.0.0.
  unnest(Days) %>%
  # Highest priority first, so distinct() keeps the winning task for each day.
  arrange(person, desc(task_key)) %>%
  distinct(person, Days, .keep_all = TRUE) %>%
  arrange(person, task_key, Days) %>%
  group_by(person, task_key) %>%
  # Last day on which the task is the top priority.
  slice(n()) %>%
  # The task hands over on the following day unless it runs up to end_day.
  mutate(end_day = ifelse(Days < end_day, Days + 1L, end_day)) %>%
  ungroup() %>%
  select(person, task_key, valid_from = start_day, valid_to = end_day) %>%
  # Join onto the input (not the other way round) so the result keeps the
  # input's row order on every dplyr version; right_join() stopped ordering
  # rows by the right-hand table in dplyr 1.0.0. Tasks that never win a day
  # have no match and get NA in both valid_* columns.
  left_join(as_tibble(input_df), ., by = c("person", "task_key")) %>%
  select(all_of(names(input_df)), starts_with("valid"))
output_df
# # A tibble: 11 x 6
# person task_key start_day end_day valid_from valid_to
# <fct> <fct> <int> <int> <int> <int>
# 1 Kate A 1 5 NA NA
# 2 Kate B 1 5 1 5
# 3 Adam A 1 5 1 2
# 4 Adam B 2 5 2 5
# 5 Eve A 2 5 NA NA
# 6 Eve B 1 5 1 5
# 7 Jason A 1 5 1 3
# 8 Jason B 4 5 NA NA
# 9 Jason C 3 5 3 4
# 10 Jason D 5 5 NA NA
# 11 Jason E 4 5 4 5
答案 1 :(得分:0)
有趣的是,我必须在本周早些时候做一些类似的事情,但是在不同的背景下。
下面介绍了仅使用 `dplyr` 包的解决方案(步骤10中有警告,但我认为可以忽略)。
至于如何将这个 `dplyr` 解决方案转换为能生成有效SQL代码的 `dbplyr` 解决方案,我不知道该怎么做(我试了一下,但没有成功)。
编辑:在问题的原始版本中,您使用的是任务键的数字而不是字母。在我发布之前,我没有看到你编辑了你的问题:)
带注释的代码:
# Load packages.
library(DBI)
library(dplyr)
library(dbplyr)
library(RSQLite)
library(RPostgreSQL)
# Data: same example as in the question, but with integer task keys
# (a larger key means a higher priority).
input_df <- data.frame(
  person = rep(c("Kate", "Adam", "Eve", "Jason"), times = c(2, 2, 2, 5)),
  task_key = c(rep(1:2, times = 3), 1:5),
  start_day = c(1L, 1L, 1L, 2L, 2L, 1L, 1L, 4L, 3L, 5L, 4L),
  end_day = 5L
)
# [OPTIONAL] Convert to a database; I couldn't figure out how to make an in-memory version of PostgreSQL using RPostgreSQL::PostgreSQL()
# If this worked, then you could use the show_query() function to see the SQL it generates.
#con <- DBI::dbConnect(RSQLite::SQLite(), ":memory:")
#DBI::dbWriteTable(con, "input_df", input_df)
#input_df <- tbl(con, "input_df")
# Step 01: Keep only the columns needed for the comparison, under short names
# (tk = task key, sd = start day), and drop duplicate rows.
df01 <- input_df %>%
  select(person, tk = task_key, sd = start_day) %>%
  distinct()
# show_query(df01)
# Step 02: Self-join on person so every task is paired with every task of the
# same person; ".bas" marks the baseline task, ".alt" the comparator.
df02 <- df01 %>%
  left_join(df01, by = "person", suffix = c(".bas", ".alt"))
# show_query(df02)
# Step 03: Drop the pairs that compare a task with itself.
df03 <- df02 %>%
  filter(tk.bas != tk.alt)
# show_query(df03)
# Step 04: Flag the pairs in which the baseline task outranks the comparator
# (a larger task key means a higher priority).
df04 <- df03 %>%
  mutate(tk.bas_priority = tk.bas > tk.alt)
# show_query(df04)
# Step 05: Flag the pairs in which the baseline task starts strictly earlier
# than the comparator.
df05 <- df04 %>%
  mutate(sd.bas_earliest = sd.bas < sd.alt)
# show_query(df05)
# Step 06: Placeholder for reducing the number of comparisons.
# There is probably a way to do this, but it has not been looked into.
df06 <- df05
# show_query(df06)
# Step 07: Reorder the columns for readability.
df07 <- df06 %>%
  select(
    person,
    tk.bas, tk.alt, tk.bas_priority,
    sd.bas, sd.alt, sd.bas_earliest
  )
# show_query(df07)
# Step 08: One group per person and baseline task.
df08 <- df07 %>%
  group_by(person, tk.bas)
# show_query(df08)
# Step 09: Create start dates.
# A pair yields NA only when the baseline neither outranks the comparator nor
# starts before it; every other flagged pair yields the baseline start day.
# -1L is the fallback for pairs whose flags are NA.
df09 <- mutate(
  df08,
  start = case_when(
    tk.bas_priority & sd.bas_earliest ~ sd.bas,
    tk.bas_priority & !sd.bas_earliest ~ sd.bas,
    !tk.bas_priority & sd.bas_earliest ~ sd.bas,
    !tk.bas_priority & !sd.bas_earliest ~ NA_integer_,
    TRUE ~ -1L
  )
)
# na.rm = FALSE on purpose: a single NA pair makes the whole group NA.
df09 <- mutate(df09, start = as.integer(min(start, na.rm = FALSE)))
# show_query(df09)
# Step 10: Create end dates.
# Note: min()/max() on an empty vector return Inf/-Inf with a warning, and
# as.integer() then turns that into NA with a second warning. case_when()
# evaluates every right-hand side, so this used to happen for each group
# without a higher-priority comparator. The helper returns NA directly, which
# gives the same values without the warnings.
extreme_or_na <- function(x, fun) {
  if (length(x) == 0L) NA_integer_ else as.integer(fun(x))
}
df10 <- df09 %>%
  mutate(end = case_when(
    tk.bas_priority == TRUE & sd.bas_earliest == TRUE ~ extreme_or_na(sd.alt, max),
    tk.bas_priority == TRUE & sd.bas_earliest == FALSE ~ extreme_or_na(sd.alt, max),
    # Earliest start among the comparators that outrank the baseline task.
    tk.bas_priority == FALSE & sd.bas_earliest == TRUE ~
      extreme_or_na(sd.alt[tk.bas_priority == FALSE], min),
    tk.bas_priority == FALSE & sd.bas_earliest == FALSE ~ NA_integer_,
    TRUE ~ -1L
  )) %>%
  # na.rm = FALSE on purpose: a single NA pair makes the whole group NA.
  mutate(end = as.integer(min(end, na.rm = FALSE)))
# show_query(df10)
# Step 11: Remove the grouping.
df11 <- df10 %>%
  ungroup()
# show_query(df11)
# Step 12: Collapse the pair-wise rows to one start/end pair per person and
# baseline task.
df12 <- df11 %>%
  distinct(person, tk.bas, start, end)
# show_query(df12)
# Step 13: Attach the computed start/end values to the original rows.
df13 <- input_df %>%
  left_join(df12, by = c(person = "person", task_key = "tk.bas"))
# show_query(df13)
# Step 14: The last row of each person ends on that person's largest end_day;
# every other row keeps the end value computed so far.
df14 <- ungroup(
  mutate(
    group_by(df13, person),
    end = if_else(row_number() < n(), end, as.integer(max(end_day)))
  )
)
# show_query(df14)
# collect(df14)
**输出:**
# A tibble: 11 x 6
person task_key start_day end_day start end
<fct> <int> <int> <int> <int> <int>
1 Kate 1 1 5 NA NA
2 Kate 2 1 5 1 5
3 Adam 1 1 5 1 2
4 Adam 2 2 5 2 5
5 Eve 1 2 5 NA NA
6 Eve 2 1 5 1 5
7 Jason 1 1 5 1 3
8 Jason 2 4 5 NA NA
9 Jason 3 3 5 3 4
10 Jason 4 5 5 NA NA
11 Jason 5 4 5 4 5