我的数据如下:
date schedule_id food_truck_id building_id truck_status last_confirmed_date dsle
2018-04-26 422 58 30 accepted_event 0 31
2018-04-26 422 59 30 accepted_event 2018-02-27 11
2018-04-26 422 65 30 accepted_event 2018-03-15 12
2018-04-26 422 88 30 accepted_event 2018-02-20 7
2018-04-26 422 89 30 accepted_event 2018-03-22 13
2018-04-26 422 101 30 accepted_event 2018-02-06 16
2018-04-26 422 120 30 accepted_event 2018-03-06 14
2018-04-26 422 135 30 accepted_event 2018-03-13 21
2018-04-26 399 42 33 accepted_event 2018-03-15 8
2018-04-26 399 58 33 accepted_event 0 31
2018-04-26 399 59 33 accepted_event 2018-03-01 11
2018-04-26 399 65 33 accepted_event 2018-02-27 12
2018-04-26 399 88 33 accepted_event
可以使用以下方式复制:
structure(list(date = structure(c(17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647, 17647,
17647, 17647, 17647, 17647), class = "Date"), schedule_id = c(422L,
422L, 422L, 422L, 422L, 422L, 422L, 422L, 399L, 399L, 399L, 399L,
399L, 399L, 399L, 399L, 399L), food_truck_id = c(58L, 59L, 65L,
88L, 89L, 101L, 120L, 135L, 42L, 58L, 59L, 65L, 88L, 89L, 101L,
120L, 135L), building_id = c(30L, 30L, 30L, 30L, 30L, 30L, 30L,
30L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L, 33L), truck_status = c("accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event",
"accepted_event", "accepted_event", "accepted_event", "accepted_event"
), last_confirmed_date = c("0", "2018-02-27", "2018-03-15", "2018-02-20",
"2018-03-22", "2018-02-06", "2018-03-06", "2018-03-13", "2018-03-15",
"0", "2018-03-01", "2018-02-27", "0", "2018-03-06", "2018-03-13",
"0", "2018-02-22"), dsle = c(31, 11, 12, 7, 13, 16, 14, 21, 8,
31, 11, 12, 7, 13, 16, 14, 21)), .Names = c("date", "schedule_id",
"food_truck_id", "building_id", "truck_status", "last_confirmed_date",
"dsle"), row.names = c(142L, 223L, 379L, 455L, 495L, 589L, 806L,
877L, 63L, 155L, 215L, 287L, 452L, 483L, 667L, 809L, 894L), class = "data.frame")
我的目标是仅根据max(dsle)选择food_truck_id,但每个日期应该是唯一的。例如,对于schedule_id 422,food_truck_id(max(dsle)为58,schedule_id 399也为58.
我想要的是,让我们说422,它是58,但对于399,它应该是下一个最大值(dsle)而不是58.
我尝试过以下内容,但它没有提供我想要的内容。
testxx %>%
group_by(schedule_id) %>%
distinct(food_truck_id, date, dsle) %>%
filter(dsle == max(dsle))
我想要的结果是
date schedule_id food_truck_id
2018-04-26 422 58
2018-04-26 399 135
因为58旁边的135有max(dsle)
答案 0 :(得分:3)
已更新为帐户日期
这可能是循环是最好/最简单的解决方案之一。
但是,它在循环中执行连接操作,因此可以进行一些优化
我们的想法是遍历每个schedule_id
,并跟踪哪个food_trucks
已在哪个日期使用过。
如果我们在循环之前做一些预先安排数据,那么事情会变得更容易
df <- df %>%
arrange(schedule_id, -dsle)
## pre-allocate a result data.frame
ids <- unique(df$schedule_id)
df_res <- data.frame(schedule_id = ids,
food_truck_id = NA)
usedTrucks <- data.frame(date = as.Date(NA),
schedule_id = ids,
food_truck_id = NA_integer_)
counter <- 1
for(i in ids) {
possibleTrucks <- df[df$schedule_id %in% i, c("date", "food_truck_id")]
## possible Trucks will be in order, as we have pre-arranged the data
## use the first one that hasn't already been used
## on the given date
possibleTrucks <- anti_join(possibleTrucks, usedTrucks, by = c("date", "food_truck_id"))
thisTruck <- possibleTrucks[1, c("food_truck_id", "date")]
df_res[counter, 'food_truck_id'] <- thisTruck$food_truck_id
usedTrucks[counter, "food_truck_id"] <- thisTruck$food_truck_id
usedTrucks[counter, "date"] <- thisTruck$date
counter <- counter + 1
}
df_res
# schedule_id food_truck_id
# 1 399 58
# 2 422 135
如果速度是较大数据集的问题,可以在Rcpp
中重写,以使速度更快。
答案 1 :(得分:1)
p<-df %>% arrange(desc(schedule_id), desc(dsle)) %>% slice(1) %>% select(date,dsle,schedule_id,food_truck_id)
df %>% subset(!(schedule_id%in%c(p))) %>% subset(!(dsle%in%c(p))) %>% select(date,dsle,schedule_id,food_truck_id) %>% arrange(desc(dsle)) %>% slice(1) %>%
rbind(p,.) %>% select(-dsle)
输出
# A tibble: 2 x 3
date schedule_id food_truck_id
<date> <int> <int>
1 2018-04-26 422 58
2 2018-04-26 399 135