如果条目在其组中是最小值,则标记条目

时间:2017-11-11 23:10:21

标签: r dplyr

以下是我正在使用的数据:

top20 = structure(list(route = c("ORD-LGA", "ORD-LAX", "ORD-DFW", "ORD-SFO", 
"ORD-BOS", "ORD-ATL", "ORD-MSP", "ORD-DCA", "ORD-DEN", "ORD-MKE", 
"ORD-CLT", "ORD-PHL", "ORD-PHX", "ORD-MCO", "ORD-DTW", "ORD-LAS", 
"ORD-IAH", "ORD-MIA", "ORD-CVG", "ORD-CLE")), .Names = "route", row.names = c(NA, 
-20L), class = c("tbl_df", "tbl", "data.frame"))

routes = structure(list(route = c("ORD-LGA", "ORD-LAX", "ORD-DFW", "ORD-SFO", 
"ORD-BOS", "ORD-ATL", "ORD-MSP", "ORD-DCA", "ORD-DEN", "ORD-MKE", 
"ORD-CLT", "ORD-PHL", "ORD-PHX", "ORD-MCO", "ORD-DTW", "ORD-LAS", 
"ORD-IAH", "ORD-MIA", "ORD-CVG", "ORD-CLE", "ORD-SEA", "ORD-EWR", 
"ORD-BNA", "ORD-CMH", "ORD-DSM", "ORD-MSN", "ORD-IND", "ORD-GRR", 
"ORD-CID", "ORD-GRB", "ORD-SAN", "ORD-DAY", "ORD-SLC", "ORD-XNA", 
"ORD-STL", "ORD-FLL", "ORD-BWI", "ORD-TPA", "ORD-FWA", "ORD-AUS", 
"ORD-MEM", "ORD-MCI", "ORD-ICT", "ORD-PIA", "ORD-SNA", "ORD-PDX", 
"ORD-RIC", "ORD-FAR", "ORD-TYS", "ORD-TUL", "ORD-FSD", "ORD-OMA", 
"ORD-LIT", "ORD-LEX", "ORD-BUF", "ORD-SGF", "ORD-IAD", "ORD-PIT", 
"ORD-RDU", "ORD-FNT", "ORD-OKC", "ORD-RSW", "ORD-HPN", "ORD-TVC", 
"ORD-CMI", "ORD-EVV", "ORD-ROC", "ORD-SYR", "ORD-MLI", "ORD-MDT", 
"ORD-JFK", "ORD-BDL", "ORD-ATW", "ORD-MSY", "ORD-CWA", "ORD-AZO", 
"ORD-RST", "ORD-CAK", "ORD-MBS", "ORD-LNK", "ORD-LSE", "ORD-SBN", 
"ORD-DLH", "ORD-SAT", "ORD-SJU", "ORD-SDF", "ORD-BHM", "ORD-SPI", 
"ORD-CHS", "ORD-TOL", "ORD-BMI", "ORD-COS", "ORD-DBQ", "ORD-LAN", 
"ORD-ALB", "ORD-CHO", "ORD-SAV", "ORD-GSP", "ORD-ORF", "ORD-ROA", 
"ORD-AVL", "ORD-EAU", "ORD-CMX", "ORD-MKG", "ORD-ABE", "ORD-PAH", 
"ORD-SJC", "ORD-SCE", "ORD-MHK", "ORD-COU", "ORD-ANC", "ORD-ERI", 
"ORD-HSV", "ORD-ASE", "ORD-SUX", "ORD-ALO", "ORD-ELM", "ORD-TUS", 
"ORD-CAE", "ORD-PBI", "ORD-ELP", "ORD-JAN", "ORD-CRW", "ORD-ABQ", 
"ORD-BOI", "ORD-SMF", "ORD-BTV", "ORD-AVP", "ORD-PVD", "ORD-GSO", 
"ORD-MHT", "ORD-JAX", "ORD-JAC", "ORD-CHA", "ORD-RNO", "ORD-RAP", 
"ORD-PSP", "ORD-HNL", "ORD-OAK", "ORD-BIS", "ORD-PWM", "ORD-SRQ", 
"ORD-MOB", "ORD-MQT", "ORD-BZN", "ORD-MYR", "ORD-TTN", "ORD-PNS", 
"ORD-LBE", "ORD-ACY", "ORD-FCA", "ORD-MTJ", "ORD-OGG", "ORD-EGE", 
"ORD-STC", "ORD-FAI", "ORD-BIL", "ORD-STT", "ORD-HDN", "ORD-GUC", 
"ORD-COD", "ORD-GJT"), counts = c(9575L, 7941L, 7677L, 7378L, 
6558L, 6391L, 6361L, 5924L, 5613L, 4824L, 4755L, 4744L, 4419L, 
4416L, 4301L, 4268L, 4186L, 4166L, 4011L, 3937L, 3932L, 3874L, 
3577L, 3442L, 3290L, 3185L, 3026L, 2990L, 2843L, 2767L, 2673L, 
2643L, 2600L, 2597L, 2566L, 2531L, 2468L, 2434L, 2261L, 2239L, 
2228L, 2201L, 2190L, 2177L, 2164L, 2149L, 2141L, 2100L, 2099L, 
2071L, 2042L, 2036L, 2031L, 2014L, 2005L, 1997L, 1940L, 1921L, 
1894L, 1859L, 1841L, 1827L, 1800L, 1784L, 1783L, 1736L, 1656L, 
1641L, 1612L, 1594L, 1577L, 1521L, 1434L, 1399L, 1394L, 1326L, 
1324L, 1239L, 1190L, 1166L, 1153L, 1145L, 1102L, 1036L, 1022L, 
1016L, 1014L, 997L, 961L, 961L, 959L, 935L, 928L, 876L, 873L, 
869L, 864L, 835L, 798L, 780L, 715L, 668L, 667L, 667L, 666L, 666L, 
666L, 664L, 649L, 648L, 639L, 638L, 628L, 620L, 620L, 618L, 606L, 
606L, 595L, 580L, 578L, 578L, 576L, 550L, 543L, 534L, 519L, 506L, 
504L, 496L, 487L, 467L, 395L, 381L, 376L, 371L, 346L, 334L, 334L, 
333L, 333L, 319L, 287L, 286L, 248L, 242L, 224L, 223L, 158L, 147L, 
130L, 118L, 114L, 109L, 82L, 75L, 46L, 46L, 23L, 19L, 9L, 2L), 
    avg_delay = c(16.1197344073147, 17.0650178480367, 17.6837401627318, 
    19.3128691113858, 15.6198308270677, 16.0823548079975, 17.836747759283, 
    14.648705023466, 22.2875970391767, 12.8397227636994, 8.26507834299206, 
    15.7343851307543, 13.1397159871736, 16.0565735226752, 13.9368395773295, 
    18.4411069063387, 21.1003401360544, 15.3034801654904, 10.4566293305548, 
    16.2721800575766, 13.6062467997952, 23.5650903294368, 15.2143064002317, 
    12.2520252025203, 13.9027954256671, 8.96493549454185, 8.70938690682369, 
    11.1260006961364, 10.4089738874586, 7.63977272727273, 16.179003021148, 
    9.11252485089463, 18.1120622568093, 13.5470494417863, 13.6586634653862, 
    15.7466986794718, 18.6453222453222, 19.2895064288677, 8.42625231910946, 
    16.6359908883827, 11.5676181562939, 13.5095571095571, 13.9358187824445, 
    7.55892255892256, 11.6963448922212, 16.7737364194615, 11.881981544439, 
    11.5611793611794, 14.0765054294176, 12.4233067729084, 9.53057099545225, 
    13.0822752666328, 14.5291113381001, 7.80767236910316, 13.040826873385, 
    12.4557224236147, 17.9269190325973, 13.4247311827957, 17.5675965665236, 
    8.38544018058691, 16.1151481274455, 14.5619469026549, 9.63400236127509, 
    8.27155425219941, 10.0863905325444, 8.4207353827607, 10.0965130759651, 
    12.4070573408948, 7.76978417266187, 13.7852090032154, 22.7221502590674, 
    15.3613559322034, 5.83504398826979, 18.7053701015965, 11.650521609538, 
    8.19313647246608, 9.95961995249406, 8.0016583747927, 8.54759825327511, 
    8.38146167557932, 9.83061594202899, 12.0415913200723, 9.89130434782609, 
    19.4745596868885, 10.0815324165029, 9.23305954825462, 10.224593495935, 
    10.85625, 10.709167544784, 7.02214839424142, 8.65077605321508, 
    19.172338090011, 8.9032992036405, 7.58443113772455, 17.3629976580796, 
    17.4504181600956, 9.99174528301887, 7.50246913580247, 11.9432989690722, 
    7.20639147802929, 14.2639885222382, 8.98435054773083, 8.04458598726115, 
    8.76121794871795, 8.16275430359937, 9.00792393026941, 13.243119266055, 
    12.5658914728682, 12.0612903225806, 19.2476038338658, 13.9449685534591, 
    7.89123376623377, 12.6019900497512, 22.6994991652755, 11.2900505902192, 
    12.0187074829932, 7.90172413793103, 17.7095158597663, 11.7710843373494, 
    8.96858638743456, 15.3896797153025, 11.413357400722, 9.83150183150183, 
    13.593984962406, 19.8633776091082, 23.6628787878788, 14.5236220472441, 
    11.4355828220859, 14.8526970954357, 5.59213250517598, 12.8502109704641, 
    11.5802197802198, 28.0723514211886, 7.08219178082192, 24.3935309973046, 
    12.6383561643836, 8.11661807580175, 30.790273556231, 17.8363636363636, 
    9.37345679012346, 14.3241590214067, 12.7179487179487, 13.978021978022, 
    20.1957295373665, 14.6788617886179, 19.1004184100418, 7.21719457013575, 
    4.20642201834862, 15.5302013422819, 22.9387755102041, 11.5538461538462, 
    19.8620689655172, 19.9298245614035, 25.7058823529412, 19.4675324675325, 
    36.4864864864865, 14.2391304347826, 20.9565217391304, 18.7391304347826, 
    14.4736842105263, 34.1111111111111, 70.5)), .Names = c("route", 
"counts", "avg_delay"), row.names = c(NA, -162L), class = c("tbl_df", 
"tbl", "data.frame"))

-

我使用的数据是航班记录。我想查看航班量最多的前 20 条航线,并在某家航空公司的平均延误是该航线所有航空公司中最小时,将其标记出来。下面是我的代码,其中最后一行无法运行。有没有更高效的方法来检查组内的每个条目,并标记出最小值、最大值等?欢迎任何形式的解答!

library(tidyverse)
# Order most popular routes descending
routes <- routes[order(-routes$counts),]
# Grab names of top 20 routes
top20 <- head(routes, 20)
# Keep only the first column (route); it is read below as top20$route
top20 <- top20[,1]
# Grab flights by all airlines if in the top 20 routes
# `df` is not defined in this snippet; the code reads its route, AIRLINE and
# DEPARTURE_DELAY columns. The result has one row per (route, AIRLINE) pair with
# the number of rows and the mean DEPARTURE_DELAY (NA values ignored).
routesnew <- df[df$route %in% top20$route,] %>% group_by(route, AIRLINE) %>% summarise(count = n(), avg_delay = mean(DEPARTURE_DELAY, na.rm=TRUE))

# Add a numeric flag column, initialised to 0, meant to mark the best-in-class rows
routesnew$best <- 0
# Smallest avg_delay within each route
minimums <- routesnew %>% group_by(route) %>% summarise(mini = min(avg_delay))
# NOTE(review): this is the line the question reports as not working.
# routesnew$best is a plain vector, so the trailing comma inside [ ] asks for a
# second dimension that it does not have.
# Also, %in% compares each avg_delay against the minima of all routes at once,
# not only against the minimum of the row's own route.
routesnew$best[routesnew$avg_delay %in% minimums$mini,] <- 1

routesnew = structure(list(route = c("ORD-ATL", "ORD-ATL", "ORD-ATL", "ORD-ATL", 
"ORD-ATL", "ORD-ATL", "ORD-ATL", "ORD-ATL", "ORD-BOS", "ORD-BOS", 
"ORD-BOS", "ORD-BOS", "ORD-BOS", "ORD-CLE", "ORD-CLE", "ORD-CLE", 
"ORD-CLE", "ORD-CLE", "ORD-CLT", "ORD-CLT", "ORD-CLT", "ORD-CLT", 
"ORD-CLT", "ORD-CVG", "ORD-CVG", "ORD-CVG", "ORD-CVG", "ORD-DCA", 
"ORD-DCA", "ORD-DCA", "ORD-DCA", "ORD-DEN", "ORD-DEN", "ORD-DEN", 
"ORD-DEN", "ORD-DEN", "ORD-DFW", "ORD-DFW", "ORD-DFW", "ORD-DFW", 
"ORD-DTW", "ORD-DTW", "ORD-DTW", "ORD-DTW", "ORD-DTW", "ORD-DTW", 
"ORD-IAH", "ORD-IAH", "ORD-IAH", "ORD-IAH", "ORD-LAS", "ORD-LAS", 
"ORD-LAS", "ORD-LAS", "ORD-LAS", "ORD-LAX", "ORD-LAX", "ORD-LAX", 
"ORD-LAX", "ORD-LAX", "ORD-LGA", "ORD-LGA", "ORD-LGA", "ORD-LGA", 
"ORD-MCO", "ORD-MCO", "ORD-MCO", "ORD-MCO", "ORD-MIA", "ORD-MIA", 
"ORD-MIA", "ORD-MIA", "ORD-MKE", "ORD-MSP", "ORD-MSP", "ORD-MSP", 
"ORD-MSP", "ORD-MSP", "ORD-MSP", "ORD-MSP", "ORD-PHL", "ORD-PHL", 
"ORD-PHL", "ORD-PHL", "ORD-PHL", "ORD-PHL", "ORD-PHL", "ORD-PHX", 
"ORD-PHX", "ORD-PHX", "ORD-PHX", "ORD-PHX", "ORD-SFO", "ORD-SFO", 
"ORD-SFO", "ORD-SFO"), AIRLINE = structure(c(1L, 4L, 5L, 6L, 
7L, 8L, 9L, 10L, 1L, 3L, 8L, 9L, 10L, 5L, 6L, 7L, 9L, 10L, 1L, 
5L, 9L, 10L, 11L, 4L, 5L, 7L, 9L, 1L, 7L, 9L, 10L, 1L, 6L, 8L, 
9L, 10L, 1L, 8L, 9L, 10L, 1L, 4L, 5L, 7L, 9L, 10L, 1L, 8L, 9L, 
10L, 1L, 6L, 8L, 9L, 10L, 1L, 6L, 8L, 10L, 12L, 1L, 8L, 9L, 10L, 
1L, 6L, 8L, 10L, 1L, 6L, 9L, 10L, 9L, 1L, 4L, 5L, 7L, 8L, 9L, 
10L, 1L, 5L, 6L, 8L, 9L, 10L, 11L, 1L, 6L, 8L, 10L, 11L, 1L, 
6L, 10L, 12L), .Label = c("AA", "AS", "B6", "DL", "EV", "F9", 
"MQ", "NK", "OO", "UA", "US", "VX"), class = "factor"), count = c(319L, 
3660L, 46L, 328L, 190L, 468L, 1102L, 278L, 2973L, 977L, 179L, 
257L, 2172L, 991L, 1L, 1270L, 153L, 1522L, 1876L, 789L, 411L, 
250L, 1429L, 1L, 800L, 2172L, 1038L, 2474L, 2L, 1505L, 1943L, 
1064L, 725L, 560L, 3L, 3261L, 5331L, 660L, 1018L, 668L, 55L, 
1449L, 331L, 1229L, 794L, 443L, 53L, 564L, 10L, 3559L, 1466L, 
455L, 847L, 1L, 1499L, 3078L, 231L, 664L, 3360L, 608L, 4752L, 
668L, 101L, 4054L, 1595L, 425L, 379L, 2017L, 3264L, 334L, 94L, 
474L, 4824L, 1468L, 1491L, 96L, 188L, 827L, 726L, 1565L, 1690L, 
23L, 264L, 229L, 5L, 1457L, 1076L, 1972L, 321L, 154L, 1025L, 
947L, 1977L, 215L, 4414L, 772L), avg_delay = c(18.552380952381, 
14.7510293713972, 15.4, 30.2191358024691, 16.8950276243094, 22.5645514223195, 
14.7038425492034, 8.24444444444444, 11.6032627559875, 16.2610062893082, 
34.3028571428571, 11.1865079365079, 19.7704995287465, 14.1327160493827, 
-1, 14.1427385892116, 12.0921052631579, 19.8256203890007, 6.89272237196765, 
16.9646596858639, 11.7954545454545, 7.29838709677419, 4.49785100286533, 
4, 7.65976714100906, 9.93501454898157, 13.690927218345, 12.2648896293211, 
4.5, 13.8709677419355, 18.2815636555732, 13.8821292775665, 24.695530726257, 
37.2403669724771, 9.5, 21.9758064516129, 15.9271293981927, 27.7405660377358, 
20.8987975951904, 16.97583081571, 15.2075471698113, 9.49688581314879, 
12.6300940438871, 17.752422907489, 12.2464332036316, 22.4126984126984, 
23.9622641509434, 30.5802919708029, 0.3, 19.6342368045649, 14.9077770130764, 
24.2743362831858, 21.4772727272727, 9, 18.4199192462988, 12.4473771032662, 
23.1004366812227, 22.2894333843798, 21.430977443609, 8.2953795379538, 
11.1715408118725, 13.8344051446945, 10.6907216494845, 22.4554812146166, 
13.1553582752061, 18.3886255924171, 18.093085106383, 17.4756403817177, 
15.2144855455393, 14.9668674698795, 10.0744680851064, 17.2124463519313, 
12.8397227636994, 15.2863099374566, 14.373907195696, 14.4408602150538, 
28.2046783625731, 23.55625, 17.9818688981869, 19.5775470473718, 
9.63997591812161, 8.47619047619048, 43.7816091954023, 26.0588235294118, 
16.2, 20.4159106769016, 9.84390243902439, 10.9178222907036, 14.4794952681388, 
26.843537414966, 19.4576771653543, 8.31309904153355, 14.6524859046643, 
21.3971962616822, 22.296228150874, 13.6809895833333), best = c(0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)), .Names = c("route", "AIRLINE", 
"count", "avg_delay", "best"), row.names = c(NA, -96L), vars = list(
    route), drop = TRUE, class = c("grouped_df", "tbl_df", "tbl", 
"data.frame")

-

期望的输出(对每条航线,平均延误最小的那一行应标记为 "1"):

Route:   Airline:    avg_delay:    best:
ORD-DTA   UA          10            0
ORD-DTA   AA          8             1

1 个答案:

答案 0 :(得分:1)

您可以使用data.table:

library(data.table)
# Turn df into a data.table (setDT converts in place, so no reassignment is needed)
setDT(df)

# For every (route, AIRLINE) pair on the top-20 routes:
# number of flights and mean departure delay (missing delays ignored)
routesnew <- df[
  route %in% top20$route,
  .(
    count = .N,
    avg_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)
  ),
  by = .(route, AIRLINE)
]

# Within each route, set best to 1 on the rows holding the minimum avg_delay
# and to 0 on all other rows
routesnew[, best := as.numeric(min(avg_delay) == avg_delay), by = route]

使用 nycflights13 包中 flights 数据集的完整示例:

library(nycflights13)
library(data.table)
# nycflights13::flights is a tibble, and `:=` only works inside a data.table,
# so convert first. as.data.table() returns a copy, which leaves the packaged
# flights object untouched; `:=` then adds the route column by reference.
df <- as.data.table(flights)[, route := paste(origin, dest, sep = "-")]
# Count flights per route, sort by count descending, keep the 20 busiest routes
top20 <- head(setorder(df[, list(counts = .N), by = route], -counts), 20)
# Rename columns to the names used in the question's data
setnames(df, "dep_delay", "DEPARTURE_DELAY")
setnames(df, "carrier", "AIRLINE")
# For every (route, AIRLINE) pair on the top-20 routes:
# number of flights and mean departure delay (missing delays ignored)
routesnew <- df[route %in% top20$route,
                list(count = .N,
                     avg_delay = mean(DEPARTURE_DELAY, na.rm = TRUE)),
                by = list(route, AIRLINE)]
# For rows with the minimum avg_delay in their route set best to 1, otherwise 0
routesnew[, best := as.numeric(avg_delay == min(avg_delay)), by = route]
# Sort by route (setkey reorders by reference); the trailing [] prints the result
setkey(routesnew, route)[]