计算几年的月中值

时间:2017-10-24 19:27:15

标签: r dplyr median date-conversion dbplyr

我最近开始与R一起冒险并尝试解决以下问题。我有data.frame,包括一年中特定月份的到达和离开。我必须找到这些年来每个月的中位数。我的结果应保存在.csv中。以下是样本,整个观察结果包括截至2017年的日期(总共1548个障碍物):

#dput output assigned to the flights variable
flights <- structure(list(X = 1:163, ReportPeriod = structure(c(1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 
5L, 5L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 
8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 9L, 9L, 9L, 9L, 9L, 9L, 9L, 
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 10L, 
10L, 10L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 11L, 
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 13L, 13L, 
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L, 14L, 14L, 
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L, 15L, 15L, 
15L, 15L, 15L, 15L, 15L, 15L, 15L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 
4L, 6L), .Label = c("01/01/2006 12:00:00 AM", "01/01/2007 12:00:00 AM", 
"02/01/2006 12:00:00 AM", "02/01/2007 12:00:00 AM", "03/01/2006 12:00:00 AM", 
"03/01/2007 12:00:00 AM", "04/01/2006 12:00:00 AM", "05/01/2006 12:00:00 AM", 
"06/01/2006 12:00:00 AM", "07/01/2006 12:00:00 AM", "08/01/2006 12:00:00 AM", 
"09/01/2006 12:00:00 AM", "10/01/2006 12:00:00 AM", "11/01/2006 12:00:00 AM", 
"12/01/2006 12:00:00 AM"), class = "factor"), FlightType = structure(c(1L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 
2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 
3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 
1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 
2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 
2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 
3L, 1L), .Label = c("Charter", "Commuter", "Scheduled"), class = "factor"), 
 Arrival_Departure = structure(c(1L, 1L, 2L, 2L, 1L, 1L, 2L, 
 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 
 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 
 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 
 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 
 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 
 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 
 2L, 1L, 1L, 2L, 2L, 1L), .Label = c("Arrival", "Departure"
 ), class = "factor"), Domestic_International = structure(c(1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 
 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 
 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 2L, 1L), .Label = c("Domestic", 
 "International"), class = "factor"), FlightOpsCount = c(57L, 
 299L, 62L, 5L, 5996L, 31L, 5995L, 31L, 13695L, 4026L, 13666L, 
 4085L, 22L, 101L, 21L, 100L, 5525L, 28L, 5525L, 28L, 12308L, 
 3381L, 12322L, 3398L, 51L, 4L, 47L, 5L, 6217L, 31L, 6216L, 
 30L, 13925L, 3952L, 13963L, 3961L, 57L, 2L, 52L, 5885L, 31L, 
 5886L, 31L, 13420L, 3884L, 13461L, 3897L, 26L, 5L, 24L, 3L, 
 6089L, 41L, 6089L, 41L, 14126L, 3968L, 14138L, 3984L, 18L, 
 1L, 18L, 5980L, 39L, 5980L, 39L, 14208L, 4030L, 14211L, 4028L, 
 16L, 2L, 14L, 6092L, 39L, 6092L, 39L, 14866L, 4483L, 14883L, 
 4506L, 15L, 1L, 16L, 6134L, 40L, 6134L, 40L, 15243L, 4303L, 
 15272L, 4328L, 24L, 1L, 18L, 5903L, 37L, 5903L, 37L, 13832L, 
 3823L, 13815L, 3865L, 48L, 285L, 50L, 6135L, 40L, 6135L, 
 40L, 14292L, 3605L, 14332L, 3898L, 48L, 3L, 50L, 2L, 5772L, 
 39L, 5772L, 39L, 13855L, 3706L, 13855L, 3718L, 46L, 6L, 44L, 
 3L, 6144L, 40L, 6144L, 40L, 14417L, 4086L, 14474L, 4103L, 
 38L, 3L, 42L, 3L, 6081L, 94L, 6081L, 94L, 14140L, 4301L, 
 14165L, 4308L, 14L, 7L, 16L, 5L, 5470L, 83L, 5470L, 83L, 
 12734L, 3779L, 12768L, 3783L, 33L)), .Names = c("X", "ReportPeriod", 
"FlightType", "Arrival_Departure", "Domestic_International", 
"FlightOpsCount"), class = "data.frame", row.names = c(NA, -163L
))

这是我应该得到的预期输出:

month,Arrival_Departure,FlightOpsCount
January,Arrival,23152
January,Departure,23173
February,Arrival,20849
February,Departure,20878
March,Arrival,23981.5
March,Departure,24005
April,Arrival,23393
April,Departure,23451.5
May,Arrival,24342.5
May,Departure,24376
June,Arrival,24622.5
June,Departure,24667
July,Arrival,25795.5
July,Departure,25837
August,Arrival,25573.5
August,Departure,25600
September,Arrival,23306
September,Departure,23315
October,Arrival,23965
October,Departure,23990
November,Arrival,22379
November,Departure,22361
December,Arrival,23594
December,Departure,23579

我决定分几个步骤,我尝试做的第一件事就是从字符串中接收正确的日期格式:

library(dbplyr)
step_1 = as_tibble(flights)

step_2 = step_1 %>%
  transmute(
    date_format = as.POSIXct(strptime(ReportPeriod, format = "%m/%d/%Y")),
    even_new_date = as.Date(date_format, format = "%Y"),
    Arrival_Departure, 
    FlightOpsCount)

这对我来说实际上很棘手..我不明白如何正确地做到这一点以及为什么有两种方法可以获得日期格式,例如2006-01-01 vs 2005-12-31?在这种情况下哪一个是正确的?

现在,假设2006-01-01是正确的,我可以在months()函数中使用as.POSIXct来获取月份:

step2 = step_1 %>%
transmute(
month = months(as.POSIXct(strptime(ReportPeriod, format = "%m/%d/%Y"))), 
Arrival_Departure, 
FlightOpsCount)

下一步需要分组操作:

step_3 = step_2 %>%
  group_by(month, Arrival_Departure) %>% 
  summarize(median = median(FlightOpsCount))

当把它写到csv时,我得到了可笑的小值..

"","month","Arrival_Departure","median"
"1","April","Arrival",102.5
"2","April","Departure",3061
"3","August","Arrival",1412.5
"4","August","Departure",3667.5
"5","December","Arrival",102
"6","December","Departure",1738
"7","February","Arrival",116
"8","February","Departure",116
"9","January","Arrival",284
"10","January","Departure",1708
"11","July","Arrival",95.5
"12","July","Departure",3571
"13","June","Arrival",119
"14","June","Departure",3292
"15","March","Arrival",115
"16","March","Departure",1759
"17","May","Arrival",1609.5
"18","May","Departure",3121
"19","November","Arrival",93.5
"20","November","Departure",93.5
"21","October","Arrival",2359
"22","October","Departure",2756
"23","September","Arrival",1228
"24","September","Departure",3187.5

有人可以指导我并告诉我解决问题的正确方法吗?

我将不胜感激。

4 个答案:

答案 0 :(得分:3)

你差不多了,不过我建议使用dplyr:

# Step 1: Convert dates using as.Date function
flights$ReportPeriod <- as.Date(flights$ReportPeriod, "%m/%d/%Y")

# Step 2: Use dplyr to summarize information
require(dplyr)
flights <- flights %>% 
             group_by(ReportPeriod, Arrival_Departure) %>%
             summarise(FlightOpsCount = median(FlightOpsCount)) %>% 
             as.data.frame() 

# Step 3: Convert date to string for month name
flights <- flights %>%
             mutate(ReportPeriod = months(ReportPeriod)) %>%
             rename(month = ReportPeriod) # If you need to rename the column to be "months"


# Alternate Step 3: If you want to add in year as well
require(lubridate)
flights <- flights %>%
             mutate(ReportPeriod = paste(months(ReportPeriod), 
                                         year(ReportPeriod), 
                                         sep = " ")) %>%
             rename(month = ReportPeriod) # If you need to rename the column to be "months"

# Step 4: Write to csv
write.csv(flights, "file_name.csv", row.names = FALSE)

干杯。

答案 1 :(得分:2)

我相信这更简单。请注意,months的格式与您的格式略有不同。

library(zoo)

months <- as.yearmon(flights$ReportPeriod, "%m/%d/%Y %H:%M:%S")
agg <- aggregate(FlightOpsCount ~ months + Arrival_Departure, flights, median)

无论如何,我无法让你的问题中的中位数接近预期值。由于这些是中位数,因此您的期望似乎有些错误。

如果您想要其他日期格式,可以使用函数format.Date

format(as.Date(months), "%Y %B")   # or "%B %Y"

有关可能的长格式列表,请参阅帮助页面?strptime

答案 2 :(得分:1)

这是data.table方法:

library(data.table)
library(lubridate)
dat <- fread("sample_data.txt", col.names = c("dte", "flight", "typ1","typ2","flt_count"))
dat$dte <- as.POSIXct(strptime(dat$dte, format = "%m/%d/%Y %H:%M:%S"), tz = "GMT")

new_dat <- dat[, sum(flt_count), by = list(month(dte),typ1)]

为方便起见,我重命名了这些列。您还可以编辑by参数以基于其他变量进行分组/执行操作。上面代码段的输出是:

> new_dat
   month      typ1    V1
1:     1   Arrival 24104
2:     1 Departure 23844
3:     2   Arrival 21365
4:     2 Departure 21394
5:     3   Arrival 24180
6:     3 Departure 24222

这似乎是你正在寻找的东西。 data.table对大型数据集非常有用。

要撰写结果,请使用write.csv(new_dat, "new_file.csv", row_names = F)

希望这有帮助。

答案 3 :(得分:1)

谢谢大家的帮助!我确实收到了特定月份的正确值,这是我的代码:

#summarize Arrival & Departures through the years
step_1 <-  flights %>% 
  group_by(ReportPeriod, Arrival_Departure) %>% 
  summarise(sum = sum(FlightOpsCount)) %>% 
  arrange(ReportPeriod) %>% 
  ungroup()

#modify date format in ReportPeriod column to receive months
step_2 <- step_1 %>% 
  transmute(month = months(as.Date(ReportPeriod,"%m/%d/%Y")),
            Arrival_Departure,
            sum) %>% 
  group_by(month, Arrival_Departure) %>%
  summarise(FlightOpsCount = median(sum)) %>% 
  write.csv(., "flights_output.csv", row.names = FALSE, quote = FALSE)

然而,我按字母顺序而不是按时间顺序获得数月。我在这里的某个地方找到了解决方案,但是它无法正常工作,我只能获得NAs。显然我在给.csv写任何内容之前调用它,并在step_2结束时添加ungroup()。

step_3 <- step_2 %>% 
  mutate(month = factor(month.name[month], levels = month.name)) %>% 
  arrange(month)