Question

我有一个data.frames列表。每个data.frame都有一列包含日期，另一列包含值。这是data.frame之一的样本（数据非常大）

           dput(mydata)
structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("1993-01-01", 
"1993-02-01", "1993-03-01", "1993-04-01", "1993-05-01", "1993-06-01", 
"1993-07-01", "1993-08-01", "1993-09-01", "1993-10-01", "1993-11-01", 
"1993-12-01", "1994-01-01", "1994-02-01", "1994-03-01", "1994-04-01", 
"1994-05-01", "1994-06-01", "1994-07-01", "1994-08-01", "1994-09-01", 
"1994-10-01", "1994-11-01", "1994-12-01", "1995-01-01", "1995-02-01", 
"1995-03-01", "1995-04-01", "1995-05-01", "1995-06-01", "1995-07-01", 
"1995-08-01", "1995-09-01", "1995-10-01", "1995-11-01", "1995-12-01", 
"1996-01-01", "1996-02-01", "1996-03-01", "1996-04-01", "1996-05-01", 
"1996-06-01", "1996-07-01", "1996-08-01", "1996-09-01", "1996-10-01", 
"1996-11-01", "1996-12-01", "2000-01-01", "2000-02-01", "2000-03-01", 
"2000-04-01", "2000-05-01", "2000-06-01", "2000-07-01", "2000-08-01", 
"2000-09-01", "2000-10-01", "2000-11-01", "2000-12-01", "2001-01-01", 
"2001-02-01", "2001-03-01", "2001-04-01", "2001-05-01", "2001-06-01", 
"2001-07-01", "2001-08-01", "2001-09-01", "2001-10-01", "2001-11-01", 
"2001-12-01", "2002-01-01", "2002-02-01", "2002-03-01", "2002-04-01", 
"2002-05-01", "2002-06-01", "2002-07-01", "2002-08-01", "2002-09-01", 
"2002-10-01", "2002-11-01", "2002-12-01", "2003-01-01", "2003-02-01", 
"2003-03-01", "2003-04-01", "2003-05-01", "2003-06-01", "2003-07-01", 
"2003-08-01", "2003-09-01", "2003-10-01", "2003-11-01", "2003-12-01"
), class = "factor"), value = c(32, 33.75, 23, 15.125, 25.125, 
25.857, 25.25, 38.125, 35.5, 49.875, 55.25, 46.625, 52.625, 51.125, 
45.75, 52.625, 54.125, 42.875, 36.375, 39.375, 52.625, 61.375, 
61.375, 61.875, 61.5, 62.5, 50.625, 43.5, 38.167, 36.714, 41.875, 
28.875, 20.333, 22.25, 24, 21.333, 3.167, 41.75, 19.375, 10.25, 
16.625, 19.5, 48.333, 22.5, 26.25, 28.625, 31.5, 50.625, 50.375, 
57, 63, 64.286, 65, 46.125, 59, 54.75, 25.375, 77, 74.125, -99, 
-99, -99, 95.375, 86, 72.875, 85.25, 38.5, -99, 72.875, 80.375, 
99.25, 94, 76.5, 32.875, 87.375, 114.875, 91.5, 96.25, 64.25, 
55.75, 67.5, 79, 103, 66.25, 50.143, 79.5, 66.25, 75.75, 73.125, 
77.25, 70.125, 65.5, 77.375, 103.375, 97.25, 66.875, 86.875, 
71.375, 69.875, 62.75)), .Names = c("date", "value"), row.names = c(1L, 
97L, 193L, 289L, 385L, 481L, 577L, 673L, 769L, 865L, 961L, 1057L, 
1153L, 1249L, 1345L, 1441L, 1537L, 1633L, 1729L, 1825L, 1921L, 
2017L, 2113L, 2209L, 2305L, 2401L, 2497L, 2593L, 2689L, 2785L, 
2881L, 2L, 98L, 194L, 290L, 386L, 482L, 578L, 674L, 770L, 866L, 
962L, 1058L, 1154L, 1250L, 1346L, 1442L, 1538L, 1634L, 1730L, 
1826L, 1922L, 2018L, 2114L, 2210L, 2306L, 2402L, 2498L, 2594L, 
2690L, 2786L, 2882L, 3L, 99L, 195L, 291L, 387L, 483L, 579L, 675L, 
771L, 867L, 963L, 1059L, 1155L, 1251L, 1347L, 1443L, 1539L, 1635L, 
1731L, 1827L, 1923L, 2019L, 2115L, 2211L, 2307L, 2403L, 2499L, 
2595L, 2691L, 2787L, 2883L, 4L, 100L, 196L, 292L, 388L, 484L, 
580L), class = "data.frame")

问题是date列是一个因子（factor），而且所有月份都按31天存储（对于只有30天或28、29天的月份，多出来的那几天对应的值为-99）。我想把这一列转换为日期格式，并得到正确的日期，但一直没有成功……我可以把因子转换为日期，但不知道如何得到正确的“日”，例如：

      head(mydata)
      date  value 
      1993-01-01 32.000
      1993-01-02 33.750
      1993-01-03 23.000
      1993-01-04 15.125
      1993-01-05 25.125
      1993-01-06 25.857

...

我感谢任何建议，谢谢！

Answer 1

使用dplyr和tidyr的解决方案。在此过程中，我根据value列过滤掉了值为-99的行，假设这正是您想要的操作。

library(dplyr)
library(tidyr)

# Rebuild the real calendar date of every row from its position inside its
# month, then discard the rows that are not wanted.
#
# Each month is stored as a run of rows that all carry the label "YYYY-MM-01",
# one row per day, padded to 31 rows. The -99 rows must be removed only AFTER
# the days have been numbered: a -99 can also sit on a real day (the sample
# has one on the 6th row of March 1993), and dropping it first would shift
# every later row of that month to the previous date.
mydata2 <- mydata %>%
  mutate(date = as.character(date)) %>%                       # Factor -> character
  extract(date, into = c("Year", "Month", "Day"),             # Split the date column
          regex = "([0-9]+)-([0-9]+)-([0-9]+)",               # Day may contain any digit
          convert = TRUE) %>%
  group_by(Year, Month) %>%                                   # Group by Year and Month
  mutate(Day = Day + row_number() - 1L) %>%                   # Day = 1 + offset within the month
  ungroup() %>%                                               # Grouping is no longer needed
  unite(col = date, Year, Month, Day, sep = "-") %>%          # Unite the Year, Month, Day columns
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%       # Impossible dates (e.g. Feb 30) -> NA
  filter(!is.na(date), value != -99)                          # Drop padding rows and -99 values
head(mydata2)
# # A tibble: 6 x 2
#         date  value
#       <date>  <dbl>
# 1 1993-01-01 32.000
# 2 1993-01-02 33.750
# 3 1993-01-03 23.000
# 4 1993-01-04 15.125
# 5 1993-01-05 25.125
# 6 1993-01-06 25.857

或者，我们也可以使用dplyr和lubridate包，采用类似的策略。

library(dplyr)
library(lubridate)

# Rebuild the real calendar date of every row from its position inside its
# month, then discard the rows that are not wanted.
#
# The day offset is added BEFORE any -99 row is removed: a -99 can also sit on
# a real day (the sample has one on the 6th row of March 1993), and dropping
# it first would shift every later row of that month to the previous date.
mydata2 <- mydata %>%
  mutate(date = ymd(as.character(date))) %>%          # Convert the date column to date class
  mutate(Year = year(date), Month = month(date)) %>%  # Remember the month each row belongs to
  group_by(Year, Month) %>%                           # Group by Year and Month
  mutate(date = date + days(row_number() - 1)) %>%    # Add day offset based on row number - 1
  ungroup() %>%                                       # Ungroup
  filter(month(date) == Month) %>%                    # Drop padding rows that rolled into the next month
  filter(value != -99) %>%                            # Remove -99 (dates are already fixed)
  select(date, value)                                 # Select the columns
head(mydata2)
# # A tibble: 6 x 2
#         date  value
#       <date>  <dbl>
# 1 1993-01-01 32.000
# 2 1993-01-02 33.750
# 3 1993-01-03 23.000
# 4 1993-01-04 15.125
# 5 1993-01-05 25.125
# 6 1993-01-06 25.857

将因子转换为日期，并得到正确的“日”

1 个答案: