我有一个data.frames列表。每个data.frame都有一列包含日期,另一列包含值。这是data.frame之一的样本(数据非常大)
dput(mydata)
structure(list(date = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L), .Label = c("1993-01-01",
"1993-02-01", "1993-03-01", "1993-04-01", "1993-05-01", "1993-06-01",
"1993-07-01", "1993-08-01", "1993-09-01", "1993-10-01", "1993-11-01",
"1993-12-01", "1994-01-01", "1994-02-01", "1994-03-01", "1994-04-01",
"1994-05-01", "1994-06-01", "1994-07-01", "1994-08-01", "1994-09-01",
"1994-10-01", "1994-11-01", "1994-12-01", "1995-01-01", "1995-02-01",
"1995-03-01", "1995-04-01", "1995-05-01", "1995-06-01", "1995-07-01",
"1995-08-01", "1995-09-01", "1995-10-01", "1995-11-01", "1995-12-01",
"1996-01-01", "1996-02-01", "1996-03-01", "1996-04-01", "1996-05-01",
"1996-06-01", "1996-07-01", "1996-08-01", "1996-09-01", "1996-10-01",
"1996-11-01", "1996-12-01", "2000-01-01", "2000-02-01", "2000-03-01",
"2000-04-01", "2000-05-01", "2000-06-01", "2000-07-01", "2000-08-01",
"2000-09-01", "2000-10-01", "2000-11-01", "2000-12-01", "2001-01-01",
"2001-02-01", "2001-03-01", "2001-04-01", "2001-05-01", "2001-06-01",
"2001-07-01", "2001-08-01", "2001-09-01", "2001-10-01", "2001-11-01",
"2001-12-01", "2002-01-01", "2002-02-01", "2002-03-01", "2002-04-01",
"2002-05-01", "2002-06-01", "2002-07-01", "2002-08-01", "2002-09-01",
"2002-10-01", "2002-11-01", "2002-12-01", "2003-01-01", "2003-02-01",
"2003-03-01", "2003-04-01", "2003-05-01", "2003-06-01", "2003-07-01",
"2003-08-01", "2003-09-01", "2003-10-01", "2003-11-01", "2003-12-01"
), class = "factor"), value = c(32, 33.75, 23, 15.125, 25.125,
25.857, 25.25, 38.125, 35.5, 49.875, 55.25, 46.625, 52.625, 51.125,
45.75, 52.625, 54.125, 42.875, 36.375, 39.375, 52.625, 61.375,
61.375, 61.875, 61.5, 62.5, 50.625, 43.5, 38.167, 36.714, 41.875,
28.875, 20.333, 22.25, 24, 21.333, 3.167, 41.75, 19.375, 10.25,
16.625, 19.5, 48.333, 22.5, 26.25, 28.625, 31.5, 50.625, 50.375,
57, 63, 64.286, 65, 46.125, 59, 54.75, 25.375, 77, 74.125, -99,
-99, -99, 95.375, 86, 72.875, 85.25, 38.5, -99, 72.875, 80.375,
99.25, 94, 76.5, 32.875, 87.375, 114.875, 91.5, 96.25, 64.25,
55.75, 67.5, 79, 103, 66.25, 50.143, 79.5, 66.25, 75.75, 73.125,
77.25, 70.125, 65.5, 77.375, 103.375, 97.25, 66.875, 86.875,
71.375, 69.875, 62.75)), .Names = c("date", "value"), row.names = c(1L,
97L, 193L, 289L, 385L, 481L, 577L, 673L, 769L, 865L, 961L, 1057L,
1153L, 1249L, 1345L, 1441L, 1537L, 1633L, 1729L, 1825L, 1921L,
2017L, 2113L, 2209L, 2305L, 2401L, 2497L, 2593L, 2689L, 2785L,
2881L, 2L, 98L, 194L, 290L, 386L, 482L, 578L, 674L, 770L, 866L,
962L, 1058L, 1154L, 1250L, 1346L, 1442L, 1538L, 1634L, 1730L,
1826L, 1922L, 2018L, 2114L, 2210L, 2306L, 2402L, 2498L, 2594L,
2690L, 2786L, 2882L, 3L, 99L, 195L, 291L, 387L, 483L, 579L, 675L,
771L, 867L, 963L, 1059L, 1155L, 1251L, 1347L, 1443L, 1539L, 1635L,
1731L, 1827L, 1923L, 2019L, 2115L, 2211L, 2307L, 2403L, 2499L,
2595L, 2691L, 2787L, 2883L, 4L, 100L, 196L, 292L, 388L, 484L,
580L), class = "data.frame")
问题在于 date 列是因子(factor)类型,而且每个月都被填充成了 31 天(对于只有 30、28 或 29 天的月份,多出来的那几天对应的值为 -99)。我想把 date 列转换成正确的日期格式,并让每一行对应正确的日期,但一直没有成功……我可以把因子转换为日期,但不知道如何得到如下所示的正确日期:
head(mydata)
date value
1993-01-01 32.000
1993-01-02 33.750
1993-01-03 23.000
1993-01-04 15.125
1993-01-05 25.125
1993-01-06 25.857
...
我感谢任何建议, 谢谢!
答案 0 :(得分:0)
使用 dplyr 和 tidyr 的解决方案。在此过程中,我过滤掉了 value 为 -99 的行(假设这正是您想要的操作)。
library(dplyr)
library(tidyr)
# Rebuild real calendar dates from month-start labels. Every month is stored
# as a fixed run of rows (one per day, padded to 31), so the n-th row of a
# (Year, Month) group is day n of that month.
mydata2 <- mydata %>%
  mutate(date = as.character(date)) %>%            # Factor -> character
  extract(date, into = c("Year", "Month", "Day"),  # Split the date column
          regex = "([0-9]+)-([0-9]+)-([0-9]+)",
          convert = TRUE) %>%
  group_by(Year, Month) %>%                        # Group by Year and Month
  # Number the days BEFORE dropping any rows: removing a -99 that sits in the
  # middle of a month first would shift every later day in that month by one.
  mutate(Day = Day + row_number() - 1L) %>%
  ungroup() %>%                                    # Do not leak the grouping
  filter(value != -99) %>%                         # Remove -99 (padding/missing)
  unite(col = date, Year, Month, Day, sep = "-") %>%
  # Explicit format: an impossible date (e.g. Feb 30) becomes NA instead of
  # relying on as.Date()'s format auto-detection.
  mutate(date = as.Date(date, format = "%Y-%m-%d")) %>%
  filter(!is.na(date))                             # Drop impossible dates
head(mydata2)
# # A tibble: 6 x 2
# date value
# <date> <dbl>
# 1 1993-01-01 32.000
# 2 1993-01-02 33.750
# 3 1993-01-03 23.000
# 4 1993-01-04 15.125
# 5 1993-01-05 25.125
# 6 1993-01-06 25.857
或者,我们也可以使用 dplyr 和 lubridate 包,按类似的思路来实现。
library(dplyr)
library(lubridate)
# Same idea with lubridate: each (Year, Month) group holds one row per day
# (padded to 31), so row n of the group is the month start plus n - 1 days.
mydata2 <- mydata %>%
  mutate(date = ymd(as.character(date))) %>%          # Factor -> Date
  mutate(Year = year(date), Month = month(date)) %>%  # Grouping keys
  group_by(Year, Month) %>%                           # Group by Year and Month
  # Offset the days BEFORE filtering: dropping a mid-month -99 first would
  # shift every later day in that month by one.
  mutate(date = date + days(row_number() - 1)) %>%
  ungroup() %>%                                       # Ungroup
  # Padding rows of short months roll over into the next month; drop them,
  # then drop the remaining -99 placeholder values.
  filter(month(date) == Month, value != -99) %>%
  select(date, value)                                 # Keep the two columns
head(mydata2)
# # A tibble: 6 x 2
# date value
# <date> <dbl>
# 1 1993-01-01 32.000
# 2 1993-01-02 33.750
# 3 1993-01-03 23.000
# 4 1993-01-04 15.125
# 5 1993-01-05 25.125
# 6 1993-01-06 25.857