原始数据: Here is the spreadsheet
我有50个州,从1997年到现在的每周数据看起来像这样:
# Sample of the weekly data (looks like dput() output): 15 rows with a
# character `date` column holding m/d/Y strings and one integer column per
# code CT, MA, ME, NH, RI, VT, NJ, NY, PR. The object carries the tibble
# classes ("tbl_df", "tbl", "data.frame").
structure(list(date = c("1/1/2011", "1/8/2011", "1/15/2011",
"1/22/2011", "1/29/2011", "2/5/2011", "2/12/2011", "2/19/2011",
"2/26/2011", "3/5/2011", "3/12/2011", "3/19/2011", "3/26/2011",
"4/2/2011", "4/9/2011"), CT = c(8593L, 14629L, 8084L, 6986L,
6050L, 6368L, 5408L, 5416L, 6098L, 5260L, 4428L, 3823L, 3808L,
4366L, 4697L), MA = c(13656L, 14779L, 9462L, 8565L, 9575L, 8548L,
9248L, 7569L, 11373L, 8891L, 7113L, 6775L, 4524L, 7099L, 7390L
), ME = c(2521L, 3811L, 3239L, 2306L, 2381L, 2000L, 1878L, 1745L,
1582L, 2008L, 1887L, 1676L, 1707L, 1898L, 1843L), NH = c(3155L,
2892L, 1983L, 1961L, 1948L, 1596L, 1533L, 1534L, 1905L, 1819L,
1434L, 1366L, 1288L, 1376L, 1477L), RI = c(3446L, 3159L, 2428L,
2221L, 2010L, 1891L, 2351L, 2107L, 2883L, 1763L, 1595L, 1434L,
1094L, 1154L, 1496L), VT = c(2173L, 1547L, 946L, 838L, 883L,
838L, 704L, 890L, 1230L, 761L, 1019L, 854L, 830L, 848L, 1205L
), NJ = c(18871L, 21451L, 16872L, 15175L, 12138L, 13015L, 12777L,
10372L, 10528L, 10036L, 9582L, 8551L, 9477L, 9141L, 9750L), NY = c(37620L,
61983L, 33269L, 27656L, 28151L, 26433L, 26553L, 22283L, 20990L,
38916L, 21999L, 20928L, 20657L, 21008L, 24132L), PR = c(2961L,
2437L, 2123L, 3423L, 4364L, 4525L, 4088L, 3765L, 3181L, 2857L,
3366L, 2985L, 2730L, 2382L, 2720L)), .Names = c("date", "CT",
"MA", "ME", "NH", "RI", "VT", "NJ", "NY", "PR"), row.names = c(NA,
-15L), class = c("tbl_df", "tbl", "data.frame"))
我想为过去5年的每个州制作年度图表。这就是我到目前为止所做的事情:
library(tidyverse)
library(lubridate)
# library() stops with an error if plotly is not installed; require() would
# only warn and return FALSE, letting the script fail later in a less obvious
# place.
library(plotly)

# Load the weekly claims file and reshape it to long format.
#
# Steps:
#   1. Parse the m/d/Y `date` strings into Date objects.
#   2. Derive `doy` (day of year, numeric) and `Year` (factor) from the date.
#   3. Stack every remaining column into `state` / `claims` pairs, so each
#      row is one (date, state) observation.
#   4. Sort chronologically.
df <- read_csv('icclaims.csv') %>%
  mutate(
    date = as.Date(date, format = '%m/%e/%Y'),
    doy = as.numeric(format(date, '%j')),
    Year = as.factor(year(date))
  ) %>%
  gather(key = 'state', value = 'claims', -date, -doy, -Year) %>%
  arrange(date)
# Sample plot for two states: day of year on x, weekly claims on y, one
# coloured line per Year, one facet per state.
# NOTE: `group` and `colour` are set in the top-level aes(), so every layer
# inherits them, including geom_smooth(), which therefore fits a separate
# smoother (and 99% band) for each Year.
ggplot(
  data = filter(df, state %in% c('CA', 'MA')),
  mapping = aes(x = doy, y = claims, group = Year, colour = Year)
) +
  geom_path() +
  geom_point() +
  geom_smooth(span = 1, level = .99) + # smoother settings thanks to @JIMBOU
  facet_wrap(~ state, ncol = 1) +
  theme_bw()
最终目的是检查某个州新的每周数据是否超出该州的99%置信区间,而这个区间取决于我决定回溯多长时间的历史数据。
剩下的问题是如何让geom_smooth()使用所有年份的数据来计算置信区间,而不是按每一年分别计算。简而言之,我希望逐年绘制过去5年的曲线,但显示基于1997年至2017年整个数据范围计算的置信区间。
非常感谢你的帮助。
答案 0 :(得分:1)
简单的技巧是只在点/路径图层(geom_point()/geom_path())中映射该因子。假设您的数据保存在 d 中,我使用 melt() 将数据转换为长格式。
library(reshape2)

# Long format: melt the first three columns of `d` (date plus two value
# columns) into `variable` / `value` pairs.
dl <- melt(d[, 1:3])

# Parse the m/d/Y strings into a proper Date column.
dl[["date2"]] <- as.Date(dl[["date"]], format = '%m/%e/%Y')

# Fabricate a second year for demonstration: shift every date by 365 days
# and raise every value by 10000, then append the copy to the original rows.
dl2 <- transform(dl, date2 = date2 + 365, value = value + 10000)
dl <- rbind(dl, dl2)

# x value (day of year) and the grouping factor (calendar year).
# format() already returns character, so it converts directly to numeric.
dl[["Doy"]] <- as.numeric(format(dl[["date2"]], format = "%j"))
dl[["Year"]] <- factor(format(dl[["date2"]], format = "%Y"))
# The plot: Year is mapped to colour only inside the point and path layers,
# so geom_smooth() does not inherit it and fits a single smoother per facet
# across all years.
ggplot(data = dl, mapping = aes(x = Doy, y = value)) +
  geom_point(mapping = aes(colour = Year)) +
  geom_path(mapping = aes(colour = Year)) +
  geom_smooth() +
  facet_wrap(~ variable, ncol = 1)