Question

请参阅数据输入。您可以直接向下滚动到目标和问题陈述。也许您不需要数据，因为您之前可能遇到过这个问题。

调用所需的库

library(zoo)
library(ggplot2)
library(scales)
library(plotly)
library(ggthemes)
library(forecast)
library(plotly)
library(DescTools)

数据输入

dput(ridership.ts)
structure(c(1709L, 1621L, 1973L, 1812L, 1975L, 1862L, 1940L, 
2013L, 1596L, 1725L, 1676L, 1814L, 1615L, 1557L, 1891L, 1956L, 
1885L, 1623L, 1903L, 1997L, 1704L, 1810L, 1862L, 1875L, 1705L, 
1619L, 1837L, 1957L, 1917L, 1882L, 1933L, 1996L, 1673L, 1753L, 
1720L, 1734L, 1563L, 1574L, 1903L, 1834L, 1831L, 1776L, 1868L, 
1907L, 1686L, 1779L, 1776L, 1783L, 1548L, 1497L, 1798L, 1733L, 
1772L, 1761L, 1792L, 1875L, 1571L, 1647L, 1673L, 1657L, 1382L, 
1361L, 1559L, 1608L, 1697L, 1693L, 1836L, 1943L, 1551L, 1687L, 
1576L, 1700L, 1397L, 1372L, 1708L, 1655L, 1763L, 1776L, 1934L, 
2008L, 1616L, 1774L, 1732L, 1797L, 1570L, 1413L, 1755L, 1825L, 
1843L, 1826L, 1968L, 1922L, 1670L, 1791L, 1817L, 1847L, 1599L, 
1549L, 1832L, 1840L, 1846L, 1865L, 1966L, 1949L, 1607L, 1804L, 
1850L, 1836L, 1542L, 1617L, 1920L, 1971L, 1992L, 2010L, 2054L, 
2097L, 1824L, 1977L, 1981L, 2000L, 1683L, 1663L, 2008L, 2024L, 
2047L, 2073L, 2127L, 2203L, 1708L, 1951L, 1974L, 1985L, 1760L, 
1771L, 2020L, 2048L, 2069L, 1994L, 2075L, 2027L, 1734L, 1917L, 
1858L, 1996L, 1778L, 1749L, 2066L, 2099L, 2105L, 2130L, 2223L, 
2174L, 1931L, 2121L, 2076L, 2141L, 1832L, 1838L, 2132L), .Tsp = c(1991, 
2004.16666666667, 12), class = "ts")

创建ts对象的数据框以使用ggplot

tsd = data.frame(time = as.Date(ridership.ts), 
                 value = as.matrix(ridership.ts))

构建线性模型

ridership.lm <- tslm(ridership.ts ~ trend + I(trend^2))

将新列添加到现有数据框tsd

tsd$linear_fit = as.matrix(ridership.lm$fitted.values)

定义验证和培训期的长度

nValid = 36 
nTrain = length(ridership.ts) - nValid

培训数据

train.ts = window(ridership.ts, 
                  start = c(1991, 1),
                  end = c(1991, nTrain))

验证数据

valid.ts = window(ridership.ts, 
                  start = c(1991, nTrain + 1), 
                  end = c(1991, nTrain + nValid))

建筑模型

ridership.lm = tslm(train.ts ~ trend + I(trend^2))

使用我们的构建模型进行预测

ridership.lm.pred = forecast(ridership.lm, h = nValid, level = 0)

为拟合的模型值制作数据框

tsd_train_model = data.frame(time = as.Date(train.ts), 
                             lm_fit_train = as.matrix(ridership.lm$fitted.values))

为绘图目的制作数据框

forecast_df = data.frame(time = as.Date(valid.ts), 
                         value = as.matrix(ridership.lm.pred$mean))

使用ggplot创建绘图

p1 = ggplot(data = tsd, 
            aes(x = time, y = value)) + 
  geom_line(color = 'blue') + 
  ylim(1300, 2300) + 
  geom_line(data = tsd_train_model, 
            aes(x = time, y = lm_fit_train), 
            color = 'red')

p2 = p1 + 
  geom_line(data = forecast_df, 
            aes(x = time, y = value), 
            col = 'red', linetype = 'dotted') + 
  scale_x_date(breaks = date_breaks('1 years'), 
               labels = date_format('%b-%y')) +
  geom_vline(xintercept = as.numeric(c(tsd_train_model[NROW(tsd_train_model), ]$time,  #last date of training period
                                       forecast_df[NROW(forecast_df), ]$time))) #last date of testing period 

p3 = p2 + 
  annotate('text', 
           x = c(tsd_train_model[NROW(tsd_train_model)/2, ]$time, 
                 forecast_df[NROW(forecast_df) / 2,]$time),
           y = 2250, 
           label = c('Training Period', 'Validation Period'))

目标：我想在预测线的两侧添加5百分位和95百分位的预测误差（在该图中用红色点缀）并遮蔽该区域。

我使用分位数来生成预测范围

q = quantile(ridership.lm.pred$residuals, c(.05, .95))

percentile_5 = as.numeric(q[1])
percentile_95 = as.numeric(q[2])

为预测数据添加5个百分点和95个百分点

yl = forecast_df$value + percentile_5 
ym =  forecast_df$value  + percentile_95

问题：如果我使用以下命令，那么它不会在整个验证期间显示阴影区域。

p3 + geom_ribbon(data = forecast_df, 
                 aes(ymin = yl, 
                     ymax = ym), 
                 fill="gray30")

NROW(yl)
 [1]36

sum(is.na(yl))
[1] 0

NROW(ym)
[1] 36

sum(is.na(ym))
[1] 0

尝试的事情：如果我用任何其他值替换ymin和ymax的值例如，如果我使用下面的命令，那么我得到的命令正好在命令

下面

p3 + geom_ribbon(data = forecast_df, 
                 aes(ymin = rep(1750,36), 
                     ymax = rep(2000,36), 
                     fill="gray30"))

我的问题：

有谁能告诉我图2输出背后的原因？为什么R会给出奇怪的输出，如图2所示？

有人可以帮我用ggplot遮蔽整个区域吗？

Answer 1

TLDR：从ylim(1300, 2300) +代码中移除ggplot行。

当您使用scale_x_***() / scale_y_***（或等效xlim() / ylim()）设置图表的限制时，图表会丢弃所有数据点落在这个范围之外。在geom_ribbon的情况下，需要ymin和amp; ymax值，当对应于ymax的值被移除时（因为它们大于2300），色带不能仅用ymin绘制，因此色带在此之前停止。

如果您确实只想绘制范围（1300,2300），请在coord_cartesian()内设置限制。这使得绘图可以缩放到范围限制，而不会丢弃外部的数据点。有关详细信息，请参阅documentation。

以下其他非必要建议：

为了在ggplot中绘图，我通常会尽可能地将所有内容保存在同一数据框中，以便在美学映射中使用公共变量。以下是我的表现：

将所有内容组合到一个数据框中：

library(dplyr)
df <- left_join(tsd %>% select(time, value),
                rbind(tsd_train_model %>% 
                        rename(fit = lm_fit_train) %>%
                        mutate(status = "train"),
                      forecast_df %>%
                        rename(fit = value) %>%
                        mutate(status = "valid")))
df <- df %>%
  mutate(yl = ifelse(status == "valid", fit + percentile_5, NA),
         ym = ifelse(status == "valid", fit + percentile_95, NA))

> head(df)
        time value      fit status yl ym
1 1991-01-01  1709 1882.681  train NA NA
2 1991-02-01  1621 1876.546  train NA NA
3 1991-03-01  1973 1870.518  train NA NA
4 1991-04-01  1812 1864.597  train NA NA
5 1991-05-01  1975 1858.784  train NA NA
6 1991-06-01  1862 1853.078  train NA NA

> tail(df)
          time value      fit status       yl       ym
154 2003-10-01  2121 2190.490  valid 1934.914 2397.875
155 2003-11-01  2076 2200.756  valid 1945.179 2408.141
156 2003-12-01  2141 2211.129  valid 1955.553 2418.514
157 2004-01-01  1832 2221.609  valid 1966.033 2428.994
158 2004-02-01  1838 2232.197  valid 1976.620 2439.582
159 2004-03-01  2132 2242.891  valid 1987.315 2450.277

剧情

ggplot(data = df,
       aes(x = time)) +

  # place the ribbon below all other geoms for easier viewing, & increase transparency
  geom_ribbon(aes(ymin = yl, ymax = ym), fill = "gray30", alpha = 0.2) +

  # original values
  geom_line(aes(y = value), color = "blue") +

  # fitted values (line type differs by training / validation)
  geom_line(aes(y = fit, linetype = status), color = "red") +

  # indicates validation range
  geom_vline(xintercept = c(min(df$time[df$status=="valid"]),
                            max(df$time[df$status=="valid"]))) +

  scale_x_date(breaks = scales::date_breaks('1 year'), 
               labels = scales::date_format('%b-%y')) +

  # hide legend for line type (comment this line out if you want to show it)
  scale_linetype(guide = F) + 

  # limits can be tweaked here
  coord_cartesian(ylim = c(1300, 2500)) +

  # plain white plot background for easier viewing
  theme_classic()

修改：使传说更容易的替代解决方案：

# create long data frame where all values (original / training / validation) are
# in the same column
df2 <- rbind(tsd %>% select(time, value) %>%
               mutate(status = "original"),
             tsd_train_model %>% 
               rename(value = lm_fit_train) %>%
               mutate(status = "train"),
             forecast_df %>%
               mutate(status = "valid")) %>%
  mutate(yl = ifelse(status == "valid", value + percentile_5, NA),
         ym = ifelse(status == "valid", value + percentile_95, NA))

# in the scales for colour / line type, define the same labels in order to
# combine the two legends
ggplot(data = df2,
       aes(x = time)) +
  geom_ribbon(data = subset(df2, !is.na(yl)),
              aes(ymin = yl, ymax = ym, fill = "interval"), alpha = 0.2) +
  geom_line(aes(y = value, color = status, linetype = status)) +
  geom_vline(xintercept = c(min(df2$time[df$status=="valid"]),
                            max(df2$time[df$status=="valid"]))) +
  scale_x_date(breaks = scales::date_breaks('1 year'), 
               labels = scales::date_format('%b-%y')) +
  scale_color_manual(name = "",
                     values = c("original" = "blue",
                                "train" = "red",
                                "valid" = "red")) +
  scale_linetype_manual(name = "",
                 values = c("original" = "solid",
                            "train" = "solid",
                            "valid" = "longdash")) +
  scale_fill_manual(name = "",
                    values = c("interval" = "gray30")) +
  coord_cartesian(ylim = c(1300, 2500)) +
  theme_classic() +
  theme(legend.position = "bottom")

使用ggplot

1 个答案: