Question

我有五个不同的时间序列，每个序列都覆盖连续三年。现在我想在图上用间隙来显示这些序列中的缺失值。因此，我打算创建另一个与这些序列相对应的数据框：有值的位置用 1 替换，缺失的位置保留为 NA。这样的示例数据框如下：

# Minute-resolution timestamps from 2014-01-01 up to 2016-12-31
timeindex <- seq(
  from = as.POSIXct("2014-01-01"),
  to = as.POSIXct("2016-12-31"),
  by = "1 mins"
)
# Number of observations every series must have
n_obs <- length(timeindex)
# Draw one random series of 0s and 1s, as long as the time index
draw_series <- function() {
  sample(c(0, 1), size = n_obs, replace = TRUE)
}
# Five independent series, drawn in order
sequence_1 <- draw_series()
sequence_2 <- draw_series()
sequence_3 <- draw_series()
sequence_4 <- draw_series()
sequence_5 <- draw_series()
# Collect the series column-wise
df <- data.frame(
  sequence_1,
  sequence_2,
  sequence_3,
  sequence_4,
  sequence_5
)
# A 0 stands for a missing observation: turn it into NA, everything else into 1
df <- ifelse(df == 0, NA, 1)
# Put the timestamps in front of the series
df_with_time <- data.frame(timeindex, df)

现在的问题是如何在一张图中显示缺失值（间隙）。我的想法是先把数据框融合（melt）成长格式，再将 geom_line() 与 facet_grid() 一起使用，但我的计算机似乎会无限期地挂起。代码是：

library(ggplot2)
# Reshape to long format, keeping the timestamp as the identifier column
df_melt <- reshape2::melt(df_with_time, id.vars = "timeindex")
# One facet row per series, each drawn as a line over time
ggplot(data = df_melt, mapping = aes(x = timeindex, y = value, variable)) +
  geom_line() +
  facet_grid(variable ~ .)
# Commented-out variant of the same plot using geom_area():
#ggplot(df_melt,aes(timeindex,value,variable)) +  geom_area() + facet_grid(variable~.)

现在我有两个问题：

ggplot 无法在一台配有 8GB 内存、2.6 GHz 处理器的机器上绘制如此庞大的数据。有没有其他方法可以绘制这么大的数据？
是否有其他方法可以显示数据中的差距（缺失值）？

更新：我想要一张这样的图：

缺失的数据点显示为间隙。

Answer 1

除了按序列汇总 NA 之外，我建议您对数据进行基于时间的分箱（binning）。简而言之，您可以统计每个 30 分钟或 60 分钟窗口中的 NA 数量，并使用 ggplot 绘制这些计数。下面给出一个例子。

# Binning: count the NAs of every series inside fixed-size windows of rows
head(df_with_time)
time.gap <- 60 # rows per bin; the sample data has one row per minute, so 60 = one hour
# Use complete bins only. Deriving the bin count from nrow() keeps a full
# final bin when nrow() is an exact multiple of time.gap (dropping the last
# start index unconditionally would lose it) and still ignores a trailing
# partial bin.
n.bins <- nrow(df_with_time) %/% time.gap
idx <- seq(1, by = time.gap, length.out = n.bins) # first row of each bin
bin.id <- rep(seq_len(n.bins), each = time.gap)   # bin number of each kept row
# Logical NA flags for the series columns (column 1 is the timestamp);
# drop = FALSE keeps a two-dimensional object even with a single series column
na.flags <- is.na(df_with_time[seq_along(bin.id), -1, drop = FALSE])
# rowsum() adds the flags up per bin in one vectorised pass; "+ 0L" turns the
# logical flags into integers so the counts stay integer
na.counts <- data.frame(time = df_with_time[["timeindex"]][idx],
                        rowsum(na.flags + 0L, bin.id, reorder = FALSE),
                        stringsAsFactors = FALSE,
                        row.names = NULL)
head(na.counts)

# Long format for ggplot: one row per (time bin, series) with its NA count
df_melt <- reshape2::melt(na.counts, id.vars = "time")
# Integer code of each series, used as the vertical position of its track
df_melt$y <- as.integer(as.factor(df_melt$variable))
# Sort rows by their signed distance from the median count
value_median <- median(df_melt$value)
df_melt <- df_melt[order(df_melt$value - value_median), ]

# One vertical tick (shape 124) per bin, coloured by NA count on a diverging
# scale centred on the median
ggplot(data = df_melt, mapping = aes(x = time, y = y)) +
  geom_point(
    mapping = aes(colour = value),
    shape = 124,
    alpha = 0.75,
    size = 2.5
  ) +
  scale_colour_gradient2(
    low = "#01665e",
    mid = "#f5f5f5",
    high = "#8c510a",
    midpoint = median(df_melt$value)
  )

这是结果。

或者，你可能想要摆脱太接近中位数的值，只绘制你的'异常值'。由于这会删除大量数据，因此将快速生成绘图。

# Keep only the bins whose NA count is more than 8 away from the median
deviation <- abs(df_melt$value - median(df_melt$value))
df_melt2 <- df_melt[deviation > 8, ]

# Same track plot on the reduced data, with larger ticks; the colour scale
# stays centred on the median of the full data
ggplot(data = df_melt2, mapping = aes(x = time, y = y)) +
  geom_point(
    mapping = aes(colour = value),
    shape = 124,
    alpha = 0.75,
    size = 4.5
  ) +
  scale_colour_gradient2(
    low = "#01665e",
    mid = "#f5f5f5",
    high = "#8c510a",
    midpoint = median(df_melt$value)
  )

PS：我假设您关注的是那些远离中位数的值。如果您关心的是 NA 的总数，请改用 scale_colour_gradient()。

Answer 2

如果将其重塑为长格式：

# Long format: one row per (timestamp, series); the series name goes into
# column "Sequence" and the observation into column "Data"
data <- reshape2::melt(
  df_with_time,
  id.vars = "timeindex",
  value.name = "Data",
  variable.name = "Sequence"
)

您可以根据需要在ggplot中对其进行绘制：

# One horizontal line per series; Data is mapped to the line size
ggplot(
  data = data,
  mapping = aes(x = timeindex, y = Sequence, size = Data)
) +
  geom_line()

下面只取一个月的数据，以便缩小图的规模：

在一个图上显示多个时间序列中的缺失值

2 个答案: