提高大数据上的面网格图的性能

时间:2019-07-10 13:52:48

标签: r performance ggplot2 facet-grid

我有几个时间序列,需要绘制每种组合的散点图。当我已经发布了代码here时,在您的帮助下,我弄清楚了如何使用facet_grid()包中的ggplot2很好地绘制整个内容。

现在的问题是性能。下面的例子很小。您可以将n <- 50000设置为触摸我需要处理的少量数据。我认为最耗时的部分是生成具有所有组合(尤其是所有重复项)的FACET-Data_Frame。最后,由于我要经过的行数过多,绘图调用也需要很长时间。 nrow(FACET)length(df) * length(df) * n,在我的实际情况下,n = 50000length(df) = 10是500万。

library(tidyverse)
set.seed(214)

n <- 1000
df <- tibble(v1 = runif(n), v2 = runif(n)*0.1 + v1, v3 = runif(n)*0.2 + v2, v4 = runif(n)*0.3 + v3, v5 = runif(n)*0.4 + v4, v6 = runif(n)*0.5 + v5)

C                   <- crossing(w1 = 1:length(df), w2 = 1:length(df))    # Alle Kombinationsmöglichkeiten

FACET_LIST <- lapply(1:nrow(C), function(c) { # c <- 14   C[c,]
  tibble(a1 = unlist(df[, C$w1[c]], use.names = FALSE), 
         a2 = unlist(df[, C$w2[c]], use.names = FALSE), 
         name1 = names(df[, C$w1[c]]),
         name2 = names(df[, C$w2[c]])
  )
})

FACET <- do.call(rbind.data.frame, FACET_LIST)

FACET$name1 <- as_factor(FACET$name1)
FACET$name2 <- as_factor(FACET$name2)

dat_text <- tibble(
  name1 = rep(names(df), each = length(names(df))), 
  name2 = rep(names(df), length(names(df)))
)

p <- ggplot()
p <- p + geom_point(data=FACET, aes(a1, a2), size = 0.5)
p <- p + stat_smooth(data=FACET, aes(a1, a2), method = "lm")
p <- p + facet_grid(vars(name1), vars(name2)) + coord_fixed()
p

是否有更有效的方法将需求信息传递到facet_grid()图?还是有其他方法可以加速我的代码?

1 个答案:

答案 0 :(得分:1)

所以我用n = 50000进行了许多测试:

base <- system.time({
  p <- ggplot()
  p <- p + geom_point(data=FACET, aes(a1, a2), size = 0.5)
  print(p)
})

facet <- system.time({
  p <- ggplot()
  p <- p + geom_point(data=FACET, aes(a1, a2), size = 0.5)
  p <- p + facet_grid(vars(name1), vars(name2)) + coord_fixed()
  print(p)
})

# Adding group to stat_smooth, so the number of lines it 
# has to estimate is consistent with the facetted option
smooth <- system.time({
  p <- ggplot()
  p <- p + geom_point(data=FACET, aes(a1, a2), size = 0.5)
  p <- p + stat_smooth(data=FACET, aes(a1, a2, group = interaction(name1, name2)), method = "lm")
  print(p)
})

smooth_facet <- system.time({
  p <- ggplot()
  p <- p + geom_point(data=FACET, aes(a1, a2), size = 0.5)
  p <- p + stat_smooth(data=FACET, aes(a1, a2), method = "lm")
  p <- p + facet_grid(vars(name1), vars(name2)) + coord_fixed()
  print(p)
})

building <- system.time({
  pp <- ggplot_build(p)
})

interpreting <- system.time({
  ppp <- ggplotGrob(pp$plot)
})

library(grid)
drawing <- system.time({
  grid.newpage(); grid.draw(ppp)
})

alternative <- system.time({
  g <- ggplot()
  g <- g + geom_point(data=FACET, aes(a1, a2), size = 0.5, shape = ".")
  g <- g + stat_smooth(data=FACET, aes(a1, a2), method = "lm")
  g <- g + facet_grid(vars(name1), vars(name2)) + coord_fixed()
  print(g)
})

这些是结果:

rbind(base, facet, smooth, smooth_facet, building, interpreting, drawing, alternative)
             user.self sys.self elapsed user.child sys.child
base              8.34    30.96   39.44         NA        NA
facet             8.56    30.48   39.12         NA        NA
smooth           10.00    31.14   41.18         NA        NA
smooth_facet     10.14    31.50   41.73         NA        NA
building          2.59     0.42    3.03         NA        NA
interpreting      5.08     0.61    5.76         NA        NA
drawing           5.13    30.23   35.39         NA        NA
alternative       7.58     8.23   15.86         NA        NA

这会向我建议,不是ggplot的代码很慢,而是绘图代码或必须绘制许多点的事实。

但是,您似乎可以通过不使用四舍五入的点,而是使用shape = "."语句中的geom_point()来将时间减少一半以上(如“替代”测试中一样) )。无论如何,您都可能过度绘制点。外观如下:

enter image description here