如何加快我的功能(特别是ggplot命令)的速度?

时间:2019-01-20 01:54:44

标签: r ggplot2

我组合了一个识别异常值的函数。它需要一个数据框,然后用线显示数据图以指示潜在的异常值。它也会在表格上标出离群值。

但是,它是SLOOOW。问题是要加载地图需要很长时间。

我很好奇您是否对如何加快此速度有任何建议。

相关:默认绘图系统比ggplot快吗?

我将从依赖关系开始

#These next four functions are not mine. They're used in GetOutliers()

ExtractDetails <- function(x, down, up){

  outClass <- rep("N", length(x))
  indexLo <- which(x < down)
  indexHi <- which(x > up)
  outClass[indexLo] <- "L"
  outClass[indexHi] <- "U"
  index <- union(indexLo, indexHi)
  values <- x[index]
  outClass <- outClass[index]
  nOut <- length(index)
  maxNom <- max(x[which(x <= up)])
  minNom <- min(x[which(x >= down)])
  outList <- list(nOut = nOut, lowLim = down,
                  upLim = up, minNom = minNom,
                  maxNom = maxNom, index = index,
                  values = values,
                  outClass = outClass)
  return(outList)
}

Hampel <- function(x, t = 3){
  #
  mu <- median(x, na.rm = TRUE)
  sig <- mad(x, na.rm = TRUE)
  if (sig == 0){
    message("Hampel identifer implosion: MAD scale estimate is zero")
  }
  up<-mu+t*sig
  down<-mu-t*sig
  out <- list(up = up, down = down)
  return(out)
}


ThreeSigma <- function(x, t = 3){
  #
  mu <- mean(x, na.rm = TRUE)
  sig <- sd(x, na.rm = TRUE)
  if (sig == 0){
    message("All non-missing x-values are identical")
  }
  up<-mu+t* sig
  down<-mu-t * sig
  out <- list(up = up, down = down)
  return(out)
}

BoxplotRule <- function(x, t = 1.5){
  #
  xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
  xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
  Q<-xU-xL
  if(Q==0){
    message("Boxplot rule implosion: interquartile distance is zero")
  }
  up<-xU+t*Q
  down<-xU-t*Q
  out <- list(up = up, down = down)
  return(out)
}

FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
  threeLims <- ThreeSigma(x, t = t3)
  HampLims <- Hampel(x, t = tH)
  boxLims <- BoxplotRule(x, t = tb)

  n <- length(x)
  nMiss <- length(which(is.na(x)))

  threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
  HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
  boxList <- ExtractDetails(x, boxLims$down, boxLims$up)

  sumFrame <- data.frame(method = "ThreeSigma", n = n,
                         nMiss = nMiss, nOut = threeList$nOut,
                         lowLim = threeList$lowLim,
                         upLim = threeList$upLim,
                         minNom = threeList$minNom,
                         maxNom = threeList$maxNom)
  upFrame <- data.frame(method = "Hampel", n = n,
                         nMiss = nMiss, nOut = HampList$nOut,
                         lowLim = HampList$lowLim,
                         upLim = HampList$upLim,
                         minNom = HampList$minNom,
                         maxNom = HampList$maxNom)
  sumFrame <- rbind.data.frame(sumFrame, upFrame)
  upFrame <- data.frame(method = "BoxplotRule", n = n,
                         nMiss = nMiss, nOut = boxList$nOut,
                         lowLim = boxList$lowLim,
                         upLim = boxList$upLim,
                         minNom = boxList$minNom,
                         maxNom = boxList$maxNom)
  sumFrame <- rbind.data.frame(sumFrame, upFrame)

  threeFrame <- data.frame(index = threeList$index,
                         values = threeList$values,
                         type = threeList$outClass)
  HampFrame <- data.frame(index = HampList$index,
                        values = HampList$values,
                        type = HampList$outClass)
  boxFrame <- data.frame(index = boxList$index,
                       values = boxList$values,
                       type = boxList$outClass)
  outList <- list(summary = sumFrame, threeSigma = threeFrame,
                Hampel = HampFrame, boxplotRule = boxFrame)
  return(outList)
}

#strip non-numeric variables out of a dataframe
num_vars <- function(df){
  X <- which(sapply(df, is.numeric))
  num_vars <- df[names(X)]
  return(num_vars)
}

这是功能

GetOutliers <- function(df){
  library('dplyr')
  library('ggplot2')

  #strip out the non-numeric columns
  df_out <- num_vars(df)

  #initialize the data frame
  df_out$Hampel <- NA
  df_out$threeSigma <- NA
  df_out$boxplotRule <- NA
  df_out_id <- df_out

  #identify outliers for each column
  for (i in 1:length(names(num_vars(df)))){

    #find the outliers
    Outs <- FindOutliers(df_out[[i]])
    OutsSum <- Outs$summary

    #re-enter the outlier status
    df_out$Hampel <- NA
    df_out$threeSigma <- NA
    df_out$boxplotRule <- NA
    ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
    ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
    ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)

    #visualize the outliers and print outlier information
    Temp <- df_out
    A <- colnames(Temp)[i]
    AA <- paste(A,"Index")
    colnames(Temp)[i] <- 'curr_column'

    #table with outlier status
    X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))

    #scatterplot with labels
    Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
      geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
      geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
      geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
      geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
      geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
      geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
      geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
      xlab(AA) + ylab(A)

    #scatterplot without labels
    Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
      geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
      geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
      xlab(AA) + ylab(A)

    U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)

    print(A)
    print(X)
    print(OutsSum)
    print(Z)
    print(Y)
    print(U)

    #mark the extreme outliers, the rest are reasonable outliers
    A <- colnames(df_out_id[i])
    Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
    W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
    col <- df_out_id[i]
    df_out_id[i] <- sapply(col[[1]], function(x){
      if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
      if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
      else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
      else return('Non-Outlier')
    })

  }

  #return a dataframe with outlier status, excluding the outlier ID columns
  summary(df_out_id)
  return(df_out_id[1:(length(names(df_out_id))-3)])
}

示例

library('ISLR')
data(Carseats)

GetOutliers(Carseats)

它将为您显示每个数字变量的离群值。

enter image description here

它将绘制可变密度,然后绘制带有标识线的散点图

enter image description here

它还将接受输入,因此您可以将某些离群值标记为合理,而将其他离群值标记为极端

0 个答案:

没有答案