我组合了一个识别异常值的函数。它需要一个数据框,然后用线显示数据图以指示潜在的异常值。它也会在表格上标出离群值。
但是,它是SLOOOW。问题是要加载地图需要很长时间。
我很好奇您是否对如何加快此速度有任何建议。
相关:默认绘图系统比ggplot快吗?
我将从依赖关系开始
#These next four functions are not mine. They're used in GetOutliers()
ExtractDetails <- function(x, down, up){
outClass <- rep("N", length(x))
indexLo <- which(x < down)
indexHi <- which(x > up)
outClass[indexLo] <- "L"
outClass[indexHi] <- "U"
index <- union(indexLo, indexHi)
values <- x[index]
outClass <- outClass[index]
nOut <- length(index)
maxNom <- max(x[which(x <= up)])
minNom <- min(x[which(x >= down)])
outList <- list(nOut = nOut, lowLim = down,
upLim = up, minNom = minNom,
maxNom = maxNom, index = index,
values = values,
outClass = outClass)
return(outList)
}
Hampel <- function(x, t = 3){
#
mu <- median(x, na.rm = TRUE)
sig <- mad(x, na.rm = TRUE)
if (sig == 0){
message("Hampel identifer implosion: MAD scale estimate is zero")
}
up<-mu+t*sig
down<-mu-t*sig
out <- list(up = up, down = down)
return(out)
}
ThreeSigma <- function(x, t = 3){
#
mu <- mean(x, na.rm = TRUE)
sig <- sd(x, na.rm = TRUE)
if (sig == 0){
message("All non-missing x-values are identical")
}
up<-mu+t* sig
down<-mu-t * sig
out <- list(up = up, down = down)
return(out)
}
BoxplotRule <- function(x, t = 1.5){
#
xL <- quantile(x, na.rm = TRUE, probs = 0.25, names = FALSE)
xU <- quantile(x, na.rm = TRUE, probs = 0.75, names = FALSE)
Q<-xU-xL
if(Q==0){
message("Boxplot rule implosion: interquartile distance is zero")
}
up<-xU+t*Q
down<-xU-t*Q
out <- list(up = up, down = down)
return(out)
}
FindOutliers <- function(x, t3 = 3, tH = 3, tb = 1.5){
threeLims <- ThreeSigma(x, t = t3)
HampLims <- Hampel(x, t = tH)
boxLims <- BoxplotRule(x, t = tb)
n <- length(x)
nMiss <- length(which(is.na(x)))
threeList <- ExtractDetails(x, threeLims$down, threeLims$up)
HampList <- ExtractDetails(x, HampLims$down, HampLims$up)
boxList <- ExtractDetails(x, boxLims$down, boxLims$up)
sumFrame <- data.frame(method = "ThreeSigma", n = n,
nMiss = nMiss, nOut = threeList$nOut,
lowLim = threeList$lowLim,
upLim = threeList$upLim,
minNom = threeList$minNom,
maxNom = threeList$maxNom)
upFrame <- data.frame(method = "Hampel", n = n,
nMiss = nMiss, nOut = HampList$nOut,
lowLim = HampList$lowLim,
upLim = HampList$upLim,
minNom = HampList$minNom,
maxNom = HampList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
upFrame <- data.frame(method = "BoxplotRule", n = n,
nMiss = nMiss, nOut = boxList$nOut,
lowLim = boxList$lowLim,
upLim = boxList$upLim,
minNom = boxList$minNom,
maxNom = boxList$maxNom)
sumFrame <- rbind.data.frame(sumFrame, upFrame)
threeFrame <- data.frame(index = threeList$index,
values = threeList$values,
type = threeList$outClass)
HampFrame <- data.frame(index = HampList$index,
values = HampList$values,
type = HampList$outClass)
boxFrame <- data.frame(index = boxList$index,
values = boxList$values,
type = boxList$outClass)
outList <- list(summary = sumFrame, threeSigma = threeFrame,
Hampel = HampFrame, boxplotRule = boxFrame)
return(outList)
}
#strip non-numeric variables out of a dataframe
num_vars <- function(df){
X <- which(sapply(df, is.numeric))
num_vars <- df[names(X)]
return(num_vars)
}
这是功能
GetOutliers <- function(df){
library('dplyr')
library('ggplot2')
#strip out the non-numeric columns
df_out <- num_vars(df)
#initialize the data frame
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
df_out_id <- df_out
#identify outliers for each column
for (i in 1:length(names(num_vars(df)))){
#find the outliers
Outs <- FindOutliers(df_out[[i]])
OutsSum <- Outs$summary
#re-enter the outlier status
df_out$Hampel <- NA
df_out$threeSigma <- NA
df_out$boxplotRule <- NA
ifelse(is.na(Outs$Hampel), print(), df_out[unlist(Outs$Hampel[1]),]$Hampel <- TRUE)
ifelse(is.na(Outs$threeSigma), print(), df_out[unlist(Outs$threeSigma[1]),]$threeSigma <- TRUE)
ifelse(is.na(Outs$boxplotRule), print(), df_out[unlist(Outs$boxplotRule[1]),]$boxplotRule <- TRUE)
#visualize the outliers and print outlier information
Temp <- df_out
A <- colnames(Temp)[i]
AA <- paste(A,"Index")
colnames(Temp)[i] <- 'curr_column'
#table with outlier status
X <- arrange(subset(Temp,Hampel == TRUE | boxplotRule == TRUE | threeSigma == TRUE), desc(curr_column))
#scatterplot with labels
Y <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
geom_text(aes(40,OutsSum$lowLim[1],label="ThreeSigma Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[2],label="Hampel Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$lowLim[3],label="Boxplot Lower",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[1],label="ThreeSigma Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[2],label="Hampel Upper",vjust=-1)) +
geom_text(aes(40,OutsSum$upLim[3],label="Boxplot Upper",vjust=-1)) +
xlab(AA) + ylab(A)
#scatterplot without labels
Z <- ggplot(Temp,aes(seq_along(curr_column),curr_column)) + geom_point() +
geom_hline(yintercept=OutsSum$lowLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$lowLim[3],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[1],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[2],linetype = 'dashed') +
geom_hline(yintercept=OutsSum$upLim[3],linetype = 'dashed') +
xlab(AA) + ylab(A)
U <- ggplot(Temp,aes(curr_column)) + geom_density() + xlab(A)
print(A)
print(X)
print(OutsSum)
print(Z)
print(Y)
print(U)
#mark the extreme outliers, the rest are reasonable outliers
A <- colnames(df_out_id[i])
Q <- as.numeric(readline(prompt="Enter the index for final Extreme value on the upper limit (if none, enter 0): "))
W <- as.numeric(readline(prompt="Enter the index for first Extreme value on the lower limit (if none, enter 0): "))
col <- df_out_id[i]
df_out_id[i] <- sapply(col[[1]], function(x){
if(Q>1 & x %in% X$curr_column[1:Q]) return('Extreme')
if(W>1 & x %in% X$curr_column[W:length(X$curr_column)]) return('Extreme')
else if (x %in% X$curr_column[Q+1:length(X$curr_column)]) return('Reasonable')
else return('Non-Outlier')
})
}
#return a dataframe with outlier status, excluding the outlier ID columns
summary(df_out_id)
return(df_out_id[1:(length(names(df_out_id))-3)])
}
示例
library('ISLR')
data(Carseats)
GetOutliers(Carseats)
它将为您显示每个数字变量的离群值。
它将绘制可变密度,然后绘制带有标识线的散点图
它还将接受输入,因此您可以将某些离群值标记为合理,而将其他离群值标记为极端