以下是我感兴趣的测试: http://www.itl.nist.gov/div898/handbook/eda/section3/eda35h3.htm
如何将此代码调整为接受数值向量的函数,并返回指定要删除哪些数据点的逻辑向量?
我试图在下面这样做,但是我遇到了困难,因为当我对矢量进行排序以返回时,它不会与输入矢量数据对齐。
# input data
y = c(-0.25, 0.68, 0.94, 1.15, 1.20, 1.26, 1.26,
1.34, 1.38, 1.43, 1.49, 1.49, 1.55, 1.56,
1.58, 1.65, 1.69, 1.70, 1.76, 1.77, 1.81,
1.91, 1.94, 1.96, 1.99, 2.06, 2.09, 2.10,
2.14, 2.15, 2.23, 2.24, 2.26, 2.35, 2.37,
2.40, 2.47, 2.54, 2.62, 2.64, 2.90, 2.92,
2.92, 2.93, 3.21, 3.26, 3.30, 3.59, 3.68,
4.30, 4.64, 5.34, 5.42, 6.01)
## Generate normal probability plot.
qqnorm(y)
removeoutliers = function(dfinputcol) {
y = as.vector(dfinputcol)
## Create function to compute the test statistic.
rval = function(y){
ares = abs(y - mean(y))/sd(y)
df = data.frame(y, ares)
r = max(df$ares)
list(r, df)}
## Define values and vectors.
n = length(y)
alpha = 0.05
lam = c(1:10)
R = c(1:10)
## Compute test statistic until r=10 values have been
## removed from the sample.
for (i in 1:10){
if(i==1){
rt = rval(y)
R[i] = unlist(rt[1])
df = data.frame(rt[2])
newdf = df[df$ares!=max(df$ares),]}
else if(i!=1){
rt = rval(newdf$y)
R[i] = unlist(rt[1])
df = data.frame(rt[2])
newdf = df[df$ares!=max(df$ares),]}
## Compute critical value.
p = 1 - alpha/(2*(n-i+1))
t = qt(p,(n-i-1))
lam[i] = t*(n-i) / sqrt((n-i-1+t**2)*(n-i+1))
}
## Print results.
newdf = data.frame(c(1:10),R,lam)
names(newdf)=c("Outliers","TestStat.", "CriticalVal.")
# determine how many outliers to remove
toremove = max(newdf$Outliers[newdf$TestStat. > newdf$CriticalVal.])
# create vector of same size as input vector
logicalvectorTifshouldremove = logical(length=length(y))
# but how to determine which outliers to remove?
# set largest data points as outliers to remove.. but could be the smallest in some data sets..
logicalvectorTifshouldremove = replace(logicalvectorTifshouldremove, tail(sort(y), toremove), TRUE)
return (logicalvectorTifshouldremove)
}
# this should have 3 data points set to TRUE .. but it has 2 and they aren't the correct ones
output = removeoutliers(y)
length(output[output==T])
答案 0 :(得分:1)
您可以在库robustHD
上使用winsorize库(' robustHD&#39)
set.seed(1234)
x <- rnorm(10)
x[1] <- x[1] * 10
x[2] <- x[2] * 11
x[10] <- x[10] * 10
x
[1] -12.0706575 3.0517217 1.0844412 -2.3456977 0.4291247 0.5060559 -0.5747400 -0.5466319 -0.5644520 -8.9003783
boxplot(x)
y <- winsorize(x)
y
[1] -4.5609058 3.0517217 1.0844412 -2.3456977 0.4291247 0.5060559 -0.5747400 -0.5466319 -0.5644520 -4.5609058
boxplot(y)
所以如果你有数据框或向量,你可以使用sapply来执行winsorize函数。 有关此库的详细信息,请访问此链接http://cran.r-project.org/web/packages/robustHD/index.html
答案 1 :(得分:1)
我认为它写在你给出的页面中(不完全是两句话):
删除最大化| x_i - 意味着(x)|
的 r 观察
因此,只需删除 r 那些超出差异的数据,即可获得没有异常值的数据,使用:
y[abs(y-mean(y)) < sort(abs(y-mean(y)),decreasing=TRUE)[toremove]]
您不需要代码的最后两行。顺便说一下,你可以直接计算:
toremove = which(newdf$TestStat > newdf$CriticalVal)
为简化一点,最终的功能可能是:
# Compute the critical value for ESD Test
esd.critical <- function(alpha, n, i) {
p = 1 - alpha/(2*(n-i+1))
t = qt(p,(n-i-1))
return(t*(n-i) / sqrt((n-i-1+t**2)*(n-i+1)))
}
removeoutliers = function(y) {
## Define values and vectors.
y2 = y
n = length(y)
alpha = 0.05
toremove = 0
## Compute test statistic until r=10 values have been
## removed from the sample.
for (i in 1:10){
if(sd(y2)==0) break
ares = abs(y2 - mean(y2))/sd(y2)
Ri = max(ares)
y2 = y2[ares!=Ri]
## Compute critical value.
if(Ri>esd.critical(alpha,n,i))
toremove = i
}
# Values to keep
if(toremove>0)
y = y[abs(y-mean(y)) < sort(abs(y-mean(y)),decreasing=TRUE)[toremove]]
return (y)
}
返回:
> removeoutliers(y)
[1] -0.25 0.68 0.94 1.15 1.20 1.26 1.26 1.34 1.38 1.43 1.49
[12] 1.49 1.55 1.56 1.58 1.65 1.69 1.70 1.76 1.77 1.81 1.91
[23] 1.94 1.96 1.99 2.06 2.09 2.10 2.14 2.15 2.23 2.24 2.26
[34] 2.35 2.37 2.40 2.47 2.54 2.62 2.64 2.90 2.92 2.92 2.93
[45] 3.21 3.26 3.30 3.59 3.68 4.30 4.64