删除除最小值和最大值之外的所有异常值; GGPLOT2

时间:2015-10-28 14:30:38

标签: r ggplot2 boxplot

我已经用 melt 将数据集转换为长格式,并用 ggplot2 绘制了箱线图。

# Reshape to long format, keeping NameNum as the identifier column.
# `id.vars` is written out in full instead of relying on partial argument
# matching of `id.var`.
dfplot.m <- melt(dataforplotscaled, id.vars = "NameNum")

# One box per variable, split and filled by NameNum; outliers drawn in black.
p <- ggplot(data = dfplot.m, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = as.factor(NameNum)), outlier.colour = "black")

所以它看起来像这样: ggplot2 boxplot

我可以将 outlier.colour 设置为 FALSE 来隐藏所有异常值。但是,能否只保留每个变量的单个最大值和单个最小值,而删除其余所有异常值?谢谢。

编辑2: enter image description here

# Dump a random 100-row sample of the melted data as a reproducible example.
# seq_len() is used instead of 1:nrow(), which yields c(1, 0) for zero rows.
dput(dfplot.m[sample(seq_len(nrow(dfplot.m)), 100, replace = FALSE), ])
structure(list(NameNum = c(1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1, 
1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 
2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2, 
2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 
1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1, 
1, 2, 2, 2), variable = structure(c(2L, 9L, 3L, 1L, 9L, 5L, 9L, 
5L, 9L, 5L, 2L, 5L, 5L, 9L, 2L, 6L, 4L, 1L, 8L, 3L, 5L, 6L, 4L, 
9L, 3L, 7L, 4L, 7L, 1L, 5L, 6L, 6L, 4L, 9L, 5L, 7L, 5L, 8L, 8L, 
4L, 3L, 4L, 6L, 1L, 5L, 3L, 9L, 5L, 8L, 1L, 1L, 5L, 1L, 1L, 1L, 
8L, 7L, 1L, 1L, 9L, 1L, 8L, 3L, 9L, 6L, 7L, 5L, 1L, 5L, 7L, 4L, 
3L, 4L, 1L, 2L, 1L, 4L, 2L, 9L, 9L, 4L, 5L, 9L, 3L, 4L, 7L, 4L, 
8L, 5L, 1L, 3L, 1L, 7L, 1L, 1L, 6L, 1L, 6L, 1L, 8L), .Label = c("Lexical.Diversity", 
"Lexical.Density", "Adjusted.Flesch.Index", "Words.per.Article", 
"Sentences.per.Article", "Syllables.per.Word", "Yule.s.I", "Semantic_Entropy", 
"News_Entropy_Full"), class = "factor"), value = c(-0.179489825391584, 
-0.792325582681448, 1.29830696675904, -0.372172260372429, 0.473812509279295, 
-0.153524114610876, 0.733216873520262, 0.159862654456996, 1.03813285673022, 
-1.22799303712929, -0.801484127248713, 1.01048388478408, -0.332602268363945, 
0.66932649596267, -0.397294142227917, -1.51039268603885, -0.939588396924419, 
1.31739241446542, 0.362129563570176, 2.43220201868067, -0.466910883678748, 
0.00762367594660348, 1.47033623968998, 0.56033306843174, -1.08779429476028, 
-0.614861840522387, -0.99006951408483, -0.588458131299134, 0.209767792916019, 
1.18956203853715, -1.50350408751644, 0.0897817201255289, 1.05112348414048, 
0.391145755883612, 0.294171269771798, -0.734554231683023, -0.60121949899355, 
-0.641086898714197, -0.265721827105799, 0.987473379894742, 0.388246974509265, 
0.302686051457864, -0.1477688795229, -0.244993198590439, -0.243063191487411, 
0.172830716237646, 0.998505690574715, -0.42214134524048, -1.20946387417143, 
-0.835885073498765, -0.694988387016547, -0.556449960555282, 3.03293150151147, 
-0.86883683187086, 0.0445475503649207, -0.36453451897155, 0.0866582788638452, 
-1.33882302006402, 0.0350835139394611, 0.723227903107476, -0.670493848522175, 
0.499257932941051, -0.942732073226944, -0.584142934508364, 1.8222314128331, 
-0.154919311457164, -0.466910883678748, -0.572000286640364, 0.115093116018728, 
-0.0217498642546982, -0.676208655217928, 0.917388044269897, -1.05591444951145, 
0.415923816164584, -0.919100535087332, 0.612734679661034, -0.395270264064337, 
-2.49841622541778, -0.175104307426753, -1.15243308738203, 1.1169684195671, 
0.741866654154472, 0.345946629759173, -0.561021804651866, 1.68104003305517, 
-0.252399447011283, -1.240280268706, -0.408600537376025, -1.00414534493796, 
2.47037393302189, 0.129265778455885, -1.28468737659633, -0.599025562669206, 
-0.143696213356152, -0.426830949451079, 0.739223485373646, 0.101588138431446, 
1.4233959109099, -0.85418312281721, -0.658803957246004)), .Names = c("NameNum", 
"variable", "value"), row.names = c(38596L, 322758L, 87485L, 
8989L, 313389L, 163593L, 313212L, 167000L, 324632L, 187503L, 
39789L, 152553L, 154681L, 308748L, 72620L, 205863L, 145649L, 
6466L, 287807L, 112967L, 155211L, 192004L, 135759L, 314675L, 
84997L, 245093L, 127005L, 249782L, 37979L, 166529L, 219834L, 
200867L, 149905L, 341712L, 158804L, 230897L, 177312L, 299014L, 
276196L, 116256L, 96190L, 145030L, 218225L, 31678L, 154210L, 
110942L, 313669L, 186700L, 291929L, 34515L, 19301L, 166914L, 
15960L, 31009L, 12238L, 302182L, 263125L, 7793L, 34356L, 342499L, 
12123L, 291952L, 89436L, 324929L, 228150L, 252579L, 173163L, 
27094L, 175068L, 239725L, 140518L, 113174L, 138132L, 1407L, 41388L, 
3601L, 116579L, 56938L, 334059L, 318082L, 143004L, 173021L, 317959L, 
81472L, 150147L, 234327L, 121440L, 304182L, 171002L, 33485L, 
82307L, 27032L, 262933L, 36494L, 18372L, 190798L, 1083L, 207963L, 
25490L, 295376L), class = "data.frame")

2 个答案:

答案 0 :(得分:2)

以下是我如何使用您提供的数据集进行操作。

首先,让我们得到一个数据集,其中每个因子组合只有最小值和最大值:

library(dplyr)

# Keep only the smallest and largest value for every variable/NameNum pair.
dat2 <- dfplot.m %>%
  group_by(variable, NameNum) %>%
  # Rows are sorted by value, so within each group the first row is the
  # minimum and the last row is the maximum.
  arrange(value) %>%
  # unique() avoids returning the same row twice for single-row groups.
  slice(unique(c(1L, n()))) %>%
  # Drop the grouping so later operations on dat2 are not silently per-group.
  ungroup()

然后使用geom_point绘制这些点:

library(ggplot2)

# Box plots per variable, filled by NameNum. outlier.shape = NA suppresses
# the default outlier points so that only the per-group minimum and maximum
# held in dat2 are drawn on top of the boxes.
ggplot(data = dfplot.m, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = as.factor(NameNum)), outlier.shape = NA) +
  # Mapping fill in this layer splits the points by NameNum, which lets
  # position_dodge(0.75) place them over the matching boxes.
  geom_point(
    data = dat2,
    aes(fill = as.factor(NameNum)),
    position = position_dodge(0.75)
  )

enter image description here

答案 1 :(得分:1)

如果您不希望须线(whisker,即箱体两端的线条)比正常情况更长,请使用:

# library() stops with an error if a package is missing, whereas require()
# only returns FALSE and lets the script fail later.
library(data.table)
library(ggplot2)

dat <- mtcars

# Per-cylinder minimum and maximum of wt, melted to long format so that each
# extreme is its own row (columns: cyl, variable, value).
ext <- melt(setDT(dat)[, range(wt), by = cyl], id.vars = "cyl")

# cyl is converted to a factor in both tables so the x axis is discrete.
ext$cyl <- as.factor(ext$cyl)
dat$cyl <- as.factor(dat$cyl)

# outlier.shape = NA is the documented way to hide the default outlier
# points; only the extremes stored in ext are drawn as points.
ggplot(dat, aes(cyl, wt)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(data = ext, aes(x = cyl, y = value))

这样得到的结果是:

enter image description here

而不是:

# Baseline for comparison: default box plot of wt by cyl, drawn with the
# standard outlier points.
ggplot(data = dat, mapping = aes(x = cyl, y = wt)) +
  geom_boxplot()

enter image description here