我已经用 melt 把数据集转换成长格式,并用 ggplot 绘制了箱线图。
# Reshape the scaled data to long format, keeping NameNum as the identifier;
# melt() puts the former column names in `variable` and their values in `value`.
# The argument is spelled `id.vars` in full instead of relying on partial
# matching of `id.var`.
dfplot.m <- melt(dataforplotscaled, id.vars = "NameNum")

# One box per NameNum level within each variable; outliers are drawn in black.
p <- ggplot(data = dfplot.m, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = as.factor(NameNum)), outlier.colour = "black")
我可以将 outlier.colour 设置为 FALSE 来移除异常值。但是,是否可以只保留每个变量的单个最大值和单个最小值,而移除其余所有异常值?谢谢。
# Dump 100 randomly chosen rows (sampled without replacement) of the melted
# data in a form that can be pasted back into R. seq_len() is used instead of
# 1:nrow(), which would yield c(1, 0) for an empty data frame.
dput(
  dfplot.m[sample(seq_len(nrow(dfplot.m)), 100, replace = FALSE), ]
)
structure(list(NameNum = c(1, 2, 1, 1, 1, 1, 1, 2, 2, 2, 1, 1,
1, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2, 1, 2,
2, 1, 1, 2, 2, 1, 1, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1,
1, 1, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2, 1,
1, 2, 2, 2), variable = structure(c(2L, 9L, 3L, 1L, 9L, 5L, 9L,
5L, 9L, 5L, 2L, 5L, 5L, 9L, 2L, 6L, 4L, 1L, 8L, 3L, 5L, 6L, 4L,
9L, 3L, 7L, 4L, 7L, 1L, 5L, 6L, 6L, 4L, 9L, 5L, 7L, 5L, 8L, 8L,
4L, 3L, 4L, 6L, 1L, 5L, 3L, 9L, 5L, 8L, 1L, 1L, 5L, 1L, 1L, 1L,
8L, 7L, 1L, 1L, 9L, 1L, 8L, 3L, 9L, 6L, 7L, 5L, 1L, 5L, 7L, 4L,
3L, 4L, 1L, 2L, 1L, 4L, 2L, 9L, 9L, 4L, 5L, 9L, 3L, 4L, 7L, 4L,
8L, 5L, 1L, 3L, 1L, 7L, 1L, 1L, 6L, 1L, 6L, 1L, 8L), .Label = c("Lexical.Diversity",
"Lexical.Density", "Adjusted.Flesch.Index", "Words.per.Article",
"Sentences.per.Article", "Syllables.per.Word", "Yule.s.I", "Semantic_Entropy",
"News_Entropy_Full"), class = "factor"), value = c(-0.179489825391584,
-0.792325582681448, 1.29830696675904, -0.372172260372429, 0.473812509279295,
-0.153524114610876, 0.733216873520262, 0.159862654456996, 1.03813285673022,
-1.22799303712929, -0.801484127248713, 1.01048388478408, -0.332602268363945,
0.66932649596267, -0.397294142227917, -1.51039268603885, -0.939588396924419,
1.31739241446542, 0.362129563570176, 2.43220201868067, -0.466910883678748,
0.00762367594660348, 1.47033623968998, 0.56033306843174, -1.08779429476028,
-0.614861840522387, -0.99006951408483, -0.588458131299134, 0.209767792916019,
1.18956203853715, -1.50350408751644, 0.0897817201255289, 1.05112348414048,
0.391145755883612, 0.294171269771798, -0.734554231683023, -0.60121949899355,
-0.641086898714197, -0.265721827105799, 0.987473379894742, 0.388246974509265,
0.302686051457864, -0.1477688795229, -0.244993198590439, -0.243063191487411,
0.172830716237646, 0.998505690574715, -0.42214134524048, -1.20946387417143,
-0.835885073498765, -0.694988387016547, -0.556449960555282, 3.03293150151147,
-0.86883683187086, 0.0445475503649207, -0.36453451897155, 0.0866582788638452,
-1.33882302006402, 0.0350835139394611, 0.723227903107476, -0.670493848522175,
0.499257932941051, -0.942732073226944, -0.584142934508364, 1.8222314128331,
-0.154919311457164, -0.466910883678748, -0.572000286640364, 0.115093116018728,
-0.0217498642546982, -0.676208655217928, 0.917388044269897, -1.05591444951145,
0.415923816164584, -0.919100535087332, 0.612734679661034, -0.395270264064337,
-2.49841622541778, -0.175104307426753, -1.15243308738203, 1.1169684195671,
0.741866654154472, 0.345946629759173, -0.561021804651866, 1.68104003305517,
-0.252399447011283, -1.240280268706, -0.408600537376025, -1.00414534493796,
2.47037393302189, 0.129265778455885, -1.28468737659633, -0.599025562669206,
-0.143696213356152, -0.426830949451079, 0.739223485373646, 0.101588138431446,
1.4233959109099, -0.85418312281721, -0.658803957246004)), .Names = c("NameNum",
"variable", "value"), row.names = c(38596L, 322758L, 87485L,
8989L, 313389L, 163593L, 313212L, 167000L, 324632L, 187503L,
39789L, 152553L, 154681L, 308748L, 72620L, 205863L, 145649L,
6466L, 287807L, 112967L, 155211L, 192004L, 135759L, 314675L,
84997L, 245093L, 127005L, 249782L, 37979L, 166529L, 219834L,
200867L, 149905L, 341712L, 158804L, 230897L, 177312L, 299014L,
276196L, 116256L, 96190L, 145030L, 218225L, 31678L, 154210L,
110942L, 313669L, 186700L, 291929L, 34515L, 19301L, 166914L,
15960L, 31009L, 12238L, 302182L, 263125L, 7793L, 34356L, 342499L,
12123L, 291952L, 89436L, 324929L, 228150L, 252579L, 173163L,
27094L, 175068L, 239725L, 140518L, 113174L, 138132L, 1407L, 41388L,
3601L, 116579L, 56938L, 334059L, 318082L, 143004L, 173021L, 317959L,
81472L, 150147L, 234327L, 121440L, 304182L, 171002L, 33485L,
82307L, 27032L, 262933L, 36494L, 18372L, 190798L, 1083L, 207963L,
25490L, 295376L), class = "data.frame")
答案 0 :(得分:2)
以下是我使用您提供的数据集的做法。
首先,构造一个数据集,其中每个因子组合只保留最小值和最大值:
library(dplyr)
# Keep only the single smallest and single largest value for every
# variable/NameNum combination. which.min()/which.max() skip NA values and
# return the first match on ties; unique() avoids duplicating the row when a
# group's minimum and maximum are the same observation (e.g. one-row groups).
# ungroup() so the result does not carry the grouping into later steps.
dat2 <- dfplot.m %>%
  group_by(variable, NameNum) %>%
  slice(unique(c(which.min(value), which.max(value)))) %>%
  ungroup()
然后使用geom_point绘制这些点:
library(ggplot2)
# outlier.shape = NA hides the boxplot's own outlier points, so the only
# points left are the per-group minimum and maximum drawn from dat2.
ggplot(data = dfplot.m, aes(x = variable, y = value)) +
  geom_boxplot(aes(fill = as.factor(NameNum)), outlier.shape = NA) +
  # fill is mapped here as well so the points are split by NameNum and
  # dodged alongside the matching box.
  geom_point(
    data = dat2,
    aes(fill = as.factor(NameNum)),
    position = position_dodge(0.75)
  )
答案 1 :(得分:1)
如果您不希望须线(whiskers,即箱体两端延伸出的线条)比正常情况更长,请使用:
# library() fails loudly if a package is missing; require() would only
# return FALSE and let the script break later.
library(data.table)
library(ggplot2)

dat <- as.data.table(mtcars)

# Two rows per cyl group: the minimum and the maximum wt.
ext <- dat[, .(value = range(wt)), by = cyl]

# Treat cyl as a discrete axis in both the full data and the extremes.
ext$cyl <- as.factor(ext$cyl)
dat$cyl <- as.factor(dat$cyl)

# Hide the boxplot's own outliers (outlier.shape = NA) and draw only the
# per-group extremes on top; the whiskers keep their normal length.
ggplot(dat, aes(cyl, wt)) +
  geom_boxplot(outlier.shape = NA) +
  geom_point(data = ext, aes(x = cyl, y = value))
这样得到的图形是:
而不是:
# Baseline for comparison: the default boxplot of wt by cyl.
ggplot(data = dat, mapping = aes(x = cyl, y = wt)) +
  geom_boxplot()