我有两个实验室测试的大量测量(N~500.000)我想用geom_boxplot
绘制。这些测量是使用三种不同的分析仪创建的。
然而,这些测量中的许多都是极端异常值,这导致极大的尺度,这又导致盒子太小而无法真正读取或比较。
我创建了一个虚拟数据集,仅对这些测量中的250个进行采样以展示问题。数据如下:
dummy <- structure(list(result = c(3.93, 2.708, 2.08, 1.8422, 0.897, 1.68,
3.56, 2.8954, 0.972, 2.99, 0.567, 2.01, 2.5629, 0.7958, 4.81,
3.539, 2.24, 4.79, 2.07, 5.56, 3.4761, 1.74, 1.5691, 0.964, 1.8171,
0.005, 0.663, 2.16, 2.37, 0.0164, 1.25, 3.086, 0.769, 1.9573,
1.05, 2.17, 1.6331, 8.2358, 0.58, 1.43, 0.9328, 1.94, 2.59, 5.06,
0.0574, 1.61, 1.01, 8.21, 2.77, 0.9938, 20.38, 3.71, 1.6731,
0.0259, 0.9701, 0.0114, 0.499, 38.8, 4.8689, 3.02, 1.176, 2.86,
1.96, 5.03, 0.7564, 0.903, 2.0017, 0.4928, 1.3993, 4.02, 1.97,
10.6109, 1.18, 1.68, 230, 1.9764, 3.81, 3.3518, 0.985, 3.4, 16.1,
1.5889, 3.13, 2.0168, 1.82, 4.75, 2.61, 2.0133, 1.1971, 0.4736,
74.1, 0.737, 5.21, 1.6495, 1.4, 3.7408, 0.68, 2.26, 2.5, 2.16,
0.459, 0.0281, 5.34, 1.3, 4.11, 1.9344, 3.9611, 2.79, 1.72, 8.9041,
1.47, 2.61, 3.02, 1.91, 3.49, 1.0161, 1.9, 1.63, 1.31, 1.81,
2.556, 0.972, 4.9, 8.313, 1.55, 0.875, 1.4379, 3.68, 0.716, 2.76,
2.1897, 0.3121, 1.4376, 2.56, 0.89, 3.0298, 0.6003, 1.2542, 1.61,
0.491, 3.08, 1.45, 1.94, 2.1503, 2.6605, 23.5, 1.54, 3.54, 4.22,
2.31, 1.082, 1.45, 1.77, 0.423, 11.9, 2.77, 4.8894, 0.8142, 0.158,
9.2012, 1.96, 0.467, 0.4081, 1.06, 2, 3.05, 2.81, 0.2151, 2.21,
0.95, 2.3647, 0.357, 1.7284, 1.31, 0.9586, 1.4548, 4.51, 0.022,
2.2629, 39.9014, 1.3403, 6.64, 4.62, 1.27, 1.18, 1.1, 0.565,
0.939, 1.9141, 3.855, 0.455, 3.14, 1.09, 1.0475, 0.971, 2.33,
1.16, 4.6919, 100, 0.2412, 2.53, 1.84, 1.04, 2.35, 4.89, 1.6384,
3.33, 1.82, 0.8973, 0.6061, 0.98, 6.18, 0.4258, 2.5555, 1.67,
5.37, 2.29, 2.93, 5.0596, 2.2328, 2.84, 7.73, 1.8, 2.3978, 3.02,
2.71, 0.618, 0.0035, 3.97, 0.9827, 2.3385, 5.07, 0.306, 3.13,
2.62, 2.81, 4.4749, 1.0362, 1.4896, 2.3907, 1.45, 2.2823, 2.3726,
2.1746, 4.08, 2.98, 2.57, 0.947, 2.16, 1.46), testName = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 2L, 2L,
1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 1L,
2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 2L, 1L, 1L,
1L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 2L, 1L, 1L,
1L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Test A", "Test B"
), class = "factor"), accountName = structure(c(3L, 2L, 3L, 2L,
2L, 5L, 6L, 2L, 6L, 3L, 2L, 3L, 1L, 1L, 5L, 2L, 3L, 5L, 3L, 2L,
2L, 1L, 1L, 2L, 1L, 1L, 6L, 1L, 3L, 4L, 2L, 2L, 1L, 2L, 5L, 6L,
2L, 4L, 3L, 4L, 4L, 3L, 3L, 3L, 4L, 5L, 4L, 6L, 3L, 2L, 5L, 3L,
2L, 5L, 1L, 4L, 5L, 1L, 2L, 3L, 1L, 5L, 3L, 3L, 2L, 3L, 2L, 2L,
2L, 3L, 1L, 1L, 1L, 5L, 6L, 4L, 3L, 1L, 3L, 5L, 3L, 1L, 3L, 2L,
5L, 3L, 2L, 2L, 2L, 4L, 5L, 1L, 3L, 1L, 3L, 4L, 2L, 5L, 6L, 6L,
1L, 3L, 6L, 2L, 6L, 2L, 2L, 1L, 3L, 4L, 2L, 3L, 5L, 2L, 1L, 4L,
6L, 2L, 1L, 3L, 1L, 3L, 3L, 4L, 3L, 2L, 4L, 5L, 4L, 1L, 1L, 2L,
1L, 1L, 6L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 5L, 4L, 2L, 4L, 3L, 5L,
3L, 1L, 2L, 6L, 3L, 3L, 5L, 3L, 1L, 4L, 3L, 4L, 6L, 3L, 1L, 3L,
3L, 6L, 1L, 4L, 3L, 3L, 1L, 4L, 2L, 1L, 1L, 1L, 3L, 6L, 2L, 1L,
2L, 3L, 1L, 3L, 1L, 6L, 3L, 1L, 2L, 2L, 3L, 3L, 2L, 1L, 2L, 6L,
3L, 1L, 4L, 1L, 4L, 2L, 1L, 6L, 3L, 2L, 3L, 3L, 4L, 4L, 6L, 3L,
4L, 1L, 5L, 1L, 3L, 3L, 2L, 2L, 3L, 2L, 1L, 1L, 5L, 3L, 6L, 1L,
1L, 2L, 1L, 3L, 6L, 5L, 3L, 3L, 4L, 2L, 4L, 2L, 6L, 1L, 4L, 1L,
3L, 1L, 3L, 2L, 2L, 1L), .Label = c("Lab 1", "Lab 2", "Lab 3",
"Lab 4", "Lab 5", "Lab 6"), class = "factor"), moduleCode = structure(c(2L,
1L, 3L, 1L, 3L, 2L, 2L, 1L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 3L,
2L, 3L, 3L, 1L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 1L, 3L,
1L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 3L, 1L, 3L, 3L, 2L, 3L,
1L, 2L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 1L, 3L, 3L, 3L, 1L,
3L, 1L, 1L, 1L, 3L, 3L, 1L, 3L, 2L, 3L, 1L, 3L, 1L, 3L, 3L, 3L,
1L, 3L, 1L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 1L, 3L, 1L, 1L,
3L, 2L, 2L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 3L, 3L, 1L, 3L, 3L, 2L,
3L, 3L, 1L, 3L, 3L, 3L, 3L, 1L, 3L, 3L, 1L, 3L, 3L, 1L, 2L, 1L,
3L, 1L, 1L, 1L, 3L, 2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 2L, 1L, 1L,
3L, 2L, 2L, 3L, 3L, 1L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 3L, 1L, 2L,
3L, 1L, 3L, 3L, 2L, 3L, 1L, 3L, 3L, 1L, 1L, 1L, 3L, 1L, 1L, 3L,
2L, 1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 3L, 3L, 3L,
1L, 3L, 2L, 3L, 1L, 1L, 1L, 3L, 3L, 3L, 2L, 3L, 1L, 3L, 2L, 1L,
1L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 3L, 1L, 1L, 2L, 3L, 3L, 1L, 2L,
3L, 3L, 1L, 3L, 1L, 1L, 3L, 3L, 2L, 3L, 3L, 1L, 1L, 1L, 1L, 3L,
1L, 1L, 1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("X", "Y", "Z"
), class = "factor")), .Names = c("result", "testName", "accountName",
"moduleCode"), class = "data.frame", row.names = c(NA, -250L))
使用此代码
library(ggplot2)
boxplot <- ggplot(data=dummy, aes(x=accountName, y=result, fill=moduleCode)) +
geom_boxplot(position=position_dodge(width=0.85), outlier.alpha=0.2) +
facet_wrap(~testName, scales="free_y", nrow=2) +
labs(fill="Module") +
theme_bw() +
theme(axis.ticks.x=element_line(color="black")) +
theme(axis.ticks.y=element_line(color="black")) +
theme(axis.text.x=element_text(size=10, angle=55, hjust=1, vjust=0.975, color="black")) +
theme(axis.text.y=element_text(color="black")) +
theme(panel.background=element_blank()) +
theme(axis.title.x=element_blank()) +
theme(axis.title.y=element_blank()) +
theme(axis.line=element_line(colour="black", size=0.5, linetype ="solid")) +
theme(panel.grid.major.y=element_line(color="#bdbdbd", linetype="dotted")) +
theme(panel.grid.minor.y=element_line(color="#bdbdbd", linetype="dotted")) +
theme(plot.margin=unit(c(0.5, 0.5, 0.5, 0.7), "cm"))
print(boxplot)
我创建了这个情节
现在,问题是: 我如何独立地缩放小平面的y轴,并且手动 - 最好的情况,以便它一般地工作,因为我想编写一个可以处理两个随机测试的比较的脚本?
一般来说,我希望刻度的范围远小于完整数据的范围,即我想专注于每个盒子图的胡须的下限和上限的范围。方面独立。 那些可以使用
获得boxplot.stats(dummy$result)$stats[c(1, 5)]
我没有找scales='free_y'
个参数。此外,scale_y_continuous
和coord_cartesian
无法完成这项工作,因为根据我的理解,他们会为代表单项测试的所有方面设置全局范围,这可能会有很大差异。我尝试使用geom_blank
添加空图层,其中虚拟数据的范围受限于较低和较高的胡须极值。问题是,根据测试,实际数据中总会出现极端异常值,导致比例变得非常宽,从而忽略了插入geom_blank
的虚拟数据的范围。
我还尝试使用函数ggplot_build
创建图形对象后提取和操作比例:
graph_object <- ggplot_build(boxplot)
graph_object$layout$panel_scales$y[[1]]$range$range <- boxplot.stats(dummy[with(dummy, testName=="Test A"),]$result)$stats[c(1, 5)]
graph_object$layout$panel_scales$y[[2]]$range$range <- boxplot.stats(dummy[with(dummy, testName=="Test B"),]$result)$stats[c(1, 5)]
然后我尝试使用print(graph_object)
和graph_object$plot
绘制操纵对象,但是,尺度仍然是原始的&#39;更广泛的包括极端价值观。我错过了什么?
对于很长的帖子感到抱歉,我可能在试图解决这个问题的路上迷路了,忽略了解决方案。因此,我很高兴看到一些想法,甚至是对此的正确解决方案。
非常感谢!