我正在阅读一个包含26列和4288个实例的数据集“Dummy_data.csv”,其中总共有17个参数(列)对我们的数据分析非常重要。 17个参数中的6个,即(param1,param2,param3,param5,param6,param7)是同时超出范围的关键参数,确定该项目是否有缺陷(类别标签)。例如,
range1 = (min1, max1) = (0.25, 0.35)
range2 = (min2, max2) = (2.5, 3.1)
range3 = (min3, max3) = (680, 700)
range5 = (min5, max5) = (56, 64)
range6 = (min6, max6) = (40, 60)
range7 = (min7, max7) = (28, 38)
if (param1 out of range1 & param2 out of range2 & param3 out of range3 &
param5 out of range5 & param6 out of range6 & param7 out of range7)
class = 'defective'
else
class = 'ok'
我们需要对上述数据进行两次缺陷分析。首先,我需要从项目总数中找出缺陷的%份额。其次,我需要找出6个关键参数中每一个的超出范围值的频率直方图,以便了解这些关键参数的哪些超出范围值对缺陷项目的贡献更大。
我做了什么:
由于这6个关键参数的范围大部分是非重叠的,因此我首先使用(x - min(x))/(max(x)-min(x))
缩放17个参数(尽管按比例缩放6个关键参数就足够了!),以便(0, 1)
的间隔,以便我可以在x轴的均匀比例上为6个参数进行超出范围值的频率分布。在图形术语中,这意味着小于0的参数值意味着小于最小值,而大于1的参数值意味着大于最大值。因此,我从z
数据框中的数据集中过滤了所有有缺陷的实例,并绘制饼图以显示有缺陷和有缺陷的项目百分比。 (第一次分析)
对于频率直方图(第二次分析),我将所有有缺陷的实例从缩放的数据集scaled.dat.df
过滤到defect.dat.df
。然后,我从所有6个参数中选择min
和max
以确定缺陷间隔范围。接下来,我将6个参数中的每个参数的唯一值合并到p1.bin.defect.dat.df
到p7.bin.defect.dat.df
,并在同一个图上使用plot
函数绘制单个直方图。
多个重叠直方图的问题
我得到如下所示的多直方图,但问题是条的宽度对于6个参数是不同的。 是否有人知道如何为多直方图绘制均匀条宽?另外,如何在此多组图中添加合适的图例?
任何有用的建议/答案都将受到高度赞赏并得到相应的奖励。
注意:我在这里的多个直方图上跟踪了另一个线程 how-to-plot-two-histograms-together-in-r ,并且想要一个非常类似于此的多组图,但是6个重叠的直方图而不是2个重叠hist plot(如在帖子中)
library(RWeka)
library(party)
library(plyr)
library(plotrix)
library(sm)
#read data and class labels
dat <- read.csv("Dummy_data.csv", head=T, sep=",")
datm <- as.matrix(dat[,8:24])
class <- as.matrix(dat[,26])
#center and scale data
center <- c(0.25, 2.5, 680, 1067, 56, 40, 28, -99, -99, 40, 5, 50, 5000, 15000, 11.3, 9.1, 0)
scale <- c(0.1, 0.6, 20, 6, 8, 20, 10, 19, 19, 20, 2, 10, 500, 1000, 3.4, 18.3, 5)
scaled.datm <- scale(datm, center, scale)
write.table(scaled.datm,
file = "C:\\Users\\schakrabarti\\Documents\\Dummy_data_whdr17.csv",
append=FALSE, quote=TRUE, sep=",", eol = "\n", na = "NA", dec = ".",
row.names = FALSE, col.names = TRUE, qmethod = c("escape", "double"),
fileEncoding = "")
#filter total non-compliants
scaled.dat.df <- as.data.frame(scaled.datm)
total <- length(scaled.dat.df[,1])
z <- c((scaled.dat.df[,"PARAM1"]<0 | scaled.dat.df[,"PARAM1"]>1) &
(scaled.dat.df[,"PARAM2"]<0 | scaled.dat.df[,"PARAM2"]>1) &
(scaled.dat.df[,"PARAM3"]<0 | scaled.dat.df[,"PARAM3"]>1) &
(scaled.dat.df[,"PARAM5"]<0 | scaled.dat.df[,"PARAM5"]>1) &
(scaled.dat.df[,"PARAM6"]<0 | scaled.dat.df[,"PARAM6"]>1) &
(scaled.dat.df[,"PARAM7"]<0 | scaled.dat.df[,"PARAM7"]>1) )
noncompliant <- length(z[z == TRUE])
slices <- c(noncompliant, total - noncompliant)
labls <- c("NOT OK","OK")
pct <- round(slices/sum(slices)*100, digits=2)
labls <- paste(labls, pct)
labls <- paste(labls, "%", sep="")
#pie3D(slices,labels=labls,explode=0.05, col=c(rgb(0.75,0,0.5),rgb(0,1,0.75)),main="Defect Analysis due to critical parameters")
pie(slices,labels=labls,main="Defect Analysis due to critical parameters")
#filter non-compliants due to individual params
defect.dat.df <- scaled.dat.df[z,]
#select defect interval range
min <- min(as.numeric(sapply(defect.dat.df[,c("PARAM1","PARAM2","PARAM3","PARAM5","PARAM6","PARAM7")], function(x) min(as.numeric(x)))))
max <- max(as.numeric(sapply(defect.dat.df[,c("PARAM1","PARAM2","PARAM3","PARAM5","PARAM6","PARAM7")], function(x) max(as.numeric(x)))))
#plot histogram for param1 defect
#p1.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM1")], breaks=seq(-0.4,0.2,by=0.2))
p1.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM1")], breaks=seq(min,max,by=0.2))
#h1 <- hist(defect.dat.df[,c("PARAM1")])
#plot(h1, col=rgb(1,0,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h1 <- hist(defect.dat.df[,c("PARAM1")], col=rgb(1,0,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
#h1 <- hist(defect.dat.df[,c("PARAM1")], col=rgb(1,0,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max))
box()
p2.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM2")], breaks=seq(min,max,by=0.2))
#h2 <- hist(defect.dat.df[,c("PARAM2")])
#plot(h2, col=rgb(0,0,1,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h2 <- hist(defect.dat.df[,c("PARAM2")], col=rgb(0,0,1,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p2.bin.defect.dat.df$breaks, n=1),tail(p2.bin.defect.dat.df$breaks, n=1)), add=T)
#h2 <- hist(defect.dat.df[,c("PARAM2")], col=rgb(0,0,1,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p3.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM3")], breaks=seq(min,max,by=0.2))
#h3 <- hist(defect.dat.df[,c("PARAM3")])
#plot(h3, col=rgb(0,1,0,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h3 <- hist(defect.dat.df[,c("PARAM3")], col=rgb(0,1,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p3.bin.defect.dat.df$breaks, n=1),tail(p3.bin.defect.dat.df$breaks, n=1)), add=T)
#h3 <- hist(defect.dat.df[,c("PARAM3")], col=rgb(0,1,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p5.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM5")], breaks=seq(min,max,by=0.2))
#h5 <- hist(defect.dat.df[,c("PARAM5")])
#plot(h5, col=rgb(0.5,0.5,0,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h5 <- hist(defect.dat.df[,c("PARAM5")], col=rgb(0.5,0,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p5.bin.defect.dat.df$breaks, n=1),tail(p5.bin.defect.dat.df$breaks, n=1)), add=T)
#h5 <- hist(defect.dat.df[,c("PARAM5")], col=rgb(0.5,0,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p6.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM6")], breaks=seq(min,max,by=0.2))
#h6 <- hist(defect.dat.df[,c("PARAM6")])
#plot(h6, col=rgb(0,0.5,0.5,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h6 <- hist(defect.dat.df[,c("PARAM6")], col=rgb(0,0.5,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p6.bin.defect.dat.df$breaks, n=1),tail(p6.bin.defect.dat.df$breaks, n=1)), add=T)
#h6 <- hist(defect.dat.df[,c("PARAM6")], col=rgb(0,0.5,0.5,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
p7.bin.defect.dat.df <- binning(defect.dat.df[,c("PARAM7")], breaks=seq(min,max,by=0.2))
#h7 <- hist(defect.dat.df[,c("PARAM7")])
#plot(h7, col=rgb(0.5,0,0.5,1/7), xlim=c(head(p1.bin.defect.dat.df$breaks, n=1),tail(p1.bin.defect.dat.df$breaks, n=1)))
h7 <- hist(defect.dat.df[,c("PARAM7")], col=rgb(0.5,0.5,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(head(p7.bin.defect.dat.df$breaks, n=1),tail(p7.bin.defect.dat.df$breaks, n=1)), add=T)
#h7 <- hist(defect.dat.df[,c("PARAM7")], col=rgb(0.5,0.5,0,1/7), xlab="Param Defect Intervals", main="Frequency of Parameter Defects", xlim=c(min,max), add=T)
box()
答案 0 :(得分:0)
实际上,为了让条形图具有相同的宽度,您希望在所有组中使用相同的间隔。我继续创建了一些示例数据。我没有把一堆不同的data.frames保存在列表中。
#sample data
set.seed(15)
observations <- lapply(1:6, function(x) rnorm(100*x))
这里我们有6个项目,每个项目具有来自标准正态分布的不同数量的随机抽取。现在我将整个范围打破并进行50次休息
maxrange <- range(sapply(observations, range))
breaks <- seq(maxrange[1], maxrange[2], length.out=50)
现在我将这些相同的中断应用于所有集合,我将计算直方图数据而不实际绘制它
hists <- lapply(bins, hist, breaks=breaks, plot=F)
为了帮助绘图,我需要预先计算ylim,以确保第一个绘图足够高,以便所有其他绘图的所有数据。我现在也会设置颜色。
ylim <- range(sapply(hists, function(x) {range(x$density)}))
cols<-list(
rgb(1,0,0,1/7), rgb(0,0,1,1/7), rgb(0,1,0,1/7),
rgb(0.5,0,0.5,1/7), rgb(0,0.5,0.5,1/7), rgb(0.5,0.5,0,1/7)
)
现在我们用我们想要的所有标签绘制第一个直方图,然后我们将所有其他标记绘制在顶部
plot(hists[[1]], ylim=ylim, col=cols[[1]], freq=F,
main="Combined Histogram", xlab="Observation")
invisible(mapply(function(x, c)
plot(x, ylim=ylim, col=c, freq=F, add=T),
hists[-1], cols[-1]))