我有一个包含 70 万行、5 列的 data.frame。我先按第一列取子集,然后想根据第二列把该子集切分成 100 个区间。对于每一个区间,我想计算 dat.percent(第五列)的平均值,并把这些按区间得到的平均值绘制出来。
> head(data)
X1 X2 X3 X4 dat.percent
1 1 69270 NA NA 57.32338
2 1 69351 NA NA 61.68868
3 1 69428 NA NA 57.03619
4 1 69511 NA NA 52.78576
5 1 69552 NA NA 57.66801
6 1 69590 NA NA 44.39977
> dput(head(data))
structure(list(X1 = c(1L, 1L, 1L, 1L, 1L, 1L), X2 = c(69270,
69351, 69428, 69511, 69552, 69590), X3 = c(NA, NA, NA, NA, NA,
NA), X4 = c(NA, NA, NA, NA, NA, NA), dat.percent = c(57.323377369328,
61.6886846639862, 57.0361860999426, 52.7857553130385, 57.6680068925905,
44.3997702469845)), .Names = c("X1", "X2", "X3", "X4", "dat.percent"
), row.names = c(NA, 6L), class = "data.frame")
我没能让它正常运行,下面是我目前的尝试:
# For every feature listed in `chr`, write one PNG that shows the mean of
# column 5 (dat.percent) inside each of 100 equal-width intervals of column 2.
#
# Reads `data` and `chr` from the enclosing environment and writes
# ./plots/Feature<i>.png for each index i of `chr`.

# png() cannot create missing directories, so make sure the target exists.
dir.create("./plots", showWarnings = FALSE)

for (i in seq_along(chr)) {
  data.subset <- data[data[, 1] %in% chr[i], ]

  # cut() cannot derive break points from an empty vector; skip such features.
  if (nrow(data.subset) == 0) {
    next
  }

  data.cuts <- cut(data.subset[, 2], 100)

  # One summary value per interval. Intervals that contain no rows come back
  # as NA and are simply not drawn by plot().
  mean.pos <- as.numeric(tapply(data.subset[, 2], data.cuts, mean))
  mean.pct <- as.numeric(tapply(data.subset[, 5], data.cuts, mean))

  png(paste0("./plots/Feature", i, ".png"))
  # A single plot() call draws all interval means on one page; calling plot()
  # once per interval would start a new page each time and leave only the
  # last point in the file.
  plot(
    mean.pos, mean.pct,
    xlim = range(data.subset[, 2], na.rm = TRUE),
    xlab = "Genome",
    ylab = "Percent",
    main = paste0("Feature ", i)
  )
  # Close the device inside the loop so every feature gets its own file.
  dev.off()
}
答案 0 :(得分:1)
可以试试用 plyr 和 ggplot2:
# Average dat.percent within intervals of X2 for each X1 group and plot the
# interval means with ggplot2.

# library() stops with an error when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(plyr)
library(ggplot2)

# Simulated stand-in for the real data: three X1 groups of 10000 rows each.
data <- data.frame(
  X1 = rep(1:3, each = 10000),
  X2 = sample(600000:700000, 30000),
  X3 = NA,
  X4 = NA,
  dat.percent = runif(30000) * 100
)

# One row per (X1, X2 interval) combination holding the mean of dat.percent.
data.plot <- ddply(
  data,
  .(X1, cut = cut(X2, 10)),
  summarise,
  mean = mean(dat.percent)
)

# Inspect the summary only after it has been created.
head(data.plot)

ggplot(data.plot) +
  geom_point(aes(cut, mean, color = factor(X1)), size = 10, alpha = 0.5) +
  geom_line(aes(cut, mean, group = factor(X1)), alpha = 0.5) +
  # Rotate the interval labels so they do not overlap on the x axis.
  theme(axis.text.x = element_text(angle = -90))