给定以下数据(完整数据见:http://pastebin.com/raw.php?i=6NTcnLj7):
Probes Gene.symbol ImmGen FOO_YJ_06.ip FOO_MI_06.ip FOO_NL_06.id FOO_YJ_06.id FOO_MI_06.id BAR_NN_06.ip BAR_PR_06.ip BAR_YJ_06.ip BAR_MI_06.ip BAR_NL_06.id BAR_YJ_06.id BAR_MI_06.id BAR_NN_24.ip BAR_PR_24.ip BAR_YJ_24.ip BAR_MI_24.ip BAR_NN_06.ip BAR_NN_24.ip BAR_PR_06.ip BAR_PR_24.ip BAR_YJ_06.ip BAR_YJ_24.ip BAR_MI_06.ip BAR_MI_24.ip BAR_NL_06.id BAR_YJ_06.id BAR_MI_06.id TXT_NL_06.id TXT_YJ_06.ip TXT_MI_06.ip TXT_YJ_06.id TXT_MI_06.id XXX_YJ_06.ip XXX_MI_06.ip XXX_NL_06.id XXX_YJ_06.id XXX_MI_06.id KTH_NL_06.id KTH_YJ_06.ip KTH_MI_06.ip K3_YJ_06.id K3_MI_06.id UUU_YJ_06.in UUU_MI_06.in DAR_NL_06.id DAR_YJ_06.id DAR_MI_06.id
1425352_at Rcor3 StromalCells(12.99),DendriticCells(12.18),StemCells(11.43),NKCells(10.50),Macrophages(10.11),abTcells(9.11),Neutrophils(8.72),Monocytes(8.63),Bcells(8.61),gdTCells(7.71) 1.162 0.795 0.695 0.701 1.085 1.052 1.544 0.75 1.305 1.213 1.142 0.814 0.79 0.89 1.691 1.013 1.052 0.79 1.544 0.89 0.75 1.691 1.305 1.013 1.213 1.142 0.814 1.556 0.744 1.22 1.239 1.164 0.827 1.203 0.778 0.929 0.95 0 0.877 0.906 1.294 0.904 0 1.2 0.927 0.704 1.181
1417466_at Rgs5 StromalCells(72.03),Neutrophils(3.39),DendriticCells(3.31),NKCells(3.28),Monocytes(3.25),Macrophages(3.15),gdTCells(3.01),abTcells(2.99),Bcells(2.80),StemCells(2.80) 1.149 0.904 1.225 0.821 1.075 0.947 0.969 1.262 0.868 1.013 0.984 0.938 0.925 1.11 1.36 1.014 0.947 0.925 0.969 1.11 1.262 1.36 0.868 1.014 1.013 0.984 0.938 0.877 0.887 1.035 1.226 0.979 1.142 1.126 0.933 0.854 1.033 0.911 1.255 1.038 1.125 1.086 1.18 0.958 1.115 1.017 1.061
我得到了下面这张热图(此处只显示其末尾部分)。注意,图中多出了一些不需要的重复列(用红框标出)。
例如,`BAR_YJ_06.ip` 在上面的数据中只出现一次,但在图中却出现了两次:`BAR_YJ_06.ip` 和 `BAR_YJ_06.ip.1`。
为什么会这样?我该如何删除这些重复列?
这是我用来生成上图的代码:
#!/usr/bin/env Rscript
library(gplots);
library(RColorBrewer);
#' Cluster a fold-change table and write a bi-clustered heatmap to a PDF.
#'
#' Reads a tab-separated table with the annotation columns `Probes`,
#' `Gene.symbol` and `ImmGen`, uses the remaining columns as the value
#' matrix, keeps the rows in which at least one value is >= 2.0, clusters
#' them (complete linkage on maximum distance) and draws the result with
#' `gplots::heatmap.2()` into "myheatmap.pdf" in the working directory.
#'
#' @param inputfile Path or URL handed to `read.table()`.
#' @param clust.height Height at which the row dendrogram is cut with
#'   `cutree()`; the resulting groups colour the row side bar.
#' @param type.order Currently unused; kept so existing calls keep working.
#' @param row.margins Currently unused (the margins are fixed at
#'   `c(15, 200)`); kept so existing calls keep working.
#' @param drop.dup.cols If `TRUE`, value columns whose contents are identical
#'   to an earlier column are removed before clustering. `read.table()`
#'   renames repeated header names to `name.1`, `name.2`, ..., so repeated
#'   samples otherwise show up as extra heatmap columns. Defaults to `FALSE`,
#'   which keeps every column.
#' @return The value of `dev.off()` after the PDF device has been closed.
plot_hclust <- function(inputfile, clust.height, type.order = c(),
                        row.margins = 30, drop.dup.cols = FALSE) {
  annot.cols <- c("Probes", "Gene.symbol", "ImmGen")

  dat.bcd <- read.table(inputfile, na.strings = NA, sep = "\t", header = TRUE)

  # Row labels are the three annotation columns joined by a space; the
  # annotation columns themselves are then dropped from the value table.
  rownames(dat.bcd) <- do.call(paste, c(dat.bcd[annot.cols], sep = " "))
  dat.bcd <- dat.bcd[, !names(dat.bcd) %in% annot.cols, drop = FALSE]

  if (drop.dup.cols) {
    # Compare column contents, not names: the names were already made unique
    # by read.table(check.names = TRUE).
    dat.bcd <- dat.bcd[, !duplicated(as.list(dat.bcd)), drop = FALSE]
  }

  # Clustering and distance functions
  hclustfunc <- function(x) hclust(x, method = "complete")
  distfunc <- function(x) dist(x, method = "maximum")

  # Select based on FC: keep a row as long as any value is >= anylim.
  # na.rm = TRUE makes a row with missing values count only its observed
  # values instead of producing an NA row index.
  anylim <- 2.0
  mat.bcd <- as.matrix(dat.bcd)
  keep <- rowSums(mat.bcd >= anylim, na.rm = TRUE) > 0
  mat.bcd <- mat.bcd[keep, , drop = FALSE]
  if (nrow(mat.bcd) < 2) {
    stop("fewer than two rows have a value >= ", anylim,
         "; nothing to cluster", call. = FALSE)
  }

  # Define output file name
  heatout <- "myheatmap.pdf"
  print(heatout)

  # Compute distances and the row clustering
  d.bcd <- distfunc(mat.bcd)
  fit.bcd <- hclustfunc(d.bcd)

  # Cut the row tree by height; one side-bar colour per resulting cluster
  clusters <- cutree(fit.bcd, h = clust.height)
  nofclust.height <- length(unique(as.vector(clusters)))

  # Define colors
  hmcols <- rev(redgreen(2750))
  selcol2 <- colorRampPalette(brewer.pal(9, "Set1"))
  clustcol.height <- selcol2(nofclust.height)

  # Plot heatmap. The guard closes the PDF device if plotting fails, so an
  # error does not leave a half-written file and an open device behind.
  pdf(file = heatout, width = 50, height = 80)
  pdf.dev <- dev.cur()
  pdf.open <- TRUE
  on.exit(if (pdf.open) dev.off(pdf.dev), add = TRUE)

  par(xaxs = "i")

  # We do bi-clustering (rows and columns)
  heatmap.2(mat.bcd, trace = "none", dendrogram = "both", Colv = TRUE,
            scale = "row",
            hclustfun = hclustfunc, distfun = distfunc, col = hmcols,
            symbreaks = TRUE,
            margins = c(15, 200), keysize = 0.5,
            labRow = rownames(mat.bcd),
            lwid = c(2, 0.1, 4), lhei = c(0.05, 3),
            lmat = rbind(c(5, 0, 4), c(3, 1, 2)),
            RowSideColors = clustcol.height[clusters])

  pdf.open <- FALSE
  dev.off(pdf.dev)
}
# Draw the heatmap for the pastebin data set, cutting the row tree at height 3
plot_hclust(inputfile = "http://pastebin.com/raw.php?i=6NTcnLj7", clust.height = 3)
答案 0 :(得分:1)
这不是 `heatmap.2` 的问题。所有这些重复的样本都已经存在于源数据框中。您应该检查您的工作流程,并修复把重复项引入数据的那一步。
另一种临时(ad hoc)解决方案是在绘制热图之前,从数据框中删除所有重复的列:
# Read the table; the first line holds the column names
data <- read.table(file = "http://pastebin.com/raw.php?i=6NTcnLj7", header = TRUE)
# obtain the logical vector (TRUE/FALSE), where TRUE == column whose contents
# repeat an earlier column. Comparing the columns as list elements keeps the
# values exact, whereas t(data) would first turn the mixed-type data frame
# into a matrix of formatted strings.
ind <- duplicated(as.list(data))
# retain only the unique columns
# ! == inverts the logical vector, so TRUE == unique elements
# drop = FALSE keeps a data frame even if a single column remains
subset <- data[, !ind, drop = FALSE]