我有一张记录维恩图各区域计数的表格,想把它读入R并解析,以便用VennDiagram包绘图。我的表格如下所示:
H3K27AC.bed H3K4ME3.bed gencode.bed Total Name
X 19184 gencode.bed
X 6843 H3K4ME3.bed
X X 3942 H3K4ME3.bed|gencode.bed
X 5097 H3K27AC.bed
X X 1262 H3K27AC.bed|gencode.bed
X X 4208 H3K27AC.bed|H3K4ME3.bed
X X X 9222 H3K27AC.bed|H3K4ME3.bed|gencode.bed
我可以像这样把表格读入为数据框:
> venn_table_df<-read.table(venn_table_file,header = TRUE,sep = "\t",stringsAsFactors = FALSE)
> venn_table_df
H3K27AC.bed H3K4ME3.bed gencode.bed Total Name
1 X 19184 gencode.bed
2 X 6843 H3K4ME3.bed
3 X X 3942 H3K4ME3.bed|gencode.bed
4 X 5097 H3K27AC.bed
5 X X 1262 H3K27AC.bed|gencode.bed
6 X X 4208 H3K27AC.bed|H3K4ME3.bed
7 X X X 9222 H3K27AC.bed|H3K4ME3.bed|gencode.bed
我可以从表格中获取维恩图的类别
> venn_categories<-colnames(venn_table_df)[!colnames(venn_table_df) %in% c("Total","Name")]
> venn_categories
[1] "H3K27AC.bed" "H3K4ME3.bed" "gencode.bed"
我甚至可以制作一个更容易阅读的汇总表,如下所示:
> venn_summary<-venn_table_df[!colnames(venn_table_df) %in% venn_categories]
> venn_summary
Total Name
1 19184 gencode.bed
2 6843 H3K4ME3.bed
3 3942 H3K4ME3.bed|gencode.bed
4 5097 H3K27AC.bed
5 1262 H3K27AC.bed|gencode.bed
6 4208 H3K27AC.bed|H3K4ME3.bed
7 9222 H3K27AC.bed|H3K4ME3.bed|gencode.bed
但令我感到困惑的是,如何从表中取出这些值并把它们正确地分配给维恩图的各个区域。作为参考,三重维恩图函数(draw.triple.venn)的调用如下所示:
# Hand-copied counts for each exclusive region of the three-set diagram
# (set 1 = H3K27AC.bed, set 2 = H3K4ME3.bed, set 3 = gencode.bed).
n1 <- 5097    # set 1 only
n2 <- 6843    # set 2 only
n3 <- 19184   # set 3 only
n12 <- 4208   # sets 1 and 2, not 3
n13 <- 1262   # sets 1 and 3, not 2
n23 <- 3942   # sets 2 and 3, not 1
n123 <- 9222  # all three sets

# Each area/overlap argument is built by adding up the exclusive regions
# that it contains.
venn <- draw.triple.venn(
  area1 = n1 + n12 + n13 + n123,
  area2 = n2 + n23 + n12 + n123,
  area3 = n3 + n23 + n13 + n123,
  n12 = n12 + n123,
  n13 = n13 + n123,
  n23 = n23 + n123,
  n123 = n123,
  category = venn_categories,
  fill = c('red', 'blue', 'green'),
  alpha = rep(0.3, 3)
)
但显然这需要手动设置各个值,这并不可取,因为我有很多这样的数据集,而且还需要扩展到四路和五路维恩图。如何让R为维恩图中的每个区域找到正确的值?我用`grep`、`grepl`尝试了多种不同的方法,按与图中各区域类别相匹配的行对数据框取子集,但这些方法都无法正常工作。有什么建议吗?顺便说一下,这些数据是HOMER软件包中mergePeaks程序的输出。
答案 0 :(得分:1)
我想我想通了,用正则表达式搜索表格中正确的条目。这是完整的工作流程:
# load packages
library('VennDiagram')
library('gridExtra')
# Import the tab-separated overlap table; the first row holds the column
# names and text columns stay as character vectors.
venn_table_df <- read.table(
  file = venn_table_file,
  header = TRUE,
  sep = "\t",
  stringsAsFactors = FALSE
)
# Show the imported data frame.
venn_table_df
看起来像这样:
> venn_table_df
H3K27AC.bed H3K4ME3.bed gencode.bed Total Name
1 X 19184 gencode.bed
2 X 6843 H3K4ME3.bed
3 X X 3942 H3K4ME3.bed|gencode.bed
4 X 5097 H3K27AC.bed
5 X X 1262 H3K27AC.bed|gencode.bed
6 X X 4208 H3K27AC.bed|H3K4ME3.bed
7 X X X 9222 H3K27AC.bed|H3K4ME3.bed|gencode.bed
> # recreate it with this btw
> dput(venn_table_df)
structure(list(H3K27AC.bed = c("", "", "", "X", "X", "X", "X"
), H3K4ME3.bed = c("", "X", "X", "", "", "X", "X"), gencode.bed = c("X",
"", "X", "", "X", "", "X"), Total = c(19184L, 6843L, 3942L, 5097L,
1262L, 4208L, 9222L), Name = c("gencode.bed", "H3K4ME3.bed",
"H3K4ME3.bed|gencode.bed", "H3K27AC.bed", "H3K27AC.bed|gencode.bed",
"H3K27AC.bed|H3K4ME3.bed", "H3K27AC.bed|H3K4ME3.bed|gencode.bed"
)), .Names = c("H3K27AC.bed", "H3K4ME3.bed", "gencode.bed", "Total",
"Name"), class = "data.frame", row.names = c(NA, -7L))
然后解析表
# get the venn categories: every column that is not the count or the label
venn_categories <- colnames(venn_table_df)[
  !colnames(venn_table_df) %in% c("Total", "Name")
]
# draw.triple.venn() below is fed exactly three sets; fail early if the
# table holds a different number of them
stopifnot(length(venn_categories) == 3L)

# make a summary table (only the Total and Name columns remain)
venn_summary <- venn_table_df[!colnames(venn_table_df) %in% venn_categories]
venn_summary

# Sum `Total` over every summary row whose `Name` lists ALL given categories.
#
# categories:    character vector of category names (one or more).
# summary_table: data frame with a `Name` column ("|"-separated category
#                names) and a numeric `Total` column.
# Returns the summed total (0 when no row matches).
#
# One lookahead is built per category and the lookaheads are concatenated,
# so a row matches only if it contains every category. Each name must sit
# between "|" delimiters (or the string boundaries); a plain substring search
# would also count rows for any longer category name that merely contains
# the shorter one. The "." in a name is left unescaped on purpose: the names
# come from read.table() column names, where "." can stand in for a character
# that was replaced to make the name syntactically valid.
overlap_total <- function(categories, summary_table = venn_summary) {
  pattern <- paste0(
    "(?=(?:^|.*\\|)", categories, "(?:\\||$))",
    collapse = ""
  )
  hits <- grepl(pattern = pattern, x = summary_table[["Name"]], perl = TRUE)
  sum(summary_table[["Total"]][hits])
}

# get the areas for the venn; add up all the overlaps that contain the
# given category (or combination of categories)
area_n1 <- overlap_total(venn_categories[1])
area_n2 <- overlap_total(venn_categories[2])
area_n3 <- overlap_total(venn_categories[3])
area_n12 <- overlap_total(venn_categories[c(1, 2)])
area_n13 <- overlap_total(venn_categories[c(1, 3)])
area_n23 <- overlap_total(venn_categories[c(2, 3)])
area_n123 <- overlap_total(venn_categories)

venn <- draw.triple.venn(
  area1 = area_n1,
  area2 = area_n2,
  area3 = area_n3,
  n12 = area_n12,
  n13 = area_n13,
  n23 = area_n23,
  n123 = area_n123,
  category = venn_categories,
  fill = c('red', 'blue', 'green'),
  alpha = rep(0.3, 3)
)
关键是使用正则表达式来获取包含该维恩图区域所有类别的表条目。这比我希望的要繁琐一些,并且需要手动调整才能适用于四路和五路维恩图,但到目前为止它能正常工作。我愿意接受其他可以简化流程并便于扩展的建议。
答案 1 :(得分:1)
如果有人觉得有用:现在有一个非常简单的方法,可以把这些数字转换成面积近似成比例的维恩图。nVennR软件包创建图表的方式之一,是直接由各区域的数值从头构建。如vignette中所述,各区域的值须按特定顺序输入,而该顺序恰好与表中的顺序相同。唯一的区别是nVennR要求在最前面多加一个值,对应于所有集合之外的区域(该值应为0,不过无论如何它都会被忽略)。这使整个过程非常简单:
> vt <- read.table('clipboard', header = T)
> vt
H3K27AC.bed H3K4ME3.bed gencode.bed Total Name
1 0 0 X 19184 gencode.bed
2 0 X 0 6843 H3K4ME3.bed
3 0 X X 3942 H3K4ME3.bed|gencode.bed
4 X 0 0 5097 H3K27AC.bed
5 X 0 X 1262 H3K27AC.bed|gencode.bed
6 X X 0 4208 H3K27AC.bed|H3K4ME3.bed
7 X X X 9222 H3K27AC.bed|H3K4ME3.bed|gencode.bed
> myV <- createVennObj(nSets = 3, sNames = c('H3K27Ac', 'H3K4ME3', 'gencode'), sSizes = c(0, vt$Total))
> vp <- plotVenn(nVennObj = myV)