我刚接触 R 编程,想创建一个表格,用来显示 8 个基因列表之间是否存在重叠的基因。下面是我目前写的代码:
# Read every gene-list file in the project directory into a named list of
# character vectors (one vector of gene symbols per file).
setwd("~/Desktop/R_Project/Gene_overlap")
getwd()
# list.files() expects a regular expression, not a shell glob. "*.txt" is
# not anchored and can match names that merely contain "txt", so match the
# ".txt" extension explicitly.
files <- list.files(pattern = "\\.txt$", full.names = TRUE)
files
data.list <- lapply(files, function(gene_file) {
  # scan() splits on whitespace, so each gene symbol becomes one element.
  scan(file = gene_file, what = character())
})
# Name each element after its file, without the directory or ".txt" suffix.
# Base sub() is used so this line works without magrittr/stringr attached.
names(data.list) <- sub("\\.txt$", "", basename(files))
str(data.list)
# List of 8
# $ GSE108363_BCGdown_D: chr [1:350] "IL1B" "IL6" "IL1A" "CCL20" ...
# $ GSE108363_BCGdown_V: chr [1:267] "IL6" "CCL20" "IL1A" "CXCL5" ...
# $ GSE108363_BCGup_D : chr [1:250] "FABP4" "CMTM2" "FUCA1" "CD36" ...
# $ GSE108363_BCGup_V : chr [1:429] "FCN1" "FCGR3B" "MNDA" "CPVL" ...
# $ GSE108363_MTBdown_D: chr [1:86] "CCL20" "IL1B" "IL1A" "IL6" ...
# $ GSE108363_MTBdown_V: chr [1:244] "IL1B" "IL1A" "CCL20" "IL6" ...
# $ GSE108363_MTBup_D : chr [1:128] "FUCA1" "FGL2" "TGFBI" "CPVL" ...
# $ GSE108363_MTBup_V : chr [1:286] "FABP4" "RNASE1" "MNDA" "CPVL" ...
# Number of genes shared by the BCG "down" discovery and validation lists.
# Written as nested calls so it does not depend on %>% being attached, and
# with [[ ]] so the list names are matched exactly (no partial matching).
length(intersect(
  data.list[["GSE108363_BCGdown_D"]],
  data.list[["GSE108363_BCGdown_V"]]
))
# Size of every gene list. lengths() always returns an integer vector,
# whereas sapply() changes its return type when the input is empty.
lengths(data.list)
# Build the Up x Down overlap table from the gene lists that were actually
# read into data.list.
#
# The earlier draft could not work because:
#   * sample(letters[1:429], ) draws from `letters`, which has only 26
#     elements, so the vectors were mostly NA and unrelated to the real data;
#   * get(paste0("", i, "_Up")) looks for objects called "1_Up", "2_Up", ...
#     which do not exist;
#   * two of the column labels said "Up" although the columns hold the
#     "Down" sets.
# No random sampling is needed any more, so set.seed() is gone as well.
#
# The label-to-file mapping follows the list sizes (e.g. BCG validation Up
# has 429 genes, matching GSE108363_BCGup_V).
up_sets <- list(
  BCG_validation_Up = data.list[["GSE108363_BCGup_V"]],
  BCG_discovery_Up  = data.list[["GSE108363_BCGup_D"]],
  MTB_validation_Up = data.list[["GSE108363_MTBup_V"]],
  MTB_discovery_Up  = data.list[["GSE108363_MTBup_D"]]
)
down_sets <- list(
  BCG_validation_Down = data.list[["GSE108363_BCGdown_V"]],
  BCG_discovery_Down  = data.list[["GSE108363_BCGdown_D"]],
  MTB_validation_Down = data.list[["GSE108363_MTBdown_V"]],
  MTB_discovery_Down  = data.list[["GSE108363_MTBdown_D"]]
)

# Rows are the "Up" sets, columns are the "Down" sets; the matrix is
# preallocated with its labels so cells can be filled by position.
cross_table <- matrix(
  0L,
  nrow = length(up_sets),
  ncol = length(down_sets),
  dimnames = list(names(up_sets), names(down_sets))
)
for (i in seq_along(up_sets)) {
  for (j in seq_along(down_sets)) {
    # Cell [i, j] is the number of distinct genes present in both sets.
    cross_table[i, j] <- length(intersect(up_sets[[i]], down_sets[[j]]))
  }
}
cross_table
如何更改它以成功找到重叠?
答案 0 :(得分:0)
未测试...
library(dplyr) # provides the %>% pipe
# Count, for each gene, how many of the lists contain it. Each list is
# de-duplicated first so a gene repeated inside one file is counted once.
tgenes <- data.list %>%
  lapply(unique) %>%
  unlist(use.names = FALSE) %>%
  table()
# Genes present in every list: compare against the actual number of lists
# instead of a hard-coded 8.
names(tgenes[tgenes == length(data.list)])
答案 1 :(得分:0)
我想这就是你想要的:
nparray
要统计每一对列表之间共有的基因数量(得到一个成对重叠矩阵),请使用:
# Pairwise overlap matrix: the outer vapply() produces one column per list,
# the inner vapply() one row per list. length(intersect()) counts distinct
# shared genes, so a gene repeated inside one list is not counted twice and
# the matrix is symmetric. vapply() pins the result type to integer.
vapply(
  data.list,
  function(x) {
    vapply(data.list, function(y) length(intersect(x, y)), integer(1))
  },
  integer(length(data.list))
)
或者,如果您要查找所有列表共有的基因(对列表的任意子集也可以做同样的操作),请使用:
# Fold intersect() across the gene lists from left to right; what survives
# is the set of genes present in every list.
Reduce(function(shared, genes) intersect(shared, genes), data.list)
我刚刚意识到您可能不想将列表中的所有项目与所有其他项目进行比较。在这种情况下,可以只对感兴趣的那几个列表求交集(见下面的代码)。另外,在上面嵌套的 sapply() 调用中,第一个(外层)sapply() 对应结果矩阵的列,第二个(内层)sapply() 对应行。
# Positions (within data.list) of the gene lists to compare.
index_of_interest <- c(2, 5, 7)
# Genes shared by every selected list: fold intersect() over that subset.
Reduce(
  function(shared, genes) intersect(shared, genes),
  data.list[index_of_interest]
)
答案 2 :(得分:0)
我想出了办法。查看所有不同基因集之间可能重叠的代码如下:
# seq_along(gene.lists) # 1 2 3 4 5 6 7 8
# NOTE(review): gene.lists is not defined in this snippet; presumably it is
# the named list of gene vectors (data.list in the question) - confirm.
#
# Create the result matrix up front: assigning into mx.overlap.count[j, i]
# fails if the object does not exist yet. Row and column labels come from
# the list names.
mx.overlap.count <- matrix(
  0L,
  nrow = length(gene.lists),
  ncol = length(gene.lists),
  dimnames = list(names(gene.lists), names(gene.lists))
)
for (i in seq_along(gene.lists)) {
  g1 <- gene.lists[[i]]
  for (j in seq_along(gene.lists)) {
    g2 <- gene.lists[[j]]
    # Cell [j, i] holds the number of distinct genes shared by lists i and j.
    mx.overlap.count[j, i] <- length(intersect(g1, g2))
  }
}
mx.overlap.count
# View() opens a spreadsheet-style viewer, so only call it in an
# interactive session.
if (interactive()) {
  View(mx.overlap.count)
}