Question

我有一个看起来像这样的数据框（这只是一个子集，实际上数据集有2724098行）

> head(dat)

chr   start  end    enhancer motif 
chr10 238000 238600 9_EnhA1  GATA6 
chr10 238000 238600 9_EnhA1  GATA4 
chr10 238000 238600 9_EnhA1    SRF 
chr10 238000 238600 9_EnhA1  MEF2A 
chr10 375200 375400 9_EnhA1  GATA6 
chr10 375200 375400 9_EnhA1  GATA4 
chr10 440400 441000 9_EnhA1  GATA6 
chr10 440400 441000 9_EnhA1  GATA4 
chr10 440400 441000 9_EnhA1    SRF 
chr10 440400 441000 9_EnhA1  MEF2A 
chr10 441600 442000 9_EnhA1    SRF 
chr10 441600 442000 9_EnhA1  MEF2A

我能够将我的数据集转换为这种格式，其中chr，start，end和enhancer组代表一个ID：

> dat

 id motif 
 1  GATA6 
 1  GATA4 
 1    SRF 
 1  MEF2A 
 2  GATA6 
 2  GATA4
 3  GATA6 
 3  GATA4 
 3    SRF 
 3  MEF2A 
 4    SRF 
 4  MEF2A

我想找到每个可能的图案对的计数，按id分组。 所以我想要一个像，

这样的输出表

motif1 motif2 count
 GATA6  GATA4     3
 GATA6    SRF     2
 GATA6  MEF2A     2
 ... and so on for each pair of motif

在实际数据集中，有1716个独特的图案。有83509个唯一身份证。

有关如何进行的任何建议？

Answer 1

更新：这是使用data.table的快速和内存效率版本：

第1步：构建尺寸的样本数据：

require(data.table) ## 1.9.4+
set.seed(1L)        ## For reproducibility
N = 2724098L
motif = sample(paste("motif", 1:1716, sep="_"), N, TRUE)
id = sample(83509, N, TRUE)
DT = data.table(id, motif)

第2步：预处理：

DT = unique(DT) ## IMPORTANT: not to have duplicate motifs within same id
setorder(DT)    ## IMPORTANT: motifs are ordered within id as well
setkey(DT, id)  ## reset key to 'id'. Motifs ordered within id from previous step
DT[, runlen := .I]

第3步：解决方案：

ans = DT[DT, {
              tmp = runlen < i.runlen; 
              list(motif[tmp], i.motif[any(tmp)])
             }, 
      by=.EACHI][, .N, by="V1,V2"]

在最后一步3中，这需要约27秒和~1GB的内存。

我们的想法是执行自我加入，但要使用data.table的by=.EACHI功能，该功能会评估每个j-expression i 1}}，因此内存效率高。并且j-expression确保我们只获得条目＆＃34; motif_a，motif_b＆＃34;而不是多余的＆＃34; motif_b，motif_a＆＃34;。这也节省了计算时间和内存。二进制搜索非常快，即使有87K + ID。最后，我们通过主题组合进行聚合，以获得每个组合中的行数 - 这就是您所需要的。

HTH

PS：请参阅旧版（+慢版）的修订版。

Answer 2

这是一种从this question无耻地借用的稀疏矩阵技术。

# Create an id
dat$id <- as.factor(paste(dat$chr, dat$start, dat$end, dat$enhancer))

# Create the sparse matrix.
library(Matrix)
s <- sparseMatrix(
      as.numeric(dat$id), 
      as.numeric(dat$motif),
      dimnames = list(levels(dat$id),levels(dat$motif)),
  x = TRUE)

co.oc <- t(s) %*% s # Find co-occurrences.
tab <- summary(co.oc) # Create triplet representation.
tab <- tab[tab$i < tab$j,] # Extract upper triangle of matrix

data.frame(motif1 = levels(dat$motif)[tab$i],
           motif2 = levels(dat$motif)[tab$j],
           number = tab$x)

#    motif1 motif2 number
# 1  GATA4  GATA6      3
# 2  GATA4  MEF2A      2
# 3  GATA6  MEF2A      2
# 4  GATA4    SRF      2
# 5  GATA6    SRF      2
# 6  MEF2A    SRF      3

Answer 3

我认为data.table包可能是最有效的。我们可以计算每个组内的对，然后进行聚合。与首先计算所有对数相比，这是一种更有效的数据方式。

#Bring in data.table and convert data to data.table
require(data.table)
setDT(dat)

#Summarize by two-way pairs
summ <- dat[ , list(motifs=list(combn(unique(as.character(motif)),
   min(2,length(unique(as.character(motif)))), by=list(chr,start,end,enhancer)]

#Transpose and gather data into one table
motifs.table <- rbindlist(lapply(summ$motifs,function(x) data.table(t(x))))

#Summarize table with counts
motifs.table[ , .N, by=list(V1,V2)]

#       V1    V2 N
# 1: GATA6 GATA4 3
# 2: GATA6   SRF 2
# 3: GATA6 MEF2A 2
# 4: GATA4   SRF 2
# 5: GATA4 MEF2A 2
# 6:   SRF MEF2A 3

Answer 4

如果您可以将数据放入名为dat的SQL表中，则此查询应该起作用：

select d1.motif m1, d2.motif m2, count(*) count
from dat d1
join dat d2
on d1.chr = d2.chr
  and d1.start = d2.start
  and d1.end = d2.end
  and d1.enhancer = d2.enhancer
  and d1.motif <> d2.motif
group by d1.motif, d2.motif

鉴于您的数据大小，我怀疑R sqldf包可以处理它，但是通过免费的MySQL安装，您可以使用RODBC或RJDBC来进行R和SQL通话。

Answer 5

您可能会从正式建模数据语义中受益。如果基因组上有范围，请使用Bioconductor的GenomicRanges包。

library(GenomicRanges)
gr <- makeGRangesFromDataFrame(df, keep.extra.columns=TRUE)

这是一个GRanges对象，它正式理解基因组位置的概念，所以这些操作才有效：

hits <- findMatches(gr, gr)
tab <- table(motif1=gr$motif[queryHits(hits)],
             motif2=gr$motif[subjectHits(hits)])
subset(as.data.frame(tab, responseName="count"), motif1 != motif2)

Answer 6

...如果这不是你想要的，我就放弃了。显然它没有针对大型数据集进行优化。这只是一种利用R的自然优势的通用算法。有几种可能的改进，例如，使用dplyr和data.table。后者将大大加快此处的[和%in%操作。

motif_pairs <- combn(unique(dat$motif), 2)
colnames(motif_pairs) <- apply(motif_pairs, 2, paste, collapse = " ")
motif_pair_counts <- apply(motif_pairs, 2, function(motif_pair) {
  sum(daply(dat[dat$motif %in% motif_pair, ], .(id), function(dat_subset){
    all(motif_pair %in% dat_subset$motif)
  }))
})
motif_pair_counts <- as.data.frame(unname(cbind(t(motif_pairs), motif_pair_counts)))
names(motif_pair_counts) <- c("motif1", "motif2", "count")
motif_pair_counts

#   motif1 motif2 count
# 1  GATA6  GATA4     3
# 2  GATA6    SRF     2
# 3  GATA6  MEF2A     2
# 4  GATA4    SRF     2
# 5  GATA4  MEF2A     2
# 6    SRF  MEF2A     3

另一个旧版本。请确保您的问题清楚！

这正是plyr旨在实现的目标。试试dlply(dat, .(id), function(x) table(x$motif) )。

但请不要只是尝试复制并粘贴此解决方案，而不至少阅读文档。 plyr是一个非常强大的软件包，对您理解它非常有帮助。

旧帖回答错误的问题：

你在寻找不相交或重叠的对吗？

以下是使用包rollapply中的zoo函数的一种解决方案：

library(zoo)

motif_pairs <- rollapply(dat$motif, 2, c)              # get a matrix of pairs
motif_pairs <- apply(motif_pairs, 1, function(row) {   # for every row...
  paste0(sort(row), collapse = " ")                    #   sort the row, and concatenate it to a single string
                                                       #   (sorting ensures that pairs are not double-counted)
})
table(motif_pairs)                                     # since each pair is now represented by a unique string, just tabulate the string appearances

## if you want disjoint pairs, do `rollapply(dat$motif, 2, c, by = 2)` instead

如果这不是您需要的话，请查看rollapply的文档。对于其他变量的分组，您可以将其与以下之一组合：

基础R函数aggregate或by（不推荐）或
来自*ply的{{1}}函数（更好）

Answer 7

这个怎么样？：

res1<- split(dat$motif,dat$id)
res2<- lapply(res1,function(x) combn(x,2))
res3<- apply(do.call(cbind,res2),2,function(x) paste(x[1],x[2],sep="_"))

table(res3)

计算按多列分组的列中的每个可能值对

7 个答案: