Question

我有一个数据集，其中包含许多样本的多项测试结果。数据集中的样本带有重复（同一个 SampleID 对应多行）。我想在每组重复样本内部比较各次重复的测试结果。我认为最简单的做法是先按 SampleID 拆分数据框，得到一个数据框列表，每个 SampleID 对应一个数据框。每个样本可能有 2、3、4 甚至 5 次重复，因此每个样本组中需要比较的行的两两组合数量各不相同。我的思路如下：对这个数据框列表逐个运行一个函数并输出比较结果。该函数会比较每组重复样本中所有不重复的两行组合，对每项测试返回 "Match"（匹配）、"Mismatch"（不匹配）或 NA（该测试的一个或两个值缺失时）。它还会返回两次重复之间都有值（重叠）的测试数、匹配数和不匹配数。最后，结果中还包含一列，由样本名称与行号拼接而成，这样我就知道比较的是哪两个样本（例如 Sample1.1_Sample1.2）。有人能给我指出正确的方向吗？

    # Input data structure: the SampleID column plus ten test-result columns,
    # written out column by column. NA marks a missing test result.
    data <- data.frame(
      SampleID = c("Sample1", "Sample1", "Sample2", "Sample2", "Sample2"),
      Test1    = c("A", "A", "C", "C", "C"),
      Test2    = c("A", "T", "C", "C", "C"),
      Test3    = c("A", NA, "C", "C", "C"),
      Test4    = c("A", "A", "C", "C", "C"),
      Test5    = c("A", "T", "C", "C", NA),
      Test6    = c("A", "A", "C", "C", "C"),
      Test7    = c("A", "A", "C", "C", "C"),
      Test8    = c("A", NA, "C", "T", "T"),
      Test9    = c("A", "A", "C", "C", "C"),
      Test10   = c("A", "A", "C", "C", "C")
    )
    data

    # Split into a named list holding one data frame per SampleID.
    data.split <- split(data, data$SampleID)


    ##Row comparison function
    #Pseudo-code skeleton: the commented steps below describe the intended pairwise
    #comparison of replicate rows; none of them is implemented yet.
    #Input is a list of data frames. Each data frame contains results for replicates of the same sample.
    #NOTE(review): the body calls nrow(x), so x is handled as ONE data frame per call
    #(presumably one element of that list - confirm against the caller).
    #NOTE(review): `results` is never assigned inside this function, so as written the
    #return() line fails unless an object named `results` exists in an enclosing scope.
    RowCompare = function(x){
      #Number of rows in x; not used by any executable line below.
      rowcount = nrow(x)
      ##ifelse(rowcount==2,
        ##compare row 1 to row 2
          ##paste sample names being compared together
          ##how many non-NA values overlap, keep value
          ##of those that overlap, how many match, keep value
          ##of those that overlap, how many do not match, keep value
      #ifelse(rowcount==3,
          ##compare row 1 to row 2
            ##paste sample names being compared together
            ##how many non-NA values overlap, keep value
            ##of those that overlap, how many match, keep value
            ##of those that overlap, how many do not match, keep value
          ##compare row 1 to row 3
            ##paste sample names being compared together
            ##how many non-NA values overlap, keep value
            ##of those that overlap, how many match, keep value
            ##of those that overlap, how many do not match, keep value
          ##compare row 2 to row 3
            ##paste sample names being compared together
            ##how many non-NA values overlap, keep value
            ##of those that overlap, how many match, keep value
            ##of those that overlap, how many do not match, keep value
      return(results)
    }

    # Run RowCompare on each per-sample data frame, looked up by sample name.
    # Iterating over the names yields an unnamed list of results.
    out <- lapply(
      names(data.split),
      function(sample_id) RowCompare(data.split[[sample_id]])
    )

    # Stack the per-sample results into one large data frame and preview it.
    out.merge <- do.call(rbind.data.frame, out)
    head(out.merge)

    # Desired output: one row per compared pair of replicates, one column per test
    # ("Match", "Mismatch" or NA), followed by the three summary counts.
    # cbind() of character and numeric pieces yields a character matrix, so the
    # count columns are stored as text, exactly as in the original mock-up.
    out.merge <- as.data.frame(cbind(
      rbind("Sample1.1_Sample1.2", "Sample2.1_Sample2.2", "Sample2.1_Sample2.3", "Sample2.2_Sample2.3"),
      rbind("Match", "Match", "Match", "Match"),          # Test1
      rbind("Mismatch", "Match", "Match", "Match"),       # Test2
      rbind(NA, "Match", "Match", "Match"),               # Test3
      rbind("Match", "Match", "Match", "Match"),          # Test4
      rbind("Mismatch", "Match", NA, NA),                 # Test5
      rbind("Match", "Match", "Match", "Match"),          # Test6
      rbind("Match", "Match", "Match", "Match"),          # Test7
      rbind(NA, "Mismatch", "Mismatch", "Match"),         # Test8
      rbind("Match", "Match", "Match", "Match"),          # Test9
      rbind("Match", "Match", "Match", "Match"),          # Test10
      rbind(8, 10, 9, 9),                                 # Num_Overlap
      # Last pair: 9 overlapping tests, all of them "Match", so 9 matches and
      # 0 mismatches (the counts must agree with the per-test columns above).
      rbind(6, 9, 8, 9),                                  # Num_Match
      rbind(2, 1, 1, 0)                                   # Num_Mismatch
    ))

    colnames(out.merge) <- c("SampleID", "Test1", "Test2", "Test3", "Test4", "Test5", "Test6", "Test7", "Test8", "Test9", "Test10", "Num_Overlap", "Num_Match", "Num_Mismatch")
    out.merge

我在另一篇帖子中看到一个可能有用的做法：下面这行代码会生成所有不重复的行号两两组合（结果是一个矩阵，每行是一对行号），可以用它来确定每组重复样本中需要比较哪些行。但我不知道该如何把它用到我的函数里。

    # combn(n, 2) returns every unique pair drawn from 1..n as the columns of a
    # matrix; t() transposes it so that each ROW holds one pair of row indices.
    t(combn(nrow(data),2))

谢谢。

Answer 1

您使用 t(combn(nrow(data),2)) 的思路是正确的。下面是我的实现方式。

# Positions of the columns of `data` whose names start with "Test" followed by
# at least one digit; grep() returns the matching indices directly.
testCols <- grep("^Test\\d+", colnames(data))

#' Count overlapping, matching and mismatching test results of two replicates.
#'
#' @param x,y Test results of the two replicates being compared, position by
#'   position (e.g. two one-row data frames with the same columns, or two
#'   vectors of equal length). Missing results are NA.
#' @return Integer vector of length 3: c(overlaps, matches, non_matches), where
#'   overlaps is the number of positions that are non-NA in both inputs.
TestsCompare <- function(x, y) {
  ## how many non-NA values overlap
  overlaps <- sum(!is.na(x) & !is.na(y))
  ## of those that overlap, how many match
  # x == y is NA wherever either value is missing, so na.rm = TRUE limits the
  # count to overlapping positions. TRUE is spelled out because `T` is an
  # ordinary variable that can be reassigned.
  matches <- sum(x == y, na.rm = TRUE)
  ## of those that overlap, how many do not match (complement of matches)
  non_matches <- overlaps - matches
  c(overlaps, matches, non_matches)
}

#' Compare every unique pair of rows (replicates) within one sample.
#'
#' Relies on two objects defined outside this function: `testCols` (the columns
#' of `x` holding test results) and `TestsCompare()` (the per-pair counter).
#'
#' @param x Data frame with one row per replicate of a single sample.
#' @return Matrix with one row per compared pair and the columns row_a, row_b,
#'   overlaps, matches, non_matches. When `x` has fewer than two rows there is
#'   nothing to compare and a matrix with zero rows is returned.
RowCompare <- function(x) {
  col_names <- c("row_a", "row_b", "overlaps", "matches", "non_matches")
  n_rows <- nrow(x)

  # combn(n, 2) stops with an error for n < 2, so a sample with a single
  # replicate gets an empty result with the usual columns instead.
  if (n_rows < 2L) {
    return(matrix(integer(0), nrow = 0L, ncol = length(col_names),
                  dimnames = list(NULL, col_names)))
  }

  # Each row of `pairs` is one unique combination of two row indices.
  pairs <- t(combn(n_rows, 2))

  # Preallocate the result instead of growing it with rbind() in the loop.
  comp <- matrix(NA_integer_, nrow = nrow(pairs), ncol = length(col_names),
                 dimnames = list(NULL, col_names))
  for (i in seq_len(nrow(pairs))) {
    row_a <- pairs[i, 1]
    row_b <- pairs[i, 2]
    a_tests <- x[row_a, testCols]
    b_tests <- x[row_b, testCols]
    comp[i, ] <- c(row_a, row_b, TestsCompare(a_tests, b_tests))
  }
  comp
}

# Apply RowCompare to every element of data.split; lapply keeps the element
# names, so the results are labelled the same way as data.split.
out <- lapply(data.split, RowCompare)

输出：

> out
$Sample1
     row_a row_b overlaps matches non_matches
[1,]     1     2        8       6           2

$Sample2
     row_a row_b overlaps matches non_matches
[1,]     1     2       10       9           1
[2,]     1     3        9       8           1
[3,]     2     3        9       9           0

R中行的成对比较

1 个答案: