行之间匹配的百分比,并根据特定列查找重复项

时间:2014-07-02 20:49:20

标签: r find matching duplication

嗨朋友们请帮帮我!

我有一个很大的数据集,其中第一列和第三列存在重复值。

# Example data set 1: one row per (Line, Year, Marker) genotype call.
# matrix() fills column by column, so the vector lists the whole Line
# column first, then Year, then Marker and finally the genotype calls.
test <- matrix(
  c(
    # Line column: Line_A x 8, Line_B x 12, Line_C x 8, Line_D x 4, Line_E x 4
    rep(c("Line_A", "Line_B", "Line_C", "Line_D", "Line_E"),
        times = c(8, 12, 8, 4, 4)),
    # Year column: four rows (M1..M4) per Line/Year combination
    rep(c("F5", "F6", "F5", "F6", "F7", "F5", "F6", "F6", "F6"), each = 4),
    # Marker column: M1..M4 for each of the 9 Line/Year combinations
    rep(c("M1", "M2", "M3", "M4"), times = 9),
    # data column: genotype calls, "-/-" marks a missing call
    "A/A", "T/T", "T/T", "C/C",  # Line_A F5
    "A/A", "T/T", "A/T", "C/C",  # Line_A F6
    "A/A", "T/T", "T/T", "C/C",  # Line_B F5
    "-/-", "T/T", "A/T", "C/C",  # Line_B F6
    "A/A", "C/T", "A/A", "G/G",  # Line_B F7
    "A/A", "T/T", "T/T", "C/C",  # Line_C F5
    "A/A", "C/T", "T/A", "C/C",  # Line_C F6
    "A/A", "T/T", "T/T", "C/C",  # Line_D F6
    "A/A", "C/C", "A/A", "G/G"   # Line_E F6
  ),
  nrow = 36,
  dimnames = list(NULL, c("Line", "Year", "Marker", "data"))
)

我转换为数据框

# Convert the matrix `test` into a data frame; each matrix column becomes
# one data-frame column of the same name.
test1<-data.frame(test)

数据集中有重复项,例如第1行和第5行、第2行和第6行等:它们在第1列(Line)和第3列(Marker)中具有相同的值。

Line         Year      Marker      data
Line_A         F5          M1       A/A
Line_A         F5          M2       T/T
Line_A         F5          M3       T/T
Line_A         F5          M4       C/C
Line_A         F6          M1       A/A
Line_A         F6          M2       T/T
Line_A         F6          M3       A/T
Line_A         F6          M4       C/C
Line_B         F5          M1       A/A
Line_B         F5          M2       T/T
Line_B         F5          M3       T/T
.
.
.

我想有一个如下所示的表来列出重复的行,然后是非重复的行。我希望通过比较标记数据来获得年份之间重复的匹配百分比。如

Line            Year                 Duplication        Matching_Marker %  
Line A          F5 / F6                  Yes                  75                  
Line B          F5 / F6                  Yes                  75                   
Line B          F5 / F7                  Yes                  50                   
Line B          F6 / F7                  Yes                  25                   
Line C          F5 / F6                  Yes                  50                   
Line D          F6                         No                 NA                  
Line D          F6                         No                 NA                  

提前致谢!

显示结果的最佳方式应该是

Line    Year        Duplication   MatchCount Mismatchcount     Matching_Marker %  Het%Year1   Het%Year2   Missing%Year1    Missing%Year2
Line A   F5 / F6      Yes            3           1                75               0            25            0                0     
Line B   F5 / F6      Yes            2           1                75               0            33.3          0                25                                               
Line B   F5 / F7      Yes            1           3                50               0            25             0                0                      
Line B   F6 / F7      Yes            0           3                0               25            25             25               0                       
Line C   F5 / F6      Yes            2           2                50               0            50             0                0                     
Line D   F6            No            NA           NA              NA               0            NA             0                NA                             
Line D   F6            No            NA           NA              NA               0           NA              0                NA              

第二个示例数据集是

# Example data set 2: like the first data set, but Line_E has a second
# block of four F6 rows (40 rows in total). matrix() fills column by
# column: Line, then Year, then Marker, then the genotype calls.
test <- matrix(
  c(
    # Line column: Line_A x 8, Line_B x 12, Line_C x 8, Line_D x 4, Line_E x 8
    rep(c("Line_A", "Line_B", "Line_C", "Line_D", "Line_E"),
        times = c(8, 12, 8, 4, 8)),
    # Year column: four rows (M1..M4) per block
    rep(c("F5", "F6", "F5", "F6", "F7", "F5", "F6", "F6", "F6", "F6"),
        each = 4),
    # Marker column: M1..M4 for each of the 10 blocks
    rep(c("M1", "M2", "M3", "M4"), times = 10),
    # data column: genotype calls, "-/-" marks a missing call
    "A/A", "T/T", "T/T", "C/C",  # Line_A F5
    "A/A", "T/T", "A/T", "C/C",  # Line_A F6
    "A/A", "T/T", "T/T", "C/C",  # Line_B F5
    "-/-", "T/T", "A/T", "C/C",  # Line_B F6
    "A/A", "C/T", "A/A", "G/G",  # Line_B F7
    "A/A", "T/T", "T/T", "C/C",  # Line_C F5
    "A/A", "C/T", "T/A", "C/C",  # Line_C F6
    "A/A", "T/T", "T/T", "C/C",  # Line_D F6
    "A/A", "C/C", "A/A", "G/G",  # Line_E F6 (first block)
    "A/A", "C/C", "A/A", "G/G"   # Line_E F6 (second block)
  ),
  nrow = 40,
  dimnames = list(NULL, c("Line", "Year", "Marker", "data"))
)
# Data-frame version of the same table.
test1 <- data.frame(test)

第三个示例数据集,其中某个品系(Line_A)缺少某一年(F5)的数据

# Example data set 3: same layout as the 40-row data set, but every call of
# Line_A in F5 is missing ("-/-"). matrix() fills column by column: Line,
# then Year, then Marker, then the genotype calls.
# NOTE(review): unlike the other two data sets, no column names are assigned
# here and `test1` is not rebuilt from this matrix - confirm whether that is
# intended before running code that expects named columns.
test <- matrix(
  c(
    # Line column: Line_A x 8, Line_B x 12, Line_C x 8, Line_D x 4, Line_E x 8
    rep(c("Line_A", "Line_B", "Line_C", "Line_D", "Line_E"),
        times = c(8, 12, 8, 4, 8)),
    # Year column: four rows (M1..M4) per block
    rep(c("F5", "F6", "F5", "F6", "F7", "F5", "F6", "F6", "F6", "F6"),
        each = 4),
    # Marker column: M1..M4 for each of the 10 blocks
    rep(c("M1", "M2", "M3", "M4"), times = 10),
    # data column: genotype calls, "-/-" marks a missing call
    "-/-", "-/-", "-/-", "-/-",  # Line_A F5 (all missing)
    "A/A", "T/T", "A/T", "C/C",  # Line_A F6
    "A/A", "T/T", "T/T", "C/C",  # Line_B F5
    "-/-", "T/T", "A/T", "C/C",  # Line_B F6
    "A/A", "C/T", "A/A", "G/G",  # Line_B F7
    "A/A", "T/T", "T/T", "C/C",  # Line_C F5
    "A/A", "C/T", "T/A", "C/C",  # Line_C F6
    "A/A", "T/T", "T/T", "C/C",  # Line_D F6
    "A/A", "C/C", "A/A", "G/G",  # Line_E F6 (first block)
    "A/A", "C/C", "A/A", "G/G"   # Line_E F6 (second block)
  ),
  nrow = 40
)

2 个答案:

答案 0 :(得分:0)

以下是使用dplyr和tidyr的示例。第一步是用spread把年份数据展开成宽格式(每个年份一列),然后计算各年份对之间的相似度。

library(dplyr)
library(tidyr)

# Percentage of identical genotype calls between each pair of years, per Line.
# The year columns F5, F6 and F7 are hard-coded, and "-/-" is compared like
# any other value (it is not treated as missing here).
test %>% 
  # keep the values as character strings rather than factors
  data.frame(stringsAsFactors = FALSE) %>% 
  # wide format: one column per Year holding that year's `data` value
  spread(Year,data) %>%
  group_by(Line) %>%
  # share of markers whose calls are equal in both years, in percent;
  # sum() is called without na.rm, so a pair involving a year the Line
  # does not have comes out as NA
  summarise(F5_F6 = sum(F5 == F6)/length(F5)*100,
            F5_F7 = sum(F5 == F7)/length(F5)*100,
            F6_F7 = sum(F6 == F7)/length(F6)*100) %>%
  # back to long format: one row per Line and year pair, NA rows kept
  gather(Year, Matching_marker, F5_F6:F6_F7, na.rm = FALSE) %>%
  arrange(Line) %>%
  group_by(Line) %>%
  # TRUE when every year pair of the Line is NA.
  # NOTE(review): the name looks inverted - Lines that do have comparable
  # years get FALSE, Lines with a single year get TRUE; confirm the intent.
  mutate(Duplicated = all(is.na(Matching_marker))) %>% 
  # drop NA pairs only for Lines that have at least one non-NA pair;
  # Lines whose pairs are all NA keep all of their rows
  filter(!((!Duplicated) & is.na(Matching_marker)))

     Line  Year Matching_marker Duplicated
1  Line_A F5_F6              75      FALSE
2  Line_B F5_F6              50      FALSE
3  Line_B F5_F7              25      FALSE
4  Line_B F6_F7               0      FALSE
5  Line_C F5_F6              50      FALSE
6  Line_D F5_F6              NA       TRUE
7  Line_D F5_F7              NA       TRUE
8  Line_D F6_F7              NA       TRUE
9  Line_E F5_F6              NA       TRUE
10 Line_E F5_F7              NA       TRUE
11 Line_E F6_F7              NA       TRUE

答案 1 :(得分:0)

编辑:我针对修订后的问题做了第二次尝试,之前的答案已从下文删除。我特意把代码写得直截了当,以便阅读和随时修改(尤其是针对更新后问题中的细微改动)。另外,下面的代码看起来效率不高;希望它仍然有所帮助。

基本理念是:

  1. 按“Line”拆分“test1”(#1)
  2. 为每个“Line”找出2个“Year”的所有可能组合(#2)
  3. 处理含有“-/-”的条目(#3)
  4. 比较每个“Line”的每对“Year”之间的“data”(#4)
  5. 相应地格式化输出(#5)

除了#1之外,上述所有步骤都定义在一个辅助函数中,以便于实现。

    # Summarise marker agreement between years for ONE line.
    #
    # Args:
    #   x: data frame holding the rows of a single "Line", with columns
    #      "Line", "Year" and "data" (genotype calls such as "A/T"; "-/-"
    #      marks a missing call). Calls are compared by position, so every
    #      year is assumed to list the same markers in the same order.
    #
    # Returns:
    #   A data frame with one row per pair of years (Duplication = "Yes"),
    #   or a single row with Duplication = "No" when only one year is
    #   present. Columns: Line, Year, Duplication, MatchCount, MismatchCount,
    #   Matching_Marker, Het1, Het2, Mis1, Mis2 (the last five in percent).
    #   A percentage with nothing to count (e.g. every call of a year is
    #   missing) is NA. Empty input returns NULL, which rbind() skips.
    helper_ff <- function(x) {
      if (length(x[["Year"]]) == 0) {
        return(NULL)
      }

      # Percentage that yields NA instead of NaN when the denominator is 0.
      pct <- function(num, den) {
        if (is.na(den) || den == 0) NA_real_ else (num / den) * 100
      }

      # Percentage of heterozygous calls, i.e. calls whose two alleles differ.
      # vapply() keeps the result logical for empty input; sapply() returns
      # list() there, which makes sum() fail.
      het_pct <- function(calls) {
        alleles <- strsplit(as.character(calls), "/")
        is_het <- vapply(alleles, function(a) a[1] != a[2], logical(1))
        pct(sum(is_het), length(calls))
      }

      years <- as.character(unique(x[["Year"]]))
      line <- x[["Line"]][1]

      if (length(years) < 2) {
        # Only one year: nothing to compare, report that year on its own.
        calls <- x[["data"]]
        missing_call <- calls == "-/-"
        return(data.frame(Line = line, #5: formatting stuff
                          Year = x[["Year"]][1],
                          Duplication = "No",
                          MatchCount = NA,
                          MismatchCount = NA,
                          Matching_Marker = NA,
                          Het1 = het_pct(calls[!missing_call]), #3: ignore "-/-"
                          Het2 = NA,
                          Mis1 = pct(sum(missing_call), length(calls)),
                          Mis2 = NA))
      }

      year_pairs <- combn(years, 2, simplify = FALSE) #2: get all combinations
      rows <- lapply(year_pairs, function(pair) {
        y1 <- x[["data"]][x[["Year"]] == pair[1]]
        y2 <- x[["data"]][x[["Year"]] == pair[2]]
        mis1 <- y1 == "-/-"
        mis2 <- y2 == "-/-"
        keep <- !(mis1 | mis2) #3: ignore markers missing in either year
        y1_kept <- y1[keep]
        y2_kept <- y2[keep]
        matches <- y1_kept == y2_kept #4: find matches of the non '-/-'
        n_match <- sum(matches)
        n_mismatch <- sum(!matches)
        data.frame(Line = line, #5: formatting stuff
                   Year = paste(pair, collapse = "/"),
                   Duplication = "Yes",
                   MatchCount = n_match,
                   MismatchCount = n_mismatch,
                   Matching_Marker = pct(n_match, n_match + n_mismatch),
                   Het1 = het_pct(y1_kept),
                   Het2 = het_pct(y2_kept),
                   Mis1 = pct(sum(mis1), length(y1)),
                   Mis2 = pct(sum(mis2), length(y2)))
      })
      do.call(rbind, rows)
    }
    
    # Apply helper_ff to the rows of each Line and stack the per-line results
    # into one table, then drop the row names that rbind() derives from the
    # split groups so the rows are numbered 1..n.
    res <- do.call(
      rbind,
      lapply(
        split(test1, test1[["Line"]]),  #1: split
        helper_ff
      )
    )
    rownames(res) <- NULL
    res
    #    Line  Year Duplication MatchCount MismatchCount Matching_Marker     Het1     Het2 Mis1 Mis2
    #1 Line_A F5/F6         Yes          3             1        75.00000  0.00000 25.00000    0    0
    #2 Line_B F5/F6         Yes          2             1        66.66667  0.00000 33.33333    0   25
    #3 Line_B F5/F7         Yes          1             3        25.00000  0.00000 25.00000    0    0
    #4 Line_B F6/F7         Yes          0             3         0.00000 33.33333 33.33333   25    0
    #5 Line_C F5/F6         Yes          2             2        50.00000  0.00000 50.00000    0    0
    #6 Line_D    F6          No         NA            NA              NA  0.00000       NA    0   NA
    #7 Line_E    F6          No         NA            NA              NA  0.00000       NA    0   NA