嗨朋友们请帮帮我!
我有一个很大的数据集,其中第一列和第三列(columns)存在重复值。
# First example data set. The values are listed column by column (matrix()
# fills column-wise by default); each text line below holds four values.
test <- matrix(
  c(
    # Column 1: Line
    "Line_A", "Line_A", "Line_A", "Line_A",
    "Line_A", "Line_A", "Line_A", "Line_A",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_C", "Line_C", "Line_C", "Line_C",
    "Line_C", "Line_C", "Line_C", "Line_C",
    "Line_D", "Line_D", "Line_D", "Line_D",
    "Line_E", "Line_E", "Line_E", "Line_E",
    # Column 2: Year
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F7", "F7", "F7", "F7",
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    # Column 3: Marker
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    # Column 4: data (genotype calls)
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "T/T", "A/T", "C/C",
    "A/A", "T/T", "T/T", "C/C",
    "-/-", "T/T", "A/T", "C/C",
    "A/A", "C/T", "A/A", "G/G",
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "C/T", "T/A", "C/C",
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "C/C", "A/A", "G/G"
  ),
  nrow = 36
)
colnames(test) <- c("Line", "Year", "Marker", "data")
我转换为数据框
# Turn the character matrix into a data frame (one column per matrix column).
test1 <- data.frame(test)
数据集中有重复项。例如第1行和第5行、第2行和第6行等,它们在第1列(Line)和第3列(Marker)中具有相同的值。
Line Year Marker data
Line_A F5 M1 A/A
Line_A F5 M2 T/T
Line_A F5 M3 T/T
Line_A F5 M4 C/C
Line_A F6 M1 A/A
Line_A F6 M2 T/T
Line_A F6 M3 A/T
Line_A F6 M4 C/C
Line_B F5 M1 A/A
Line_B F5 M2 T/T
Line_B F5 M3 T/T
.
.
.
我想得到一个如下所示的表:先列出重复的行,然后列出非重复的行。我希望通过比较标记(Marker)数据,得到各年份之间重复项的匹配百分比。例如:
Line Year Duplication Matching_Marker %
Line A F5 / F6 Yes 75
Line B F5 / F6 Yes 75
Line B F5 / F7 Yes 50
Line B F6 / F7 Yes 25
Line C F5 / F6 Yes 50
Line D F6 No NA
Line E F6 No NA
提前致谢!
显示结果的最佳方式应该是
Line Year Duplication MatchCount Mismatchcount Matching_Marker % Het%Year1 Het%Year2 Missing%Year1 Missing%Year2
Line A F5 / F6 Yes 3 1 75 0 25 0 0
Line B F5 / F6 Yes 2 1 75 0 33.3 0 25
Line B F5 / F7 Yes 1 3 50 0 25 0 0
Line B F6 / F7 Yes 0 3 0 25 25 25 0
Line C F5 / F6 Yes 2 2 50 0 50 0 0
Line D F6 No NA NA NA 0 NA 0 NA
Line E F6 No NA NA NA 0 NA 0 NA
第二个示例数据集是
# Second example data set. The values are listed column by column (matrix()
# fills column-wise by default); each text line below holds four values.
test <- matrix(
  c(
    # Column 1: Line
    "Line_A", "Line_A", "Line_A", "Line_A",
    "Line_A", "Line_A", "Line_A", "Line_A",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_C", "Line_C", "Line_C", "Line_C",
    "Line_C", "Line_C", "Line_C", "Line_C",
    "Line_D", "Line_D", "Line_D", "Line_D",
    "Line_E", "Line_E", "Line_E", "Line_E",
    "Line_E", "Line_E", "Line_E", "Line_E",
    # Column 2: Year
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F7", "F7", "F7", "F7",
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    # Column 3: Marker
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    # Column 4: data (genotype calls)
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "T/T", "A/T", "C/C",
    "A/A", "T/T", "T/T", "C/C",
    "-/-", "T/T", "A/T", "C/C",
    "A/A", "C/T", "A/A", "G/G",
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "C/T", "T/A", "C/C",
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "C/C", "A/A", "G/G",
    "A/A", "C/C", "A/A", "G/G"
  ),
  nrow = 40
)
colnames(test) <- c("Line", "Year", "Marker", "data")
# Data-frame copy of the matrix.
test1 <- data.frame(test)
第三个示例数据集,其中某个品系(Line)缺少某一年份的数据(全部记为 "-/-"):
# Third example data set. The values are listed column by column (matrix()
# fills column-wise by default); each text line below holds four values.
# The first four calls in the last column are all "-/-". This statement only
# builds the matrix: it assigns no column names and does not rebuild test1.
test <- matrix(
  c(
    # Column 1
    "Line_A", "Line_A", "Line_A", "Line_A",
    "Line_A", "Line_A", "Line_A", "Line_A",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_B", "Line_B", "Line_B", "Line_B",
    "Line_C", "Line_C", "Line_C", "Line_C",
    "Line_C", "Line_C", "Line_C", "Line_C",
    "Line_D", "Line_D", "Line_D", "Line_D",
    "Line_E", "Line_E", "Line_E", "Line_E",
    "Line_E", "Line_E", "Line_E", "Line_E",
    # Column 2
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F7", "F7", "F7", "F7",
    "F5", "F5", "F5", "F5",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    "F6", "F6", "F6", "F6",
    # Column 3
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    "M1", "M2", "M3", "M4",
    # Column 4
    "-/-", "-/-", "-/-", "-/-",
    "A/A", "T/T", "A/T", "C/C",
    "A/A", "T/T", "T/T", "C/C",
    "-/-", "T/T", "A/T", "C/C",
    "A/A", "C/T", "A/A", "G/G",
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "C/T", "T/A", "C/C",
    "A/A", "T/T", "T/T", "C/C",
    "A/A", "C/C", "A/A", "G/G",
    "A/A", "C/C", "A/A", "G/G"
  ),
  nrow = 40
)
答案 0 :(得分:0)
以下是使用 dplyr 和 tidyr 的示例。第一步是把年份数据展开(spread)为宽格式,即每个年份一列;然后计算每一对年份之间的相似度。
library(dplyr)
library(tidyr)
# Widen the calls to one column per year, then score each pair of years for
# every line. The year pairs are hard-coded, so the data must contain the
# years F5, F6 and F7, and `test` must carry the column names Line, Year,
# Marker and data.
test %>%
  data.frame(stringsAsFactors = FALSE) %>%
  # One row per Line/Marker, with the call of each year side by side
  # (pivot_wider() replaces the superseded spread()).
  pivot_wider(names_from = Year, values_from = data) %>%
  group_by(Line) %>%
  # Percentage of markers whose calls are identical in both years. The result
  # is NA when a line has no data for one of the two years. "-/-" calls are
  # compared like any other value.
  summarise(F5_F6 = sum(F5 == F6) / length(F5) * 100,
            F5_F7 = sum(F5 == F7) / length(F5) * 100,
            F6_F7 = sum(F6 == F7) / length(F6) * 100) %>%
  # Back to one row per Line and year pair (replaces the superseded gather()).
  pivot_longer(F5_F6:F6_F7, names_to = "Year",
               values_to = "Matching_marker") %>%
  arrange(Line) %>%
  group_by(Line) %>%
  # NOTE(review): despite its name, Duplicated is TRUE for lines where no year
  # pair could be compared (every percentage is NA) and FALSE otherwise.
  mutate(Duplicated = all(is.na(Matching_marker))) %>%
  # Drop the grouping so it does not leak into later operations.
  ungroup() %>%
  # Keep every row of the all-NA lines; for the other lines drop the NA pairs.
  filter(Duplicated | !is.na(Matching_marker))
Line Year Matching_marker Duplicated
1 Line_A F5_F6 75 FALSE
2 Line_B F5_F6 50 FALSE
3 Line_B F5_F7 25 FALSE
4 Line_B F6_F7 0 FALSE
5 Line_C F5_F6 50 FALSE
6 Line_D F5_F6 NA TRUE
7 Line_D F5_F7 NA TRUE
8 Line_D F6_F7 NA TRUE
9 Line_E F5_F6 NA TRUE
10 Line_E F5_F7 NA TRUE
11 Line_E F6_F7 NA TRUE
答案 1 :(得分:0)
编辑:这是我针对修订后的问题所做的第二次尝试;之前的答案已从下文删除。我有意把答案写得直截了当,以便于阅读和随时修改(尤其是应对问题更新后的细微变化)。另外,下面的代码看起来效率并不高;希望它无论如何都能有所帮助。
基本思路是:
(#1) 按 “Line” 对 “test1” 进行 split;
(#2) 在每个 Line 内求出年份的所有两两组合;
(#3) 忽略 “-/-”(缺失数据);
(#4) 在非 “-/-” 的标记中查找匹配;
(#5) 整理输出格式。
除了 #1 之外,上述所有步骤都在辅助函数中定义,以便于实现。
# Summarise the genotype calls of one line, comparing every pair of years.
#
# Args:
#   x: data frame holding the rows of a single line. The columns read are
#      "Line", "Year" and "data". "data" holds calls written as
#      "allele/allele"; the call "-/-" is treated as missing.
#
# Returns:
#   A data frame with the columns Line, Year, Duplication, MatchCount,
#   MismatchCount, Matching_Marker, Het1, Het2, Mis1 and Mis2.
#   * More than one distinct year: one row per pair of years, with
#     Duplication = "Yes". Markers missing in either year are left out of the
#     match counts and of Het1/Het2; Mis1/Mis2 are based on all calls of the
#     respective year.
#   * A single year: one row with Duplication = "No"; the pairwise columns and
#     the second-year columns are NA.
#   Percentages are NaN when there is no call to base them on (for example
#   when every call of a year is "-/-").
#
# NOTE(review): the two years of a pair are compared position by position, so
# this assumes both years list the same markers in the same order -- confirm
# for your data.
helper_ff <- function(x) {
  # Percentage of calls whose two alleles differ.
  het_pct <- function(calls) {
    alleles <- strsplit(as.character(calls), "/")
    # vapply() stays logical for zero calls; sapply() would return list()
    # there and make sum() fail.
    is_het <- vapply(alleles, function(a) a[1] != a[2], logical(1))
    (sum(is_het) / length(calls)) * 100
  }

  years <- as.character(unique(x[["Year"]]))

  if (length(years) > 1) {
    year_pairs <- combn(years, 2, simplify = FALSE)  #2: get all combinations
    per_pair <- lapply(year_pairs, function(pair) {
      calls_1 <- x[["data"]][x[["Year"]] == pair[1]]
      calls_2 <- x[["data"]][x[["Year"]] == pair[2]]
      missing_1 <- calls_1 == "-/-"
      missing_2 <- calls_2 == "-/-"
      either_missing <- missing_1 | missing_2        #3: ignore "-/-"
      called_1 <- calls_1[!either_missing]
      called_2 <- calls_2[!either_missing]
      matches <- called_1 == called_2   #4: find matches of the non '-/-'
      n_match <- sum(matches)
      n_mismatch <- sum(!matches)
      data.frame(Line = x[["Line"]][1],              #5: formatting stuff
                 Year = paste(pair, collapse = "/"),
                 Duplication = "Yes",
                 MatchCount = n_match,
                 MismatchCount = n_mismatch,
                 Matching_Marker = (n_match / (n_match + n_mismatch)) * 100,
                 Het1 = het_pct(called_1),
                 Het2 = het_pct(called_2),
                 Mis1 = (sum(missing_1) / length(calls_1)) * 100,
                 Mis2 = (sum(missing_2) / length(calls_2)) * 100)
    })
    do.call(rbind, per_pair)
  } else {
    calls <- x[["data"]]
    is_missing <- calls == "-/-"
    called <- calls[!is_missing]                     #3: ignore "-/-"
    data.frame(Line = x[["Line"]][1],                #5: formatting stuff
               Year = x[["Year"]][1],
               Duplication = "No",
               MatchCount = NA,
               MismatchCount = NA,
               Matching_Marker = NA,
               Het1 = het_pct(called),
               Het2 = NA,
               Mis1 = (sum(is_missing) / length(calls)) * 100,
               Mis2 = NA)
  }
}
# #1: split test1 by Line, summarise each piece with helper_ff and stack the
# per-line summaries into a single table.
res <- local({
  by_line <- split(test1, test1[["Line"]])
  stacked <- do.call(rbind, lapply(by_line, helper_ff))
  rownames(stacked) <- NULL  # discard the row names left over from rbind
  stacked
})
res
# Line Year Duplication MatchCount MismatchCount Matching_Marker Het1 Het2 Mis1 Mis2
#1 Line_A F5/F6 Yes 3 1 75.00000 0.00000 25.00000 0 0
#2 Line_B F5/F6 Yes 2 1 66.66667 0.00000 33.33333 0 25
#3 Line_B F5/F7 Yes 1 3 25.00000 0.00000 25.00000 0 0
#4 Line_B F6/F7 Yes 0 3 0.00000 33.33333 33.33333 25 0
#5 Line_C F5/F6 Yes 2 2 50.00000 0.00000 50.00000 0 0
#6 Line_D F6 No NA NA NA 0.00000 NA 0 NA
#7 Line_E F6 No NA NA NA 0.00000 NA 0 NA