我有以下数据
# Example input: three factor columns of peptide strings, 8 rows each.
# Shorter columns are padded with empty strings ("").
# stringsAsFactors = TRUE turns each column into a factor whose levels are the
# sorted unique values of that column.
df <- data.frame(
  V1 = c(
    "LWADHGVQACFGR", "CPSIAAAIAAVNALHGR", "YYVTIIDAPGHR", "FPEHELIVDPQR",
    "DLNYCFSGMSDHR", "IADPDAVKPDDWDEDAPSK", "WGEAGAEYVVESTGVFTTMEK",
    "WGEAGAEYVVESTGVFTTMEK"
  ),
  V2 = c(
    "LLELGPKPEVAQQTR", "CPSIAAAIAAVNALHGR", "YYVTIIDAPGHR",
    "GCITIIGGGDTATCCAK", "HVGPGVLSMANAGPNTNGSQFFICTIK",
    "MVCCSAWSEDHPICNLFTCGFDR", "", ""
  ),
  V3 = c(
    "FPEHELIVDPQR", "DLNYCFSGMSDHR", "AVCMLSNTTAIAEAWAR", "FPEHELIVDPQR",
    "DLNYCFSGMSDHR", "", "", ""
  ),
  stringsAsFactors = TRUE
)
我想知道各列之间共享了哪些字符串,以及共享了多少个字符串
例如
CPSIAAAIAAVNALHGR 1,2
YYVTIIDAPGHR 1,2
WGEAGAEYVVESTGVFTTMEK 1,1
FPEHELIVDPQR 1,3
DLNYCFSGMSDHR 1,3
DLNYCFSGMSDHR 1,3
这意味着第一个字符串 CPSIAAAIAAVNALHGR 同时出现在第一列和第二列中;
YYVTIIDAPGHR 也同时出现在第一列和第二列中,
依此类推。
然后再给出一个百分比:第一列有8行,在这8行中,它与第2列共享2行,所以共享比例为 2/8 * 100 = 25%;第1列与第3列共享3行,即 3/8 * 100 ≈ 37%;第2列与第3列共享 0%,依此类推。
答案 0 :(得分:2)
我认为,先去除每一列中的重复项(即只考虑匹配到的唯一字符串),再统计共有字符串的数量会更有用。因此,我更新了我的解决方案,并添加了一些代码来绘制维恩图。
library(dplyr)
library(tidyr)
library(gplots)
# Long format: one row per distinct (column, string) pair.
# Empty strings are dropped, and distinct() removes pairs that occur more
# than once.
df_reshaped <- df %>%
  gather(key = column, value = string) %>%
  filter(string != "") %>%
  distinct()
# For every string, list the columns it occurs in as a comma-separated label
df_result1 <- summarise(
  group_by(df_reshaped, string),
  columns = paste(unique(column), collapse = ",")
)
df_result1
# # A tibble: 12 x 2
#    string                      columns
#    <chr>                       <chr>
#  1 AVCMLSNTTAIAEAWAR           V3
#  2 CPSIAAAIAAVNALHGR           V1,V2
#  3 DLNYCFSGMSDHR               V1,V3
#  4 FPEHELIVDPQR                V1,V3
#  5 GCITIIGGGDTATCCAK           V2
#  6 HVGPGVLSMANAGPNTNGSQFFICTIK V2
#  7 IADPDAVKPDDWDEDAPSK         V1
#  8 LLELGPKPEVAQQTR             V2
#  9 LWADHGVQACFGR               V1
# 10 MVCCSAWSEDHPICNLFTCGFDR     V2
# 11 WGEAGAEYVVESTGVFTTMEK       V1
# 12 YYVTIIDAPGHR                V1,V2
# Number of strings of column `v2` that also occur in column `v1`.
# Both columns are looked up in the global long-format table `df_reshaped`.
# Vectorize() lets the function take whole vectors of column names, pairing
# them element by element.
f1 <- Vectorize(function(v1, v2) {
  strings_v1 <- filter(df_reshaped, column == v1)[["string"]]
  strings_v2 <- filter(df_reshaped, column == v2)[["string"]]
  sum(strings_v2 %in% strings_v1)
})
# Number of rows that column `v` has in the global long-format table
# `df_reshaped`. Vectorize() lets it accept a vector of column names.
f2 <- Vectorize(function(v) {
  nrow(filter(df_reshaped, column == v))
})
# Pairwise overlap between columns. For every ordered pair (Var1, Var2) of
# different columns:
#   NumShared = f1(Var1, Var2), the number of common strings
#   NumRows   = f2(Var1), the number of rows Var1 has in df_reshaped
#   Prc       = NumShared / NumRows (a fraction, not multiplied by 100)
expand.grid(
  Var1 = unique(df_reshaped$column),
  Var2 = unique(df_reshaped$column)
) %>%
  filter(Var1 != Var2) %>%
  mutate(NumShared = f1(Var1, Var2), NumRows = f2(Var1)) %>%
  mutate(Prc = NumShared / NumRows) %>%
  arrange(Var1, Var2)
#   Var1 Var2 NumShared NumRows       Prc
# 1   V1   V2         2       7 0.2857143
# 2   V1   V3         2       7 0.2857143
# 3   V2   V1         2       6 0.3333333
# 4   V2   V3         0       6 0.0000000
# 5   V3   V1         2       3 0.6666667
# 6   V3   V2         0       3 0.0000000
# Venn diagram of column membership.
# Spread to one row per string with a TRUE/FALSE flag per original column
# (FALSE where the string is absent), drop the string itself, and pass the
# flag table to venn().
venn(
  df_reshaped %>%
    mutate(exist = TRUE) %>%
    spread(key = column, value = exist, fill = FALSE) %>%
    select(-string)
)
显然,此图中显示的各个数字之和应该等于表 df_result1 中得到的唯一字符串数。
在本例中,这个数是 12。