如何在各列内和跨列找到相似字符串的百分比

时间:2017-07-27 23:44:41

标签: r

我有以下数据

# Example input: a data.frame with 8 rows and three factor columns (V1, V2, V3).
# V2 and V3 have the empty string "" among their levels; it is the value of
# their trailing rows (level code 1L).
df<-structure(list(V1 = structure(c(5L, 1L, 7L, 3L, 2L, 4L, 6L, 6L
), .Label = c("CPSIAAAIAAVNALHGR", "DLNYCFSGMSDHR", "FPEHELIVDPQR", 
"IADPDAVKPDDWDEDAPSK", "LWADHGVQACFGR", "WGEAGAEYVVESTGVFTTMEK", 
"YYVTIIDAPGHR"), class = "factor"), V2 = structure(c(5L, 2L, 
7L, 3L, 4L, 6L, 1L, 1L), .Label = c("", "CPSIAAAIAAVNALHGR", 
"GCITIIGGGDTATCCAK", "HVGPGVLSMANAGPNTNGSQFFICTIK", "LLELGPKPEVAQQTR", 
"MVCCSAWSEDHPICNLFTCGFDR", "YYVTIIDAPGHR"), class = "factor"), 
    V3 = structure(c(4L, 3L, 2L, 4L, 3L, 1L, 1L, 1L), .Label = c("", 
    "AVCMLSNTTAIAEAWAR", "DLNYCFSGMSDHR", "FPEHELIVDPQR"), class = "factor")), .Names = c("V1", 
"V2", "V3"), class = "data.frame", row.names = c(NA, -8L))

我想知道各列之间共有哪些字符串,以及共有多少个。

例如

CPSIAAAIAAVNALHGR          1,2       
YYVTIIDAPGHR               1,2
WGEAGAEYVVESTGVFTTMEK      1,1
FPEHELIVDPQR               1,3
DLNYCFSGMSDHR              1,3
DLNYCFSGMSDHR              1,3

这表示第一个字符串 CPSIAAAIAAVNALHGR 同时出现在第 1 列和第 2 列中;YYVTIIDAPGHR 也同时出现在第 1 列和第 2 列中,依此类推。

然后我还需要一个百分比:第 1 列有 8 行,在这 8 行中有 2 行与第 2 列共享,所以共享比例为 2/8 * 100 = 25%;第 1 列与第 3 列共享 3 行,即 3/8 * 100 = 37.5%;第 2 列与第 3 列共享 0%;依此类推。

1 个答案:

答案 0 :(得分:2)

我认为,先去除每一列内的重复项(即只考虑匹配到的唯一字符串),再统计共有字符串的数量会更有用。因此我更新了解决方案,并增加了一些用于绘制维恩图的代码。

library(dplyr)
library(tidyr)
library(gplots)

# Reshape the wide input `df` into long format: one row per distinct
# (column, string) pair, with the empty-string padding cells removed.
#
# The columns of `df` are factors with different level sets. Passing them
# straight to gather() makes tidyr warn that the attributes are not identical
# and will be dropped, and leaves the type of `string` to implicit coercion.
# Converting every column to character up front gives the same rows without
# the warning. check.names = FALSE keeps the original column names untouched.
df_reshaped = df %>%
  lapply(as.character) %>%
  data.frame(stringsAsFactors = FALSE, check.names = FALSE) %>%
  gather(column, string) %>%   # `column` = source column name, `string` = cell value
  filter(string != '') %>%     # drop the "" cells
  distinct()                   # count each string at most once per column

# For every distinct string, build a comma-separated label of the columns
# in which it occurs (e.g. "V1,V2").
df_result1 = summarise(
  group_by(df_reshaped, string),
  columns = paste(unique(column), collapse = ",")
)

df_result1


# # A tibble: 12 x 2
#                        string columns
#                         <chr>   <chr>
# 1           AVCMLSNTTAIAEAWAR      V3
# 2           CPSIAAAIAAVNALHGR   V1,V2
# 3               DLNYCFSGMSDHR   V1,V3
# 4                FPEHELIVDPQR   V1,V3
# 5           GCITIIGGGDTATCCAK      V2
# 6 HVGPGVLSMANAGPNTNGSQFFICTIK      V2
# 7         IADPDAVKPDDWDEDAPSK      V1
# 8             LLELGPKPEVAQQTR      V2
# 9               LWADHGVQACFGR      V1
# 10     MVCCSAWSEDHPICNLFTCGFDR     V2
# 11       WGEAGAEYVVESTGVFTTMEK     V1
# 12                YYVTIIDAPGHR  V1,V2


# Count the strings that two columns have in common.
#
# v1, v2: column labels, matched against `df_reshaped$column`.
# Returns the number of strings listed under `v2` that are also listed
# under `v1`. Reads the global `df_reshaped`.
f1 = function(v1, v2) {
  in_first = df_reshaped$string[which(df_reshaped$column == v1)]
  in_second = df_reshaped$string[which(df_reshaped$column == v2)]
  sum(in_second %in% in_first)
}
# Wrap with Vectorize so whole vectors of labels can be passed pairwise.
f1 = Vectorize(f1)


# Number of rows that `df_reshaped` holds for the column label `v`.
# Reads the global `df_reshaped`.
f2 = function(v) {
  length(which(df_reshaped$column == v))
}
# Wrap with Vectorize so a vector of labels can be passed in one call.
f2 = Vectorize(f2)

# Overlap table: one row per ordered pair of different columns.
#   NumShared - number of strings the two columns have in common (f1)
#   NumRows   - number of rows recorded for Var1 in df_reshaped (f2)
#   Prc       - NumShared / NumRows, a fraction (not multiplied by 100)
unique(df_reshaped$column) %>%
  expand.grid(., .) %>%                       # all pairs; columns are named Var1, Var2
  filter(Var1 != Var2) %>%                    # drop the self-pairs
  mutate(NumShared = f1(Var1, Var2)) %>%
  mutate(NumRows = f2(Var1)) %>%
  mutate(Prc = NumShared / NumRows) %>%
  arrange(Var1, Var2)

#   Var1 Var2 NumShared NumRows       Prc
# 1   V1   V2         2       7 0.2857143
# 2   V1   V3         2       7 0.2857143
# 3   V2   V1         2       6 0.3333333
# 4   V2   V3         0       6 0.0000000
# 5   V3   V1         2       3 0.6666667
# 6   V3   V2         0       3 0.0000000


# Build a logical membership table (one row per string, one TRUE/FALSE column
# per original column) and hand it to venn() to draw the diagram.
venn(
  df_reshaped %>%
    mutate(exist = TRUE) %>%                                # mark every observed pair
    spread(key = column, value = exist, fill = FALSE) %>%   # unobserved pairs become FALSE
    select(-string)                                         # keep only the logical columns
)

维恩图看起来像: enter image description here

显然,此图中显示的数字总和应该等于表df_result1中获得的唯一字符串数。在我们的案例中,这是12。