计算列表元素,仅使它们出现x次

时间:2019-06-07 08:46:10

标签: r dplyr

这与this question在计算字符串的出现次数方面有些相似,但是我需要一个与dplyr一起使用并能够根据结果进行测试的解决方案。

在基因组学中,有一个k-mer的概念,它是较长字符串中长度为k的每个子字符串。我需要处理一串字符串(基因组序列),并找到在字符串中恰好发生4次的长度为5的(可能重叠)k-mer。

我可以通过以下方式计算每个k-mer的出现次数:

sequence1 <- "CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAA"
sequence2 <- "GGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAAT"
sequence3 <- "GACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATG"
sequences <- list(sequence1, sequence2, sequence3)

#Generate all k-mers of length 5 within each sequence
k <- 5
kmers <- map(sequences, function(x) {
    map_chr(seq_len(nchar(x) - k + 1), function(y) str_sub(x, y, y + k - 1))}) %>% 
    set_names(sequences)

kmers

给出k-mers:

#> $CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAA
#>  [1] "CGGAC" "GGACT" "GACTC" "ACTCG" "CTCGA" "TCGAC" "CGACA" "GACAG"
#>  [9] "ACAGA" "CAGAT" "AGATG" "GATGT" "ATGTG" "TGTGA" "GTGAA" "TGAAG"
#> [17] "GAAGA" "AAGAA" "AGAAA" "GAAAT" "AAATG" "AATGT" "ATGTG" "TGTGA"
#> [25] "GTGAA" "TGAAG" "GAAGA" "AAGAC" "AGACT" "GACTG" "ACTGA" "CTGAG"
#> [33] "TGAGT" "GAGTG" "AGTGA" "GTGAA" "TGAAG" "GAAGA" "AAGAG" "AGAGA"
#> [41] "GAGAA" "AGAAG" "GAAGA" "AAGAG" "AGAGG" "GAGGA" "AGGAA" "GGAAA"
#> [49] "GAAAC" "AAACA" "AACAC" "ACACG" "CACGA" "ACGAC" "CGACA" "GACAC"
#> [57] "ACACG" "CACGA" "ACGAC" "CGACA" "GACAT" "ACATT" "CATTG" "ATTGC"
#> [65] "TTGCG" "TGCGA" "GCGAC" "CGACA" "GACAT" "ACATA" "CATAA"
#> 
#> $GGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAAT
#>  [1] "GGACT" "GACTC" "ACTCG" "CTCGA" "TCGAC" "CGACA" "GACAG" "ACAGA"
#>  [9] "CAGAT" "AGATG" "GATGT" "ATGTG" "TGTGA" "GTGAA" "TGAAG" "GAAGA"
#> [17] "AAGAA" "AGAAA" "GAAAT" "AAATG" "AATGT" "ATGTG" "TGTGA" "GTGAA"
#> [25] "TGAAG" "GAAGA" "AAGAC" "AGACT" "GACTG" "ACTGA" "CTGAG" "TGAGT"
#> [33] "GAGTG" "AGTGA" "GTGAA" "TGAAG" "GAAGA" "AAGAG" "AGAGA" "GAGAA"
#> [41] "AGAAG" "GAAGA" "AAGAG" "AGAGG" "GAGGA" "AGGAA" "GGAAA" "GAAAC"
#> [49] "AAACA" "AACAC" "ACACG" "CACGA" "ACGAC" "CGACA" "GACAC" "ACACG"
#> [57] "CACGA" "ACGAC" "CGACA" "GACAT" "ACATT" "CATTG" "ATTGC" "TTGCG"
#> [65] "TGCGA" "GCGAC" "CGACA" "GACAT" "ACATA" "CATAA" "ATAAT"
#> 
#> $GACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATG
#>  [1] "GACTC" "ACTCG" "CTCGA" "TCGAC" "CGACA" "GACAG" "ACAGA" "CAGAT"
#>  [9] "AGATG" "GATGT" "ATGTG" "TGTGA" "GTGAA" "TGAAG" "GAAGA" "AAGAA"
#> [17] "AGAAA" "GAAAT" "AAATG" "AATGT" "ATGTG" "TGTGA" "GTGAA" "TGAAG"
#> [25] "GAAGA" "AAGAC" "AGACT" "GACTG" "ACTGA" "CTGAG" "TGAGT" "GAGTG"
#> [33] "AGTGA" "GTGAA" "TGAAG" "GAAGA" "AAGAG" "AGAGA" "GAGAA" "AGAAG"
#> [41] "GAAGA" "AAGAG" "AGAGG" "GAGGA" "AGGAA" "GGAAA" "GAAAC" "AAACA"
#> [49] "AACAC" "ACACG" "CACGA" "ACGAC" "CGACA" "GACAC" "ACACG" "CACGA"
#> [57] "ACGAC" "CGACA" "GACAT" "ACATT" "CATTG" "ATTGC" "TTGCG" "TGCGA"
#> [65] "GCGAC" "CGACA" "GACAT" "ACATA" "CATAA" "ATAAT" "TAATG"

我可以用

找到计数
kmers %>%
    imap(~ str_count(.y, .x))

返回哪个

#> $CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAA
#>  [1] 1 1 1 1 1 1 4 1 1 1 1 1 2 2 3 3 4 1 1 1 1 1 2 2 3 3 4 1 1 1 1 1 1 1 1
#> [36] 3 3 4 2 1 1 1 4 2 1 1 1 1 1 1 1 2 2 2 4 1 2 2 2 4 2 1 1 1 1 1 1 4 2 1
#> [71] 1
#> 
#> $GGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAAT
#>  [1] 1 1 1 1 1 4 1 1 1 1 1 2 2 3 3 4 1 1 1 1 1 2 2 3 3 4 1 1 1 1 1 1 1 1 3
#> [36] 3 4 2 1 1 1 4 2 1 1 1 1 1 1 1 2 2 2 4 1 2 2 2 4 2 1 1 1 1 1 1 4 2 1 1
#> [71] 1
#> 
#> $GACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATG
#>  [1] 1 1 1 1 4 1 1 1 1 1 2 2 3 3 4 1 1 1 1 1 2 2 3 3 4 1 1 1 1 1 1 1 1 3 3
#> [36] 4 2 1 1 1 4 2 1 1 1 1 1 1 1 2 2 2 4 1 2 2 2 4 2 1 1 1 1 1 1 4 2 1 1 1
#> [71] 1

但是现在我需要返回每个计数为4的唯一k-mer。到目前为止,我唯一的解决方案是获取每个k-mer等于4的索引,然后使用substr这样重新生成k-mer。

kmers >%>
    imap(~ str_count(.y, .x)) %>%
#test for k-mers that appear 4 times
    map(function(y) {
        map_lgl(y, function(x) x == 4)}) %>%
#Get the indexes of the matches
    map(which) %>%
#Recreate the k-mers from each sequence
    imap(function(a,b) {
        map_chr(a, ~ substr(b, .x, .x + k -1))}) %>%
    unlist %>%
    unique

哪个给了我想要的输出

#>  'CGACA' 'GAAGA' 

但是丢弃k-mers然后重新创建它们是低效率的。我如何获得计数,然后使用它们过滤k-mers的原始列表? table()可以工作,但是我无法弄清楚如何使用table()会导致dplyr管道获得带有满足条件的简单字符串列表的结果。

3 个答案:

答案 0 :(得分:1)

您可以flatten kmerscnts以及kmers为4的子集cnts。您可以将此处的值展平,因为最终只需要{ {1}}个。

unique

相同
library(tidyverse)

cnts <- kmers %>% imap(~ str_count(.y, .x))
(kmers %>% flatten_chr)[cnts %>% flatten_int == 4] %>% unique
#[1] "CGACA" "GAAGA"

答案 1 :(得分:1)

我接受Ronak Shah的回答,因为它具有将计数保存到单独列表中,然后使用该列表测试k-mers原始列表的明显特性。但是,我一直在尝试使数据流过一个管道而不必保存中间值。这是我去的地方

map(sequences, function(x) {
    #Generate all k-mers for each sequence
    str_sub(x, a <- seq_len(nchar(x) - k + 1), a + k - 1) %>%
    #Count how many times each k-mer appears in the sequence and keep if equal to 4
    keep(str_count(x, .) == 4)}) %>%
unlist %>%
unique

答案 2 :(得分:1)

这是完全基础的方法。

编辑:基本let border = SKPhysicsBody(edgeLoopFrom: self.frame) border.friction = 0 border.restitution = 1 self.physicsBody = border 调用已更改。您应该考虑对kmer使用其他方法。基本方法快约100倍。如果删除kmer,则基数r仍然快20倍。我还简化了set_names的调用。

lapply(kmers, function (x) ...)

替代字符串方法的性能:

kmers <- lapply(sequences, function(x) substring(x, seq_len(nchar(x) - k + 1), seq_len(nchar(x) - k + 1)+ k - 1))
names(kmers) = sequences

unique(
  unlist(
    lapply(kmers, function(x) names(Filter(function(z) z == 4, table(x))))
    )
  )

[1] "CGACA" "GAAGA"

#Or with no intermediate variables (it doesn't look pretty)
  unique(
    unlist(
      lapply(sequences, function(x) names(Filter(function(z) z == 4, table(substring(x, seq_len(nchar(x) - k + 1), seq_len(nchar(x) - k + 1)+ k - 1)))))
    )
  )

# Or same thing with chains (you can use map instead of lapply):

  lapply(sequences, function(x) {
    table(substring(x, seq_len(nchar(x) - k+1), seq_len(nchar(x) - k+1) + k-1))%>%
      Filter(function(z) z == 4, .)%>%
      names()
    }
  )%>%
    unlist()%>%
    unique()

过滤器的性能-我的解决方案的速度是@Ronak的两倍。 OP的解决方案虽然跳过了创建Unit: microseconds expr min lq mean median uq max neval base_kmers 50.0 52.50 129.598 58.15 71.85 6836.7 100 base_w_names 52.5 55.85 135.946 62.10 77.65 6995.0 100 purr_w_names_kmers 1651.1 7259.55 7683.990 7569.30 7898.35 10476.2 100 purr_kmers 1260.8 1294.65 1424.776 1322.50 1364.00 7395.9 100 变量的速度,但速度却慢了20倍。

kmers