如何根据已知字符向量统计数据框中序列的各字符出现次数

时间:2018-04-09 03:17:59

标签: r dplyr tidyverse

我有以下数据框(dat):

library(tidyverse)
# Example input: a one-row tibble with a single protein sequence in `sequence`.
dat <- structure(list( fasta_header = "sp|A0A0A0MT76|LJ01_HUMAN",
  sequence = "PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL",
  type = "human", seq_len = 42L), .Names = c( "fasta_header","sequence", "type", "seq_len"), row.names = c(NA, -1L), class = c("tbl_df","tbl", "data.frame"))
dat
#> # A tibble: 1 x 4
#>   fasta_header             sequence                          type  seq_len
#>   <chr>                    <chr>                             <chr>   <int>
#> 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYV… human      42

我想要做的是统计sequence列中各氨基酸出现的次数。必须根据以下已知氨基酸向量进行计数:

# The 20 one-letter amino-acid codes that every sequence is counted against;
# letters missing from a sequence should still be reported with a count of 0.
complete_aa <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", 
"P", "Q", "R", "S", "T", "V", "W", "Y")

complete_aa
#>  [1] "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T"
#> [18] "V" "W" "Y"

最终的预期结果如下:

               fasta_header                                   sequence aa aa_count
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  A        1
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  C        2
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  D        1
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  E        1
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  F        2
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  G        3
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  H        0
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  I        0
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  K        1
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  L        5
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  M        0
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  N        0
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  P        6
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  Q        3
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  R        4
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  S        4
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  T        3
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  V        3
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  W        2
  sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL  Y        1

如何使用dplyr管道方法做到这一点?

我试过了:

# Attempt: split each sequence into single characters, unnest to one row per
# character, then count the rows in each (fasta_header, sequence, aa) group.
# complete_aa is never consulted here, so a letter that does not occur in the
# sequence forms no group and gets no row at all (rather than a count of 0).
dat %>% 
  mutate(aa = map(sequence, ~ unlist(str_split(., "")))) %>% 
  unnest() %>% 
  group_by(fasta_header, sequence, aa) %>% # one group per letter that actually occurs
  summarise(aa_count = n()) %>% 
  as.data.frame()

但它未能包括H, I, M, N

更新:基于@akrun回答。

我用这个新数据框尝试了你的代码:

# Second example input: a two-row tibble, one sequence per row.
dat2 <- structure(list(fasta_header = c(">seq1", ">seq2"), sequence = c("MPSRGTRPE", 
"VSSKYTFWNF")), .Names = c("fasta_header", "sequence"), row.names = c(NA, 
-2L), class = c("tbl_df", "tbl", "data.frame"))

dat2
#>   fasta_header   sequence
#> 1        >seq1  MPSRGTRPE
#> 2        >seq2 VSSKYTFWNF

使用此代码:

# The 20 one-letter amino-acid codes to count.
complete_aa <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", 
"P", "Q", "R", "S", "T", "V", "W", "Y")


# NOTE: str_count() is vectorised over both of its arguments, so the two-row
# `sequence` column is recycled against the 20 patterns: odd-numbered letters
# are counted in row 1's sequence and even-numbered letters in row 2's. That
# single vector is then wrapped in list() and reused for every row, which is
# why ">seq1" reports V = 1 (the 18th letter was counted in row 2's sequence).
dat2 %>% 
   mutate(aa = list(complete_aa), aa_count = list(stringr::str_count(sequence, complete_aa))) %>%
   unnest %>% 
  print( n = 100)

但它给出了一行如下:

  fasta_header  sequence   aa    aa_count
18 >seq1        MPSRGTRPE  V            1

V的计数应为零。我该如何修复这个错误?

2 个答案:

答案 0 :(得分:4)

以下是使用Biostrings::letterFrequency的解决方案:

library(Biostrings);
# Tally every letter of complete_aa in the sequence with letterFrequency(),
# expand the two paired list-columns into one row per amino acid, and drop
# the columns that are not part of the expected result.
dat %>%
  mutate(
    aa = list(complete_aa),
    aa_count = list(
      letterFrequency(BString(sequence), letters = complete_aa)
    )
  ) %>%
  unnest() %>%
  select(-type, -seq_len)
## A tibble: 20 x 4
#   fasta_header             sequence                             aa    aa_count
#   <chr>                    <chr>                                <chr>    <int>
# 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… A            1
# 2 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… C            2
# 3 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… D            1
# 4 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… E            1
# 5 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… F            2
# 6 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… G            3
# 7 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… H            0
# 8 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… I            0
# 9 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… K            1
#10 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… L            5
#11 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… M            0
#12 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… N            0
#13 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… P            6
#14 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Q            3
#15 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… R            4
#16 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… S            4
#17 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… T            3
#18 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… V            3
#19 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… W            2
#20 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Y            1

Biostrings还提供了直接读取和解析fasta文件的方法,请参阅?read.DNAStringSet

更新

对于您的第二个示例,解决方案是:

# lapply() calls letterFrequency() once per element of `sequence`, so every
# row gets its own vector of counts before the list-columns are unnested.
dat2 %>%
  mutate(
    aa = list(complete_aa),
    aa_count = lapply(
      sequence,
      function(one_seq) letterFrequency(BString(one_seq), letters = complete_aa)
    )
  ) %>%
  unnest()

以上代码以长格式生成数据。如果需要,请使用spread将其从长格式重塑为宽格式。

#   fasta_header   sequence aa aa_count
#1         >seq1  MPSRGTRPE  A        0
#2         >seq1  MPSRGTRPE  C        0
#3         >seq1  MPSRGTRPE  D        0
#4         >seq1  MPSRGTRPE  E        1
#5         >seq1  MPSRGTRPE  F        0
#6         >seq1  MPSRGTRPE  G        1
#7         >seq1  MPSRGTRPE  H        0
#8         >seq1  MPSRGTRPE  I        0
#9         >seq1  MPSRGTRPE  K        0
#10        >seq1  MPSRGTRPE  L        0
#11        >seq1  MPSRGTRPE  M        1
#12        >seq1  MPSRGTRPE  N        0
#13        >seq1  MPSRGTRPE  P        2
#14        >seq1  MPSRGTRPE  Q        0
#15        >seq1  MPSRGTRPE  R        2
#16        >seq1  MPSRGTRPE  S        1
#17        >seq1  MPSRGTRPE  T        1
#18        >seq1  MPSRGTRPE  V        0
#19        >seq1  MPSRGTRPE  W        0
#20        >seq1  MPSRGTRPE  Y        0
#21        >seq2 VSSKYTFWNF  A        0
#22        >seq2 VSSKYTFWNF  C        0
#23        >seq2 VSSKYTFWNF  D        0
#24        >seq2 VSSKYTFWNF  E        0
#25        >seq2 VSSKYTFWNF  F        2
#26        >seq2 VSSKYTFWNF  G        0
#27        >seq2 VSSKYTFWNF  H        0
#28        >seq2 VSSKYTFWNF  I        0
#29        >seq2 VSSKYTFWNF  K        1
#30        >seq2 VSSKYTFWNF  L        0
#31        >seq2 VSSKYTFWNF  M        0
#32        >seq2 VSSKYTFWNF  N        1
#33        >seq2 VSSKYTFWNF  P        0
#34        >seq2 VSSKYTFWNF  Q        0
#35        >seq2 VSSKYTFWNF  R        0
#36        >seq2 VSSKYTFWNF  S        2
#37        >seq2 VSSKYTFWNF  T        1
#38        >seq2 VSSKYTFWNF  V        1
#39        >seq2 VSSKYTFWNF  W        1
#40        >seq2 VSSKYTFWNF  Y        1

答案 1 :(得分:1)

根据输入数据,我们可以创建list列和unnest

library(stringr)
library(dplyr)
# Build two parallel list-columns (the letters and their counts) and unnest
# them into one row per amino acid.
# str_count() is vectorised over both `string` and `pattern`; calling it on
# the whole `sequence` column would recycle the sequences against complete_aa
# and mix counts between rows as soon as there is more than one row. lapply()
# scores one sequence at a time, so the result is correct for any row count.
dat %>%
  mutate(
    aa = list(complete_aa),
    aa_count = lapply(sequence, str_count, pattern = complete_aa)
  ) %>%
  unnest()

如果有多行,请先执行group_by,然后在summarise中创建list列:

# For several sequences: `sequence` is a grouping variable, so each group
# holds a single distinct sequence and str_count() only ever compares that one
# sequence against complete_aa inside summarise().
dat %>%
  group_by(fasta_header, sequence, type) %>%
  summarise(
    aa = list(complete_aa),
    aa_count = list(str_count(sequence, complete_aa))
  ) %>%
  # summarise() only peels off the last grouping variable; drop the remaining
  # groups so the result is a plain, ungrouped tibble.
  ungroup() %>%
  unnest()