我有以下数据框(dat):
library(tidyverse)
# Single-row example tibble: one record with its FASTA header, sequence,
# type label and sequence length.
dat <- structure(
  list(
    fasta_header = "sp|A0A0A0MT76|LJ01_HUMAN",
    sequence = "PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL",
    type = "human",
    seq_len = 42L
  ),
  row.names = c(NA, -1L),
  class = c("tbl_df", "tbl", "data.frame")
)
dat
#> # A tibble: 1 x 4
#> fasta_header sequence type seq_len
#> <chr> <chr> <chr> <int>
#> 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYV… human 42
我想要做的是计算sequence列中氨基酸的频率。必须根据以下已知氨基酸向量进行计数:
# The 20 one-letter amino-acid codes that must each receive a count,
# listed in alphabetical order.
complete_aa <- strsplit("ACDEFGHIKLMNPQRSTVWY", "")[[1]]
complete_aa
#> [1] "A" "C" "D" "E" "F" "G" "H" "I" "K" "L" "M" "N" "P" "Q" "R" "S" "T"
#> [18] "V" "W" "Y"
最终的预期结果如下:
fasta_header sequence aa aa_count
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL A 1
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL C 2
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL D 1
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL E 1
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL F 2
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL G 3
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL H 0
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL I 0
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL K 1
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL L 5
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL M 0
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL N 0
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL P 6
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL Q 3
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL R 4
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL S 4
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL T 3
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL V 3
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL W 2
sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGTGTKVTVL Y 1
如何使用dplyr管道方法做到这一点?
我试过了:
# Attempt from the question: split each sequence into single characters and
# count how often each character occurs per (fasta_header, sequence).
# Limitation: only letters that actually occur in a sequence produce a row,
# so amino acids with a zero count are absent from the result.
dat %>%
# one list element per row, holding that sequence's individual characters
mutate(aa = map(sequence, ~ unlist(str_split(., "")))) %>%
# expand the list column to one row per character
unnest() %>%
group_by(fasta_header, sequence, aa) %>% # one group per distinct letter of each sequence
summarise(aa_count = n()) %>%
as.data.frame()
但它未能包括计数为零的H、I、M、N。
更新:基于@akrun回答。
我用这个新数据框尝试了你的代码:
# Two-record example tibble used to exercise the multi-row case.
dat2 <- structure(
  list(
    fasta_header = c(">seq1", ">seq2"),
    sequence = c("MPSRGTRPE", "VSSKYTFWNF")
  ),
  row.names = c(NA, -2L),
  class = c("tbl_df", "tbl", "data.frame")
)
dat2
#> fasta_header sequence
#> 1 >seq1 MPSRGTRPE
#> 2 >seq2 VSSKYTFWNF
使用此代码:
# Reference set of one-letter amino-acid codes to count.
complete_aa <- c("A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N",
"P", "Q", "R", "S", "T", "V", "W", "Y")
# NOTE: this is the attempt that yields the wrong count discussed below.
# str_count() is called once on the whole `sequence` column rather than once
# per row, so the 2 sequences and the 20 patterns appear to be paired
# element-wise (the shorter vector recycled). Pattern 18 ("V") is then matched
# against the second sequence, and list() wraps that single 20-element result,
# which is reused for every row -- hence V = 1 reported for >seq1.
dat2 %>%
mutate(aa = list(complete_aa), aa_count = list(stringr::str_count(sequence, complete_aa))) %>%
# expand both list columns in parallel: one row per (sequence, amino acid)
unnest %>%
# print up to 100 rows so all 40 result rows are shown
print( n = 100)
但它给出了一行如下:
fasta_header sequence aa aa_count
18 >seq1 MPSRGTRPE V 1
>seq1中V的计数应为零。我该如何修复这个错误?
答案 0 :(得分:4)
以下是使用Biostrings::letterFrequency的解决方案:
library(Biostrings)

# Count every amino acid in `complete_aa` for each sequence in `dat`.
# letterFrequency() returns a count for every requested letter, including 0
# for letters that do not occur, so absent amino acids still get a row.
dat %>%
  mutate(
    aa = list(complete_aa),
    # BString() wraps a single string, so build one per row; this keeps the
    # pipeline correct when `dat` holds more than one sequence.
    aa_count = lapply(
      sequence,
      function(x) letterFrequency(BString(x), letters = complete_aa)
    )
  ) %>%
  # expand `aa` and `aa_count` in parallel: one row per (sequence, amino acid)
  unnest() %>%
  select(-type, -seq_len)
## A tibble: 20 x 4
# fasta_header sequence aa aa_count
# <chr> <chr> <chr> <int>
# 1 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… A 1
# 2 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… C 2
# 3 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… D 1
# 4 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… E 1
# 5 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… F 2
# 6 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… G 3
# 7 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… H 0
# 8 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… I 0
# 9 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… K 1
#10 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… L 5
#11 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… M 0
#12 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… N 0
#13 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… P 6
#14 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Q 3
#15 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… R 4
#16 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… S 4
#17 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… T 3
#18 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… V 3
#19 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… W 2
#20 sp|A0A0A0MT76|LJ01_HUMAN PSRLLLQPSPQRADPRCWPRGFWSEPQSLCYVFGT… Y 1
Biostrings还提供了直接读取和解析fasta文件的方法,请参阅?readDNAStringSet(旧名为read.DNAStringSet;蛋白质序列可使用readAAStringSet)。
对于您的第二个示例,解决方案是:
# Per-sequence amino-acid counts for `dat2`: letterFrequency() is applied to
# each sequence separately, then the list columns are expanded to long format.
dat2 %>%
  mutate(aa = list(complete_aa)) %>%
  mutate(aa_count = lapply(sequence, function(s) {
    letterFrequency(BString(s), letters = complete_aa)
  })) %>%
  unnest()
以上代码以长格式生成数据。如果需要,请使用spread将其从长格式重塑为宽格式。
# fasta_header sequence aa aa_count
#1 >seq1 MPSRGTRPE A 0
#2 >seq1 MPSRGTRPE C 0
#3 >seq1 MPSRGTRPE D 0
#4 >seq1 MPSRGTRPE E 1
#5 >seq1 MPSRGTRPE F 0
#6 >seq1 MPSRGTRPE G 1
#7 >seq1 MPSRGTRPE H 0
#8 >seq1 MPSRGTRPE I 0
#9 >seq1 MPSRGTRPE K 0
#10 >seq1 MPSRGTRPE L 0
#11 >seq1 MPSRGTRPE M 1
#12 >seq1 MPSRGTRPE N 0
#13 >seq1 MPSRGTRPE P 2
#14 >seq1 MPSRGTRPE Q 0
#15 >seq1 MPSRGTRPE R 2
#16 >seq1 MPSRGTRPE S 1
#17 >seq1 MPSRGTRPE T 1
#18 >seq1 MPSRGTRPE V 0
#19 >seq1 MPSRGTRPE W 0
#20 >seq1 MPSRGTRPE Y 0
#21 >seq2 VSSKYTFWNF A 0
#22 >seq2 VSSKYTFWNF C 0
#23 >seq2 VSSKYTFWNF D 0
#24 >seq2 VSSKYTFWNF E 0
#25 >seq2 VSSKYTFWNF F 2
#26 >seq2 VSSKYTFWNF G 0
#27 >seq2 VSSKYTFWNF H 0
#28 >seq2 VSSKYTFWNF I 0
#29 >seq2 VSSKYTFWNF K 1
#30 >seq2 VSSKYTFWNF L 0
#31 >seq2 VSSKYTFWNF M 0
#32 >seq2 VSSKYTFWNF N 1
#33 >seq2 VSSKYTFWNF P 0
#34 >seq2 VSSKYTFWNF Q 0
#35 >seq2 VSSKYTFWNF R 0
#36 >seq2 VSSKYTFWNF S 2
#37 >seq2 VSSKYTFWNF T 1
#38 >seq2 VSSKYTFWNF V 1
#39 >seq2 VSSKYTFWNF W 1
#40 >seq2 VSSKYTFWNF Y 1
答案 1 :(得分:1)
根据输入数据,我们可以创建list列,然后unnest:
library(stringr)
library(dplyr)
library(tidyr) # provides unnest()

# Count every amino acid in `complete_aa` for each row of `dat`.
dat %>%
  mutate(
    aa = list(complete_aa),
    # Count per row: calling str_count() on the whole `sequence` column would
    # pair sequences with patterns element-wise instead of counting all
    # letters within each sequence, giving wrong counts on multi-row input.
    aa_count = lapply(sequence, str_count, pattern = complete_aa)
  ) %>%
  # expand `aa` and `aa_count` in parallel: one row per (sequence, amino acid)
  unnest()
如果有多行,请先执行group_by,然后在summarise中创建list列:
# Multi-row variant: one set of counts per distinct
# (fasta_header, sequence, type) combination.
dat %>%
  group_by(fasta_header, sequence, type) %>%
  summarise(
    aa = list(complete_aa),
    # `sequence` is a grouping key, so every value within a group is the same;
    # take the first so duplicated input rows cannot trigger element-wise
    # pairing of sequences against `complete_aa`.
    aa_count = list(str_count(sequence[1], complete_aa))
  ) %>%
  # summarise() only drops the last grouping level; remove the rest so the
  # result is a plain, ungrouped tibble
  ungroup() %>%
  unnest()