我有一长串序列如下
AAAAAACGTTATGATCGATC
AAAATTCGCGCTTAGAGATC
AAGCTACGCATGCATCGACT
AAAAAACGTTATGATCGATC
AAAAAACGTTATGATCGATC
AAAATTCGCGCTTAGAGATC etc.
我还有一个较短的列表,我想统计短列表中的每个元素在长列表中出现的次数,并将结果绘制为直方图。我想这类似于 VLOOKUP 函数。我怎样才能在 R 中做到这一点?
答案 0 :(得分:1)
`match` 和 `table` 适用于您的字符向量。下面是一个使用随机字母的例子:
# Reproducible demo data: 100 draws, with replacement, from the 52 lower-
# and upper-case letters.
set.seed(1492)
dat <- sample(c(letters, LETTERS), size = 100, replace = TRUE)
dat
## [1] "o" "l" "j" "f" "c" "a" "S" "A" "u" "N" "H" "H" "k" "B" "B" "P" "g"
## [18] "r" "I" "V" "H" "t" "g" "F" "e" "W" "E" "D" "r" "Y" "h" "Z" "R" "l"
## [35] "Z" "K" "v" "f" "b" "q" "M" "P" "i" "u" "w" "m" "S" "g" "f" "g" "G"
## [52] "h" "q" "T" "J" "M" "K" "m" "X" "Q" "f" "x" "t" "B" "k" "z" "I" "Y"
## [69] "z" "g" "z" "u" "O" "k" "G" "L" "n" "B" "A" "A" "J" "p" "U" "F" "E"
## [86] "X" "R" "J" "G" "L" "H" "o" "z" "r" "d" "r" "V" "H" "S" "I"

# For every element of `dat`, match() returns its position in LETTERS (the
# "short list"), or NA when the element is not in LETTERS at all.
matches <- match(dat, LETTERS)

# Tabulate only the positions that were actually found.
is_hit <- !is.na(matches)
match_counts <- table(matches[is_hit])
match_counts
##
## 1 2 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
## 3 4 1 2 2 3 5 3 3 2 2 2 1 1 2 1 2 3 1 1 2 1 2 2 2

# The table is labelled with positions in LETTERS; translate them back to
# the letters themselves.
names(match_counts) <- LETTERS[as.integer(names(match_counts))]
match_counts
## A B D E F G H I J K L M N O P Q R S T U V W X Y Z
## 3 4 1 2 2 3 5 3 3 2 2 2 1 1 2 1 2 3 1 1 2 1 2 2 2

# Bar chart of the counts, smallest to largest.
barplot(sort(match_counts), col="#649388")
答案 1 :(得分:1)
假设序列是字符串。
# Long list of sequences, one string per element.
# Written as a literal vector rather than readLines(n = 6) followed by raw
# data lines: that idiom only works when pasted into an interactive console,
# whereas under source() the data lines are parsed as R code and fail.
lines <- c(
  "AAAAAACGTTATGATCGATC",
  "AAAATTCGCGCTTAGAGATC",
  "AAGCTACGCATGCATCGACT",
  "AAAAAACGTTATGATCGATC",
  "AAAAAACGTTATGATCGATC",
  "AAAATTCGCGCTTAGAGATC"
)

# Short list: a single string whose individual characters are the items
# to be counted.
shortlist <- "AGTD"
在这里,我假设短列表中的每个元素都是单个字符,因为问题中没有说明清楚。
library(stringr)

# Distinct letters of the short list, in order of first appearance; used as
# factor levels so that letters with no occurrences still get a zero count.
lvls <- unique(str_extract_all(shortlist, "[A-Za-z]")[[1]])

# Insert "|" at every zero-width position between two adjacent letters,
# turning the short list into a regex alternation of its letters.
pat1 <- gsub("(?<=[A-Za-z])(?=[A-Za-z])", "|", shortlist, perl=TRUE)
pat1
#[1] "A|G|T|D"

# Extract every match of the pattern from every sequence, pool them, and
# count per letter.
letter_hits <- regmatches(lines, gregexpr(pat1, lines))
t1 <- table(factor(unlist(letter_hits), levels=lvls))
t1
#
# A G T D
#47 21 29 0
barplot(t1, col="#649388")
如果您的 `shortlist` 如下所示,并且您希望获得每个完整字符串的频率,而不是字符串中各个字符的频率,可以这样做:
# Short list of whole sequences to look up in the long list.
# Written as a literal vector rather than readLines(n = 4) followed by raw
# data lines: that idiom only works when pasted into an interactive console,
# whereas under source() the data lines are parsed as R code and fail.
shortlist1 <- c(
  "AAGCTACGCATGCATCGACT",
  "AAAAAACGTTATGATCGATC",
  "AAAAAACGTTATCT",
  "AAAAAACG"
)
# Regex that matches a line only when the WHOLE line equals one of the
# short-list strings. The alternatives are wrapped in a group so that ^ and $
# apply to every alternative: without the parentheses, "^a|b|c$" anchors only
# the first alternative at the start and the last at the end, and the middle
# ones match as substrings anywhere in a line.
# NOTE(review): assumes the short-list strings contain no regex
# metacharacters (true for plain letter sequences) -- confirm for other data.
pat2 <- paste0("^(", paste(shortlist1, collapse="|"), ")$")

# Factor levels in short-list order, so strings that never occur still show
# up with a count of zero.
lvls1 <- unique(shortlist1)

# Collect the whole-line matches and count them per short-list string.
whole_line_hits <- unlist(regmatches(lines, gregexpr(pat2, lines)))
t2 <- table(factor(whole_line_hits, levels=lvls1))
t2
#AAGCTACGCATGCATCGACT AAAAAACGTTATGATCGATC AAAAAACGTTATCT
# 1 3 0
# AAAAAACG
# 0
barplot(t2, col="#649388")
答案 2 :(得分:1)
尝试:
# Long list of sequences to search in.
longlist <- c(
  "AAAAAACGTTATGATCGATC", "AAAATTCGCGCTTAGAGATC", "AAGCTACGCATGCATCGACT",
  "AAAAAACGTTATGATCGATC", "AAAAAACGTTATGATCGATC", "AAGCTACGCATGCATCGACT",
  "AAGCTACGCATGCATCGACT", "AAAAAACGTTATGATCGATC", "AAAAAACGTTATGATCGATC"
)
# Short list of sequences whose occurrences should be counted.
shortlist <- c("AAAAAACGTTATGATCGATC", "AAGCTACGCATGCATCGACT")

longlist
## [1] "AAAAAACGTTATGATCGATC" "AAAATTCGCGCTTAGAGATC" "AAGCTACGCATGCATCGACT" "AAAAAACGTTATGATCGATC" "AAAAAACGTTATGATCGATC"
## [6] "AAGCTACGCATGCATCGACT" "AAGCTACGCATGCATCGACT" "AAAAAACGTTATGATCGATC" "AAAAAACGTTATGATCGATC"
shortlist
## [1] "AAAAAACGTTATGATCGATC" "AAGCTACGCATGCATCGACT"

# One row per short-list entry: `var` is the sequence, `freq` the number of
# exact matches in `longlist`. Built in one step with vapply() instead of
# growing the data frame row by row, so `freq` is numeric from the start (no
# character round-trip through c()) and an empty `shortlist` yields a
# zero-row data frame instead of the bogus iterations of 1:length().
outdf <- data.frame(
  var = shortlist,
  freq = vapply(
    shortlist,
    function(s) sum(longlist == s),
    numeric(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
outdf
##                    var freq
## 1 AAAAAACGTTATGATCGATC    5
## 2 AAGCTACGCATGCATCGACT    3

# Bar chart of the counts, labelled with the sequences.
barplot(outdf$freq, names.arg = outdf$var)
也可以使用以下代码轻松查看完整长列表中各序列的频率并绘制条形图:
# Frequency of every distinct sequence in the full long list. The table is
# computed once and reused for both printing and plotting; the console
# output is kept as comments so the snippet parses as a script.
longlist_counts <- table(longlist)
longlist_counts
## longlist
## AAAAAACGTTATGATCGATC AAAATTCGCGCTTAGAGATC AAGCTACGCATGCATCGACT
##                    5                    1                    3
barplot(longlist_counts)