Question

我有一长串序列如下

AAAAAACGTTATGATCGATC
AAAATTCGCGCTTAGAGATC
AAGCTACGCATGCATCGACT
AAAAAACGTTATGATCGATC
AAAAAACGTTATGATCGATC
AAAATTCGCGCTTAGAGATC etc.

我还有一个较短的列表，我想统计短列表中的每个元素在长列表中出现的次数，并将结果绘制成直方图。我想这有点像 Vlookup 函数。在 R 中该怎么做？

Answer 1

match 和 table 适用于您的字符向量。下面是一个使用随机字母的例子：

# Fix the RNG seed so the random draw below is repeatable.
# NOTE(review): the "##" outputs are from the original author's run; the exact
# letters drawn presumably depend on the R version's sampling algorithm.
set.seed(1492)

# Demo data: 100 letters (lower or upper case), drawn with replacement.
dat <- sample(x = c(letters, LETTERS), size = 100, replace = TRUE)
dat

##   [1] "o" "l" "j" "f" "c" "a" "S" "A" "u" "N" "H" "H" "k" "B" "B" "P" "g"
##  [18] "r" "I" "V" "H" "t" "g" "F" "e" "W" "E" "D" "r" "Y" "h" "Z" "R" "l"
##  [35] "Z" "K" "v" "f" "b" "q" "M" "P" "i" "u" "w" "m" "S" "g" "f" "g" "G"
##  [52] "h" "q" "T" "J" "M" "K" "m" "X" "Q" "f" "x" "t" "B" "k" "z" "I" "Y"
##  [69] "z" "g" "z" "u" "O" "k" "G" "L" "n" "B" "A" "A" "J" "p" "U" "F" "E"
##  [86] "X" "R" "J" "G" "L" "H" "o" "z" "r" "d" "r" "V" "H" "S" "I"

# For every element of dat, its position in the lookup vector LETTERS;
# elements that are not in LETTERS (the lower-case ones) come back as NA.
matches <- match(x = dat, table = LETTERS)

# Tally the positions that were actually found, skipping the NA misses.
match_counts <- table(matches[which(!is.na(matches))])
match_counts

## 
##  1  2  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 
##  3  4  1  2  2  3  5  3  3  2  2  2  1  1  2  1  2  3  1  1  2  1  2  2  2

# The table is labelled by position; swap each position for its letter.
names(match_counts) <- LETTERS[as.integer(names(match_counts))]
match_counts

## A B D E F G H I J K L M N O P Q R S T U V W X Y Z 
## 3 4 1 2 2 3 5 3 3 2 2 2 1 1 2 1 2 3 1 1 2 1 2 2 2

# Bar chart of the counts, smallest to largest.
barplot(height = sort(match_counts), col = "#649388")

enter image description here

Answer 2

假设序列是字符串。

 # Console-paste idiom: readLines() is called without a connection, so it reads
 # from standard input. When this snippet is pasted into an interactive R
 # session, the six lines following the call are consumed as data instead of
 # being evaluated as R code. Do not insert anything between the call and the
 # data lines.
 lines <- readLines(n=6)
 AAAAAACGTTATGATCGATC
 AAAATTCGCGCTTAGAGATC
 AAGCTACGCATGCATCGACT
 AAAAAACGTTATGATCGATC
 AAAAAACGTTATGATCGATC
 AAAATTCGCGCTTAGAGATC

 # Same idiom: the single line after the call becomes the short list.
 shortlist <- readLines(n=1)
 AGTD

在这里，我假设短列表中的每个元素都是单个字符，因为问题中没有说明清楚。

  # Build a regex alternation from the short list by inserting "|" at every
  # zero-width position that lies between two letters.
  pat1 <- gsub(pattern="(?<=[A-Za-z])(?=[A-Za-z])", replacement="|",
               x=shortlist, perl=TRUE)
  pat1
  #[1] "A|G|T|D"

  library(stringr)
  # Distinct letters of the short list, used as factor levels so that letters
  # with no hit still show up in the table with a count of zero.
  lvls <- unique(str_extract_all(shortlist, "[A-Za-z]")[[1]])

  # Collect every match of pat1 across all sequences, then tabulate per letter.
  t1 <- local({
    hits <- unlist(regmatches(lines, gregexpr(pat1, lines)))
    table(factor(hits, levels=lvls))
  })

  t1
  #
  # A  G  T  D 
  #47 21 29  0 

  barplot(height=t1, col="#649388")

更新

如果您的 shortlist 如下所示，并且您希望统计每个完整字符串（而不是字符串中的单个字符）出现的频率，可以这样做：

  # Console-paste idiom: readLines() without a connection reads from standard
  # input, so in an interactive session the four lines after the call are
  # consumed as data. Do not insert anything between the call and the data.
  shortlist1 <- readLines(n=4)
  AAGCTACGCATGCATCGACT
  AAAAAACGTTATGATCGATC
  AAAAAACGTTATCT
  AAAAAACG

  # Whole-string match against any entry of the short list. The alternation
  # must be wrapped in a group: without the parentheses "^" binds only to the
  # first alternative and "$" only to the last, so the entries in between
  # would also match as substrings of longer sequences.
  pat2 <- paste0("^(", paste(shortlist1, collapse="|"), ")$")
  # Factor levels keep short-list entries with no match in the table (count 0).
  lvls1 <- unique(shortlist1)

  t2 <- table(factor(unlist(regmatches(lines,gregexpr(pat2, lines))), levels=lvls1))
  t2

  #AAGCTACGCATGCATCGACT AAAAAACGTTATGATCGATC       AAAAAACGTTATCT 
  #                   1                    3                    0     
  #            AAAAAACG 
  #                   0 

  barplot(t2, col="#649388")

Answer 3

尝试：

# Long list: the sequences to be searched (duplicates are expected).
longlist <- c(
  "AAAAAACGTTATGATCGATC",
  "AAAATTCGCGCTTAGAGATC",
  "AAGCTACGCATGCATCGACT",
  "AAAAAACGTTATGATCGATC",
  "AAAAAACGTTATGATCGATC",
  "AAGCTACGCATGCATCGACT",
  "AAGCTACGCATGCATCGACT",
  "AAAAAACGTTATGATCGATC",
  "AAAAAACGTTATGATCGATC"
)
# Short list: the sequences whose occurrences in longlist are to be counted.
shortlist <- c(
  "AAAAAACGTTATGATCGATC",
  "AAGCTACGCATGCATCGACT"
)

longlist
[1] "AAAAAACGTTATGATCGATC" "AAAATTCGCGCTTAGAGATC" "AAGCTACGCATGCATCGACT" "AAAAAACGTTATGATCGATC" "AAAAAACGTTATGATCGATC"
[6] "AAGCTACGCATGCATCGACT" "AAGCTACGCATGCATCGACT" "AAAAAACGTTATGATCGATC" "AAAAAACGTTATGATCGATC"

shortlist
[1] "AAAAAACGTTATGATCGATC" "AAGCTACGCATGCATCGACT"

# One row per short-list entry: `var` is the sequence, `freq` is the number of
# elements of longlist that are exactly equal to it.
# Built in one vectorised step instead of growing the data frame row by row:
# this handles an empty shortlist (zero rows) and keeps `freq` numeric rather
# than coercing it to character through c().
# USE.NAMES = FALSE stops the sequences from becoming row names.
outdf <- data.frame(
  var = shortlist,
  freq = vapply(
    shortlist,
    function(target) sum(longlist == target),
    numeric(1),
    USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)
outdf 
                   var freq
1 AAAAAACGTTATGATCGATC    5
2 AAGCTACGCATGCATCGACT    3

# Make sure the counts are numeric before using them as bar heights.
outdf[["freq"]] <- as.numeric(outdf[["freq"]])
# One bar per short-list sequence, labelled with the sequence itself.
barplot(height = outdf[["freq"]], names.arg = outdf[["var"]])

enter image description here
可以轻松使用以下内容查看完整长列表的频率和条形图：

# Count how often each distinct sequence occurs in the whole long list.
table(longlist)
longlist
AAAAAACGTTATGATCGATC AAAATTCGCGCTTAGAGATC AAGCTACGCATGCATCGACT 
                   5                    1                    3 

# Bar chart of the occurrence count of every distinct sequence in longlist.
barplot(height = table(longlist))

enter image description here

如何绘制一个列表中的各个元素在另一个列表中出现的频率

3 个答案:

更新