# Fetch the GenBank record and keep its first 100,000 bases, then convert
# them to upper-case text for the k-mer counting further down.
dog_ch38 <- read.GenBank("NC_006620.3")$NC_006620.3[1:100000]
dog_ch38 <- str_to_upper(c2s(ape::as.character.DNAbin(dog_ch38)))
#' Convert k-mers to 1-based positions in a base-4 lookup table
#'
#' Each k-mer is read as a base-4 number with A = 0, C = 1, G = 2, T = 3
#' (leftmost letter most significant). 1 is added so the result can be used
#' directly as an R vector subscript: "AAAAA" -> 1, "TTTTT" -> 1024.
#'
#' @param kmer Character vector of upper-case k-mers (usually a single one).
#' @return Unnamed numeric vector with one index per element of `kmer`.
#'   An element that is `NA`, empty, or contains any character other than
#'   A, C, G, T (for example "N") yields `NA`; callers must skip those
#'   before using the result as a subscript.
kmer_to_index <- function(kmer) {
  letter_value <- c("A" = 0, "C" = 1, "G" = 2, "T" = 3)

  index_one <- function(single_kmer) {
    if (is.na(single_kmer) || !nzchar(single_kmer)) {
      return(NA_real_)
    }
    # Letters missing from the table look up as NA, which sum() propagates.
    digits <- letter_value[strsplit(single_kmer, "", fixed = TRUE)[[1]]]
    # Place values 4^(n-1), ..., 4^1, 4^0 from left to right.
    place_values <- 4^rev(seq_along(digits) - 1)
    1 + sum(digits * place_values)
  }

  # Handle each element separately so a vector argument is never silently
  # truncated to its first element.
  vapply(as.character(kmer), index_one, numeric(1), USE.NAMES = FALSE)
}
# Count every overlapping k-mer in dog_ch38.
# kmers[i] is the number of occurrences of the k-mer whose kmer_to_index()
# value is i, so the table has 4^k slots.
k <- 5
kmers <- numeric(4^k)

# Work on one single string. The original loop warned that the sequence had
# 100000 elements (N was a vector), so collapse it before measuring.
dna <- paste(dog_ch38, collapse = "")
N <- nchar(dna)

# Guard against sequences shorter than k, where 1:(N - k + 1) would count
# downwards instead of producing an empty range.
if (N >= k) {
  starts <- seq_len(N - k + 1)
  windows <- substring(dna, starts, starts + k - 1)
  index <- vapply(windows, kmer_to_index, numeric(1), USE.NAMES = FALSE)
  # Windows containing anything other than A/C/G/T (e.g. "N") have no slot
  # in the table. Their NA index is what triggered
  # "NAs are not allowed in subscripted assignments", so drop them.
  kmers <- kmers + tabulate(index[!is.na(index)], nbins = length(kmers))
}
但出现这样的错误:
Error in kmers[index] <- kmers[index] + 1 :
NAs are not allowed in subscripted assignments
In addition: Warning messages:
1: In 1:(N - k + 1) :
numerical expression has 100000 elements: only the first used
2: In n:1 : numerical expression has 100000 elements: only the first used
# dog_ch38 has no dimensions, so the matrix-style subscript dog_ch38[1, ]
# fails. seqinr::count() takes a vector of single characters and, by default,
# counts over the lower-case alphabet a/c/g/t, so collapse the sequence to one
# string, lower-case it and split it into characters first.
seqinr::count(
  seqinr::s2c(tolower(paste(dog_ch38, collapse = ""))),
  wordsize = 5
)
Error in dog_ch38[1, ] : incorrect number of dimensions
我实际上期望看到的结果是:
## aaaaa accaa taaag aataa
## 75 75 47 92
我在这方面完全是新手。如果有人能指导我如何解决这个问题,最好附上一些示例,我将不胜感激。谢谢!
答案 0 :(得分:1)
我认为 Biostrings 包中的 oligonucleotideFrequency() 函数可以提供帮助。下面是一个使用人工数据的示例。
library(Biostrings) # requires appropriate Bioconductor install
# Build a random 100,000-base DNA sequence and count all of its 5-mers.
s1 <- DNAString(paste(
  sample(c("A", "C", "G", "T"), size = 10^5, replace = TRUE),
  collapse = ""
))
kmers <- oligonucleotideFrequency(s1, width = 5)
该函数返回一个命名数值向量,其中包含所有可能的 k-mer 的计数。您可以按名称从中提取感兴趣的 k-mer。在这个例子中,每个 5-mer 的计数应该约为 100(约 10^5 个窗口 / 4^5 = 1024 种 5-mer ≈ 98)。
# Look up the count of a single 5-mer by name in the vector of counts.
kmers["AATAA"] # actual count varies because of random sampling
> AATAA
> 102
请查看此函数的帮助页面。使用默认参数时,它统计的是相互重叠的 k-mer。可以使用 step 选项来控制这一点,如以下示例所示:
# A short 11-base sequence in which the same 5-mers occur more than once.
s2 <- DNAString("AATAATAATAA")
# Default step: consecutive 5-base windows overlap.
kmers1 <- oligonucleotideFrequency(s2, width = 5)
# step = 5: the window advances 5 bases at a time.
kmers2 <- oligonucleotideFrequency(s2, width = 5, step = 5)
# See all the 5-mers found with step = 1 (default) versus step = 5
# (subsetting with != 0 keeps only the 5-mers that actually occur).
kmers1[kmers1 != 0]
> AATAA ATAAT TAATA
> 3 2 2
kmers2[kmers2 != 0]
> AATAA TAATA
> 1 1
我曾经觉得(现在仍然觉得)处理 DNA 序列的各种格式令人困惑;看起来需要将 read.GenBank() 返回的紧凑二进制格式转换为 Biostrings 所使用的字符表示形式。这两种格式都非常高效。
可以对 read.GenBank() 返回的二进制对象列表进行转换,也可以使用 as.character = TRUE 选项让它直接返回字符。这里展示后一种方法。
# Using package ape to read GenBank file, Biostrings for analysis
library(ape)
library(Biostrings)
# By default, read.GenBank returns a list of DNA sequences in compact binary form.
# This asks it to return a list of character vectors.
dog_ch38 <- read.GenBank("NC_006620.3", as.character = TRUE)
# Inspect the result: a one-element list holding a character vector with
# one base per entry (lower case; this region starts with "n").
str(dog_ch38)
> List of 1
> $ NC_006620.3: chr [1:23914537] "n" "n" "n" "n" ...
> - attr(*, "species")= chr "Canis_lupus_familiaris"
# Now convert the first (and only) member of the list to a single character string
txt <- paste(dog_ch38[[1]], collapse = "")
# Sanity check: the string length should equal the number of bases above.
print(nchar(txt))
> [1] 23914537
# And now convert the character string to a DNAString
s <- DNAString(txt)
# This is the form that can be handed to oligonucleotideFrequency
# 1:10^5 is 1:100000 (^ binds tighter than :), i.e. the first 100,000 bases.
km <- oligonucleotideFrequency(s[1:10^5], 5)
# Named lookup of a single 5-mer count.
km["AATAA"]
> AATAA
> 176