我有一个制表符分隔(tab-delimited)的文件,包含以下信息:
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_23_167_0_1
MAASDASDWEPWERIWERIWER
>ss_23_167_0_1
QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_45_201_0_1
ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
对于像 ss_45_201_0_1 和 ss_23_167_0_1 这样的ID,存在多个条目,我只想保留其中序列长度最大的那个条目。我希望获得如下输出:
>fasta
>ss_23_122_0_1
MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS
>ss_23_167_0_1
WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW
>ss_45_201_0_1
HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER
>ss_89_10_0_2
NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP
我在R中尝试了以下代码,但它失败了
Unique(fasta)
有人能指导我吗?对于同一ID下长度不同的多个条目,如何才能只保留最长的序列?
答案 0 :(得分:2)
以下三个选项需要考虑。
先用 unlist 将列表展开,对其使用 nchar,然后使用 ave 找出要保留的值。
# Length of every sequence, named by its ID (repeated IDs share a name).
seq_lengths <- nchar(unlist(l))
# Within each ID, flag the entries whose length equals that ID's maximum.
# ave() returns the flags in the original order as numbers, hence the
# as.logical() conversion before subsetting.
is_longest <- ave(
  seq_lengths,
  names(seq_lengths),
  FUN = function(len) len == max(len)
)
l[as.logical(is_longest)]
# $ss_23_122_0_1
# [1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
#
# $ss_23_167_0_1
# [1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
#
# $ss_45_201_0_1
# [1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
#
# $ss_89_10_0_2
# [1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
使用 "reshape2" 中的 melt 创建 data.frame(并转换为 data.table)。结合 rank 与 nchar 进行子集筛选。(我使用了 rank 而不是 ==,这样就不必调用 nchar 两次——尚未比较两者的效率。)
library(data.table)
library(reshape2)
# Flatten the named list `l` into one row per sequence (columns `value`, `L1`).
# reshape2::melt is called explicitly because data.table also exports melt().
#
# Lengths are ranked in DESCENDING order within each ID so that rank 1 is the
# longest sequence; an ascending rank would keep the shortest one instead.
# ties.method = "min" keeps every sequence tied for longest, whereas the
# default "average" would give tied entries a fractional rank and drop them.
as.data.table(reshape2::melt(l))[
  , Rnk := rank(-nchar(as.character(value)), ties.method = "min"),
  by = L1
][Rnk == 1]
# value L1 Rnk
# 1: MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2: WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW ss_23_167_0_1 1
# 3: HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER ss_45_201_0_1 1
# 4: NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
与 "data.table" 类似的方法(使用 dplyr):
library(dplyr)
library(reshape2)
# Flatten the named list `l` into one row per sequence (columns `value`, `L1`),
# then rank sequence lengths in DESCENDING order within each ID so that rank 1
# is the longest sequence; an ascending rank would keep the shortest instead.
# dense_rank() gives tied lengths the same rank, so ties for longest are kept.
reshape2::melt(l) %>%
  group_by(L1) %>%
  mutate(Rnk = dense_rank(desc(nchar(as.character(value))))) %>%
  filter(Rnk == 1)
# Source: local data frame [4 x 3]
# Groups: L1
#
# value L1 Rnk
# 1 MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS ss_23_122_0_1 1
# 2 WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW ss_23_167_0_1 1
# 3 HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER ss_45_201_0_1 1
# 4 NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP ss_89_10_0_2 1
答案 1 :(得分:1)
也许有一种更优雅的方式...
# Example input: a named list of sequences in which an ID may occur repeatedly.
l <- list(
  ss_23_122_0_1 = "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS",
  ss_23_167_0_1 = "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW",
  ss_23_167_0_1 = "MAASDASDWEPWERIWERIWER",
  ss_23_167_0_1 = "QWEKCKLSDOIEOWIOWEUWWEUWEZURZEWURZUWEUZUQZUWZUE",
  ss_45_201_0_1 = "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER",
  ss_45_201_0_1 = "ZTTRASOIIDIFOSDIOFISDOFSDFQAWTZETQWE",
  ss_89_10_0_2 = "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"
)
# Group the sequences by ID.
res <- split(l, names(l))
# Position of the longest sequence within each ID (which.max() picks the first
# one on ties). vapply() is used instead of sapply() so the lengths are
# guaranteed to come back as an integer vector.
ind <- lapply(split(vapply(l, nchar, integer(1)), names(l)), which.max)
# For every ID keep only the entry at that position; bind the result to a
# name so it can be reused, then print it.
longest <- Map(function(x, y) x[y], res, ind)
longest
$ss_23_122_0_1
$ss_23_122_0_1$ss_23_122_0_1
[1] "MJSDHWTEZTZEWUIASUDUAISDUASADIASDIAUSIDAUSIDCASDAS"
$ss_23_167_0_1
$ss_23_167_0_1$ss_23_167_0_1
[1] "WEIURIOWERWKLEJDSAJFASDGASZDTTQZWTEZQWTEZUQWEZQWTEZQTWEZTQW"
$ss_45_201_0_1
$ss_45_201_0_1$ss_45_201_0_1
[1] "HZTMKSKDIUWZUWEZTZWERWUEOIRUOEROOWEWERSDFSDFRRRETERTER"
$ss_89_10_0_2
$ss_89_10_0_2$ss_89_10_0_2
[1] "NJZTIWEIOIOIPIEPWIQPOEIQWIEPOQWIEPOQWIEPQIWEP"