我有这种数据格式:
# Example input: a blank `seqpart` marks a header row whose `seqinfo` holds
# the sequence ID; the rows after it hold that sequence's fragments and have
# a blank `seqinfo`.
df <- data.frame(
  seqpart = factor(c(
    "",
    "ccagttgttg", "tttgattcg", "ctttgtc",
    "",
    "ctttgtcga", "cttagta", "ttactgt", "ttacat"
  )),
  seqinfo = factor(c(
    "IDseq1|specie1", rep("", 3),
    "IDseq2|specie2", rep("", 4)
  ))
)
> df
seqpart seqinfo
<NA> IDseq1|specie1
ccagttgttg <NA>
tttgattcg <NA>
ctttgtc <NA>
<NA> IDseq2|specie2
ctttgtcga <NA>
cttagta <NA>
ttactgt <NA>
ttacat <NA>
我想根据列seqinfo
连接行,以使用这种新格式构建另一个数据:
>df1
seqinfo seq
IDseq1|specie1 ccagttgttgtttgattcgctttgtc
IDseq2|specie2 ctttgtcgacttagtattactgtttacat
有办法做到这一点吗?非常感谢
答案 0 :(得分:3)
通过tidyverse
的另一个想法。我们首先用NA替换''
并填充它们。我们按seqinfo
分组并粘贴唯一的seqparts
,即
library(tidyverse)
# Collapse the fragment rows that follow each header row into one sequence
# per ID. Returns a tibble with one row per `seqinfo` and the concatenated
# fragments in `seqpart`.
df %>%
  # Blank IDs become NA so that fill() can carry each ID down over the
  # fragment rows that belong to it.
  mutate(seqinfo = replace(seqinfo, seqinfo == "", NA)) %>%
  fill(seqinfo) %>%
  # Header rows carry no fragment of their own. Dropping them (instead of
  # back-filling `seqpart` and de-duplicating with unique()) keeps fragments
  # that legitimately occur more than once within a sequence.
  filter(seqpart != "") %>%
  group_by(seqinfo) %>%
  summarise(seqpart = paste(seqpart, collapse = ""))
# A tibble: 2 x 2
#   seqinfo        seqpart
#   <fctr>         <chr>
# 1 IDseq1|specie1 ccagttgttgtttgattcgctttgtc
# 2 IDseq2|specie2 ctttgtcgacttagtattactgtttacat
答案 1 :(得分:2)
我们根据'seqinfo'中非空白元素出现的位置创建分组变量('grp'),在每个分组中取出'seqinfo'的非空白元素,并把该组的'seqpart'用paste连接在一起:
library(data.table)
# Each non-blank `seqinfo` starts a new group (running count in `grp`).
# Within a group, keep the non-blank ID and glue every `seqpart` together;
# the helper column `grp` is removed afterwards and the result printed.
# Note: setDT() converts `df` to a data.table by reference.
setDT(df)[
  ,
  .(
    seqinfo = seqinfo[seqinfo != ""],
    seqpart = paste(seqpart, collapse = "")
  ),
  by = .(grp = cumsum(seqinfo != ""))
][
  , grp := NULL
][]
# seqinfo seqpart
#1: IDseq1|specie1 ccagttgttgtttgattcgctttgtc
#2: IDseq2|specie2 ctttgtcgacttagtattactgtttacat
答案 2 :(得分:1)
另一个data.table解决方案,使用zoo包的na.locf()(last observation carried forward,即用最近一个非缺失观测值向后填充):
library(data.table)
# Turn blank IDs into NA by excluding the "" level, carry each ID forward
# over the fragment rows below it, then concatenate the fragments per ID.
# na.rm = FALSE keeps the filled vector the same length as the column even
# if the first rows have no ID yet; the default (na.rm = TRUE) would drop
# leading NAs and break the `:=` assignment.
data.table(df)[
  , seqinfo := zoo::na.locf(droplevels(seqinfo, exclude = ""), na.rm = FALSE)
][
  , .(seq = paste(seqpart, collapse = "")), by = seqinfo
]
#           seqinfo                           seq
# 1: IDseq1|specie1    ccagttgttgtttgattcgctttgtc
# 2: IDseq2|specie2 ctttgtcgacttagtattactgtttacat
# Example data: nine rows, where rows 1 and 5 are header rows (blank
# `seqpart`, ID in `seqinfo`) and all other rows are sequence fragments
# with a blank `seqinfo`.
df <- data.frame(
  seqpart = factor(c(
    "", "ccagttgttg", "tttgattcg", "ctttgtc",
    "", "ctttgtcga", "cttagta", "ttactgt", "ttacat"
  )),
  # Start from nine blank strings and place the two IDs on the header rows.
  seqinfo = factor(
    replace(character(9), c(1, 5), c("IDseq1|specie1", "IDseq2|specie2"))
  )
)
如果空条目编码为NA而不是"",则可以跳过对droplevels()的调用:
# Same example, but with missing cells written as "<NA>" so that fread()
# reads them as real NA values (no droplevels() step needed).
df1 <- fread(
" seqpart seqinfo
<NA> IDseq1|specie1
ccagttgttg <NA>
tttgattcg <NA>
ctttgtc <NA>
<NA> IDseq2|specie2
ctttgtcga <NA>
cttagta <NA>
ttactgt <NA>
ttacat <NA>",
  na.strings = "<NA>"
)
# Work on a copy so `df1` itself is not modified by `:=`.
# na.rm = FALSE keeps the filled ID vector the same length as the column.
# Header rows have an NA `seqpart`; they must be removed before pasting,
# because paste() would otherwise insert the literal text "NA" at the start
# of every sequence.
copy(df1)[
  , seqinfo := zoo::na.locf(seqinfo, na.rm = FALSE)
][
  , .(seq = paste(seqpart[!is.na(seqpart)], collapse = "")), by = seqinfo
]