我以这种方式获得了一个txt文件:
-- SMART RESULTS TEXTFORMAT --
USER_PROTEIN_ID = SAUSA300_RS14200
SMART_PROTEIN_ID = uniprot|Q5HCS9|Q5HCS9_STAAC
NUMBER_OF_FEATURES_FOUND=1
DOMAIN=transmembrane_domain
START=7
END=29
EVALUE=0
TYPE=INTRINSIC
STATUS=visible|OK
-- FINISHED --
-- SMART RESULTS TEXTFORMAT --
USER_PROTEIN_ID = SAUSA300_RS11975
SMART_PROTEIN_ID = uniprot|A6QJ58|A6QJ58_STAAE
NUMBER_OF_FEATURES_FOUND=0
-- FINISHED --
-- SMART RESULTS TEXTFORMAT --
USER_PROTEIN_ID = SAUSA300_RS14395
SMART_PROTEIN_ID = uniprot|Q2FDK5|SRAP_STAA3
NUMBER_OF_FEATURES_FOUND=1
DOMAIN=Pfam:Gram_pos_anchor
START=2221
END=2258
EVALUE=6e-08
TYPE=PFAM
STATUS=visible|OK
-- FINISHED --
我想提取“SMART RESULTS TEXTFORMAT”和“FINISHED”之间的信息,并将每个USER_PROTEIN_ID对应的各个字段导出为表格中的一行。
有人可以提供一些代码吗?我自己写了一段比较复杂的代码,结果最终保存在一个数据框中,但我不知道接下来如何把每条记录导出为Excel中的一行。
# Collect, for every record in `myfile`, the lines found between its
# start marker and its end marker.
# NOTE(review): assumes `myfile` is a one-column data frame holding one line
# of the text file per row -- confirm against the code that reads the file.
start <- "-- SMART RESULTS TEXTFORMAT --"
end <- "-- FINISHED --"
n <- nrow(myfile)
index <- seq_len(n)
# Prepend the row number, so the text ends up in column 2.
myfile <- cbind(index, myfile)
# fixed = TRUE: the markers are literal text, not regular expressions.
starline <- as.data.frame(grep(start, myfile[, 2], fixed = TRUE))
endline <- as.data.frame(grep(end, myfile[, 2], fixed = TRUE))
# Every start marker needs a matching end marker to form a record.
stopifnot(nrow(starline) == nrow(endline))
indexlist <- cbind(starline, endline)
# Iterate over the records, not over all n lines of the file: rows of
# `indexlist` beyond the last record do not exist and yield NA bounds.
n_records <- nrow(indexlist)
newlist <- vector("list", n_records)
for (i in seq_len(n_records)) {
  index1 <- indexlist[i, 1] + 1
  index2 <- indexlist[i, 2] - 1
  # seq() with length.out avoids a descending index1:index2 range when
  # a record has no lines between its markers.
  rows <- seq(index1, length.out = max(index2 - index1 + 1, 0))
  newlist[[i]] <- as.character(myfile[rows, 2])
}
答案 0 :(得分:0)
您编写的代码看起来不错。
对于Excel导出,请查看xlsx包。
答案 1 :(得分:0)
我认为下面的代码应该能满足您的需求:
library("magrittr")
# Turn the SMART text export into a data frame with one row per record.
# `txt` is expected to be a character vector with one element per line.
data <- split(txt, cumsum(grepl("-- SMART RESULTS TEXTFORMAT --", txt))) %>%
  # Drop the marker lines so only the "KEY=VALUE" lines of a record remain.
  lapply(function(i) i[!grepl("-- SMART RESULTS TEXTFORMAT --|-- FINISHED --", i)]) %>%
  lapply(function(i) {
    # strip.white removes the blanks around "=" from keys and values;
    # quote = "" and comment.char = "" keep quote and "#" characters in a
    # value from being interpreted by read.table.
    read.table(
      text = i, sep = "=", header = FALSE,
      strip.white = TRUE, quote = "", comment.char = ""
    ) %>%
      # Transpose: the keys become the first row, the values the second.
      t(.) %>%
      tibble::as_tibble() %>%
      magrittr::set_colnames(trimws(.[1, ])) %>%
      # slice() comes from dplyr, which is not attached here, so it is
      # called with its namespace; this drops the row holding the keys.
      dplyr::slice(-1)
  }) %>%
  # Records have different sets of keys; missing columns are filled with NA.
  plyr::rbind.fill()
对于导出,我强烈推荐rio包:
# Write the combined table to an Excel workbook.
rio::export(x = data, file = "data.xlsx")
这是我准备数据的方式(我先将文本写入文件,再读回来,以模拟读取文件的步骤):
# Sample SMART output, kept as a single multi-line string.
txt <- "-- SMART RESULTS TEXTFORMAT --
USER_PROTEIN_ID = SAUSA300_RS14200
SMART_PROTEIN_ID = uniprot|Q5HCS9|Q5HCS9_STAAC
NUMBER_OF_FEATURES_FOUND=1
DOMAIN=transmembrane_domain
START=7
END=29
EVALUE=0
TYPE=INTRINSIC
STATUS=visible|OK
-- FINISHED --
-- SMART RESULTS TEXTFORMAT --
USER_PROTEIN_ID = SAUSA300_RS11975
SMART_PROTEIN_ID = uniprot|A6QJ58|A6QJ58_STAAE
NUMBER_OF_FEATURES_FOUND=0
-- FINISHED --
-- SMART RESULTS TEXTFORMAT --
USER_PROTEIN_ID = SAUSA300_RS14395
SMART_PROTEIN_ID = uniprot|Q2FDK5|SRAP_STAA3
NUMBER_OF_FEATURES_FOUND=1
DOMAIN=Pfam:Gram_pos_anchor
START=2221
END=2258
EVALUE=6e-08
TYPE=PFAM
STATUS=visible|OK
-- FINISHED --"
# Round-trip through a file so that `txt` ends up as a character vector
# with one element per line, as it would after reading a real export.
writeLines(text = txt, con = "test.txt")
txt <- readLines(con = "test.txt")