我目前正在尝试在 R 中解析 RDP 多分类器的层次结构文件,不过这个问题具有更普遍的适用性。基本上,我创建了一个列表,其中包含若干数据框,每个数据框来自一个由“分层”行组成的文件:
dput(corner(hierlist$hier_M2MID06_Trimmed_noGaps.fas_fixrank.txt,n=c(7,10)))
structure(list(X1 = structure(c(30L, 31L, 163L, 45L, 64L, 65L,
66L), .Label = c("-1071", "-1102", "-1153", "-1159", "-1176",
"-1177", "-1207", "-1241", "-1256", "-1281", "-1332", "-1353",
"-1354", "-1502", "-1567", "-18", "-2", "-2715", "-423", "-460",
"-463", "-471", "-567", "-568", "-828", "-842", "-843", "-871",
"-980", "0", "1", "1031", "1069", "1070", "1093", "1101", "1126",
"1151", "1152", "1158", "1159", "1164", "1165", "1166", "1175",
"1176", "1195", "1200", "1206", "1207", "1215", "1216", "1217",
"1219", "1240", "1251", "1255", "1256", "1261", "1269", "1279",
"1280", "1282", "1330", "1331", "1339", "1341", "1343", "1348",
"1352", "1353", "1354", "1355", "1356", "1357", "1358", "1360",
"1501", "1566", "16", "1668", "1672", "1674", "17", "1762", "1763",
"1764", "1767", "1883", "1884", "1885", "1891", "1893", "1894",
"2", "2164", "2179", "2180", "2183", "2184", "2187", "2192",
"2195", "2208", "2209", "2210", "2211", "2259", "2260", "2333",
"2371", "2372", "254", "255", "261", "264", "2684", "2713", "2714",
"274", "3", "35", "422", "458", "459", "46", "462", "470", "48",
"49", "54", "565", "566", "567", "570", "577", "581", "648",
"653", "657", "659", "804", "805", "806", "807", "808", "817",
"818", "819", "820", "822", "824", "825", "826", "827", "829",
"832", "834", "837", "838", "839", "840", "841", "842", "843",
"844", "846", "848", "870", "886", "887", "908", "918", "927",
"929", "950", "957", "978", "979", "taxid"), class = "factor"),
X2 = structure(c(3L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("Root",
"lineage", "null"), class = "factor"), X3 = structure(c(1L,
3L, 3L, 3L, 3L, 3L, 3L), .Label = c("Root", "name", "rootrank"
), class = "factor"), X4 = structure(c(2L, 1L, 1L, 1L, 1L,
1L, 1L), .Label = c("Bacteria", "no rank", "rank"), class = "factor"),
X5 = structure(c(1L, 3L, 3L, 3L, 3L, 3L, 3L), .Label = c("194",
"M2MID06_Trimmed_noGaps.fas", "domain"), class = "factor"),
X6 = structure(c(NA, NA, 10L, 10L, 10L, 10L, 10L), .Label = c("",
"Acidobacteria", "Actinobacteria", "Bacteroidetes", "Cyanobacteria/Chloroplast",
"Firmicutes", "Gemmatimonadetes", "Nitrospira", "Planctomycetes",
"Proteobacteria", "Spirochaetes", "Verrucomicrobia", "unclassified_Bacteria"
), class = "factor"), X7 = structure(c(NA, 2L, 3L, 3L, 3L,
3L, 3L), .Label = c("", "Bacteria", "phylum"), class = "factor"),
X8 = structure(c(NA, 21L, NA, 8L, 8L, 8L, 8L), .Label = c("",
"Acidobacteria_Gp3", "Acidobacteria_Gp4", "Actinobacteria",
"Alphaproteobacteria", "Bacilli", "Bacteroidetes_incertae_sedis",
"Betaproteobacteria", "Chloroplast", "Deltaproteobacteria",
"Flavobacteria", "Gammaproteobacteria", "Gemmatimonadetes",
"Nitrospira", "Phycisphaerae", "Planctomycetacia", "Sphingobacteria",
"Spirochaetes", "Subdivision3", "Verrucomicrobiae", "domain",
"unclassified_Bacteroidetes", "unclassified_Proteobacteria"
), class = "factor"), X9 = structure(c(NA, 2L, 11L, 14L,
14L, 14L, 14L), .Label = c("", "194", "Acidobacteria", "Actinobacteria",
"Bacteroidetes", "Cyanobacteria/Chloroplast", "Firmicutes",
"Gemmatimonadetes", "Nitrospira", "Planctomycetes", "Proteobacteria",
"Spirochaetes", "Verrucomicrobia", "class", "unclassified_Bacteria"
), class = "factor"), X10 = structure(c(NA, NA, 29L, NA,
22L, 22L, 22L), .Label = c("", "Actinobacteridae", "Bdellovibrionales",
"Burkholderiales", "Caulobacterales", "Chloroplast", "Chromatiales",
"Flavobacteriales", "Gemmatimonadales", "Gp3", "Gp4", "Lactobacillales",
"Legionellales", "Methylophilales", "Nitrospirales", "Ohtaekwangia",
"Phycisphaerales", "Planctomycetales", "Pseudomonadales",
"Rhizobiales", "Rhodobacterales", "Rhodocyclales", "Rhodospirillales",
"Sphingobacteriales", "Sphingomonadales", "Spirochaetales",
"Subdivision3_genera_incertae_sedis", "Verrucomicrobiales",
"phylum", "unclassified_Alphaproteobacteria", "unclassified_Betaproteobacteria",
"unclassified_Deltaproteobacteria", "unclassified_Gammaproteobacteria"
), class = "factor")), .Names = c("X1", "X2", "X3", "X4",
"X5", "X6", "X7", "X8", "X9", "X10"), row.names = 2:8, class = "data.frame")
这基本上意味着各行是逐级递进的,并且在越来越靠后的列中被填充为 NA。但是,无法事先知道某一特定行中第一个 NA 出现在哪里。在第一个 NA 列之前的两列,正是我真正感兴趣的两列:归入该分类级别的重叠群(contig)数量,以及该分类级别的名称。
我已经创建了一个列表,其中包含每个数据框的索引,这些索引用于为每一行选出最后一个非 NA 列:
library(plyr)
# For every data frame in `hierlist`, find the index of the last non-NA column
# of each row (0 when the whole row is NA).
#
# The per-row maximum is taken inside apply() itself. Taking it in a second
# pass over apply()'s result gives wrong answers whenever every row has the
# same number of non-NA cells, because apply() then simplifies its result to a
# matrix instead of returning a list of index vectors.
lastcollist <- lapply(hierlist, function(p) {
  apply(p, 1, function(x) {
    non_na <- which(!is.na(x))
    if (length(non_na) > 0) max(non_na) else 0
  })
})
# Pair each row number with its last non-NA column index.
# seq_along() is used instead of seq(1, length(x)), which counts down (1, 0)
# on empty input. The argument keeps the name `x` so that cbind() still labels
# the second column "x".
lastcollist.idx <- llply(lastcollist, function(x) cbind(seq_along(x), x))
此处 lastcollist.idx 将包含每一行的行号,以及该行最后一个非 NA 列的索引:
head(lastcollist.idx$hier_M2MID06_Trimmed_noGaps.fas_fixrank.txt)
x
[1,] 1 5
[2,] 2 5
[3,] 3 9
[4,] 4 11
[5,] 5 13
[6,] 6 15
所以我现在想做的是创建一个新列表,其中的数据框对每个给定的行只包含最后选出的列(或者只包含最后一列,即 lastcollist.idx 中的变量 x)。
这将是给定示例的期望输出:
dput(rbind(c('domain','194'),c('Proteobacteria','Phylum'),c('Betaproteobacteria','class'),c ('class','Rhodocyclales'),c('class','Rhodocyclales'),c('class','Rhodocyclales')))
structure(c("domain", "Proteobacteria", "Betaproteobacteria",
"class", "class", "class", "194", "Phylum", "class", "Rhodocyclales",
"Rhodocyclales", "Rhodocyclales"), .Dim = c(6L, 2L))
我必须承认,我一时想不出该如何做到这一点。非常欢迎任何提示。我不是 R 的新手,所以你不必在解释上花太多时间。
对于更大的可重复示例,请考虑来自bioconductor库的数据集'khanmiss'(bioconductor library impute)。
# Install and attach the Bioconductor package `impute`, then load its
# `khanmiss` example data set.
# NOTE(review): biocLite() is the legacy Bioconductor installer; newer
# Bioconductor releases use BiocManager::install() instead -- confirm which
# one applies to your R version.
source("http://bioconductor.org/biocLite.R")
biocLite("impute")
# library() stops with an error when the package is missing, whereas require()
# only returns FALSE and lets the script carry on without it.
library(impute)
data(khanmiss)
这基本上是一个在若干位置引入了 NA 的数据框。它的层次结构与我的文件并不完全相同,但足以达到目的。由于这个数据框很不方便(共有 2309 个观测值,而只有 222 行包含缺失值),我选出了含有缺失值的行,并随机添加了 78 行不含缺失值的行,组成一个新的 data.frame。然后将此 data.frame 拆分成一个由 4 个任意大小的数据框组成的列表(行数合计为 300)。
# Build a 300-row test set from khanmiss: every row that contains an NA plus
# 78 randomly drawn complete rows, then split it into a list of 4 data frames.

# (row, col) position of every NA cell; TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
isnadf <- as.data.frame(which(is.na(khanmiss), arr.ind = TRUE))
na.rows <- sort(unique(isnadf$row))
length(na.rows) #the dataset has 222 rows which contain NA
na.khanmiss <- khanmiss[na.rows, ]
# Stay with row positions throughout. `na.rows` holds positions, so comparing
# it against rownames() and converting back with as.numeric() only works when
# the row names happen to equal the row positions.
notna.rows <- setdiff(seq_len(nrow(khanmiss)), na.rows)
# The draw is random, so the selected rows differ between runs.
notna.rows.selected <- sort(sample(notna.rows, 78))
notna.selected.khanmiss <- khanmiss[notna.rows.selected, ]
khanmiss.selected <- rbind(na.khanmiss, notna.selected.khanmiss)
dfsizes <- c(82, 74, 79, 65) #arbitrarily selected, adds up to 300
# Consecutive runs of rows are labelled "a".."d" and split on that label.
khanmiss.list <- split(khanmiss.selected, rep(letters[1:4], dfsizes))
最终给出的列表与我的数据集有些相似。
答案 0 :(得分:2)
类似下面这样(未经测试)的代码可能有效:
# For each row of `dfrm`, return the two cells immediately before the first NA:
# the cell just before it first, then the one before that.
# The original one-liner had an unbalanced extra ")" and did not parse.
# This version still requires every row to contain an NA; a row without one
# makes the index NA and the subsetting fails.
apply(dfrm, 1, function(r) {
  first_na <- which(is.na(r))[1]
  r[(first_na - 1):(first_na - 2)]
})
我通常用来把这类文本输出读入为数据框的方法在本例中失败了,因此我建议你贴出 dput 的输出,而不是从屏幕上复制粘贴的内容。(在我看来,你读入数据时应该使用 header = TRUE,因为你的第一行数据看起来不像数据。)
使用新数据(并且意识到需要对不含 NA 的情况进行判断):
# For each row of `hierlist`, return the last two usable cells: the cell just
# before the first NA, followed by the one before that.
# A row without any NA is treated as if its first NA sat one position past the
# end, so it yields its last two cells in the same (last, second-to-last)
# order. The original no-NA branch returned cells n-2 and n-1 in ascending
# order, which skipped the final column and disagreed with the NA branch.
apply(hierlist, 1, function(r) {
  na_pos <- which(is.na(r))
  first_na <- if (length(na_pos) > 0) na_pos[1] else length(r) + 1L
  r[(first_na - 1):(first_na - 2)]
})
#--------------------------------------
2 3 4 5
[1,] "194" "domain" "phylum" "class"
[2,] "no rank" "Bacteria" "Proteobacteria" "Betaproteobacteria"
6 7 8
[1,] "Betaproteobacteria" "Betaproteobacteria" "Betaproteobacteria"
[2,] "class" "class" "class"