Question

我遇到了xml文件的问题。我无法从这些xml结构中获得正确的数据框。

<Node1 Value1="start">                            
    <Node2 Value2="1110" Value3="345">              
        <Node3 Value4="500" Value5="3">               
            <Node4 Value6="484663" Value7="130"  /> 
            <Node4 Value6="253234" Value7="59"  />   
            <Node4 Value6="198476" Value7="131" />  
        </Node3>                                      
        <Node3 Value4="113" Value5="1">               
            <Node4 Value6="2009343" Value7="555"  /> 
            <Node4 Value6="2530931" Value7="333"  />   
            <Node4 Value6="1984761" Value7="111" />  
        </Node3>                                      
    </Node2>                                        
</Node1>

我使用以下代码来获取数据框，但结果中有很多NA。如何让data.table重复填充上层节点的属性值，而不是填入NA？

library(data.table)                                                                       
library(XML)                                                                              
# test.xml = the xml-file 

# Parse the file as an internal-node document so it can be queried with XPath.
test <- xmlTreeParse("test.xml", useInternalNodes = TRUE)

# Select every element in the document and turn each element's own
# attributes into a named list (one future row per element).
all_nodes <- test["//*"]
attr_rows <- lapply(all_nodes, function(node) as.list(xmlAttrs(node)))

# Stack the rows, matching columns by name; attributes that an element does
# not carry itself are filled with NA.
Node1 <- rbindlist(attr_rows, fill = TRUE, use.names = TRUE)

期望的结果如下所示：

Value1  Value2  Value3 Value4 Value5 Value6   Value7
start   1110    345    500    3      484663   130
start   1110    345    500    3      253234   59
start   1110    345    500    3      198476   131 
start   1110    345    113    1      2009343  555
start   1110    345    113    1      2530931  333
start   1110    345    113    1      1984761  111

Answer 1

# For each <Node4>, collect the attributes of the node itself and of all of
# its ancestors; the simplified result holds one column per <Node4>.
b1 <- xpathSApply(doc, "//Node4", xmlAncestors, xmlAttrs)

# Flip it so that every <Node4> becomes a row.
b1 <- t(b1)

# Flatten the cells into a matrix of the same shape, keeping only the
# column names, then convert that matrix to a data frame.
b1 <- matrix(
  unlist(b1),
  nrow = nrow(b1),
  ncol = ncol(b1),
  dimnames = list(NULL, colnames(b1))
)
b1 <- data.frame(b1, stringsAsFactors = FALSE)
b1
#   Value1 Value2 Value3 Value4 Value5  Value6 Value7
# 1  start   1110    345    500      3  484663    130
# 2  start   1110    345    500      3  253234     59
# 3  start   1110    345    500      3  198476    131
# 4  start   1110    345    113      1 2009343    555
# 5  start   1110    345    113      1 2530931    333
# 6  start   1110    345    113      1 1984761    111

数据：

library(XML)

# Build the sample document from an inline string. `asText = TRUE` states
# explicitly that the argument is XML content rather than a file name or URL.
doc <- xmlParse(
  '<Node1 Value1="start"> <Node2 Value2="1110" Value3="345"> <Node3 Value4="500" Value5="3"> <Node4 Value6="484663" Value7="130" /> <Node4 Value6="253234" Value7="59" /> <Node4 Value6="198476" Value7="131" /> </Node3> <Node3 Value4="113" Value5="1"> <Node4 Value6="2009343" Value7="555" /> <Node4 Value6="2530931" Value7="333" /> <Node4 Value6="1984761" Value7="111" /> </Node3> </Node2> </Node1>',
  asText = TRUE
)

Answer 2

使用tidyr包中的fill函数：

# Expects dplyr (`%>%`, `filter`) and tidyr (`fill`) to be attached.
# Gather the attributes of all elements into one data frame, carry the
# values in columns 1-6 downwards into the rows below them, then keep only
# the rows that have a Value7.
XML:::xmlAttrsToDataFrame(doc["//*"]) %>%
  fill(1:6) %>%
  filter(!is.na(Value7))
  Value1 Value2 Value3 Value4 Value5  Value6 Value7
1  start   1110    345    500      3  484663    130
2  start   1110    345    500      3  253234     59
3  start   1110    345    500      3  198476    131
4  start   1110    345    113      1 2009343    555
5  start   1110    345    113      1 2530931    333
6  start   1110    345    113      1 1984761    111

将xml转换为不带NA的数据框

2 个答案: