如何将xml文件转换为R中的数据框?

时间:2016-12-13 21:52:11

标签: r xml

我正在尝试使用XML包将xml文件转换为R中的数据框,如下所示:

library("XML")

# The XML report as one R string. Single quotes delimit the R literal so the
# double-quoted XML attributes need no escaping, and every <measure> element
# is closed exactly once so the document is well-formed.
file <- '<?xml version="1.0" encoding="utf-8"?>
<dashboardreport name="host_cpu_report" version="6.5.4.1014" reportdate="2016-12-13T16:23:21.959-05:00" description="">
  <source name="Web Application">
    <filters summary="last 7 days">
      <filter>tf:Last7d</filter>
    </filters>
  </source>
  <reportheader>
    <reportdetails>
      <user>test</user>
    </reportdetails>
  </reportheader>
  <data>
    <chartdashlet name="host_cpu" description="" showabsolutevalues="false">
      <measures structuretype="tree">
        <measure measure="CPU Total Time - CPU Total Time (split by Agent Host)" color="#c04000" aggregation="Maximum" unit="%" thresholds="false" drawingorder="1">
          <measure measure="CPU Total Time - web101" color="#1a40b2" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1481368500000" avg="3.391054992675781" min="0.3517608642578125" max="52.68987274169922" sum="84.77637481689453" count="25"></measurement>
            <measurement timestamp="1481369400000" avg="0.9348518371582031" min="0.20020294189453125" max="8.555328369140625" sum="84.13666534423828" count="90"></measurement>
          </measure>
          <measure measure="CPU Total Time - web02" color="#ebeb7a" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1481337000000" avg="2.2219837733677457" min="0.1999969482421875" max="21.680084228515625" sum="31.107772827148438" count="14"></measurement>
            <measurement timestamp="1481337900000" avg="0.9414346483018663" min="0.05010223388671875" max="29.486526489257812" sum="84.72911834716797" count="90"></measurement>
          </measure>
          <measure measure="CPU Total Time - web03" color="#3a21de" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1481370300000" avg="0.7432420518663194" min="0.05007171630859375" max="9.136299133300781" sum="26.7567138671875" count="36"></measurement>
            <measurement timestamp="1481371200000" avg="1.0104971991644964" min="0.050048828125" max="31.359756469726562" sum="90.94474792480469" count="90"></measurement>
            <measurement timestamp="1481372100000" avg="0.5768865797254774" min="0.0" max="25.47643280029297" sum="51.91979217529297" count="90"></measurement>
            <measurement timestamp="1481373000000" avg="0.870541433270058" min="0.0" max="25.13770294189453" sum="77.47818756103516" count="89"></measurement>
            <measurement timestamp="1481373900000" avg="1.2104591369628905" min="0.0" max="36.33165740966797" sum="108.94132232666016" count="90"></measurement>
          </measure>
        </measure>
      </measures>
    </chartdashlet>
  </data>
</dashboardreport>
'

我需要从这个XML中提取以下内容:

CPU Total Time - web101, timestamp, max 

我是这样做的:

library(plyr)

# Convert the XML text to a nested list, then have plyr apply data.frame()
# to each top-level element and stack the results into one data frame.
df <- ldply(.data = xmlToList(file), .fun = data.frame)

我收到此错误:

Error in data.frame(measurement = c("1481368500000", "3.391054992675781",  : 
  arguments imply differing number of rows: 6, 5

有人知道我哪里做错了吗?

1 个答案:

答案 0 :(得分:2)

修正后的示例文档:

# The sample XML report held in a single R string. The literal is delimited
# with single quotes, so the double-quoted XML attributes need no escaping.
# It is parsed with read_xml() further down.
doc_txt <- '<?xml version="1.0" encoding="utf-8"?>
<dashboardreport name="host_cpu_report" version="6.5.4.1014" reportdate="2016-12-13T16:23:21.959-05:00" description="">
  <source name="Web Application">
    <filters summary="last 7 days">
      <filter>tf:Last7d</filter>
    </filters>
  </source>
  <reportheader>
    <reportdetails>
      <user>test</user>
    </reportdetails>
  </reportheader>
  <data>
    <chartdashlet name="host_cpu" description="" showabsolutevalues="false">
      <measures structuretype="tree">
        <measure measure="CPU Total Time - CPU Total Time (split by Agent Host)" color="#c04000" aggregation="Maximum" unit="%" thresholds="false" drawingorder="1">
          <measure measure="CPU Total Time - web101" color="#1a40b2" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1481368500000" avg="3.391054992675781" min="0.3517608642578125" max="52.68987274169922" sum="84.77637481689453" count="25"></measurement>
            <measurement timestamp="1481369400000" avg="0.9348518371582031" min="0.20020294189453125" max="8.555328369140625" sum="84.13666534423828" count="90"></measurement>
          </measure>
          <measure measure="CPU Total Time - web02" color="#ebeb7a" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1481337000000" avg="2.2219837733677457" min="0.1999969482421875" max="21.680084228515625" sum="31.107772827148438" count="14"></measurement>
            <measurement timestamp="1481337900000" avg="0.9414346483018663" min="0.05010223388671875" max="29.486526489257812" sum="84.72911834716797" count="90"></measurement>
          </measure>
          <measure measure="CPU Total Time - web03" color="#3a21de" aggregation="Maximum" unit="%" thresholds="false">
            <measurement timestamp="1481370300000" avg="0.7432420518663194" min="0.05007171630859375" max="9.136299133300781" sum="26.7567138671875" count="36"></measurement>
            <measurement timestamp="1481371200000" avg="1.0104971991644964" min="0.050048828125" max="31.359756469726562" sum="90.94474792480469" count="90"></measurement>
            <measurement timestamp="1481372100000" avg="0.5768865797254774" min="0.0" max="25.47643280029297" sum="51.91979217529297" count="90"></measurement>
            <measurement timestamp="1481373000000" avg="0.870541433270058" min="0.0" max="25.13770294189453" sum="77.47818756103516" count="89"></measurement>
            <measurement timestamp="1481373900000" avg="1.2104591369628905" min="0.0" max="36.33165740966797" sum="108.94132232666016" count="90"></measurement>
          </measure>
        </measure>
      </measures>
    </chartdashlet>
  </data>
</dashboardreport>
'

代码:

library(xml2)
library(purrr)
library(dplyr)

# Parse the XML text into an xml2 document.
doc <- read_xml(doc_txt)

# Select every <measure> that sits directly inside another <measure>; each of
# these carries its own "measure" label and its <measurement> children.
series_nodes <- xml_find_all(doc, ".//measure/measure")

# Build one tibble per selected node and row-bind them. The two wanted
# attributes are read by name (rather than dropping the unwanted ones with
# select()), so a node without any <measurement> children yields zero rows
# instead of an error, and extra attributes never leak into the result.
map_df(series_nodes, function(series) {
  points <- xml_find_all(series, ".//measurement")
  tibble(
    timestamp = xml_attr(points, "timestamp"),
    max = xml_attr(points, "max"),
    # rep() keeps this column the same length as the others, including zero.
    node = rep(xml_attr(series, "measure"), length(points))
  )
})

产生:

##  # A tibble: 9 × 3
##        timestamp                max                    node
##            <chr>              <chr>                   <chr>
##  1 1481368500000  52.68987274169922 CPU Total Time - web101
##  2 1481369400000  8.555328369140625 CPU Total Time - web101
##  3 1481337000000 21.680084228515625  CPU Total Time - web02
##  4 1481337900000 29.486526489257812  CPU Total Time - web02
##  5 1481370300000  9.136299133300781  CPU Total Time - web03
##  6 1481371200000 31.359756469726562  CPU Total Time - web03
##  7 1481372100000  25.47643280029297  CPU Total Time - web03
##  8 1481373000000  25.13770294189453  CPU Total Time - web03
##  9 1481373900000  36.33165740966797  CPU Total Time - web03

您可以用 filter() 过滤掉不需要的节点,或者把 XPath 写得更精确,只选取需要的节点;如果您希望得到真正的列类型而不是字符型,可以在管道末尾再接一个 mutate():

# Append to the end of the pipeline. The timestamp text is converted to a
# number and divided by 1000 before being turned into POSIXct relative to
# 1970-01-01; max is converted from text to numeric.
mutate(
  timestamp = as.POSIXct(as.numeric(timestamp) / 1000, origin = "1970-01-01"),
  max = as.numeric(max)
)

或者把这两个参数直接加到现有的 mutate() 调用中。