我正在尝试使用XML包将xml文件转换为R中的数据框,如下所示:
library("XML")
# XML report held as a single character string.
# The literal is wrapped in single quotes because the XML attribute values use
# double quotes; wrapping it in double quotes would end the string at the first
# attribute and make the snippet a syntax error.
# Every <measure> element is closed exactly once so the document is well-formed.
file <- c('<?xml version="1.0" encoding="utf-8"?>
<dashboardreport name="host_cpu_report" version="6.5.4.1014" reportdate="2016-12-13T16:23:21.959-05:00" description="">
<source name="Web Application">
<filters summary="last 7 days">
<filter>tf:Last7d</filter>
</filters>
</source>
<reportheader>
<reportdetails>
<user>test</user>
</reportdetails>
</reportheader>
<data>
<chartdashlet name="host_cpu" description="" showabsolutevalues="false">
<measures structuretype="tree">
<measure measure="CPU Total Time - CPU Total Time (split by Agent Host)" color="#c04000" aggregation="Maximum" unit="%" thresholds="false" drawingorder="1">
<measure measure="CPU Total Time - web101" color="#1a40b2" aggregation="Maximum" unit="%" thresholds="false">
<measurement timestamp="1481368500000" avg="3.391054992675781" min="0.3517608642578125" max="52.68987274169922" sum="84.77637481689453" count="25"></measurement>
<measurement timestamp="1481369400000" avg="0.9348518371582031" min="0.20020294189453125" max="8.555328369140625" sum="84.13666534423828" count="90"></measurement>
</measure>
<measure measure="CPU Total Time - web02" color="#ebeb7a" aggregation="Maximum" unit="%" thresholds="false">
<measurement timestamp="1481337000000" avg="2.2219837733677457" min="0.1999969482421875" max="21.680084228515625" sum="31.107772827148438" count="14"></measurement>
<measurement timestamp="1481337900000" avg="0.9414346483018663" min="0.05010223388671875" max="29.486526489257812" sum="84.72911834716797" count="90"></measurement>
</measure>
<measure measure="CPU Total Time - web03" color="#3a21de" aggregation="Maximum" unit="%" thresholds="false">
<measurement timestamp="1481370300000" avg="0.7432420518663194" min="0.05007171630859375" max="9.136299133300781" sum="26.7567138671875" count="36"></measurement>
<measurement timestamp="1481371200000" avg="1.0104971991644964" min="0.050048828125" max="31.359756469726562" sum="90.94474792480469" count="90"></measurement>
<measurement timestamp="1481372100000" avg="0.5768865797254774" min="0.0" max="25.47643280029297" sum="51.91979217529297" count="90"></measurement>
<measurement timestamp="1481373000000" avg="0.870541433270058" min="0.0" max="25.13770294189453" sum="77.47818756103516" count="89"></measurement>
<measurement timestamp="1481373900000" avg="1.2104591369628905" min="0.0" max="36.33165740966797" sum="108.94132232666016" count="90"></measurement>
</measure>
</measure>
</measures>
</chartdashlet>
</data>
</dashboardreport>
')
我需要从这个xml中捕获这些:
CPU Total Time - web101, timestamp, max
我是这样做的:
library(plyr)
# Convert the XML into a nested list with xmlToList(), then have ldply() call
# data.frame() on each list element and row-bind the results.
# This is the call that raises the "differing number of rows" error quoted
# below: data.frame() is handed components of unequal length (6 vs 5).
df<-ldply(xmlToList(file), data.frame)
我收到此错误:
Error in data.frame(measurement = c("1481368500000", "3.391054992675781", :
arguments imply differing number of rows: 6, 5
有谁知道我哪里做错了吗?
答案 0 :(得分:2)
下面是修正后的示例文档(字符串用单引号包裹,XML 标签正确闭合):
# XML report text stored as one R string. The literal is delimited with single
# quotes so the double quotes around the XML attribute values need no escaping.
doc_txt <- '<?xml version="1.0" encoding="utf-8"?>
<dashboardreport name="host_cpu_report" version="6.5.4.1014" reportdate="2016-12-13T16:23:21.959-05:00" description="">
<source name="Web Application">
<filters summary="last 7 days">
<filter>tf:Last7d</filter>
</filters>
</source>
<reportheader>
<reportdetails>
<user>test</user>
</reportdetails>
</reportheader>
<data>
<chartdashlet name="host_cpu" description="" showabsolutevalues="false">
<measures structuretype="tree">
<measure measure="CPU Total Time - CPU Total Time (split by Agent Host)" color="#c04000" aggregation="Maximum" unit="%" thresholds="false" drawingorder="1">
<measure measure="CPU Total Time - web101" color="#1a40b2" aggregation="Maximum" unit="%" thresholds="false">
<measurement timestamp="1481368500000" avg="3.391054992675781" min="0.3517608642578125" max="52.68987274169922" sum="84.77637481689453" count="25"></measurement>
<measurement timestamp="1481369400000" avg="0.9348518371582031" min="0.20020294189453125" max="8.555328369140625" sum="84.13666534423828" count="90"></measurement>
</measure>
<measure measure="CPU Total Time - web02" color="#ebeb7a" aggregation="Maximum" unit="%" thresholds="false">
<measurement timestamp="1481337000000" avg="2.2219837733677457" min="0.1999969482421875" max="21.680084228515625" sum="31.107772827148438" count="14"></measurement>
<measurement timestamp="1481337900000" avg="0.9414346483018663" min="0.05010223388671875" max="29.486526489257812" sum="84.72911834716797" count="90"></measurement>
</measure>
<measure measure="CPU Total Time - web03" color="#3a21de" aggregation="Maximum" unit="%" thresholds="false">
<measurement timestamp="1481370300000" avg="0.7432420518663194" min="0.05007171630859375" max="9.136299133300781" sum="26.7567138671875" count="36"></measurement>
<measurement timestamp="1481371200000" avg="1.0104971991644964" min="0.050048828125" max="31.359756469726562" sum="90.94474792480469" count="90"></measurement>
<measurement timestamp="1481372100000" avg="0.5768865797254774" min="0.0" max="25.47643280029297" sum="51.91979217529297" count="90"></measurement>
<measurement timestamp="1481373000000" avg="0.870541433270058" min="0.0" max="25.13770294189453" sum="77.47818756103516" count="89"></measurement>
<measurement timestamp="1481373900000" avg="1.2104591369628905" min="0.0" max="36.33165740966797" sum="108.94132232666016" count="90"></measurement>
</measure>
</measure>
</measures>
</chartdashlet>
</data>
</dashboardreport>
'
代码:
library(xml2)
library(purrr)
library(dplyr)
# Parse the XML text into an xml2 document.
doc <- read_xml(doc_txt)

# For every <measure> that sits directly inside another <measure>, gather the
# attributes of all its <measurement> descendants into rows, keep only the
# timestamp and max columns, and label each row with the parent's "measure"
# attribute. The per-node results are row-bound into a single tibble.
map_df(
  xml_find_all(doc, ".//measure/measure"),
  function(measure_node) {
    measurement_nodes <- xml_find_all(measure_node, ".//measurement")
    # One row per <measurement>; one column per XML attribute (all character).
    attr_rows <- map_df(
      measurement_nodes,
      function(measurement_node) as.list(xml_attrs(measurement_node))
    )
    trimmed <- select(attr_rows, -min, -avg, -sum, -count)
    mutate(trimmed, node = xml_attr(measure_node, "measure"))
  }
)
产生:
## # A tibble: 9 × 3
## timestamp max node
## <chr> <chr> <chr>
## 1 1481368500000 52.68987274169922 CPU Total Time - web101
## 2 1481369400000 8.555328369140625 CPU Total Time - web101
## 3 1481337000000 21.680084228515625 CPU Total Time - web02
## 4 1481337900000 29.486526489257812 CPU Total Time - web02
## 5 1481370300000 9.136299133300781 CPU Total Time - web03
## 6 1481371200000 31.359756469726562 CPU Total Time - web03
## 7 1481372100000 25.47643280029297 CPU Total Time - web03
## 8 1481373000000 25.13770294189453 CPU Total Time - web03
## 9 1481373900000 36.33165740966797 CPU Total Time - web03
您可以用 filter() 去掉不需要的节点,或者把 XPath 写得更精确,只选取需要的节点;如果希望各列是真正的数据类型而不是字符型,可以在管道末尾再接一个 mutate():
mutate(
  # The timestamp text is treated as milliseconds since 1970-01-01:
  # convert to numeric seconds, then to POSIXct.
  timestamp = as.POSIXct(
    as.numeric(timestamp) / 1000,
    origin = "1970-01-01"
  ),
  # Turn the character max column into a number.
  max = as.numeric(max)
)
或者把这些参数直接加到管道中已有的 mutate() 调用里。