我有一个XML文件,如果我在excel中打开它有两个工作表。我知道它有3个节点,样式和两个工作表。我一直在尝试将第一个工作表转换为data.frame几个小时,但无法弄清楚我做错了什么。第一个工作表的名称是PD Analysis。
library(xml2)
pg <- read_xml("/Users/lucychen/Documents/Lab/NIH
lab/Data/exportedTest.xml")
# get all the <record>s
recs <- xml_find_all(pg, "PD Analysis")
print("here")
# extract and clean all the columns
vals <- trimws(xml_text(recs))
# extract and clean (if needed) the area names
labs <- trimws(xml_attr(recs, "label"))
# mine the column names from the two variable descriptions
# this XPath construct lets us grab either the <categ…> or <real…> tags
# and then grabs the 'name' attribute of them
cols <- xml_attr(xml_find_all(pg, "//data/variables/*[self::categoricalvariable or
self::realvariable]"), "name")
# this converts each set of <record> columns to a data frame
# after first converting each row to numeric and assigning
# names to each column (making it easier to do the matrix to data frame conv)
dat <- do.call(rbind, lapply(strsplit(vals, "\ +"),
function(x) {
data.frame(rbind(setNames(as.numeric(x),cols)))
}))
# then assign the area name column to the data frame
dat$area_name <- labs
print(dat)
以下是XML文件的示例
<?xml version="1.0" encoding="UTF-8"?>
<Workbook xmlns="urn:schemas-microsoft-com:office:spreadsheet" xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:x="urn:schemas-microsoft-com:office:excel" xmlns:ss="urn:schemas-microsoft-com:office:spreadsheet" xmlns:html="http://www.w3.org/TR/REC-html40">
<Styles>
<Style ss:ID="b1">
<Font ss:Bold="1"/>
</Style>
<Style ss:ID="c1">
<Interior ss:Color="#FFFF00" ss:Pattern="Solid"/>
</Style>
<Style ss:ID="c2">
<Interior ss:Color="#00FFFF" ss:Pattern="Solid"/>
</Style>
</Styles>
<Worksheet ss:Name="PD Analysis">
<Table>
<Row ss:StyleID="b1">
<Cell>
<Data ss:Type="String">File</Data>
</Cell>
<Cell>
<Data ss:Type="String">SegmNo</Data>
</Cell>
<Cell>
<Data ss:Type="String">SegmName</Data>
</Cell>
<Cell>
<Data ss:Type="String">EventNo</Data>
</Cell>
<Cell>
<Data ss:Type="String">EventName</Data>
</Cell>
<Cell>
<Data ss:Type="String">EventXDAT</Data>
</Cell>
<Cell>
<Data ss:Type="String">sample_#</Data>
</Cell>
<Cell>
<Data ss:Type="String">time_secs</Data>
</Cell>
<Cell>
<Data ss:Type="String">pupil_diam</Data>
</Cell>
<Cell>
<Data ss:Type="String">scaled_pupil_diam</Data>
</Cell>
<Cell>
<Data ss:Type="String">xdat</Data>
</Cell>
<Cell>
<Data ss:Type="String">blink</Data>
</Cell>
<Cell>
<Data ss:Type="String">interpolated</Data>
</Cell>
</Row>
<Row>
<Cell>
<Data ss:Type="String">Data1</Data>
</Cell>
<Cell>
<Data ss:Type="Number">1</Data>
</Cell>
<Cell>
<Data ss:Type="String">segment 1</Data>
</Cell>
<Cell>
<Data ss:Type="Number">1</Data>
</Cell>
<Cell>
<Data ss:Type="String">event 1</Data>
</Cell>
<Cell>
<Data ss:Type="String">0</Data>
</Cell>
<Cell>
<Data ss:Type="Number">1</Data>
</Cell>
<Cell>
<Data ss:Type="Number">0</Data>
</Cell>
<Cell>
<Data ss:Type="Number">100</Data>
</Cell>
<Cell>
<Data ss:Type="Number">100</Data>
</Cell>
<Cell>
<Data ss:Type="Number">0</Data>
</Cell>
<Cell>
<Data ss:Type="String"></Data>
</Cell>
<Cell>
<Data ss:Type="String"></Data>
</Cell>
</Row>
<Row>