我有一个XML文件,我想从中提取数据。到目前为止,我已经设法用tidyverse和xml2包做了一切,但我无法弄清楚如何在我的XML任务中解决下一个谜题。
示例XML:
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<ns2:ArchiveView>
<Notification ID="1001">
<persons>
<Timestamp>07:39:25</Timestamp>
<person type="A" name="Barney">
<uniqueUserId>2222</uniqueUserId>
</person>
</persons>
<persons>
<Timestamp>08:40:25</Timestamp>
<person type="B" name="John">
<uniqueUserId>1111</uniqueUserId>
</person>
</persons>
</Notification>
<Notification ID="1002">
<persons>
<Timestamp>14:39:25</Timestamp>
<person type="A" name="Barney">
<uniqueUserId>2222</uniqueUserId>
</person>
</persons>
</Notification>
<Notification ID="1003">
</Notification>
</ns2:ArchiveView>
由于可以分配给通知的最大人数是3,我想最终得到一个如下所示的data.frame:
ID name1 time1 type1 name2 time2 type2 name3 time3 type3
1001 Barney 07:39:25 A John 08:40:25 B NA NA NA
1002 Barney 14:39:25 A NA NA NA NA NA NA
1003 NA NA NA NA NA NA NA NA NA
到目前为止,我已经成功完成了以下几步:
# Parse the XML file into an xml2 document object.
doc <- read_xml( "./data/test.xml" )
提取所有ID
# Collect the ID attribute of every <Notification> element into a one-column
# data.frame. The element is named "Notification" (singular) in the XML, so the
# XPath has to use that exact name or nothing is matched.
df.ID <- data.frame(
ID = xml_find_all( doc, ".//Notification" ) %>% xml_attr( "ID" ),
stringsAsFactors = FALSE )
确定附加人员的通知ID
# IDs of the notifications that have at least one <persons> child.
# The predicate [ persons ] keeps only <Notification> elements with such a child.
ID.with.persons <- xml_find_all( doc, ".//Notification[ persons ]" ) %>%
xml_attr( "ID" )
创建附加人员的通知节点集
# Node set of the <Notification> elements that have at least one <persons> child.
nodes.persons <- xml_find_all( doc, ".//Notification[ persons ]" )
我还设法得到了所有人的名字(在一个向量中)
# The "name" attribute lives on the nested <person> elements, not on the
# notification nodes themselves, so descend to <person> before reading it.
persons.name <- nodes.persons %>% xml_find_all( ".//person" ) %>% xml_attr( "name" )
我感觉我非常接近解决方案,但我不能完全理解如何将所有这些数据合并到一个不错的data.frame中(如上所述)。
非常感谢任何建议 :)
答案 0 :(得分:2)
这是一种比较笨拙但实用的做法(我刚接触 R,所以写法可能不够地道)。只需循环遍历每个 Notification 元素,把需要的值依次追加到一个向量中,最后将该向量转换为矩阵,再转换成数据框。这种做法之所以可行,是因为列数是固定的,可以据此构建矩阵。
>>> import datetime
... import time
...
... fmt = '%d/%m/%Y %H:%M%p'
...
... half_hour_date = '23/02/2018 23:00PM'
...
... date_format = datetime.datetime.strptime(half_hour_date, fmt)
... half_hour = date_format + datetime.timedelta(days=2) # self.program_day)
...
... # convert from datetime to time_struct_time object
... epg_time_1 = time.struct_time(half_hour.timetuple())
>>> epg_time_1
time.struct_time(tm_year=2018, tm_mon=2, tm_mday=25, tm_hour=23, tm_min=0, tm_sec=0, tm_wday=6, tm_yday=56, tm_isdst=-1)
输出:
library(xml2)

# Parse the sample document; the path is relative to the working directory.
doc <- read_xml("test.xml")
notifications <- xml_find_all(doc, ".//Notification")

# A notification holds at most this many <persons> entries, which fixes the
# width of the result at 1 + 3 * max_persons columns.
max_persons <- 3

# Build one character vector per notification: the ID followed by
# (name, time, type) for each person slot, NA-padded when the slot is empty.
rows <- lapply(seq_along(notifications), function(i) {
  notification <- notifications[i]
  slots <- lapply(seq_len(max_persons), function(j) {
    # Select the j-th <persons> child; the node set is empty if it is absent.
    person <- xml_find_all(notification, sprintf("persons[%d]", j))
    if (length(person) == 0) {
      return(rep(NA_character_, 3))
    }
    c(
      xml_find_chr(person, "string(./person/@name)"),
      xml_find_chr(person, "string(./Timestamp/text())"),
      xml_find_chr(person, "string(./person/@type)")
    )
  })
  c(xml_attr(notification, "ID"), unlist(slots))
})

# Every row has the same length, so the rows can be laid out as a matrix.
# as.character() keeps this working (0 rows) when there are no notifications.
df <- data.frame(
  matrix(as.character(unlist(rows)), ncol = 1 + 3 * max_persons, byrow = TRUE),
  stringsAsFactors = FALSE
)
colnames(df) <- c(
  "ID",
  paste0(
    rep(c("name", "time", "type"), times = max_persons),
    rep(seq_len(max_persons), each = 3)
  )
)
df
答案 1 :(得分:1)
下面是一种解决方案。它需要的手工编码比我希望的要多,但确实展示了解决这个问题的思路:
library(xml2)
doc <- read_xml("*Your xml Document goes here*")

# Maximum number of persons that can be attached to one notification.
max_persons <- 3

# Find the Notification nodes.
Notices <- xml_find_all(doc, ".//Notification")

# Turn a list of per-notification character vectors into a data frame with
# exactly max_persons columns named <prefix>1, <prefix>2, ...
# Indexing past the end of a vector yields NA, which pads the missing persons.
# vapply() always returns a max_persons x n matrix here, so it is transposed
# to get one row per notification.
pad_to_df <- function(values, prefix) {
  padded <- vapply(
    values,
    function(x) x[seq_len(max_persons)],
    character(max_persons)
  )
  out <- data.frame(t(padded), stringsAsFactors = FALSE)
  names(out) <- paste0(prefix, seq_len(max_persons))
  out
}

# Extract the values per notification. lapply() always returns a list, even
# when every notification happens to have the same number of persons
# (sapply() would simplify that case to a matrix and break the padding step).
timestamps <- lapply(Notices, function(x) {
  xml_text(xml_find_all(x, ".//Timestamp"))
})
persons.name <- lapply(Notices, function(x) {
  xml_attr(xml_find_all(x, ".//person"), "name")
})
persons.type <- lapply(Notices, function(x) {
  xml_attr(xml_find_all(x, ".//person"), "type")
})

time.df <- pad_to_df(timestamps, "time")
name.df <- pad_to_df(persons.name, "name")
type.df <- pad_to_df(persons.type, "type")

# Assemble the final answer and interleave the columns so that they read
# name1, time1, type1, name2, time2, type2, ...
answer <- cbind(name.df, time.df, type.df)
column_order <- as.vector(t(matrix(seq_len(3 * max_persons), nrow = max_persons)))
answer <- answer[, column_order, drop = FALSE]

# Prepend the notification IDs.
df.ID <- data.frame(ID = xml_attr(Notices, "ID"), stringsAsFactors = FALSE)
answer <- cbind(df.ID, answer)
代码的注释解释了解决方案所采取的步骤。我确定可以进行一些优化,但这是一个良好的开端。