我正在使用xml并尝试按特定值过滤节点(在本例中为一个特定的邮政编码)。 XML看起来像这样:
https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml
我能够使用此R命令计算位置数:
library(XML)

fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"

# Fetch the document over plain http: only the URL scheme is rewritten, so an
# "s" elsewhere in the URL can never be removed by accident.
# NOTE(review): presumably needed because the XML package's parser cannot read
# https URLs directly -- confirm for your installation.
doc <- xmlTreeParse(sub("^https", "http", fileUrl), useInternalNodes = TRUE)

# Keep the root node; the XPath query below is evaluated against it.
rootNode <- xmlRoot(doc)

# Concatenated text content of every <row> whose <zipcode> child is 21231.
xpathSApply(rootNode, "/response//row[zipcode=21231]", xmlValue)
输出:
[1] "191921231Fells Point1SOUTHEASTERN"
[2] "300 SOUTH ANN STREET21231Upper Fells Point1SOUTHEASTERN"
[3] "ADMIRAL FELL INN21231Fells Point1SOUTHEASTERN"
[4] "ALE MARY'S21231Fells Point1SOUTHEASTERN"
[5] "ALEXANDER'S TAVERN21231Fells Point1SOUTHEASTERN"
[6] "BERTHA'S RESTAURANT21231Fells Point1SOUTHEASTERN"
[7] "BIRDS OF A FEATHER21231Fells Point1SOUTHEASTERN"
[8] "BLARNEY STONE PUB21231Fells Point1SOUTHEASTERN"
[9] "ARCOS21231Washington Hill1SOUTHEASTERN"
等
我想知道是否有更好的方法来实现这一目标。社区有什么建议吗?
答案 0 :(得分:2)
解析这个XML的方法有很多。一种做法是先获取匹配的节点,然后使用xmlToDataFrame
将其转换为数据框,再用human_address属性的内容替换空的location_1字段。
# Parse the document; the first "s" in the URL is dropped (https -> http).
doc <- xmlParse(sub("s", "", fileUrl))

# Select only the <row> nodes whose <zipcode> child equals 21231.
rows <- getNodeSet(doc, "//row[zipcode=21231]")

# Turn the matching nodes into a data frame.
z <- xmlToDataFrame(nodes = rows)

# Read the human_address attribute of each row's <location_1> child.
addr <- sapply(
  rows,
  function(node) xpathSApply(node, "./location_1", xmlGetAttr, "human_address")
)

# Rebuild location_1 from the three quoted values captured in each attribute.
z$location_1 <- gsub(
  '.*:"([^"]+).*:"([^"]+).*:"([^"]+).*',
  '\\1, \\2 \\3',
  addr
)

head(z)
name zipcode neighborhood councildistrict policedistrict location_1
1 1919 21231 Fells Point 1 SOUTHEASTERN 1919 FLEET ST, Baltimore MD
2 300 SOUTH ANN STREET 21231 Upper Fells Point 1 SOUTHEASTERN 300 ANN ST, Baltimore MD
3 ADMIRAL FELL INN 21231 Fells Point 1 SOUTHEASTERN 818 BROADWAY, Baltimore MD
4 ALE MARY'S 21231 Fells Point 1 SOUTHEASTERN 1939 FLEET ST, Baltimore MD
5 ALEXANDER'S TAVERN 21231 Fells Point 1 SOUTHEASTERN 710 BROADWAY, Baltimore MD
6 BERTHA'S RESTAURANT 21231 Fells Point 1 SOUTHEASTERN 734 BROADWAY, Baltimore MD
答案 1 :(得分:1)
下面是使用xml2
提取所有字段(包括属性)的方法。它还使用fromJSON
来解析嵌入的地址字段,因为该字段本身就是JSON;如果其他数据文件在该字段中加入了更复杂的内容,正则表达式可能就不再适用。
library(xml2)
library(dplyr)
library(jsonlite)
library(pbapply) # some ops take a while & progress bars (like fezzes) are cool
# Parse the locally saved copy of the restaurants XML file.
doc <- read_xml("getdata-data-restaurants.xml")

# Select the <row> elements nested under /response/row (one per record).
rows <- xml_find_all(doc, "//response/row/row")

# Extract the "easy" fields: the text of the named child elements of each row.
# The XPath is a bare element name, so it is evaluated relative to every row.
# NOTE(review): assumes each row has exactly one of each field, so that
# pbsapply() simplifies to a character matrix with one column per field.
fields <- data.frame(pbsapply(c("name", "zipcode", "neighborhood",
                                "councildistrict", "policedistrict"),
                              function(x) { xml_text(xml_find_all(rows, x)) }),
                     stringsAsFactors = FALSE)

# Alternate method (one column at a time, again with row-relative paths):
# fields <- tibble(name = xml_text(xml_find_all(rows, "name")),
#                  zipcode = xml_text(xml_find_all(rows, "zipcode")),
#                  neighborhood = xml_text(xml_find_all(rows, "neighborhood")),
#                  councildistrict = xml_text(xml_find_all(rows, "councildistrict")),
#                  policedistrict = xml_text(xml_find_all(rows, "policedistrict")))

# Extract the attributes carried by each <row> element itself.
row_attrs <- bind_rows(lapply(xml_attrs(rows), as.list))

# Extract the attributes of each row's <location_1> child.
# A relative path is used on purpose: a leading "//" is an absolute XPath that
# searches the whole document no matter which row is the context node, so it
# would re-scan the entire document once per row.
loc_attrs <- pblapply(xml_attrs(xml_find_all(rows, "location_1")), as.list)

# Each human_address attribute is a JSON-encoded string: decode it with
# fromJSON() and stack the results into one data frame.
# `[[` is used instead of `$` so the attribute name must match exactly
# (`$` on a list silently accepts partial names).
human_address <- bind_rows(pblapply(loc_attrs,
                                    function(x) {
                                      data.frame(fromJSON(x[["human_address"]]),
                                                 stringsAsFactors = FALSE)
                                    }))

# Bind the three pieces together column-wise.
# NOTE(review): this relies on all three having one entry per row, in the same
# order -- a row without <location_1> would break the alignment.
dat <- bind_cols(row_attrs, human_address, fields)

# Take a look.
glimpse(dat)
## Observations: 1,327
## Variables: 13
## $ _id (chr) "1", "2", "3", "4", "5", "6", "7", "8", "...
## $ _uuid (chr) "93CACF6F-C8C2-4B87-95A8-8177806D5A6F", "...
## $ _position (chr) "1", "2", "3", "4", "5", "6", "7", "8", "...
## $ _address (chr) "http://data.baltimorecity.gov/resource/k...
## $ address (chr) "4509 BELAIR ROAD", "1919 FLEET ST", "284...
## $ city (chr) "Baltimore", "Baltimore", "Baltimore", "B...
## $ state (chr) "MD", "MD", "MD", "MD", "MD", "MD", "MD",...
## $ zip (chr) "", "", "", "", "", "", "", "", "", "", "...
## $ name (chr) "410", "1919", "SAUTE", "#1 CHINESE KITCH...
## $ zipcode (chr) "21206", "21231", "21224", "21211", "2122...
## $ neighborhood (chr) "Frankford", "Fells Point", "Canton", "Ha...
## $ councildistrict (chr) "2", "1", "1", "14", "9", "14", "13", "7"...
## $ policedistrict (chr) "NORTHEASTERN", "SOUTHEASTERN", "SOUTHEAS...
# Filter in R instead of XPath: keep ZIP code 21231 and show the address columns.
dat %>%
  filter(zipcode == "21231") %>%
  select(name, address, city, state, zip)
## Source: local data frame [127 x 5]
##
## name address city state zip
## (chr) (chr) (chr) (chr) (chr)
## 1 1919 1919 FLEET ST Baltimore MD
## 2 300 SOUTH ANN STREET 300 ANN ST Baltimore MD
## 3 ADMIRAL FELL INN 818 BROADWAY Baltimore MD
## 4 ALE MARY'S 1939 FLEET ST Baltimore MD
## 5 ALEXANDER'S TAVERN 710 BROADWAY Baltimore MD
## 6 BERTHA'S RESTAURANT 734 BROADWAY Baltimore MD
## 7 BIRDS OF A FEATHER 1712 ALICEANNA ST Baltimore MD
## 8 BLARNEY STONE PUB 704 BROADWAY Baltimore MD
## 9 ARCOS 129 BROADWAY Baltimore MD
## 10 ARIZONA BAR & GRILL 25 BROADWAY Baltimore MD
## .. ... ... ... ... ...
我没有处理needs_recoding
字段,因为在这个数据集中它的值始终是true
;不过如果你需要它,也很容易加上。