使用R中的XPath按值过滤xml节点

时间:2015-11-05 23:20:46

标签: xml r xpath filter

我正在使用xml并尝试按特定值过滤节点(在本例中为一个特定的邮政编码)。 XML看起来像这样:

https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml

我能够使用以下R命令获取这些位置:

# URL of the restaurants XML document that is filtered below.
fileUrl <- "https://d396qusza40orc.cloudfront.net/getdata%2Fdata%2Frestaurants.xml"

library(XML)

# The document is fetched over http instead of https (presumably because the
# parser cannot retrieve https URLs directly -- confirm). The pattern is
# anchored so that only the scheme can ever be rewritten.
doc <- xmlTreeParse(sub("^https", "http", fileUrl), useInternalNodes = TRUE)

# Keep the root node: the XPath query below is evaluated against it.
rootNode <- xmlRoot(doc)

# Text content of every <row> whose <zipcode> child equals 21231.
xpathSApply(rootNode, "/response//row[zipcode=21231]", xmlValue)

输出:

  [1] "191921231Fells Point1SOUTHEASTERN"                                             
  [2] "300 SOUTH ANN STREET21231Upper Fells Point1SOUTHEASTERN"                       
  [3] "ADMIRAL FELL INN21231Fells Point1SOUTHEASTERN"                                 
  [4] "ALE MARY'S21231Fells Point1SOUTHEASTERN"                                       
  [5] "ALEXANDER'S TAVERN21231Fells Point1SOUTHEASTERN"                               
  [6] "BERTHA'S RESTAURANT21231Fells Point1SOUTHEASTERN"                              
  [7] "BIRDS OF A FEATHER21231Fells Point1SOUTHEASTERN"                               
  [8] "BLARNEY STONE PUB21231Fells Point1SOUTHEASTERN"                                
  [9] "ARCOS21231Washington Hill1SOUTHEASTERN"  

我想知道是否有更好的方法来实现这一目标。社区有什么建议吗?

2 个答案:

答案 0 :(得分:2)

解析这个XML的方法有很多——一种做法是先获取匹配的节点,然后使用xmlToDataFrame,并用location_1节点的human_address属性替换空的location_1字段。

# Parse the document over http instead of https. The pattern is anchored so
# that only the URL scheme can ever be rewritten, never an "s" elsewhere.
doc <- xmlParse(sub("^https", "http", fileUrl))

# All <row> nodes whose <zipcode> child equals 21231.
rows <- getNodeSet(doc, "//row[zipcode=21231]")

# Convert the matched nodes to a data frame.
z <- xmlToDataFrame(nodes = rows)

# human_address attribute of each row's <location_1> child.
addr <- sapply(rows, function(x) xpathSApply(x, "./location_1", xmlGetAttr, "human_address"))

# Capture three non-empty quoted values that follow `:"` in the attribute
# string and join them as "first, second third"; this overwrites the
# location_1 column with a readable address.
z$location_1 <- gsub('.*:"([^"]+).*:"([^"]+).*:"([^"]+).*', '\\1, \\2 \\3', addr)

head(z)

                  name zipcode      neighborhood councildistrict policedistrict                  location_1
1                 1919   21231       Fells Point               1   SOUTHEASTERN 1919 FLEET ST, Baltimore MD
2 300 SOUTH ANN STREET   21231 Upper Fells Point               1   SOUTHEASTERN    300 ANN ST, Baltimore MD
3     ADMIRAL FELL INN   21231       Fells Point               1   SOUTHEASTERN  818 BROADWAY, Baltimore MD
4           ALE MARY'S   21231       Fells Point               1   SOUTHEASTERN 1939 FLEET ST, Baltimore MD
5   ALEXANDER'S TAVERN   21231       Fells Point               1   SOUTHEASTERN  710 BROADWAY, Baltimore MD
6  BERTHA'S RESTAURANT   21231       Fells Point               1   SOUTHEASTERN  734 BROADWAY, Baltimore MD

答案 1 :(得分:1)

这是使用xml2获取所有字段(包括属性)的方法。它还使用fromJSON来解析嵌入的地址字段,因为该字段本身是JSON;如果其他数据文件在该字段中加入了一些棘手的内容,正则表达式可能就不适用了。

library(xml2)
library(dplyr)
library(jsonlite)
library(pbapply)  # some ops take a while & progress bars (like fezzes) are cool

# parse the doc (read from a local file)
doc <- read_xml("getdata-data-restaurants.xml")

# extract the individual rows
rows <- xml_find_all(doc, "//response/row/row")

# extract "easy" fields: for each field name, take the text of the matching
# child element of every row; the result has one column per field name
fields <- data.frame(
  pbsapply(
    c("name", "zipcode", "neighborhood", "councildistrict", "policedistrict"),
    function(x) xml_text(xml_find_all(rows, x))
  ),
  stringsAsFactors = FALSE
)

# alternate method (paths are relative to each row, as above)
# fields <- data_frame(name=xml_text(xml_find_all(rows, "name")),
#                      zipcode=xml_text(xml_find_all(rows, "zipcode")),
#                      neighborhood=xml_text(xml_find_all(rows, "neighborhood")),
#                      councildistrict=xml_text(xml_find_all(rows, "councildistrict")),
#                      policedistrict=xml_text(xml_find_all(rows, "policedistrict")))

# extract the attributes from <row>
row_attrs <- bind_rows(lapply(xml_attrs(rows), as.list))

# extract the attributes from <location_1>
# The path is relative to each row: an absolute "//location_1" would search
# the whole document again for every row.
loc_attrs <- pblapply(xml_attrs(xml_find_all(rows, "location_1")), as.list)

# since each one is a JSON encoded string, use fromJSON to convert
# and then extract them all into a data frame
# The attribute name is spelled out in full and looked up with [[ ]] so the
# lookup does not depend on partial matching of `$`.
human_address <- bind_rows(
  pblapply(
    loc_attrs,
    function(x) {
      data.frame(fromJSON(x[["human_address"]]), stringsAsFactors = FALSE)
    }
  )
)

# bind them all together
dat <- bind_cols(row_attrs, human_address, fields)

# take a look
glimpse(dat)

## Observations: 1,327
## Variables: 13
## $ _id             (chr) "1", "2", "3", "4", "5", "6", "7", "8", "...
## $ _uuid           (chr) "93CACF6F-C8C2-4B87-95A8-8177806D5A6F", "...
## $ _position       (chr) "1", "2", "3", "4", "5", "6", "7", "8", "...
## $ _address        (chr) "http://data.baltimorecity.gov/resource/k...
## $ address         (chr) "4509 BELAIR ROAD", "1919 FLEET ST", "284...
## $ city            (chr) "Baltimore", "Baltimore", "Baltimore", "B...
## $ state           (chr) "MD", "MD", "MD", "MD", "MD", "MD", "MD",...
## $ zip             (chr) "", "", "", "", "", "", "", "", "", "", "...
## $ name            (chr) "410", "1919", "SAUTE", "#1 CHINESE KITCH...
## $ zipcode         (chr) "21206", "21231", "21224", "21211", "2122...
## $ neighborhood    (chr) "Frankford", "Fells Point", "Canton", "Ha...
## $ councildistrict (chr) "2", "1", "1", "14", "9", "14", "13", "7"...
## $ policedistrict  (chr) "NORTHEASTERN", "SOUTHEASTERN", "SOUTHEAS...

# Filter in R rather than in XPath: keep the rows whose zipcode is "21231"
# (zipcode is a character column, hence the string comparison), then show
# only the name and address-related columns.
select(
  filter(dat, zipcode == "21231"),
  name, address, city, state, zip
)

## Source: local data frame [127 x 5]
## 
##                    name           address      city state   zip
##                   (chr)             (chr)     (chr) (chr) (chr)
## 1                  1919     1919 FLEET ST Baltimore    MD      
## 2  300 SOUTH ANN STREET        300 ANN ST Baltimore    MD      
## 3      ADMIRAL FELL INN      818 BROADWAY Baltimore    MD      
## 4            ALE MARY'S     1939 FLEET ST Baltimore    MD      
## 5    ALEXANDER'S TAVERN      710 BROADWAY Baltimore    MD      
## 6   BERTHA'S RESTAURANT      734 BROADWAY Baltimore    MD      
## 7    BIRDS OF A FEATHER 1712 ALICEANNA ST Baltimore    MD      
## 8     BLARNEY STONE PUB      704 BROADWAY Baltimore    MD      
## 9                 ARCOS      129 BROADWAY Baltimore    MD      
## 10  ARIZONA BAR & GRILL       25 BROADWAY Baltimore    MD      
## ..                  ...               ...       ...   ...   ...

我没有处理needs_recoding,因为在本例中它始终为true;但如果你需要它,也很容易添加。