我正在抓取http://www.progarchives.com/album.asp?id=
并收到警告信息:
警告信息:
XML内容似乎不是XML:
http://www.progarchives.com/album.asp?id=2
http://www.progarchives.com/album.asp?id=3 http://www.progarchives.com/album.asp?id=4
http://www.progarchives.com/album.asp?id=5
这个抓取函数对每个单独的页面都能正常工作,但对b1=2:b2=1000这一批网址却不行
。
library(RCurl)
library(XML)
# Build the album-page URL for every id from b1 through b2.
#
# Args:
#   b1: first id of the range.
#   b2: last id of the range (inclusive).
#
# Returns:
#   A character vector with one URL per id in b1:b2, in that order.
getUrls <- function(b1, b2) {
  root <- "http://www.progarchives.com/album.asp?id="
  # paste0() is vectorised, so the whole range is built in one call instead
  # of growing the result with c() on every loop iteration.
  paste0(root, b1:b2)
}
# Scrape the band, album title and release line from one album page.
#
# Args:
#   url: a single album-page URL (character vector of length one).
#
# Returns:
#   A character vector with the text of the matched <h2>, <h1> and <strong>
#   nodes, in that order (band, album, date).
prog.arch.scraper <- function(url) {
  # htmlParse() reads one document per call. The previous version ignored
  # `url` and passed the whole vector of URLs to htmlParse() at once, which
  # is what triggered the "content does not seem to be XML" warning.
  stopifnot(is.character(url), length(url) == 1L)
  parsed <- htmlParse(url)
  album <- xpathSApply(parsed, "//h1[1]", xmlValue)
  date <- xpathSApply(parsed, "//strong[1]", xmlValue)
  band <- xpathSApply(parsed, "//h2[1]", xmlValue)
  c(band, album, date)
}

# Scrape the pages one at a time. A page that cannot be fetched or parsed is
# reported with a warning and yields NULL rather than aborting the whole run.
urls <- getUrls(b1 = 2, b2 = 1000)
results <- lapply(urls, function(u) {
  tryCatch(
    prog.arch.scraper(u),
    error = function(e) {
      warning("skipping ", u, ": ", conditionMessage(e), call. = FALSE)
      NULL
    }
  )
})
答案 0 :(得分:6)
以下是使用rvest
和dplyr
的替代方法:
library(rvest)
library(dplyr)
library(pbapply)
# URL template for an album page; %s is replaced with the album id.
base_url <- "http://www.progarchives.com/album.asp?id=%s"

# Download one album page and extract its title, release line and band.
#
# Args:
#   id: album id substituted into base_url.
#
# Returns:
#   A data.frame with character columns album, date and band.
get_album_info <- function(id) {
  # read_html() is the replacement for rvest's html(), which was deprecated
  # and is no longer available in current rvest releases.
  pg <- read_html(sprintf(base_url, id))
  data.frame(
    album = pg %>% html_nodes(xpath = "//h1[1]") %>% html_text(),
    date = pg %>% html_nodes(xpath = "//strong[1]") %>% html_text(),
    band = pg %>% html_nodes(xpath = "//h2[1]") %>% html_text(),
    stringsAsFactors = FALSE
  )
}
# Scrape ids 2 through 10 (pblapply shows a progress bar while it iterates),
# stack the per-album data frames into one table, then show its first rows.
albums <- 2:10 %>%
  pblapply(get_album_info) %>%
  bind_rows()
albums %>% head()
## Source: local data frame [6 x 3]
##
## album date band
## 1 FOXTROT Studio Album, released in 1972 Genesis
## 2 NURSERY CRYME Studio Album, released in 1971 Genesis
## 3 GENESIS LIVE Live, released in 1973 Genesis
## 4 A TRICK OF THE TAIL Studio Album, released in 1976 Genesis
## 5 FROM GENESIS TO REVELATION Studio Album, released in 1969 Genesis
## 6 GRATUITOUS FLASH Studio Album, released in 1984 Abel Ganz
我不想用大量请求轰炸该网站,所以这里只抓取了一小段id;实际使用时请自行扩大id序列(例如把2:10改为您需要的范围)。pblapply
还会免费为您提供进度条。
要对网站表示友好(特别是因为它没有明确禁止抓取),您可能希望在get_album_info
函数的末尾加上一句Sys.sleep(10)
。
**更新**
要处理服务器错误(在这种情况下为500
,但它也适用于其他人),您可以使用try
:
library(rvest)
library(dplyr)
library(pbapply)
library(data.table)
# URL template for an album page; %s is replaced with the album id.
base_url <- "http://www.progarchives.com/album.asp?id=%s"

# Download one album page and extract its title, release line and band,
# tolerating pages that fail to load.
#
# Args:
#   id: album id substituted into base_url.
#
# Returns:
#   A data.frame with character columns album, date and band. If the page
#   cannot be fetched or parsed, the data.frame has zero rows.
get_album_info <- function(id) {
  # read_html() is the replacement for rvest's html(), which was deprecated
  # and is no longer available in current rvest releases.
  pg <- try(read_html(sprintf(base_url, id)), silent = TRUE)
  if (inherits(pg, "try-error")) {
    # stringsAsFactors = FALSE keeps the empty frame's column types in line
    # with the success branch on R versions older than 4.0.
    return(data.frame(
      album = character(0),
      date = character(0),
      band = character(0),
      stringsAsFactors = FALSE
    ))
  }
  data.frame(
    album = pg %>% html_nodes(xpath = "//h1[1]") %>% html_text(),
    date = pg %>% html_nodes(xpath = "//strong[1]") %>% html_text(),
    band = pg %>% html_nodes(xpath = "//h2[1]") %>% html_text(),
    stringsAsFactors = FALSE
  )
}
# Scrape a handful of ids and bind the per-album frames into one data.table;
# ids whose page could not be loaded contribute zero rows.
albums <- c(9:10, 23, 28, 29, 30) %>%
  pblapply(get_album_info) %>%
  rbindlist()
## album date band
## 1: THE DANGERS OF STRANGERS Studio Album, released in 1988 Abel Ganz
## 2: THE DEAFENING SILENCE Studio Album, released in 1994 Abel Ganz
## 3: AD INFINITUM Studio Album, released in 1998 Ad Infinitum
您将不会获得错误页面的任何条目(在这种情况下,它只返回id 9,10和30的条目)。
答案 1 :(得分:4)
而不是xpathApply()
,您可以在每个路径的节点集中对第一个节点进行子集化,并在其上调用xmlValue()
。这就是我想出来的,
library(XML)
library(RCurl)
## define the urls and xpath queries
urls <- sprintf("http://www.progarchives.com/album.asp?id=%s", 2:10)
path <- c(album = "//h1", date = "//strong", band = "//h2")
## define a re-usable curl handle that is shared by every request below
curl <- getCurlHandle()
## allocate the result list, one slot per url
out <- vector("list", length(urls))
## do the work
## Iterate by position: assigning with out[[u]] (a url string) would append
## new named elements and leave the preallocated slots as NULL.
for (i in seq_along(urls)) {
  content <- getURL(urls[[i]], curl = curl)
  # asText = TRUE states explicitly that `content` is markup, not a file name.
  doc <- htmlParse(content, asText = TRUE, useInternalNodes = TRUE)
  # For each query keep only the text of the first matching node.
  out[[i]] <- lapply(path, function(x) xmlValue(doc[x][[1]]))
  free(doc)
}
## structure the result
data.table::rbindlist(out)
# album date band
# 1: FOXTROT Studio Album, released in 1972 Genesis
# 2: NURSERY CRYME Studio Album, released in 1971 Genesis
# 3: GENESIS LIVE Live, released in 1973 Genesis
# 4: A TRICK OF THE TAIL Studio Album, released in 1976 Genesis
# 5: FROM GENESIS TO REVELATION Studio Album, released in 1969 Genesis
# 6: GRATUITOUS FLASH Studio Album, released in 1984 Abel Ganz
# 7: GULLIBLES TRAVELS Studio Album, released in 1985 Abel Ganz
# 8: THE DANGERS OF STRANGERS Studio Album, released in 1988 Abel Ganz
# 9: THE DEAFENING SILENCE Studio Album, released in 1994 Abel Ganz
更新:为了处理所查询的id不存在的情况,我们可以用RCurl::url.exists()
写一个条件判断来应对这些无效的查询。因此,下面的函数getAlbums()
会根据url的状态,返回抓取到的xml值,或者返回由NA
组成的字符向量。当然,如果你愿意,也可以修改这一行为。这只是凌晨时分想到的一种方法。
# Fetch several album pages and extract the first match of each XPath query.
#
# Args:
#   url:   page address without the query string.
#   id:    numeric vector of ids; each is appended to `url` as "?id=<id>".
#   xPath: named character vector of XPath queries; the names become the
#          column names of the result.
#
# Returns:
#   A data.table with one row per id and one column per query. Ids whose url
#   does not exist yield a row of NA and raise a warning.
getAlbums <- function(url, id = numeric(), xPath = list()) {
  urls <- sprintf("%s?id=%d", url, id)
  curl <- getCurlHandle()
  out <- vector("list", length(urls))
  # Iterate by position so the preallocated slots are the ones being filled;
  # indexing `out` by url string would append named elements instead.
  for (i in seq_along(urls)) {
    u <- urls[[i]]
    if (url.exists(u)) {
      content <- getURL(u, curl = curl)
      doc <- htmlParse(content, useInternalNodes = TRUE)
      # Use the xPath argument rather than a global `path` variable.
      out[[i]] <- lapply(xPath, function(x) xmlValue(doc[x][[1]]))
      # Free the document here, in the branch that created it. Checking
      # exists("doc") after the if/else also matched the document left over
      # from an earlier iteration and freed it a second time.
      free(doc)
    } else {
      warning(sprintf("returning 'NA' for urls[%d] ", id[i]))
      out[[i]] <- setNames(
        as.list(rep(NA_character_, length(xPath))),
        names(xPath)
      )
    }
  }
  data.table::rbindlist(out)
}
# Page address; getAlbums() appends the "?id=" query string itself.
url <- "http://www.progarchives.com/album.asp"
# Ids to look up; 23, 28 and 29 are the ones reported as NA in the output below.
id <- c(9, 10, 23, 28, 29, 30)
# XPath queries, named after the columns they produce.
path <- setNames(
  c("//h1", "//strong", "//h2"),
  c("album", "date", "band")
)
getAlbums(url = url, id = id, xPath = path)
# album date band
# 1: THE DANGERS OF STRANGERS Studio Album, released in 1988 Abel Ganz
# 2: THE DEAFENING SILENCE Studio Album, released in 1994 Abel Ganz
# 3: NA NA NA
# 4: NA NA NA
# 5: NA NA NA
# 6: AD INFINITUM Studio Album, released in 1998 Ad Infinitum
#
# Warning messages:
# 1: In albums(url, id, path) : returning 'NA' for urls[23]
# 2: In albums(url, id, path) : returning 'NA' for urls[28]
# 3: In albums(url, id, path) : returning 'NA' for urls[29]