当我尝试重新构建网址时,出现了错误“preNode[[1]] 中的错误:下标超出范围”(英文原文:Error in preNode[[1]] : subscript out of bounds)。我的代码如下,我正在通过樱花比赛(Cherry Blossom)跑步者成绩页面的 URL 提取信息。
请告诉我,我重新构建网址的方式是否有问题。
# Install the XML package only when it is missing, then attach it.
# (Re-installing on every run is slow and needs network access.)
if (!requireNamespace("XML", quietly = TRUE)) {
  install.packages("XML")
}
library(XML)

# Build the address of the 2012 men's results page. ubase already ends in
# "/", so the path must not start with one (avoids "org//results").
ubase <- "http://www.cherryblossom.org/"
url <- paste0(ubase, "results/2012/2012cucb10m-m.htm")

# Parse the page and take the text of its first <pre> node.
doc <- htmlParse(url)
preNode <- getNodeSet(doc, "//pre")
txt <- xmlValue(preNode[[1]])

# Inspect the size of the text and its first and last characters.
nchar(txt)
substr(txt, 1, 50)
substr(txt, nchar(txt) - 50, nchar(txt))

# Split the text into lines on carriage return + line feed, then look at
# the number of lines, the first three lines and the last line.
els <- strsplit(txt, "\\r\\n")[[1]]
length(els)
els[1:3]
els[length(els)]
# Retrieve a results page and return the lines of its first preformatted
# (<pre>) block as a character vector.
#
# Args:
#   url: location of the HTML results page; passed unchanged to htmlParse().
#
# Returns:
#   A character vector holding the text of the first <pre> node, split on
#   "\r\n" (carriage return + line feed).
#
# Raises:
#   An error naming the offending url when the parsed page contains no
#   <pre> node, instead of the opaque "subscript out of bounds" that
#   preNode[[1]] would otherwise produce.
extractResTable <- function(url) {
  doc <- htmlParse(url)
  preNode <- getNodeSet(doc, "//pre")

  # Guard the [[1]] below: getNodeSet() yields an empty list when the
  # XPath query matches nothing.
  if (length(preNode) == 0) {
    stop("no <pre> node found in page: ", url, call. = FALSE)
  }

  txt <- xmlValue(preNode[[1]])
  strsplit(txt, "\r\n")[[1]]
}
# Fetch the 2012 men's results through extractResTable() and check that
# the result is identical to els.
m2012 <- extractResTable(url)
identical(m2012, els)

# First attempt: build one address per year, 1999 through 2012, all
# following the naming scheme of the 2012 page.
ubase <- "http://www.cherryblossom.org/"
urls <- paste0(
  ubase, "results/", 1999:2012, "/",
  1999:2012, "cucb10m-m.htm"
)

# Run extractResTable() over every address in urls.
menTables <- lapply(urls, extractResTable)

# Investigate the error message: enter recover() on error, then repeat
# the call.
options(error = recover)
menTables <- lapply(urls, extractResTable)  # choose selection 2
# Once selection 2 is chosen, type ls() at the console.
# Expected listing: [1] "doc" "preNode" "url"
# If that is what appears, continue with:
# 1. Type url at the console.
#    Output: [1] "http://www.cherryblossom.org/results/1999/1999cucb10m-m.htm"
# 2. Type length(preNode)
#    Output: [1] 0

# Per-year paths of the men's results pages, collected in the character
# vector menURLs.
menURLs <- c(
  "cb99m.htm",
  "cb003m.htm",
  "results/2001/oof_m.html",
  "results/2002/oofm.htm",
  "results/2003/CB03-M.HTM",
  "results/2004/men.htm",
  "results/2005/CB05-M.htm",
  "results/2006/men.htm",
  "results/2007/men.htm",
  "results/2008/men.htm",
  "results/2009/09cucb-M.htm",
  "results/2010/2010cucb10m-m.htm",
  "results/2011/2011cucb10m-m.htm",
  "results/2012/2012cucb10m-m.htm"
)

# Rebuild urls from the base address and the per-year paths, and show
# the first three.
urls <- paste0(ubase, menURLs)
urls[1:3]

# Extract the tables again, label each one with its year, and report how
# many addresses were used.
menTables <- lapply(urls, extractResTable)
menTables <- setNames(menTables, 1999:2012)
length(urls)