我确信这个问题相当简单,但我就是弄不清楚该如何让它正常工作。我有如下四个网页:
# Load the XML parsing and web-scraping packages.
require(xml2)
require(rvest)
# Fetch and parse each of the four RankList pages.
html1 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topdomainid=2&subdomainid=6&last=0&orderby=6")
html2 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=6&last=0&orderby=6")
html3 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=7&last=0&orderby=6")
html4 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=2&subDomainID=7&last=0&orderby=6")
# Combine the four parsed pages with c() so they can be indexed later.
htmlPages <- c(html1,html2,html3,html4)
我想把它们全部放进一个列表,以便在 for 循环或其他结构中方便地访问。把它们放进列表本身没有问题,问题出在之后访问它们的时候——也就是当我再从节点中提取文本的时候。
# Extract the text of every node in `htmlpage` that matches a CSS selector.
#
# htmlpage:   object passed as the first argument to html_nodes().
# CSSElement: CSS selector passed as the second argument to html_nodes().
# Returns whatever html_text() yields for the matched nodes.
getCSSElementText <- function(htmlpage, CSSElement) {
  matched <- html_nodes(htmlpage, CSSElement)
  html_text(matched)
}
当我这样调用时:
# Call the helper on htmlPages[1]; #properCSSTag# is presumably a placeholder for the real selector.
getCSSElementText(htmlPages[1], #properCSSTag#)
我收到此错误:
Error in UseMethod("xml_find_all") : no applicable method for 'xml_find_all' applied to an object of class "list"(即:没有适用于 'xml_find_all' 的方法可应用于类为 "list" 的对象)
以下是我的完整代码,以防问题出在其他地方:
# Attach the web-scraping and XML parsing packages.
library(rvest)
library(xml2)
# Fetch and parse each of the four RankList pages.
html1 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topdomainid=2&subdomainid=6&last=0&orderby=6")
html2 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=6&last=0&orderby=6")
html3 <- html("http://academic.research.microsoft.com/RankList?entitytype=3&topdomainid=2&subdomainid=7&last=0&orderby=6")
html4 <- html("http://academic.research.microsoft.com/RankList?entitytype=4&topDomainID=2&subDomainID=7&last=0&orderby=6")
# Combine the four parsed pages with c() so they can be indexed later.
htmlPages <- c(html1,html2,html3,html4)
# CSS selectors used with the extraction helpers defined below.
CSSElementIDs <- c("#ctl00_MainContent_divRankList a", ".staticOrderCol:nth-child(3)", ".staticOrderCol:nth-child(4)")
# Extract the text of every node in `htmlpage` that matches a CSS selector.
#
# htmlpage:   object passed as the first argument to html_nodes().
# CSSElement: CSS selector passed as the second argument to html_nodes().
# Returns whatever html_text() yields for the matched nodes.
getCSSElementText <- function(htmlpage, CSSElement) {
  matched <- html_nodes(htmlpage, CSSElement)
  html_text(matched)
}
# Extract the nodes matching a CSS selector and convert their text to numbers.
#
# htmlpage:   object passed as the first argument to html_nodes().
# CSSElement: CSS selector passed as the second argument to html_nodes().
# Returns a numeric vector: every non-digit character is removed from each
# node's text before as.numeric() is applied.
getCSSElementNumber <- function(htmlpage, CSSElement) {
  raw_text <- html_text(html_nodes(htmlpage, CSSElement))
  digits_only <- gsub("\\D", "", raw_text)
  as.numeric(digits_only)
}
# Add `vector` to `df` as a column named after the expression supplied
# for `vector` at the call site.
#
# df:     data frame to extend.
# vector: values for the new column; the column name is the deparsed text of
#         the argument expression (e.g. passing `score` creates column "score").
# Returns the data frame with the column added (or overwritten if the name
# already exists).
addToDataFrame <- function(df, vector) {
  # deparse() splits long expressions into several strings; collapse them so
  # exactly one column name is produced instead of one column per fragment.
  column_name <- paste(deparse(substitute(vector)), collapse = " ")
  df[column_name] <- vector
  df
}
非常感谢你的时间!
答案 0 :(得分:2)
当您用 c() 连接这些 html* 对象(每个对象都是长度为 2 的列表)时,它们会被合并成一个长度为 8 的列表:
# Concatenating with c() flattens the four documents into one list of their parts.
htmlPages <- c(html1,html2,html3,html4)
str(htmlPages)
# List of 8
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
# $ node:<externalptr>
# $ doc :<externalptr>
正确的做法是用 list() 把这些 html* 对象放进一个列表中:
# list() keeps each parsed document as its own element, class attributes intact.
htmlPages <- list(html1,html2,html3,html4)
str(htmlPages)
# List of 4
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
# $ :List of 2
# ..$ node:<externalptr>
# ..$ doc :<externalptr>
# ..- attr(*, "class")= chr [1:2] "xml_document" "xml_node"
并使用 [[ 来提取其中的单个元素:
# `[[` extracts the first document itself rather than a one-element list.
htmlPages[[1]]
# {xml_document}
# <html xmlns="http://www.w3.org/1999/xhtml" xmlns:fb="http://www.facebook.com/2008/fbml">
# [1] <head id="Head1">\n <meta http-equiv="Content-Type" content="text/html; ...
# [2] <body onpageshow="document.forms['aspnetForm'].reset();"> \n <form ...