我试图获取此页面(http://www.w3schools.com/jquery/trysel.asp)中的选择器列表:
$("#Lastname"),$(".intro"),....
我尝试使用 xpathSApply:
library(XML)
library(RCurl)
# Download the raw HTML of the demo page as a string.
page_source <- getURL('http://www.w3schools.com/jquery/trysel.asp')
# Parse the downloaded HTML into a document tree.
doc <- htmlParse(page_source)
# Query every element whose id attribute is "selectorOptions".
xpathSApply(doc, '//*[@id="selectorOptions"]') ## I can't get the right xpath
我也试过但没有成功:
# Second attempt: select div children of the element with id "selectorOptions".
# NOTE(review): in XPath a predicate such as [i] tests for a child element
# named "i"; it does not act as a loop index over the div elements.
xpathSApply(doc,'//*[@id="selectorOptions"]/div[i]')
编辑:我添加了 python 标签,因为我也接受 Python 的解决方案。
答案 0 :(得分:4)
以下是用 R 抓取此类 JavaScript 页面的方法。正如 @Peyton 所说,您需要借助浏览器。Selenium 服务器是控制浏览器的一种好方法。我为 Selenium 服务器编写了一些 R 绑定: https://github.com/johndharrison/RSelenium
以下代码可以访问经 JavaScript 处理之后的页面源码:
# Load devtools with library() so a missing package fails loudly
# (require() only returns FALSE).
library(devtools)
# install_github() expects the repository in "user/repo" form.
devtools::install_github("johndharrison/RSelenium")
library(RSelenium)
library(RJSONIO)
# An active Selenium server must be running.
# The following commented-out lines fetch the latest Java binary and start it:
# RSelenium::checkForServer()
# RSelenium::startServer()
# A Selenium server is assumed to be running from here on.
remDR <- remoteDriver$new()
remDR$open() # opens a browser, usually Firefox with default settings
remDR$navigate("http://www.w3schools.com/jquery/trysel.asp") # navigate to your page
# Find the elements matching the XPath expression.
webElem <- remDR$findElements(value = "//*[@id='selectorOptions']")
# Display the text of the first matched element.
cat(fromJSON(webElem[[1]]$getElementText())$value)
> cat(fromJSON(webElem[[1]]$getElementText())$value)
$("#Lastname")
$(".intro")
$(".intro, #Lastname")
$("h1")
$("h1, p")
$("p:first")
$("p:last")
$("tr:even")
$("tr:odd")
$("p:first-child")
$("p:first-of-type")
$("p:last-child")
$("p:last-of-typ
.....................
更新:
在这种情况下,访问这些信息的一种更简单的方法是使用 executeScript 方法:
library(RSelenium)
# Start a Selenium server. `::` is the namespace operator; a single `:` would
# be parsed as the sequence operator and fail.
# NOTE(review): newer RSelenium releases may no longer export startServer() --
# confirm against the installed version.
RSelenium::startServer()
# Create the driver under one consistently-cased name (R is case-sensitive,
# so `remDr` and `remDR` are different objects).
remDr <- remoteDriver$new()
remDr$open()
remDr$navigate("http://www.w3schools.com/jquery/trysel.asp")
# Run JavaScript in the page and return the value of its `w3Sels` variable.
remDr$executeScript("return w3Sels;")[[1]]
> remDr$executeScript("return w3Sels;")[[1]]
[1] "#Lastname" ".intro"
[3] ".intro, #Lastname" "h1"
[5] "h1, p" "p:first"
[7] "p:last" "tr:even"
[9] "tr:odd" "p:first-child"
[11] "p:first-of-type" "p:last-child"
[13] "p:last-of-type" "li:nth-child(1)"
[15] "li:nth-last-child(1)" "li:nth-of-type(2)"
[17] "li:nth-last-of-type(2)" "b:only-child"
[19] "h3:only-of-type" "div > p"
[21] "div p" "ul + h3"
[23] "ul ~ table" "ul li:eq(0)"
[25] "ul li:gt(0)" "ul li:lt(2)"
[27] ":header" ":header:not(h1)"
[29] ":animated" ":focus"
[31] ":contains(Duck)" "div:has(p)"
[33] ":empty" ":parent"
[35] "p:hidden" "table:visible"
[37] ":root" "p:lang(it)"
[39] "[id]" "[id=my-Address]"
[41] "p[id!=my-Address]" "[id$=ess]"
[43] "[id|=my]" "[id^=L]"
[45] "[title~=beautiful]" "[id*=s]"
[47] ":input" ":text"
[49] ":password" ":radio"
[51] ":checkbox" ":submit"
[53] ":reset" ":button"
[55] ":image" ":file"
[57] ":enabled" ":disabled"
[59] ":selected" ":checked"
[61] "*"
答案 1 :(得分:0)
感谢 jdharrison 的评论,我解析了 JavaScript 代码以提取所有选择器。正如 Peyton 所提到的,这种做法只适用于这种特殊情况,因为所有选择器都写在代码中。
# Dump the sixth <script> node of the parsed page to a file, then read it
# back as lines of text.
capture.output(xpathSApply(doc, '//*/script')[[6]],
               file = 'test.js')
ll <- readLines('test.js')
# Keep only the lines that call w3Sels.push; fixed = TRUE matches the dot
# literally instead of as a regex wildcard.
ll <- ll[grepl('w3Sels.push', ll, fixed = TRUE)]
# Extract the whole quoted argument sitting between "(" and ")".
# Matching the opening quote and its closing counterpart (\\1) keeps
# selectors that themselves contain parentheses, e.g. "li:nth-child(1)",
# intact; a lazy match up to the first ")" would truncate them.
ll <- unlist(regmatches(
  ll,
  gregexpr("(?<=\\()([\"']).*?\\1(?=\\))", ll, perl = TRUE)
))
cat(head(ll))
"#Lastname" ".intro" ".intro, #Lastname" "h1" "h1, p" "p:first"