如何并行运行 RSelenium?
以下是并行使用 rvest 的示例:
library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)
# Pages to fetch; each one is downloaded and parsed on a worker process.
URLsPar <- c(
  "http://www.example.com/",
  "http://s5.tinypic.com/n392s6_th.jpg",
  "http://s5.tinypic.com/jl1jex_th.jpg",
  "http://s6.tinypic.com/16abj1s_th.jpg",
  "http://s6.tinypic.com/2ymvpqa_th.jpg"
)

# Keep a handle on the cluster so it can be shut down explicitly below.
# Leave one core free, but never ask for fewer than one worker
# (detectCores() may return 1 or NA).
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cl)

# Read every URL in parallel and convert the parsed document to character.
# `ws` is a list with one element per entry of URLsPar.
# seq_along() keeps the loop empty when URLsPar has no elements.
ws <- foreach(
  x = seq_along(URLsPar),
  .packages = c("rvest", "magrittr", "RSelenium")
) %dopar% {
  URLsPar[x] %>% read_html %>% as("character")
}

# stopImplicitCluster() only stops clusters that registerDoParallel() created
# on its own; a cluster built with makeCluster() must be stopped with
# stopCluster(), otherwise the worker processes stay alive.
stopCluster(cl)
答案 0 :(得分:9)
在群集中的每个节点上启动remoteDriver:
library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)
# Pages to visit; each one is opened in a browser owned by a worker process.
URLsPar <- c(
  "http://www.bbc.com/",
  "http://www.cnn.com",
  "http://www.google.com",
  "http://www.yahoo.com",
  "http://www.twitter.com"
)
# Not used further in this snippet; kept so existing references still resolve.
appHTML <- c()

# Start a Selenium Server.
# NOTE(review): newer RSelenium releases reportedly replace startServer()
# with rsDriver() -- confirm against the installed version.
selServ <- startServer()

# Leave one core free, but never ask for fewer than one worker
# (detectCores() may return 1 or NA).
cl <- makeCluster(max(1L, detectCores() - 1L, na.rm = TRUE))
registerDoParallel(cl)

# Open a remoteDriver on each node of the cluster. `remDr` is created in each
# worker's own session, so every worker drives its own browser.
clusterEvalQ(cl, {
  library(RSelenium)
  remDr <- remoteDriver()
  remDr$open()
})

# Not used further in this snippet; kept so existing references still resolve.
myTitles <- c()

# Visit every URL in parallel and collect the page titles.
# `ws` is a list with one title per entry of URLsPar.
# `remDr` here is the per-worker driver created by clusterEvalQ() above.
# seq_along() keeps the loop empty when URLsPar has no elements.
ws <- foreach(
  x = seq_along(URLsPar),
  .packages = c("rvest", "magrittr", "RSelenium")
) %dopar% {
  remDr$navigate(URLsPar[x])
  remDr$getTitle()[[1]]
}

# Close the browser on each node.
clusterEvalQ(cl, {
  remDr$close()
})

# stopImplicitCluster() only stops clusters that registerDoParallel() created
# on its own; `cl` came from makeCluster(), so it must be stopped with
# stopCluster(), otherwise the worker processes stay alive.
stopCluster(cl)

# Stop the Selenium Server.
selServ$stop()
> ws
[[1]]
[1] "BBC - Homepage"
[[2]]
[1] "CNN - Breaking News, U.S., World, Weather, Entertainment & Video News"
[[3]]
[1] "Google"
[[4]]
[1] "Yahoo"
[[5]]
[1] "Welcome to Twitter - Login or Sign up"