使用multiplie Rselenium浏览器加速网页抓取

时间:2017-09-21 09:00:55

标签: r selenium web-scraping rselenium

我正在使用Rselenium废弃以下网站: http://plovila.pomorstvo.hr/

每次我必须输入'NIB'字段时,执行并废弃所有数据。 我几次使用Sys.time()函数,所以我的代码很慢(一个NIB的cca为12秒)。我需要废弃大约200,000个NIB号码,这样可以进行30天的刮擦。

我感兴趣的是,如果我可以在本地打开多个浏览器,或者在云端以某种方式打开,并使我的抓取脚本更快。

是否可以使用并行计算来克服此问题? 你有什么建议吗?

编辑: 我正在添加代码:

library(XML)
library(RCurl)
library(RSelenium)
library(png)
library(imager)
library(RMySQL)
library(htmltab)
library(jsonlite)
library(rvest)

# function for waiting instead Sys.sleep()
waitLoad <- function (xpath_check = "//input[@id = 'ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]",
                       iterations = 5){
  counter <- 0
  chk <- FALSE
  while(!chk & counter <= iterations){
    wait <- tryCatch(
      remDr$findElement(using = "xpath",
                        xpath_check)$getElementText(),
      # remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_Img1']")$clearElement(),
      error = function(e) print(paste0("Trazi dalje"))
    )
    if(wait == "Trazi dalje" ){
      Sys.sleep(1L)
      counter <- sum(counter, 1)
    }else{
      chk <- TRUE
    }
  }
}

# Start Selenium Server
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.0
remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
remDr$open()

# Simulate browser session and fill out form
remDr$navigate("http://plovila.pomorstvo.hr/")
remDr$findElement(using = "xpath", "//select[@id = 'ctl00_Content_FormContent_uiTipObjektaDropDown']/option[@value = '1']")$clickElement()
remDr$screenshot(display = TRUE)

# Scrap !
df <- list()
Porivni_uredjaji <- list()
Clanovi_posade <- list()
Vlasnici <- list()
Korisnici <- list()
df_2 <- list()
Tereti <- list()
pocetak <- 100000
kraj <- 100003
system.time(
for (i in pocetak:kraj){
  remDr$findElement(using = "xpath", "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$clearElement()
  Sys.sleep(1L)
  remDr$findElement(using = "xpath", 
                    "//input[@id = 'ctl00_Content_FormContent_uiNibTextBox']")$sendKeysToElement(list(as.character(i), 
                                                                                                         key = "enter"))
  waitLoad()
  remDr$screenshot(display = TRUE)
  doc <- htmlParse(remDr$getPageSource()[[1]])
  Sys.sleep(1L)
  Ime <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[1]", fun = xmlValue)
  Oznaka <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[2]", fun = xmlValue)
  NIB <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[3]", fun = xmlValue)
  Vlasnik <- xpathSApply(doc = doc, path = "//*[@id='ctl00_Content_FormContent_uiIspisGrid_ctl00__0']/td[4]", fun = xmlValue)
  LK_LI <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[5]", fun = xmlValue)
  br1 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[6]", fun = xmlValue)
  br2 <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/table/tbody/tr/td[7]", fun = xmlValue)
  x <- i-pocetak + 1
  if (length(NIB)==0){
    Pozivni_znak <- NA
    df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Pozivni_znak)
    df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)
  }else{
    remDr$findElement(using = "xpath", "//input[@title = 'Detalji']")$clickElement()
    waitLoad("//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']", 5)
    doc <- htmlParse(remDr$getPageSource()[[1]], encoding = "UTF-8")
    Sys.sleep(1L)
    list_a <- xpathSApply(doc = doc, path = "/html/body/form/div[4]/div[1]/div[3]/fieldset/h3[1]", fun = xmlValue)
    if (length(list_a) >= 1){

      Namjena <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNamjenaText']/@value")
      json <- paste0("[", '"', Namjena, '"', "]")
      Namjena <- fromJSON(json)
      Namjena <- as.data.frame(Namjena, stringsAsFactors = FALSE)
      colnames(Namjena) <- "Namjena"
      Vrsta_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVrstaPlovilaText']/@value")
      json <- paste0("[", '"', Vrsta_plovila, '"', "]")
      Vrsta_plovila <- fromJSON(json)
      Vrsta_plovila <- as.data.frame(Vrsta_plovila, stringsAsFactors = FALSE)
      colnames(Vrsta_plovila) <- "Vrsta_plovila"
      Model_plovila <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiModelPlovilaText']/@value")
      json <- paste0("[", '"', Model_plovila, '"', "]")
      Model_plovila <- fromJSON(json)
      Model_plovila <- as.data.frame(Model_plovila, stringsAsFactors = FALSE)
      colnames(Model_plovila) <- "Model_plovila"
      Duljina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiDuljinaTrupaText']/@value")
      json <- paste0("[", '"', Duljina_trupa, '"', "]")
      Duljina_trupa <- fromJSON(json)
      Duljina_trupa <- as.data.frame(Duljina_trupa, stringsAsFactors = FALSE)
      colnames(Duljina_trupa) <- "Duljina_trupa"
      Sirina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiSirinaText']/@value")
      json <- paste0("[", '"', Sirina_trupa, '"', "]")
      Sirina_trupa <- fromJSON(json)
      Sirina_trupa <- as.data.frame(Sirina_trupa, stringsAsFactors = FALSE)
      colnames(Sirina_trupa) <- "Sirina_trupa"
      Visina_trupa <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiVisinaText']/@value")
      json <- paste0("[", '"', Visina_trupa, '"', "]")
      Visina_trupa <- fromJSON(json)
      Visina_trupa <- as.data.frame(Visina_trupa, stringsAsFactors = FALSE)
      colnames(Visina_trupa) <- "Visina_trupa"
      Gaz <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGazText']/@value")
      json <- paste0("[", '"', Gaz, '"', "]")
      Gaz <- fromJSON(json)
      Gaz <- as.data.frame(Gaz, stringsAsFactors = FALSE)
      colnames(Gaz) <- "Gaz"
      Nosivost <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiNosivostText']/@value")
      json <- paste0("[", '"', Nosivost, '"', "]")
      Nosivost <- fromJSON(json)
      Nosivost <- as.data.frame(Nosivost, stringsAsFactors = FALSE)
      colnames(Nosivost) <- "Nosivost"
      GT <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGtText']/@value")
      json <- paste0("[", '"', GT, '"', "]")
      GT <- fromJSON(json)
      GT <- as.data.frame(GT, stringsAsFactors = FALSE)
      colnames(GT) <- "GT"
      Snaga_motora <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiUkupnaSnagaText']/@value")
      json <- paste0("[", '"', Snaga_motora, '"', "]")
      Snaga_motora <- fromJSON(json)
      Snaga_motora <- as.data.frame(Snaga_motora, stringsAsFactors = FALSE)
      colnames(Snaga_motora) <- "Snaga_motora"
      Brodogradiliste <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiBrodogradilisteText']/@value")
      Brodogradiliste <- gsub("\"", "'", Brodogradiliste)
      json <- paste0("[", '"', Brodogradiliste, '"', "]")
      Brodogradiliste <- fromJSON(json)
      Brodogradiliste <- as.data.frame(Brodogradiliste, stringsAsFactors = FALSE)
      Encoding(Brodogradiliste[,c(1)]) <- "UTF-8"
      colnames(Brodogradiliste) <- "Brodogradiliste"
      Godina_gradnje <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiGodGradnjeText']/@value")
      json <- paste0("[", '"', Godina_gradnje, '"', "]")
      Godina_gradnje <- fromJSON(json)
      Godina_gradnje <- as.data.frame(Godina_gradnje, stringsAsFactors = FALSE)
      colnames(Godina_gradnje) <- "Godina_gradnje"
      Materijal <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaterijalGradnjeText']/@value")
      json <- paste0("[", '"', Materijal, '"', "]")
      Materijal <- fromJSON(json)
      Materijal <- as.data.frame(Materijal, stringsAsFactors = FALSE)
      colnames(Materijal) <- "Materijal"
      Najveci_broj_osoba <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojOsobaText']/@value")
      json <- paste0("[", '"', Najveci_broj_osoba, '"', "]")
      Najveci_broj_osoba <- fromJSON(json)
      Najveci_broj_osoba <- as.data.frame(Najveci_broj_osoba, stringsAsFactors = FALSE)
      colnames(Najveci_broj_osoba) <- "Najveci_broj_osoba"
      Najveci_broj_putnika <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMaxBrojPutnikaText']/@value")
      json <- paste0("[", '"', Najveci_broj_putnika, '"', "]")
      Najveci_broj_putnika <- fromJSON(json)
      Najveci_broj_putnika <- as.data.frame(Najveci_broj_putnika, stringsAsFactors = FALSE)
      colnames(Najveci_broj_putnika) <- "Najveci_broj_putnika"
      Najmanji_broj_posade <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiMinBrojPosade']/@value")
      json <- paste0("[", '"', Najmanji_broj_posade, '"', "]")
      Najmanji_broj_posade <- fromJSON(json)
      Najmanji_broj_posade <- as.data.frame(Najmanji_broj_posade, stringsAsFactors = FALSE)
      colnames(Najmanji_broj_posade) <- "Najmanji_broj_posade"
      Prethodna_oznaka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaOznakaText']/@value")
      json <- paste0("[", '"', Prethodna_oznaka, '"', "]")
      Prethodna_oznaka <- fromJSON(json)
      Prethodna_oznaka <- as.data.frame(Prethodna_oznaka, stringsAsFactors = FALSE)
      colnames(Prethodna_oznaka) <- "Prethodna_oznaka"
      Prethodna_luka <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaLukaUpisaText']/@value")
      Prethodna_luka <- gsub("\"", "'", Prethodna_luka)
      json <- paste0("[", '"', Prethodna_luka, '"', "]")
      Prethodna_luka <- fromJSON(json)
      Prethodna_luka <- as.data.frame(Prethodna_luka, stringsAsFactors = FALSE)
      colnames(Prethodna_luka) <- "Prethodna_luka"
      Prethodna_drĹľava <- xpathSApply(doc = doc, path = "//input[@id = 'ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiPrethodnaDrzavaUpisaText']/@value")
      json <- paste0("[", '"', Prethodna_drĹľava, '"', "]")
      Prethodna_drĹľava <- fromJSON(json)
      Prethodna_drĹľava <- as.data.frame(Prethodna_drĹľava, stringsAsFactors = FALSE)
      colnames(Prethodna_drĹľava) <- "Prethodna_drĹľava"

      df[[x]] <- cbind(Ime, Oznaka, NIB, Vlasnik, LK_LI, br1, br2, Namjena, Vrsta_plovila, 
                       Model_plovila, Duljina_trupa, Sirina_trupa, Visina_trupa, Gaz, Nosivost, GT,
                       Snaga_motora, Brodogradiliste, Godina_gradnje, Materijal, Najveci_broj_osoba,
                       Najveci_broj_putnika, Najmanji_broj_posade, Prethodna_oznaka,
                       Prethodna_luka, Prethodna_drĹľava)
      df[[x]] <- as.data.frame(df[[x]], stringsAsFactors = FALSE)

      df_2 <- readHTMLTable(doc)
      Sys.sleep(2L)

      Porivni_uredjaji[[x]] <- tryCatch(as.data.frame(cbind(df_2[[2]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Clanovi_posade[[x]] <- tryCatch(as.data.frame(cbind(df_2[[3]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Vlasnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[4]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Korisnici[[x]] <- tryCatch(as.data.frame(cbind(df_2[[5]], NIB), stringsAsFactors = FALSE), error=function(e) print(paste0("Error ", NIB)))
      Tereti[[x]] <- cbind(remDr$findElement(using = "xpath", "//*/span[@id='ctl00_Content_FormContent_uiDetaljiPlovilaControl_uiTeretiLabel']")$getElementText(), NIB)
    }}
}
)

# manipulate data after scraping
for (i in 1:length(df)){
  if (length(df[[i]]) < 13){
    df[[i]] <- matrix(data = rep(NA, 26), nrow = 1, ncol = 26)
    df[[i]] <- as.data.frame(df[[i]])
    colnames(df[[i]]) <- c("Ime", "Oznaka", "NIB", "Vlasnik", "LK_LI", "br1", "br2","Namjena",
                           "Vrsta_plovila", "Model_plovila", "Duljina_trupa", "Sirina_trupa", "Visina_trupa",
                           "Gaz", "Nosivost", "GT", "Snaga_motora", "Brodogradiliste", "Godina_gradnje", 
                           "Materijal", "Najveci_broj_osoba", "Najveci_broj_putnika", "Najmanji_broj_posade", 
                           "Prethodna_oznaka", "Prethodna_luka", "Prethodna_drĹľava")
  }
}

df_final <- do.call(rbind, df)
df_final_1 <- df_final[!is.na(df_final$NIB), ]

编辑2: 我发布的上述代码存在问题。如果我跑:

(cl <- (detectCores() - 1) %>%  makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
# docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
clusterEvalQ(cl, {
  library(RSelenium)
  remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, browserName = "chrome")
  remDr$open()
})
myTitles <- c()
ws <- foreach(x = 1:length(urls), 
              .packages = c("rvest", "magrittr", "RSelenium", "jsonlite", "htmltab", "XML", "RCurl"))  %dopar%  {
  remDr$navigate(urls[x])
  Sys.sleep(3L)
  remDr$getTitle()[[1]]
              }

它会返回错误

Error in { : task 1 failed - "   Summary: UnknownError
     Detail: An unknown server-side error occurred while processing the command.
     Further Details: run errorDetails method"

1 个答案:

答案 0 :(得分:0)

Chrome可能存在问题:3.5.0泊坞窗图片。以下是使用docker toolbox在win 10上运行的:

library(RSelenium)
library(rvest)
library(magrittr)
library(foreach)
library(doParallel)

# using  docker run -d -p 4445:4444 selenium/standalone-chrome:3.5.3
# in windows
URLsPar <- c("https://stackoverflow.com/", "https://github.com/", 
             "http://www.bbc.com/", "http://www.google.com", 
             "https://www.r-project.org/", "https://cran.r-project.org",
             "https://twitter.com/", "https://www.facebook.com/")

appHTML <- c()

(cl <- (detectCores() - 1) %>%  makeCluster) %>% registerDoParallel
# open a remoteDriver for each node on the cluster
clusterEvalQ(cl, {
  library(RSelenium)
  remDr <- remoteDriver(remoteServerAddr = "192.168.99.100", port = 4445L, 
                        browserName = "chrome")
  remDr$open()
})
ws <- foreach(x = 1:length(URLsPar), 
              .packages = c("rvest", "magrittr", "RSelenium"))  %dopar%  {
                print(URLsPar[x])
                remDr$navigate(URLsPar[x])
                remDr$getTitle()[[1]]
              }
> ws
[[1]]
[1] "Stack Overflow - Where Developers Learn, Share, & Build Careers"

[[2]]
[1] "The world's leading software development platform · GitHub"

[[3]]
[1] "BBC - Homepage"

[[4]]
[1] "Google"

[[5]]
[1] "R: The R Project for Statistical Computing"

[[6]]
[1] "The Comprehensive R Archive Network"

[[7]]
[1] "Twitter. It's what's happening."

[[8]]
[1] "Facebook - Log In or Sign Up"     


# close browser on each node
clusterEvalQ(cl, {
  remDr$close()
})

stopImplicitCluster()