我使用 RSelenium 在联合国条约集合网站上提交表格并保存结果。一切都运作良好,只是条约的名称在我的最终表格中被截断了。这是因为 readHTMLTable 能读取的字符数量有限制,还是我做错了什么?
这是一个(希望)可重复的例子:
###
### RSelenium scraping of UN treaty collection
###
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
rm(list = ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")

# Start Selenium Server --------------------------------------------------------
checkForServer()          ## check that the server binary is available
startServer()             ## start the Selenium server
remDrv <- remoteDriver()  ## create a connection to the server
remDrv$open()             ## open a browser session
Sys.sleep(5)
#remDrv$getStatus() ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------
## go to the UNTS search page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## radio button 'cnd2': match ALL the search terms
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$clickElement()
Sys.sleep(5)

## results-per-page pull-down: select the 500-results option directly
webElem3 <- remDrv$findElement(
  using = 'xpath',
  "//*/select[@class = 'basicPullDn']/option[@value = '500']"
)
webElem3$clickElement()
Sys.sleep(5)

## free-text search box the queries are typed into
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")
Sys.sleep(5)

## accumulates the result rows across all searched countries
df_all <- data.frame()

## id of the results table on the page; used both to detect whether the search
## matched anything and to extract the right table by NAME instead of a
## fragile positional index such as [[37]]
results_table_id <- "ctl00_ContentPlaceHolder1_dgSearch"

###### need to run search for multiple countries
country_list <- c("Morocco", "Italy", "France")
for (country in country_list) {
  Sys.sleep(5)
  ## build the query string for this country
  keys <- paste(country, "Agreement promotion investment", sep = " ")
  ## submit the search, one country at a time, and save the results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20)
  ## parse the rendered page so that we can search it
  doc <- htmlParse(remDrv$getPageSource()[[1]])
  tables <- readHTMLTable(doc)  ## extract all tables
  ## the results table only exists when the search matched something
  if (any(grepl(results_table_id, names(tables)))) {
    ## extract the table of interest by its id, skipping the header row
    all_tabs <- readHTMLTable(doc, stringsAsFactors = FALSE, skip.rows = 1)
    tabledat <- all_tabs[[grep(results_table_id, names(all_tabs))[1]]]
    df_all <- rbind(tabledat, df_all)
  } else {
    print("caccadicane")  ## original marker: no results for this country
  }
}

## drop the trailing junk columns and save
write.csv(df_all[, -(7:ncol(df_all))], "un_bits.csv")
结果是:
V1 V2 V3 V4 V5 V6
1 I-42051 Agreement between the Government of... See Details 08/07/1996 27/07/2000 Bilateral
2 I-35582 Agreement between the Government of... See Details 11/10/1995 22/06/1997 Bilateral
3 I-35481 Agreement between the Government of... See Details 30/11/1995 30/05/1997 Bilateral
4 I-23169 Agreement concerning the establishm... See Details 28/06/1980 28/06/1980 Bilateral
5 I-29086 Exchange of notes constituting an a... See Details 12/08/1985 12/08/1985 Bilateral
6 I-43258 Agreement on the promotion and prot... See Details 27/01/1999 08/05/2001 Bilateral
为什么V2中的字符串被截断?
答案 0(得分:0)
好的,经过一段时间后,我发现即使 readHTMLTable 命令有限制,它也不是这个例子中文本被截断的原因。通过更仔细地检查,我发现 HTML 文件里的文本本身就已经被截断了,而全名保存在单元格的 "title" 属性中。
因此,解决方案是读取每个单元格的 "title" 属性来获得协议的全名。下面是代码,如果有人有兴趣,里面还添加了其他一些东西。
###
### RSelenium scraping of UN treaty collection
### Full treaty titles are recovered from each cell's "title" attribute,
### because the visible cell text is truncated in the page's own HTML.
###
# https://cran.r-project.org/web/packages/RSelenium/vignettes/RSelenium-basics.html
rm(list = ls())
###
setwd("C:/Users/HP/Desktop/BIT/UN_treaties")
library("XML")
library("RCurl")
library("RSelenium")
library("RHTMLForms")

# Start Selenium Server --------------------------------------------------------
checkForServer()          ## check that the server binary is available
startServer()             ## start the Selenium server
remDrv <- remoteDriver()  ## create a connection to the server
remDrv$open()             ## open a browser session
Sys.sleep(5)
#remDrv$getStatus() ## info connection, not necessary

# Simulate browser session and fill out form -----------------------------------
## go to the UNTS search page
remDrv$navigate('https://treaties.un.org/Pages/UNTSOnline.aspx?id=2')

## radio button 'cnd2': match ALL the search terms
webElem1 <- remDrv$findElement(using = 'xpath', "//*/input[@value = 'cnd2']")
webElem1$clickElement()
Sys.sleep(5)

## results-per-page pull-down: select the 500-results option directly
webElem3 <- remDrv$findElement(
  using = 'xpath',
  "//*/select[@class = 'basicPullDn']/option[@value = '500']"
)
webElem3$clickElement()
Sys.sleep(5)

## free-text search box the queries are typed into
webElem0 <- remDrv$findElement(using = 'css selector', "input.login")
Sys.sleep(5)

## accumulates the result rows across all searched countries
df_all <- data.frame()

## id of the results table; used both to detect whether the search matched
## anything and to extract the table by NAME instead of a fragile positional
## index such as [[37]]
results_table_id <- "ctl00_ContentPlaceHolder1_dgSearch"

###### need to run search for multiple countries
## 'participants' avoids shadowing base::names
participants <- read.csv("participants_clean.csv")
country_list <- participants$names
current_search <- length(country_list)

for (country in country_list) {
  print("-------------------------")
  print("-------------------------")
  print(paste("Still", current_search, "searches to do... ", sep = " "))
  print(paste("Now looking for treaties signed by... ", country,
              " ----------------------->>", sep = " "))
  Sys.sleep(5)
  ## build the query string for this country
  keys <- paste(country, "Agreement promotion investment", sep = " ")
  ## submit the search, one country at a time, and save the results
  webElem0$clearElement()
  webElem0$sendKeysToElement(list(keys, key = "enter"))
  Sys.sleep(20)
  ## parse the rendered page so that we can search it
  doc <- htmlParse(remDrv$getPageSource()[[1]])
  tables <- readHTMLTable(doc)  ## extract all tables
  ## the results table only exists when the search matched something
  if (any(grepl(results_table_id, names(tables)))) {
    ## extract the table of interest by its id
    all_tabs <- readHTMLTable(doc, stringsAsFactors = FALSE)
    tabledat <- all_tabs[[grep(results_table_id, names(all_tabs))[1]]]
    treatfou <- nrow(tabledat)
    print(paste("Amazing, I just found", treatfou - 1, " !!", sep = " "))
    ## recover the full treaty names and the detail-page urls row by row;
    ## row 1 is the header, so only rows 2..treatfou carry data
    ## (seq_len(treatfou)[-1] is empty-safe when treatfou == 1, unlike 2:treatfou)
    names_new <- vector(mode = "character", length = treatfou)
    urls <- vector(mode = "character", length = treatfou)
    for (jj in seq_len(treatfou)[-1]) {
      ## the full name lives in the cell's "title" attribute, not its text
      cell_xpath <- paste0("//*[@id='", results_table_id,
                           "']/tbody/tr[", jj, "]/td[2]")
      cell <- remDrv$findElement(using = 'xpath', cell_xpath)
      names_new[[jj]] <- as.character(cell$getElementAttribute("title"))
      ## the "See Details" anchor hides the record url in a javascript call
      link_xpath <- paste0("//*[@id='", results_table_id,
                           "']/tbody/tr[", jj, "]/td[3]/a")
      link <- remDrv$findElement(using = 'xpath', link_xpath)
      href <- unlist(link$getElementAttribute("href"))
      ## need to escape the parenthesis with \\
      href <- gsub("javascript:void%20window.open\\('", "", href)
      href <- gsub("\\'.*", "", href)  ## drop everything after the closing '
      urls[[jj]] <- paste0("https://treaties.un.org/Pages/", href)
    }
    ## substitute in the real titles, tag the searched party, attach urls
    tabledat$title <- names_new
    tabledat$party <- country
    tabledat$url <- urls
    ## drop the header row and prepend to the running result
    df_all <- rbind(tabledat[-1, ], df_all)
  } else {
    print("Too bad, there is nothing, I'll try with the next one :) ")
  }
  current_search <- current_search - 1
}

write.csv(df_all[, -(7:10)], "un_bits.csv")