I wrote the following code, which scrapes tender information from the portal on a daily basis.
packages <- c('rvest', 'stringi', 'tidyverse','lubridate','dplyr')
purrr::walk(packages, library, character.only = TRUE, warn.conflicts = FALSE)
start_time <- proc.time()
Scrape the main page and obtain the complete set of records.
data <- read_html('https://eprocure.gov.in/mmp/latestactivetenders')
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
All_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
All_tenders <- cbind(All_tenders,links_fair)
Read the total number of records to be fetched.
Count_of_Recs_raw <- html_nodes(data, xpath = '//*[(@id = "edit-l-active-teners")]//div')
Count_of_Recs <- as.numeric(gsub("Total Tenders : ","",html_text(Count_of_Recs_raw[1])))
Functions for cleaning and processing data fields such as dates and factors.
process_dates <- function(data){
cols2date <- c('Bid.Submission.Closing.Date','epublished_date','document_download_start_date','bid_submission_start_date','bid_opening_date','document_download_end_date','bid_submission_end_date')
date_processed_data <- data
date_processed_data[cols2date] <- lapply(data[cols2date] , dmy_hm)
return(date_processed_data)
}
clean_process_data <- function(data){
cols2factor <- c('State.Name','product_category','pre_qualification','organisation_name','organisation_type','tender_type')
clean_processed_data <- data
clean_processed_data[cols2factor] <- lapply(data[cols2factor] , factor)
#clean_processed_data <- process_dates(clean_processed_data)
return(clean_processed_data)
}
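For reference, a minimal usage sketch of these helpers (the calls below are not in the original post, and assume the scraped table really contains the named columns):

All_tenders <- clean_process_data(All_tenders)  # converts the listed columns to factors
str(All_tenders$State.Name)                     # should now print as a factor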
The table scraping starts here. The first page has already been scraped to obtain the structure of the data frame.
for (page_no in 2:round(Count_of_Recs/10)){
closeAllConnections()
on.exit(closeAllConnections())
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
url <- paste(url_bit1, page_no, sep="")
cat(page_no,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
All_tenders <- rbind(All_tenders,Page_tenders)
}
This for loop usually ends up taking hours to complete. I am looking to use the apply family of functions to good effect in order to save time. This program is also responsible for fetching and processing all the records, and then it scrapes a completely new page for each individual record (code not listed here)....
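For context, a rough sketch of what that per-record step could look like; the helper name, the base-URL join and the XPath below are my assumptions, not the original code:

# Hypothetical sketch of the per-record scrape (not the original code):
# follow each tendersfullview link and pull the text of its table cells.
read_tender_detail <- function(link){
  detail <- read_html(paste0('https://eprocure.gov.in', link))
  html_text(html_nodes(detail, xpath = '//td'))
}
details <- lapply(as.character(All_tenders$links_fair), read_tender_detail)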
I tried the following code, but it does not give me what I want:
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
closeAllConnections()
on.exit(closeAllConnections())
url <- paste(url_bit1, datain$S.No., sep="")
cat(S.No.,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
All_tenders <- rbind(All_tenders,Page_tenders)
}
All_tenders <- sapply(All_tenders, FUN=read_page(All_tenders$S.No.))
Any suggestions, guidance, advice, inputs or help is welcome. I have been using R for only 3-4 months. I am also aware of Python's edge over R in this matter, but I am inclined towards R for solving this problem.
Answer 0 (score: 1)
Your sapply function is incorrect. I made some edits to your code and tested it on a sample size of N = 50. We can use system.time() to see how long it takes to complete the task.
The "for" approach:
system.time(
for (page_no in 1:50){
closeAllConnections()
on.exit(closeAllConnections())
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
url <- paste(url_bit1, page_no, sep="")
cat(page_no,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
All_tenders <- rbind(All_tenders,Page_tenders)
}
)
#user system elapsed
# 50.15 81.26 132.73
The "lapply" approach:
All_tenders = NULL
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
closeAllConnections()
on.exit(closeAllConnections())
url <- paste(url_bit1, datain, sep="")
cat(datain,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
All_tenders <- rbind(All_tenders,Page_tenders)
}
system.time(
All_tenders <- lapply(1:50, function(x) read_page(x))
)
# user system elapsed
# 49.84 78.97 131.16
If we want to put the results in a data frame, convert the All_tenders list into a data frame as follows:
All_tenders <- do.call(rbind, lapply(All_tenders, data.frame, stringsAsFactors = FALSE))
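As an aside, since the tidyverse is already attached, dplyr::bind_rows() accepts such a list of data frames directly:

All_tenders <- dplyr::bind_rows(All_tenders)  # equivalent one-liner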
It turns out lapply is slightly faster.
Answer 1 (score: 1)
for loops and sapply work differently:
- for loops execute iteratively: the computation is done on the first element, then on the second element, and so on...
- sapply performs the operation on each element of a list independently (in any order), so its results are built independently.
So in your for loop, when you do:
All_tenders <- rbind(All_tenders,Page_tenders)
the All_tenders variable grows iteratively.
In the sapply function this does not work, because each call knows nothing about the results for the other elements.
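A miniature illustration of that difference (my example, not from the original post):

acc <- NULL
for (i in 1:3) acc <- rbind(acc, data.frame(page = i))  # grows one shared variable
res <- lapply(1:3, function(i) data.frame(page = i))    # each result built independently
identical(acc, do.call(rbind, res))                     # TRUE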
So you should do something like this instead:
url_bit1 <- 'https://eprocure.gov.in/mmp/latestactivetenders/page='
read_page <- function(datain){
closeAllConnections()
on.exit(closeAllConnections())
url <- paste(url_bit1, datain, sep="")
cat(datain,"\t",proc.time() - start_time,"\n")
data <- read_html(url)
total_tenders_raw <- html_nodes(data,xpath = '//*[(@id = "table")]')
Page_tenders <- data.frame(html_table(total_tenders_raw, header = TRUE))
links <- html_nodes(data, xpath='//*[(@id = "table")] | //td | //a')
links_fair <- html_attr(links,'href')
links_fair <- links_fair[grep("tendersfullview",links_fair)]
Page_tenders <- cbind(Page_tenders,links_fair)
return(Page_tenders)
}
so that it returns the result for each page, and then apply it as follows:
All_tenders_tmp <- sapply(2:round(Count_of_Recs/10), FUN = read_page, simplify = FALSE)  # simplify = FALSE keeps the per-page data frames as a list
Your results will then be a list of all the page results, which you can combine with, for example, data.table::rbindlist.
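For instance (assuming the data.table package is installed):

All_tenders <- data.table::rbindlist(All_tenders_tmp)  # one table from the list of pages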
I hope that was clear.