从网站中所有网页上抓取完整数据

时间:2018-10-22 07:35:41

标签: r web-scraping rvest

使用此代码,我可以从该网站的首页获取数据。但是我想获取完整的数据,即从所有分页的网页中提取数据。提取数据后,应将其保存为 Excel 或 CSV 文件。

# Attach the scraping (rvest) and wrangling (dplyr) packages, installing
# each one only if it is not already available — the original script
# re-ran install.packages() unconditionally on every execution.
for (pkg in c("rvest", "dplyr")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

# Scrape one page of the GeM bid-result listing and return a data.frame
# with one row per bid block.
#
# page_no: 1-based page number of the paginated listing.
#
# NOTE: the original code put the literal characters "page_no=i" in the
# URL string — `i` is never interpolated inside a quoted string in R, so
# every request fetched the same (first) page. The page number must be
# pasted into the URL, which is what this function does.
scrape_bid_page <- function(page_no) {
  url <- paste0(
    "https://bidplus.gem.gov.in/bidresultlists?bidresultlists&page_no=",
    page_no
  )
  pg <- read_html(url)

  # Each bid on the page lives in a div with class "block".
  blocks <- html_nodes(pg, ".block")

  # The "Item(s)" column block holds both the item description and quantity.
  items_and_quantity <- html_nodes(
    blocks,
    xpath = ".//div[@class='col-block' and contains(., 'Item(s)')]"
  )

  items <- html_nodes(
    items_and_quantity,
    xpath = ".//strong[contains(., 'Item(s)')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)

  quantity <- html_nodes(
    items_and_quantity,
    xpath = ".//strong[contains(., 'Quantity')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE) %>%
    as.numeric()

  # Collapse the multi-line address into a single "|"-delimited string.
  department_name_and_address <- html_nodes(
    blocks,
    xpath = ".//div[@class='col-block' and contains(., 'Department Name And Address')]"
  ) %>%
    html_text(trim = TRUE) %>%
    gsub("\n", "|", .) %>%
    gsub("[[:space:]]*\\||\\|[[:space:]]*", "|", .)

  block_header <- html_nodes(blocks, "div.block_header")

  # Bid number, stripped of its "BID NO: " style prefix.
  bid_no <- html_nodes(
    block_header,
    xpath = ".//p[contains(@class, 'bid_no')]"
  ) %>%
    html_text(trim = TRUE) %>%
    gsub("^.*: ", "", .)

  status <- html_nodes(
    block_header,
    xpath = ".//p/b[contains(., 'Status')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)

  start_date <- html_nodes(
    blocks,
    xpath = ".//strong[contains(., 'Start Date')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)

  end_date <- html_nodes(
    blocks,
    xpath = ".//strong[contains(., 'End Date')]/following-sibling::span"
  ) %>%
    html_text(trim = TRUE)

  data.frame(
    bid_no,
    status,
    start_date,
    end_date,
    items,
    quantity,
    department_name_and_address,
    stringsAsFactors = FALSE
  )
}

# Scrape pages 1..n_pages and stack them into one data.frame.
# n_pages = 1 reproduces the original single-page behavior; raise it to
# cover the full listing (consider adding Sys.sleep() between requests
# to be polite to the server).
n_pages <- 1
xdf <- bind_rows(lapply(seq_len(n_pages), scrape_bid_page))

# Flag reverse-auction bids: their bid numbers contain "/RA/".
xdf$is_ra <- grepl("/RA/", xdf$bid_no)

# Inspect the scraped table. Sample output from a one-page run:
str(xdf)
## 'data.frame': 10 obs. of  8 variables:
##  $ bid_no                     : chr  "GEM/2018/B/93066" "GEM/2018/B/93082" ...
##  $ status                     : chr  "Not Evaluated" "Not Evaluated" ...
##  $ start_date                 : chr  "25-09-2018 03:53:pm" "27-09-2018 09:16:am" ...
##  $ end_date                   : chr  "18-10-2018 03:00:pm" "18-10-2018 03:00:pm" ...
##  $ items                      : chr  "automotive chassis fitted with engine" ...
##  $ quantity                   : num  1 1 1 2 90 1 981 6 4 376
##  $ department_name_and_address: chr  "Department Name And Address:||Ministry Of Steel ..." ...
##  $ is_ra                      : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...

xdf

# Export the full table to CSV in a single call.
#
# The original section was broken in three ways:
#   * write.csv(..., append = TRUE) — write.csv() forcibly sets
#     append = FALSE (it must always emit a header), so the argument is
#     ignored with a warning.
#   * write.table(xdf1, ...) referenced `xdf1`, an object that was never
#     created, and so errored.
#   * the file was written several times over, each call clobbering the
#     previous one.
# Because all pages are already stacked into `xdf`, no appending is
# needed — one write suffices.
write.csv(xdf, "xdf.csv", row.names = FALSE)

1 个答案:

答案 0 :(得分:1)

尝试这个:

src/index.d.ts