我正在尝试通过https连接到远程站点并下载一些信息。我正在这样做:
library("httr")
library("XML")
library(RCurl)
# Address of the sales search page to download
url <- "https://salesweb.civilview.com/Sales/SalesSearch?countyId=3"
# Curl options: host-name and peer-certificate verification are both disabled
sslOptions <- list(ssl.verifyhost = 0L, ssl.verifypeer = 0L)
# Download the page body as text
file <- getURL(url, .opts = sslOptions)
每行都有“详细信息”链接,该链接提供了有关每条记录的更多信息。我需要下载url并进入每个“详细信息”部分,并将其与初始数据集合并。
我该怎么做?
答案 0 :(得分:1)
如果我没有理解错,您想从主页面 https://salesweb.civilview.com/Sales/SalesSearch?countyId=3 的主表中检索数据,并同时获取主表中每条记录的详细信息数据。
作为示例,我编写了一段代码,可以从主页检索数据并存入一个结构化的数据框,其中第一列是每条记录详细信息页面的网址。
#load libraries
library(rvest)
library (tidyverse)
# assign url
url <- "https://salesweb.civilview.com/Sales/SalesSearch?countyId=3"
# extract td tags contents
readUrlHtml <- read_html(url) %>% html_nodes("td")

# Every record of the results table spans six consecutive <td> cells, in the
# column order of df: Details, Sheriff, SalesDate, Plaintiff, Defendant, Address.
cellsPerRecord <- 6
recordCount <- length(readUrlHtml) %/% cellsPerRecord
# Position of the first cell of each record. seq_len() yields an empty vector
# when no cells were found, so an empty page gives a zero-row data frame
# instead of a loop running over 1:0.
firstCell <- (seq_len(recordCount) - 1) * cellsPerRecord + 1
cellText <- html_text(readUrlHtml)

# Read the PropertyId from the href of the link in the first cell rather than
# from fixed character positions in the cell markup, so ids of any length work.
detailHref <- readUrlHtml[firstCell] %>% html_node("a") %>% html_attr("href")
propertyId <- str_match(detailHref, "PropertyId=([^&#]+)")[, 2]

# Build the whole data frame in one step; str_c() keeps NA (cell without a
# details link) and zero-length input as they are.
df <- data.frame(
  Details = str_c("https://salesweb.civilview.com/Sales/SaleDetails?PropertyId=",
                  propertyId),
  Sheriff = cellText[firstCell + 1],
  SalesDate = cellText[firstCell + 2],
  Plaintiff = cellText[firstCell + 3],
  Defendant = cellText[firstCell + 4],
  Address = cellText[firstCell + 5],
  stringsAsFactors = FALSE
)

# values check (a row number beyond nrow(df) prints as a row of NA)
df[1, ]
df[50, ]
df[525, ]
使用 rvest 包,您还可以检索详细信息页面的数据,并将其保存在一个新的数据框中。
为了检索详细信息数据,您需要从主URL保存cookie信息。完成后,您可以创建一个新的数据框来存储该数据:这将显示在代码的更新版本中。
1)新增的库 httr 用于检索 cookie 数据
2)要检索的详细信息数据是截图中红色矩形内的部分(若要检索最后一部分数据,我建议创建一个新的数据框来存储这些额外数据,但我想这会大大增加处理所有数据所需的时间!)
3)两个数据框 df 和 dfDetails 可以通过 Details 键进行合并
#load libraries
library(rvest)
library (tidyverse)
library (httr) #new library
# assign url
url <- "https://salesweb.civilview.com/Sales/SalesSearch?countyId=3"
# extract td tags contents
readUrlHtml <- read_html(url) %>% html_nodes("td")

# Every record of the results table spans six consecutive <td> cells, in the
# column order of df: Details, Sheriff, SalesDate, Plaintiff, Defendant, Address.
cellsPerRecord <- 6
recordCount <- length(readUrlHtml) %/% cellsPerRecord
# Position of the first cell of each record. seq_len() yields an empty vector
# when no cells were found, so an empty page gives a zero-row data frame
# instead of a loop running over 1:0.
firstCell <- (seq_len(recordCount) - 1) * cellsPerRecord + 1
cellText <- html_text(readUrlHtml)

# Read the PropertyId from the href of the link in the first cell rather than
# from fixed character positions in the cell markup, so ids of any length work.
detailHref <- readUrlHtml[firstCell] %>% html_node("a") %>% html_attr("href")
propertyId <- str_match(detailHref, "PropertyId=([^&#]+)")[, 2]

# Build the whole data frame in one step; str_c() keeps NA (cell without a
# details link) and zero-length input as they are.
df <- data.frame(
  Details = str_c("https://salesweb.civilview.com/Sales/SaleDetails?PropertyId=",
                  propertyId),
  Sheriff = cellText[firstCell + 1],
  SalesDate = cellText[firstCell + 2],
  Plaintiff = cellText[firstCell + 3],
  Defendant = cellText[firstCell + 4],
  Address = cellText[firstCell + 5],
  stringsAsFactors = FALSE
)

# values check (a row number beyond nrow(df) prints as a row of NA)
df[1, ]
df[50, ]
df[525, ]
## UPDATED SECTION TO RETRIEVE THE URLS DETAILS ##
# Request the main page once so the server issues its session cookies.
urlInfos <- GET(url)
stop_for_status(urlInfos)

# cookies() returns one row per cookie; send them back as name = value pairs.
# A backtick-quoted argument name such as `urlInfos$cookies[6]` is taken
# literally as the cookie name, so the pairs are passed through .cookies.
sessionCookies <- cookies(urlInfos)
cookieConfig <- if (nrow(sessionCookies) > 0) {
  set_cookies(.cookies = setNames(as.character(sessionCookies$value),
                                  as.character(sessionCookies$name)))
} else {
  config()
}

# Download one details page and return its fields as a one-row data frame.
#   detailUrl     - address of the details page (one element of df$Details)
#   requestConfig - httr config carrying the session cookies
# Fields that cannot be read (missing url, HTTP error, missing cell) are NA.
readSaleDetails <- function(detailUrl, requestConfig) {
  cellText <- character(0)
  if (!is.na(detailUrl)) {
    responseDetail <- GET(detailUrl, requestConfig)
    if (http_error(responseDetail)) {
      # Keep going: one failed page should not discard minutes of downloads.
      warning("Could not retrieve ", detailUrl, " (HTTP ",
              status_code(responseDetail), ")", call. = FALSE)
    } else {
      cellText <- read_html(responseDetail) %>%
        html_nodes("td") %>%
        html_text()
    }
  }
  # The cell positions below are the ones the original script used.
  # NOTE(review): Description was never assigned a cell and stays NA; confirm
  # the positions against the details page before relying on these columns.
  data.frame(Details = detailUrl,
             Sheriff = cellText[2],
             CourtCase = cellText[4],
             SalesDate = cellText[6],
             Plaintiff = cellText[8],
             Defendant = cellText[10],
             Address = cellText[12],
             Description = NA_character_,
             ApproxUpset = cellText[14],
             Attorney = cellText[16],
             AttorneyPhone = cellText[18],
             stringsAsFactors = FALSE)
}

# create empty details dataframe (fixes column names and order, and is the
# result when df has no rows)
dfDetails <- data.frame(Details = character(),
                        Sheriff = character(),
                        CourtCase = character(),
                        SalesDate = character(),
                        Plaintiff = character(),
                        Defendant = character(),
                        Address = character(),
                        Description = character(),
                        ApproxUpset = character(),
                        Attorney = character(),
                        AttorneyPhone = character(),
                        stringsAsFactors = FALSE)

# harvest the details: takes a while to retrieve all records! (5-6 mins)
# For a quick test, replace df$Details with head(df$Details, 3).
detailRows <- lapply(df$Details, readSaleDetails, requestConfig = cookieConfig)
# Bind all rows at once instead of growing the data frame inside a loop.
dfDetails <- do.call(rbind, c(list(dfDetails), detailRows))

# values detail check (a row number beyond nrow(dfDetails) prints as NA)
dfDetails[1, ]
dfDetails[50, ]
dfDetails[525, ]