由于某些网页缺少 result-cats 标记,我很难提取该字段的值
我已经看过这个相关问题(here),但仍然无法正确抓取数据。
HTML :
<div class="result ">
<span class="result-txt">
<span class="result-name">
<a href="/some/value/">COMPANY_NAME</a>
<a class="result-icons" href="/some/value/COMPANY_NAME_/">
<span title="Info" class="sprite sprite-info">Info</span>
<span title="Phone" class="sprite sprite-phone">Phone</span>
</a>
</span>
<em>
<a href="/some/value/">LOCATION</a>
<span> ADDRESS </span>
</em>
<span class="result-cats">
<a href="/some/value/" title="CAT1">CAT1</a>
<a href="/some/value/" title="CAT2">CAT2</a>
</span>
</span>
</div>
我正在尝试以下代码,但由于某些结果没有 result-cats 标记,代码会报错:构建数据框时,各列向量的长度不一致。
代码:
library(rvest)
library(XML)
library(stringi)
# Scrape each page and build one data-frame row per result block.
#
# Every field is extracted relative to its own `.result-txt` node with
# html_node() (singular). html_node() returns exactly one entry per input
# node and a missing node where the selector does not match, which
# html_text()/html_attr() turn into NA. All columns therefore have the same
# length even when a result has no `.result-cats` element.
page_rows <- vector("list", 100)
for (i in seq_len(100)) {
  print(paste0("Page: ", i))
  # NOTE(review): placeholder URL; it does not depend on `i`, so every
  # iteration fetches the same page -- build the real per-page URL here.
  url <- "url.com"
  page <- read_html(url)

  # One node per search result; all selectors below are scoped to these.
  results <- html_nodes(page, ".result-txt")

  CompanyNameNode <- html_node(results, ".result-name a:nth-child(1)")
  CompanyName <- html_text(CompanyNameNode)
  CompanyLink <- html_attr(CompanyNameNode, "href")

  Address <- html_text(html_node(results, "em"))
  Address <- gsub("[\r\n]", "", Address)

  # NA where the result has no categories; stri_trim() and gsub() keep NA.
  cats <- html_text(html_node(results, ".result-cats"))
  cats <- stri_trim(cats)
  cats <- gsub("[\r\n]", ", ", cats)

  # Collect per-page frames and bind once after the loop instead of
  # re-copying `df` with rbind() on every iteration.
  page_rows[[i]] <- data.frame(
    CompanyName = CompanyName,
    CompanyLink = CompanyLink,
    Address = Address,
    cats = cats
  )
}
df <- do.call(rbind, page_rows)
更新:使用以下代码解决了这个问题:
# Extract the category text for every result on `page`, yielding NA for
# results that have no `.result-cats` element.
pg <- html_nodes(page, ".result-txt")

# html_node() (singular) returns one entry per node in `pg`, with a missing
# node where the selector does not match; html_text() maps those to NA.
# This replaces the index loop, which iterated over c(1, 0) when `pg` was
# empty and evaluated the selector twice per result.
cats <- html_text(html_node(pg, ".result-cats"))
cats <- stri_trim(cats)
cats <- gsub("[\r\n]", ", ", cats)
答案 0 :(得分:1)
使用以下代码解决了这个问题:
pg <- html_nodes(page,'.result-txt')
# Category text for each result node in `pg`; NA when a result has no
# `.result-cats` element. html_node() (singular) keeps one entry per input
# node, so no index loop is needed and an empty `pg` simply gives a
# zero-length vector instead of iterating over c(1, 0).
cats <- html_text(html_node(pg, ".result-cats"))
cats <- stri_trim(cats)
cats <- gsub("[\r\n]", ", ", cats)