Question

问题：从网页（imdb.com 上带有电影详细信息的页面）抓取数据时，出现了错误消息。检查详细信息后，我发现某些条目缺少数据。如何在抓取过程中找出哪一行缺少数据，并用 NA 填充它？

手动调查：我已经手动检查过网页，问题出在第 1097 条，该条目只有电影类型，没有片长（runtime）。

尝试过：如果追加一个 0，它确实会被加进去，但是加在最后一行，而不是加在缺少该值的那部电影上。

代码：

#install packages
install.packages("rvest")
install.packages("RSelenium")
library(rvest)
library(RSelenium)

# Launch a Selenium session driving Firefox and keep a handle to its client
rD <- rsDriver(browser = "firefox")
remDr <- rD[["client"]]

# Start offsets of the result pages to request: 5 values, 250 apart
ile <- seq(from = 1, length.out = 5, by = 250)

# Accumulator that receives the rows scraped from every page
filmy_df <- data.frame()

# Placeholders for the per-page vectors
rank_data <- link <- year <- title_data <- description_data <-
  runtime_data <- genre_data <- NA

#loop reading the data from each page
# Scrape one results page per start offset and append its rows to filmy_df
for (j in ile) {
  # Build the URL of the results page that starts at offset j
  newURL <- "https://www.imdb.com/search/title/?title_type=feature&release_date=,2018-12-31&count=250&start="
  startNumberURL <- paste0(newURL, j)

  # Show the page in the Selenium browser, then download it for parsing
  remDr$navigate(startNumberURL)
  strona_int <- read_html(startNumberURL)

  # Reset the per-page vectors
  rank_data <- link <- year <- title_data <- description_data <-
    runtime_data <- genre_data <- NA

  # Rank: node text without the thousands separator, as numeric
  rank_data <- strona_int %>%
    html_nodes('.text-primary') %>%
    html_text()
  rank_data <- as.numeric(gsub(",", "", rank_data))

  # Absolute link to each film, taken from the header anchors
  link <- strona_int %>%
    html_nodes('.lister-item-header a') %>%
    html_attr('href') %>%
    url_absolute("https://www.imdb.com")

  # Release year: keep the digits only and store as a factor
  year <- strona_int %>%
    html_nodes('.lister-item-year') %>%
    html_text()
  year <- as.factor(gsub("\\D", "", year))

  # Title: text of the header anchors
  title_data <- strona_int %>%
    html_nodes('.lister-item-header a') %>%
    html_text()

  # Description: drop newlines, then leading whitespace
  description_data <- strona_int %>%
    html_nodes('.ratings-bar+ .text-muted') %>%
    html_text()
  description_data <- trimws(gsub("\n", "", description_data), "l")

  # Runtime: strip the " min" suffix and convert to numeric
  runtime_data <- strona_int %>%
    html_nodes('.text-muted .runtime') %>%
    html_text()
  runtime_data <- gsub(" min", "", runtime_data)
  length_runtime_data <- length(runtime_data)
  # Attempted workaround, left disabled:
  #if (length_runtime_data<250){ runtime_data<-append(runtime_data,list(0))}
  runtime_data <- as.numeric(runtime_data)

  # Combine this page's vectors column-wise; data.frame() requires them to
  # have compatible lengths
  filmy_df_temp <- data.frame(
    Rank = rank_data,
    Title = title_data,
    Release.Year = year,
    Link = link,
    Description = description_data,
    Runtime = runtime_data
  )

  # Append the page to the accumulated result
  filmy_df <- rbind(filmy_df, filmy_df_temp)
}

# Close the browser session held by the Selenium client
remDr$close()
# Stop the server component of the rsDriver() result
rD[["server"]]$stop()

显示的错误消息：

"Error in data.frame(Rank = rank_data, Title = title_data, Release.Year = year, : arguments imply differing number of rows: 250, 249"（即：各参数隐含的行数不同：250、249）

runtime_data 仅包含 249 个条目，而不是 250 个；结果是最后一行没有片长数据，而不是真正缺少数据的那一行。

更新：我发现了一个有趣的线索，也许有助于解决问题。请查看两张图片：Anima（出错的条目）和 Knocked Up（紧随其后的条目）。

比较这两张图片可以发现，导致 runtime_data 出问题的 Anima 条目根本不包含片长对应的 HTML 节点。

那么问题：有没有办法检查html_node是否存在？如果是，该怎么做？

Answer 1

如果您的程序结构稍有不同，则不会遇到此问题。通常，最好将程序分成逻辑上独立的块，这些块或多或少彼此独立，而不是一次完成所有操作。这使调试更加容易。

首先，抓取数据并将其存储在列表中——使用 lapply 或类似的方法。

newURL <- "https://www.imdb.com/search/title/?title_type=feature&release_date=,2018-12-31&count=250&start="

# Fetch every results page once and keep the parsed documents in a list,
# one element per start offset in ile
pages <- lapply(paste0(newURL, ile), function(startNumberURL) {
  # Show the page in the Selenium browser
  remDr$navigate(startNumberURL)

  # Download and parse the page; this is the value stored in the list
  read_html(startNumberURL)
})

这样数据就已经抓取完毕，您可以从容地进行分析和过滤，而无需重新读取网页。例如，将函数定义如下：

#' Parse one search-results page into a data frame
#'
#' Extracts rank, title, release year, link, description and runtime with CSS
#' selectors and combines them column-wise. Every selector is matched over the
#' whole page, so the columns only line up when each selector matches the same
#' number of nodes; otherwise `data.frame()` raises an error.
#'
#' @param strona_int A parsed HTML document, as returned by `read_html()`.
#' @return A data frame with columns Rank, Title, Release.Year, Link,
#'   Description and Runtime.
parsuj_strone <- function(strona_int) {
  # Text content of every node on the page matching a CSS selector
  node_text <- function(css) {
    html_text(html_nodes(strona_int, css))
  }

  # Rank: drop the thousands separator before converting to numeric
  rank_data <- as.numeric(gsub(",", "", node_text('.text-primary')))

  # Link and title both come from the anchors in the item header
  header_links <- html_nodes(strona_int, '.lister-item-header a')
  link <- url_absolute(html_attr(header_links, 'href'), "https://www.imdb.com")
  title_data <- html_text(header_links)

  # Release year: keep only the digits, stored as a factor
  year <- as.factor(gsub("\\D", "", node_text('.lister-item-year')))

  # Description: strip newlines, then leading whitespace
  description_data <- trimws(
    gsub("\n", "", node_text('.ratings-bar+ .text-muted')),
    "l"
  )

  # Runtime: remove the " min" suffix and convert to numeric
  runtime_data <- as.numeric(gsub(" min", "", node_text('.text-muted .runtime')))

  data.frame(
    Rank = rank_data,
    Title = title_data,
    Release.Year = year,
    Link = link,
    Description = description_data,
    Runtime = runtime_data
  )
}

现在，将该函数应用于每个抓取到的页面：

# Parse every downloaded page; the result has one element per page
pages_parsed <- lapply(X = pages, FUN = parsuj_strone)

最后将它们放到数据框中：

# Stack the per-page results row-wise, folding rbind over the list
pages_df <- Reduce(f = rbind, x = pages_parsed)

Reduce 不会介意 NULL。Powodzenia（波兰语：祝你好运）！

编辑：好，所以问题出在parsuj_strone()函数中。首先，用以下命令替换该函数的最后一行：

# Return the raw vectors as a named list: unlike data.frame(), list() accepts
# fields of unequal length, so the mismatch can be inspected afterwards
return(list(
  Rank = rank_data,
  Title = title_data,
  Release.Year = year,
  Link = link,
  Description = description_data,
  Runtime = runtime_data
))

运行

# Parse every downloaded page again with the current parsuj_strone()
pages_parsed <- lapply(X = pages, FUN = parsuj_strone)

然后，确定 5 个页面中哪些返回了有问题的条目：

# Number of values extracted for each field of each parsed page
sapply(pages_parsed, lengths)

这应该得到一个 6 x 5 的矩阵（每行对应一个字段，每列对应一个页面）。最后，挑出只有 249 个条目的那个元素，看看它是什么样子。即使不了解解析器，这至少也能提示问题可能出在哪里。

Answer 2

在 Stack Overflow 上几乎什么都能找到，只需要知道如何搜索答案。这是我这个问题的答案所在的链接：Scraping with rvest: how to fill blank numbers in a row to transform in a data frame?

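简而言之：应该使用 html_node（不带 s）而不是 html_nodes。html_node 会为输入节点集中的每个元素恰好返回一个结果，找不到匹配节点时返回缺失节点（html_text 之后变为 NA），因此只要输入是"每部电影一个节点"的节点集，结果的长度就会与电影数量一致。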

# Runtime, read with html_node() instead of html_nodes().
# NOTE(review): szczegoly_filmu is not defined in this snippet; presumably it
# is the node set holding one container per film — confirm. html_node() yields
# one result per input element, and html_text() gives NA where nothing matched.
runtime_data <- szczegoly_filmu %>%
  html_node('.text-muted .runtime') %>%
  html_text()
# Drop the " min" suffix and convert to numeric
runtime_data <- as.numeric(gsub(" min", "", runtime_data))

R scraper-如何查找缺少数据的条目

2 个答案: