Question

下面的代码用于从 IMDB 的多个页面抓取数据。但是，当我尝试把这些数据合并到一个数据框（data frame）中时，会报错，提示 Gross 和 Metascore 这两列的行数与其他列不一致。我想知道如何在这些缺失的位置插入 NA 值，使各个向量的长度相等？（请注意，我不得不删掉一些链接，因为需要一定的声望值（reputation）才能发布更多链接。）

# Result pages of the IMDB feature-film search (release dates 2010-2017).
# The first page carries no `start` parameter; every following page moves the
# offset forward by 50 titles (51, 101, ..., 551).
# NOTE: the original listing ended with a truncated, unterminated entry
# ("https://www.imdb.com/search/title?") because the remaining links had been
# removed from the post. That fragment cannot be reconstructed, so it is left
# out here to keep the vector syntactically valid.
base_url <- "https://www.imdb.com/search/title?title_type=feature&release_date=2010-01-01,2017-12-31"
urls <- c(
  base_url,
  paste0(base_url, "&start=", seq(51L, 551L, by = 50L), "&ref_=adv_nxt")
)


    # Collects one data frame per result page; they are stacked after the loop.
    results_list <- list()

    for (page_idx in seq_along(urls)) {
      webpage <- read_html(urls[[page_idx]])

      # Each field is selected from the whole page independently, so the
      # resulting vectors are only as long as the number of nodes that matched.
      title <- html_text(html_nodes(webpage, ".lister-item-header a"))

      # Runtime text with the " min" suffix removed.
      runtime <- gsub(
        " min", "",
        html_text(html_nodes(webpage, ".text-muted .runtime"))
      )

      rating <- html_text(html_nodes(webpage, ".ratings-imdb-rating strong"))

      # Vote counts with the thousands separators (commas) removed.
      votes <- gsub(
        ",", "",
        html_text(html_nodes(webpage, ".sort-num_votes-visible span:nth-child(2)"))
      )

      # Metascore text with spaces removed.
      metascore <- gsub(" ", "", html_text(html_nodes(webpage, ".metascore")))

      # Gross: drop every "M", then keep characters 2 to 6, which skips the
      # first character (the original comment says this removes the "$" sign).
      gross <- substring(
        gsub(
          "M", "",
          html_text(html_nodes(webpage, ".ghost~ .text-muted+ span"))
        ),
        2, 6
      )

      # data.frame() needs all columns to have the same length; when the
      # selectors above match different numbers of nodes this call stops with
      # "arguments imply differing number of rows".
      results_list[[page_idx]] <- data.frame(
        Title = title,
        Runtime = as.numeric(runtime),
        Rating = as.numeric(rating),
        Metascore = as.numeric(metascore),
        Votes = as.numeric(votes),
        Gross_Earning_in_Mil = as.numeric(gross)
      )
    }

    # Stack the per-page data frames into a single data frame.
    final_results <- plyr::ldply(results_list)

    Error in data.frame(Title = title, Runtime = as.numeric(runtime), Rating = as.numeric(rating),  : 
      arguments imply differing number of rows: 50, 49, 48

Answer 1

您需要知道数据缺失在哪里，因此需要知道哪些值属于同一个条目（即同一部电影）。目前您只有各自独立的值向量，所以无法判断哪些值属于同一个条目。

查看页面可以发现，各个条目被整齐地组织在 “lister-item-content” 节点中。因此，比较干净的做法是先提取这些节点，然后再分别从每个节点中提取所需的信息。下面这样的代码对我有效：

# One node per list entry, so every extracted value stays tied to its entry.
items <- html_nodes(webpage, ".lister-item-content")

# Look up the gross-earnings node inside each entry separately. html_node()
# (singular) is queried once per entry, so the result has one element per
# entry. vapply() with character(1) keeps the result a character vector even
# when `items` is empty, whereas sapply() would return an empty list then.
gross <- vapply(
  items,
  function(item) html_text(html_node(item, ".ghost~ .text-muted+ span")),
  character(1)
)

对于不包含您所查找节点的条目，它会在相应位置填入 NA。

从不均匀的列表创建数据框

1 个答案: