为什么lapply无法遍历所有URL?

时间:2019-04-19 05:59:16

标签: r rvest

我正在尝试删除SEC中多家公司的子公司,但似乎只停留在前几家公司。

library(rvest)
library(stringr)
library(dplyr)
library(lubridate)
library(readr)
library(stringi)

tickerSymbols <- c("MSFT","PYPL","QCOM","AAPL","MSFT","AMZN","GOOGL","FB","INTC","CSCO") #Sample Companies

pages <- sapply(tickerSymbols,function(Sym)
{
  tryCatch( #Catch errors like no EXHIBIT 21
    {

      Overview <- sub("_SUB_",Sym,"https://www.sec.gov/cgi-bin/browse-edgar?CIK=_SUB_&owner=exclude&action=getcompany") #Go to the overview-page of the company
      Sess <- html_session(Overview) #open html-session
      Search <- html_form(Sess)[[1]] #get search form
      Search_new <- set_values(Search,type = "10-K") #search for annual reports
      Sess <- Sess %>% 
        jump_to(submit_form(Sess,Search_new)$response$url) #submit search

      name <- Sess %>% html_node(xpath="/html/body/div[4]/div[1]/div[3]/span") %>% html_text() %>%
        str_extract(".+? (?=CIK#)")

      LinkN <- (Sess %>%
                  html_node(xpath="/html/body/div[4]/div[4]/table") %>%
                  html_table %>%  
                  { which(.$Filings == "10-K")})[1] #get index of first complete report
      Sess <- Sess %>% 
        follow_link(xpath=paste0("(//*[@id=\"documentsbutton\"])[",LinkN,"]")) #follow link to this report
      File <- (Sess %>% 
                 html_node(xpath="/html/body/div[4]/div[3]/div/table") %>%
                 html_table %>%
                 filter(grepl("EX-21",Type)))$Document #Get filename of EXHIBIT21 part
      Link <- stringi::stri_reverse(
        str_replace(stringi::stri_reverse(Sess$url),"^.+?/",stringi::stri_reverse(paste0("/",File)))) #modify Link
      return(c(name,Link))
    },error=function(e) return("Error"),warning=function(w) return("Warning"))
},simplify = F,USE.NAMES=T)

#list of tables
List.Of.Tabs <- lapply(names(pages)[1:2],function(name)
{
  webpage <- read_html(pages[[name]][2])
  tbls <- html_nodes(webpage, "table")
  tbls_ls <- html_table(tbls,fill = TRUE)
  tot <- bind_rows(tbls_ls)
  tot1 <- subset(tot,tot[,ncol(tot)] == 'Ireland' | tot[,ncol(tot)] == 'Japan')
  #write.csv(tot1,paste0(name,".csv"))
  if(nrow(tot1) == 0) return(tot1)
  else
  {
    tot1 <- tot1 %>% mutate(Company = rep(pages[[name]][1],nrow(tot1)))
    return(tot1)
  }
})

df2 <- bind_rows(List.Of.Tabs, .id = "column_label")
df3 <- data.frame(pages)

当前df2仅显示Microsoft子公司,而不显示其他子公司,例如谷歌。我尝试用sapply替换lapply,但是它似乎不起作用。

1 个答案:

答案 0 :(得分:0)

这是一个没有library(tidyverse) pages <- map(tickerSymbols, ~ { tryCatch( #Catch errors like no EXHIBIT 21 { Overview <- sub("_SUB_", .x,"https://www.sec.gov/cgi-bin/browse-edgar?CIK=_SUB_&owner=exclude&action=getcompany") #Go to the overview-page of the company Sess <- html_session(Overview) #open html-session Search <- html_form(Sess)[[1]] #get search form Search_new <- set_values(Search,type = "10-K") #search for annual reports Sess <- Sess %>% jump_to(submit_form(Sess,Search_new)$response$url) #submit search name <- Sess %>% html_node(xpath="/html/body/div[4]/div[1]/div[3]/span") %>% html_text() %>% str_extract(".+? (?=CIK#)") LinkN <- (Sess %>% html_node(xpath="/html/body/div[4]/div[4]/table") %>% html_table %>% { which(.$Filings == "10-K")})[1] #get index of first complete report Sess <- Sess %>% follow_link(xpath=paste0("(//*[@id=\"documentsbutton\"])[",LinkN,"]")) #follow link to this report File <- (Sess %>% html_node(xpath="/html/body/div[4]/div[3]/div/table") %>% html_table %>% filter(grepl("EX-21",Type)))$Document #Get filename of EXHIBIT21 part Link <- stringi::stri_reverse( str_replace(stringi::stri_reverse(Sess$url),"^.+?/",stringi::stri_reverse(paste0("/",File)))) #modify Link return(c(name,Link)) },error=function(e) return("Error"),warning=function(w) return("Warning")) },simplify = F,USE.NAMES=T) List.Of.Tabs <- map(pages, ~ { nm1 <- .x[1] link <- .x[2] webpage <- read_html(link) tbls <- html_nodes(webpage, "table") tbls_ls <- html_table(tbls,fill = TRUE) pos1 <- possibly(function(tbls) bind_rows(tbls) #%>% #filter_at(ncol(.), any_vars(. %in% c("Ireland", "Japan"))) , otherwise = NA) pos1(tbls_ls) })

的选项
{{1}}