如何使用R提取指向Edgar SEC文件中附件21的链接?

时间:2019-04-17 12:00:10

标签: r rvest readr

我需要帮助:如何自动为所有公司提取附件21(Exhibit 21)中的信息?这些公司的文件可以使用其股票代码通过 https://www.sec.gov/edgar/searchedgar/companysearch.html 查到。

下面是我想对每家公司的Exhibit 21重复执行的代码,但这样做意味着必须手动为每家公司粘贴链接。

library(rvest)
library(stringr)
library(dplyr)
library(lubridate)
library(readr)

# Download one hard-coded Exhibit 21 page from EDGAR.
webpage <- "https://www.sec.gov/Archives/edgar/data/6951/000000695118000041/ex21_amatq42018.htm" %>%
  read_html()

# Collect every <table> node on the page and convert each one to a data frame.
tbls <- webpage %>% html_nodes("table")
tbls_ls <- tbls %>% html_table(fill = TRUE)

# Stack all parsed tables into a single data frame.
tot <- tbls_ls %>% bind_rows()

# Keep the rows whose last column is "India" or whose first column is "China".
tot1 <- subset(
  tot,
  tot[ncol(tot)] == "India" | tot[, 1] == "China"
)

# Write the filtered rows to disk.
write_csv(tot1, "coy1.csv")

1 个答案:

答案 0 :(得分:0)

这段代码在我这里对Google、Facebook和Apple都有效。请您自己试一试,看它是否对您有用。该函数返回指向EXHIBIT 21 html页面的URL。

library(rvest)
library(stringr)
library(dplyr)
library(lubridate)
library(readr)
library(stringi)

tickerSymbols <- c("AAPL", "GOOG", "FB") # Sample companies

# Resolve, for every ticker symbol, the URL of the Exhibit 21 document that
# belongs to the first 10-K row of the company's EDGAR filing list.
#
# Returns a character vector named by ticker symbol. Each element is either the
# Exhibit 21 URL or the sentinel string "Error" when any step failed for that
# ticker (the cause is reported through message()).
#
# Warnings are deliberately NOT turned into a result any more: a `warning =`
# handler in tryCatch() abandons the whole lookup on the first warning, and
# html_session(), set_values(), submit_form(), jump_to() and follow_link() are
# documented as deprecated in rvest 1.0.0, so a mere deprecation notice would
# have replaced every link with "Warning".
#
# NOTE(review): the absolute XPaths below depend on the exact EDGAR page
# layout -- verify them against the current pages.
pages <- vapply(tickerSymbols, function(Sym) {
  tryCatch(
    {
      # Company overview page; the ticker is passed as the CIK query parameter.
      Overview <- paste0(
        "https://www.sec.gov/cgi-bin/browse-edgar?CIK=", Sym,
        "&owner=exclude&action=getcompany"
      )
      Sess <- html_session(Overview)

      # Restrict the filing list to annual reports via the page's first form.
      Search <- html_form(Sess)[[1]]
      Search_new <- set_values(Search, type = "10-K")
      Sess <- jump_to(Sess, submit_form(Sess, Search_new)$response$url)

      # Index of the first row that is exactly a 10-K (not e.g. an amendment).
      FilingTab <- Sess %>%
        html_node(xpath = "/html/body/div[4]/div[4]/table") %>%
        html_table()
      LinkN <- which(FilingTab[["Filings"]] == "10-K")[1]
      if (is.na(LinkN)) {
        stop("no 10-K filing listed", call. = FALSE)
      }

      # Open the document index of that filing (the LinkN-th "Documents" button).
      Sess <- follow_link(
        Sess,
        xpath = sprintf('(//*[@id="documentsbutton"])[%d]', LinkN)
      )

      # File name of the Exhibit 21 part, taken from the filing's document table.
      DocTab <- Sess %>%
        html_node(xpath = "/html/body/div[4]/div[3]/div/table") %>%
        html_table()
      File <- as.character(
        DocTab[["Document"]][grepl("EXHIBIT 21", DocTab[["Description"]])]
      )
      if (length(File) == 0L) {
        # Without this check paste0() would silently build a URL ending in "/".
        stop("no EXHIBIT 21 document in the filing", call. = FALSE)
      }

      # Swap the last path segment of the index URL for the exhibit file name;
      # if several rows match, the first one is used.
      paste0(sub("[^/]*$", "", Sess$url), File[1])
    },
    error = function(e) {
      message("Exhibit 21 lookup failed for ", Sym, ": ", conditionMessage(e))
      "Error"
    }
  )
}, FUN.VALUE = character(1), USE.NAMES = TRUE)

# List of tables: for every ticker in `pages`, download the Exhibit 21 page,
# stack all of its HTML tables, keep the rows whose last column is "India" or
# whose second column is "China", and write them to "<ticker>.csv".
#
# Returns a list named by ticker symbol. Each element is the filtered data
# frame, or NULL when the ticker has no usable URL (e.g. the "Error"/"Warning"
# sentinels) or when downloading/parsing failed. A failure for one ticker no
# longer aborts the remaining ones.
List.Of.Tabs <- lapply(names(pages), function(name) {
  url <- pages[[name]]
  if (length(url) != 1L || is.na(url) || !grepl("^https?://", url)) {
    message("Skipping ", name, ": no usable Exhibit 21 URL")
    return(NULL)
  }

  tryCatch(
    {
      webpage <- read_html(url)
      tbls <- html_nodes(webpage, "table")
      tbls_ls <- html_table(tbls, fill = TRUE)
      tot <- bind_rows(tbls_ls)
      if (ncol(tot) < 2L) {
        stop("page has no table with at least two columns", call. = FALSE)
      }

      # `[[` yields plain vectors for data frames and tibbles alike, so the
      # mask is an ordinary logical vector; which() drops NA comparisons, the
      # same way subset() does.
      keep <- tot[[ncol(tot)]] == "India" | tot[[2]] == "China"
      tot1 <- tot[which(keep), , drop = FALSE]

      write.csv(tot1, paste0(name, ".csv"))
      tot1
    },
    error = function(e) {
      message("Could not process ", name, ": ", conditionMessage(e))
      NULL
    }
  )
})
names(List.Of.Tabs) <- names(pages)