Now that I have extracted the names of the subsidiaries, I would also like to add the name of the listed parent company next to each extracted subsidiary.
For example, I want to extract "APPLE INC" from https://www.sec.gov/cgi-bin/browse-edgar?CIK=aapl&owner=exclude&action=getcompany
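A minimal sketch of that lookup, assuming the name sits in the overview page's ".companyName" header span followed by "CIK#: ..." (a guess at the EDGAR page layout, not something verified here):

library(rvest)
library(stringr)

Overview <- "https://www.sec.gov/cgi-bin/browse-edgar?CIK=aapl&owner=exclude&action=getcompany"
CompanyName <- read_html(Overview) %>%
  html_node(".companyName") %>%    #assumed selector for the header span "APPLE INC CIK#: ..."
  html_text() %>%
  str_extract(".+?(?= CIK#)") %>%  #keep only the text before " CIK#"
  str_trim()
CompanyName                        #should give "APPLE INC" if the assumptions hold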
library(rvest)
library(stringr)
library(dplyr)
library(lubridate)
library(readr)
library(stringi)
tickerSymbols <- c("AAPL","GOOG","FB") #Sample Companies
pages <- sapply(tickerSymbols,function(Sym)
{
  tryCatch( #Catch errors like no EXHIBIT 21
    {
      Overview <- sub("_SUB_",Sym,"https://www.sec.gov/cgi-bin/browse-edgar?CIK=_SUB_&owner=exclude&action=getcompany") #Go to the overview-page of the company
      Sess <- html_session(Overview) #open html-session
      Search <- html_form(Sess)[[1]] #get search form
      Search_new <- set_values(Search,type = "10-K") #search for annual reports
      Sess <- Sess %>%
        jump_to(submit_form(Sess,Search_new)$response$url) #submit search
      LinkN <- (Sess %>%
                  html_node(xpath="/html/body/div[4]/div[4]/table") %>%
                  html_table %>%
                  { which(.$Filings == "10-K")})[1] #get index of first complete report
      Sess <- Sess %>%
        follow_link(xpath=paste0("(//*[@id=\"documentsbutton\"])[",LinkN,"]")) #follow link to this report
      File <- (Sess %>%
                 html_node(xpath="/html/body/div[4]/div[3]/div/table") %>%
                 html_table %>%
                 filter(grepl("EXHIBIT 21",Description)))$Document #Get filename of EXHIBIT 21 part
      Link <- stringi::stri_reverse(
        str_replace(stringi::stri_reverse(Sess$url),"^.+?/",stringi::stri_reverse(paste0("/",File)))) #modify Link
      return(Link)
    },error=function(e) return("Error"),warning=function(w) return("Warning"))
})

#list of tables
List.Of.Tabs <- lapply(names(pages),function(name)
{
  webpage <- read_html(pages[name])
  tbls <- html_nodes(webpage, "table")
  tbls_ls <- html_table(tbls,fill = TRUE)
  tot <- bind_rows(tbls_ls)
  tot1 <- subset(tot,tot[,ncol(tot)] == 'Ireland' | tot[,2] == 'Japan')
  write.csv(tot1,paste0(name,".csv"))
  return(tot1)
})
The output should look something like:
Apple Distribution International,APPLE INC
Apple Japan,Inc.,APPLE INC
Apple Operations,APPLE INC
Apple Operations Europe,APPLE INC
Apple Operations International,APPLE INC
Apple Sales International,APPLE INC
Answer 0 (score: 0)
I have modified the search part of the code so that the company name is grepped and returned there as well, so that we can use it later:
pages <- sapply(tickerSymbols,function(Sym)
{
  tryCatch( #Catch errors like no EXHIBIT 21
    {
      Overview <- sub("_SUB_",Sym,"https://www.sec.gov/cgi-bin/browse-edgar?CIK=_SUB_&owner=exclude&action=getcompany") #Go to the overview-page of the company
      Sess <- html_session(Overview) #open html-session
      Search <- html_form(Sess)[[1]] #get search form
      Search_new <- set_values(Search,type = "10-K") #search for annual reports
      Sess <- Sess %>%
        jump_to(submit_form(Sess,Search_new)$response$url) #submit search
      name <- Sess %>%
        html_node(xpath="/html/body/div[4]/div[1]/div[3]/span") %>%
        html_text() %>%
        str_extract(".+? (?=CIK#)") #grab the company name from the page header (text before "CIK#")
      LinkN <- (Sess %>%
                  html_node(xpath="/html/body/div[4]/div[4]/table") %>%
                  html_table %>%
                  { which(.$Filings == "10-K")})[1] #get index of first complete report
      Sess <- Sess %>%
        follow_link(xpath=paste0("(//*[@id=\"documentsbutton\"])[",LinkN,"]")) #follow link to this report
      File <- (Sess %>%
                 html_node(xpath="/html/body/div[4]/div[3]/div/table") %>%
                 html_table %>%
                 filter(grepl("EXHIBIT 21",Description)))$Document #Get filename of EXHIBIT 21 part
      Link <- stringi::stri_reverse(
        str_replace(stringi::stri_reverse(Sess$url),"^.+?/",stringi::stri_reverse(paste0("/",File)))) #modify Link
      return(c(name,Link)) #return both the company name and the link to the exhibit
    },error=function(e) return("Error"),warning=function(w) return("Warning"))
},simplify = F,USE.NAMES=T)

#list of tables
List.Of.Tabs <- lapply(names(pages)[1:2],function(name) #only the first two tickers here
{
  webpage <- read_html(pages[[name]][2]) #second element is the link to EXHIBIT 21
  tbls <- html_nodes(webpage, "table")
  tbls_ls <- html_table(tbls,fill = TRUE)
  tot <- bind_rows(tbls_ls)
  tot1 <- subset(tot,tot[,ncol(tot)] == 'Ireland' | tot[,ncol(tot)] == 'Japan')
  #write.csv(tot1,paste0(name,".csv"))
  if(nrow(tot1) == 0) return(tot1)
  else
  {
    tot1 <- tot1 %>% mutate(Company = rep(pages[[name]][1],nrow(tot1))) #first element is the company name
    return(tot1)
  }
})
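As a quick usage sketch (assuming each processed ticker returned a valid link rather than "Error", and that the column types line up across filings), the per-company tables can then be stacked into one data frame, with the Company column carrying the parent name, and written out in one go:

#stack the per-company tables and write a single CSV (the file name is arbitrary)
All.Tabs <- bind_rows(List.Of.Tabs)
write.csv(All.Tabs,"subsidiaries_with_parent.csv",row.names = FALSE)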