在解析SEC 13F文件后,从奇怪的字符串中查找公司元数据

时间:2016-08-28 22:20:44

标签: r web-scraping finance

我想从R中的SEC文件中下载数据。下面的代码执行此操作。它创建一个包含13F数据的数据框。

#einhorn_13F_2016.R
# Holdings of D. Einhorns Hedge Fund
# Metadata / Background Info
#https://www.sec.gov/Archives/edgar/data/1079114/000107911416000025/xslForm13F_X01/primary_doc.xml
library(ggplot2)
library(rvest)
library(stringi)
library(purrr)
library(tidyr)
library(dplyr)

# data
# read in the HTML page that holds the 13F information table:
html_url <- "https://www.sec.gov/Archives/edgar/data/1079114/000107911416000025/xslForm13F_X01/Greenlight_13FXML_06302016.xml"
html_dat <- read_html(html_url)

# the holdings are the 4th table found in the HTML DOM
html_dat <- html_table(html_dat, header = TRUE, fill=TRUE)[[4]]
glimpse(html_dat)

# parse messed-up table header:
# the real column titles are spread over the first two rows, so paste the two
# rows together column by column and turn the result into syntactic,
# lower-case column names
einhorn_col <- map2_chr(html_dat[1,],html_dat[2,], paste)
einhorn <- html_dat
colnames(einhorn) <- make.names(stri_trim(stri_trans_tolower(unname(einhorn_col))))
# drop the two header rows; a negative index (unlike 3:nrow(einhorn)) cannot
# count backwards when the table has fewer than three rows
einhorn <- einhorn[-(1:2), ]

# there are 2 important numeric columns
# `[[` always extracts a plain vector, whether html_table() returned a
# data.frame or a tibble; `[, "col"]` on a tibble would hand gsub() a
# one-column tibble and the conversion would yield NA
einhorn[["value..x.1000."]] <- as.numeric(gsub(",", "", einhorn[["value..x.1000."]]))
einhorn[["shrs.or.prn.amt"]] <- as.numeric(gsub(",", "", einhorn[["shrs.or.prn.amt"]]))

# most important holdings by value
einhorn %>%
        group_by(name.of.issuer) %>%
        summarise(sum_value=sum(value..x.1000.),sum_shares=sum(shrs.or.prn.amt)) %>%
        arrange(desc(sum_value))

# show some company names
companies <- unique(einhorn$name.of.issuer)
sample(companies, 6)

现在我想扩充数据框。

colnames(einhorn)
 [1] "name.of.issuer"          "title.of.class"          "cusip"                  
 [4] "value..x.1000."          "shrs.or.prn.amt"         "sh..prn"                
 [7] "put..call"               "investment.discretion"   "other.manager"          
[10] "voting.authority.sole"   "voting.authority.shared" "voting.authority.none"

从第1栏“发行人名称”开始,我想找到市场类别,居住国等。

我希望得到类似于调用 finreportr::CompanyInfo("GOOG") 所返回的输出:

  company        CIK  SIC state state.inc FY.end            street.address             city.state
  1 GOOGLE INC. 0001288776 7370    CA        DE   1231 1600 AMPHITHEATRE PARKWAY MOUNTAIN VIEW CA 94043

但是当我从“发行人名称”列输入值时,我不知道从哪里获取此数据。

sample(companies, 6)
[1] "TAKE-TWO INTERACTIVE SOFTWAR" "TERRAFORM PWR INC"           
[3] "APPLE INC"                    "VOYA FINL INC"               
[5] "AERCAP HOLDINGS NV"           "PERRIGO CO PLC"

但把上述任何一个值传给该函数都行不通(因为它们并不是真正的股票代码):

finreportr::CompanyInfo("TERRAFORM PWR INC")

结果:

Error in open.connection(x, "rb") : HTTP error 400.
Calls: <Anonymous> -> <Anonymous> -> read_html.default

是否有可用于获取此数据的Web服务,API端点或R包?

1 个答案:

答案 0 :(得分:0)

回答我自己的问题:

我使用Google Knowledge Graph Search API从奇怪的格式化和缩写字符串中查找公司详细信息。它适用于大多数情况。

从代码中省略了API密钥处理/赋值。

(……此处先放上问题中的那段代码……)

# collect the distinct issuer names and draw six of them at random
companies <- unique(einhorn[["name.of.issuer"]])
#samp <- data.frame(company=sample(companies, 6), stringsAsFactors = FALSE)
samp <- sample(companies, size = 6)


# Build the request URL for the Google Knowledge Graph Search API.
#
# query       - free-text search string (e.g. an issuer name)
# apikey      - API key, inserted into the first %s slot of the template
# templatestr - sprintf() template with two %s slots: the API key first,
#               then the percent-encoded query
#
# Returns the request URL as a character string.
kgapi_call_str <- function(query,
                           apikey,
                           templatestr="https://kgsearch.googleapis.com/v1/entities:search?key=%s&limit=1&indent=True&query=%s"){
        # reserved = TRUE also escapes characters such as "&", "=", "+" and "#".
        # Without it a name like "AT&T INC" would end the query parameter at the
        # "&" and the remainder would be read as a separate URL parameter.
        sprintf(fmt = templatestr, apikey, URLencode(query, reserved = TRUE))
}


# Run one Knowledge Graph request and return its hits as a flat data frame.
#
# api_call_str - value handed to jsonlite::fromJSON() (the request URL)
# extracolumn  - value stored in the `name.of.issuer` column of the result
#
# Returns the flattened `itemListElement` data frame with syntactic column
# names, or NULL (invisibly) when `itemListElement` is not a data frame.
kg_api_call <- function(api_call_str, extracolumn=NA){
        response <- jsonlite::fromJSON(api_call_str)
        if (!is.data.frame(response$itemListElement)) {
                # nothing usable came back
                return(invisible(NULL))
        }
        hits <- jsonlite::flatten(response$itemListElement)
        names(hits) <- make.names(names(hits))
        hits$name.of.issuer <- extracolumn
        hits
}


# Look up one issuer name, retrying with progressively shorter names.
#
# api_call_str - request URL for the first attempt
# extracolumn  - the issuer name the first request was built from; it is
#                passed on to kg_api_call() and is the basis for the retries
# apikey       - API key used for the retry requests. NULL (the default)
#                keeps the previous behaviour of reading a variable named
#                `apikey` from the environment enclosing this function.
#
# Returns the data frame from kg_api_call() for the first attempt that
# succeeds, or NULL (after a warning) when every attempt fails.
kgapi_call_data <- function(api_call_str, extracolumn=NA, apikey=NULL){
        json <- kg_api_call(api_call_str, extracolumn)
        if (!is.null(json)) {
                return(json)
        }

        # Retry candidates: the name without its last word, then without its
        # last two words.
        shortened <- gsub('\\s+\\w+$', '', extracolumn, perl=TRUE)
        shortened.2 <- gsub('\\s+\\w+$', '', shortened, perl=TRUE)
        candidates <- unique(c(shortened, shortened.2))
        # Skip NA/empty candidates and candidates identical to the name that
        # has just failed (single-word names cannot be shortened any further).
        usable <- !is.na(candidates) & nzchar(candidates) & candidates != extracolumn
        candidates <- candidates[usable]

        if (length(candidates) > 0 && is.null(apikey)) {
                # same lookup the free variable `apikey` used to get
                apikey <- get("apikey", envir = parent.env(environment()))
        }

        attempt_labels <- c("2nd", "3rd")
        for (i in seq_along(candidates)) {
                message(sprintf("cannot resolve - %s try:\n%s\n%s\n\n",
                                attempt_labels[i], extracolumn, candidates[i]))
                json <- kg_api_call(kgapi_call_str(query=candidates[i], apikey=apikey),
                                    extracolumn)
                if (!is.null(json)) {
                        return(json)
                }
        }

        # Every attempt failed: warn and return NULL (not the warning text) so
        # that bind_rows() over many lookups simply skips this name.
        warning(sprintf("cannot resolve: \n%s\n%s\n\n", extracolumn, shortened))
        NULL
}

# Resolve a single lookup string with the given API key.
#
# lookup_str - issuer name to search for
# apikey     - API key, used for the first request and for any retries
#
# Returns whatever kgapi_call_data() returns: a data frame, or NULL.
kgapi_lookup <- function(lookup_str, apikey) {
        kgapi_call_data(api_call_str=kgapi_call_str(query=lookup_str, apikey=apikey),
                        extracolumn = lookup_str,
                        apikey = apikey)
}

#kgapi_call_str("GENERAL MTRS CO", apikey)

# query every distinct issuer name and stack the per-company results
companies.metadata.3 <- companies %>%
        lapply(kgapi_lookup, apikey) %>%
        bind_rows()

# flatten each entry of the type list-column and sort it in decreasing order
companies.metadata.4 <- mutate(
        companies.metadata.3,
        result..type = map(result..type, function(types) sort(unlist(types), decreasing=TRUE))
)

# attach the looked-up metadata to the holdings via the issuer name
einhorn <- left_join(einhorn, companies.metadata.4, by="name.of.issuer")

下一步我打算尝试使用SEC 13F表格中提供的CUSIP标识符进行查询,但据我所知,相关服务并不是免费的。