我想在R中从SEC(美国证券交易委员会)的备案文件下载数据。下面的代码实现了这一点,它会创建一个包含13F数据的数据框。
#einhorn_13F_2016.R
# Holdings of D. Einhorn's hedge fund (Greenlight Capital)
# Metadata / Background Info
#https://www.sec.gov/Archives/edgar/data/1079114/000107911416000025/xslForm13F_X01/primary_doc.xml
library(ggplot2)
library(rvest)
library(stringi)
library(purrr)
library(tidyr)
library(dplyr)
# data ----
# Read the rendered 13F information table of the filing from SEC EDGAR.
html_url <- "https://www.sec.gov/Archives/edgar/data/1079114/000107911416000025/xslForm13F_X01/Greenlight_13FXML_06302016.xml"
html_dat <- read_html(html_url)
# Find the right table in the HTML DOM: the holdings are the 4th table.
html_dat <- html_table(html_dat, header = TRUE, fill = TRUE)[[4]]
glimpse(html_dat)

# Parse the messed-up table header: the column titles are spread over the
# first two rows of the parsed table, so paste them together pairwise.
einhorn_col <- map2_chr(html_dat[1, ], html_dat[2, ], paste)
einhorn <- html_dat
colnames(einhorn) <- make.names(
  stri_trim(stri_trans_tolower(unname(einhorn_col)))
)
# Drop the two header rows. A negative index stays correct for tables with
# fewer than three rows, whereas 3:nrow(x) would count backwards.
einhorn <- einhorn[-seq_len(2), , drop = FALSE]

# There are 2 important numeric columns; strip the thousands separators.
# `[[` always yields the column vector. `x[, "col"]` yields a one-column
# tibble when html_table() returns tibbles, and gsub() would then collapse
# the whole column into a single deparsed string.
for (num_col in c("value..x.1000.", "shrs.or.prn.amt")) {
  einhorn[[num_col]] <- as.numeric(gsub(",", "", einhorn[[num_col]]))
}

# Most important holdings by value (printed, not stored).
einhorn %>%
  group_by(name.of.issuer) %>%
  summarise(
    sum_value = sum(value..x.1000.),
    sum_shares = sum(shrs.or.prn.amt)
  ) %>%
  arrange(desc(sum_value))

# Show some company names; never ask for more names than exist.
companies <- unique(einhorn$name.of.issuer)
sample(companies, min(6, length(companies)))
现在我想扩充数据框。
colnames(einhorn)
[1] "name.of.issuer" "title.of.class" "cusip"
[4] "value..x.1000." "shrs.or.prn.amt" "sh..prn"
[7] "put..call" "investment.discretion" "other.manager"
[10] "voting.authority.sole" "voting.authority.shared" "voting.authority.none"
从第1栏“发行人名称”开始,我想找到市场类别,居住国等。
我希望得到类似于调用 finreportr::CompanyInfo("GOOG") 所返回的输出:
company CIK SIC state state.inc FY.end street.address city.state
1 GOOGLE INC. 0001288776 7370 CA DE 1231 1600 AMPHITHEATRE PARKWAY MOUNTAIN VIEW CA 94043
但是当我从“发行人名称”列输入值时,我不知道从哪里获取此数据。
sample(companies, 6)
[1] "TAKE-TWO INTERACTIVE SOFTWAR" "TERRAFORM PWR INC"
[3] "APPLE INC" "VOYA FINL INC"
[5] "AERCAP HOLDINGS NV" "PERRIGO CO PLC"
但该函数不适用于上述这些值(因为它们不是真正的股票代码),例如:
finreportr::CompanyInfo("TERRAFORM PWR INC")
结果:
Error in open.connection(x, "rb") : HTTP error 400.
Calls: <Anonymous> -> <Anonymous> -> read_html.default
是否有可用于获取此数据的Web服务,API端点或R包?
答案 0 :(得分:0)
回答我自己的问题:
我使用Google Knowledge Graph Search API,根据这些格式怪异且带有缩写的名称字符串来查找公司详细信息。它在大多数情况下都有效。
从代码中省略了API密钥处理/赋值。
(……此处先接上问题中的那段代码……)
# Distinct issuer names from the filing, plus a random handful of six.
companies <- unique(einhorn[["name.of.issuer"]])
# samp <- data.frame(company = sample(companies, 6), stringsAsFactors = FALSE)
samp <- sample(x = companies, size = 6)
#' Build the request URL for a Knowledge Graph entity search.
#'
#' @param query Free-text search string, e.g. a company name.
#' @param apikey API key inserted into the first `%s` slot of the template.
#' @param templatestr `sprintf()` template with two `%s` slots: the key
#'   first, then the URL-encoded query.
#' @return The request URL as a character string.
kgapi_call_str <- function(query,
                           apikey,
                           templatestr = "https://kgsearch.googleapis.com/v1/entities:search?key=%s&limit=1&indent=True&query=%s") {
  # reserved = TRUE also escapes "&", "=", "+", "#" and friends, so a name
  # such as "AT&T INC" stays one query value instead of being split into
  # several URL parameters.
  sprintf(fmt = templatestr, apikey, URLencode(query, reserved = TRUE))
}
#' Run one request and return its `itemListElement` as a flat data frame.
#'
#' @param api_call_str Request URL handed to `jsonlite::fromJSON()`.
#' @param extracolumn Value stored in the added `name.of.issuer` column.
#' @return The flattened data frame with syntactic column names and a
#'   `name.of.issuer` column, or `NULL` (invisibly) when the response holds
#'   no data-frame `itemListElement`.
kg_api_call <- function(api_call_str, extracolumn = NA) {
  response <- jsonlite::fromJSON(api_call_str)
  hits <- response$itemListElement
  # Nothing tabular came back - presumably the search matched nothing.
  if (!is.data.frame(hits)) {
    return(invisible(NULL))
  }
  flat <- jsonlite::flatten(hits)
  names(flat) <- make.names(names(flat))
  flat$name.of.issuer <- extracolumn
  flat
}
#' Query the API for a name, retrying with progressively shortened names.
#'
#' The request in `api_call_str` is tried first. If it yields nothing, the
#' name is retried with its last word removed, then with its last two words
#' removed.
#'
#' @param api_call_str Request URL for the first attempt.
#' @param extracolumn The looked-up name. It is stored in the result's
#'   `name.of.issuer` column and is the basis for the shortened retries.
#' @param apikey API key used to build the retry URLs. When `NULL`, the
#'   variable `apikey` is looked up in the enclosing scope, which is how the
#'   key was obtained before this parameter existed.
#' @return The data frame from the first successful attempt, or `NULL` (with
#'   a warning) when every attempt fails.
kgapi_call_data <- function(api_call_str, extracolumn = NA, apikey = NULL) {
  # First attempt: the request exactly as supplied by the caller.
  json <- kg_api_call(api_call_str, extracolumn)
  if (!is.null(json)) {
    return(json)
  }

  # Retry candidates: the name minus its last word, then minus its last two.
  shortened <- gsub("\\s+\\w+$", "", extracolumn, perl = TRUE)
  shortened.2 <- gsub("\\s+\\w+$", "", shortened, perl = TRUE)
  candidates <- unique(c(shortened, shortened.2))
  # Skip candidates that are missing, empty, or not actually shorter than the
  # original name: they would either fail outright or repeat an earlier query.
  candidates <- candidates[
    !is.na(candidates) & nzchar(candidates) & candidates != extracolumn
  ]

  if (length(candidates) > 0L && is.null(apikey)) {
    # Backward compatibility: fall back to an `apikey` in the enclosing scope.
    apikey <- get("apikey", envir = parent.env(environment()))
  }

  attempt_labels <- c("2nd", "3rd")
  for (i in seq_along(candidates)) {
    message(sprintf(
      "cannot resolve - %s try:\n%s\n%s\n\n",
      attempt_labels[i], extracolumn, candidates[i]
    ))
    retry_url <- kgapi_call_str(query = candidates[i], apikey = apikey)
    json <- kg_api_call(retry_url, extracolumn)
    if (!is.null(json)) {
      return(json)
    }
  }

  # Every attempt failed: warn, and return NULL explicitly so the warning
  # text never leaks out as the function's value.
  warning(
    sprintf("cannot resolve: \n%s\n%s\n\n", extracolumn, shortened),
    call. = FALSE
  )
  NULL
}

#' Look up one name, building the initial request URL from the name itself.
#'
#' @param lookup_str Name to search for.
#' @param apikey API key, used for the initial request and for any retries.
#' @return The result of `kgapi_call_data()`: a data frame or `NULL`.
kgapi_lookup <- function(lookup_str, apikey) {
  kgapi_call_data(
    api_call_str = kgapi_call_str(query = lookup_str, apikey = apikey),
    extracolumn = lookup_str,
    apikey = apikey
  )
}
# kgapi_call_str("GENERAL MTRS CO", apikey)
# Look up every distinct issuer name. Only data-frame results are bound
# together, so a failed lookup (NULL or any other non-tabular value) cannot
# make bind_rows() fail.
lookup_results <- lapply(companies, kgapi_lookup, apikey)
companies.metadata.3 <- bind_rows(Filter(is.data.frame, lookup_results))
# Flatten each entry of result..type and sort it in decreasing order.
companies.metadata.4 <- companies.metadata.3 %>%
  mutate(
    result..type = map(
      result..type,
      function(types) sort(unlist(types), decreasing = TRUE)
    )
  )
# Attach the looked-up metadata to the holdings by issuer name.
einhorn <- einhorn %>%
  left_join(companies.metadata.4, by = "name.of.issuer")
下一步我打算尝试使用SEC 13F表格中提供的CUSIP标识符来查询,但据我所知(AFAIK),这类服务不是免费的。