如何将抓取的输出文本放入带有列的表中
library(rvest)
# data.table is loaded once up front; library() fails loudly if the package
# is missing, instead of returning FALSE on every iteration like require().
library(data.table)

base_url <- c(
  "https://www.sec.gov/Archives/edgar/data/1409916/000162828017002570/exhibit211nobilishealthcor.htm",
  "https://www.sec.gov/Archives/edgar/data/1320695/000156459018002405/ths-ex211_71.htm"
)

# For each URL, download the page and collect the text of every node matched
# by the CSS selector "text" into a one-column data.table (column `draft1`).
# `df` ends up as a list holding one data.table per URL.
df <- lapply(base_url, function(u) {
  html_obj <- read_html(u)
  temp <- html_nodes(html_obj, "text")
  data.table(draft1 = html_text(temp))
})
希望输出为带有如下列名的表格，如下所示：
Sl Subsidiary Region
1. Bay Valley Foods, LLC Delaware limited liability company
2. Sturm Foods Wisconsin corporation
3. S.T. Specialty Foods Minnesota corporation
答案 0 :(得分:0)
我使用了基于rvest的解决方案:
抓取第一个网址的数据:
base_url <- c(
  "https://www.sec.gov/Archives/edgar/data/1409916/000162828017002570/exhibit211nobilishealthcor.htm",
  "https://www.sec.gov/Archives/edgar/data/1320695/000156459018002405/ths-ex211_71.htm"
)

#SCRAPE FIRST URL
u <- base_url[1]
html_obj <- read_html(u)

# The selector must be a single-line literal: a line break inside the
# style="..." value becomes part of the attribute value being matched, and the
# selector then matches nothing.
tr <- html_obj %>%
  html_nodes('div[style="line-height:120%;text-align:center;font-size:10pt;"] tr')

# Return the text of the single node matching `css` inside `node`, or NA when
# the row has no match or more than one match.
extract_single_text <- function(node, css) {
  txt <- node %>% html_nodes(css) %>% html_text()
  if (length(txt) == 1L) txt else NA_character_
}

# One value per table row, built in a single pass instead of growing vectors
# with c() inside a loop.
interest <- vapply(
  tr,
  extract_single_text,
  character(1),
  css = 'div[style="text-align:left;font-size:10pt;"] font[style="font-family:inherit;font-size:10pt;"]'
)
loc <- vapply(
  tr,
  extract_single_text,
  character(1),
  css = 'div[style="text-align:center;font-size:10pt;"] font[style="font-family:inherit;font-size:10pt;"]'
)

#GET THE RESULTS IN A DF
res1 <- data.frame(interest, loc)
然后是抓取第二个 URL 数据的脚本:
#SCRAPE 2ND URL
# Select the second URL before downloading, so the first page is not fetched
# again by mistake.
u <- base_url[2]
html_obj <- read_html(u)
text <- html_obj %>%
  html_nodes("p[style='margin-bottom:0pt;margin-top:12pt;text-indent:0%;color:#000000;font-size:10pt;font-family:Times New Roman;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;']") %>%
  html_text()

# Split each paragraph at the first ", a " / ". a " / ", an " / ". an "
# separator: the part before it goes to `interest`, the part after to `loc`.
parts <- strsplit(text, split = ", a |\\. a |, an |\\. an ")

# Take the pieces paragraph by paragraph so that every paragraph yields exactly
# one row; a paragraph without a separator gets NA in `loc` rather than
# shifting all following rows.
res2 <- data.frame(
  interest = vapply(parts, `[`, character(1), 1L),
  loc = vapply(parts, `[`, character(1), 2L)
)
希望这会对你有所帮助
Gottavianoni