我正在对之前发表的原始帖子(reading text into R with xml problems)提出全新的问题/建议。
该帖子似乎参与其中,我知道问题可能不太清楚。在处理了几个小时之后,我重新表述了问题并简化了该过程。
我有一些数据正在尝试使用一些URL读入R-总共20个。
我创建以下定义的函数:
library(dplyr)
library(plyr)
library(purrr)
library(edgarWebR)
library(rvest)
library(devtools)
library(tidyr)
library(tidytext)
library(stringr)
library(tibble)
parse10k <- function(uri) {
# 10-K HTML files are very flat with a long list of nodes. This pulls all
# the relevant nodes.
nodes <- read_html(uri) %>%
html_nodes('text') %>%
xml_children()
nodes <- nodes[xml_name(nodes) != "hr"]
}
我应用以下内容过滤一些“无用” URL并给出一个数据框-现在为16行:
data2 <- df %>%
rename_(ID = ".id") %>%
rowwise() %>%
filter(grepl(".htm", doc.href, fixed = TRUE)) %>%
filter(!grepl(".html", doc.href, fixed = TRUE)) %>%
mutate(nodes = map(doc.href, parse10k))
接下来,我应用以下功能:
docprtsfun <- function(nodes){
doc.parts <- tibble(nid = seq(length(nodes)),
node = nodes,
text = xml_text(nodes) ) %>%
filter(text != "") # way to get columns defined properly
parts <- doc.parts %>%
filter(grepl("^part",text, ignore.case=TRUE)) %>%
select(nid,text)
# mutate(next.nid = c(nid[-1],length(nodes)+1)) %>%
if (parts$nid[1] > 1) {
parts <- bind_rows(tibble(nid = 0, text= "PART 0"), parts)
}
parts <- bind_rows(parts,
tibble(nid = doc.parts$nid[length(doc.parts$nid)] + 1,
text = "NA"))
items <- doc.parts %>%
filter(grepl("^item",text, ignore.case=TRUE)) %>%
select(nid,text) %>%
mutate(next.nid = c(nid[-1],length(nodes)+1),
part.next = parts$nid[findInterval(nid,parts$nid) + 1],
next.nid = ifelse(part.next < next.nid, part.next, next.nid),
prev.end = c(0,next.nid[-length(nid)]))
# Fill in item gaps w/ N/A
n <- 0
for(i in seq(length(items$nid))) {
j <- i + n
if(items$prev.end[j] != items$nid[j]) {
items <- items %>%
add_row(nid = items$prev.end[j], text = NA, .before = j)
n <- n + 1
}
}
doc.parts <- doc.parts %>%
mutate( part = parts$text[findInterval(nid, parts$nid)],
item = items$text[findInterval(nid, items$nid)]) %>%
select(nid,part,item,text)
return(doc.parts)
}
之后,我可以快速查看数据现在的格式。
take_a_look <- docprtsfun(data2$nodes[[16]])
但是有一个或两个结果成为列表。例如:
x <- docprtsfun(data2$nodes[[9]])
x <- docprtsfun(data2$nodes[[8]])
x <- docprtsfun(data2$nodes[[4]])
x <- docprtsfun(data2$nodes[[2]])
给出以下错误:
Error in if (parts$nid[1] > 1) { : missing value where TRUE/FALSE needed
也;
x <- docprtsfun(data2$nodes[[5]])
给出此错误
Error: Tibble columns must have consistent lengths, only values of length one are recycled:
* Length 0: Columns `node`, `text`
* Length 2: Column `nid`
Call `rlang::last_error()` to see a backtrace
我当然想“解决”这些错误,但是我想知道是否有一种方法可以放入函数中以删除这些观察结果,而不必现在处理复杂性?
数据:
df <- structure(list(.id = c("TGT", "DVN", "XRAY", "XRAY", "MSFT",
"MSFT", "DAL", "AON", "AON", "TGT", "TGT", "TIF", "XRAY", "NVDA",
"MSFT", "AON", "MSFT", "NVDA", "NVDA", "DVN"), accession_number = c("0000027419-14-000014",
"0000950134-09-003904", "0000818479-04-000031", "0000818479-99-000003",
"0001193125-11-200680", "0001193125-04-150689", "0000027904-17-000004",
"0001047469-12-001478", "0001047469-05-006608", "0001047469-10-002121",
"0001047469-98-015191", "0000950123-09-005683", "0000818479-14-000004",
"0001045810-09-000013", "0001193125-15-272806", "0001047469-13-001494",
"0000891020-95-000433", "0001045810-15-000036", "0001045810-11-000015",
"0001193125-14-076267"), act = c("34", "34", NA, NA, "34", NA,
"34", "34", "34", "34", NA, "34", "34", "34", "34", "34", NA,
"34", "34", "34"), file_number = c("001-06049", "001-32318",
"000-16211", "000-16211", "000-14278", "000-14278", "001-05424",
"001-07933", "001-07933", "001-06049", "001-06049", "001-09494",
"000-16211", "000-23985", "000-14278", "001-07933", "000-14278",
"000-23985", "000-23985", "001-32318"), filing_date = structure(c(1394751600,
1235689200, 1079305200, 922744800, 1311804000, 1093989600, 1486940400,
1330038000, 1110927600, 1268348400, 892591200, 1238364000, 1392850800,
1236898800, 1438293600, 1361487600, 811983600, 1426114800, 1300230000,
1393542000), class = c("POSIXct", "POSIXt"), tzone = ""), accepted_date = structure(c(1394751600,
1235689200, 1079305200, 922744800, 1311804000, 1093989600, 1486940400,
1330038000, 1110841200, 1268348400, 892591200, 1238364000, 1392850800,
1236898800, 1438293600, 1361487600, 811983600, 1426028400, 1300230000,
1393542000), class = c("POSIXct", "POSIXt"), tzone = ""), href = c("https://www.sec.gov/Archives/edgar/data/27419/000002741914000014/0000027419-14-000014-index.htm",
"https://www.sec.gov/Archives/edgar/data/1090012/000095013409003904/0000950134-09-003904-index.htm",
"https://www.sec.gov/Archives/edgar/data/818479/000081847904000031/0000818479-04-000031-index.htm",
"https://www.sec.gov/Archives/edgar/data/818479/0000818479-99-000003-index.html",
"https://www.sec.gov/Archives/edgar/data/789019/000119312511200680/0001193125-11-200680-index.htm",
"https://www.sec.gov/Archives/edgar/data/789019/000119312504150689/0001193125-04-150689-index.htm",
"https://www.sec.gov/Archives/edgar/data/27904/000002790417000004/0000027904-17-000004-index.htm",
"https://www.sec.gov/Archives/edgar/data/315293/000104746912001478/0001047469-12-001478-index.htm",
"https://www.sec.gov/Archives/edgar/data/315293/000104746905006608/0001047469-05-006608-index.htm",
"https://www.sec.gov/Archives/edgar/data/27419/000104746910002121/0001047469-10-002121-index.htm",
"https://www.sec.gov/Archives/edgar/data/27419/0001047469-98-015191-index.html",
"https://www.sec.gov/Archives/edgar/data/98246/000095012309005683/0000950123-09-005683-index.htm",
"https://www.sec.gov/Archives/edgar/data/818479/000081847914000004/0000818479-14-000004-index.htm",
"https://www.sec.gov/Archives/edgar/data/1045810/000104581009000013/0001045810-09-000013-index.htm",
"https://www.sec.gov/Archives/edgar/data/789019/000119312515272806/0001193125-15-272806-index.htm",
"https://www.sec.gov/Archives/edgar/data/315293/000104746913001494/0001047469-13-001494-index.htm",
"https://www.sec.gov/Archives/edgar/data/789019/0000891020-95-000433-index.html",
"https://www.sec.gov/Archives/edgar/data/1045810/000104581015000036/0001045810-15-000036-index.htm",
"https://www.sec.gov/Archives/edgar/data/1045810/000104581011000015/0001045810-11-000015-index.htm",
"https://www.sec.gov/Archives/edgar/data/1090012/000119312514076267/0001193125-14-076267-index.htm"
), type = c("10-K", "10-K", "10-K", "10-K", "10-K", "10-K", "10-K",
"10-K", "10-K", "10-K", "10-K", "10-K", "10-K", "10-K", "10-K",
"10-K", "10-K", "10-K", "10-K", "10-K"), film_number = c("14693644",
"09639574", "04670190", "99578860", "11993262", "041011640",
"17600107", "12638817", "05683013", "10676542", "98594743", "09714434",
"14630484", "09677521", "151019135", "13634337", "95575998",
"15694143", "11692266", "14653539"), form_name = c("Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]", "Annual report [Section 13 and 15(d), not S-K Item 405]",
"Annual report [Section 13 and 15(d), not S-K Item 405]"), description = c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_),
size = c("20 MB", "2 MB", "687 KB", "309 KB", "16 MB", "1 MB",
"14 MB", "22 MB", "2 MB", "6 MB", "201 KB", "1 MB", "35 MB",
"4 MB", "14 MB", "24 MB", "189 KB", "16 MB", "19 MB", "41 MB"
), doc.href = c("https://www.sec.gov/Archives/edgar/data/27419/000002741914000014/tgt-20140201x10k.htm",
"https://www.sec.gov/Archives/edgar/data/1090012/000095013409003904/d66379e10vk.htm",
"https://www.sec.gov/Archives/edgar/data/818479/000081847904000031/f102003.txt",
"https://www.sec.gov/Archives/edgar/data/818479/", "https://www.sec.gov/Archives/edgar/data/789019/000119312511200680/d10k.htm",
"https://www.sec.gov/Archives/edgar/data/789019/000119312504150689/d10k.htm",
"https://www.sec.gov/Archives/edgar/data/27904/000002790417000004/dal1231201610k.htm",
"https://www.sec.gov/Archives/edgar/data/315293/000104746912001478/a2207295z10-k.htm",
"https://www.sec.gov/Archives/edgar/data/315293/000104746905006608/a2152901z10-k.htm",
"https://www.sec.gov/Archives/edgar/data/27419/000104746910002121/a2196751z10-k.htm",
"https://www.sec.gov/Archives/edgar/data/27419/", "https://www.sec.gov/Archives/edgar/data/98246/000095012309005683/y75075e10vk.htm",
"https://www.sec.gov/Archives/edgar/data/818479/000081847914000004/dentsply201310-k.htm",
"https://www.sec.gov/Archives/edgar/data/1045810/000104581009000013/fy2009form10k.htm",
"https://www.sec.gov/Archives/edgar/data/789019/000119312515272806/d918813d10k.htm",
"https://www.sec.gov/Archives/edgar/data/315293/000104746913001494/a2212713z10-k.htm",
"https://www.sec.gov/Archives/edgar/data/789019/", "https://www.sec.gov/Archives/edgar/data/1045810/000104581015000036/nvda-2015x10k.htm",
"https://www.sec.gov/Archives/edgar/data/1045810/000104581011000015/fy2011form10k.htm",
"https://www.sec.gov/Archives/edgar/data/1090012/000119312514076267/d656849d10k.htm"
), mdlink = c("[Filing Link](https://www.sec.gov/Archives/edgar/data/27419/000002741914000014/0000027419-14-000014-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/1090012/000095013409003904/0000950134-09-003904-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/818479/000081847904000031/0000818479-04-000031-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/818479/0000818479-99-000003-index.html)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/789019/000119312511200680/0001193125-11-200680-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/789019/000119312504150689/0001193125-04-150689-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/27904/000002790417000004/0000027904-17-000004-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/315293/000104746912001478/0001047469-12-001478-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/315293/000104746905006608/0001047469-05-006608-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/27419/000104746910002121/0001047469-10-002121-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/27419/0001047469-98-015191-index.html)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/98246/000095012309005683/0000950123-09-005683-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/818479/000081847914000004/0000818479-14-000004-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/1045810/000104581009000013/0001045810-09-000013-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/789019/000119312515272806/0001193125-15-272806-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/315293/000104746913001494/0001047469-13-001494-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/789019/0000891020-95-000433-index.html)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/1045810/000104581015000036/0001045810-15-000036-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/1045810/000104581011000015/0001045810-11-000015-index.htm)",
"[Filing Link](https://www.sec.gov/Archives/edgar/data/1090012/000119312514076267/0001193125-14-076267-index.htm)"
), reportLink = c("[10-K Link](https://www.sec.gov/Archives/edgar/data/27419/000002741914000014/tgt-20140201x10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/1090012/000095013409003904/d66379e10vk.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/818479/000081847904000031/f102003.txt)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/818479/)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/789019/000119312511200680/d10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/789019/000119312504150689/d10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/27904/000002790417000004/dal1231201610k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/315293/000104746912001478/a2207295z10-k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/315293/000104746905006608/a2152901z10-k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/27419/000104746910002121/a2196751z10-k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/27419/)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/98246/000095012309005683/y75075e10vk.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/818479/000081847914000004/dentsply201310-k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/1045810/000104581009000013/fy2009form10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/789019/000119312515272806/d918813d10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/315293/000104746913001494/a2212713z10-k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/789019/)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/1045810/000104581015000036/nvda-2015x10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/1045810/000104581011000015/fy2011form10k.htm)",
"[10-K Link](https://www.sec.gov/Archives/edgar/data/1090012/000119312514076267/d656849d10k.htm)"
)), row.names = c(64L, 158L, 143L, 148L, 90L, 97L, 109L,
24L, 31L, 68L, 80L, 49L, 133L, 10L, 86L, 23L, 106L, 4L, 8L, 153L
), class = "data.frame")