我想抓取多个网址的标题。根据网站页面,标题位于不同的标签/节点下。我想使用一条if语句,因此,如果我使用其中一个节点获取字符(0),则应使用其他标记/节点,依此类推。
(".tittleArticuloOpinion")
(".nameColumnista")
(".article-header h2")
prueba_titulos2 =lapply(noticias_semana_lapply[12:14,1], function(x) {
tryCatch(
{
Sys.sleep(0.1)
read_html(x) %>% html_nodes(".tittleArticuloOpinion") %>% html_text %>%
{if(length(.) == 0) read_html(x) %>% html_nodes(".nameColumnista") %>% html_text else {
if (length(.) == 0) read_html(x) %>% html_nodes(".article-header h2") %>%
html_text}} %>% as.character
},
error = function(cond) return(NULL),
finally = print(x)
)
})
noticias_semana_lapply
是10.000个网址的列表。这是我正在抓取的三个网站的可复制示例:
dput(noticias_semana_lapply[12:14,1])
"http://www.semana.com/nacion/articulo/cuales-cree-temas-principales-deben-tratar-dialogos-del-gobierno-farc/263693-3"
"http://www.semana.com/confidenciales-semanacom/articulo/las-farc-marcha-patriotica/263691-3"
"http://www.semana.com/nacion/articulo/procuraduria-formulo-cargos-contra-dos-excongresistas-chocoanos/263685-3"
prueba_titulos3 =lapply(noticias_semana_lapply[12:14,1], function(x) {
tryCatch(
{
Sys.sleep(0.1)
if(character(length = 0)) {
read_html(x) %>% html_nodes(".tittleArticuloOpinion") %>% html_text
read_html(x) %>% html_nodes(".nameColumnista") %>% html_text
read_html(x) %>% html_nodes(".article-header h2") %>% html_text
} else {.}%>%
as.character
},
error = function(cond) return(NULL),
finally = print(x)
)
})
有人可以帮我吗?非常感谢!
答案 0 :(得分:2)
使用CSS或语法来匹配其中任何一个还不够吗?
即
read_html(url) %>% html_nodes(".tittleArticuloOpinion, .nameColumnista, .article-header h2") %>% html_text
示例:
library(rvest)
library(magrittr)
urls = c("http://www.semana.com/nacion/articulo/cuales-cree-temas-principales-deben-tratar-dialogos-del-gobierno-farc/263693-3"
,"http://www.semana.com/confidenciales-semanacom/articulo/las-farc-marcha-patriotica/263691-3"
,"http://www.semana.com/nacion/articulo/procuraduria-formulo-cargos-contra-dos-excongresistas-chocoanos/263685-3")
for(url in urls){
x <- read_html(url) %>% html_nodes(".tittleArticuloOpinion, .nameColumnista, .article-header h2") %>% html_text
print(x)
}
答案 1 :(得分:0)
我认为问题是您不应该使用if...else
,而只能使用if
,因为如果不满足第一个条件,使用前者将不会检查下一个条件。这是一个易于阅读的版本,它返回所有3个共享链接的输出。
library(rvest)
lapply(noticias_semana_lapply[12:14,1], function(x) {
new_x <- read_html(x) %>% html_nodes(".tittleArticuloOpinion") %>% html_text
if(length(new_x) == 0)
new_x <- read_html(x) %>% html_nodes(".nameColumnista") %>% html_text
if(length(new_x) == 0)
new_x <- read_html(x) %>% html_nodes(".article-header h2") %>% html_text
return(new_x)
})