带循环的网页抓取

时间:2019-11-18 10:05:30

标签: r loops web-scraping rvest xml2

我正在尝试使用for循环从网站上抓取一些文本,但是循环并没有依次前进到向量中的下一项。感谢任何有用的建议。谢谢

library(rvest)
library(xml2)


# Lookup table for the countries to scrape: a numeric ID, the display name
# (Land) and the URL slug of each country page (url). Text is a placeholder
# (NA) that the loop below is meant to fill with the scraped text.
ID <- c(1:2)
Land <- c('Afghanistan','Ägypten')
url <- c('afghanistan', 'aegypten') 
Text <- (NA)

# Note that url is not passed to data.frame(), so data has only ID, Land, Text.
data <- data.frame(ID, Land, Text)

# NOTE(review): this is the loop the question is about. It does not fill
# data$Text per country, for the reasons marked on the lines below.
for(i in url) {
  # nam already holds the complete page URL, e.g. ".../afghanistan".
  nam <- paste("https://www.reporter-ohne-grenzen.de", i, sep = "/")
  # Creates a variable whose *name* is the URL string and whose value is the
  # slug; nothing else in this snippet reads it.
  assign(nam, i)

  # paste0(nam, i) appends the slug a second time
  # (".../afghanistanafghanistan"), so the URL requested here is not the one
  # built above.
  webpage <- read_html(paste0(nam, i))
  # Assigns to the whole Text column (no row index), so each iteration
  # overwrites every row; the chained assignment also overwrites the loop
  # variable i with the scraped text.
  data$Text <- i <- webpage %>% html_nodes('div.text') %>% .[[1]] %>% html_text() 
}

嗯,不确定我是否把问题说清楚了。下面是我想要的数据输出的示例。

library(rvest)
library(xml2)

# Two-country lookup table; Text starts as NA and is replaced at the end.
ID <- 1:2
Land <- c('Afghanistan', 'Ägypten')
url <- c('afghanistan', 'aegypten')
Text <- NA

data <- data.frame(ID, Land, Text)


# Fetch each country page by hand and keep the text of its first
# 'div.text' node.
afghanistan <- 'https://www.reporter-ohne-grenzen.de/afghanistan' %>%
  read_html() %>%
  html_nodes('div.text') %>%
  html_text() %>%
  .[[1]]

aegypten <- 'https://www.reporter-ohne-grenzen.de/aegypten' %>%
  read_html() %>%
  html_nodes('div.text') %>%
  html_text() %>%
  .[[1]]

# desired data output
data$Text <- c(afghanistan, aegypten)

我不想为180个国家/地区逐一重复下面这些代码行:

# The per-country boilerplate: download the page, select the 'div.text'
# nodes, and keep the text of the first one.
aegypten <- html_text(
  html_nodes(
    read_html('https://www.reporter-ohne-grenzen.de/aegypten'),
    'div.text'
  )
)[[1]]

以下是解决方法:

library(rvest)
library(xml2)

# Countries to scrape. The three vectors are parallel: `Land` is the display
# name and `Url` is the path segment of the country page on the site.
ID <- 1:4
Land <- c('Afghanistan', 'Ägypten', 'Deutschland', 'Italien')
Url <- c('afghanistan', 'aegypten', 'deutschland', 'italien')
# Character NA so the column already has its final type before the loop.
Text <- NA_character_

# stringsAsFactors = FALSE keeps Land/Text as character on R < 4.0 as well;
# a factor Text column would reject the scraped strings.
data <- data.frame(ID, Land, Text, stringsAsFactors = FALSE)
website <- 'https://www.reporter-ohne-grenzen.de'

# Fill data$Text row by row with the text of the first 'div.text' node of
# each country page. Iterate over positions in Url rather than over the
# values of ID, so the loop does not depend on ID being exactly 1..n.
for (i in seq_along(Url)) {
  country <- Url[i]

  html_url <- paste(website, country, sep = '/')
  output <- read_html(html_url)
  # html_node() returns the first match, or a missing node (whose text is NA)
  # when the page has no 'div.text'. Indexing html_nodes() with [[1]] would
  # instead stop with "subscript out of bounds" and abort the remaining
  # countries.
  output <- html_node(output, 'div.text')
  output <- html_text(output)

  data$Text[i] <- output
}

2 个答案:

答案 0 :(得分:1)

即使for循环真的很方便,您通常也可以通过创建可以迭代的函数来解决R中的迭代。

在此示例中,我们可以把for循环的循环体放入一个函数中,然后使用purrr的map()函数(在这里是它的变体map_chr()),并把它放在dplyr的mutate()中,将文本结果存储到列中。

library(rvest)
#> Loading required package: xml2
library(xml2)
library(tidyverse)

# Parallel vectors describing the countries to scrape. `url` is included in
# the data frame so it can be used as a column later; Text starts as NA.
ID <- 1:2
Land <- c('Afghanistan', 'Ägypten')
url <- c('afghanistan', 'aegypten')
Text <- NA

data <- data.frame(ID = ID, Land = Land, url = url, Text = Text)

#' Scrape the text of one country page
#'
#' Downloads the country page from www.reporter-ohne-grenzen.de and extracts
#' the text of the first `div.text` node on it.
#'
#' @param country_url Path segment of the country page, e.g. "afghanistan".
#' @return The text of the first `div.text` node of the page.
read_country <- function(country_url) {
  page_url <- paste0("https://www.reporter-ohne-grenzen.de/", country_url)
  text_nodes <- html_nodes(read_html(page_url), 'div.text')
  html_text(text_nodes[[1]])
}

# Scrape every row's page (via its url column) and store the result in Text.
data <- mutate(data, Text = map_chr(url, read_country))

由reprex包(v0.3.0)于2019-11-18创建

答案 1 :(得分:0)

使用purrr函数和rvest,我们可以做到

library(purrr)
library(rvest)

# Build the full page URL for every slug, scrape the text of the first
# 'div.text' node of each page, and store the results as a character column.
data$Text <- map_chr(
  paste0("https://www.reporter-ohne-grenzen.de/", url),
  function(page_url) {
    page_url %>%
      read_html() %>%
      html_nodes('div.text') %>%
      html_text() %>%
      .[[1]]
  }
)

数据

# Example data for the snippet above: parallel country vectors, plus a data
# frame whose Text column starts as NA. `url` stays a free-standing vector.
ID <- 1:2
Land <- c('Afghanistan', 'Ägypten')
url <- c('afghanistan', 'aegypten')
Text <- NA
data <- data.frame(ID = ID, Land = Land, Text = Text)