Question

我在一个csv文件中列出了40,000个网页地址。我想读取这些页面，并把结果写入一个新的csv文件，使其中每个单元格都是对应网页的内容。我可以使用以下代码读取（解析）单个网页：

library(XML)

# Parse the HTML document into an internal node tree so that it can be
# queried with XPath.
doc.html <- htmlTreeParse('', useInternalNodes = TRUE)

# Build the page text in one expression, innermost call first:
#   1. xpathApply() collects the text value of every <p> node,
#   2. unlist() flattens that list into a character vector,
#   3. gsub() turns each newline into a space,
#   4. paste() joins the paragraphs into one space-separated string.
doc.text <- paste(
  gsub('\\n', ' ', unlist(xpathApply(doc.html, '//p', xmlValue))),
  collapse = ' '
)

是否可以把包含网页地址的csv作为输入，得到一个包含上述所有网页内容的新文件？

Answer 1

您可以尝试以下代码。它应该能满足您的需求，但未经测试，因为我不知道您要读取的是哪些网站：

library(XML)
library(rvest)


# Load the table of page addresses. Strings are kept as plain character
# vectors (not factors) so they can be passed on as URLs; FALSE is spelled
# out because the shorthand F is an ordinary variable that can be reassigned.
df <- read.csv("Webpage_urls.csv", stringsAsFactors = FALSE)

# Download one web page and return the text of all its <p> elements as a
# single string.
#
# Args:
#   x: A single URL (or path to a local HTML file) as a character string.
#
# Returns:
#   A length-one character vector holding the paragraph texts joined by
#   spaces, with newlines replaced by spaces. If the page cannot be read or
#   parsed, a warning is raised and NA_character_ is returned, so that one
#   bad address does not abort a long batch run.
webpage_parser <- function(x) {
  tryCatch(
    {
      # Stay within rvest/xml2 for the whole pipeline: the document returned
      # by read_html() is not a valid input for XML::htmlTreeParse().
      page <- read_html(x)
      paragraphs <- html_text(html_elements(page, "p"))

      # Replace all \n by spaces
      paragraphs <- gsub("\\n", " ", paragraphs)

      # Join the paragraphs into one string and return it visibly.
      paste(paragraphs, collapse = " ")
    },
    error = function(e) {
      warning(
        "Failed to parse ", paste(x, collapse = ", "), ": ",
        conditionMessage(e),
        call. = FALSE
      )
      NA_character_
    }
  )
}

# Parse every address, one call per URL. The iteration is over the column
# df[[1]] and not over `df` itself: lapply()/vapply() on a data frame walk
# its columns, which would hand the parser the whole URL vector at once.
# NOTE(review): assumes the URLs are in the first column of the csv - confirm.
Pages <- vapply(df[[1]], webpage_parser, character(1), USE.NAMES = FALSE)

# Put each page's text next to the address it came from.
Parsed_pages <- cbind(df, Pages = Pages, stringsAsFactors = FALSE)

write.csv(Parsed_pages, "All_parsed_pages.csv", row.names = FALSE)

如果想要并行处理，可以使用R中的doParallel库，它会建立一个由多个工作进程（R实例）组成的集群，应该有助于加快处理速度。

library(doParallel)
# Use all cores but one, and never fewer than one worker (detectCores() can
# return 1 or NA).
n_workers <- max(1L, detectCores() - 1L, na.rm = TRUE)

# Split the URL table into contiguous chunks of rows, one per worker. Sorting
# the recycled worker ids keeps the rows in their original order, so the
# row-bound result lines up with `df`.
chunk_id <- sort(rep_len(seq_len(n_workers), nrow(df)))
Split_df <- split(df, chunk_id)

# Here I initiate my cluster
cl <- makeCluster(n_workers)
registerDoParallel(cl)

# Each worker receives only its own chunk. `.packages` loads the parsing
# packages on the workers; foreach exports `webpage_parser` from the calling
# environment because the loop body references it. `finally` shuts the
# cluster down even when the loop fails.
# NOTE(review): assumes the URLs are in the first column of the csv - confirm.
Parsed_pages <- tryCatch(
  foreach(
    chunk = Split_df,
    .combine = rbind,
    .packages = c("rvest", "XML")
  ) %dopar% {
    # One parser call per URL in this chunk.
    Pages <- vapply(
      chunk[[1]], webpage_parser, character(1),
      USE.NAMES = FALSE
    )
    cbind(chunk, Pages = Pages, stringsAsFactors = FALSE)
  },
  finally = stopCluster(cl)
)

write.csv(Parsed_pages, "All_parsed_pages.csv", row.names = FALSE)

读取R中的网页列表并将输出保存在csv中

1 个答案: