使用 rvest 进行网页抓取时收到错误消息?

时间:2021-01-12 18:34:06

标签: rvest shapefile

我想从以下网站抓取所有 shapefile:https://www.sciencebase.gov/catalog/items?q=&filter0=browseCategory%3DData&community=California+Condor&filter1=browseType%3DMap+Service&filter2=browseType%3DOGC+WMS+Layer&filter3=browseType%3DDownloadable&filter4=facets.facetName%3DShapefile&&filter5=browseType%3DShapefile

我使用了以下脚本:

installed.packages('rvest')
library(rvest)
library(tidyverse)

## CONSTANT ----

URL <- "https://www.sciencebase.gov/catalog/item/54471eb5e4b0f888a81b82ca"
dir_out <- "~/condor"

## MAIN ----

# Get the webpage content
webpage <- read_html(URL)

# Extract the information of interest from the website
data <- html_nodes(webpage, ".sb-file-get sb-download-link")

# Grab the base URLs to download all the referenced data
url_base <- html_attr(data,"href")

# Filter the zip files
shapefile_base <- grep("*.zip",url_base, value=TRUE)

# Fix the double `//`
shapefile_fixed <- gsub("//", "/", shapefile_base)

# Add the URL prefix
shapefile_full <- paste0("https://www.sciencebase.gov/",shapefile_fixed)

# Create the output directory
dir.create(dir_out, showWarnings = FALSE)

# Create a list of filenames
filenames_full <- file.path(dir_out,basename(shapefile_full))

# Download the files
lapply(shapefile_full, FUN=function(x) download.file(x, file.path(dir_out,basename(x))))

# Unzip the files
unzip(filenames_full, overwrite = TRUE)

但是对于下载文件部分,我收到以下错误

> lapply(shapefile_full, FUN=function(x) download.file(x, file.path(dir_out,basename(x))))
trying URL 'https://www.sciencebase.gov/'
Content type 'text/html' length unknown
downloaded 111 bytes

[[1]]
[1] 0

> # Unzip the files
> unzip(filenames_full, overwrite = TRUE)
Warning message:
In unzip(filenames_full, overwrite = TRUE) :
  error 1 in extracting from zip file

1 个答案:

答案 0 :(得分:0)

这对我有用,并在我的工作目录中生成 6 个 zip 文件。

library(rvest)
URL <- "https://www.sciencebase.gov/catalog/item/54471eb5e4b0f888a81b82ca"
webpage <- read_html(URL)

# Extract the information of interest from the website
data <- html_nodes(webpage, "span.sb-file-get")

# Grab the base URLs to download all the referenced data
url_base <- html_attr(data,"data-url")

# Filter the zip files
shapefile_full <- paste0("https://www.sciencebase.gov",url_base)

Map(download.file, shapefile_full, sprintf('file%d.zip', seq_along(shapefile_full)))