I am trying to download many files from the WorldPop UK site, for many of the countries in the dataset (not just a small example). Downloading each file by hand would be very slow and tedious.
I am quite familiar with download methods in R, but I cannot get these downloads to work. I know this is because the download links are driven through the HTML somehow, but I am not good with HTML or JavaScript.
I have done a lot of reading on httr, RCurl and RSelenium. I would prefer a solution that avoids RSelenium, because I am more familiar with the other packages, may share the code, and would rather not have to set up a browser every time (at least, that is my understanding of RSelenium).
Can someone help me work this out?
A direct download link to a small text (.txt) file, which works fine in a browser but not in R with download.file or curl_download: http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=download&file=60
The page with the index of Nigeria files (you can see the href= links in the HTML source): http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
View source in Chrome: http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
The download links sit between lines 558 and 559 in my developer console.
Thanks in advance!
Answer 0 (score: 4)
Well, they certainly don't make this easy. On top of the convoluted "web app", they tried to do the right thing and added SHA1 subresource integrity checks to their JavaScript assets, but failed to keep them current (i.e. security-conscious browsers won't be able to use the site).
Anyway, here is what you have to do to avoid splashr or RSelenium/seleniumPipes. I used the "README" file as the example and commented liberally.
My suggestion is to wrap bits of this into one or more functions for easier use, and to consider wrapping the various calls in purrr helpers such as safely() (there are also "retry" examples oot and aboot).
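For instance, a minimal sketch of that wrapping idea (safe_GET and get_with_retry are illustrative names, not part of the walkthrough code below):
library(httr)
library(purrr)
# safely() wraps GET so it never throws; it returns list(result, error)
safe_GET <- safely(GET)
get_with_retry <- function(url, tries = 3) {
  for (i in seq_len(tries)) {
    res <- safe_GET(url, timeout(120))
    if (is.null(res$error)) return(res$result)
    Sys.sleep(2 * i)  # simple backoff before the next attempt
  }
  stop("Failed to fetch ", url, " after ", tries, " tries")
}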
library(httr)
library(rvest)
library(tidyverse)
# Need to "prime" the session with a cookie
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
# Find the summary links
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
# extract the table cells & href so we can make a data frame
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
  map(html_children) %>%
  map(html_text) %>%
  map(~.[1:4]) %>%
  map(as.list) %>%
  map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
  bind_cols(
    data_frame(
      summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
    )
  ) -> world_pop_data
glimpse(world_pop_data)
## Observations: 462
## Variables: 5
## $ continent <chr> "Africa", "Africa", "Africa", "Africa", "Africa", "Africa", "Afri...
## $ country <chr> "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burund...
## $ resolution <chr> "100m", "100m", "100m", "100m", "100m", "100m", "100m", "100m", "...
## $ data_type <chr> "Population", "Population", "Population", "Population", "Populati...
## $ summary_link <chr> "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP0000...
# just see "Nigeria" data
filter(world_pop_data, country=="Nigeria")
## # A tibble: 8 x 5
## continent country resolution data_type summary_link
## <chr> <chr> <chr> <chr> <chr>
## 1 Africa Nigeria 100m Population http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196
## 2 Africa Nigeria 1km Births http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00357
## 3 Africa Nigeria 1km Pregnancies http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00465
## 4 Africa Nigeria 1km Contraceptive Use http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00198
## 5 Africa Nigeria 1km Literacy http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00199
## 6 Africa Nigeria 1km Poverty http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00200
## 7 Africa Nigeria 1km Stunting http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00201
## 8 Africa Nigeria 100m Age structures http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00194
I'm fairly certain you can start any file download session attempt from one of the ^^ URLs, but you'll need to test that, since you may need to always start from the "main" page (as noted, it keeps track of the session via that cookie).
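If session state does matter, one way to keep the cookie across calls is to pin every request to a single httr handle (a sketch, untested against this site):
library(httr)
# One shared handle re-uses the same underlying curl session (and its cookies)
wp <- handle("http://www.worldpop.org.uk")
res  <- GET(handle = wp, path = "/data/data_sources/")
res2 <- GET(handle = wp, path = "/data/summary", query = list(doi = "10.5258/SOTON/WP00196"))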
# get nigeria population URL
filter(world_pop_data, country=="Nigeria") %>%
  filter(data_type=="Population") %>%
  pull(summary_link) -> nigeria_pop
nigeria_pop
# [1] "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196"
# follow it
GET(url=nigeria_pop) -> res2
pg2 <- content(res2)
There is always a <form> on that page, so we need to "submit" said form with a POST:
# extract "form" fields (that page does a POST request)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
str(as.list((fields))) # just to show what it looks like
## List of 4
## $ zip_id : chr "140"
## $ zip_title: chr "Nigeria 100m Population"
## $ decoy : chr "website"
## $ website : chr NA
# submit the form with the field data.
# NOTE we need to add the `Referer` (the faux page we're on)
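# The empty client_* fields mirror the site's optional contact form; 'decoy' and
# 'website' look like a honeypot pair (an inference), so 'website' stays blank.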
POST(
  url = "http://www.worldpop.org.uk/data/download/",
  add_headers(`Referer` = nigeria_pop),
  body = list(
    client_first_name = "",
    client_last_name = "",
    client_organization = "",
    client_country = "",
    client_email = "",
    client_message = "",
    zip_id = fields["zip_id"],
    zip_title = fields["zip_title"],
    decoy = fields["decoy"],
    website = "",
    download = "Browse Individual Files"
  ),
  encode = "form"
) -> res3
Somewhere on the resulting page is a "switch to file list" link, so we need to find it and follow it:
# find the link that has the file list
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
  html_attr("href") -> file_list_query_string # just to see the format
## [1] "?dataset=140&action=dir"
# follow that link (we need to use some of the previous captured fields)
GET(
  url = "http://www.worldpop.org.uk/data/files/index.php",
  query = list(
    dataset = fields["zip_id"],
    action = "dir"
  )
) -> res4
Now we build a data frame of all the download links on that page:
pg4 <- content(res4)
data_frame(
  group_name = html_nodes(pg4, "a.dl") %>% html_text(),
  href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads
## # A tibble: 60 x 2
## group_name href
## <chr> <chr>
## 1 Licence.txt ?dataset=140&action=download&file=1
## 2 NGA_metadata.html ?dataset=140&action=download&file=2
## 3 NGA_pph_v2c_2006.tfw ?dataset=140&action=download&file=3
## 4 NGA_pph_v2c_2006.tif ?dataset=140&action=download&file=4
## 5 NGA_pph_v2c_2006.tif.aux.xml ?dataset=140&action=download&file=5
## 6 NGA_pph_v2c_2006.tif.xml ?dataset=140&action=download&file=6
## 7 NGA_pph_v2c_2010.tfw ?dataset=140&action=download&file=7
## 8 NGA_pph_v2c_2010.tif ?dataset=140&action=download&file=8
## 9 NGA_pph_v2c_2010.tif.aux.xml ?dataset=140&action=download&file=9
## 10 NGA_pph_v2c_2010.tif.xml ?dataset=140&action=download&file=10
## # ... with 50 more rows
As I mentioned before, you may need to always start from the beginning or from the previously linked page, but it is also possible you can just download all of these in sequence. You'll need to test. It's a painful site.
filter(downloads, str_detect(group_name, "README")) %>%
  pull(href) -> readme_query_string # we need this below
readme_query_string
## [1] "?dataset=140&action=download&file=60"
# THERE IS A REALLY GOOD CHANCE YOU'LL NEED TO USE timeout() for
# some of these calls. That server takes a while.
# Right here is where that "preparing the data" modal is shown.
# I'm 99% certain it is there to slow down crawlers/scrapers.
GET(
  url = "http://www.worldpop.org.uk/data/files/index.php",
  query = parse_url(readme_query_string)$query,
  verbose()
) -> res5
This isn't what you'd actually do for real downloads. You likely want content(res5, as="raw") plus writeBin(), since some (most) of the files are not plain text (a bulk-download sketch follows the README output below). But this shows that all of the above works:
content(res5, as="text") %>%
  cat()
## WorldPop Africa dataset details
## _______________________
##
## DATASET: Alpha version 2010, 2015 and 2020 estimates of numbers of people per pixel ('ppp') and people per hectare ('pph'), with national totals adjusted to match UN population division estimates (http://esa.un.org/wpp/) and remaining unadjusted.
## REGION: Africa
## SPATIAL RESOLUTION: 0.000833333 decimal degrees (approx 100m at the equator)
## PROJECTION: Geographic, WGS84
## UNITS: Estimated persons per grid square
## MAPPING APPROACH: Random Forest
## FORMAT: Geotiff (zipped using 7-zip (open access tool): www.7-zip.org)
## FILENAMES: Example - NGA_ppp_v2b_2010_UNadj.tif = Nigeria (NGA) population per pixel (ppp), mapped using WorldPOP modelling version 2b (v2b) for 2010 (2010) adjusted to match UN national estimates (UNadj).
## DATE OF PRODUCTION: February 2017
##
## Also included: (i) Metadata html file, (ii) Google Earth file, (iii) Population datasets produced using original census year data (2006).
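Building on the pieces above, here is that bulk-download sketch (hypothetical and untested against the live site; per the earlier caveat you may need to re-prime the session between files):
library(httr)
library(purrr)
dir.create("worldpop_nga", showWarnings = FALSE)
# Walk the `downloads` data frame, streaming each file straight to disk
walk2(downloads$href, downloads$group_name, function(href, fname) {
  GET(
    url = "http://www.worldpop.org.uk/data/files/index.php",
    query = parse_url(href)$query,
    timeout(300),
    write_disk(file.path("worldpop_nga", fname), overwrite = TRUE)
  )
  Sys.sleep(5)  # be polite; the server is slow anyway
})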
If you stick with this, consider adding an answer with what you ended up doing, or turning it into a package so others can use it as well.
Answer 1 (score: 0)
HEY STACKOVERFLOW DELETERS, look at this one. I would most likely have combined it with my other answer, but I am limited to 30K characters and there is a lot of code.
This can now be downloaded as a GitHub package called spaceheater. Thanks again to @hrbrmstr, without whom this would not have been possible.
library(devtools)
install_github("nbarsch/spaceheater")
I split it into three functions:
getWPdatatypes("country")
getWPoptions("country", "datatype")
getWPdownload("country", "datatype", c("options"), year)
### takes 0, 1 or 2 options; for 0 options pass ""
getWPdatatypes("country")
#Example
getWPdatatypes("Nigeria")
is the base level and returns a data frame of the data types available for that country from WorldPop. Example data types are "Population", "Births", "UrbanChange".
getWPoptions("country", "datatype")
#Example
getWPoptions("Ghana", "Population")
is designed to be run after getWPdatatypes and returns a data frame of all the options and years available for the given data type in that country.
getWPdownload("country", "datatype", c("options"), year)
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
downloads the set, thanks to hrbrmstr. If a set's year is not labeled in the file name, it is tagged 9999; the WorldPop site lists the years. Their file naming scheme is brutal, and most of the code exists to cope with the inconsistent file names.
That said, I love their methodology and the gridded raster sets they have built, so if anyone uses this code, please cite WorldPop. :)
Answer 2 (score: 0)
This is now part of the spaceheater package on GitHub; see my other answer for instructions: https://github.com/nbarsch/spaceheater. The reference code for what I did is below.
Libraries:
library(httr)
library(rvest)
library(tidyverse)
library(raster)
library(rgdal)
library(reshape2)
library(rangeBuilder)
library(stringr)
library(foreach)
library(dplyr)
Functions (because of the character limit I had to strip most of the #notes from the code; see my other answer for the relevant explanations. A large amount of the code is due to the inconsistent file names):
getWPdatatypes:
getWPdatatypes <- function (country) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\\s*\\([^\\)]+\\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
  stop("It appears this country is not in the WorldPop set, please check and try again")
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
print(countryreference[1,])
WPdata.types <<- countryreference[1,]
print("The above table has also been added to your working environment as dataframe: WPdata.types")
}
getWPoptions:
#Example
getWPdatatypes("Nigeria", "Population")
getWPoptions <- function (country, datatype) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\\s*\\([^\\)]+\\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
  stop("It appears this country is not in the WorldPop set, please check and try again")
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
# Some, such as Senegal, are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
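# The char4/char6/char9/char11 probes below test which positions of the first
# file name hold digits, since WorldPop encodes the year at different offsets
# depending on the product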
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
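# getfouryear expands a 2-digit year to 4 digits, pivoting at 50
# (e.g. 15 -> 2015, 85 -> 1985)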
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
###Possible Options due to the inexplicable nature of their inconsistent file names
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", "$1.25/day",
"$2.00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
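# b == 13 is the "200" ($2.00/day) pattern, which false-matches
# file names containing 4-digit years such as 2000 or 2006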
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
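# Shuffle each row so the non-NA matches come first, then drop all-NA columns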
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
###join all the options so they can be displayed
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
print(downpossopt)
WP.options <<- downpossopt
print("The above table has also been added to your working environment as dataframe: WP.options")
}
getWPdownload:
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
# If a set is missing a year, pass 9999; getWPoptions returns 9999 for sets with missing years.
getWPdownload <- function (country, datatype, options, year) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
year <- as.numeric(year)
optionschosen <- paste(options, collapse = '-')
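# Chosen options are joined with '-' so they can be matched against the
# 'optionspossible' strings assembled further down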
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
###Filter country names so they match the desired country
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\\s*\\([^\\)]+\\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
  stop("It appears this country is not in the WorldPop set, please check and try again")
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
# Some, such as Senegal, are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", "$1.25/day",
"$2.00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
###get options for each file from the worldpop selected country and datatype###
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
###get the right codes for the function
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
if(ncol(downpossoptcodes)>1){
downpossoptcodes <- data.frame(x=apply(downpossoptcodes,1,function(x) {paste(x[!is.na(x)],collapse='-')}))
}
colnames(downpossoptcodes) <- "optionspossible"
downloads <- as.data.frame(cbind(downloads, downpossoptcodes))
downloads <- filter(downloads, downloads$years==year)
if(optionschosen!=""){
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
if(is.na(downloads2[1,1]) & length(options)>1){
optionschosen <- paste0(options[2],"-", options[1], collapse='')
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
}
}else{downloads2 <- filter(downloads, is.na(downloads$optionspossible))}
readme_query_stringdownload <- as.character(downloads2[1,"href"])
filenamedest <- as.character(paste0(country,downloads2[1,"group_name"]))
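# Stream the chosen file straight to disk; progress()/verbose() report transfer status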
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = parse_url(readme_query_stringdownload)$query,
progress(),
verbose(),
write_disk(paste0(filenamedest), overwrite=TRUE)
)-> res5
}