I am trying to download many files from the WorldPop UK site, for many of the countries in the dataset (not just a small example). Downloading each file by hand would be very slow and tedious.
I am quite familiar with download methods in R, but I cannot get these downloads to work. I know this is because the download links are driven through the HTML somehow, but I am not good with HTML or JavaScript.
I have done a lot of reading on httr, RCurl and RSelenium. I would prefer a solution that avoids RSelenium, because I am more familiar with the other packages, may share the code, and would rather not have to set up a browser every time (at least, that is my understanding of RSelenium).
Can someone help me work this out?
A direct download link to a small text (.txt) file, which works fine in a browser but not in R with download.file or curl_download: http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=download&file=60
The page with the index of Nigeria files (you can see the href= links in the HTML source): http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
View source in Chrome: http://www.worldpop.org.uk/data/files/index.php?dataset=140&action=dir
The download links sit between lines 558 and 559 in my developer console.
Thanks in advance!
Answer 0 (score: 4)
Well, they certainly don't make this easy. On top of the convoluted "web app", they tried to do the right thing and added SHA1 subresource integrity checks to their JavaScript assets, but failed to keep them current (i.e. security-conscious browsers won't be able to use the site).
Anyway, here is what you have to do to avoid splashr or RSelenium/seleniumPipes. I used the "README" file as the example and commented liberally.
My suggestion is to wrap bits of this into one or more functions for easier use, and to consider wrapping the various calls in purrr helpers such as safely() (there are also "retry" examples oot and aboot).
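For instance, a minimal sketch of that wrapping idea (safe_GET and get_with_retry are illustrative names, not part of the walkthrough code below):
library(httr)
library(purrr)
# safely() wraps GET so it never throws; it returns list(result, error)
safe_GET <- safely(GET)
get_with_retry <- function(url, tries = 3) {
  for (i in seq_len(tries)) {
    res <- safe_GET(url, timeout(120))
    if (is.null(res$error)) return(res$result)
    Sys.sleep(2 * i)  # simple backoff before the next attempt
  }
  stop("Failed to fetch ", url, " after ", tries, " tries")
}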
library(httr)
library(rvest)
library(tidyverse)
# Need to "prime" the session with a cookie
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
# Find the summary links
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
# extract the table cells & href so we can make a data frame
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
  map(html_children) %>%
  map(html_text) %>%
  map(~.[1:4]) %>%
  map(as.list) %>%
  map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
  bind_cols(
    data_frame(
      summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
    )
  ) -> world_pop_data
glimpse(world_pop_data)
## Observations: 462
## Variables: 5
## $ continent <chr> "Africa", "Africa", "Africa", "Africa", "Africa", "Africa", "Afri...
## $ country <chr> "Algeria", "Angola", "Benin", "Botswana", "Burkina Faso", "Burund...
## $ resolution <chr> "100m", "100m", "100m", "100m", "100m", "100m", "100m", "100m", "...
## $ data_type <chr> "Population", "Population", "Population", "Population", "Populati...
## $ summary_link <chr> "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP0000...
# just see "Nigeria" data
filter(world_pop_data, country=="Nigeria")
## # A tibble: 8 x 5
## continent country resolution data_type summary_link
## <chr> <chr> <chr> <chr> <chr>
## 1 Africa Nigeria 100m Population http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196
## 2 Africa Nigeria 1km Births http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00357
## 3 Africa Nigeria 1km Pregnancies http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00465
## 4 Africa Nigeria 1km Contraceptive Use http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00198
## 5 Africa Nigeria 1km Literacy http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00199
## 6 Africa Nigeria 1km Poverty http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00200
## 7 Africa Nigeria 1km Stunting http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00201
## 8 Africa Nigeria 100m Age structures http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00194
I'm fairly certain you can start any file download session attempt from one of the ^^ URLs, but you'll need to test that, since you may need to always start from the "main" page (as noted, it keeps track of the session via that cookie).
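If session state does matter, one way to keep the cookie across calls is to pin every request to a single httr handle (a sketch, untested against this site):
library(httr)
# One shared handle re-uses the same underlying curl session (and its cookies)
wp <- handle("http://www.worldpop.org.uk")
res  <- GET(handle = wp, path = "/data/data_sources/")
res2 <- GET(handle = wp, path = "/data/summary", query = list(doi = "10.5258/SOTON/WP00196"))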
# get nigeria population URL
filter(world_pop_data, country=="Nigeria") %>%
  filter(data_type=="Population") %>%
  pull(summary_link) -> nigeria_pop
nigeria_pop
# [1] "http://www.worldpop.org.uk/data/summary?doi=10.5258/SOTON/WP00196"
# follow it
GET(url=nigeria_pop) -> res2
pg2 <- content(res2)
There is always a <form> on that page, so we need to "submit" said form with a POST:
# extract "form" fields (that page does a POST request)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
str(as.list((fields))) # just to show what it looks like
## List of 4
## $ zip_id : chr "140"
## $ zip_title: chr "Nigeria 100m Population"
## $ decoy : chr "website"
## $ website : chr NA
# submit the form with the field data.
# NOTE we need to add the `Referer` (the faux page we're on)
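# The empty client_* fields mirror the site's optional contact form; 'decoy' and
# 'website' look like a honeypot pair (an inference), so 'website' stays blank.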
POST(
  url = "http://www.worldpop.org.uk/data/download/",
  add_headers(`Referer` = nigeria_pop),
  body = list(
    client_first_name = "",
    client_last_name = "",
    client_organization = "",
    client_country = "",
    client_email = "",
    client_message = "",
    zip_id = fields["zip_id"],
    zip_title = fields["zip_title"],
    decoy = fields["decoy"],
    website = "",
    download = "Browse Individual Files"
  ),
  encode = "form"
) -> res3
Somewhere on the resulting page is a "switch to file list" link, so we need to find it and follow it:
# find the link that has the file list
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
  html_attr("href") -> file_list_query_string # just to see the format
## [1] "?dataset=140&action=dir"
# follow that link (we need to use some of the previous captured fields)
GET(
  url = "http://www.worldpop.org.uk/data/files/index.php",
  query = list(
    dataset = fields["zip_id"],
    action = "dir"
  )
) -> res4
Now we build a data frame of all the download links on that page:
pg4 <- content(res4)
data_frame(
  group_name = html_nodes(pg4, "a.dl") %>% html_text(),
  href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads
## # A tibble: 60 x 2
## group_name href
## <chr> <chr>
## 1 Licence.txt ?dataset=140&action=download&file=1
## 2 NGA_metadata.html ?dataset=140&action=download&file=2
## 3 NGA_pph_v2c_2006.tfw ?dataset=140&action=download&file=3
## 4 NGA_pph_v2c_2006.tif ?dataset=140&action=download&file=4
## 5 NGA_pph_v2c_2006.tif.aux.xml ?dataset=140&action=download&file=5
## 6 NGA_pph_v2c_2006.tif.xml ?dataset=140&action=download&file=6
## 7 NGA_pph_v2c_2010.tfw ?dataset=140&action=download&file=7
## 8 NGA_pph_v2c_2010.tif ?dataset=140&action=download&file=8
## 9 NGA_pph_v2c_2010.tif.aux.xml ?dataset=140&action=download&file=9
## 10 NGA_pph_v2c_2010.tif.xml ?dataset=140&action=download&file=10
## # ... with 50 more rows
As I mentioned before, you may need to always start from the beginning or from the previously linked page, but it is also possible you can just download all of these in sequence. You'll need to test. It's a painful site.
filter(downloads, str_detect(group_name, "README")) %>%
  pull(href) -> readme_query_string # we need this below
readme_query_string
## [1] "?dataset=140&action=download&file=60"
# THERE IS A REALLY GOOD CHANCE YOU'LL NEED TO USE timeout() for
# some of these calls. That server takes a while.
# Right here is where that "preparing the data" modal is shown.
# I'm 99% certain it is there to slow down crawlers/scrapers.
GET(
  url = "http://www.worldpop.org.uk/data/files/index.php",
  query = parse_url(readme_query_string)$query,
  verbose()
) -> res5
This isn't what you'd actually do for real downloads. You likely want content(res5, as="raw") plus writeBin(), since some (most) of the files are not plain text (a bulk-download sketch follows the README output below). But this shows that all of the above works:
content(res5, as="text") %>%
  cat()
## WorldPop Africa dataset details
## _______________________
##
## DATASET: Alpha version 2010, 2015 and 2020 estimates of numbers of people per pixel ('ppp') and people per hectare ('pph'), with national totals adjusted to match UN population division estimates (http://esa.un.org/wpp/) and remaining unadjusted.
## REGION: Africa
## SPATIAL RESOLUTION: 0.000833333 decimal degrees (approx 100m at the equator)
## PROJECTION: Geographic, WGS84
## UNITS: Estimated persons per grid square
## MAPPING APPROACH: Random Forest
## FORMAT: Geotiff (zipped using 7-zip (open access tool): www.7-zip.org)
## FILENAMES: Example - NGA_ppp_v2b_2010_UNadj.tif = Nigeria (NGA) population per pixel (ppp), mapped using WorldPOP modelling version 2b (v2b) for 2010 (2010) adjusted to match UN national estimates (UNadj).
## DATE OF PRODUCTION: February 2017
##
## Also included: (i) Metadata html file, (ii) Google Earth file, (iii) Population datasets produced using original census year data (2006).
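Building on the pieces above, here is that bulk-download sketch (hypothetical and untested against the live site; per the earlier caveat you may need to re-prime the session between files):
library(httr)
library(purrr)
dir.create("worldpop_nga", showWarnings = FALSE)
# Walk the `downloads` data frame, streaming each file straight to disk
walk2(downloads$href, downloads$group_name, function(href, fname) {
  GET(
    url = "http://www.worldpop.org.uk/data/files/index.php",
    query = parse_url(href)$query,
    timeout(300),
    write_disk(file.path("worldpop_nga", fname), overwrite = TRUE)
  )
  Sys.sleep(5)  # be polite; the server is slow anyway
})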
If you stick with this, consider adding an answer with what you ended up doing, or turning it into a package so others can use it as well.
Answer 1 (score: 0)
HEY STACKOVERFLOW DELETERS, look at this one. I would most likely have combined it with my other answer, but I am limited to 30K characters and there is a lot of code.
This can now be downloaded as a GitHub package called spaceheater. Thanks again to @hrbrmstr, without whom this would not have been possible.
library(devtools)
install_github("nbarsch/spaceheater")
I split it into three functions:
getWPdatatypes("country")
getWPoptions("country", "datatype")
getWPdownload("country", "datatype", c("options"), year)
### takes 0, 1 or 2 options; for 0 options pass ""
getWPdatatypes("country")
#Example
getWPdatatypes("Nigeria")
is the base level and returns a data frame of the data types available for that country from WorldPop. Example data types are "Population", "Births", "UrbanChange".
getWPoptions("country", "datatype")
#Example
getWPoptions("Ghana", "Population")
is designed to be run after getWPdatatypes and returns a data frame of all the options and years available for the given data type in that country.
getWPdownload("country", "datatype", c("options"), year)
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
downloads the set, thanks to hrbrmstr. If a set's year is not labeled in the file name, it is tagged 9999; the WorldPop site lists the years. Their file naming scheme is brutal, and most of the code exists to cope with the inconsistent file names.
That said, I love their methodology and the gridded raster sets they have built, so if anyone uses this code, please cite WorldPop. :)
Answer 2 (score: 0)
This is now part of the spaceheater package on GitHub; see my other answer for instructions: https://github.com/nbarsch/spaceheater. The reference code for what I did is below.
Libraries:
library(httr)
library(rvest)
library(tidyverse)
library(raster)
library(rgdal)
library(reshape2)
library(rangeBuilder)
library(stringr)
library(foreach)
library(dplyr)
Functions (because of the character limit I had to strip most of the #notes from the code; see my other answer for the relevant explanations. A large amount of the code is due to the inconsistent file names):
getWPdatatypes:
getWPdatatypes <- function (country) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\\s*\\([^\\)]+\\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
  stop("It appears this country is not in the WorldPop set, please check and try again")
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
print(countryreference[1,])
WPdata.types <<- countryreference[1,]
print("The above table has also been added to your working environment as dataframe: WPdata.types")
}
getWPoptions:
#Example
getWPdatatypes("Nigeria", "Population")
getWPoptions <- function (country, datatype) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\\s*\\([^\\)]+\\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
  stop("It appears this country is not in the WorldPop set, please check and try again")
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
# Some, such as Senegal, are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
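# The char4/char6/char9/char11 probes below test which positions of the first
# file name hold digits, since WorldPop encodes the year at different offsets
# depending on the product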
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
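# getfouryear expands a 2-digit year to 4 digits, pivoting at 50
# (e.g. 15 -> 2015, 85 -> 1985)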
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
###Possible Options due to the inexplicable nature of their inconsistent file names
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", "$1.25/day",
"$2.00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
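# b == 13 is the "200" ($2.00/day) pattern, which false-matches
# file names containing 4-digit years such as 2000 or 2006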
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
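# Shuffle each row so the non-NA matches come first, then drop all-NA columns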
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
###join all the options so they can be displayed
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
print(downpossopt)
WP.options <<- downpossopt
print("The above table has also been added to your working environment as dataframe: WP.options")
}
getWPdownload:
#Example
getWPdownload("Benin", "Pregnancies", "pp", 2015)
# If a set is missing a year, pass 9999; getWPoptions returns 9999 for sets with missing years.
getWPdownload <- function (country, datatype, options, year) {
country <- standardizeCountry(paste(country),fuzzyDist=30)
year <- as.numeric(year)
optionschosen <- paste(options, collapse = '-')
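# Chosen options are joined with '-' so they can be matched against the
# 'optionspossible' strings assembled further down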
res <- GET(url="http://www.worldpop.org.uk/data/data_sources/")
# Get the page contents
pg <- content(res)
summary_link_nodes <- html_nodes(pg, xpath=".//a[contains(@href,'summary')]")
map(summary_link_nodes, html_nodes, xpath=".//../..") %>%
map(html_children) %>%
map(html_text) %>%
map(~.[1:4]) %>%
map(as.list) %>%
map_df(set_names, c("continent", "country", "resolution", "data_type")) %>%
bind_cols(
data_frame(
summary_link = sprintf("http://www.worldpop.org.uk%s", html_attr(summary_link_nodes, "href"))
)
) -> world_pop_data
world_pop_data$data_type <- gsub("Urban change", "UrbanChange", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Maternal and Newborn Health", "MaternalNewbornHealth", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Contraceptive Use", "ContraceptiveUse", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Age structures", "AgeStructures", world_pop_data$data_type)
world_pop_data$data_type <- gsub("Dynamic Population", "DynamicPopulation", world_pop_data$data_type)
countryreference <- as.data.frame(world_pop_data)
countryreference <- countryreference[,c(1,2,4)]
countryreference <- countryreference[!(countryreference$country)=="N/A",]
world_pop_data <- world_pop_data[!(world_pop_data$country)=="N/A",]
###Filter country names so they match the desired country
countryreference$CountryStandard <- standardizeCountry(countryreference[,"country"], fuzzyDist=20)
countryreference$CountryEdit <- gsub("[()]", "", countryreference$country)
countryreference$CountryEdit2 <- gsub("\\s*\\([^\\)]+\\)","",as.character(countryreference$country))
foreach(a=1:nrow(countryreference)) %do% {
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- standardizeCountry(countryreference[a,"CountryEdit2"], fuzzyDist=20)
if(countryreference[a,"CountryStandard"]==""){
countryreference[a,"CountryStandard"] <- toupper(countryreference[a,"country"])
}
}
}
}
exists <- isTRUE(paste(country) %in% as.character(countryreference$CountryStandard))
if(exists==FALSE){
  stop("It appears this country is not in the WorldPop set, please check and try again")
}
countryreference <- countryreference[,c(1,4,3)]
world_pop_data$CountryStandard <- countryreference[,2]
world_pop_data <- world_pop_data[,c(1,6,2,3,4,5)]
countryreference <- suppressMessages(dcast(countryreference, continent+CountryStandard ~ data_type))
countryreference <- filter(countryreference, countryreference$CountryStandard==country)
countryreference <- countryreference[,colSums(is.na(countryreference))<nrow(countryreference)]
world_pop_data <- filter(world_pop_data, CountryStandard %in% countryreference$CountryStandard)
world_pop_data <- filter(world_pop_data, data_type==paste(datatype))
dataset_link <- as.character(world_pop_data[1,"summary_link"])
GET(url=dataset_link) -> res2
pg2 <- content(res2)
fields <- html_nodes(pg2, "form#conform > input")
fields <- set_names(xml_attr(fields, "value"), html_attr(fields, "name"))
POST(
url = "http://www.worldpop.org.uk/data/download/",
add_headers(`Referer` = dataset_link),
body = list(
client_first_name = "",
client_last_name = "",
client_organization = "",
client_country = "",
client_email = "",
client_message = "",
zip_id = fields["zip_id"],
zip_title = fields["zip_title"],
decoy = fields["decoy"],
website = "",
download = "Browse Individual Files"
),
encode = "form"
) -> res3
pg3 <- content(res3)
html_nodes(pg3, xpath=".//a[contains(., 'switch to')]") %>%
html_attr("href") -> file_list_query_string
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = list(
dataset = fields["zip_id"],
action = "dir"
)
) -> res4
pg4 <- content(res4)
data_frame(
group_name = html_nodes(pg4, "a.dl") %>% html_text(),
href = html_nodes(pg4, "a.dl") %>% html_attr("href")
) -> downloads
downloads$istif <- str_sub(downloads$group_name,-4,-1)
# Some, such as Senegal, are inexplicably .TIF
downloads$istif <- tolower(downloads$istif)
downloads <- filter(downloads, istif==".tif")
pg4charfile <- as.character(downloads[1,"group_name"])
pg4charfile <- gsub(' {1,}','',pg4charfile)
if(substr(pg4charfile,1,6)!="popmap"){
if(grepl("\\d", pg4charfile)==TRUE){
char4 <- substr(pg4charfile,4,4)
char6 <-substr(pg4charfile,6,6)
char9 <-substr(pg4charfile,9,9)
char11 <-substr(pg4charfile,11,11)
char4num <- suppressWarnings(!is.na(as.numeric(char4)))
char6num <- suppressWarnings(!is.na(as.numeric(char6)))
char9num <- suppressWarnings(!is.na(as.numeric(char9)))
char11num <- suppressWarnings(!is.na(as.numeric(char11)))
if(char4num==TRUE & char6num==TRUE){
downloads$years <-substr(downloads$group_name,4,7)
}
if(char4num==TRUE & char6num==FALSE){
downloads$years <-substr(downloads$group_name,4,5)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char9num==FALSE & char11num==TRUE){
downloads$years <-substr(downloads$group_name,11,12)
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
if(char4num==FALSE & char6num==FALSE & char9num==TRUE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
if(char4num==FALSE & char6num==FALSE & char9num==FALSE & char11num==FALSE){
downloads$years <- str_extract(downloads$group_name, "\\d{4}")
}
}else{downloads$years <- 9999}
}else{
downloads$years<- as.numeric(substr(downloads$group_name,7,8))
getfouryear <- function (yearsvec) {
yrFlip = 50
yearsvec <- as.numeric(yearsvec)
yearsvec[yearsvec > yrFlip] <- yearsvec[yearsvec > yrFlip] + 1900
yearsvec[yearsvec < yrFlip] <- yearsvec[yearsvec < yrFlip] + 2000
return(yearsvec)
}
downloads$years <- getfouryear(downloads$years)
}
downloads <- downloads[!is.na(downloads$years),]
possopt <- c("_pph_", "_ppp_", "_pp_", "uncert", "adj","_M.", "_M_","_F.", "_F_", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
opttext <- c("Persons per hectare", "Persons per pixel", "per pixel", " uncertainty dataset showing 95% credible intervals",
"adjusted to match UN estimates", "MALE", "MALE", "FEMALE", "FEMALE", "Uncertainty map", "poverty standard deviation map", "$1.25/day",
"$2.00/day", "mean wealth index", "mean likelihood of living in poverty per grid square", "Income estimate USD per grid square", "%poverty by Multidimensional Poverty Index",
"prob of four or more antenatal care visits at time of delivery", "prob of skilled birth attendance during delivery", "prob of postnatal care received within 48 hours of delivery")
possoptdf <- data.frame(possopt, opttext, stringsAsFactors = FALSE)
groupsubstr <- str_sub(downloads$group_name,4,-4)
###get options for each file from the worldpop selected country and datatype###
optionsforchoice<-foreach(a=1:nrow(downloads), .combine=rbind)%do%{
theoptions<- foreach(b=1:length(opttext), .combine=cbind)%do%{
matchoopt <- str_detect(downloads[a,"group_name"],coll(possopt[b]))
if(matchoopt==TRUE){result <-possopt[b]}
if(matchoopt==FALSE){result<- NA}
if(b==13){
mistake <- str_detect(downloads[a,"group_name"],"\\d{4}")
if(mistake==TRUE){result <- NA}
}
result
}
}
optionsforchoice<-do.call(rbind,lapply(1:nrow(optionsforchoice),function(x) t(matrix(optionsforchoice[x,order(is.na(optionsforchoice[x,]))])) ))
optionsforchoice <- as.data.frame(optionsforchoice, stringsAsFactors=FALSE)
optionsforchoice <- optionsforchoice[,colSums(is.na(optionsforchoice))<nrow(optionsforchoice)]
downloads <- cbind(downloads,optionsforchoice)
optiters <- as.data.frame(optionsforchoice)
if(length(optionsforchoice)==1){colnames(downloads)[5]<-"V1"}
if(ncol(as.data.frame(optionsforchoice))==1){colnames(downloads)[5]<-"V1"}
optiters <- ncol(optiters)
foreach(a=1:optiters)%do%{
downloads <- merge(downloads, possoptdf, by.x=paste0("V",a), by.y="possopt", all.x=TRUE)
coltochange <- ncol(downloads)
colnames(downloads)[coltochange] <- paste0("possopt",a)
}
downloads$years <- as.numeric(downloads$years)
downloads <- downloads[order(downloads$years),]
##Subsetting downloads to columns that only contain possopt
downpossopt <- downloads[ , grepl( "possopt" , colnames( downloads ) ) ]
downpossopt <- as.data.frame(downpossopt)
downpossopt$code <- c(1:nrow(downpossopt))
downloads$code <- c(1:nrow(downloads))
rownames(downloads) <- c(1:nrow(downloads))
rownames(downpossopt) <- c(1:nrow(downpossopt))
if(ncol(downpossopt)==1){
colnames(downpossopt) <- "possopt1"}
downpossopt <- cbind(downloads$years, downpossopt)
names(downpossopt)[names(downpossopt) == 'downloads$years'] <- 'years'
names(downpossopt)[names(downpossopt) == 'V1'] <- 'years'
###get the right codes for the function
possoptcodes <- c("pph", "ppp", "pp", "uncert", "adj","M", "M","F", "F", "interdecile", "povsd", "125", "200","wpipov", "ppipov", "incpov", "mpipov", "ANC", "SBA", "PNC")
possoptcodes <- as.data.frame(cbind(possoptcodes, opttext))
possoptcodes <- possoptcodes[c(1:6,8,10:nrow(possoptcodes)),]
names(downpossopt)[names(downpossopt) == 'downpossopt'] <- 'possopt1'
foreach(a=1:optiters)%do%{
downpossopt <- merge(downpossopt, possoptcodes, by.x=paste0("possopt",a), by.y="opttext", all.x=TRUE)
coltochange <- ncol(downpossopt)
colnames(downpossopt)[coltochange] <- paste0("OptionCode",a)
}
downpossopt <- as.data.frame(downpossopt)
downpossopt <- downpossopt[order(downpossopt$code),]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="possopt1"),which(colnames(downpossopt)!="possopt1"))]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="years"),which(colnames(downpossopt)!="years"))]
downpossopt$country <- countryreference[1,"CountryStandard"]
downpossopt <- downpossopt[,c(which(colnames(downpossopt)=="country"),which(colnames(downpossopt)!="country"))]
row.names(downpossopt) <- c(1:nrow(downpossopt))
downpossoptcodes <- downpossopt[ , grepl( "OptionCode" , colnames( downpossopt ) ) ]
downpossoptcodes <- as.data.frame(downpossoptcodes)
if(ncol(downpossoptcodes)>1){
downpossoptcodes <- data.frame(x=apply(downpossoptcodes,1,function(x) {paste(x[!is.na(x)],collapse='-')}))
}
colnames(downpossoptcodes) <- "optionspossible"
downloads <- as.data.frame(cbind(downloads, downpossoptcodes))
downloads <- filter(downloads, downloads$years==year)
if(optionschosen!=""){
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
if(is.na(downloads2[1,1]) & length(options)>1){
optionschosen <- paste0(options[2],"-", options[1], collapse='')
downloads2 <- filter(downloads, downloads$optionspossible==optionschosen)
}
}else{downloads2 <- filter(downloads, is.na(downloads$optionspossible))}
readme_query_stringdownload <- as.character(downloads2[1,"href"])
filenamedest <- as.character(paste0(country,downloads2[1,"group_name"]))
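# Stream the chosen file straight to disk; progress()/verbose() report transfer status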
GET(
url = "http://www.worldpop.org.uk/data/files/index.php",
query = parse_url(readme_query_stringdownload)$query,
progress(),
verbose(),
write_disk(paste0(filenamedest), overwrite=TRUE)
)-> res5
}