注意:IPUMS International 和 IPUMS USA 可能使用相同的系统,而 IPUMS USA 的注册速度更快。如果您想测试代码,请尝试在 https://usa.ipums.org/usa-action/users/request_access 注册!
我正尝试使用 R 语言和 httr 以编程方式从 https://international.ipums.org/ 下载文件。我需要使用 httr 而不是 RCurl,因为我需要在认证之后把大文件直接下载到磁盘,而不是先载入内存(RAM)。据我所知,目前只有 httr 能做到这一点。
以下可重现的代码记录了我为从登录页面(https://international.ipums.org/international-action/users/login)进入登录后的主页面所做的最大努力。任何提示或建议都将不胜感激!谢谢!
# Question script: an attempt to authenticate against IPUMS International with httr.
# Per the author's own notes below, the login does not succeed.

# account credentials (placeholder values)
my_email <- "email@address.com"
my_password <- "password"
# temporary file that response bodies are written to so they can be read back as text
tf <- tempfile()
# use httr, because i need to download a large file after authentication
# and only httr supports that with its `write_disk()` option
library(httr)
# turn off ssl verify, otherwise the subsequent GET command will fail
# NOTE(review): this switches off certificate verification via httr's config -- confirm that is acceptable
set_config( config( ssl_verifypeer = 0L ) )
# request the Shibboleth login endpoint, passing the menu page as the `target` parameter
GET( "https://international.ipums.org/Shibboleth.sso/Login?target=https%3A%2F%2Finternational.ipums.org%2Finternational-action%2Fmenu" )
# connect to the starting login page of the website
# (the outer parentheses print the response; it is also stored in `a` for the cookie lookups below)
( a <- GET( "https://international.ipums.org/international-action/users/login" , verbose( info = TRUE ) ) )
# which takes me through to a lot of websites, but ultimately (in my browser) lands at
shibboleth_url <- "https://live.identity.popdata.org:443/idp/Authn/UserPassword"
# construct authentication information?
# three candidate parameter lists are built:
#   base_values  - username and password only
#   idp_values   - credentials plus the two cookie values recorded in `a` for the live.identity.popdata.org domains
#   ipums_values - credentials plus `_idp_authn_lc_key` and the JSESSIONID recorded for international.ipums.org
base_values <- list( "j_username" = my_email , "j_password" = my_password )
idp_values <- list( "j_username" = my_email , "j_password" = my_password , "_idp_authn_lc_key"=subset( a$cookies , domain == "live.identity.popdata.org" )$value , "JSESSIONID" = subset( a$cookies , domain == "#HttpOnly_live.identity.popdata.org" )$value )
ipums_values <- list( "j_username" = my_email , "j_password" = my_password , "_idp_authn_lc_key"=subset( a$cookies , domain == "live.identity.popdata.org" )$value , "JSESSIONID" = subset( a$cookies , domain == "international.ipums.org" )$value)
# i believe this is where the main login should happen, but it looks like it's failing
# note: the values are passed as URL query parameters (GET) or as the request body (POST),
# not through set_cookies()
GET( shibboleth_url , query = idp_values )
POST( shibboleth_url , body = base_values )
# save the body of another GET to the temp file and read it back as text
writeBin( GET( shibboleth_url , query = idp_values )$content , tf )
readLines( tf )
# the page that comes back contains this message:
# The MPC account authentication system has encountered an error
# This error can sometimes occur if you did not close your browser after logging out of an application previously. It may also occur for other reasons. Please close your browser and try your action again."
# try the SAML2 redirect endpoint with the same values
# note: `tf` is written before the POST below, so the readLines() call shows the GET response
writeBin( GET( "https://live.identity.popdata.org/idp/profile/SAML2/Redirect/SSO" , query = idp_values )$content , tf )
POST( "https://live.identity.popdata.org/idp/profile/SAML2/Redirect/SSO" , body = idp_values )
readLines( tf )
# same error as above
# return to the main login page..
writeBin( GET( "https://international.ipums.org/international-action/menu" , query = ipums_values )$content , tf )
readLines( tf )
# ..not logged in
答案 0 :(得分:2)
您必须使用 set_cookies() 将 Cookie 发送到服务器:
# Answer script: log in to IPUMS International by replaying the cookies from each
# response with set_cookies(), then fetch the menu page to check the result.
library(httr)
library(rvest)
# credentials are expected to be defined already:
#my_email <- "xxx"
#my_password <- "yyy"
# temporary file used to inspect the final page as text
tf <- tempfile()
# NOTE(review): switches off certificate verification via httr's config -- confirm that is acceptable
set_config( config( ssl_verifypeer = 0L ) )
# Get first page
p1 <- GET( "https://international.ipums.org/international-action/users/login" , verbose( info = TRUE ) )
# Post Login credentials
# the form is posted to the URL the first request ended on (p1$url), together with
# the two cookies recorded for the identity-provider domains
b2 <- list( "j_username" = my_email , "j_password" = my_password )
c2 <- c(JSESSIONID=p1$cookies[p1$cookies$domain=="#HttpOnly_live.identity.popdata.org",]$value,
        `_idp_authn_lc_key`=p1$cookies[p1$cookies$domain=="live.identity.popdata.org",]$value)
p2 <- POST(p1$url,body = b2, set_cookies(.cookies = c2), encode="form" )
# Parse hidden fields
h2 <- read_html(p2$content)
form <- h2 %>% html_form()
# Post hidden fields
# the first two fields of the first form are sent back as RelayState and SAMLResponse
b3 <- list( "RelayState"=form[[1]]$fields[[1]]$value, "SAMLResponse"=form[[1]]$fields[[2]]$value)
c3 <- c(JSESSIONID=p1$cookies[p1$cookies$domain=="#HttpOnly_live.identity.popdata.org",]$value,
        `_idp_session`=p2$cookies[p2$cookies$name=="_idp_session",]$value,
        `_idp_authn_lc_key`=p2$cookies[p2$cookies$name=="_idp_authn_lc_key",]$value)
p3 <- POST( form[[1]]$url , body=b3, set_cookies(.cookies = c3), encode = "form")
# Get interesting page
# The row mask must be elementwise (`&`) and built entirely from p3$cookies, the
# table being indexed. The scalar `&&` used previously is an error for vectors on
# R >= 4.3 and, before that, only looked at the first element of each side; it
# also tested the domain column of p1 rather than p3.
is_site_session <- p3$cookies$domain=="international.ipums.org" & p3$cookies$name=="JSESSIONID"
c4 <- c(JSESSIONID=p3$cookies[is_site_session,]$value,
        `_idp_session`=p3$cookies[p3$cookies$name=="_idp_session",]$value,
        `_idp_authn_lc_key`=p3$cookies[p3$cookies$name=="_idp_authn_lc_key",]$value)
p4 <- GET( "https://international.ipums.org/international-action/menu", set_cookies(.cookies = c4) )
# write the page to disk and print line 55 of it
writeBin(p4$content , tf )
readLines( tf )[55]
由于结果是
[1] " <li class=\"lastItem\"><a href=\"/international-action/users/logout\">Logout</a></li>"
我认为您已登录...
答案 1 :(得分:2)
@HubertL 的回答朝着正确的方向迈出了很多步,但我认为他的答案并不完整。
首先,在实施自动Web授权时要考虑的主要问题是在“正常”手动工作流程中使用的cookie。您可以在任何现代浏览器中使用开发工具轻松监视它们:
在这里,我们看到 JSESSIONID 和 _shibsession* 两个 Cookie:第一个包含网站的 JSP 会话 ID,第二个很可能仅用于 Shibboleth 授权。服务器可能会以某种方式把它们绑定在一起,但获取 JSESSIONID 并不需要授权,打开网站后即可立即得到它。因此,我们必须获取 _shibsession* Cookie,才能让 JSESSIONID 获得授权。这正是 Shibboleth 带有多次重定向的授权过程的意义所在。请参阅代码中的注释。
# Log in to IPUMS USA through its Shibboleth sign-on flow and collect the
# cookies of the final response.
#
# Arguments:
#   user     - account name, posted as the `j_username` form field
#   password - account password, posted as the `j_password` form field
#
# Returns (invisibly, as before) a named character vector built from the cookie
# table of the last response: values are cookie values, names are cookie names.
# Pass it to set_cookies(.cookies = ...) on later requests.
#
# Stops with an error if a required package is missing or if the response to the
# credentials does not contain the expected hidden form.
login_ipums <- function(user, password)
{
  # library() rather than require(): a missing package stops here with a clear
  # error instead of returning FALSE and failing later. The packages are still
  # attached, so code that runs after this function can keep calling GET() etc.
  library(httr)
  library(rvest)
  # NOTE(review): switches off certificate verification via httr's config -- confirm that is acceptable
  set_config(config(ssl_verifypeer = 0L))

  #important - httr preserves cookies on subsequent requests to the same host, we don't need that because of sessions expiration
  handle_reset("https://usa.ipums.org/")

  #set login and password
  login1 <- GET("https://usa.ipums.org/usa-action/users/login")
  form_auth <- list("j_username" = user, "j_password" = password)
  l1_cookies <- login1$cookies$value
  names(l1_cookies) <- login1$cookies$name

  #receive auth tokens as html hidden fields in a form
  login2 <- POST(login1$url, body = form_auth,
                 set_cookies(.cookies = l1_cookies), encode = "form")
  login2_form <- read_html(login2$content) %>% html_form()
  # without a form there is nothing to submit back; fail with a readable message
  # instead of a subscript error on login2_form[[1]]
  if (length(login2_form) == 0L) {
    stop("login failed: the response to the credentials contains no form",
         call. = FALSE)
  }
  relay_state <- login2_form[[1]]$fields$RelayState$value
  saml_response <- login2_form[[1]]$fields$SAMLResponse$value
  if (is.null(relay_state) || is.null(saml_response)) {
    stop("login failed: RelayState/SAMLResponse fields not found in the form",
         call. = FALSE)
  }
  l2_cookies <- login2$cookies$value
  names(l2_cookies) <- login2$cookies$name

  #submit the form back (browser submits it back automatically with JS)
  login3 <- POST(login2_form[[1]]$url,
                 body = list(RelayState = relay_state,
                             SAMLResponse = saml_response),
                 set_cookies(.cookies = l2_cookies),
                 encode = "form")

  #now we have what we came for - _shibsession_* and JSESSION id cookie
  login_cookies <- login3$cookies$value
  names(login_cookies) <- login3$cookies$name
  # the old last line, `return=login_cookies`, only assigned to a variable called
  # `return`; return the vector explicitly, and invisibly so the session cookies
  # are not echoed to the console
  invisible(login_cookies)
}
调用 login_ipums 后,我们将得到以下 Cookie:
> cookies=login_ipums(my_email, my_password)
> names(cookies)
[1] "JSESSIONID"
[2] "_idp_authn_lc_key"
[3] "_shibsession_7573612e69..."
其中,JSESSIONID 和 _shibsession_* 用于整个网站范围的授权。_idp_authn_lc_key 可能并不需要,但保留它也无妨。
现在,您可以轻松下载这样的文件:
# log in and keep the cookies the login function hands back
cookies <- login_ipums(my_email, my_password)
# request the extract with those cookies attached; write_disk() sends the body
# to "file.bin", overwriting any existing file of that name
target <- GET(
  "https://usa.ipums.org/usa-action/downloads/extract_files/usa_00001.dat.gz",
  set_cookies(.cookies = cookies),
  write_disk("file.bin", overwrite = TRUE)
)
重要提示:如您所见,我使用的是 IPUMS USA,而不是 IPUMS International。要用您自己的帐户测试这段代码,请将所有的 usa 替换为 international,包括网址中的 *-action 部分。