我想使用 Apache HttpClient API 登录网站 rfp.ca 并检索其 HTML 内容,以下是我的代码。
运行后,它只返回登录页面的源代码:http://www.rfp.ca/login/ 仍然要求提供凭据,我希望将其重定向到http://www.rfp.ca/my_account/,就像使用浏览器登录一样。
关于如何实现这一点的任何想法?
提前致谢。
埃里克
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;
/**
 * Small command-line probe that fetches the rfp.ca login page, posts a
 * username/password form to the same URL, and prints the status lines, the
 * response body of the POST, and the cookies held by the client before and
 * after the POST.
 */
public class HttpGetter {

    /** URL used both to fetch the login form and to submit the credentials. */
    private static final String LOGIN_URL = "http://www.rfp.ca/login/";

    /**
     * Runs the GET-then-POST login sequence and dumps the results to standard output.
     *
     * @param args unused
     * @throws Exception if any HTTP exchange or stream read fails; the
     *         connection manager is shut down before the exception propagates
     */
    public static void main(String[] args) throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            // Step 1: fetch the login page so the client's cookie store picks up
            // whatever cookies the server sets on that response.
            HttpGet httpget = new HttpGet(LOGIN_URL);
            HttpResponse response = httpclient.execute(httpget);
            HttpEntity entity = response.getEntity();
            System.out.println("Login form get: " + response.getStatusLine());
            if (entity != null) {
                // Drain the body so the connection can be reused for the POST.
                entity.consumeContent();
            }

            System.out.println("Initial set of cookies:");
            printCookies(httpclient.getCookieStore().getCookies());

            // Step 2: submit the credentials as a URL-encoded form.
            // NOTE(review): only username and password are posted here. If the
            // login form declares additional (e.g. hidden) inputs, the server may
            // require them too - confirm against what a browser actually sends.
            HttpPost httpost = new HttpPost(LOGIN_URL);
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            nvps.add(new BasicNameValuePair("username", "myusername"));
            nvps.add(new BasicNameValuePair("password", "mypassword"));
            httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));

            response = httpclient.execute(httpost);
            System.out.println("Response " + response.toString());
            entity = response.getEntity();
            System.out.println("Login form get: " + response.getStatusLine());
            if (entity != null) {
                printBody(entity);
            }

            System.out.println("Post logon cookies:");
            printCookies(httpclient.getCookieStore().getCookies());
        } finally {
            // Release pooled connections even when a request or read fails.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /** Prints each cookie prefixed with "- ", or "None" when the list is empty. */
    private static void printCookies(List<Cookie> cookies) {
        if (cookies.isEmpty()) {
            System.out.println("None");
            return;
        }
        for (Cookie cookie : cookies) {
            System.out.println("- " + cookie.toString());
        }
    }

    /** Prints the entity body line by line and closes the underlying stream. */
    private static void printBody(HttpEntity entity) throws Exception {
        InputStream is = entity.getContent();
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the body is UTF-8 - confirm against the
        // response's Content-Type header.
        BufferedReader br = new BufferedReader(new InputStreamReader(is, HTTP.UTF_8));
        try {
            String line;
            while ((line = br.readLine()) != null) {
                System.out.println(line);
            }
        } finally {
            // Closing the reader closes the content stream and frees the connection.
            br.close();
        }
    }
}
这是返回的结果:
Login form get: HTTP/1.1 200 OK
Initial set of cookies:
- [version: 0][name: PHPSESSID][value: f4dc36acc705b31b15b4ea07a398a60b][domain: www.rfp.ca][path: /][expiry: null]
Response HTTP/1.1 200 OK [Date: Sat, 17 Mar 2012 04:04:49 GMT, Server: Apache/2.2.22 (Unix) mod_ssl/2.2.22 OpenSSL/1.0.0-fips mod_auth_passthrough/2.1 mod_bwlimited/1.4 FrontPage/5.0.2.2635, X-Powered-By: PHP/5.2.17, Expires: Thu, 19 Nov 1981 08:52:00 GMT, Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0, Pragma: no-cache, Vary: Accept-Encoding,User-Agent, Connection: close, Transfer-Encoding: chunked, Content-Type: text/html;charset=utf-8]
Login form get: HTTP/1.1 200 OK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" lang="en-US">
<head>
<meta name="keywords" content="" />
<meta http-equiv="Content-Type" content="text/html charset=utf-8"/>
<title>RFP.ca: User login form</title>
<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/templates/_system/main/images/css/form.css" />
<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/templates/rfp/main/images/design.css" />
<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">
<link rel="icon" href="/favicon.ico" type="image/x-icon">
<link rel="stylesheet" href="http://www.rfp.ca/system/lib/rating/style.css" type="text/css" />
<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/system/ext/jquery/css/jquery-ui.css" />
<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery.js"></script>
<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery-ui.js"></script>
<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery.validate.min.js"></script>
<style type="text/css">
*html img,
*html.png
{
azimuth: expression(
this.pngSet?
this.pngSet=true :
(this.nodeName == "IMG" ?
(this.src.toLowerCase().indexOf('.png')>-1 ?
(this.runtimeStyle.backgroundImage = "none", this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.src + "', sizingMethod='image')",
this.src = "http://www.rfp.ca/templates/rfp/main/images/blank.gif") :
'') :
(this.currentStyle.backgroundImage.toLowerCase().indexOf('.png')>-1) ?
(this.origBg = (this.origBg) ?
this.origBg :
this.currentStyle.backgroundImage.toString().replace('url("','').replace('")',''),
this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.origBg + "', sizingMethod='crop')",
this.runtimeStyle.backgroundImage = "none") :
''
), this.pngSet=true
);
}
</style>
<script type="text/javascript">
var _gaq = _gaq || [];
_gaq.push(['_setAccount', 'UA-254707-12']);
_gaq.push(['_trackPageview']);
(function() {
var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
})();
</script>
</head>
<body>
<div id="messageBox"></div>
<div class="MainDiv">
<div class="headerPage">
<div class="logo">
<div class="png"></div>
<a href="http://www.rfp.ca/"><img src="http://www.rfp.ca/templates/rfp/main/images/logo.png" border="0" alt="" title="" /></a>
</div>
<div class="userMenu">
<a href="http://www.rfp.ca/" title="RFP Home"> Home</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" />
<a href="http://www.rfp.ca/find_rfps/" title="Search">Search</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" />
<a href="http://www.rfp.ca/rfp_alerts/?action=new" title="E-mail Alert">E-mail Alert</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" />
<a href="http://www.rfp.ca/contact/" title="Contact">Contact</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" />
<a href="http://www.rfp.ca/login/" title="Sign In"> Sign In</a>
<br/><br/>
<!--
<form id="langSwitcherForm" method="get" action="">
<select name="lang" onchange="location.href='http://www.rfp.ca/login/?lang='+this.value+'&'" style="width: 200px;">
<option value="de">Deutsch</option>
<option value="tr">Türkçe</option>
<option value="ps">پښتو</option>
<option value="fr">Français</option>
<option value="ar">العربية</option>
<option value="en" selected="selected">English</option>
<option value="fa">فارسی</option>
<option value="ja">日本語</option>
<option value="es">Español</option>
<option value="nl">Nederlands</option>
<option value="ru">Русский</option>
<option value="pt">Português</option>
</select>
</form>
-->
</div>
</div>
<div class="clr"><br /></div>
<div class="indexDiv" >
<h1>Sign In</h1>
<p style="color:#9B9B9B"><i>Tip: Username is your e-mail address</i></p>
<form action="http://www.rfp.ca/login/" method="post" id="loginForm" >
<input type="hidden" name="return_url" value="" />
<input type="hidden" name="action" value="login" />
<fieldset>
<div class="inputName">Username</div>
<div class="inputField"><input type="text" class="logInNameInput" name="username" /></div>
</fieldset>
<fieldset>
<div class="inputName">Password</div>
<div class="inputField"><input class="logInPassInput2" type="password" name="password" /></div>
</fieldset>
<fieldset>
<div class="inputName"> </div>
<div class="inputField"><input type="checkbox" name="keep" /> Keep me signed in</div>
</fieldset>
<fieldset>
<div class="inputName"> </div>
<div class="inputField"><input type="submit" value="Login" class="button" /></div>
</fieldset>
</form>
<br/>
<a href="http://www.rfp.ca/password_recovery/">Forgot Your Password?</a> | <a href="http://www.rfp.ca/registration/">Subscription</a>
</div>
<div id="grayBgBanner"></div>
<div class="clr"><br /></div>
<div class="bottomMenu">
<a href="http://www.rfp.ca/">Home</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/faq/"> About Us/FAQ</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/features/"> Features</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/contact/" >Contact</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/privacy_policy/">Privacy Policy</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.rfp.ca/terms_of_use/">Terms of use</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
© 2012 Organized Media <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
<a href="http://www.twitter.com/rfpca" rel="me" target="_blank"><img src="http://www.rfp.ca/templates/rfp/main/images/twitter.gif" border="0" alt="Twitter"></a><a href="http://www.facebook.com/pages/RFPca/164233376967738" rel="me" target="_blank"><img src="http://www.rfp.ca/templates/rfp/main/images/facebook.gif" border="0" alt="Facebook"></a>
</div>
</div>
<div class="Footer">
</div>
</body>
</html>
Post logon cookies:
- [version: 0][name: PHPSESSID][value: f4dc36acc705b31b15b4ea07a398a60b][domain: www.rfp.ca][path: /][expiry: null]
我尝试添加这两个参数:
nvps.add(new BasicNameValuePair("return_url", "http://www.rfp.ca/my_account/"));
nvps.add(new BasicNameValuePair("action", "login"));
然后发现了一些错误:
Login form get: HTTP/1.1 200 OK
Initial set of cookies:
- [version: 0][name: PHPSESSID][value: e76f3b507a3db64cf1d4ad2297fb0c58][domain: www.rfp.ca][path: /][expiry: null]
Exception in thread "main" org.apache.http.client.ClientProtocolException
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:822)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:754)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:732)
at Crawler.HttpGetter.main(HttpGetter.java:203)
Caused by: org.apache.http.ProtocolException: Invalid redirect URI: ?Ûiÿü0·éq¯æɧ¢éí
at org.apache.http.impl.client.DefaultRedirectStrategy.createLocationURI(DefaultRedirectStrategy.java:185)
at org.apache.http.impl.client.DefaultRedirectStrategy.getLocationURI(DefaultRedirectStrategy.java:116)
at org.apache.http.impl.client.DefaultRedirectStrategy.getRedirect(DefaultRedirectStrategy.java:193)
at org.apache.http.impl.client.DefaultRequestDirector.handleResponse(DefaultRequestDirector.java:1035)
at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:492)
at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:820)
... 3 more
Caused by: java.net.URISyntaxException: Illegal character in path at index 0: ?Ûiÿü0·éq¯æɧ¢éí
at java.net.URI$Parser.fail(URI.java:2809)
at java.net.URI$Parser.checkChars(URI.java:2982)
at java.net.URI$Parser.parseHierarchical(URI.java:3066)
at java.net.URI$Parser.parse(URI.java:3024)
at java.net.URI.<init>(URI.java:578)
at org.apache.http.impl.client.DefaultRedirectStrategy.createLocationURI(DefaultRedirectStrategy.java:183)
... 8 more
答案 0 :(得分:4)
我不确定这一点,但从我在表格中看到的情况来看,它会有以下参数
return_url:
action:login
username:myusername
password:mypassword
而您在 POST 请求中没有提供前两个参数(return_url 和 action)。
更新:在这种情况下,获取正确参数的最佳方法是在浏览器中打开该 URL,然后用 Firebug 或开发人员工具(WebKit)监控网络活动。这样就能看到您需要以编程方式发送的内容。