apache HttpClient,基于表单的登录,并检索HTML内容

时间:2012-03-17 04:18:25

标签: java apache httpclient

我想使用 Apache HttpClient API 登录网站 rfp.ca 并检索其 HTML 内容,下面是我的代码。

运行后,它只返回登录页面(http://www.rfp.ca/login/)的源代码,页面仍然要求提供凭据;我希望它能像在浏览器中登录那样,被重定向到 http://www.rfp.ca/my_account/。

关于如何实现这一点的任何想法?

提前致谢。

埃里克

import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.cookie.Cookie;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.protocol.HTTP;



/**
 * Small command-line demo of a form-based login with Apache HttpClient 4.x.
 *
 * <p>The program fetches the login page (so the server can set its session
 * cookie), posts the credentials to the same URL, prints the body of the POST
 * response, and dumps the cookie store before and after the POST.
 */
public class HttpGetter {

    /** URL used both to fetch the login form and to submit the credentials. */
    private static final String LOGIN_URL = "http://www.rfp.ca/login/";

    /**
     * Runs the GET-then-POST login sequence and prints the results to stdout.
     *
     * @param args ignored
     * @throws Exception if a request fails or the response body cannot be read
     */
    public static void main(String[] args) throws Exception {
        DefaultHttpClient httpclient = new DefaultHttpClient();
        try {
            // Step 1: plain GET of the login page; only the status line and
            // the cookies it sets are of interest, so the body is discarded.
            HttpResponse response = httpclient.execute(new HttpGet(LOGIN_URL));
            HttpEntity entity = response.getEntity();

            System.out.println("Login form get: " + response.getStatusLine());
            if (entity != null) {
                // Discard the body so the connection can be reused for the POST.
                entity.consumeContent();
            }
            printCookies("Initial set of cookies:", httpclient.getCookieStore().getCookies());

            // Step 2: submit the credentials as a URL-encoded form.
            // NOTE(review): the login form returned by the server also carries
            // hidden "return_url" and "action" inputs that are not sent here;
            // confirm the exact parameter set with a browser network monitor.
            HttpPost httpost = new HttpPost(LOGIN_URL);
            List<NameValuePair> nvps = new ArrayList<NameValuePair>();
            nvps.add(new BasicNameValuePair("username", "myusername"));
            nvps.add(new BasicNameValuePair("password", "mypassword"));
            httpost.setEntity(new UrlEncodedFormEntity(nvps, HTTP.UTF_8));

            response = httpclient.execute(httpost);
            System.out.println("Response " + response.toString());
            entity = response.getEntity();

            System.out.println("Login form get: " + response.getStatusLine());
            if (entity != null) {
                InputStream is = entity.getContent();
                // Decode explicitly as UTF-8 instead of the platform default
                // charset; the recorded response declares charset=utf-8.
                BufferedReader br = new BufferedReader(new InputStreamReader(is, HTTP.UTF_8));
                try {
                    String line;
                    while ((line = br.readLine()) != null) {
                        System.out.println(line);
                    }
                } finally {
                    // Closing the reader closes the content stream as well.
                    br.close();
                }
            }

            printCookies("Post logon cookies:", httpclient.getCookieStore().getCookies());
        } finally {
            // Always release the connection manager, even when a request throws.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Prints a heading followed by one line per cookie, or "None" when the
     * list is empty.
     *
     * @param heading line printed before the cookies
     * @param cookies cookies to print
     */
    private static void printCookies(String heading, List<Cookie> cookies) {
        System.out.println(heading);
        if (cookies.isEmpty()) {
            System.out.println("None");
            return;
        }
        for (Cookie cookie : cookies) {
            System.out.println("- " + cookie.toString());
        }
    }
}

这是返回的结果:

    Login form get: HTTP/1.1 200 OK
Initial set of cookies:
- [version: 0][name: PHPSESSID][value: f4dc36acc705b31b15b4ea07a398a60b][domain: www.rfp.ca][path: /][expiry: null]
Response HTTP/1.1 200 OK [Date: Sat, 17 Mar 2012 04:04:49 GMT, Server: Apache/2.2.22 (Unix) mod_ssl/2.2.22 OpenSSL/1.0.0-fips mod_auth_passthrough/2.1 mod_bwlimited/1.4 FrontPage/5.0.2.2635, X-Powered-By: PHP/5.2.17, Expires: Thu, 19 Nov 1981 08:52:00 GMT, Cache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0, Pragma: no-cache, Vary: Accept-Encoding,User-Agent, Connection: close, Transfer-Encoding: chunked, Content-Type: text/html;charset=utf-8]
Login form get: HTTP/1.1 200 OK
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"

    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" lang="en-US">

  <head>

<meta name="keywords" content="" />

<meta http-equiv="Content-Type" content="text/html charset=utf-8"/>     

<title>RFP.ca: User login form</title>

<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/templates/_system/main/images/css/form.css" />

<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/templates/rfp/main/images/design.css" />


<link rel="shortcut icon" href="/favicon.ico" type="image/x-icon">

<link rel="icon" href="/favicon.ico" type="image/x-icon">

<link rel="stylesheet" href="http://www.rfp.ca/system/lib/rating/style.css" type="text/css" />

<link rel="StyleSheet" type="text/css" href="http://www.rfp.ca/system/ext/jquery/css/jquery-ui.css"  />

<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery.js"></script>

<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery-ui.js"></script>

<script language="JavaScript" type="text/javascript" src="http://www.rfp.ca/system/ext/jquery/jquery.validate.min.js"></script>






<style type="text/css">

*html img,

*html.png

{

  azimuth: expression(

    this.pngSet?

      this.pngSet=true : 

        (this.nodeName == "IMG" ? 

          (this.src.toLowerCase().indexOf('.png')>-1 ? 

            (this.runtimeStyle.backgroundImage = "none", this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.src + "', sizingMethod='image')",

                this.src = "http://www.rfp.ca/templates/rfp/main/images/blank.gif") :

            '') :          

          (this.currentStyle.backgroundImage.toLowerCase().indexOf('.png')>-1) ?

            (this.origBg = (this.origBg) ? 

              this.origBg :             

              this.currentStyle.backgroundImage.toString().replace('url("','').replace('")',''),

              this.runtimeStyle.filter = "progid:DXImageTransform.Microsoft.AlphaImageLoader(src='" + this.origBg + "', sizingMethod='crop')",

              this.runtimeStyle.backgroundImage = "none") :

            ''

        ), this.pngSet=true

  );

}

</style>



<script type="text/javascript">

  var _gaq = _gaq || [];
  _gaq.push(['_setAccount', 'UA-254707-12']);
  _gaq.push(['_trackPageview']);

  (function() {
    var ga = document.createElement('script'); ga.type = 'text/javascript'; ga.async = true;
    ga.src = ('https:' == document.location.protocol ? 'https://ssl' : 'http://www') + '.google-analytics.com/ga.js';
    var s = document.getElementsByTagName('script')[0]; s.parentNode.insertBefore(ga, s);
  })();

</script>

    </head>

<body>

<div id="messageBox"></div>

<div class="MainDiv">
    <div class="headerPage">
        <div class="logo">
            <div class="png"></div>
            <a href="http://www.rfp.ca/"><img src="http://www.rfp.ca/templates/rfp/main/images/logo.png" border="0" alt="" title="" /></a>
        </div>
        <div class="userMenu">
                            <a href="http://www.rfp.ca/" title="RFP Home"> Home</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;  
                <a href="http://www.rfp.ca/find_rfps/" title="Search">Search</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;  
                <a href="http://www.rfp.ca/rfp_alerts/?action=new" title="E-mail Alert">E-mail Alert</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;  
                <a href="http://www.rfp.ca/contact/" title="Contact">Contact</a> &nbsp; &nbsp; <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="" /> &nbsp; &nbsp;  
                <a href="http://www.rfp.ca/login/" title="Sign In"> Sign In</a>
                        <br/><br/>
<!--            
            <form id="langSwitcherForm" method="get" action="">
                <select name="lang" onchange="location.href='http://www.rfp.ca/login/?lang='+this.value+'&amp;'" style="width: 200px;">
                                            <option value="de">Deutsch</option>
                                            <option value="tr">Türkçe</option>
                                            <option value="ps">پښتو</option>
                                            <option value="fr">Français</option>
                                            <option value="ar">العربية</option>
                                            <option value="en" selected="selected">English</option>
                                            <option value="fa">فارسی</option>
                                            <option value="ja">日本語</option>
                                            <option value="es">Español</option>
                                            <option value="nl">Nederlands</option>
                                            <option value="ru">–†—É—Å—Å–∫–∏–π</option>
                                            <option value="pt">Português</option>
                                    </select>
            </form>
-->

        </div>
    </div>
    <div class="clr"><br /></div>

<div class="indexDiv"   >







    <h1>Sign In</h1>


    <p style="color:#9B9B9B"><i>Tip: Username is your e-mail address</i></p>

    <form action="http://www.rfp.ca/login/" method="post" id="loginForm" >

        <input type="hidden" name="return_url" value="" />

        <input type="hidden" name="action" value="login" />


        <fieldset>

            <div class="inputName">Username</div>

            <div class="inputField"><input type="text" class="logInNameInput" name="username" /></div>

        </fieldset>

        <fieldset>

            <div class="inputName">Password</div>

            <div class="inputField"><input class="logInPassInput2" type="password" name="password" /></div>

        </fieldset>

        <fieldset>

            <div class="inputName">&nbsp;</div>

            <div class="inputField"><input type="checkbox" name="keep" /> Keep me signed in</div>

        </fieldset>

        <fieldset>

            <div class="inputName">&nbsp;</div>

            <div class="inputField"><input type="submit" value="Login" class="button" /></div>

        </fieldset>

    </form>

    <br/>

    <a  href="http://www.rfp.ca/password_recovery/">Forgot Your Password?</a>&nbsp;|&nbsp; <a href="http://www.rfp.ca/registration/">Subscription</a>



</div>

<div id="grayBgBanner"></div>

    <div class="clr"><br /></div>
    <div class="bottomMenu">
        <a href="http://www.rfp.ca/">Home</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
        <a href="http://www.rfp.ca/faq/"> About Us/FAQ</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt=""> 
        <a href="http://www.rfp.ca/features/"> Features</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt=""> 
        <a href="http://www.rfp.ca/contact/" >Contact</a>  <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">
        <a href="http://www.rfp.ca/privacy_policy/">Privacy Policy</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">         
        <a href="http://www.rfp.ca/terms_of_use/">Terms of use</a> <img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt=""> 
        &nbsp;&copy; 2012 Organized Media &nbsp;<img src="http://www.rfp.ca/templates/rfp/main/images/sepDot.png" border="0" alt="">    
        <a href="http://www.twitter.com/rfpca" rel="me" target="_blank"><img src="http://www.rfp.ca/templates/rfp/main/images/twitter.gif" border="0" alt="Twitter"></a><a href="http://www.facebook.com/pages/RFPca/164233376967738" rel="me" target="_blank"><img src="http://www.rfp.ca/templates/rfp/main/images/facebook.gif" border="0" alt="Facebook"></a>       
        </div>
</div>
<div class="Footer">
</div>
</body>

</html>

Post logon cookies:
- [version: 0][name: PHPSESSID][value: f4dc36acc705b31b15b4ea07a398a60b][domain: www.rfp.ca][path: /][expiry: null]

我尝试添加这两个参数:

nvps.add(new BasicNameValuePair("return_url", "http://www.rfp.ca/my_account/"));
nvps.add(new BasicNameValuePair("action", "login"));

然后运行时出现了以下异常:

Login form get: HTTP/1.1 200 OK
Initial set of cookies:
- [version: 0][name: PHPSESSID][value: e76f3b507a3db64cf1d4ad2297fb0c58][domain: www.rfp.ca][path: /][expiry: null]
Exception in thread "main" org.apache.http.client.ClientProtocolException
    at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:822)
    at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:754)
    at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:732)
    at Crawler.HttpGetter.main(HttpGetter.java:203)
Caused by: org.apache.http.ProtocolException: Invalid redirect URI: ?Ûiÿü0·éq¯æɧ¢éí
    at org.apache.http.impl.client.DefaultRedirectStrategy.createLocationURI(DefaultRedirectStrategy.java:185)
    at org.apache.http.impl.client.DefaultRedirectStrategy.getLocationURI(DefaultRedirectStrategy.java:116)
    at org.apache.http.impl.client.DefaultRedirectStrategy.getRedirect(DefaultRedirectStrategy.java:193)
    at org.apache.http.impl.client.DefaultRequestDirector.handleResponse(DefaultRequestDirector.java:1035)
    at org.apache.http.impl.client.DefaultRequestDirector.execute(DefaultRequestDirector.java:492)
    at org.apache.http.impl.client.AbstractHttpClient.execute(AbstractHttpClient.java:820)
    ... 3 more
Caused by: java.net.URISyntaxException: Illegal character in path at index 0: ?Ûiÿü0·éq¯æɧ¢éí
    at java.net.URI$Parser.fail(URI.java:2809)
    at java.net.URI$Parser.checkChars(URI.java:2982)
    at java.net.URI$Parser.parseHierarchical(URI.java:3066)
    at java.net.URI$Parser.parse(URI.java:3024)
    at java.net.URI.<init>(URI.java:578)
    at org.apache.http.impl.client.DefaultRedirectStrategy.createLocationURI(DefaultRedirectStrategy.java:183)
    ... 8 more

1 个答案:

答案 0 :(得分:4)

我不太确定,但从登录表单的内容来看,它应该需要提交以下参数:

return_url:
action:login
username:myusername
password:mypassword

并且您未在POST请求中提供前两个。

更新:在这种情况下,获得正确参数的最佳方法是在浏览器中打开该 URL,然后用 Firebug 或开发人员工具(WebKit)监控网络活动。这样就能看到浏览器实际发送的内容,也就是您需要在程序中发送的内容。