我正在努力搜索一个需要验证才能访问有趣信息的网站。我按照指示here但由于某种原因,我的蜘蛛没有通过登录表单页面。
编辑:使用的Scrapy版本:1.0.1
这是我蜘蛛的代码:
# -*- coding: utf-8 -*-
import time
import re
import scrapy
from SST.items import WebsiteItem
from scrapy.spiders.init import InitSpider
class WebsiteSpider(InitSpider):
name = 'WebOfScience'
login_page = "https://website.com/userLogin.do"
search_page = "http://apps.website.com"
start_urls = ["http://apps.website.com"]
def __init__(self, username="", password="", *args, **kwargs):
"""
"""
super(scrapy.Spider, self).__init__(*args, **kwargs)
self.http_user = username
self.http_pass = password
def init_request(self):
"""We initialize the spider by logging in."""
return scrapy.Request(url=self.login_page,
callback=self.login, dont_filter=True)
def login(self, response):
"""This function takes care of the login form"""
request = scrapy.FormRequest.\
from_response(response,
formdata={'username': self.http_user,
'password': self.http_pass},
callback=self.check_login_response
)
return request
def check_login_response(self, response):
"""This function checks the login was successfull"""
# Sleep 10s to make sure all the redirections are done.
time.sleep(10)
if re.search("loginFailed", response.url):
scrapy.log("Login has failed.")
# Otherwise, we should be on the right page to start crawling.
else:
# print response.body
if "Basic Search" in response.body:
scrapy.log("Crawling starts now !")
return scrapy.Request(callback=self.parse)
def parse(self, response):
"""We start parsing the results."""
links = response.xpath(restrict_xpaths=("//a[contains('RECORD',id)]/value"))
for link in links:
# do whatever we have to do
编辑:该程序从不输出"登录失败"或者"现在开始爬行!"。这是蜘蛛关闭的地方。
**编辑:**以下是我试图填写表格的代码。
<form name="userLoginForm" method="POST" onsubmit="return RegisterUserLogin()">
<p> </p>
<div align="center">
<table width="31%" border="0">
<tbody><tr>
<td width="53%" nowrap align="right">
<p align="right">Enter your <A HREF="../html/help.htm#ID Number" title="User ID">User ID</A>:
</p></td>
<td width="50%" align="center">
<INPUT class="inputInput" id="j_username" name="j_username" TYPE=TEXT SIZE="20" value="" maxlength="100">
</td>
</tr>
<tr>
<td width="53%" nowrap align="right">
<p align="right">
Enter your <A HREF="../html/help.htm#PW" title="Password">Password</A>:
</p></td>
<td width="50%" align="center">
<INPUT class="inputInput" id="j_password" name="j_password" TYPE=PASSWORD SIZE="20">
</td>
</tr>
<tr>
<td width="103%" colspan="2" align="center">
<p align="center"><font size="2"><input id="rememberme" name="rememberme" type="checkbox" value="ON" title="Remember Password"> Remember Password</font></p>
<p align="center"><a href="forgotPassword.do" TITLE="Forgot My Password"><font size="2">Forgot My Password</font></a>
</p></td>
</tr>
</tbody></table>
<input id="j_auth_type" name="j_auth_type" type="hidden" value="UNP" />
</div>
<p></p>
<center><input value="Submit" type="Submit" title="Submit" >
<input type="button" value="Clear" title="Clear the form" onclick="clearValues();">
<input id="userType" name="userType" type="hidden" value="user" />
<p></p>
</center>
<p><center>
[<a href="../html/custsupp.htm" title="Support">Support</a>]
[<a href="../html/help.htm" title="Help">Help</a>]
</center>
</p>
<hr>
</form>
为什么没有使用Scrapy进行重定向的任何想法?
谢谢!