I'm trying to use Scrapy to log in to a website during the spider's initialization, and once the login is confirmed, initialize and start the standard crawl through start_urls. I don't know what's going wrong: as far as I can tell the login works and everything is confirmed, but parse_item is never called. Any help would be appreciated.
I do reach "=========Successfully logged in.=========",
but
I never get to "==============PARSE ITEM==========================".
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from selenium import webdriver
class ProductDetailsSpider(InitSpider):
    name = 'product_details_spider'
    allowed_domains = ['my_domain.com']
    login_page = 'http://www.my_domain.com/'
    start_urls = ['http://www.my_domain.com/nextpage1/',
                  'http://www.my_domain.com/nextpage2/',
                  'http://www.my_domain.com/nextpage3/']

    rules = (
        Rule(SgmlLinkExtractor(allow=()),
             callback='parse_item',
             follow=True),
    )

    def get_cookies(self):
        driver = webdriver.Firefox()
        driver.implicitly_wait(30)
        base_url = "http://www.my_domain.com"
        driver.get(base_url + "/")
        driver.find_element_by_name("USR").clear()
        driver.find_element_by_name("USR").send_keys("my_user")
        driver.find_element_by_name("PASSWRD").clear()
        driver.find_element_by_name("PASSWRD").send_keys("my_pass")
        driver.find_element_by_name("submit").click()
        cookies = driver.get_cookies()
        driver.close()
        cookie_dic = {}
        for c in cookies:
            cookie_dic[c['name']] = c['value']
        return cookie_dic

    def init_request(self):
        """This function is called before crawling starts."""
        print '=======================INIT======================='
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        print '=======================LOGIN======================='
        return [FormRequest.from_response(response, formname='login_form',
                                          formdata={'USR': 'my_user', 'PASSWRD': 'my_pass'},
                                          callback=self.login_cookies)]

    def login_cookies(self, response):
        print '=======================COOKIES======================='
        return Request(url='http://www.my_domain.com/home',
                       cookies=self.get_cookies(),
                       callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we are
        successfully logged in.
        """
        print '=======================CHECK LOGIN======================='
        if "Logoff" in response.body:
            print "=========Successfully logged in.========="
            self.initialized()
            # Now the crawling can begin..
        else:
            print "==============Bad times :(==============="
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse_item(self, response):
        print "==============PARSE ITEM=========================="
        # Scrape data from page
Answer 0 (score: 5)
I'm a bit late to the party, but I'm pretty sure you need to return self.initialized():
if "Logoff" in response.body:
print "=========Successfully logged in.========="
return self.initialized()
# Now the crawling can begin..
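The reason this matters, as far as I can tell, is that InitSpider holds back the requests built from start_urls until initialization finishes; initialized() hands those postponed requests back, and unless the callback returns (or yields) them, they never reach the scheduler and parse_item never fires. A rough sketch of the full corrected method, keeping the asker's Python 2 / old-Scrapy style and assuming the rest of the spider stays as posted:

    def check_login_response(self, response):
        """Check the login response; only on success do we hand the
        postponed start_urls requests back to the scheduler."""
        print '=======================CHECK LOGIN======================='
        if "Logoff" in response.body:
            print "=========Successfully logged in.========="
            # initialized() returns the requests InitSpider held back;
            # returning them here is what lets the normal crawl (and
            # eventually parse_item, via the rules) start.
            return self.initialized()
        else:
            print "==============Bad times :(==============="
            # Login failed: nothing is returned, so no start requests
            # are scheduled and the spider closes without crawling.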