I am trying to scrape data from a logged-in session, but it keeps saying:
Traceback (most recent call last):
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 1201, in mainLoop
self.runUntilCurrent()
File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
call.func(*call.args, **call.kw)
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
self._startRunCallbacks(result)
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
self._runCallbacks()
--- <exception caught here> ---
File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/suz/user_login/login_user/login_u/login_u/spiders/spiders.py", line 60, in check_login_response
return self.initialized()
File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spiders/init.py", line 15, in initialized
return self.__dict__.pop('_postinit_reqs')
exceptions.KeyError: '_postinit_reqs'
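From the init.py line in the traceback, initialized() looks like a one-shot call: it pops '_postinit_reqs' out of the instance __dict__, so the key is gone on any later call. Here is a tiny standalone sketch of that behaviour (Demo is a made-up class, only to illustrate the dict.pop pattern):

class Demo(object):
    def __init__(self):
        self._postinit_reqs = ['pending request']

    def initialized(self):
        # same pattern as scrapy/contrib/spiders/init.py line 15
        return self.__dict__.pop('_postinit_reqs')

d = Demo()
d.initialized()  # first call returns the list and removes the key
d.initialized()  # second call raises KeyError: '_postinit_reqs'

My check_login_response below calls self.initialized() twice (once inside a print and once in the return), so I suspect that is related.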
Where am I going wrong?
This is my login spider. The login itself succeeds, but the scraping fails. A short reply would be very helpful.
from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule
from scrapy.selector import Selector

from login_u.items import myspiderBotItem


class LinkedPySpider(InitSpider):
    name = 'myspider'
    login_page = 'https://www.linkedin.com/uas/login'
    # start_urls = ["http://www.linkedin.com/csearch/results?type=companies&keywords=&pplSearchOrigin=GLHD&pageKey=member-home&search=Search#facets=pplSearchOrigin%3DFCTD%26keywords%3D%26search%3DSubmit%26facet_CS%3DC%26facet_I%3D80%26openFacets%3DJO%252CN%252CCS%252CNFR%252CF%252CCCR%252CI"]
    start_urls = ["http://www.linkedin.com/"]

    def __init__(self, username, password, *args, **kwargs):
        # Follow every link on the site and hand each page to the callback.
        self.rules = (Rule(SgmlLinkExtractor(allow=('/*',)),
                           callback='parse_item', follow=True),)
        super(LinkedPySpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['linkedin.com']
        self.start_urls = [kwargs.get('start_url')]
        print(self.start_urls)
        self.username = username
        self.password = password

    def init_request(self):
        """This function is called before crawling starts."""
        print self.login_page
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        """Generate a login request."""
        return FormRequest.from_response(
            response,
            formdata={'session_key': self.username,
                      'session_password': self.password},
            callback=self.check_login_response)

    def check_login_response(self, response):
        """Check the response returned by a login request to see if we
        are successfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            print "2222222222222222"
            print self.initialized()
            print "33333333333333333"
            return self.initialized()  # ****THIS LINE FIXED THE LAST PROBLEM*****
        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        self.log("\n\n\n We got data! \n\n\n")
        sites = Selector(text=response.body).xpath(
            '//div[contains(@id, "identity")]//section/div/div/h3')
        items = []
        for site in sites:
            item = myspiderBotItem()
            item['title'] = site.xpath('a/text()').extract()
            item['link'] = site.xpath('a/@href').extract()
            items.append(item)
        return items
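For completeness, myspiderBotItem comes from my login_u/items.py; a minimal definition that would satisfy the import, assuming only the title and link fields used in parse above, is:

from scrapy.item import Item, Field

class myspiderBotItem(Item):
    title = Field()
    link = Field()

I start the spider with the credentials passed as spider arguments, e.g. scrapy crawl myspider -a username=<email> -a password=<password> -a start_url=http://www.linkedin.com/, which is how username, password, and start_url reach __init__.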