Scrapy:尝试身份验证并抓取网站

时间:2015-10-02 09:06:04

标签: python authentication web-scraping scrapy

我正在尝试在登录会话时抓取数据,但它一直在说

Traceback (most recent call last):
  File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 1201, in mainLoop
    self.runUntilCurrent()
  File "/usr/lib/python2.7/dist-packages/twisted/internet/base.py", line 824, in runUntilCurrent
    call.func(*call.args, **call.kw)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 382, in callback
    self._startRunCallbacks(result)
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 490, in _startRunCallbacks
    self._runCallbacks()
--- <exception caught here> ---
  File "/usr/lib/python2.7/dist-packages/twisted/internet/defer.py", line 577, in _runCallbacks
    current.result = callback(current.result, *args, **kw)
  File "/home/suz/user_login/login_user/login_u/login_u/spiders/spiders.py", line 60, in check_login_response
    return self.initialized()
  File "/usr/local/lib/python2.7/dist-packages/scrapy/contrib/spiders/init.py", line 15, in initialized
    return self.__dict__.pop('_postinit_reqs')
exceptions.KeyError: '_postinit_reqs'

我哪里错了?

这是我登录的蜘蛛。我成功登录但有刮刮问题。如果你给我简短的回复,那将是非常有帮助的

from scrapy.contrib.spiders.init import InitSpider
from scrapy.http import Request, FormRequest
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule

from scrapy.spider import BaseSpider
from scrapy.selector import Selector

from login_u.items import myspiderBotItem

class LinkedPySpider(InitSpider):
    name = 'myspider'
    # allowed_domains = ['linkedin.com']
    login_page = 'https://www.linkedin.com/uas/login'
    # start_urls = ["http://www.linkedin.com/csearch/results?type=companies&keywords=&pplSearchOrigin=GLHD&pageKey=member-home&search=Search#facets=pplSearchOrigin%3DFCTD%26keywords%3D%26search%3DSubmit%26facet_CS%3DC%26facet_I%3D80%26openFacets%3DJO%252CN%252CCS%252CNFR%252CF%252CCCR%252CI"]
    start_urls = ["http://www.linkedin.com/"]

    def __init__(self, username, password, *args, **kwargs):
        global rules
        # query = query.replace(' ','-')
        self.rules = (Rule(SgmlLinkExtractor(allow=('/'+'*')), callback='parse_item',follow = True),)
        super(LinkedPySpider, self).__init__(*args, **kwargs)
        self.allowed_domains = ['linkedin.com']
        self.start_urls = [kwargs.get('start_url')]
        print(self.start_urls)
        self.username=username
        self.password=password

    def init_request(self):
        #"""This function is called before crawling starts."""
        print self.login_page
        return Request(url=self.login_page, callback=self.login)

    def login(self, response):
        #"""Generate a login request."""
        username=self.username
        password=self.password
        return FormRequest.from_response(response,
                    formdata={'session_key': username, 'session_password': password},
                    callback=self.check_login_response)

    def check_login_response(self, response):
        #"""Check the response returned by a login request to see if we aresuccessfully logged in."""
        if "Sign Out" in response.body:
            self.log("\n\n\nSuccessfully logged in. Let's start crawling!\n\n\n")
            # Now the crawling can begin..
            print "2222222222222222"
            print self.initialized()
            print "33333333333333333"

            return self.initialized() # ****THIS LINE FIXED THE LAST PROBLEM*****

        else:
            self.log("\n\n\nFailed, Bad times :(\n\n\n")
            # Something went wrong, we couldn't log in, so nothing happens.

    def parse(self, response):
        self.log("\n\n\n We got data! \n\n\n")
        # hxs = HtmlXPathSelector(response)
        sites = Selector(text=response.body).xpath('//div[contains(@id, "identity")]//section/div/div/h3/')
        items = []
        for site in sites:
            item = myspiderBotItem()
            item['title'] = site.select('a/text()').extract()
            item['link'] = site.select('a/@href').extract()
            items.append(item)
        return items

0 个答案:

没有答案