CrawlSpider with Splash only crawls and processes the first link

Date: 2021-05-19 21:27:36

Tags: python web-scraping scrapy

I'm using Scrapy together with Splash. Here is what I have in my spider:

import scrapy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_splash import SplashRequest, SplashJsonResponse, SplashTextResponse
import logging

class MainSpider(CrawlSpider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '''
    function main(splash, args)
      splash.private_mode_enabled = false

      my_user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36'

      headers = {
        ['User-Agent'] = my_user_agent,
        ['Accept-Language'] = 'en-GB,en-US;q=0.9,en;q=0.8',
        ['Referer'] = 'https://www.google.com'
      }

      splash:set_custom_headers(headers)

      url = args.url

      assert(splash:go(url))

      assert(splash:wait(2))

      -- username input
      username_input = assert(splash:select('#username'))
      username_input:focus()
      username_input:send_text('myusername')
      assert(splash:wait(0.3))

      -- password input
      password_input = assert(splash:select('#password'))
      password_input:focus()
      password_input:send_text('mysecurepass')
      assert(splash:wait(0.3))

      -- the login button
      login_btn = assert(splash:select('#login_btn'))
      login_btn:mouse_click()
      assert(splash:wait(4))

      return splash:html()
    end
    '''

    rules = (
        Rule(LinkExtractor(restrict_xpaths="(//div[@id='sidebar']/ul/li)[7]/a"), callback='parse_item', follow=True, process_request='use_splash'),
    )

    def start_requests(self):
        yield SplashRequest(url = 'https://www.somesite.com/login', callback = self.post_login, endpoint = 'execute', args = {
            'lua_source': self.script
        })

    def use_splash(self, request):
        request.meta.update(splash={
            'args': {
                'wait': 1,
            },
            'endpoint': 'render.html',
        })

        return request

    def _requests_to_follow(self, response):
        if not isinstance(response, (HtmlResponse, SplashJsonResponse, SplashTextResponse)):
            return

        seen = set()

        for n, rule in enumerate(self._rules):
            links = [lnk for lnk in rule.link_extractor.extract_links(response) if lnk not in seen]

            if links and rule.process_links:
                links = rule.process_links(links)

            for link in links:
                seen.add(link)
                r = self._build_request(n, link)

                yield rule.process_request(r)

    def post_login(self, response):
        logging.info('hey from login!')

        # dump the post-login page for inspection
        with open('post_login_response.txt', 'w') as f:
            f.write(response.text)

    def parse_item(self, response):
        logging.info('hey from parse_item!')

        # dump the page reached via the rule for inspection
        with open('post_search_response.txt', 'w') as f:
            f.write(response.text)
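
For context, the spider assumes the standard scrapy-splash wiring in settings.py, roughly as described in the scrapy-splash README (the Splash URL below is a placeholder for wherever the Splash instance is running):

SPLASH_URL = 'http://localhost:8050'  # placeholder; point at your Splash instance

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'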

I came across this and tried to implement it the same way, but parse_item never runs. In the logs I never see hey from parse_item!

I'm not sure what I'm missing. The full log output can be found here.

1 answer:

Answer 0 (score: 0)

I gave up on the CrawlSpider and converted it to a normal Spider, and now everything works.
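
No code was posted with the answer, but a minimal sketch of that conversion might look like the following: keep the Lua login script and the execute endpoint, then follow the sidebar link manually with a SplashRequest instead of a CrawlSpider Rule. The URL and XPath are carried over from the question as placeholders.

import scrapy
import logging
from scrapy_splash import SplashRequest

class MainSpider(scrapy.Spider):
    name = 'main'
    allowed_domains = ['www.somesite.com']

    script = '...'  # the same Lua login script as in the question

    def start_requests(self):
        # log in through Splash using the Lua script
        yield SplashRequest(
            url='https://www.somesite.com/login',
            callback=self.post_login,
            endpoint='execute',
            args={'lua_source': self.script},
        )

    def post_login(self, response):
        logging.info('hey from login!')

        # follow the sidebar link manually instead of relying on a Rule
        href = response.xpath("(//div[@id='sidebar']/ul/li)[7]/a/@href").get()
        if href:
            yield SplashRequest(
                url=response.urljoin(href),
                callback=self.parse_item,
                endpoint='render.html',
                args={'wait': 1},
            )

    def parse_item(self, response):
        logging.info('hey from parse_item!')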