I am trying to get my first scrapy project going and I have run into a strange problem. For some websites my crawler works fine; for others it does not follow the rules for extracting links. I searched on SO and saw that other people had similar problems, but in their case a malformed allow parameter caused Filtered offsite request errors, which is not what happens to me. My log is here: http://pastebin.com/r1pXmeJW (first the failing URL, then one that works fine, since I cannot post more than 2 links...).
My spider is controlled by a Python script that uses the API:
# -*- coding: utf-8 -*-
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from govcrawl.spiders.main_spider import DomainSpider
import sys, urlparse, re
from scrapy.contrib.spiders import Rule
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
args = sys.argv[1].split('§')
url_id = args[0]
start_url = args[1]
url_parts = urlparse.urlparse(start_url)
allowed_domain = url_parts.netloc
allowed_path = '/'.join(url_parts.path.split('/')[:-1])
cur_state = sys.argv[2]

spider = DomainSpider(
    start_urls = [start_url],
    allowed_domains = [allowed_domain],
    url_id = url_id,
    cur_state = cur_state,
    rules = (
        Rule(
            LxmlLinkExtractor(
                allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
                allow_domains = [allowed_domain],
                tags = ('a', 'area', 'frame'),
                attrs = ('href', 'src')
            ),
            callback = "parse_items",
            follow = True
        ),
    )
)
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
Here is my DomainSpider:
import re
from govcrawl.items import DomainItem
from scrapy.utils.markup import remove_tags
from scrapy.contrib.spiders import CrawlSpider
from scrapy import log

class DomainSpider(CrawlSpider):
    name = "govcrawl_main"

    def parse_start_url(self, response):
        return self.parse_items(response)

    def parse_items(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)
        links = []
        for sel in response.xpath('//a'):
            href = sel.xpath('@href').extract()
            if len(href) > 0:
                href = href[0]
                if href.startswith("http"):
                    links.append(href)
        item = DomainItem()
        item["url"] = response.url
        item["text"] = re.sub(r'\s{2,}', ' ', remove_tags(' '.join(response.xpath('//body//text()').extract()))).strip()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item
Any idea how to get the crawler to follow the rules for the failing websites?
Answer 0 (score: 0)
It turns out that the pages returning errors contained malformed html code with multiple </html>, which the lxml parser does not like. Since scrapy does not let you use a CrawlSpider with a different parser, I ended up re-implementing a regular Spider object that behaves more or less like a CrawlSpider:
import urlparse, re
from scrapy import Spider, log
from bs4 import BeautifulSoup
from scrapy.http import Request
from govcrawl.items import DomainItem

class DomainSimpleSpider(Spider):
    name = "govcrawl_simple"

    def parse(self, response):
        pages_done = self.crawler.stats.get_value('downloader/response_count')
        pages_todo = self.crawler.stats.get_value('scheduler/enqueued') - self.crawler.stats.get_value('downloader/response_count')
        log.msg("URL: %s (%s) Crawled %d pages. To Crawl: %d" % (self.start_urls[0], self.url_id, pages_done, pages_todo), spider = self)

        # Parse the raw body with html5lib, which tolerates the malformed markup
        soup = BeautifulSoup(response._body, "html5lib")

        links = []
        for tag in self.tags:
            for a in soup.find_all(tag):
                for attr in self.attrs:
                    if attr in a.attrs:
                        href = a.attrs[attr]
                        if href.startswith("http"):
                            links.append(href)
                        href = urlparse.urljoin(response.url, href)
                        href_parts = urlparse.urlparse(href.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '+'))
                        if re.match(self.allow, href_parts.path) and not self.forbidden_extension(href_parts.path):
                            yield Request(href)

        # Drop script and style elements before extracting the page text
        for script in soup(["script", "style"]):
            script.extract()

        item = DomainItem()
        item["url"] = response.url
        item["text"] = soup.get_text()
        item["links"] = links
        self.crawler.stats.inc_value('pages_crawled')
        yield item

    def forbidden_extension(self, url):
        # Skip links that point to binary / media files rather than html pages
        url = url.lower()
        return url.endswith(("pdf", "jpg", "wmv", "avi", "pptx", "gif", "mp3",
                             "mp4", "wav", "mov", "ppt", "xls", "doc", "docx",
                             "xlsx", "flv", "wma", "jpeg", "png", "odf", "ods",
                             "zip", "gz", "tar", "7z", "rar", "vob"))
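As a quick aside on the parser choice (the snippet of broken markup below is made up, not one of the actual failing pages): html5lib rebuilds a single well-formed tree even when a stray </html> shows up in the middle of the document, which is exactly what tripped up the lxml-based link extraction above.

from bs4 import BeautifulSoup

# Hypothetical example of markup with a premature </html>, similar in spirit
# to the broken pages described above.
broken_html = "<html><body><a href='http://example.com/a'>a</a></html><a href='http://example.com/b'>b</a></body></html>"

# html5lib applies the HTML5 error-recovery rules, so both links should still
# end up in the parsed tree.
soup = BeautifulSoup(broken_html, "html5lib")
print [a['href'] for a in soup.find_all('a')]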
This spider can be controlled by the following Python script:
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy import log, signals
from scrapy.utils.project import get_project_settings
from govcrawl.spiders.simple_spider import DomainSimpleSpider
import urlparse, re
from scrapy.contrib.linkextractors.lxmlhtml import LxmlLinkExtractor
start_url = ...
url_parts = urlparse.urlparse(start_url)
allowed_domain = url_parts.netloc
allowed_path = '/'.join(url_parts.path.split('/')[:-1])

spider = DomainSimpleSpider(
    start_urls = [start_url],
    allowed_domains = [allowed_domain],
    allow = re.compile(r".*%s.*" % re.escape(allowed_path), re.IGNORECASE),
    tags = ('a', 'area', 'frame'),
    attrs = ('href', 'src'),
    response_type_whitelist = [r"text/html", r"application/xhtml+xml", r"application/xml"]
)
settings = get_project_settings()
crawler = Crawler(settings)
crawler.signals.connect(reactor.stop, signal = signals.spider_closed)
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start()
reactor.run()
Note that:
I use the html5lib parser in BeautifulSoup rather than lxml. html5lib copes with the multiple </html> just fine, but it is an external dependency, so you have to install it.
For some reason the mimetype check did not seem to work properly, so I added a forbidden_extension function that prevents a Request from being created for non-html files, and I had to add another DownloaderMiddleware to make use of the spider's response_type_whitelist (see Python Scrapy - mimetype based filter to avoid non-text file downloads for the middleware implementation).
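For reference, a minimal sketch of what such a middleware could look like (the class name and the way it reads the whitelist off the spider are assumptions here, not the code from the linked answer):

import re
from scrapy.exceptions import IgnoreRequest

class ResponseTypeFilterMiddleware(object):
    # Drop responses whose Content-Type does not match the spider's whitelist.
    def process_response(self, request, response, spider):
        whitelist = getattr(spider, 'response_type_whitelist', None)
        if whitelist is None:
            return response
        content_type = response.headers.get('Content-Type', '')
        for pattern in whitelist:
            if re.search(pattern, content_type):
                return response
        # Nothing matched: discard the response instead of passing it on
        raise IgnoreRequest()

The middleware still has to be enabled through DOWNLOADER_MIDDLEWARES in the project settings for scrapy to pick it up.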
It also seems that this spider processes the start page twice, but frankly I did not care enough to fix that.
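If you did want to get rid of the duplicate visit, one speculative fix (assuming it comes from scrapy's default start_requests, which issues the start URLs with dont_filter=True so the dupe filter never records them) would be to override start_requests on DomainSimpleSpider:

from scrapy import Spider
from scrapy.http import Request

class DomainSimpleSpider(Spider):
    # ... same attributes and parse() as above ...

    def start_requests(self):
        # dont_filter=False lets the duplicate filter remember the start URL,
        # so a later request for the same page should be dropped.
        for url in self.start_urls:
            yield Request(url, dont_filter=False)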