Question

虽然有很多问题，但是大多数人都会因为“dont_filter”参数而遇到这个问题。我传递了这个参数“dont_filter = True”但是我的自定义解析生成器仍然没有用。这是我的代码（第三个解析器“parse_spec”从未被调用过，“parse_models_follow_next_page”在parse（）调用时效果很好，但是当它需要转到下一页时它无法调用自身）：

import scrapy
from gsmarena.items import PhoneItems

class VendorSpider(scrapy.Spider):
    custom_settings = {
        'DOWNLOAD_DELAY': 1.5,
        'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/7046A194A',
        'COOKIES_ENABLED': False
    }

    name = "gsmarena_spec"

    allowed_domains = ["https://www.gsmarena.com/"]

    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

    def parse(self, response):
        # print("Existing settings: %s" % self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
                '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
                response.xpath("(//table//a)[{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)

    def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        length = len(response.xpath(
            "//div[@class='makers']/self::div//a").extract())
        for i in range(1, length):
            url = "https://www.gsmarena.com/" + \
                response.xpath(
                    "(//div[@class='makers']/self::div//a)[{}]/@href".format(i)).extract()[0]
            model = response.xpath(
                "(//div[@class='makers']/self::div//a//span/text())[{}]".format(i)).extract()[0]
            yield scrapy.Request(url, callback=self.parse_spec, meta={'brand': brand, 'model': model}, dont_filter=True)
        is_next_page = response.xpath(
            "//a[@class=\"pages-next\"]/@href").extract()

        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page, callback=self.parse_models_follow_next_page, meta={'brand': brand}, dont_filter=True)



    def parse_spec(self, response):
        item = PhoneItems()
        item['model'] = response.meta.get('model')
        item['brand'] = response.meta.get('brand')
        for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):
        item[spec_name] = spec
        yield item

抱歉我的英语不好。

Answer 1

我在代码中做了一些更改，它会删除所有期望spec_name的结果，而这种结果并没有以理解的方式指定。

导入scrapy

来自lxml import html

来自tutorial.items的

导入PhoneItems

类VendorSpider（scrapy.Spider）：

   custom_settings = {
    'DOWNLOAD_DELAY': 1.5,
    'USER_AGENT': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) 
    AppleWebKit/537.75.14 (KHTML, '
                  'like Gecko) Version/7.0.3 Safari/7046A194A',
    'COOKIES_ENABLED': False
    }
    name = "gsmarena_spec"
    allowed_domains = ["https://www.gsmarena.com/"]
    start_urls = [
        "https://www.gsmarena.com/makers.php3"
    ]

   def parse(self, response):
        # print("Existing settings: %s" % 
        self.settings.attributes.items())
        length = len(response.xpath("//table//a").extract())
        for i in range(1, length):
            brand = response.xpath(
            '(//table//a)[{}]/text()'.format(i)).extract()[0]
            url = "https://www.gsmarena.com/" + \
            response.xpath("(//table//a)
                [{}]/@href".format(i)).extract()[0]
            yield scrapy.Request(url,
                callback=self.parse_models_follow_next_page,
                meta={'brand':brand},dont_filter=True)

   def parse_models_follow_next_page(self, response):
        brand = response.meta.get('brand')
        meta = response.meta
        doc = html.fromstring(response.body)
        single_obj = doc.xpath('.//div[@class="makers"]/ul//li')
        for obj in single_obj:
            url = self.allowed_domains[0]+obj.xpath('.//a/@href')[0]
            meta['brand'] = obj.xpath('.//a/@href')[0].split('_')[0]
            meta['model'] = obj.xpath('.//a/@href')[0]
            yield scrapy.Request(url=url, callback=self.parse_spec, 
                meta=meta, dont_filter=True)
        is_next_page = response.xpath(
                       "//a[@class=\"pages-next\"]/@href").extract()
        if is_next_page:
            next_page = "https://www.gsmarena.com/" + is_next_page[0]
            yield scrapy.Request(next_page, 
            callback=self.parse_models_follow_next_page, 
            meta={'brand': brand},dont_filter=True)

    def parse_spec(self, response):
        item = PhoneItems()
        meta = response.meta
        item['model'] = meta['model']
        item['brand'] = meta['brand']

        #Need to specify details about spec_name
        # for spec_name, spec in 
        #zip(response.xpath('//table//td[1]').extract(),
        #  response.xpath('//table//td[2]').extract()):
        #     item[spec_name] = spec
        yield item

Answer 2

刮刀几乎没有问题。

allowed_domains = ["https://www.gsmarena.com/"]

应该是

allowed_domains = ["www.gsmarena.com"]

接下来，您没有在班级中定义errback_httpbin方法

def errback_httpbin(self, response):
    pass

代码

for spec_name, spec in zip(response.xpath('//table//td[1]').extract(), response.xpath('//table//td[2]').extract()):

应该是

for spec_name, spec in zip(response.xpath('//table//td[1]/text()').extract(), response.xpath('//table//td[2]/text()').extract()):

这虽然仍有一些问题。

此外，您的代码需要一些时间才能获得第一个收益，因为调度程序将根据进入的网址顺序选择网址

scrapy.Reaquests（）回调不起作用

2 个答案: