提出NotSupported("响应内容不是文本") - scrapy.exceptions.NotSupported:响应内容不是文本

时间:2017-06-30 06:55:10

标签: python xpath scrapy

我几天都有同样的错误。我无法解决它!我真的不明白我的代码在哪里不正确。我之前通过更改“链接”已经解决了类似的错误消息。部分但现在,它不再工作了。有人可以帮帮我吗?

# -*- coding: utf-8 -*-
import scrapy
import re
import numbers
from amazon_test.items import AmazonTestItem
from urllib.parse import urlparse
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class AmazonSellersSpider(CrawlSpider): #scrapy.Spider
    name = 'AmazonFR'
    allowed_domains = ['amazon.fr']
    start_urls = ['https://www.amazon.fr']

    rules = (
        Rule(LinkExtractor(allow=()), callback='parse'),
    )

    def parse(self, response):
        item = AmazonTestItem()
        link = (response.xpath('//div[@class="a-column a-span6"]/h3[@id="-component-heading"]/text()'))
        if link:
            wait = response.xpath('//div[@class="a-column a-span6"]/h3[@id="-component-heading"]/text()').extract()
            if (len(wait) != 0):
                name = response.xpath('//div[@class="a-row a-spacing-medium"]/div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[@class="a-list-item"]/span[contains(.,"Nom")]/following-sibling::text()').extract()
                phone = response.xpath('//div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[@class="a-list-item"]/span[contains(.,"Téléphone")]/following-sibling::text()').extract()
                registre = response.xpath('//div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[@class="a-list-item"]/span[contains(.,"registre de commerce")]/following-sibling::text()').extract()
                TVA = response.xpath('//div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[@class="a-list-item"]/span[contains(.,"TVA")]/following-sibling::text()').extract()
                address = response.xpath('//div[@class="a-column a-span6"]/ul[@class="a-unordered-list a-nostyle a-vertical"]/li//span[span[contains(.,"Adresse")]]/ul//li//text()').extract()
                item['Business_name'] = ''.join(name).strip()
                item['Phone_number'] = ''.join(phone).strip()
                item['VAT_number'] = ''.join(TVA).strip()
                item['Address'] = '\n'.join(address).strip()
                item['Registre_commerce'] = ''.join(registre).strip()
                yield item
        else:
            for sel in response.xpath('//html/body'):
                item = AmazonTestItem()
                list_urls = sel.xpath('//a/@href').extract()
                for url in list_urls:
                    yield scrapy.Request(response.urljoin(url), callback=self.parse, meta={'item': item})

,错误信息为:

Traceback (most recent call last):
  File "C:\Users\paulpo\AppData\Local\Continuum\Anaconda3\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
    yield next(it)
  File "C:\Users\paulpo\AppData\Local\Continuum\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
    for x in result:
  File "C:\Users\paulpo\AppData\Local\Continuum\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
    return (_set_referer(r) for r in result or ())
  File "C:\Users\paulpo\AppData\Local\Continuum\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\paulpo\AppData\Local\Continuum\Anaconda3\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
    return (r for r in result or () if _filter(r))
  File "C:\Users\paulpo\Documents\amazon_test\amazon_test\spiders\AmazonFR.py", line 21, in parse
    link = (response.xpath('//div[@class="a-column a-span6"]/h3[@id="-component-heading"]/text()')).extract
  File "C:\Users\paulpo\AppData\Local\Continuum\Anaconda3\lib\site-packages\scrapy\http\response\__init__.py", line 105, in xpath
    raise NotSupported("Response content isn't text")
scrapy.exceptions.NotSupported: Response content isn't text

0 个答案:

没有答案