我是python和scrapy的新手
以下是我的代码,以获取所有下一页的所有产品名称,价格,图片,标题
import scrapy
class TestSpider(scrapy.Spider):
    """Crawl Amazon search-result pages and scrape each product's
    name, price, image URL and detail-page link, following the
    "next page" pagination link until it disappears."""

    name = "testdoc1"
    start_urls = ["https://www.amazon.in/s/ref=amb_link_46?ie=UTF8&bbn=1389432031&rh=i%3Aelectronics%2Cn%3A976419031%2Cn%3A%21976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cp_89%3AApple&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_s=merchandised-search-leftnav&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_t=101&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_i=1389401031"]

    def parse(self, response):
        """Follow every link on the page into parse_post, then paginate."""
        for post_link in response.xpath('//a/@href').extract():
            # Hrefs may be relative; resolve against the current page URL.
            link = response.urljoin(post_link)
            yield scrapy.Request(link, callback=self.parse_post)
        # Checks if the main page has a link to next page; if True keep parsing.
        next_page = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first()
        if next_page:
            # FIX: next_page is a *relative* URL; scrapy.Request requires an
            # absolute one. response.follow resolves it automatically.
            yield response.follow(next_page, callback=self.parse)

    def parse_post(self, response):
        """Scrape name, price, image and link from each product listing."""
        for post in response.xpath('//li[contains(@class,"s-result-item celwidget")]'):
            item = dict()
            item['Name'] = post.xpath('.//h2[contains(@class,"a-size-base s-inline s-access-title a-text-normal")]/text()').extract()
            item['Price'] = post.xpath('.//span[contains(@class,"a-size-base a-color-price s-price a-text-bold")]/text()').extract()
            item['Image'] = post.xpath('.//img[contains(@class,"s-access-image cfMarker")]/@src').extract()
            # FIX: Selector has no `.sel` attribute (`post.sel.xpath` raised
            # AttributeError); call .xpath() on the selector directly.
            item['Link'] = post.xpath('.//a[contains(@class,"a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal")]/@href').extract()
            yield item
        # If the products page has a link to next page keep parsing.
        next_page = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first()
        if next_page:
            # FIX: resolve the relative pagination href (see parse above).
            yield response.follow(next_page, callback=self.parse_post)
我的抓取没有出现任何错误,但我的CSV为空。
答案 0 :(得分:0)
您的问题出在下面这一行:
yield scrapy.Request(next_page, callback=self.parse)
网址是相对网址。所以你应该使用
yield response.follow(next_page, callback=self.parse)
这将自动解析相对网址
修改-1
刚刚意识到您正在浏览单个页面,而您只需要从结果页面中提取数据。所以根本不需要你的parse_post
功能。以下是你需要做的事情
class TestSpider(scrapy.Spider):
    """Scrape name, price, image and detail-page link for every
    product on an Amazon search-result page, following the
    "next page" link for as long as one is present."""

    name = "testdoc1"
    allowed_domains = ['amazon.in']
    start_urls = [
        "https://www.amazon.in/s/ref=amb_link_46?ie=UTF8&bbn=1389432031&rh=i%3Aelectronics%2Cn%3A976419031%2Cn%3A%21976420031%2Cn%3A1389401031%2Cn%3A1389432031%2Cp_89%3AApple&pf_rd_m=A1VBAL9TL5WCBF&pf_rd_s=merchandised-search-leftnav&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_r=CYS25V3W021MSYPQ32FB&pf_rd_t=101&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_p=1ce3e975-c6e8-479a-8485-2e490b9f58a9&pf_rd_i=1389401031"]

    def parse(self, response):
        """Yield one item dict per result, then paginate."""
        # XPath fragments, hoisted for readability (relative to a result <li>).
        name_xp = './/h2[contains(@class,"a-size-base s-inline s-access-title a-text-normal")]/text()'
        price_xp = './/span[contains(@class,"a-size-base a-color-price s-price a-text-bold")]/text()'
        image_xp = './/img[contains(@class,"s-access-image cfMarker")]/@src'
        link_xp = './/a[contains(@class,"a-link-normal s-access-detail-page s-color-twister-title-link a-text-normal")]/@href'

        for result in response.css('li.s-result-item'):
            yield {
                'Name': result.xpath(name_xp).extract(),
                'Price': result.xpath(price_xp).extract(),
                'Image': result.xpath(image_xp).extract(),
                'Link': result.xpath(link_xp).extract(),
            }

        # Keep crawling while the page advertises a "next page" link;
        # response.follow resolves the relative href for us.
        next_url = response.xpath('(//a[@class="pagnNext"])[1]/@href').extract_first()
        if next_url:
            yield response.follow(next_url, callback=self.parse)