使用Scrapy

时间:2017-05-05 16:13:32

标签: python web-scraping scrapy

我是Scrapy的新手。

这是我的爬虫(spider),目标是抓取一个电商网站的菜单,并按分类层级(N1、N2和N3)返回所有类别。

结果基本正确,但是N3类别在输出的JSON中全部出现在最后。

示例:N1 - > N2 / Linkn2,N1 - > N2 / Linkn2,N1 - > N2 / Linkn2,N1 - > N2 / Linkn2 ......

N3 / Linkn3,N3 / Linkn3,N3 / Linkn3,N3 / Linkn3

我想获得具体级别的类别。每个N1与他们的N2和每个N2与他们的N3。

我需要的结构是

N1-N2-Linkn2-N3-Linkn3(N1类Muebles)/ N1-N2-Linkn2-N3-Linkn3(N1类Herramientas)/ N1-N2-Linkn2-N3-Linkn3(N1类Automovil) / .....

我说清楚了吗?非常感谢!

import urlparse
import scrapy


class ReadySpider(scrapy.Spider):
    """Crawl the site menu and emit category data for levels N1, N2 and N3.

    Request flow: ``parse`` (N1 links) -> ``parse_n2`` -> ``parse_n3``.

    NOTE: ``parse_n3`` yields dicts that contain only ``N3`` and ``Linkn3``;
    nothing is passed along from ``parse_n2``, so the N3 output carries no
    reference to the N1/N2 category it was reached from.
    """

    name = 'cats2'
    start_urls = ['http://www.sodimac.com.ar']

    def parse(self, response):
        """Follow every N1 link found in the main menu."""
        # loop over all cover N1 links elements on main menu
        SELECTOR = '//*[@id="navBarLeave"]/ul/li/a/@href'
        for href in response.xpath(SELECTOR).extract():
            yield scrapy.Request(urlparse.urljoin(response.url, href), self.parse_n2)

    def parse_n2(self, response):
        """Yield the N1/N2 data of the page and follow every N2 link."""
        # loop over all N2 on left menu
        SELECTOR = '.jq-accordionGroup'
        # NOTE: ``ready`` is never used below; every selector is evaluated
        # against the whole ``response``, so each iteration yields the same
        # page-wide lists.
        for ready in response.css(SELECTOR):
            N1 = 'menu.menu-list h1::text'
            N2 = '//*[@class="jq-accordion"]/a/text()'
            Linkn2 = '//*[@class="jq-accordion"]/a/@href'

            # come back with results
            yield {
                'N1': response.css(N1).extract(),
                'N2': response.xpath(N2).extract(),
                'Linkn2': response.xpath(Linkn2).extract(),
            }

            for href in response.xpath(Linkn2).extract():
                yield scrapy.Request(urlparse.urljoin(response.url, href), self.parse_n3)

    def parse_n3(self, response):
        """Yield the N3 names and links found on an N2 page."""
        # loop over all N3 on left menu
        SELECTOR = '.jq-accordionGroup'
        # NOTE: as in ``parse_n2``, the loop variable is unused and the
        # selectors below run against the whole ``response``.
        for ready in response.css(SELECTOR).extract():
            N3 = '//*[@class="jq-accordion"]/a/text()'
            Linkn3 = '//*[@class="jq-accordion"]/a/@href'

            # come back with results
            yield {
                'N3': response.xpath(N3).extract(),
                'Linkn3': response.xpath(Linkn3).extract(),
            }

2 个答案:

答案 0 :(得分:0)

像这样创建自定义Item

class CustomItem(scrapy.Item):
    """Scrapy item declaring one field per category level (N1, N2, N3)."""
    N1 = scrapy.Field()  # category level N1
    N2 = scrapy.Field()  # category level N2
    N3 = scrapy.Field()  # category level N3

并使用请求(Request)的meta属性将此项目传递给下一个回调

def parse_n1(self, response):
    """Create an item, fill its N1 field and pass it on to ``parse_n2``.

    Pseudo-code: every ``...`` is a placeholder to be replaced with a real
    value (the N1 data, the URL to request).
    """
    item = CustomItem()
    item['N1'] = ...  # placeholder for the N1 value

    request = scrapy.Request(..., callback=self.parse_n2)
    request.meta['item'] = item   # attach the partially filled item to the request
    yield request

def parse_n2(self, response):
    """Take the item attached to the request, fill its N2 field and emit it.

    Pseudo-code: ``...`` is a placeholder for the real N2 value.
    """
    item = response.meta['item']  # item attached to the request that produced this response
    item['N2'] = ...              # placeholder: fill more fields here
    yield item                    # finally pass the item to the output

这是伪代码。请根据您的需要进行调整。

答案 1 :(得分:0)

import urlparse
import scrapy
from ready.items import ReadyItem

class ReadySpider(scrapy.Spider):
    """Crawl the site menu and emit one item per N2 category with its N3 data.

    The item travels between callbacks in ``request.meta['item']``:
    ``parse_n2`` fills ``NameN1``/``NameN2``/``LinkN2`` and ``parse_n3`` adds
    ``NameN3``/``LinkN3`` before the item is emitted.

    Assumes ``ReadyItem`` declares the fields NameN1, NameN2, LinkN2, NameN3
    and LinkN3 (its definition is not part of this file) -- TODO confirm.
    """

    name = 'cats3'
    start_urls = ['http://www.sodimac.com.ar']

    def parse(self, response):
        """Follow every N1 link of the main menu, attaching a fresh item."""
        # loop over all cover N1 links elements on main menu
        selector = '//*[@id="navBarLeave"]/ul/li/a/@href'
        for href in response.xpath(selector).extract():
            request = scrapy.Request(
                urlparse.urljoin(response.url, href), self.parse_n2)
            request.meta['item'] = ReadyItem()  # attach to request
            yield request

    def parse_n2(self, response):
        """Schedule one N3 request per N2 entry listed on the N1 page.

        ``NameN2`` and ``LinkN2`` hold the text and href of a single N2
        entry, so the item that reaches ``parse_n3`` identifies the exact
        N1 -> N2 branch it belongs to.
        """
        name_n1 = response.css('menu.menu-list h1::text').extract()

        # loop over all N2 on left menu, one <a> element at a time so that
        # each name stays paired with its own link
        for link in response.xpath('//*[@class="jq-accordion"]/a'):
            href = link.xpath('@href').extract_first()
            if not href:
                # nothing to follow for an anchor without a target
                continue

            # Copy the incoming item: requests are processed asynchronously,
            # so sharing one mutable item would let branches overwrite each
            # other's fields.
            item = response.meta['item'].copy()
            item['NameN1'] = name_n1
            item['NameN2'] = link.xpath('text()').extract_first()
            item['LinkN2'] = href

            request = scrapy.Request(
                urlparse.urljoin(response.url, href), self.parse_n3)
            request.meta['item'] = item  # attach to request
            yield request

    def parse_n3(self, response):
        """Add the N3 names and links of the N2 page and emit the item."""
        item = response.meta['item']
        # These are XPath expressions, so they must go through ``xpath()``;
        # ``css()`` would reject them as invalid CSS selectors.
        item['NameN3'] = response.xpath(
            '//*[@class="jq-accordion"]/a/text()').extract()
        item['LinkN3'] = response.xpath(
            '//*[@class="jq-accordion"]/a/@href').extract()
        yield item