我是Scrapy的新手。
这是我的蜘蛛版,目标是抓取电子商务菜单并返回所有类别,分别遵守分类标准(N1,N2和N3)
我的结果很好但是N3类别在输出json上显示了最后一个。
示例:N1 -> N2/Linkn2,N1 -> N2/Linkn2,N1 -> N2/Linkn2,N1 -> N2/Linkn2……
N3/Linkn3,N3/Linkn3,N3/Linkn3,N3/Linkn3
我想获得具体级别的类别。每个N1与他们的N2和每个N2与他们的N3。
我需要的结构是
N1-N2-Linkn2-N3-Linkn3(N1 类 Muebles)/ N1-N2-Linkn2-N3-Linkn3(N1 类 Herramientas)/ N1-N2-Linkn2-N3-Linkn3(N1 类 Automovil)/ ……
很清楚?非常感谢!
import urlparse
import scrapy
class ReadySpider(scrapy.Spider):
    """Crawl the e-commerce menu: yield one row per N2 accordion group
    (with its N1 heading and N2 links), then follow each N2 link and
    yield the N3 rows found on that page.
    """
    name = 'cats2'
    start_urls = ['http://www.sodimac.com.ar']

    def parse(self, response):
        # Loop over all N1 cover link elements on the main menu and
        # schedule one request per category page.
        SELECTOR = '//*[@id="navBarLeave"]/ul/li/a/@href'
        for href in response.xpath(SELECTOR).extract():
            yield scrapy.Request(urlparse.urljoin(response.url, href),
                                 callback=self.parse_n2)

    def parse_n2(self, response):
        # Loop over each N2 accordion group on the left menu.
        # BUG FIX: the selectors below must be applied to the current
        # group (`ready`), not to the whole `response` — otherwise every
        # iteration yields the same identical page-wide dict.
        for ready in response.css('.jq-accordionGroup'):
            links = ready.xpath('.//*[@class="jq-accordion"]/a/@href').extract()
            yield {
                'N1': response.css('menu.menu-list h1::text').extract(),
                'N2': ready.xpath('.//*[@class="jq-accordion"]/a/text()').extract(),
                'Linkn2': links,
            }
            # Follow each N2 link to collect its N3 subcategories.
            for href in links:
                yield scrapy.Request(urlparse.urljoin(response.url, href),
                                     callback=self.parse_n3)

    def parse_n3(self, response):
        # Loop over all N3 accordion groups on the left menu.
        # BUG FIX: iterate the SelectorList itself, not `.extract()` —
        # extracted items are plain strings and the original body kept
        # re-querying the whole response on every pass.
        for ready in response.css('.jq-accordionGroup'):
            yield {
                'N3': ready.xpath('.//*[@class="jq-accordion"]/a/text()').extract(),
                'Linkn3': ready.xpath('.//*[@class="jq-accordion"]/a/@href').extract(),
            }
答案 0 :(得分:0)
像这样创建自定义Item
class CustomItem(scrapy.Item):
    """Item accumulating one category row as it travels the
    N1 -> N2 -> N3 request chain via ``request.meta``."""
    N1 = scrapy.Field()  # top-level category name
    N2 = scrapy.Field()  # second-level category name
    N3 = scrapy.Field()  # third-level category name
并通过 request 的 meta 属性把这个 item 随请求一起传递下去。
def parse_n1(self, response):
    """Pseudocode: create the item at the N1 level and hand it to
    ``parse_n2`` through the request's ``meta`` dict."""
    item = CustomItem()
    item['N1'] = ...
    request = scrapy.Request(..., callback=self.parse_n2)
    request.meta['item'] = item  # attach to request
    yield request
def parse_n2(self, response):
    """Pseudocode: recover the partially-filled item from ``meta``,
    add the N2 fields, and yield the finished item."""
    item = response.meta['item']  # <- unpack from response
    item['N2'] = ...  # <- fill more fields
    yield item  # <- finally pass the item to output
这是伪代码。请适应您的需要。
答案 1 :(得分:0)
import urlparse
import scrapy
from ready.items import ReadyItem
class ReadySpider(scrapy.Spider):
    """Crawl category levels N1 -> N2 -> N3, accumulating all fields on
    a single ReadyItem that is handed down the request chain through
    ``request.meta`` so the final N3 items keep their parent context.
    """
    name = 'cats3'
    start_urls = ['http://www.sodimac.com.ar']

    def parse(self, response):
        # Loop over all N1 cover link elements on the main menu.
        SELECTOR = '//*[@id="navBarLeave"]/ul/li/a/@href'
        for href in response.xpath(SELECTOR).extract():
            item = ReadyItem()
            request = scrapy.Request(urlparse.urljoin(response.url, href),
                                     callback=self.parse_n2)
            request.meta['item'] = item  # attach to request
            yield request

    def parse_n2(self, response):
        # Loop over all N2 accordion groups on the left menu.
        # BUG FIX: selectors are applied to the current group (`ready`)
        # rather than the whole response, so each group yields its own data.
        for ready in response.css('.jq-accordionGroup'):
            item = response.meta['item']
            item['NameN1'] = response.css('menu.menu-list h1::text').extract()
            item['NameN2'] = ready.xpath('.//*[@class="jq-accordion"]/a/text()').extract()
            item['LinkN2'] = ready.xpath('.//*[@class="jq-accordion"]/a/@href').extract()
            # BUG FIX: the original `yield request` ran before `request`
            # existed (NameError) and then used an undefined `href`.
            # Follow each extracted N2 link instead.
            for href in item['LinkN2']:
                request = scrapy.Request(urlparse.urljoin(response.url, href),
                                         callback=self.parse_n3)
                # Copy so every N3 request carries its own snapshot of
                # the item instead of sharing one mutable instance.
                request.meta['item'] = item.copy()
                yield request

    def parse_n3(self, response):
        # Loop over all N3 accordion groups and emit the completed item.
        for ready in response.css('.jq-accordionGroup'):
            item = response.meta['item']
            # BUG FIX: text() selection is an XPath expression — the
            # original passed it to response.css(), which cannot parse it.
            item['NameN3'] = ready.xpath('.//*[@class="jq-accordion"]/a/text()').extract()
            item['LinkN3'] = ready.xpath('.//*[@class="jq-accordion"]/a/@href').extract()
            yield item