我用 Scrapy 开发了以下爬虫(spider)代码,用于从 americanas 网站抓取页面:
# -*- coding: utf-8 -*-
import scrapy
import urllib
import re
import webscrap.items
import time
from urlparse import urljoin
from HTMLParser import HTMLParser
class AmericanasSpider(scrapy.Spider):
name = "americanas"
start_urls = ('http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/',)
source = webscrap.items.ImportSource ("Americanas")
def parse (self, response):
ind = 0
self.source.submit()
b = []
for c in response.xpath ('//div[@class="item-menu"]/ul'):
c1 = re.sub('[\t\n]','', c.xpath('//span [@class="menu-heading"]/text()').extract()[ind])
if (c1):
x = webscrap.items.Category(c1)
x.submit()
for b in c.xpath ('li'):
b1 = webscrap.items.Category( b.xpath('a/text()').extract()[0])
if (b1):
b1.setParent(x.getID())
b1.submit()
link = b.xpath ('@href').extract()
urla = urljoin (response.url, link)
request = scrapy.Request (urla, callback = self.parse_category)
request.meta['idCategory'] = b1.getID ()
yield request
for a in b.xpath ('ul/li/a/text()'):
a1 = webscrap.items.Category( a.extract())
a1.setParent(b1.getID())
a1.submit()
link = a.xpath ('@href').extract()
urla = urljoin (response.url, link)
request = scrapy.Request (urla, callback = self.parse_category)
request.meta['idCategory'] = a1.getID ()
yield request
ind = ind + 1
def parse_category(self, response):
# produtos na pagina
items = response.xpath('//div[@class="paginado"]//article[@class="single-product vitrine230 "]')
for item in items:
url = item.xpath('.//div[@itemprop="item"]/form/div[@class="productInfo"]/div]/a[@class="prodTitle"]/@href').extract()
urla = urljoin(response.url, link)
request = scrapy.Request (urla, callback = self.parse_product)
request.meta['idCategory'] = response.meta['idCategory']
yield request
# proxima pagina (caso exista)
nextpage = response.xpath('//div[@class="pagination"]/ul/li/a[@class="pure-button next"]/@href').extract()
if (nextpage):
link = nextpage[0]
urlb = urljoin(response.url, link)
self.log('Next Page: {0}'.format(nextpage))
request = scrapy.Request (urlb, callback = self.parse_category)
request.meta['idCategory'] = response.meta['idCategory']
yield request
def parse_product (self, response):
print response.url
title = response.xpath('//title/text()').extract()
self.log(u'Título: {0}'.format(title))
但我得到以下输出:
PS C:\Users\Natalia Oliveira\Desktop\Be Happy\behappy\import\webscrap> scrapy crawl americanas
2016-10-06 17:28:04 [scrapy] INFO: Scrapy 1.1.2 started (bot: webscrap)
2016-10-06 17:28:04 [scrapy] INFO: Overridden settings: {'NEWSPIDER_MODULE': 'webscrap.spiders', 'REDIRECT_ENABLED': Fal
se, 'SPIDER_MODULES': ['webscrap.spiders'], 'BOT_NAME': 'webscrap'}
2016-10-06 17:28:04 [scrapy] INFO: Enabled extensions:
['scrapy.extensions.logstats.LogStats',
'scrapy.extensions.telnet.TelnetConsole',
'scrapy.extensions.corestats.CoreStats']
2016-10-06 17:28:05 [scrapy] INFO: Enabled downloader middlewares:
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware',
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware',
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware',
'scrapy.downloadermiddlewares.retry.RetryMiddleware',
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware',
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware',
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware',
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware',
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware',
'scrapy.downloadermiddlewares.stats.DownloaderStats']
2016-10-06 17:28:05 [scrapy] INFO: Enabled spider middlewares:
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware',
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware',
'scrapy.spidermiddlewares.referer.RefererMiddleware',
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware',
'scrapy.spidermiddlewares.depth.DepthMiddleware']
2016-10-06 17:28:05 [scrapy] INFO: Enabled item pipelines:
[]
2016-10-06 17:28:05 [scrapy] INFO: Spider opened
2016-10-06 17:28:05 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min)
2016-10-06 17:28:05 [scrapy] DEBUG: Telnet console listening on 127.0.0.1:6023
2016-10-06 17:28:05 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.m
c_id=home-menuLista-alimentos/> (referer: None)
2016-10-06 17:28:07 [scrapy] DEBUG: Filtered duplicate request: <GET http://www.americanas.com.br/loja/226795/alimentos-
e-bebidas?WT.mc_id=home-menuLista-alimentos/> - no more duplicates will be shown (see DUPEFILTER_DEBUG to show all dupli
cates)
2016-10-06 17:28:07 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.m
c_id=home-menuLista-alimentos/> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-men
uLista-alimentos/)
2016-10-06 17:28:22 [scrapy] INFO: Closing spider (finished)
2016-10-06 17:28:22 [scrapy] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 931,
'downloader/request_count': 2,
'downloader/request_method_count/GET': 2,
'downloader/response_bytes': 80585,
'downloader/response_count': 2,
'downloader/response_status_count/200': 2,
'dupefilter/filtered': 60,
'finish_reason': 'finished',
'finish_time': datetime.datetime(2016, 10, 6, 20, 28, 22, 257000),
'log_count/DEBUG': 4,
'log_count/INFO': 7,
'request_depth_max': 1,
'response_received_count': 2,
'scheduler/dequeued': 2,
'scheduler/dequeued/memory': 2,
'scheduler/enqueued': 2,
'scheduler/enqueued/memory': 2,
'start_time': datetime.datetime(2016, 10, 6, 20, 28, 5, 346000)}
2016-10-06 17:28:22 [scrapy] INFO: Spider closed (finished)
我真的不知道这里有什么问题,因为我是 Scrapy 的初学者。def parse 按预期运行,因此我认为错误应该出在 parse_category 或 parse_product 方法中。
答案 0(得分:0)
您的 xpath 不正确:每页只有一个 item-menu。
我还删除了 item 相关的逻辑,因为我不清楚它们的作用。下面的代码会从 item-menu 的 ul 中获取所有链接,您可以再加回任何需要的逻辑:
def parse(self, response):
    """Follow every sub-category link found in the item-menu list."""
    links = response.xpath('//div[@class="item-menu"]/ul/li[@class="item-linha"]/a/@href').extract()
    for url in links:
        # Resolve relative hrefs against the current page.
        if not url.startswith("http"):
            url = response.urljoin(url)
        req = scrapy.Request(url, callback=self.parse_category)
        req.meta['idCategory'] = url  # add whatever here
        yield req
你的下一个方法也过于复杂:你只需要关注 class 为 prodTitle 的 anchor(锚)标签,其他都不用管:
def parse_category(self, response):
    """Yield a request for every product on this category page, then paginate."""
    # Product links: the anchor with class "prodTitle" is all we need.
    for url in response.css('a.prodTitle::attr(href)').extract():
        # Extracted hrefs may be relative; scrapy.Request requires an
        # absolute URL, so apply the same guard parse() uses.
        if not url.startswith("http"):
            url = response.urljoin(url)
        request = scrapy.Request(url, callback=self.parse_product)
        request.meta['idCategory'] = response.meta['idCategory']
        yield request
    # Pagination: the anchor containing "Próxima" points at the next page.
    nextpage = response.xpath(u'//ul[@class="pure-paginator acrN"]/li/a[contains(.,"Próxima")]/@href').extract_first()
    if nextpage:
        self.log(u'Next Page: {0}'.format(nextpage))
        if not nextpage.startswith("http"):
            nextpage = response.urljoin(nextpage)
        request = scrapy.Request(nextpage, callback=self.parse_category)
        request.meta['idCategory'] = response.meta['idCategory']
        yield request
def parse_product(self, response):
print response.url
title = response.xpath('//title/text()').extract_first()
self.log(u'Título: {0}'.format(title))
如果你现在运行它,你会看到很多输出,如:
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/314061/alimentos-e-bebidas/biscoitos?ofertas.offset=30
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/342151/alimentos-e-bebidas/azeite-e-vinagre?ofertas.offset=30
2016-10-06 23:25:15 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/342129/alimentos-e-bebidas/barra-de-cereais?ofertas.offset=30
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/15815078/nan-comfor-1-formula-infantil-nestle-lata-800g> (referer: http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil)
http://www.americanas.com.br/produto/15815078/nan-comfor-1-formula-infantil-nestle-lata-800g
2016-10-06 23:25:16 [americanas] DEBUG: Título: Nan Comfor 1 Fórmula Infantil Nestlé Lata 800g - Americanas.com
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/316829/eletrodomesticos/adega-de-vinho> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
2016-10-06 23:25:16 [americanas] DEBUG: Next Page: http://www.americanas.com.br/linha/316829/eletrodomesticos/adega-de-vinho?ofertas.offset=30
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/7170286/goiabada-135g-diet-house> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/9955598/adocante-em-sache-fit-caixa-com-30-unidades-de-2-5g-uniao> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/285368/utilidades-domesticas/vinho> (referer: http://www.americanas.com.br/loja/226795/alimentos-e-bebidas?WT.mc_id=home-menuLista-alimentos/)
http://www.americanas.com.br/produto/7170286/goiabada-135g-diet-house
2016-10-06 23:25:16 [americanas] DEBUG: Título: Goiabada 135g - Diet House - Americanas.com
http://www.americanas.com.br/produto/9955598/adocante-em-sache-fit-caixa-com-30-unidades-de-2-5g-uniao
2016-10-06 23:25:16 [americanas] DEBUG: Título: Adoçante Em Sache Fit Caixa Com 30 Unidades De 2,5g União - Americanas.com
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/121047374/barra-de-chocolate-ao-leite-lacta-150g-1-unidade> (referer: http://www.americanas.com.br/linha/314045/alimentos-e-bebidas/bomboniere)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil?ofertas.offset=30> (referer: http://www.americanas.com.br/linha/314080/alimentos-e-bebidas/alimentacao-infantil)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce?ofertas.offset=30> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
2016-10-06 23:25:16 [scrapy] DEBUG: Crawled (200) <GET http://www.americanas.com.br/produto/9800047/acucar-refinado-caixa-com-400-envelopes-x-5g-uniao-premium> (referer: http://www.americanas.com.br/linha/314082/alimentos-e-bebidas/mercearia-doce)
http://www.americanas.com.br/produto/121047374/barra-de-chocolate-ao-leite-lacta-150g-1-unidade
2016-10-06 23:25:16 [americanas] DEBUG: Título: Barra de Chocolate Ao leite Lacta 150g - 1 unidade - Americanas.com