我的方法parse_adf_info永远不会被调用,我不知道为什么。没有错误发生。我想获取每个广告的链接(解析)并逐个转到广告(parse_ads_urls)和抓取数据(parse_ads_info),但这个方法永远不会被调用。
这是我的代码:
# -*- coding: utf-8 -*-
from scrapy import Request, Spider
#from zapimoveis.items import ads_info
from scrapy.selector import Selector
#from scrapy.loader import ItemLoader
proxy_list = ["###","###"]
PROXY = "###"
class AdsSpider(Spider):
name = "zapimoveis"
allowed_domains = ["https://www.zapimoveis.com.br/", "https://www.zapimoveis.com.br/oferta/"]
def __init__(self, start_url='', *args, **kwargs):
super(AdsSpider, self).__init__(*args, **kwargs)
self.start_urls = []
self.start_urls.append(start_url)
self.json = '#{"precomaximo":"2147483647","parametrosautosuggest":[{"B\
airro":"JD CAMBURI","Zona":"","Cidade":"VITORIA","Agrupame\
nto":"","Estado":"ES"}],"pagina":"%d","ordem":"DataAtualiz\
acao","paginaOrigem":"ResultadoBusca","semente":"213739135\
0","formato":"Lista"}'
def start_requests(self):
rq = Request(url=self.start_urls[0], callback=self.parse)
rq.meta['proxy'] = PROXY
yield rq
def parse(self, response):
n_pages = response.css('span[class="pull-right num-of"]::text') \
.extract_first()
n_pages = int(n_pages.replace("de ", ""))
for i in range(1, n_pages+1):
rq = Request(url=self.start_urls[0]+(self.json % i),
callback=self.parse_ads_urls, dont_filter=True)
rq.meta['proxy'] = PROXY
yield rq
def parse_ads_urls(self,response):
for article in response.css('article[class=minificha]'):
url_to_ads = article.css('a[class=btn-ver-detalhes]::attr(href)')\
.extract_first()
rq2 = Request(url=url_to_ads, callback=self.parse_ads_info,
dont_filter=True)
rq2.meta['proxy'] = proxy_list[0]
yield rq2
def parse_ads_info(self, response):
print "#--------->"
print response.css('span[class=value-ficha]::text').extract_first()
我删除了我的个人代理。
(2017-06-06)编辑1: 输出日志:https://pastebin.com/4jv2r9um