I need to apply a FormRequest [from here][1]:
request = FormRequest.from_response(
    response,
    formname='frmSearch',
    formdata={'classtype': 'of'},
    # callback=self.parse_links,
    dont_filter=True,
)
to the links in start_urls as well as to all pages that I get from the rules of the CrawlSpider:
from scrapy import Request, Selector
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
# AnbieterItem is the project's Item class from items.py


class QuokaSpider(CrawlSpider):
    name = 'quoka'
    allowed_domains = ['www.quoka.de']
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen/']
    curr_page = 0

    rules = (
        Rule(LinkExtractor(allow=r'.+',
                           restrict_xpaths=[u'//li[@class="arr-rgt active"]']),
             follow=True, callback='parse_links'),
    )

    def _url(self, url):
        return 'http://www.quoka.de' + url

    def parse_links(self, response):
        hxs = Selector(response)
        lnks = hxs.xpath('//a[contains(@class, "img-lmtr") and contains(@class, "multi")'
                         ' or contains(@class, "single")]/@href').extract()
        filters = hxs.xpath(u'//div[@class="modal-title"]/text()').extract()
        for fil in filters:
            print "!!!!!!!!!!" + fil + "!!!!!!!!!!"
        for url in lnks:
            request = Request(self._url(url), callback=self.parse_object)
            yield request

    def parse_object(self, response):
        item = AnbieterItem()
        hxs = Selector(response)
        item['Beschreibung'] = hxs.xpath(u'//div[@class="text"]/text()').extract()
        # item['Kleinanzeigen_App'] = '1'
        # item['Preis'] = '1'
        return item
If I try to use start_requests to apply the filter, the spider no longer visits the pages from the rules.
How can I fix this and apply the filter both to the start URLs and to the URLs collected by the rules?
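My start_requests attempt looked roughly like this (a simplified sketch, not the exact code; apply_filter is just a placeholder name):

    # needs: from scrapy import FormRequest
    # Inside QuokaSpider -- the filter is applied once, but because the form
    # response goes to a custom callback instead of CrawlSpider's built-in
    # parse(), the rules never get a chance to follow the pagination links.
    def start_requests(self):
        for url in self.start_urls:
            yield Request(url, callback=self.apply_filter, dont_filter=True)

    def apply_filter(self, response):  # placeholder helper name
        yield FormRequest.from_response(
            response,
            formname='frmSearch',
            formdata={'classtype': 'of'},
            callback=self.parse_links,
            dont_filter=True,
        )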
Answer 0 (score: 1):
I don't know how to combine CrawlSpider Rules with FormRequest, but I'd like to suggest that you replace the CrawlSpider with a generic Spider and create the Requests manually.

The Rule in your code only takes care of following the pagination (as far as I can see). To replace that, you could use something like the following code sample:
import scrapy


class TestSpider(scrapy.Spider):
    name = 'quoka'
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']

    def parse(self, response):
        # Submit the search form to apply the 'classtype' filter
        request = scrapy.FormRequest.from_response(
            response,
            formname='frmSearch',
            formdata={'classtype': 'of'},
            callback=self.parse_filtered
        )
        print request.body
        yield request

    def parse_filtered(self, response):
        resultList = response.xpath('//div[@id="ResultListData"]/ul/li')
        for resultRow in resultList:
            xpath_Result_Details = './/div[@class="q-col n2"]/a'
            # Check if row has details
            if resultRow.xpath(xpath_Result_Details):
                result_Details = resultRow.xpath(xpath_Result_Details)
                # If YES extract details
                title = result_Details.xpath('./@title').extract()
                href = result_Details.xpath('./@href').extract()[0]
                # Code to request detail pages goes here ...
                print title, href

        # Use this instead of CrawlSpider to follow the pagination links
        xpath_NextPage = '//div[@class="rslt-pagination"]//li[@class="arr-rgt active"]/a'
        if response.xpath(xpath_NextPage):
            nextPage_href = response.xpath(xpath_NextPage + '/@href').extract()[0]
            nextPage_url = 'http://www.quoka.de/immobilien/bueros-gewerbeflaechen' + nextPage_href
            nextPage_num = response.xpath(xpath_NextPage + '/@data-qng-page').extract()[0]
            # request = scrapy.Request(nextPage_url, callback=self.parse_filtered)
            # Create request with formdata ...
            request = scrapy.FormRequest.from_response(
                response,
                formname='frmNaviSearch',
                formdata={'pageno': nextPage_num},
                callback=self.parse_filtered
            )
            yield request
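Where the comment says "Code to request detail pages goes here ...", a minimal sketch for that spot could look like this, assuming you copy parse_object (and AnbieterItem) over from the question's spider:

    # inside TestSpider.parse_filtered, replacing the print statement:
    if resultRow.xpath(xpath_Result_Details):
        result_Details = resultRow.xpath(xpath_Result_Details)
        href = result_Details.xpath('./@href').extract()[0]
        # Follow each detail page; parse_object is assumed to be the
        # item-building callback taken from the question's spider.
        yield scrapy.Request(response.urljoin(href), callback=self.parse_object)

response.urljoin resolves the relative href against the current page, which avoids hard-coding the 'http://www.quoka.de' prefix.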