Combining FormRequest and CrawlSpider

Date: 2015-06-30 09:41:57

Tags: python web-scraping scrapy

I need to apply this FormRequest ([from here][1]):

request = FormRequest.from_response(
    response,
    formname='frmSearch',
    formdata={'classtype': 'of'},
    # callback=self.parse_links,
    dont_filter=True,
)

to the links in start_urls and to all of the pages I get from the rules in the CrawlSpider.

from scrapy.http import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.selector import Selector
from scrapy.spiders import CrawlSpider, Rule

from ..items import AnbieterItem  # defined in the project's items.py


class QuokaSpider(CrawlSpider):
    name = 'quoka'
    allowed_domains = ['www.quoka.de']
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen/']
    curr_page = 0

    # Follow the pagination links and parse every result page.
    rules = (
        Rule(LinkExtractor(allow=(r'.+',), restrict_xpaths=[u'//li[@class="arr-rgt active"]']),
             follow=True, callback='parse_links'),
    )

    def _url(self, url):
        return 'http://www.quoka.de' + url

    def parse_links(self, response):
        hxs = Selector(response)
        lnks = hxs.xpath('//a[contains(@class, "img-lmtr") and contains(@class, "multi") or contains(@class, "single")]/@href').extract()

        filters = hxs.xpath(u'//div[@class="modal-title"]/text()').extract()
        for fil in filters:
            print "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!" + fil + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!"
        for url in lnks:
            request = Request(self._url(url), callback=self.parse_object)
            yield request

    def parse_object(self, response):
        item = AnbieterItem()
        hxs = Selector(response)
        item['Beschreibung'] = hxs.xpath(u'//div[@class="text"]/text()').extract()
        # item['Kleinanzeigen_App'] = '1'
        # item['Preis'] = '1'

        return item

If I try to use start_requests to apply the filter, the spider does not crawl the pages from the rules.
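
Roughly, that attempt looks like the following sketch (a reconstruction, not the exact code; apply_filter is a placeholder name). It also shows the underlying problem: CrawlSpider runs its rules inside its default parse() callback, so responses routed to a custom callback never pass through the link extractor:

# Reconstructed sketch of the start_requests attempt
# (needs: from scrapy.http import FormRequest)

def start_requests(self):
    # Fetch each start URL manually so the filter form can be submitted.
    for url in self.start_urls:
        yield Request(url, callback=self.apply_filter)

def apply_filter(self, response):  # hypothetical helper name
    # The filter form is submitted, but the response goes straight to
    # parse_links.  CrawlSpider only applies its rules to responses
    # handled by the default parse() callback, so the pagination links
    # on the filtered pages are never followed.
    yield FormRequest.from_response(
        response,
        formname='frmSearch',
        formdata={'classtype': 'of'},
        callback=self.parse_links,
        dont_filter=True,
    )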

How can I fix this and apply the filter both to the start URLs and to the URLs from the rules?

1 answer:

Answer 0 (score: 1)

I don't know how to combine CrawlSpider rules with FormRequest, but I'd like to suggest replacing the CrawlSpider with a generic Spider and creating the requests manually.

As far as I can see, the Rule in your code only takes care of following the pagination. To replace it, you could use something like the following code sample:

import scrapy

class TestSpider(scrapy.Spider):

    name = 'quoka'
    start_urls = ['http://www.quoka.de/immobilien/bueros-gewerbeflaechen']

    def parse(self, response):

        request = scrapy.FormRequest.from_response(
            response,
            formname='frmSearch',
            formdata={'classtype': 'of'},
            callback=self.parse_filtered
        )
        print request.body
        yield request

    def parse_filtered(self, response):

        resultList = response.xpath('//div[@id="ResultListData"]/ul/li')
        for resultRow in resultList:
            xpath_Result_Details = './/div[@class="q-col n2"]/a'
            # Check if row has details
            if resultRow.xpath(xpath_Result_Details):
                result_Details = resultRow.xpath(xpath_Result_Details)
                # If YES extract details
                title = result_Details.xpath('./@title').extract()
                href = result_Details.xpath('./@href').extract()[0]
                # Code to request detail pages goes here ...
                print title, href

        # Use this instead of CrawlSpider to follow the pagination links
        xpath_NextPage = '//div[@class="rslt-pagination"]//li[@class="arr-rgt active"]/a'
        if response.xpath(xpath_NextPage):

            nextPage_href = response.xpath(xpath_NextPage + '/@href').extract()[0]
            nextPage_url = 'http://www.quoka.de/immobilien/bueros-gewerbeflaechen' + nextPage_href
            nextPage_num = response.xpath(xpath_NextPage + '/@data-qng-page').extract()[0]

            # request = scrapy.Request(nextPage_url, callback=self.parse_filtered)

            # Create request with formdata ...
            request = scrapy.FormRequest.from_response(
                response,
                formname='frmNaviSearch',
                formdata={'pageno': nextPage_num},
                callback=self.parse_filtered
            )

            yield request
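
The placeholder in parse_filtered ("Code to request detail pages goes here ...") could be filled in along the lines of the question's own spider. This is only a sketch; parse_object and the absolute-URL prefix are taken from the question, not from this answer:

                # Hypothetical completion of the placeholder above:
                # request each detail page and hand it to the question's
                # parse_object callback.
                yield scrapy.Request('http://www.quoka.de' + href,
                                     callback=self.parse_object)

Because parse_filtered is a generator, the detail-page requests yielded inside the result loop and the pagination FormRequest yielded at the end are scheduled side by side.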