How to manage filters on a website using scrapy

Date: 2017-12-11 19:59:17

Tags: python scrapy

I am trying to scrape data from a website that has a filter. The site is: https://www.lequipe.fr/Basket/RES_NBA.html

I have a simple spider that collects all the information I need, but only for the date currently displayed.

I need to iterate over the filter so that I can collect the data for every date available in it.

I would be very grateful if someone could help.

My spider looks like this:

# -*- coding: utf-8 -*-
import scrapy


class LequipeBotSpider(scrapy.Spider):
    name = 'Lequipe_bot'
    allowed_domains = ['www.lequipe.fr']
    start_urls = ['http://www.lequipe.fr/Basket/RES_NBA.html']

    # location of the CSV feed
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/lequipe2.csv'
    }

    def parse(self, response):
        # Extract the content using CSS selectors (6 fields per game, in page order)
        recap = response.css(".equipeDom a::text,div.score span.score--chiffre::text,.equipeExt a::text,div.equipeDom span.nba--ranking::text,div.equipeExt span.nba--ranking::text").extract()

        # Emit the extracted content row by row
        for x in range(0, len(recap) // 6):
            # dictionary holding the scraped info for one game
            scraped_info = {
                'equipe_dom': recap[1 + 6 * x],
                'score_dom': recap[2 + 6 * x],
                'score_ext': recap[3 + 6 * x],
                'equipe_ext': recap[4 + 6 * x],
                'classement_dom': recap[0 + 6 * x],
                'classement_ext': recap[5 + 6 * x],
            }

            # yield the scraped info to scrapy
            yield scraped_info

So, how can I use @furas's solution to iterate the scraping over all the pages?

Thanks in advance

1 Answer:

Answer 0 (score: 0)

All the dates in the select widget sit in an element with class=filtrecalendrier, and every option holds the URL of that date's detail page:

for x in response.xpath('//*[@class="filtrecalendrier"]/option/@value'): 
    print(x.extract())

Result:

/Basket/BasketResultat22420.html
/Basket/BasketResultat22421.html
/Basket/BasketResultat22422.html
...

Now you have to join them with https://www.lequipe.fr/

response.urljoin('/Basket/BasketResultat22420.html')

# https://www.lequipe.fr/Basket/BasketResultat22420.html

and then you can load that page to get the details.
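
Putting those two pieces together, the follow step inside parse() could look roughly like this (a minimal sketch; parse_items is a placeholder for whatever callback extracts the per-date details):

def parse(self, response):
    # each <option> of the date filter holds a relative detail-page URL
    for url in response.xpath('//*[@class="filtrecalendrier"]/option/@value').extract():
        # resolve the relative path against the current page URL and follow it
        yield scrapy.Request(response.urljoin(url), callback=self.parse_items)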

EDIT: working code

#!/usr/bin/env python3

#
# https://stackoverflow.com/a/47761077/1832058
#

import scrapy

class MySpider(scrapy.Spider):

    name = 'myspider'

    allowed_domains = ['www.lequipe.fr']

    start_urls = ['http://www.lequipe.fr/Basket/RES_NBA.html']

    def parse(self, response):
        print('url:', response.url)

        for item in response.xpath('//*[@class="filtrecalendrier"]/option'): 

            date = item.xpath('./text()').extract_first()
            url = item.xpath('./@value').extract_first()

            url = response.urljoin(url)

            # pass the date label to the detail-page callback via request meta
            yield scrapy.Request(url, callback=self.parse_items, meta={'date': date})


    def parse_items(self, response):
        rows = response.css('.ligne.bb-color')

        for row in rows:

            score = row.css('.score span::text').extract()
            # games that have not been played yet have no score; pad with empty strings
            if len(score) < 2:
                score = ['', '']

            item = {
                'date': response.meta['date'],
                'equipe_dom': row.css('.equipeDom a::text').extract_first(),
                'score_dom':  score[0],
                'score_ext':  score[1],
                'equipe_ext': row.css('.equipeExt a::text').extract_first(),
                'classement_dom': row.css('.equipeDom a span::text').extract_first(),
                'classement_ext': row.css('.equipeExt a span::text').extract_first(),
            }

            #print(item)

            yield item

# --- it runs without project and saves in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',

    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',     # csv, json, xml
    'FEED_URI': 'output.csv', # 
})
c.crawl(MySpider)
c.start()
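
Because the spider is started with CrawlerProcess, the whole thing can be saved as a single standalone file (no Scrapy project needed) and run directly with python; the rows for every date in the filter end up in output.csv.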