I'm trying to scrape data from a website that has a date filter. The site is: https://www.lequipe.fr/Basket/RES_NBA.html
I have a simple spider that collects all the information I need, but only for the date currently displayed.
I need to iterate over the filter so I can collect the data for every date available in it.
If anyone can help, I'd be very grateful.
My spider looks like this:
# -*- coding: utf-8 -*-
import scrapy


class LequipeBotSpider(scrapy.Spider):
    name = 'Lequipe_bot'
    allowed_domains = ['www.lequipe.fr']
    start_urls = ['http://www.lequipe.fr/Basket/RES_NBA.html']

    # location of the CSV output file
    custom_settings = {
        'FEED_FORMAT': "csv",
        'FEED_URI': 'tmp/lequipe2.csv'
    }

    def parse(self, response):
        # Extract the content using CSS selectors (6 fields per match)
        recap = response.css(".equipeDom a::text,div.score span.score--chiffre::text,.equipeExt a::text,div.equipeDom span.nba--ranking::text,div.equipeExt span.nba--ranking::text").extract()
        # Yield the extracted content row by row
        for x in range(len(recap) // 6):  # integer division, so range() gets an int
            # create a dictionary to store the scraped info
            scraped_info = {
                'equipe_dom': recap[1 + 6 * x],
                'score_dom': recap[2 + 6 * x],
                'score_ext': recap[3 + 6 * x],
                'equipe_ext': recap[4 + 6 * x],
                'classement_dom': recap[0 + 6 * x],
                'classement_ext': recap[5 + 6 * x],
            }
            # yield the scraped info to Scrapy
            yield scraped_info
So, how can I use @furas's solution to iterate the scraping over all pages? Thanks in advance.
Answer 0 (score: 0)
All the dates sit in a select widget with class=filtrecalendrier, and each option carries the URL of a detail page:
for x in response.xpath('//*[@class="filtrecalendrier"]/option/@value'):
    print(x.extract())
Result:
/Basket/BasketResultat22420.html
/Basket/BasketResultat22421.html
/Basket/BasketResultat22422.html
...
Now you have to join these paths with https://www.lequipe.fr/:
response.urljoin('/Basket/BasketResultat22420.html')
# https://www.lequipe.fr/Basket/BasketResultat22420.html
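Note that response.urljoin() resolves the relative path against the URL of the page that was just parsed, so there is no need to hard-code the https://www.lequipe.fr/ prefix.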
Then you can load each of those pages to get the details.
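The key is to carry each option's date text along with its request, so the item callback can label every row with the right date; Scrapy's Request meta dict handles that. A minimal sketch of the pattern (the complete spider follows below):

# inside parse(): one request per date, with the date text riding along in meta
for option in response.xpath('//*[@class="filtrecalendrier"]/option'):
    url = response.urljoin(option.xpath('./@value').extract_first())
    date = option.xpath('./text()').extract_first()
    yield scrapy.Request(url, callback=self.parse_items, meta={'date': date})

In the callback, response.meta['date'] then gives back the date that was attached to the request.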
EDIT: working code:
#!/usr/bin/env python3
#
# https://stackoverflow.com/a/47761077/1832058
#

import scrapy


class MySpider(scrapy.Spider):

    name = 'myspider'

    allowed_domains = ['www.lequipe.fr']

    start_urls = ['http://www.lequipe.fr/Basket/RES_NBA.html']

    def parse(self, response):
        print('url:', response.url)

        # one <option> per date in the calendar filter
        for item in response.xpath('//*[@class="filtrecalendrier"]/option'):
            date = item.xpath('./text()').extract_first()
            url = item.xpath('./@value').extract_first()
            url = response.urljoin(url)
            yield scrapy.Request(url, callback=self.parse_items, meta={'date': date})

    def parse_items(self, response):
        rows = response.css('.ligne.bb-color')
        for row in rows:
            score = row.css('.score span::text').extract()
            # games not yet played have no score
            if len(score) < 2:
                score = ['', '']
            item = {
                'date': response.meta['date'],
                'equipe_dom': row.css('.equipeDom a::text').extract_first(),
                'score_dom': score[0],
                'score_ext': score[1],
                'equipe_ext': row.css('.equipeExt a::text').extract_first(),
                'classement_dom': row.css('.equipeDom a span::text').extract_first(),
                'classement_ext': row.css('.equipeExt a span::text').extract_first(),
            }
            #print(item)
            yield item


# --- it runs without a project and saves the items in `output.csv` ---

from scrapy.crawler import CrawlerProcess

c = CrawlerProcess({
    'USER_AGENT': 'Mozilla/5.0',
    # save in file as CSV, JSON or XML
    'FEED_FORMAT': 'csv',  # csv, json, xml
    'FEED_URI': 'output.csv',
})
c.crawl(MySpider)
c.start()
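Save the script as, say, myspider.py (the name is arbitrary) and run it with python myspider.py; no Scrapy project is needed, and the items land in output.csv. A quick way to sanity-check the result, assuming the crawl produced rows:

import csv

# print the first few scraped rows to verify the CSV came out as expected
with open('output.csv', newline='', encoding='utf-8') as f:
    for i, row in enumerate(csv.DictReader(f)):
        print(row['date'], row['equipe_dom'], row['score_dom'],
              row['score_ext'], row['equipe_ext'])
        if i >= 4:
            break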