如果只列出一个 URL，下面的代码可以按预期提取数据。但如果我输入两个网址（如下所示），它只会从第一个网址提取数据——而且提取了两次！有什么办法可以解决这个问题吗？请注意，为了缩短代码，我删除了一些选择器。
import scrapy
from scrapy_selenium import SeleniumRequest
from selenium import webdriver
class Spider(scrapy.Spider):
    """Scrape incident rows from Flashscore match-summary pages.

    Every start URL is requested through ``SeleniumRequest`` and the
    resulting response is handed to :meth:`parse`.
    """

    name = "match_summary"

    def start_requests(self):
        """Yield one ``SeleniumRequest`` per hard-coded match URL.

        ``dont_filter=True`` is passed so the requests are not dropped by
        the scheduler's duplicate filter.
        """
        urls = [
            'https://www.flashscore.com/match/v5GmqsWa/#match-summary',
            'https://www.flashscore.com/match/Wju9nz58/#match-summary',
        ]
        for url in urls:
            yield SeleniumRequest(url=url, callback=self.parse, dont_filter=True)

    def parse(self, response):
        """Yield one item dict per ``div.detailMS__incidentRow`` element.

        ``Match``, ``Round`` and ``Date`` are page-level values and are
        therefore identical for every item produced from one response;
        ``Time`` and ``OverTime`` are read from the individual row.
        All values are lists of strings (possibly empty).
        """
        # Page-level fields do not depend on the row, so query them once
        # instead of re-running the same XPath for every incident row.
        match_title = response.xpath('//title/text()').getall()
        round_name = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/div[1]/span[2]/a/text()'
        ).getall()
        match_date = response.xpath('//*[@id="utime"]/text()').getall()

        for incident_row in response.css('div.detailMS__incidentRow'):
            yield {
                # Copy the shared lists so each item owns its own values,
                # exactly as when they were queried per row.
                'Match': list(match_title),
                'Round': list(round_name),
                'Date': list(match_date),
                'Time': incident_row.css('div.time-box::text').getall(),
                'OverTime': incident_row.css('div.time-box-wide::text').getall(),
            }
答案 0 :(得分:0)
请注意：请勿将您的类命名为 Spider，它与库自带的 Spider 类冲突，并且可能会使您的脚本出现混乱。
您是否尝试过使用 start_urls 而不是 start_requests 方法？
class MySpider(scrapy.Spider):
    """Scrape incident rows from Flashscore match-summary pages.

    The pages to crawl are declared through the ``start_urls`` class
    attribute rather than by overriding ``start_requests``.
    """

    name = 'match_summary'
    start_urls = [
        'https://www.flashscore.com/match/v5GmqsWa/#match-summary',
        'https://www.flashscore.com/match/Wju9nz58/#match-summary',
    ]

    def parse(self, response):
        """Yield one item dict per ``div.detailMS__incidentRow`` element.

        ``Match``, ``Round`` and ``Date`` come from the page as a whole and
        are the same for every item of one response; ``Time`` and
        ``OverTime`` come from the individual row. All values are lists of
        strings (possibly empty).
        """
        # These three lookups are independent of the row being processed,
        # so evaluate them once per response rather than once per row.
        page_title = response.xpath('//title/text()').getall()
        page_round = response.xpath(
            '/html/body/div[2]/div[1]/div[2]/div[1]/span[2]/a/text()'
        ).getall()
        page_date = response.xpath('//*[@id="utime"]/text()').getall()

        for row in response.css('div.detailMS__incidentRow'):
            yield {
                # Fresh list per item: items must not share mutable state.
                'Match': list(page_title),
                'Round': list(page_round),
                'Date': list(page_date),
                'Time': row.css('div.time-box::text').getall(),
                'OverTime': row.css('div.time-box-wide::text').getall(),
            }