我需要有关硒>硒的帮助
Selenium仅解析最后50页数据,但我需要所有页面数据。我收到了50条抓取的数据,但应该是304。**
我也尝试过传递meta={'s':s} from "lists"
但结果相同。这次使用meta={'lists':lists}
# -*- coding: utf-8 -*-
from time import sleep
from scrapy import Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
class AthSpider(Spider):
name = 'ath'
def start_requests(self):
self.driver = webdriver.Chrome()
self.driver.get('https://www.athlinks.com/event/127711/results/Event/828080/Results')
sleep(20)
self.driver.find_element_by_xpath('//*[@class="view-all-results"]').click()
sleep(20)
sel = Selector(text=self.driver.page_source)
lists=sel.xpath('//*[@class="row mx-0 link-to-irp"]')
for s in lists:
yield Request(self.driver.current_url,meta={'lists':lists},callback=self.parse_page)
while True:
try:
next_page=self.driver.find_element_by_xpath("//button[contains(text(),'>')]")
next_page.click()
sleep(20)
sel = Selector(text=self.driver.page_source)
lists=sel.xpath('//*[@class="row mx-0 link-to-irp"]')
for s in lists:
yield Request(self.driver.current_url,meta={'lists':lists},callback=self.parse_page)
except NoSuchElementException:
self.logger.info('No more pages to load.')
self.driver.quit()
break
def parse_page(self, response):
lists=response.meta['lists']
lis=lists.xpath('//*[@class="row mx-0 link-to-irp"]')
for s in lis:
Name=s.xpath('.//*[@class="athName"]//text()').extract_first()
Gender=s.xpath('.//*[@class="col-12 pl-0"]//text()').extract_first()
if Gender:
Gender=Gender.split()[0]
Bib=s.xpath('.//*[@class="col-12 pl-0"]//span[2]//text()').extract_first()
if Bib:
Bib=Bib.split()[-1]
City=s.xpath('.//*[@id="location"]//text()').extract_first()
Pace=s.xpath('.//*[@class="col px-0"]//div[1]//text()').extract_first()
Time=s.xpath('.//*[@class="col-2 px-0"]//text()').extract_first()
yield {
'Name':Name,
'Gender':Gender,
'Bib':Bib,
'City':City,
'Pace':Pace,
'Time':Time
}