我尝试用硒刮擦以下站点的结果。 正在提取其他网站的第一个URL。 其次,每个站点都打开。 现在,我从下拉菜单中多次选择选项以获取所有数据。 每个应该有48个不同的结果,但我只能得到第一个。
我在consol端运行代码,将结果导出到csv: (抓取抓取工具-o example.csv
# -*- coding: utf-8 -*-
from time import sleep
from scrapy im
port Spider
from selenium import webdriver
from scrapy.selector import Selector
from scrapy.http import Request
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support.ui import Select
class KostenSpider(Spider):
name = 'kosten'
allowed_domains = ['www.adac.de']
start_urls = ['http://www.adac.de/']
def start_requests(self):
self.driver = webdriver.Chrome('/Users/david/Desktop/adacfinal/chromedriver')
self.driver.get('https://www.adac.de/infotestrat/autodatenbank/autokosten/autokosten-rechner/default.aspx')
sleep(1)
start_page = self.driver.find_element_by_xpath('//*[@id="wucAutokostenrechner-12"]')
start_page.click()
sleep(1)
select = Select(self.driver.find_element_by_id('ctl00_ctl00_cphContentRow_cphROWBreiterTopContent_cphBreiterTopContent_wucAutokostenrechner1_ddlPageSize'))
select.select_by_visible_text('50')
sleep(1)
while True:
try:
sel = Selector(text=self.driver.page_source)
books = sel.xpath('//*[@class="tal"]//@href').extract()
monate = ["24 Monate","36 Monate","48 Monate","60 Monate"]
fahrleistung = ["5.000 km","10.000 km","15.000 km","20.000 km","25.000 km","30.000 km","35.000 km","40.000 km","45.000 km","50.000 km","55.000 km","60.000 km"]
for book in books:
url = 'https://www.adac.de' + book
self.driver.get(url)
sleep(1)
for monat in monate:
select = Select(self.driver.find_element_by_id('ctl00_ctl00_cphContentRow_cphROW2spaltigerContent_cphMittelContent_wucAutodatenbankDetail1_DropDownListHaltedauer'))
select.select_by_visible_text(monat)
sleep(2)
for leistung in fahrleistung:
select = Select(self.driver.find_element_by_id('ctl00_ctl00_cphContentRow_cphROW2spaltigerContent_cphMittelContent_wucAutodatenbankDetail1_DropDownListFahrleistung'))
select.select_by_visible_text(leistung)
sleep(2)
start_page = self.driver.find_element_by_xpath('//*[@id="ctl00_ctl00_cphContentRow_cphROW2spaltigerContent_cphMittelContent_wucAutodatenbankDetail1_ImageButtonSubmit"]')
start_page.click()
sleep(2)
sel = Selector(text=self.driver.page_source)
bezeichnung = sel.xpath('//*[@class="pb11"]/text()').extract()
proMonat = sel.xpath('//*[@id="wucAutodatenbankDetail-29"]/text()').extract()
yield Request(url,
callback=self.parse_book,
meta={
'monate': monat,
'leistung': leistung,
'url': url,
'bezeichnung': bezeichnung,
'proMonat': proMonat,
})
self.driver.get('https://www.adac.de/infotestrat/autodatenbank/autokosten/autokosten-rechner/default.aspx')
sleep(1)
start_page = self.driver.find_element_by_xpath('//*[@id="wucAutokostenrechner-12"]')
start_page.click()
sleep(1)
next_page = self.driver.find_element_by_xpath('//*[@id="ctl00_ctl00_cphContentRow_cphROWBreiterTopContent_cphBreiterTopContent_wucAutokostenrechner1_ResultListPagerKosten_lnkPageNext"]')
sleep(1)
next_page.click()
except NoSuchElementException:
self.logger.info('No more pages to load.')
self.driver.quit()
break
def parse_book(self, response):
url = response.meta['url']
bezeichnung = response.meta['bezeichnung']
proMonat = response.meta['proMonat']
monate = response.meta['monate']
leistung = response.meta['leistung']
yield {
'bezeichnung': bezeichnung,
'monate': monate,
'fahrleistung': leistung,
'proMonat': proMonat,
'url': url,
}
在第34行中抓取的每个url应该有几个数据点,因为在接下来的2个For-Loop(第43/49行)中,通过按selenium单击不同的下拉菜单,数据会多次更改。 不幸的是,每个URL的最终csv只有一个结果。