如何刷新页面后如何在循环中保存缓存?

时间:2015-09-15 10:47:25

标签: python selenium web-crawler scrapy

我正在使用这个蜘蛛点击颜色,然后页面被刷新然后随后点击链接但它间断并且抛出未在缓存中找到的元素 - 也许页面已经改变,因为它是查找错误如何在完成循环后获取原始页面?

无法为此找到合适的解决方案。

import scrapy
from scrapy.contrib.spiders import CrawlSpider
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.keys import Keys
from scrapy.selector import Selector
import time

class CompItem(scrapy.Item):
    model_name = scrapy.Field()
    model_link = scrapy.Field()
    url = scrapy.Field()
    what = scrapy.Field()
    seller = scrapy.Field()

class criticspider(CrawlSpider):
    name = "extract"
    allowed_domains = ["mysmartprice.com"]
    start_urls = ["http://www.mysmartprice.com/mobile/huawei-honor-holly-msp4857"]

    def __init__(self, *args, **kwargs):
        super(criticspider, self).__init__(*args, **kwargs)
        self.download_delay = 0.25
        self.browser = webdriver.Firefox()
        self.browser.maximize_window()

        self.browser.implicitly_wait(20)

    def parse_start_url(self, response):
        self.browser.get(response.url)

        # waiting for "Go to store" to become visible
        wait = WebDriverWait(self.browser, 10)
        wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div.store_pricetable")))

        main_window = self.browser.window_handles[0]

        # iterate over featured stores and visit them
        for i,store in enumerate(self.browser.find_elements_by_css_selector("div.store_pricetable")):
            link = store.find_element_by_css_selector("div.store_gostore > div.storebutton")
            ActionChains(self.browser).key_down(Keys.SHIFT).move_to_element(link).click(link).key_up(Keys.SHIFT).perform()

            # there is a popup preventing us to navigate to the store URL - close it 
            try:
                popup_close = self.browser.find_element_by_css_selector(".popup-closebutton")
                popup_close.click()

                # repeat the click
                ActionChains(self.browser).key_down(Keys.SHIFT).move_to_element(link).click(link).key_up(Keys.SHIFT).perform()
            except NoSuchElementException:
                pass

            button = self.browser.find_element_by_xpath('/html/body/div[3]/div/div[2]/div/div[2]/div/div[1]/div[4]/div/div[3]').click()

            #time.sleep(5)
            item = CompItem()
            sel = Selector(text=self.browser.page_source)
            item["what"] = "url"
            item["seller"] = response.xpath('//div[@class="store_rating_bar_out"]/@data-storename').extract()[i]
            item["model_name"] = sel.xpath('//span[contains(@itemprop,"brand")]/text()').extract()[0] +" "+sel.xpath('//span[contains(@itemprop,"name")]/text()').extract()[0] + sel.xpath('//span[contains(@class,"variant")]/text()').extract()[0]
            # shift+click on the "Go to Store" link


            # switch to the newly opened window, read the current url and close the window
            self.browser.switch_to.window(self.browser.window_handles[-1])

            # wait until "On your way to the store" would not be in title
            wait.until(lambda browser: "On your way to the Store" not in browser.title)

            item['url'] = self.browser.current_url
            yield item
            self.browser.close()

            # switch back to the main window
            self.browser.switch_to.window(main_window)
            self.browser.find_element_by_tag_name('body').send_keys(Keys.CONTROL + "r")
            wait = WebDriverWait(self.browser, 10)

0 个答案:

没有答案