使用Selenium和Scrapy抓取动态页面,但无法保存item

时间:2018-06-09 05:17:41

标签: python selenium scrapy

import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from ..items import MmklabelsItem
from selenium.common.exceptions import TimeoutException


class MmkSpider(scrapy.Spider):
    """Scrapy spider registered under the name ``MMK``.

    ``headers`` carries a desktop-Chrome User-Agent value for the spider's
    requests.
    """

    name = 'MMK'
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/53.0.2785.143 Safari/537.36'
        ),
    }

def start_requests(self):
    """Yield the initial request, sent with the spider's custom headers.

    NOTE(review): the code that builds ``url`` was elided from this listing
    (the placeholder line below), so the origin of the URL cannot be seen here.
    """
    ……
    # dont_filter=False is passed explicitly; presumably this keeps Scrapy's
    # duplicate-request filtering active for the request -- confirm in the docs.
    yield Request(url, dont_filter=False, headers=self.headers)

def __init__(self, **kwargs):
    """Create the spider together with the Chrome browser it drives.

    Args:
        **kwargs: Forwarded unchanged to the parent initializer.
    """
    super().__init__(**kwargs)
    # Raw string: "\c" is not a valid escape sequence, so the plain literal
    # only worked by accident and triggers a warning on newer Python versions.
    self.driver = webdriver.Chrome(r'F:\chromedriver\chromedriver')
    # Explicit wait bound to the driver above, with a 10 second timeout.
    self.wait = WebDriverWait(self.driver, 10)

def closed(self, reason):
    """Quit the browser when Scrapy closes the spider.

    Scrapy calls ``closed(reason)`` on the spider when the crawl ends; without
    this hook the Chrome process started in ``__init__`` is never released.

    Args:
        reason: Close reason supplied by Scrapy (unused).
    """
    self.driver.quit()

def parse(self, response):
    """Render the page in the browser and yield one item per program link.

    The response URL is re-opened in the Selenium driver, the page is scrolled
    until it stops growing, and every program name found is paired with the
    page heading.

    Args:
        response: The Scrapy response; only ``response.url`` is used.

    Yields:
        MmklabelsItem: A fresh item per program, with ``programs`` set to the
        program name and ``Mmklabel`` set to the page heading text.
    """
    self.driver.get(response.url)
    self.scroll_until_loaded()
    pros_list = self.driver.find_elements_by_xpath(".//div[@class='jzfl']/li/div[@class='jm_mingcheng']/a")
    labs_list = self.driver.find_elements_by_xpath(".//div[@class='dqwz_text']/h1")

    # Read every text eagerly, before the first yield: the element handles
    # belong to the page currently loaded in the shared driver.
    programs = []
    for pro in pros_list:
        print(pro.text)
        programs.append(pro.text)

    if not programs:
        return
    if not labs_list:
        # Previously this case crashed with IndexError on labs_list[0].
        self.logger.warning("No heading found on %s; skipping its programs", response.url)
        return
    label = labs_list[0].text

    for program in programs:
        # Build a NEW item on every iteration. Re-yielding one shared instance
        # makes all yielded items alias the same object, so later assignments
        # overwrite the values of items that were already handed to Scrapy.
        item = MmklabelsItem()
        item['programs'] = program
        item['Mmklabel'] = label
        yield item
        print(item)

def scroll_until_loaded(self):
    """Scroll to the bottom repeatedly until the page height stops growing.

    After each scroll the method waits (using ``self.wait``) for the document
    height to exceed the last recorded height; when that wait times out the
    scrolling stops.
    """
    height_script = "return document.body.scrollHeight;"
    scroll_script = "window.scrollTo(0, document.body.scrollHeight);"

    last_height = self.driver.execute_script(height_script)

    def page_grew(_driver):
        # Compares against whatever height was recorded most recently.
        return self.driver.execute_script(height_script) > last_height

    while True:
        self.driver.execute_script(scroll_script)
        try:
            self.wait.until(page_grew)
            last_height = self.driver.execute_script(height_script)
        except TimeoutException:
            # No growth within the wait timeout: treat the page as fully loaded.
            return

我是Python新手。“print(pro.text)”的输出表明我应该已经成功提取到了想要的每个元素,但这些元素却没能保存到item中。我已经尽了很大努力来简化这段代码。

    # Collect the text of every matched program link.
    program = []
    for pro in pros_list:
        print(pro.text)
        program.append(pro.text)
    # NOTE(review): only one MmklabelsItem is created here; every pass of the
    # loop below overwrites its fields and yields that same object again.
    item = MmklabelsItem()
    Program = []
    Label = []
    for k in range(len(program)):
        Program = program[k]
        Label = labs_list[0].text
        item['programs'] = Program
        item['Mmklabel'] = Label
        yield item
        print(item)
“print(item)”的输出表明,列表“program”中只有一部分元素被保存了下来。我想知道是什么导致了这个问题,以及该如何解决。非常感谢您的阅读;如有需要,我可以提供更多详细信息。由于我对编程了解甚少,很难准确地描述我的问题。

0 个答案:

没有答案