import scrapy
from scrapy import Request
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait

from ..items import MmklabelsItem


class MmkSpider(scrapy.Spider):
    name = 'MMK'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/53.0.2785.143 Safari/537.36',
    }

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # Raw string so the backslashes in the Windows path are not
        # interpreted as escape sequences.
        self.driver = webdriver.Chrome(r'F:\chromedriver\chromedriver')
        self.wait = WebDriverWait(self.driver, 10)

    def start_requests(self):
        # …… (start URLs elided in the original post)
        yield Request(url, dont_filter=False, headers=self.headers)

    def parse(self, response):
        self.driver.get(response.url)
        self.scroll_until_loaded()
        # self.driver.implicitly_wait(30)
        pros_list = self.driver.find_elements_by_xpath(
            ".//div[@class='jzfl']/li/div[@class='jm_mingcheng']/a")
        labs_list = self.driver.find_elements_by_xpath(
            ".//div[@class='dqwz_text']/h1")
        program = []
        for pro in pros_list:
            print(pro.text)
            program.append(pro.text)
        item = MmklabelsItem()
        Program = []
        Label = []
        for k in range(len(program)):
            Program = program[k]
            Label = labs_list[0].text
            item['programs'] = Program
            item['Mmklabel'] = Label
            yield item
            print(item)

    def scroll_until_loaded(self):
        # Keep scrolling to the bottom until the document height stops
        # growing, i.e. no more content is being lazy-loaded.
        check_height = self.driver.execute_script("return document.body.scrollHeight;")
        while True:
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                self.wait.until(
                    lambda driver: self.driver.execute_script(
                        "return document.body.scrollHeight;") > check_height)
                check_height = self.driver.execute_script("return document.body.scrollHeight;")
            except TimeoutException:
                break
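
For completeness: the spider imports MmklabelsItem from the project's items.py, which is not shown in the post. A minimal sketch of what that class presumably looks like, assuming only the two fields the spider assigns ('programs' and 'Mmklabel'):

    # items.py -- minimal sketch; the class was not shown in the original post
    import scrapy

    class MmklabelsItem(scrapy.Item):
        programs = scrapy.Field()   # one programme name per yielded item
        Mmklabel = scrapy.Field()   # the page heading shared by all items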
I am new to Python. The output of "print(pro.text)" suggests that I have successfully extracted every element I want; however, the elements are not saved into the item. I have already put a lot of effort into simplifying this code:
program = []
for pro in pros_list:
    print(pro.text)
    program.append(pro.text)
item = MmklabelsItem()
Program = []
Label = []
for k in range(len(program)):
    Program = program[k]
    Label = labs_list[0].text
    item['programs'] = Program
    item['Mmklabel'] = Label
    yield item
    print(item)
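
For reference, the same pairing can also be written with one fresh MmklabelsItem per programme, reading the shared heading once before the loop. This is only a sketch of the loop structure (field names taken from the spider above), not a confirmed fix for the saving problem:

    label = labs_list[0].text       # the heading is identical for every programme
    for pro in pros_list:
        item = MmklabelsItem()      # a new item instance per programme
        item['programs'] = pro.text
        item['Mmklabel'] = label
        yield item

Running scrapy crawl MMK -o items.json is a quick way to see what actually gets exported.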