Python Selenium:如何在向下滚动后获取更新的HTML DOM?

时间:2016-08-01 04:50:36

标签: python selenium

我正在访问已实现视差滚动的page。我使用代码滚动底部但BeautifulSoup它没有获取更新的DOM。代码如下:

import requests
from bs4 import BeautifulSoup
from gensim.summarization import summarize

from selenium import webdriver
from datetime import datetime
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.keys import Keys
from time import sleep
import sys
import os
import xmltodict
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import traceback
import random

driver = None
driver = webdriver.Firefox()
driver.maximize_window()
def fetch_links(tag):
    links = []
    url = 'https://steemit.com/trending/'+tag
    driver.get(url)
    html = driver.page_source
    sleep(4)

    soup = BeautifulSoup(html,'lxml')
    entries = soup.select('.entry-title > a')
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(5)
    entries = soup.select('.entry-title > a')
    for e in entries:
        if e['href'].strip() not in entries:
            links.append(e['href'])
    return links

1 个答案:

答案 0 :(得分:2)

滚动窗口后,您可能需要解析页面:

driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

sleep(5)

soup = BeautifulSoup(driver.page_source, 'lxml')
entries = soup.select('.entry-title > a')