Why does Python Selenium so often fail to load pages?

Asked: 2018-10-31 14:24:13

Tags: python selenium web-scraping

This is more a question to aid my understanding (and soothe my frustration) than one about how to fix it, but as the title says: why does loading a URL/page in Selenium (Python in my case) so often fail and throw a NoSuchElementException? I understand that, just as with normal browsing, a web page sometimes fails to load. But I am finding that 25%-50% of my attempts to load a URL/page do not work within a 30-second timeout, so I have had to retry up to 10 times, increasing the wait between attempts, before the URL/page finally loads.
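For reference, both scripts below import Selenium's WebDriverWait and expected_conditions but never use them. A minimal sketch of an explicit wait, which polls for the element instead of sleeping a fixed time, might look like this (the XPath is the one used in the first script; fetch_results is a hypothetical helper, not part of my code):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def fetch_results(driver, url, timeout=30):
    """Load url and wait up to `timeout` seconds for the result container to appear."""
    driver.get(url)
    # Selenium polls the DOM every 500 ms until the element exists or the wait times out
    element = WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.XPATH, '//*[@class="result-set-container "]'))
    )
    return element.get_attribute("outerHTML")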

Any help in understanding this would be greatly appreciated.

Thanks in advance for any explanation.

Example code

The site I am currently trying is https://www.carsales.com.au

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
import mysql.connector
import time
import datetime

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1920, 1080))
display.start()

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-setuid-sandbox")
driver = webdriver.Chrome(chrome_options=chrome_options)

con = mysql.connector.connect(*****)
cursor = con.cursor()

sql_user_searches = "****"
cursor.execute(sql_user_searches)
searches = cursor.fetchall()

for z in searches:
    offset = 0
    url = "https://www.carsales.com.au/cars/{0}/{1}/".format(z[2],z[4],offset)
    sleep_time = 5
    num_retries = 100
    error = 0
    for loopingcow in range(0, num_retries):  
        try:
            error = 0
            driver.get(url)
            time.sleep(sleep_time)
            driver.find_element_by_xpath("""//*[@class="result-set-container "]""").get_attribute("outerHTML")
            print("success")
        except NoSuchElementException:
            print("error")
            error = 1

        if error == 1:
            time.sleep(sleep_time)  # wait before trying to fetch the data again
            sleep_time += 1  # Implement your backoff algorithm here i.e. exponential backoff
        else:
            break
    # the pagination label's second whitespace-separated token is taken as the total page count
    total_pagination = driver.find_elements_by_xpath("""//div[@class="tabbed-pagination"]/div[@class="pagination-container"]/div[@class="pagination-container"]/div[@class="pagination"]/p""")[0].text
    number_of_pages = int(total_pagination.split(" ")[1])
    page = 0
    while page < number_of_pages:
        offset = page * 12
        url = "https://www.carsales.com.au/cars/{0}/{1}/?offset={2}".format(z[2],z[4],offset)
        print(url)
        sleep_time = 5
        num_retries = 100
        error = 0
        for loopyloop in range(0, num_retries):  
            try:
                error = 0
                driver.get(url)
                time.sleep(sleep_time)
                driver.find_element_by_xpath("""//*[@class="result-set-container "]""").get_attribute("outerHTML")
                print("success")
            except NoSuchElementException:
                print("error")
                error = 1
                pass

            if error == 1:
                time.sleep(sleep_time)  # wait before trying to fetch the data again
                sleep_time += 1  # Implement your backoff algorithm here i.e. exponential backoff
            else:
                break
        rows = driver.find_elements_by_xpath("""//div[contains(@class,"listing-item")]""")
        count = len(rows)
        i = 0
        while i < count:
            title = rows[i].find_element_by_xpath(""".//div[contains(@class,"title ")]/a/h2""").text  # relative XPath keeps the lookup inside this row
            i = i + 1
            query = """****""".format(*****)
            cursor.execute(query)
            con.commit()
        page = page + 1

cursor.close()
con.close()
driver.quit()
display.popen.kill()
print("success")

Second example code, with a 30-second timeout

This site is https://www.tiket.com

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import mysql.connector
import time

from pyvirtualdisplay import Display
display = Display(visible=0, size=(1920, 1080))  
display.start()

chrome_options = Options()
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-setuid-sandbox")
driver = webdriver.Chrome(chrome_options=chrome_options)

date = int(time.strftime("%d"))
month = int(time.strftime("%m"))

con = mysql.connector.connect(*****)
cursor = con.cursor()

for z in range(11, 13):
    # end_date is one past the last day of month z (the range() upper bound below is exclusive)
    if z == 9:
        end_date = 31
    elif z == 10:
        end_date = 32
    elif z == 11:
        end_date = 31
    elif z == 12:
        end_date = 32
    elif z == 8:
        end_date = 32

    start_date = 1

    if z == month and (end_date - date) < 5:
        start_date = end_date
    elif z == (month + 1) and (end_date - date) < 5:
        start_date = start_date + 4 - (end_date - date)
    elif z > month:
        start_date = 1
    else:
        start_date = date

    print(z)
    print(start_date)
    print(end_date)

    for x in range(start_date, end_date):
        time.sleep(2)

        x_url = str(x).zfill(2)
        z_url = str(z).zfill(2)
        date = x_url + "-" + z_url  # NB: this replaces the integer `date` above with a string, which breaks the (end_date - date) comparison on the next month

        url = "https://www.tiket.com/pesawat/cari?d=DPS&a=JKT&date=2017-{1}-{0}&adult=2&child=0&infant=0".format(x_url,z_url)
        print(url)

        driver.get(url)
        time.sleep(30)

        last_height = driver.execute_script("return document.body.scrollHeight")
        print(last_height)

        w = 0
        while w < last_height:
            print("Success")
            w = last_height  # the body runs at most once: w jumps straight to last_height

            try:
                time.sleep(30)
                print(driver.find_element_by_xpath("""//*[@id="tbody_depart"]""").get_attribute("outerHTML"))
                rows = driver.find_elements_by_xpath("""//tr[contains(@id,"flight")]""")
                for row in rows:            
                    airline = row.get_attribute("data-airlinesname")
                    price = row.get_attribute("data-price")
                    departure = row.get_attribute("data-depart")
                    arrival = row.get_attribute("data-arrival")
                    baggage = row.get_attribute("data-baggage")
                    stops = row.get_attribute("data-stoptext")
                    query = """****""".format(******)
                    print(query)
                    cursor.execute(query)
                    con.commit()
            except Exception:  # narrowed from a bare except, which would also swallow KeyboardInterrupt
                driver.get(url)
                time.sleep(30)
                print(driver.find_element_by_xpath("""//*[@id="tbody_depart"]""").get_attribute("outerHTML"))
                rows = driver.find_elements_by_xpath("""//tr[contains(@id,"flight")]""")
                for row in rows:            
                    airline = row.get_attribute("data-airlinesname")
                    price = row.get_attribute("data-price")
                    departure = row.get_attribute("data-depart")
                    arrival = row.get_attribute("data-arrival")
                    baggage = row.get_attribute("data-baggage")
                    stops = row.get_attribute("data-stoptext")
                    query = """*****""".format(*****)
                    print(query)
                    cursor.execute(query)
                    con.commit()

cursor.close()
con.close()
driver.quit()  # quit() also ends the chromedriver process; close() only closes the window
display.popen.kill()
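The try/except in this second script duplicates the entire scraping block; a sketch that factors the row scraping into one hypothetical helper (scrape_departures, mirroring the data-* attributes read above) and retries it once on failure would look like this:

def scrape_departures(driver):
    """Read the flight rows once; returns a list of dicts built from the row's data-* attributes."""
    results = []
    for row in driver.find_elements_by_xpath('//tr[contains(@id,"flight")]'):
        results.append({
            "airline": row.get_attribute("data-airlinesname"),
            "price": row.get_attribute("data-price"),
            "depart": row.get_attribute("data-depart"),
            "arrival": row.get_attribute("data-arrival"),
            "baggage": row.get_attribute("data-baggage"),
            "stops": row.get_attribute("data-stoptext"),
        })
    return results

try:
    flights = scrape_departures(driver)
except Exception:
    # retry the page once instead of repeating the scraping code in the except block
    driver.get(url)
    time.sleep(30)
    flights = scrape_departures(driver)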

0 Answers:

No answers yet.