当我从Trulia抓取数据时抛出未知错误

时间:2019-07-29 18:12:44

标签: python pandas selenium selenium-webdriver

我将任意数量的地址推送到trulia搜索栏中后,都会不断出现此错误。

selenium.common.exceptions.WebDriverException: Message: TypeError: curContainer.frame.document.documentElement is null

我不确定此错误是什么意思,或者如何只是简单地忽略它。这是我的代码,仅供参考。

from selenium import webdriver
from selenium.webdriver.remote import webelement
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import time
from bs4 import BeautifulSoup
import os
from datetime import datetime
from selenium.webdriver import ActionChains
import random

input_file = ".\\pa-property-value-tools\\input\\active_assets_RETRY.xlsx"

input_df = pd.read_excel(input_file)


input_df['Address'] = input_df['Address'].astype(str)
output_df = pd.DataFrame(columns=['Account','Address', 'trulia_estimate'])
driver = webdriver.Firefox(executable_path = 'C:\\Users\\Morgan.weiss\\Downloads\\geckodriver-v0.24.0-win64\\geckodriver.exe')
#driver = webdriver.Chrome('C:\\Users\\Morgan.weiss\\Downloads\\chromedriver_win32\\chromedriver.exe')
# actionChains = ActionChains(driver)

def append_date_timestamp(filepath, extension):
    return (
        filepath + "-" + datetime.now().strftime("%Y-%m-%d %H-%M-%S") + "." + extension
    )

def get_trulia_estimate(address):
    price = dict()
    price['estimate'] = 'N/A'
    price['listing'] = 'N/A'    
    driver.get('https://www.trulia.com/')
    print(address)
    driver.find_element_by_id('homepageSearchBoxTextInput').clear()
    driver.find_element_by_id('homepageSearchBoxTextInput').send_keys(address)
    driver.find_element_by_css_selector("button[data-auto-test-id='searchButton']").click()    
    time.sleep(3)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    try:
        trulia_est_text = soup.select("span.Text__TextBase-sc-1cait9d-0.OmRik")
        if trulia_est_text[0].text == 'Trulia Estimate':
            trulia_est = soup.select("div.Text__TextBase-sc-1cait9d-0-div.Text__TextContainerBase-sc-1cait9d-1.hlvKRM")
            if price['estimate']:            
                price['estimate'] = trulia_est[0].text
        else:
            listing_price = soup.select("div.Text__TextBase-sc-1cait9d-0-div.Text__TextContainerBase-sc-1cait9d-1.hlvKRM")
            if price['listing']:
                price['listing'] = listing_price[0].text
        return price
    except (IndexError,TypeError):
        return price

outputfile = append_date_timestamp(".\\pa-property-value-tools\\output\\trulia", "csv")
count = 0
wait_after = 100
for row in input_df.itertuples():
    count += 1
    price = get_trulia_estimate(row.Address)
    output_df = output_df.append({
            'Account': row.Account,
            'Address': row.Address,
            'trulia_estimate':price["estimate"],
            'trulia_listing': price["listing"]
        },
        ignore_index=True,
    )     
    if count % wait_after == 0:
    # if file does not exist write header 
        if not os.path.isfile(outputfile):
            output_df.to_csv(outputfile, index=False) 
        else: # else it exists so append without writing the header
            output_df.to_csv(outputfile, mode='a', index=False, header=False)
        output_df = pd.DataFrame(columns=['Account','Address', 'trulia_estimate', 'trulia_listing'])              
        print("Waiting between 3 minutes and 7 minutes  " + str(wait_after) + " calls")    
        time.sleep(random.randint(180,420))     
    time.sleep(random.randint(3,7))

if count % wait_after > 0:
    output_df.to_csv(outputfile, mode='a', index=False, header=False)

我试图忽略TypeError,但它似乎不起作用,任何建议都将不胜感激。

0 个答案:

没有答案