当我运行下面的代码时，要么抛出某种超时错误，要么脚本看起来永远挂起，总之无法稳定运行。这几个站点大多数时候都能跑通，但只有 CSI 站点能始终如一地成功。我能做哪些修改，让这个脚本既更可靠又更简洁？我是不是遗漏了什么？我是 Python 新手，希望有人能帮忙清理一下这段代码。
谢谢
import csv, os, time
import pandas as pd
import numpy as np
from selenium import webdriver
from pandas import DataFrame, read_csv, set_option
from matplotlib import pyplot
from datetime import date, datetime, timedelta
from collections import Counter
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook #Timer count
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import TimeoutException
from googlesearch import search
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
start = time.time()
sleep_time = 15  # seconds to let a page settle before clicking / a download finish

CHROMEDRIVER_PATH = r"E:\Python Programs\chromedriver"

url_csi = 'http://www.csidata.com/factsheets.php?type=stock&format=html'
url_tmx = 'https://api.tmxmoney.com/en/migreport/search'
url_nyse = 'https://www.nasdaq.com/screening/company-list.aspx'

database_csi = "E:\\Stock Database\\Historical Data\\Historical Stock List\\CSI Historical Stock List\\"
database_tmx = "E:\\Stock Database\\Historical Data\\Historical Stock List\\TMX Historical Stock List\\"
database_nyse = "E:\\Stock Database\\Historical Data\\Historical Stock List\\NYSE Historical Stock List\\"
database_nasdaq = "E:\\Stock Database\\Historical Data\\Historical Stock List\\NASDAQ Historical Stock List\\"
database_amex = "E:\\Stock Database\\Historical Data\\Historical Stock List\\AMEX Historical Stock List\\"
master_file_csi = "E:\\Stock Database\\Historical Data\\Historical Stock List\\CSI Historical Stock List\\CSI_Ticker_List_Historical.csv"
master_file_tmx = "E:\\Stock Database\\Historical Data\\Historical Stock List\\TMX Historical Stock List\\TMX_Ticker_List_Historical.xlsx"
master_file_nyse = "E:\\Stock Database\\Historical Data\\Historical Stock List\\NYSE Historical Stock List\\NYSE_Ticker_List_Historical.csv"
master_file_nasdaq = "E:\\Stock Database\\Historical Data\\Historical Stock List\\NASDAQ Historical Stock List\\NASDAQ_Ticker_List_Historical.csv"
master_file_amex = "E:\\Stock Database\\Historical Data\\Historical Stock List\\AMEX Historical Stock List\\AMEX_Ticker_List_Historical.csv"


def _build_driver(download_dir):
    """Return a Chrome driver configured to download files into *download_dir*."""
    options = webdriver.ChromeOptions()
    prefs = {
        'download.default_directory': download_dir,
        # 2 == block image loading.  The pages load far faster without
        # images, which avoids the intermittent TimeoutException.
        'profile.managed_default_content_settings.images': 2,
    }
    options.add_experimental_option('prefs', prefs)
    options.add_argument("--disable-infobars")
    options.add_argument('--dns-prefetch-disable')
    return webdriver.Chrome(CHROMEDRIVER_PATH, chrome_options=options)


def fetch_ticker_list(url, download_dir, downloaded_name, master_file, selectors):
    """Download one exchange's ticker list and archive it as *master_file*.

    Opens *url* in a fresh Chrome session, clicks each CSS selector in
    *selectors* in order (sleeping ``sleep_time`` seconds before each click
    so the page can finish loading), waits for the download to complete,
    then renames ``download_dir/downloaded_name`` to *master_file*.

    Any pre-existing *master_file* is deleted first so ``os.rename`` cannot
    fail on Windows.  The browser is always torn down, even on error.
    """
    if os.path.exists(master_file):
        os.remove(master_file)
    driver = _build_driver(download_dir)
    try:
        driver.get(url)
        for css in selectors:
            time.sleep(sleep_time)
            driver.find_element_by_css_selector(css).click()
        time.sleep(sleep_time)  # give the download time to finish
    finally:
        # quit() (rather than close()) also shuts down the chromedriver
        # process, so failed runs do not leak browser instances.
        driver.quit()
    os.rename(os.path.join(download_dir, downloaded_name), master_file)


# CSI Exchange Data Scraping
fetch_ticker_list(
    url_csi, database_csi, "stockfactsheet.csv", master_file_csi,
    ['body > a:nth-child(3)'])

# TMX Exchange Data Scraping (two clicks: run the search, then download)
fetch_ticker_list(
    url_tmx, database_tmx, "mig_report.xlsx", master_file_tmx,
    ['#leftside > div.idt_container > form > input[type="submit"]:nth-child(3)',
     '#leftside > div.idt_containerResults > div.searchToolBox > div.idtDownload > form > input[type="submit"]:nth-child(8)'])

# NYSE / NASDAQ / AMEX all come from the same nasdaq.com company-list page.
fetch_ticker_list(
    url_nyse, database_nyse, "companylist.csv", master_file_nyse,
    ['#companyListDownloads > table > tbody > tr:nth-child(2) > td:nth-child(2) > a'])

# NOTE(review): the NASDAQ and AMEX downloads below click the SAME table row
# (tr:nth-child(1)), so they fetch identical data.  One of them almost
# certainly needs a different row index -- confirm against the live page.
fetch_ticker_list(
    url_nyse, database_nasdaq, "companylist.csv", master_file_nasdaq,
    ['#companyListDownloads > table > tbody > tr:nth-child(1) > td:nth-child(2) > a > div > svg'])

fetch_ticker_list(
    url_nyse, database_amex, "companylist.csv", master_file_amex,
    ['#companyListDownloads > table > tbody > tr:nth-child(1) > td:nth-child(2) > a > div > svg'])
错误
TimeoutException: Message: timeout
(Session info: chrome=71.0.3578.98)
(Driver info: chromedriver=2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5),platform=Windows NT 10.0.17763 x86_64)
经过多次尝试，我找到了解决方案。问题出在页面加载上：不知为何，通过 chromedriver 启动时页面加载特别慢，而下面这个设置解决了问题——它只是禁止了图片加载。
prefs = {'download.default_directory': database_csi,'profile.managed_default_content_settings.images': 2}
答案 0 :(得分:0)
在由selenium / webdriver驱动的网页上加载较长时间的情况并不少见。
一种快速的解决方案是增加程序在寻找元素时等待的时间。
最直接的办法是增大 sleep_time 的值，但这会拉长脚本的整体运行时间。
在我的硒脚本中,我喜欢通过导入来使用WebDriverWait:
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
SELENIUM_TIMEOUT = 10 # define a global timeout (in seconds)
然后,当我在页面上查找代码看起来像的元素时。
try:
    # Block (up to SELENIUM_TIMEOUT seconds) until the element exists in
    # the DOM, then look it up and click it.
    element_is_present = EC.presence_of_element_located((By.ID, 'myCustomElementID'))
    WebDriverWait(driver, SELENIUM_TIMEOUT).until(element_is_present)
    my_element = driver.find_element_by_id('myCustomElementID')
    my_element.click()
except TimeoutException:
    # The element never appeared within the timeout window.
    print("Handle the exception here")
此代码将检查页面上是否存在 id 为 myCustomElementID 的元素。
满足条件后,脚本将继续运行。
答案 1 :(得分:0)
如果您可以使它们大多数时候工作,但是只有CSI站点似乎能够始终如一地工作,请将此错误消息视为自己的运气...
TimeoutException: Message: timeout
(Session info: chrome=71.0.3578.98)
(Driver info: chromedriver=2.42.591088 (7b2b2dca23cca0862f674758c9a3933e685c27d5),platform=Windows NT 10.0.17763 x86_64)
...表示 WebDriver 实例超时尝试与 WebBrowsing 会话通信时。
您的主要问题是所使用的二进制文件版本之间不兼容：

ChromeDriver v2.42 支持 Chrome v68-70
ChromeDriver v2.46 支持 Chrome v71-73

因此 ChromeDriver v2.42 与 Chrome 浏览器 v71.0 之间存在明显的不匹配。解决方案是把 ChromeDriver 升级到与所安装 Chrome 版本匹配的版本（例如 v2.46）。此外，请在 @Test 结束时于 tearDown(){} 方法内调用 driver.quit()，以优雅地关闭并销毁 WebDriver 和 Web Client 实例。