如何加快我的webscraping python代码?

时间:2017-10-31 15:10:59

标签: python selenium web-scraping finance

我一直致力于使用Python代码从股票交易模拟软件中提取数据(我得到了我交易的证券的代码,例如VTI,AAPL,GOOG等),然后在Morningstar上搜索这些股票代码,抓取定价信息以及我想从该网站获得的任何其他信息。我将列表中的数据保存到.csv文件中以便在Excel中使用。我使用Selenium运行webdriver(我使用Chrome浏览器直观地查看流程,或者使用PhantomJS在没有浏览器GUI的情况下运行程序),以便自动访问网站并解析其HTML内容。

我的程序运行正常,但是处理一个包含11只证券的投资组合就需要120秒,而我希望扩展这个程序来执行更复杂的操作。

我的编码风格中是否有可以更改以加快网页抓取过程的内容?是否有任何能让Python代码快速执行的通用编写方法?

以下是代码:

from selenium import webdriver
from bs4 import BeautifulSoup
import csv

# Headless browser setup: PhantomJS runs without a GUI, which is faster for
# scraping than driving a visible Chrome window.
# NOTE(review): PhantomJS support is deprecated in newer Selenium releases —
# consider headless Chrome/Firefox instead.
#browser = webdriver.Chrome() #replace with .Firefox(), or with the browser of your choice
browser = webdriver.PhantomJS()

# Accumulators filled by the scraping loops below.
security_links_list = list()  # hrefs scraped from the EquitySim holdings table
equitysim_ticker_list = list()  # ticker symbols parsed from each security page <title>

# Login/landing page(s) to scrape; kept as a list so more URLs can be added.
url = ['https://www.equitysim.com/Home'
]

for item in url:
    # Navigate to the page behind the login form and authenticate.
    browser.get(item)
    username = browser.find_element_by_id('placeholderContent_txtUserName')
    username.send_keys('EDITED_FOR_SECURITY')
    password = browser.find_element_by_id('placeholderContent_txtPassword')
    password.send_keys('EDITED_FOR_SECURITY')
    form = browser.find_element_by_id("placeholderContent_LoginButton")
    form.click()

    # Parse the post-login page once. (The original fetched the page source
    # three times — two `innerHTML`/`execute_script` assignments were dead
    # code, immediately overwritten — so only one read is kept.)
    soup = BeautifulSoup(browser.page_source, 'html.parser')
    table_a = soup.find('table', 'ba-tbl admintable')
    # Collect every hyperlink in the holdings table.
    for a in table_a.find_all('a', href=True):
        security_links_list.append(a['href'])

# De-duplicate the links. discard() (unlike remove()) does not raise
# KeyError if the placeholder '#' link happens to be absent.
links_set = set(security_links_list)
links_set.discard('#')
print(links_set)

# The scraped hrefs are site-relative; prefix the domain to make them absolute.
mystring = "https://www.equitysim.com"

links_set_revised = [mystring + link_text for link_text in links_set]
print(links_set_revised)

for item in links_set_revised:
    # Load each security page; its <title> looks like "TICKER: Company Name",
    # so the ticker is the text before the first colon.
    browser.get(item)
    # (The original's duplicate innerHTML/execute_script assignments were
    # dead code — the page source was read once and used once; keep only that.)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(':', 1)[0]

    # The title text can contain stray newlines/tabs; strip them out.
    ticker = ticker.replace('\n', '').replace('\t', '')

    equitysim_ticker_list.append(ticker)

print(equitysim_ticker_list)

# For each ticker, drive Morningstar's ticker-lookup form and collect the
# URL of the quote iframe embedded in the result page.
morningstar_ticker_search = "http://quote.morningstar.com/TickerLookup.html"
uri_links = []

for ticker in equitysim_ticker_list:
    browser.get(morningstar_ticker_search)

    # Select the "Ticker" radio button, type the symbol, and submit the form
    # via the "go" image button.
    browser.find_element_by_xpath("//input[@value='Ticker']").click()
    browser.find_element_by_class_name('F3').send_keys(ticker)
    browser.find_element_by_xpath("//input[@src='http://im.morningstar.com/im/go.gif']").click()

    # The quote page embeds the actual data in an iframe under
    # div#quote_quicktake; its src attribute is protocol-relative.
    result_soup = BeautifulSoup(browser.page_source, 'html.parser')
    quicktake = result_soup.find('div', attrs={'id': 'quote_quicktake'})
    frame_src = quicktake.find('iframe').get('src')
    uri_links.append('https:' + frame_src)

print(uri_links)

# Parallel result lists: index i of each list describes the same security.
price_list = list()
ticker_list = list()
nav_list = list()

for item in uri_links:
    # Load each quote iframe and parse the fields we need.
    browser.get(item)
    # (The original's duplicate innerHTML/execute_script reads were dead
    # code; a single page_source read is sufficient.)
    soup = BeautifulSoup(browser.page_source, 'html.parser')

    price_element = soup.find("div", attrs={"id": "lastPrice"})
    price = price_element.text

    # BUG FIX: the original `attrs={"id": "NAV"} or {"vkey": "NAV"}` always
    # evaluated to the first dict (a non-empty dict is truthy), so the
    # vkey="NAV" variant was never searched. Try both explicitly.
    nav_element = (soup.find("span", attrs={"id": "NAV"})
                   or soup.find("span", attrs={"vkey": "NAV"}))
    nav = nav_element.text
    # The NAV text is padded with newlines/tabs; take the first
    # whitespace-separated token (the numeric value) rather than splitting
    # on an exact literal run of whitespace, which broke whenever
    # Morningstar changed its markup padding.
    nav_split2 = nav.split()[0]

    # Page <title> starts with the ticker symbol, e.g. "VTI Vanguard ...".
    title_element = soup.find("title")
    title = title_element.text
    ticker = title.split(' ', 1)[0]

    price_list.append(price)
    nav_list.append(nav_split2)
    ticker_list.append(ticker)

    print(ticker)
    print(price)
    print(nav_split2)

print(ticker_list)
print(price_list)
print(nav_list)


# Output path for the scraped data (three rows: tickers, prices, NAVs).
csvfile = "C:\\Users\\USERNAME\\AppData\\Local\\Programs\\Python\\Python36\\personal\\exampleCsv.csv"

# BUG FIX: the csv module requires the file be opened with newline='' (on
# Windows the default translation otherwise inserts blank lines). With that
# in place the writer's default line terminator separates rows correctly,
# replacing the fragile lineterminator='' + writerow('\n') hack, which
# emitted a bogus one-field row between each real row.
with open(csvfile, "w", newline="") as output:
    writer = csv.writer(output)
    writer.writerow(ticker_list)
    writer.writerow(price_list)
    writer.writerow(nav_list)

0 个答案:

没有答案