使用BS4对Yahoo Finance统计数据进行网络抓取

时间:2019-08-17 18:38:35

标签: python selenium web-scraping beautifulsoup yahoo-finance

我是Python编程的新手,但是我发现了一些不同的代码段,并将它们编译为下面的代码。 Python脚本正在从Summary数组返回所有正确的HTML值,但从statistics数组没有返回值,因为这些值无法匹配。

我不知道如何在Yahoo Finance的统计信息窗格中提取值。它称为url2和key_stats_on_stat。

我希望你愿意帮助我。

import os, sys
import csv
from bs4 import BeautifulSoup
import xlsxwriter
import urllib3
from selenium import webdriver
import pdb
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC 


key_stats_on_main =['Market Cap', 'PE Ratio (TTM)', 'EPS (TTM)']
key_stats_on_stat =['Enterprise Value', 'Trailing P/E', 'Forward P/E',
                     'PEG Ratio (5 yr expected)', 'Return on Assets', 'Quarterly Revenue Growth',
                     'EBITDA', 'Diluted EPS', 'Total Debt/Equity', 'Current Ratio']

stocks_arr =[]
pfolio_file= open("stocks.csv", "r")
for line in pfolio_file:
    indv_stock_arr = line.strip().split(',')
    stocks_arr.append(indv_stock_arr)

print(stocks_arr)

from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless") # Runs Chrome in headless mode.
options.add_argument('--no-sandbox') # Bypass OS security model
options.add_argument('--disable-gpu')  # applicable to windows os only
options.add_argument('start-maximized') # 
options.add_argument('disable-infobars')
options.add_argument("--disable-extensions")
driver = webdriver.Chrome(options=options, executable_path=r'C:\Users\""\Documents\Python Scripts\chromedriver_win32\chromedriver.exe')
driver.get("https://finance.yahoo.com/quote/AMZN/")
WebDriverWait(driver, 10).until(EC.element_to_be_clickable((By.XPATH, '//html/body/div/div/div/form/div/button[2]'))).click()

stock_info_arr = []

for stock in stocks_arr:
    stock_info = []
    ticker = stock[0]
    stock_info.append(ticker)

    url = "https://finance.yahoo.com/quote/{0}?p={0}".format(ticker) #Summary
    url2 = "https://finance.yahoo.com/quote/{0}/key-statistics?p={0}".format(ticker) #Statistics

    driver.get(url)

    innerHTML = driver.execute_script("return document.body.innerHTML")
    soup = BeautifulSoup(innerHTML, 'html.parser')
    for stat in key_stats_on_main:
        page_stat1 = soup.find(text=stat)
        try:
            page_row1 = page_stat1.find_parent('tr')
            try:
                page_statnum1 = page_row1.find_all('span')[1].contents[1].get_text(strip=True)
                print(page_statnum1)
            except:
                page_statnum1 = page_row1.find_all('td')[1].contents[0].get_text(strip=True)
                print(page_statnum1)
        except:
            print('Invalid parent for this element')
            page_statnum1 = "N/A"

        stock_info.append(page_statnum1)

    driver.get(url2)
    innerHTML2 = driver.execute_script("return document.body.innerHTML")
    soup2 = BeautifulSoup(innerHTML2, 'html.parser')
    for stat in key_stats_on_stat:
        page_stat2 = soup2.find(text=stat)
        try:
            page_row2 = page_stat2.find_parent('tr')
            try:
                page_statnum2 = page_row2.find_all('span')[1].contents[0].get_text(strip=True)
                print(page_statnum2)
            except:
                page_statnum2 = page_row2.find_all('td')[1].contents[0].get_text(strip=True)
                print(page_statnum2)
        except:
            print('Invalid parent for this element')
            page_statnum2 = 'N/A'
        stock_info.append(page_statnum2)

    stock_info_arr.append(stock_info)

print(stock_info_arr)
########## WRITING OUR RESULTS INTO EXCEL

key_stats_on_main.extend(key_stats_on_stat)
workbook = xlsxwriter.Workbook('Stocks01.xlsx')
worksheet = workbook.add_worksheet()
row = 0
col = 1

for stat in key_stats_on_main:
    worksheet.write(row, col, stat)
    col +=1

row = 1
col = 0
for our_stock in stock_info_arr:
    col = 0
    for info_bit in our_stock:
        worksheet.write(row, col, info_bit)
        col += 1
    row += 1
workbook.close()

print('Script completed')

1 个答案:

答案 0 :(得分:2)

您可以避免硒和正则表达式从脚本标记中提取信息并将其解析为json的开销。我不确定您为什么要向第一个网址发出请求,因为第二个网址中似乎存在相同的信息。

Market Cap = Market Cap (intraday)
PE Ratio (TTM) = Trailing P/E
EPS (TTM)  = Diluted EPS (ttm)

在开放市场期间它们可能有所不同吗?但是,可以对第一个网址使用相同的方法。


Py

import requests, re, json, pprint

p = re.compile(r'root\.App\.main = (.*);')
tickers = ['NKE','AAPL','SPG']
results = {}

with requests.Session() as s:

    for ticker in tickers:
        r = s.get('https://finance.yahoo.com/quote/{}/key-statistics?p={}'.format(ticker,ticker))
        data = json.loads(p.findall(r.text)[0])
        key_stats = data['context']['dispatcher']['stores']['QuoteSummaryStore']
        res = {
                'Enterprise Value' : key_stats['defaultKeyStatistics']['enterpriseValue']['fmt']
                ,'Trailing P/E' : key_stats['summaryDetail']['trailingPE']['fmt']
                ,'Forward P/E' : key_stats['summaryDetail']['forwardPE']['fmt']
                ,'PEG Ratio (5 yr expected)' : key_stats['defaultKeyStatistics']['pegRatio']['fmt']
                , 'Return on Assets' : key_stats['financialData']['returnOnAssets']['fmt']
                , 'Quarterly Revenue Growth' : key_stats['financialData']['revenueGrowth']['fmt']
                , 'EBITDA' : key_stats['financialData']['ebitda']['fmt']
                , 'Diluted EPS' : key_stats['defaultKeyStatistics']['trailingEps']['fmt']
                , 'Total Debt/Equity' : key_stats['financialData']['debtToEquity']['fmt']
                , 'Current Ratio' :  key_stats['financialData']['currentRatio']['fmt']
        }
        results[ticker] = res

pprint.pprint(results)

示例输出: