下面是我用来抓取年度数据的代码。季度数据并不是通过另一个链接提供的,而是要点击页面上的一个按钮才能切换,所以我不知道该如何抓取它。我已经折腾了好几天,最后只好来这里求助。
最终目标是得到包含资产负债表、现金流量表等内容的完整输出,但我需要的是按季度的数据。
欢迎任何帮助。谢谢
from io import StringIO

import lxml
from lxml import html
import numpy as np
import pandas as pd
import requests
import xlrd
def scrape_table(url, timeout=30):
    """Download *url* and return its single HTML table as a DataFrame.

    The table's first column (label ``0``) becomes the index and the frame is
    transposed, so every original row turns into a column.  ``'-'`` cells are
    replaced by ``'0'``, the first resulting column is parsed with
    ``pd.to_datetime`` and renamed ``'Date'``, and all remaining columns are
    cast to ``float64``.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A DataFrame whose first column is ``'Date'`` and whose other columns
        are ``float64``.

    Raises:
        requests.RequestException: On a network failure, a timeout or an HTTP
            error status.
        ValueError: If the page does not contain exactly one ``<table>``.
    """
    # A timeout keeps one unresponsive request from blocking the whole run.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    # page.content rather than page.text because html.fromstring implicitly
    # expects bytes as input.
    tree = html.fromstring(page.content)
    # XPath is a way of locating information in structured documents such as
    # HTML or XML.
    tables = tree.xpath('//table')
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if len(tables) != 1:
        raise ValueError(
            'expected exactly one <table> at {}, found {}'.format(url, len(tables))
        )

    # read_html wants a file-like object; passing literal HTML is deprecated.
    markup = lxml.etree.tostring(tables[0], method='html', encoding='unicode')
    df = pd.read_html(StringIO(markup))[0]
    df = df.set_index(0)
    df = df.transpose()
    df = df.replace('-', '0')

    # The first column should be a date
    first_column = df.columns[0]
    df[first_column] = pd.to_datetime(df[first_column])

    cols = list(df.columns)
    cols[0] = 'Date'
    # Plain assignment works on every pandas version; the `inplace` keyword of
    # set_axis no longer exists in pandas 2.x.
    df.columns = cols

    numeric_columns = cols[1:]
    df[numeric_columns] = df[numeric_columns].astype(np.float64)
    return df
# Workbook that lists the public companies to process.
loc = r"F:\KateLaptop2019\Work\DataAnalysis\listpubliccompanies.xlsx"
wb = xlrd.open_workbook(loc)
sheet = wb.sheet_by_index(0)
# The returned value is discarded; the call is kept as in the original script.
sheet.cell_value(0, 0)
# Row 0 is skipped (presumably a header row -- confirm); the second column of
# every following row is stripped of surrounding whitespace and collected.
companies = [sheet.cell_value(row, 1).strip() for row in range(1, sheet.nrows)]
def annual_financials():
    """Save the Yahoo Finance balance sheet of every ticker in ``companies``.

    For each entry of the module-level ``companies`` list, the balance-sheet
    page URL is passed to ``scrape_table`` and the returned frame is written
    to ``<ticker>.xlsx`` in the BalanceSheet output folder.  A failure for one
    ticker is reported on stdout and the loop continues with the next one.

    Returns:
        None.
    """
    for item in companies:
        try:
            balance_sheet_url = 'https://finance.yahoo.com/quote/' + item + '/balance-sheet?p=' + item
            download_destination = (r'F:\KateLaptop2019\Work\DataAnalysis\OilCompanyResearch\CompanyFinancials\BalanceSheet\\' + item + ".xlsx")
            df_balance_sheet = scrape_table(balance_sheet_url)
            df_balance_sheet.to_excel(download_destination)
        except Exception as exc:
            # Best-effort batch: keep going, but show the real cause.  Catching
            # Exception (not a bare except) lets Ctrl-C / SystemExit through.
            print(item, "failed:", repr(exc))
annual_financials()