使用python从html填充包含财务数据的数组

时间:2017-02-10 01:09:03

标签: python

我有这份损益表(income statement),很想知道大家会如何把其中的数据从 HTML 中提取到一个数组里。下面贴出我自己写的代码,欢迎就改进方法提出任何意见。也很想看看使用标准库(stdlib)、pandas、numpy 等实现的做法。谢谢!

import requests
import pandas as pd
from bs4 import BeautifulSoup
from collections import OrderedDict

# Scrape the table with id 'dollarTable' from `url` into `financial_statement`,
# an ordered mapping of each row's first cell text -> list of the remaining
# cell texts in that row.
url = r'http://gambler-restaurant-15377.bitballoon.com/'

# The timeout keeps the script from blocking indefinitely on an unresponsive
# host; raise_for_status() stops early on an HTTP error instead of parsing an
# error page.
r = requests.get(
    url,
    headers={"User-Agent" : b'Opera/9.80 (Windows NT 5.2; U; zh-cn) Presto/2.6.30 Version/10.6'},
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

table = soup.find('table', attrs={'id':'dollarTable'})
if table is None:
    raise ValueError("no table with id 'dollarTable' found at " + url)

financial_statement = OrderedDict()
for row in table.find_all('tr'):
    # Select the cell tags that are direct children of the row explicitly.
    # Iterating over every child node also visits the whitespace text nodes
    # between cells; skipping those by catching AttributeError is fragile,
    # because Beautiful Soup releases in which strings implement get_text()
    # would add an empty string for each of them.
    row_items = [
        str(cell.get_text(strip=True))
        for cell in row.find_all(['th', 'td'], recursive=False)
    ]
    if not row_items:
        # A row without any cells has no label to use as the key.
        continue
    financial_statement[row_items[0]] = row_items[1:]



('For year ending Dec,', ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'])
('Revenues', ['306,057.0', '--', '--', '733,438.0', '390,625.0', '407,708.0', '110,000.0', '18,833.0', '10,417.0'])

1 个答案:

答案 0 :(得分:0)

import requests
import pandas as pd
from bs4 import BeautifulSoup
from collections import OrderedDict

# Scrape the table with id 'dollarTable' from `url` into `financial_statement`,
# an ordered mapping of each row's first cell text -> list of the remaining
# cell texts in that row.
url = r'http://gambler-restaurant-15377.bitballoon.com/'

# The timeout keeps the script from blocking indefinitely on an unresponsive
# host; raise_for_status() stops early on an HTTP error instead of parsing an
# error page.
r = requests.get(
    url,
    headers={"User-Agent" : b'Opera/9.80 (Windows NT 5.2; U; zh-cn) Presto/2.6.30 Version/10.6'},
    timeout=30,
)
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

table = soup.find('table', attrs={'id':'dollarTable'})
if table is None:
    raise ValueError("no table with id 'dollarTable' found at " + url)

financial_statement = OrderedDict()
for row in table.find_all('tr'):
    # Select the cell tags that are direct children of the row explicitly.
    # Iterating over every child node also visits the whitespace text nodes
    # between cells; skipping those by catching AttributeError is fragile,
    # because Beautiful Soup releases in which strings implement get_text()
    # would add an empty string for each of them.
    row_items = [
        str(cell.get_text(strip=True))
        for cell in row.find_all(['th', 'td'], recursive=False)
    ]
    if not row_items:
        # A row without any cells has no label to use as the key.
        continue
    financial_statement[row_items[0]] = row_items[1:]



('For year ending Dec,', ['2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015'])
('Revenues', ['306,057.0', '--', '--', '733,438.0', '390,625.0', '407,708.0', '110,000.0', '18,833.0', '10,417.0'])