Error while scraping table data from a website

Asked: 2017-08-03 13:14:16

Tags: python pandas beautifulsoup

I am trying to get some stock-related data from the web for a project, and I have run into a couple of problems.

Problem 1: I tried to scrape the table from http://sharesansar.com/c/today-share-price.html. It works, but the columns are not grabbed in order; for example, the 'Company Name' column ends up holding 'Open Price' values. How can I fix this?

Problem 2: I also tried to get company-specific data from http://merolagani.com/CompanyDetail.aspx?symbol=ADBL under the 'Price History' tab. This time I get an error while scraping the table data:

self.data[key].append(cols[index].get_text())

IndexError: list index out of range    

The code looks like this:

import logging
import requests
from bs4 import BeautifulSoup
import pandas


module_logger = logging.getLogger('mainApp.dataGrabber')


class DataGrabberTable:
    ''' Grabs the table data from a certain url. '''

    def __init__(self, url, csvfilename, columnName=[], tableclass=None):
        module_logger.info("Inside 'DataGrabberTable' constructor.")
        self.pgurl = url
        self.tableclass = tableclass
        self.csvfile = csvfilename
        self.columnName = columnName

        self.tableattrs = {'class':tableclass} #to be passed in find()

        module_logger.info("Done.")


    def run(self):
        '''Call this to run the datagrabber. Returns 1 if error occurs.'''

        module_logger.info("Inside 'DataGrabberTable.run()'.")

        try:
            self.rawpgdata = (requests.get(self.pgurl, timeout=5)).text
        except Exception as e:
            module_logger.warning('Error occurred: {0}'.format(e))
            return 1

        #module_logger.info('Headers from the server:\n {0}'.format(self.rawpgdata.headers))

        soup = BeautifulSoup(self.rawpgdata, 'lxml')

        module_logger.info('Connected and parsed the data.')

        table = soup.find('table',attrs = self.tableattrs)
        rows = table.find_all('tr')[1:]

        # initialize a dict of the form: data = {'col1': [...], 'col2': [...], ...}
        # where col1, col2, ... come from the columnName list
        self.data = {key: [] for key in self.columnName}

        module_logger.info('Inside for loop.')
        for row in rows:
            cols = row.find_all('td')
            index = 0
            for key in self.data:
                if index > len(cols): break
                self.data[key].append(cols[index].get_text())
                index += 1
        module_logger.info('Completed the for loop.')

        self.dataframe = pandas.DataFrame(self.data)    #make pandas dataframe

        module_logger.info('writing to file {0}'.format(self.csvfile))
        self.dataframe.to_csv(self.csvfile)
        module_logger.info('written to file {0}'.format(self.csvfile))

        module_logger.info("Done.")
        return 0

    def getData(self):
        """Returns the 'data' dictionary."""
        return self.data




# Usage example

def main():
    url = "http://sharesansar.com/c/today-share-price.html"
    classname = "table"
    fname = "data/sharesansardata.csv"
    cols = [str(i) for i in range(18)]  # make a list of column names

    '''cols = [
      'S.No', 'Company Name', 'Symbol', 'Open price', 'Max price',
     'Min price', 'Closing price', 'Volume', 'Previous closing',
     'Turnover', 'Difference',
     'Diff percent', 'Range', 'Range percent', '90 days', '180 days',
     '360 days', '52 weeks high', '52 weeks low']'''

    d = DataGrabberTable(url, fname, cols, classname)
    if d.run() == 1:
        print('Data grabbing failed!')
    else:
        print('Data grabbing done.')


if __name__ == '__main__':
    main()

Any suggestions would be helpful. Thanks!

1 Answer:

Answer 0 (score: 1):

Your column list is missing an element; there are 19 columns, not 18:

>>> len([str(i) for i in range(18)])
18
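
Also, two small fixes to the original loop would stop the IndexError and keep the columns aligned (a minimal sketch, assuming the rows variable and surrounding code from the question). The guard must use >=, not >, since cols[len(cols)] is already out of range; and iterating the column-name list instead of the dict keeps the column order fixed (dict insertion order is not guaranteed before Python 3.7):

cols_names = [str(i) for i in range(19)]  # 19 names, one per table column
data = {key: [] for key in cols_names}

for row in rows:
    cells = row.find_all('td')
    for index, key in enumerate(cols_names):  # list order is fixed, unlike old dict order
        if index >= len(cells):  # >= stops before cells[len(cells)] raises IndexError
            break
        data[key].append(cells[index].get_text())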

Apart from that, you seem to be overcomplicating things. The following should do:

import requests
from bs4 import BeautifulSoup
import pandas as pd

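# Fetch the daily price table; its header row supplies the DataFrame column names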
price_response = requests.get('http://sharesansar.com/c/today-share-price.html')
price_table = BeautifulSoup(price_response.text, 'lxml').find('table', {'class': 'table'})
price_rows = [[cell.text for cell in row.find_all(['th', 'td'])] for row in price_table.find_all('tr')]
price_df = pd.DataFrame(price_rows[1:], columns=price_rows[0])

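# For each listed symbol, fetch the company detail page and flatten its summary table into one row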
com_df = None
for symbol in price_df['Symbol']:
    comp_response = requests.get('http://merolagani.com/CompanyDetail.aspx?symbol=%s' % symbol)
    comp_table = BeautifulSoup(comp_response.text, 'lxml').find('table', {'class': 'table'})
    com_header, com_value = list(), list()
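    # each <tbody> appears to hold one row pairing a <th> label with a <td> value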
    for tbody in comp_table.find_all('tbody'):
        comp_row = tbody.find('tr')
        com_header.append(comp_row.find('th').text.strip().replace('\n', ' ').replace('\r', ' '))
        com_value.append(comp_row.find('td').text.strip().replace('\n', ' ').replace('\r', ' '))
    df = pd.DataFrame([com_value], columns=com_header)
    com_df = df if com_df is None else pd.concat([com_df, df])

print(price_df)
print(com_df)
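
Since each per-symbol df has a single row, pd.concat simply stacks them, so com_df ends up with one row per company.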