Python 网页抓取:跳过标签和行

时间:2016-12-07 05:34:48

标签: python html html5 beautifulsoup

抓取网页时遇到了“IndexError: list index out of range”(列表索引超出范围)。我非常确定,这是因为我正在抓取的表格中有一行被用作标题行 - http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-20161205.html?mod=mdc_pastcalendar

from urllib2 import urlopen
import requests
from bs4 import BeautifulSoup
import re
import datetime

# Python 2 script (print statements, urllib2): downloads the WSJ money-flow
# page and reads twelve cell texts out of each row of the first table found
# inside <div id="column0">.
date = datetime.datetime.today()
# NOTE(review): this literal contains two spaces before "20161205" and has no
# "{date}" placeholder, so the .format(date=...) call below returns it unchanged.
url = "http://www.wsj.com/mdc/public/page/2_3022-mfsctrscan-moneyflow-  20161205.html?mod=mdc_pastcalendar"
# The response is stored in date_time but is not used again in the lines shown.
date_time = urlopen(url.format(date=date.strftime('%Y%m%d')))
address = url
print 'Retrieving information from: ' + address
print '\n'
# The same address is fetched a second time here, via requests, and parsed
# with the lxml parser.
soup = BeautifulSoup (requests.get(address).content, "lxml")
div_main = soup.find('div', {'id': 'column0'})
table_one = div_main.find('table')
rows = table_one.findAll('tr')
# NOTE(review): the line after this `if` is not indented, which is an
# IndentationError as written; presumably the slice was meant to be its body.
# The condition also counts every <tr> in the document, not just table_one's.
if len(soup.findAll('tr')) > 0:
rows = rows[2:]
#print rows
for row in rows:
    cells = row.findAll('td')
    # Fixed indexes 0-11 require at least 12 <td> cells in the row; a row
    # with fewer cells raises IndexError on the first missing index.
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    money_flow = cells[4].get_text()
    tick_up = cells[5].get_text()
    tick_down = cells[6].get_text()
    up_down_Ratio = cells[7].get_text()
    # NOTE(review): the four assignments below reuse the names assigned from
    # cells[4]-cells[7] above, so those earlier values are overwritten.
    money_flow = cells[8].get_text()
    tick_up = cells[9].get_text()
    tick_down = cells[10].get_text()
    up_down_Ratio = cells[11].get_text()

2 个答案:

答案 0 :(得分:2)

只含单个单元格的中间行(例如“道琼斯美国股票市场总部门”)就是出现这个错误的原因。

不过,不妨换一种做法:预先定义一个标题列表,再把每个“数据”行的单元格值与标题列表用 zip 配对,动态创建字典:

# Every <tr> matched by the CSS selector, minus the first two.
rows = soup.select('div#column0 table tr')[2:]

# Column names, in the order zip() pairs them with a row's <td> cells.
headers = [
    'name', 'last', 'chg', 'pct_chg',
    'total_money_flow', 'total_tick_up', 'total_tick_down', 'total_up_down_ratio',
    'block_money_flow', 'block_tick_up', 'block_tick_down', 'block_up_down_ratio',
]

for row in rows:
    # Only rows containing at least one <td class="pnum"> cell are printed.
    if row.find("td", class_="pnum") is not None:
        values = [cell.get_text(strip=True) for cell in row.find_all('td')]
        print(dict(zip(headers, values)))

答案 1 :(得分:1)

# Narrow the search to <div id="column0">, then take the first <table> in it.
div_main = soup.find('div', attrs={'id': 'column0'})
table_one = div_main.find('table')

def target_row(tag):
    """Return True when *tag* is a <tr> containing more than five <td> cells.

    Intended to be passed as the filter function to ``find_all``, which calls
    it once for every tag it visits.

    Args:
        tag: an object exposing ``name`` and ``find_all`` (a BeautifulSoup tag).

    Returns:
        bool: True only for ``tr`` tags with more than five ``td`` descendants.
    """
    # Test the cheap name comparison first so the descendant search is only
    # run for <tr> tags instead of for every tag in the table.
    return tag.name == 'tr' and len(tag.find_all('td')) > 5

# Keep only the tags that target_row accepts.
rows = table_one.find_all(target_row)
for row in rows:
    cells = row.find_all('td')
    # target_row only requires more than 5 cells, while 12 are read below;
    # skip shorter rows instead of failing with IndexError.
    if len(cells) < 12:
        continue
    name = cells[0].get_text()
    last = cells[1].get_text()
    chg = cells[2].get_text()
    pct_chg = cells[3].get_text()
    # cells[4]-cells[7] and cells[8]-cells[11] get distinct names so the
    # second group no longer overwrites the first.
    total_money_flow = cells[4].get_text()
    total_tick_up = cells[5].get_text()
    total_tick_down = cells[6].get_text()
    total_up_down_ratio = cells[7].get_text()
    block_money_flow = cells[8].get_text()
    block_tick_up = cells[9].get_text()
    block_tick_down = cells[10].get_text()
    block_up_down_ratio = cells[11].get_text()

你可以把一个返回 bool 的函数作为 find_all 的参数,这样代码会更简洁、更易维护。