Question

以下是我的情况：我的代码解析了电子邮件中HTML表格中的数据。我遇到的障碍是这些表格中的一些在表格中间有空白的空行，如下图所示。此空白区域导致我的代码失败（IndexError: list index out of range），因为它尝试从单元格中提取文本。

是否有可能对Python说：＆＃34;好吧，如果遇到来自这些空行的这个错误，只需停在那里并从目前为止获取已获取文本的行并执行其余的关于那些＆＃34; ...？

的代码

这可能听起来像是这个问题的一个愚蠢的解决方案，但我的项目涉及到我只从表中最近的日期获取数据，这总是在前几行之间，并且始终在这些空行空行之前。

因此，如果可以说＆＃34;如果您遇到此错误，请忽略它并继续＆＃34;然后我想学习如何做到这一点。如果不是，那么我将不得不找到另一种解决方法。感谢您的帮助。

有差距的表：

我的代码：

from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import numpy as np
import os
import re
import email
import cx_Oracle

dsnStr = cx_Oracle.makedsn("sole.nefsc.noaa.gov", "1526", "sole")
con = cx_Oracle.connect(user="user", password="password", dsn=dsnStr)

def celltext(cell):
    '''    
        textlist=[]
        for br in cell.findAll('br'):
            next = br.nextSibling
            if not (next and isinstance(next,NavigableString)):
                continue
            next2 = next.nextSibling
            if next2 and isinstance(next2,Tag) and next2.name == 'br':
                text = str(next).strip()
                if text:
                    textlist.append(next)
        return (textlist)
    '''
    textlist=[]
    y = cell.find('span')
    for a in y.childGenerator(): 
        if isinstance(a, NavigableString):
            textlist.append(str(a))
    return (textlist)

path = 'Z:\\blub_2'

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        html=open(file_path,'r').read()
        soup = BeautifulSoup(html, 'lxml') # Parse the HTML as a string
        table = soup.find_all('table')[1] # Grab the second table

df_Quota = pd.DataFrame()

for row in table.find_all('tr'):    
    columns = row.find_all('td')
    if columns[0].get_text().strip()!='ID':  # skip header 
        Quota = celltext(columns[1]) 
        Weight =  celltext(columns[2])
        price =  celltext(columns[3])

        print(Quota)

        Nrows= max([len(Quota),len(Weight),len(price)]) #get the max number of rows

        IDList = [columns[0].get_text()] * Nrows
        DateList = [columns[4].get_text()] * Nrows

        if price[0].strip()=='Package':
             price = [columns[3].get_text()] * Nrows

        if len(Quota)<len(Weight):#if Quota has less itmes extend with NaN
           lstnans= [np.nan]*(len(Weight)-len(Quota))
           Quota.extend(lstnans)

        if len(price) < len(Quota): #if price column has less items than quota column,
            val = [columns[3].get_text()] * (len(Quota)-len(price)) #extend with 
            price.extend(val)                                       #whatever is in
                                                                    #price column

        #if len(DateList) > len(Quota): #if DateList is longer than Quota, 
            #print("it's longer than")
            #value = [columns[4].get_text()] * (len(DateList)-len(Quota))
            #DateList = value * Nrows

        if len(Quota) < len(DateList): #if Quota is less than DateList (due to gap),
            stu = [np.nan]*(len(DateList)-len(Quota))   #extend with NaN
            Quota.extend(stu)

        if len(Weight) < len(DateList):
            dru = [np.nan]*(len(DateList)-len(Weight))
            Weight.extend(dru)

        FinalDataframe = pd.DataFrame(
        {
        'ID':IDList,    
         'AvailableQuota': Quota,
         'LiveWeightPounds': Weight,
         'price':price,
         'DatePosted':DateList
        })

        df_Quota = df_Quota.append(FinalDataframe, ignore_index=True)
        #df_Quota = df_Quota.loc[df_Quota['DatePosted']=='5/20']
        df_Q = df_Quota['DatePosted'].iloc[0]
        df_Quota = df_Quota[df_Quota['DatePosted'] == df_Q]
print (df_Quota)

for filename in os.listdir(path):
    file_path = os.path.join(path, filename)
    if os.path.isfile(file_path):
        with open(file_path, 'r') as f:
            pattern = re.compile(r'Sent:.*?\b(\d{4})\b')
            email = f.read()
            dates = pattern.findall(email)
            if dates:
                print("Date:", ''.join(dates))

#cursor = con.cursor()
#exported_data = [tuple(x) for x in df_Quota.values]
#sql_query = ("INSERT INTO ROUGHTABLE(species, date_posted, stock_id, pounds, money, sector_name, ask)" "VALUES (:1, :2, :3, :4, :5, 'NEFS 2', '1')")
#cursor.executemany(sql_query, exported_data)
#con.commit()

#cursor.close()
#con.close()

Answer 1

使用try: ... except: ...：

try:
    #extract data from table
except IndexError:
    #execute rest of program

Answer 2

继续是用于跳过空/问题行的关键字。 IndexError是由于尝试在空列列表上访问columns[0]。所以只有在出现异常时才跳到下一行。

for row in table.find_all('tr'):
    columns = row.find_all('td')
    try:
      if columns[0].get_text().strip()!='ID':
        # Rest as above in original code.
    except IndexError:
      continue

如何绕过IndexError

2 个答案: