以下是我的情况:我的代码解析了电子邮件中HTML表格中的数据。我遇到的障碍是这些表格中的一些在表格中间有空白的空行,如下图所示。此空白区域导致我的代码失败(IndexError: list index out of range
),因为它尝试从单元格中提取文本。
是否有可能对Python说:"好吧,如果遇到来自这些空行的这个错误,只需停在那里并从目前为止获取已获取文本的行并执行其余的关于那些" ...?
的代码这可能听起来像是这个问题的一个愚蠢的解决方案,但我的项目涉及到我只从表中最近的日期获取数据,这总是在前几行之间,并且始终在这些空行空行之前。
因此,如果可以说"如果您遇到此错误,请忽略它并继续"然后我想学习如何做到这一点。如果不是,那么我将不得不找到另一种解决方法。感谢您的帮助。
我的代码:
from bs4 import BeautifulSoup, NavigableString, Tag
import pandas as pd
import numpy as np
import os
import re
import email
import cx_Oracle
dsnStr = cx_Oracle.makedsn("sole.nefsc.noaa.gov", "1526", "sole")
con = cx_Oracle.connect(user="user", password="password", dsn=dsnStr)
def celltext(cell):
'''
textlist=[]
for br in cell.findAll('br'):
next = br.nextSibling
if not (next and isinstance(next,NavigableString)):
continue
next2 = next.nextSibling
if next2 and isinstance(next2,Tag) and next2.name == 'br':
text = str(next).strip()
if text:
textlist.append(next)
return (textlist)
'''
textlist=[]
y = cell.find('span')
for a in y.childGenerator():
if isinstance(a, NavigableString):
textlist.append(str(a))
return (textlist)
path = 'Z:\\blub_2'
for filename in os.listdir(path):
file_path = os.path.join(path, filename)
if os.path.isfile(file_path):
html=open(file_path,'r').read()
soup = BeautifulSoup(html, 'lxml') # Parse the HTML as a string
table = soup.find_all('table')[1] # Grab the second table
df_Quota = pd.DataFrame()
for row in table.find_all('tr'):
columns = row.find_all('td')
if columns[0].get_text().strip()!='ID': # skip header
Quota = celltext(columns[1])
Weight = celltext(columns[2])
price = celltext(columns[3])
print(Quota)
Nrows= max([len(Quota),len(Weight),len(price)]) #get the max number of rows
IDList = [columns[0].get_text()] * Nrows
DateList = [columns[4].get_text()] * Nrows
if price[0].strip()=='Package':
price = [columns[3].get_text()] * Nrows
if len(Quota)<len(Weight):#if Quota has less itmes extend with NaN
lstnans= [np.nan]*(len(Weight)-len(Quota))
Quota.extend(lstnans)
if len(price) < len(Quota): #if price column has less items than quota column,
val = [columns[3].get_text()] * (len(Quota)-len(price)) #extend with
price.extend(val) #whatever is in
#price column
#if len(DateList) > len(Quota): #if DateList is longer than Quota,
#print("it's longer than")
#value = [columns[4].get_text()] * (len(DateList)-len(Quota))
#DateList = value * Nrows
if len(Quota) < len(DateList): #if Quota is less than DateList (due to gap),
stu = [np.nan]*(len(DateList)-len(Quota)) #extend with NaN
Quota.extend(stu)
if len(Weight) < len(DateList):
dru = [np.nan]*(len(DateList)-len(Weight))
Weight.extend(dru)
FinalDataframe = pd.DataFrame(
{
'ID':IDList,
'AvailableQuota': Quota,
'LiveWeightPounds': Weight,
'price':price,
'DatePosted':DateList
})
df_Quota = df_Quota.append(FinalDataframe, ignore_index=True)
#df_Quota = df_Quota.loc[df_Quota['DatePosted']=='5/20']
df_Q = df_Quota['DatePosted'].iloc[0]
df_Quota = df_Quota[df_Quota['DatePosted'] == df_Q]
print (df_Quota)
for filename in os.listdir(path):
file_path = os.path.join(path, filename)
if os.path.isfile(file_path):
with open(file_path, 'r') as f:
pattern = re.compile(r'Sent:.*?\b(\d{4})\b')
email = f.read()
dates = pattern.findall(email)
if dates:
print("Date:", ''.join(dates))
#cursor = con.cursor()
#exported_data = [tuple(x) for x in df_Quota.values]
#sql_query = ("INSERT INTO ROUGHTABLE(species, date_posted, stock_id, pounds, money, sector_name, ask)" "VALUES (:1, :2, :3, :4, :5, 'NEFS 2', '1')")
#cursor.executemany(sql_query, exported_data)
#con.commit()
#cursor.close()
#con.close()
答案 0 :(得分:1)
使用try: ... except: ...
:
try:
#extract data from table
except IndexError:
#execute rest of program
答案 1 :(得分:1)
继续是用于跳过空/问题行的关键字。 IndexError是由于尝试在空列列表上访问columns[0]
。所以只有在出现异常时才跳到下一行。
for row in table.find_all('tr'):
columns = row.find_all('td')
try:
if columns[0].get_text().strip()!='ID':
# Rest as above in original code.
except IndexError:
continue