我正在尝试使用 Beautiful Soup 从 HTML 页面提取数据。有些 HTML 格式错误,甚至根本不存在,在这种情况下,我需要改用正则表达式来尝试提取。我使用 try / except 子句来处理这两种情况。这是我的脚本:
def get_metadata(path):
    """Extract document metadata from the HTML file at *path*.

    Looks for a "Title:" table cell with BeautifulSoup first; when the
    markup is too broken for that, falls back to regex matching against
    the page's <pre> text.  On success the file is also re-written in
    prettified form.

    :param path: filesystem path to the .htm/.html file.
    :return: dict with keys 'path' and 'document_description'
             ('document_description' is None when nothing could be
             extracted; the dict is empty if the file could not be
             parsed at all).
    """
    # NOTE: the original code did path.replace('\\', '\\'), which is a
    # no-op (it replaces a backslash with itself) and has been removed.
    rx_1 = r'Supersedes:?\s*[^\r\n]*[\r\n]+(.*?)[ \r\n]+(?:Service)?\s*Serial Numbers?:?[ \r\n]+.*?[ \n\r]+\*+[\n\r]+\*[\n\r]*([A-Za-z ]+)[ \n\r]\*+[\n\r]+\*+[ \n\r]*\*+[\n\r]+\*+[ \n\r]*(?:\*[ \n\r]*)+(?:Author[:\w\/]+ ([\.\w\/\s�]+))'
    rx_2 = r'(.*)(?:Service)[\r\n ]+Serial Numbers?:?[ \r\n]+.*?[ \n\r]+\*+[\n\r]+\*[\n\r]*([A-Za-z ]+)[ \n\r]\*+[\n\r]+\*+[ \n\r]*\*+[\n\r]+\*+[ \n\r]*(?:\*[ \n\r]*)+(?:Author[:\w\/]+ ([\.\w\/\s�]+))'
    # Compile once, outside the loop.  WARNING: with re.S the '.'
    # crosses newlines, and the nested/ambiguous quantifiers in these
    # patterns can backtrack catastrophically on non-matching input.
    # re.search then runs for an extremely long time *without raising*,
    # which is why the script appears to freeze instead of failing.
    rxs = [re.compile(rx_1, re.S | re.M), re.compile(rx_2, re.S | re.M)]
    data = {}
    try:
        # Close the file handle deterministically (the original leaked it).
        with open(path, 'rb') as fh:
            soup = BeautifulSoup(fh, 'html.parser')
        pre = soup.find('pre')
        pre_string = pre.text if pre else soup.text
        data['path'] = path
        # Initialise so a failed extraction can never raise KeyError below.
        data['document_description'] = None
        try:
            # Happy path: a well-formed "Title:" label followed by a <td>.
            description = soup.find(text=re.compile(r'\s*Title:?')).find_next('td').contents[0]
            description = description.strip().replace('\n', '\\n').replace(',', '\\n')
            data['document_description'] = description
        except (AttributeError, IndexError):
            # find() returned None or the cell was empty -- the markup is
            # broken, so fall back to the regexes on the raw <pre> text.
            for rx in rxs:
                try:
                    match = rx.search(pre_string)
                    if not match:
                        continue
                    description = match[1]
                    # Skip empty captures and the known false-positive 'Service'.
                    if not description or description == 'Service':
                        continue
                    # before_d / after_d are module-level patterns defined
                    # elsewhere in this file.
                    description = re.sub(before_d, '', description)
                    description = re.sub(after_d, '', description)
                    description = re.sub(r'([\r\n]|\s+)', ' ', description)
                    data['document_description'] = description
                    break
                except Exception as why:
                    err = {'path': path, 'error': str(why), 'field': 'description'}
                    # record_err(err)
                    data['document_description'] = None
        html = soup.prettify('utf-8')
        with open(path, 'wb') as f:
            f.write(html)
        update_log(log)
    except Exception as why:
        # Best effort: report the failure and return whatever was gathered.
        print('failed to open soup' + str(why))
    return data
出了什么问题,以致当类似下面这样的行无法求值时,异常没有被正常引发和处理?
try:
description = soup.find(text=re.compile(r'\s*Title:?')).find_next('td').contents[0]
脚本只是在执行过程中冻结。
我为几个不同的路径运行get_metadata
,在遇到异常处理问题之前,此方法一直有效。每尝试一个路径都会记录一次。当读取到会触发异常的文件时,脚本会卡在该文件上,我必须用键盘中断(Ctrl+C)强制退出。奇怪的是,这个错误似乎没有像在 try 子句中那样被处理:
python just-meta.py
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_01.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_02.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_03.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_04.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_05.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_06.htm
SNViewer-HTML\Compliance\CS\SN_CS_Compliance_01A.htm
Traceback (most recent call last):
File "just-meta.py", line 109, in get_metadata
description = soup.find(text=re.compile(r'\s*Title:?')).find_next('td').contents[0]
AttributeError: 'NoneType' object has no attribute 'find_next'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "just-meta.py", line 353, in <module>
migrate()
File "just-meta.py", line 46, in migrate
metadata = get_metadata(f)
File "just-meta.py", line 115, in get_metadata
match = re.search(rx, pre_string, re.S|re.M)
File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\re.py", line 182, in search
return _compile(pattern, flags).search(string)
任何人都可以给我有关如何解决此问题的建议吗?