Question

我正尝试使用 Beautiful Soup 从 html 页面提取数据。有些 html 格式错误，甚至根本不存在，在这种情况下，我需要改用正则表达式来尝试提取数据。我使用 try / except 子句来处理这种情况。这是我的脚本：

def get_metadata(path):
    """Extract the document description from the HTML file at *path*.

    The description is first looked up in the parsed markup (the first
    ``<td>`` following a text node matching ``Title``).  If that lookup
    fails for any reason, the text of the ``<pre>`` element (or of the whole
    document when there is no ``<pre>``) is searched with two fallback
    regular expressions.

    Side effects: the file is rewritten in place with the prettified markup
    and ``update_log(log)`` is called.

    Args:
        path: Path of the HTML file to read and rewrite.

    Returns:
        A dict with the keys ``'path'`` and ``'document_description'``
        (``None`` when no description could be found).  If reading, parsing
        or rewriting the file fails, a message is printed and whatever was
        collected up to that point is returned (possibly an empty dict).
    """
    # NOTE(review): both patterns combine a lazy/greedy `.*` under re.S with
    # several adjacent `\*+` / `[ \n\r]*` / `(?:\*[ \n\r]*)+` repetitions.  On
    # input that almost matches, the regex engine may backtrack for a very
    # long time, which looks like a hang inside re.search -- consider
    # simplifying the patterns or bounding the searched text.
    rx_1 = r'Supersedes:?\s*[^\r\n]*[\r\n]+(.*?)[ \r\n]+(?:Service)?\s*Serial Numbers?:?[ \r\n]+.*?[ \n\r]+\*+[\n\r]+\*[\n\r]*([A-Za-z ]+)[ \n\r]\*+[\n\r]+\*+[ \n\r]*\*+[\n\r]+\*+[ \n\r]*(?:\*[ \n\r]*)+(?:Author[:\w\/]+ ([\.\w\/\sï¿½]+))'
    rx_2 = r'(.*)(?:Service)[\r\n ]+Serial Numbers?:?[ \r\n]+.*?[ \n\r]+\*+[\n\r]+\*[\n\r]*([A-Za-z ]+)[ \n\r]\*+[\n\r]+\*+[ \n\r]*\*+[\n\r]+\*+[ \n\r]*(?:\*[ \n\r]*)+(?:Author[:\w\/]+ ([\.\w\/\sï¿½]+))'
    data = {}

    try:
        # Close the input handle as soon as parsing is done: the same path is
        # reopened for writing further down.
        with open(path, 'rb') as source:
            soup = BeautifulSoup(source, 'html.parser')

        pre = soup.find('pre')
        pre_string = pre.text if pre else soup.text

        data['path'] = path
        try:
            description = soup.find(text=re.compile(r'\s*Title:?')).find_next('td').contents[0]
            description = description.strip().replace('\n', '\\n').replace(',', '\\n')
            data['document_description'] = description
        except Exception:
            # The markup lookup can fail in several ways on malformed HTML
            # (no "Title" node, no following <td>, empty cell, ...), so fall
            # back to the regular expressions on the plain text.
            for rx in (rx_1, rx_2):
                try:
                    match = re.search(rx, pre_string, re.S | re.M)
                    if not match:
                        continue
                    description = match[1]
                    if not description or description == 'Service':
                        continue
                    description = re.sub(before_d, '', description)
                    description = re.sub(after_d, '', description)
                    description = re.sub(r'([\r\n]|\s+)', ' ', description)
                    data['document_description'] = description
                    break
                except Exception:
                    # A failing pattern must not abort the whole file; record
                    # the description as missing and try the next pattern.
                    data['document_description'] = None
                    continue
            # Use .get(): when no pattern matched (and none raised) the key
            # was never set, and indexing it would raise a KeyError that the
            # outer handler would misreport as a parsing failure.
            if not data.get('document_description'):
                data['document_description'] = None

        html = soup.prettify('utf-8')

        with open(path, 'wb') as f:
            f.write(html)
            update_log(log)

    except Exception as why:
        print('failed to open soup' + str(why))
    return data

到底哪里出了问题，以致在类似下面这样的行无法求值时，异常没有被引发并处理？

  try:
        description = soup.find(text=re.compile(r'\s*Title:?')).find_next('td').contents[0]

脚本只是在执行过程中冻结。

我对多个不同的路径运行 get_metadata，在遇到异常处理问题之前，它一直正常工作。每尝试一个路径都会输出一条记录。当读取到会触发异常的文件时，脚本就冻结在该文件上，我必须通过键盘中断才能退出。奇怪的是，这个错误似乎并没有按 try 子句预期的方式得到处理：

 python just-meta.py
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_01.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_02.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_03.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_04.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_05.htm
SNViewer-HTML\Compliance\CE\SN_CE_Compliance_06.htm
SNViewer-HTML\Compliance\CS\SN_CS_Compliance_01A.htm
Traceback (most recent call last):
  File "just-meta.py", line 109, in get_metadata
    description = soup.find(text=re.compile(r'\s*Title:?')).find_next('td').contents[0]
AttributeError: 'NoneType' object has no attribute 'find_next'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "just-meta.py", line 353, in <module>
    migrate()
  File "just-meta.py", line 46, in migrate
    metadata = get_metadata(f)
  File "just-meta.py", line 115, in get_metadata
    match = re.search(rx, pre_string, re.S|re.M)
  File "C:\Users\Joseph\AppData\Local\Programs\Python\Python36\lib\re.py", line 182, in search
    return _compile(pattern, flags).search(string)

任何人都可以给我有关如何解决此问题的建议吗？

在 try/except 子句中挂起的 Python 脚本

0 个答案: