我正在开发一个项目,使用BS4从本地存储的HTML文件中提取特定信息。由于文件数量相当多(超过100万个),速度和性能是遍历所有文件的关键。到目前为止我一直在用BS4,因为之前做网络爬虫时用过它,觉得它非常简单方便。但一旦涉及大量数据,BS4就会变慢。我了解到 lxml 解析器和 html.parser,它们似乎是 Python 中解析 HTML 文档最简单、最快速的解析器。
所以我的代码现在看起来像:
from bs4 import BeautifulSoup
import glob
import os
import re
import contextlib
@contextlib.contextmanager
def stdout2file(fname):
    """Redirect sys.stdout into the file *fname* for the duration of the block.

    The file is opened for writing (truncating existing content).  stdout is
    always restored and the file always closed, even when the body raises --
    the original version skipped both on an exception, leaking the handle and
    leaving stdout redirected for the rest of the process.
    """
    import sys
    f = open(fname, 'w')
    sys.stdout = f
    try:
        yield
    finally:
        sys.stdout = sys.__stdout__
        f.close()
def trade_spider():
    """Walk every *.html file under the working directory and write to
    output.txt one line per file: the file name, the name attribute and the
    text of the first ix:nonfraction tag whose name contains "SearchTag".
    """
    os.chdir(r"C:\Users\XXX")
    with stdout2file("output.txt"):
        for file in glob.iglob('**/*.html', recursive=True):
            with open(file, encoding="utf8") as f:
                contents = f.read()
            soup = BeautifulSoup(contents, "html.parser")
            for item in soup.findAll("ix:nonfraction"):
                # item.get avoids the KeyError that item['name'] raised on
                # tags carrying no name attribute; a plain substring test
                # replaces re.match(".*SearchTag", ...).
                name = item.get('name', '')
                if "SearchTag" in name:
                    print(os.path.basename(file), end="| ")
                    print(name, end="| ")
                    print(item.get_text())
                    break  # first match per file is enough

trade_spider()
它打开一个文本文件,进入我设定的目录(os.chdir(..)),遍历所有以.html结尾的文件,读取内容,如果找到 name 属性为“SearchTag”的标签,就取出相关的HTML文本并打印到已打开的文本文件中。每个文件找到一个匹配后就 break,继续处理下一个文件。另外我读到,BS4 把这一切都放在内存里完成,这大大增加了处理时间。
这就是为什么我想使用lxml(首选)或html.parser来改变我的代码。
有没有哪位高手能在不改变我最初简单思路的前提下,把我的代码改成使用 lxml 解析器?
任何帮助都对此表示赞赏,因为我完全陷入困境......
更新:
import lxml.etree as et
import os
import glob
import contextlib
@contextlib.contextmanager
def stdout2file(fname):
    """Redirect sys.stdout into the file *fname* for the duration of the block.

    stdout is always restored and the file always closed, even when the body
    raises -- the original version skipped both on an exception, leaking the
    handle and leaving stdout redirected for the rest of the process.
    """
    import sys
    f = open(fname, 'w')
    sys.stdout = f
    try:
        yield
    finally:
        sys.stdout = sys.__stdout__
        f.close()
def skip_to(fle, line):
    """Skip any leading lines of *fle* until one starting with *line*
    (e.g. "<?xml"), then parse from that position with lxml.

    Returns the lxml ElementTree.  Raises ValueError if the marker is never
    found (the original spun forever at EOF, since readline() keeps
    returning "" there).
    """
    # Explicit utf8 avoids the cp1252 UnicodeDecodeError seen in the
    # traceback when the platform default encoding cannot decode the file.
    with open(fle, encoding="utf8") as f:
        pos = 0
        cur_line = f.readline()
        while not cur_line.strip().startswith(line):
            pos = f.tell()
            cur_line = f.readline()
            if not cur_line:  # EOF reached without finding the marker
                raise ValueError("marker %r not found in %s" % (line, fle))
        f.seek(pos)
        return et.parse(f)
def trade_spider():
    """Walk every *.html file under the working directory and write to
    auditfeesexpenses.txt the file name, name attribute and text of the
    first ix:nonFraction tag whose name contains 'AuditFeesExpenses'.
    """
    os.chdir(r"F:\04_Independent Auditors Report")
    with stdout2file("auditfeesexpenses.txt"):
        for file in glob.iglob('**/*.html', recursive=True):
            xml = skip_to(file, "<?xml")
            root = xml.getroot()
            # Guard: skip documents that never declare the ix namespace
            # instead of crashing with a KeyError on root.nsmap["ix"].
            if "ix" not in root.nsmap:
                continue
            nsmap = {"ix": root.nsmap["ix"]}
            fractions = xml.xpath(
                "//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]",
                namespaces=nsmap)
            for fraction in fractions:
                print(os.path.basename(file), end="| ")
                print(fraction.get("name"), end="| ")
                print(fraction.text, end=" \n")
                break  # only the first match per file

trade_spider()
我收到此错误消息:
Traceback (most recent call last):
File "C:/Users/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 43, in <module>
trade_spider()
File "C:/Users/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 33, in trade_spider
xml = skip_to(file, "<?xml")
File "C:/Users/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 26, in skip_to
return et.parse(f)
File "lxml.etree.pyx", line 3427, in lxml.etree.parse (src\lxml\lxml.etree.c:79720)
File "parser.pxi", line 1803, in lxml.etree._parseDocument (src\lxml\lxml.etree.c:116182)
File "parser.pxi", line 1823, in lxml.etree._parseFilelikeDocument (src\lxml\lxml.etree.c:116474)
File "parser.pxi", line 1718, in lxml.etree._parseDocFromFilelike (src\lxml\lxml.etree.c:115235)
File "parser.pxi", line 1139, in lxml.etree._BaseParser._parseDocFromFilelike (src\lxml\lxml.etree.c:110109)
File "parser.pxi", line 573, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:103323)
File "parser.pxi", line 679, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:104936)
File "lxml.etree.pyx", line 324, in lxml.etree._ExceptionContext._raise_if_stored (src\lxml\lxml.etree.c:10656)
File "parser.pxi", line 362, in lxml.etree._FileReaderContext.copyToBuffer (src\lxml\lxml.etree.c:100828)
File "C:\Users\6930p\Anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 1789: character maps to <undefined>
答案 0 :(得分:3)
根据 pastebin 中的 html 文件做了一些整理工作之后,下面的代码可以找到 name 属性包含 'AuditFeesExpenses' 的 ix:nonFraction 标签:
输出:
import lxml.etree as et

def skip_to(fle, line):
    """Skip any leading lines of *fle* until one starting with *line*
    (e.g. "<?xml"), then parse from that position with lxml.

    Returns the lxml ElementTree.  Raises ValueError if the marker is never
    found (the original looped forever at EOF).
    """
    # Explicit utf8 avoids UnicodeDecodeError on platforms whose default
    # encoding (e.g. cp1252) cannot decode the file's bytes.
    with open(fle, encoding="utf8") as f:
        pos = 0
        cur_line = f.readline()
        while not cur_line.strip().startswith(line):
            pos = f.tell()
            cur_line = f.readline()
            if not cur_line:  # EOF reached without finding the marker
                raise ValueError("marker %r not found in %s" % (line, fle))
        f.seek(pos)
        return et.parse(f)
# Parse the sample html file with lxml, skipping everything before the
# "<?xml" prologue line.
xml = skip_to("/home/padraic/Downloads/sample_html_file.html","<?xml")
tree = xml.getroot()
# one mapping is None -> None: 'http://www.w3.org/1999/xhtml'
# Drop the default (None-keyed) namespace: lxml's xpath() rejects a None prefix.
nsmap = {k: v for k, v in tree.nsmap.items() if k}
# All ix:nonFraction elements whose name attribute mentions AuditFeesExpenses.
print(xml.xpath("//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]", namespaces=nsmap))
提取文字和名称:
[<Element {http://www.xbrl.org/2008/inlineXBRL}nonFraction at 0x7f5b9e91c560>, <Element {http://www.xbrl.org/2008/inlineXBRL}nonFraction at 0x7f5b9e91c5a8>]
哪个会给你:
# Select every ix:nonFraction whose name attribute contains
# 'AuditFeesExpenses', then report each element's name attribute and text.
fractions = xml.xpath("//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]", namespaces=nsmap)
for node in fractions:
    name_attr = node.get("name")
    print(name_attr)
    print(node.text)
此外,如果你只使用 ix 命名空间,也可以只提取该命名空间的映射:
ns19:AuditFeesExpenses
1,850
ns19:AuditFeesExpenses
2,400
所以完整的可用代码如下:
# Build a namespace map holding only the ix prefix taken from the document
# root, query for the matching ix:nonFraction elements, and print each
# element's name attribute followed by its text.
tree = xml.getroot()
nsmap = {"ix": tree.nsmap["ix"]}
fractions = xml.xpath("//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]", namespaces=nsmap)
for element in fractions:
    print(element.get("name"))
    print(element.text)
代替 os.chdir ,你也可以:
def trade_spider():
    """Walk every *.html file under the working directory and write to
    auditfeesexpenses.txt one "file| name| text|" record per matching
    ix:nonFraction tag whose name contains 'AuditFeesExpenses'.
    """
    os.chdir(r"C:\Users\Independent Auditors Report")
    with stdout2file("auditfeesexpenses.txt"):
        for file in glob.iglob('**/*.html', recursive=True):
            xml = skip_to(file, "<?xml")
            root = xml.getroot()
            # Guard: skip documents that never declare the ix namespace
            # instead of crashing with a KeyError on root.nsmap["ix"].
            if "ix" not in root.nsmap:
                continue
            nsmap = {"ix": root.nsmap["ix"]}
            fractions = xml.xpath(
                "//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]",
                namespaces=nsmap)
            for fraction in fractions:
                print(os.path.basename(file), end="| ")
                print(fraction.get("name"), end="| ")
                print(fraction.text, end="|")