我正在开发一个项目,使用BS4从本地存储的HTML文件中提取特定信息。由于文件数量相当多(超过100万个),速度和性能是遍历所有文件的关键。到目前为止我一直在用BS4,因为之前做网络爬虫时用过它,觉得它非常简单方便。但一旦涉及大量数据,BS4就会变慢。我了解到 lxml 解析器和 html.parser,它们似乎是 Python 中解析 HTML 文档最简单、最快速的解析器。
所以我的代码现在看起来像:
from bs4 import BeautifulSoup
import glob
import os
import re
import contextlib
@contextlib.contextmanager
def stdout2file(fname):
    """Redirect sys.stdout into the file *fname* for the duration of the block.

    The file is opened for writing (truncating existing content).  stdout is
    always restored and the file always closed, even when the body raises --
    the original version skipped both on an exception, leaking the handle and
    leaving stdout redirected for the rest of the process.
    """
    import sys
    f = open(fname, 'w')
    sys.stdout = f
    try:
        yield
    finally:
        sys.stdout = sys.__stdout__
        f.close()
def trade_spider():
    """Walk every *.html file under the working directory and write to
    output.txt one line per file: the file name, the name attribute and the
    text of the first ix:nonfraction tag whose name contains "SearchTag".
    """
    os.chdir(r"C:\Users\XXX")
    with stdout2file("output.txt"):
        for file in glob.iglob('**/*.html', recursive=True):
            with open(file, encoding="utf8") as f:
                contents = f.read()
            soup = BeautifulSoup(contents, "html.parser")
            for item in soup.findAll("ix:nonfraction"):
                # item.get avoids the KeyError that item['name'] raised on
                # tags carrying no name attribute; a plain substring test
                # replaces re.match(".*SearchTag", ...).
                name = item.get('name', '')
                if "SearchTag" in name:
                    print(os.path.basename(file), end="| ")
                    print(name, end="| ")
                    print(item.get_text())
                    break  # first match per file is enough

trade_spider()
它打开一个文本文件,进入我设定的目录(os.chdir(..)),遍历所有以.html结尾的文件,读取内容,如果找到 name 属性为“SearchTag”的标签,就取出相关的HTML文本并打印到已打开的文本文件中。每个文件找到一个匹配后就 break,继续处理下一个文件。另外我读到,BS4 把这一切都放在内存里完成,这大大增加了处理时间。
这就是为什么我想使用lxml(首选)或html.parser来改变我的代码。
有没有哪位高手能在不改变我最初简单思路的前提下,把我的代码改成使用 lxml 解析器?
任何帮助都对此表示赞赏,因为我完全陷入困境......
更新:
import lxml.etree as et
import os
import glob
import contextlib
@contextlib.contextmanager
def stdout2file(fname):
    """Redirect sys.stdout into the file *fname* for the duration of the block.

    stdout is always restored and the file always closed, even when the body
    raises -- the original version skipped both on an exception, leaking the
    handle and leaving stdout redirected for the rest of the process.
    """
    import sys
    f = open(fname, 'w')
    sys.stdout = f
    try:
        yield
    finally:
        sys.stdout = sys.__stdout__
        f.close()
def skip_to(fle, line):
    """Skip any leading lines of *fle* until one starting with *line*
    (e.g. "<?xml"), then parse from that position with lxml.

    Returns the lxml ElementTree.  Raises ValueError if the marker is never
    found (the original spun forever at EOF, since readline() keeps
    returning "" there).
    """
    # Explicit utf8 avoids the cp1252 UnicodeDecodeError seen in the
    # traceback when the platform default encoding cannot decode the file.
    with open(fle, encoding="utf8") as f:
        pos = 0
        cur_line = f.readline()
        while not cur_line.strip().startswith(line):
            pos = f.tell()
            cur_line = f.readline()
            if not cur_line:  # EOF reached without finding the marker
                raise ValueError("marker %r not found in %s" % (line, fle))
        f.seek(pos)
        return et.parse(f)
def trade_spider():
    """Walk every *.html file under the working directory and write to
    auditfeesexpenses.txt the file name, name attribute and text of the
    first ix:nonFraction tag whose name contains 'AuditFeesExpenses'.
    """
    os.chdir(r"F:\04_Independent Auditors Report")
    with stdout2file("auditfeesexpenses.txt"):
        for file in glob.iglob('**/*.html', recursive=True):
            xml = skip_to(file, "<?xml")
            root = xml.getroot()
            # Guard: skip documents that never declare the ix namespace
            # instead of crashing with a KeyError on root.nsmap["ix"].
            if "ix" not in root.nsmap:
                continue
            nsmap = {"ix": root.nsmap["ix"]}
            fractions = xml.xpath(
                "//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]",
                namespaces=nsmap)
            for fraction in fractions:
                print(os.path.basename(file), end="| ")
                print(fraction.get("name"), end="| ")
                print(fraction.text, end=" \n")
                break  # only the first match per file

trade_spider()
我收到此错误消息:
Traceback (most recent call last):
File "C:/Users/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 43, in <module>
trade_spider()
File "C:/Users/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 33, in trade_spider
xml = skip_to(file, "<?xml")
File "C:/Users/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 26, in skip_to
return et.parse(f)
File "lxml.etree.pyx", line 3427, in lxml.etree.parse (src\lxml\lxml.etree.c:79720)
File "parser.pxi", line 1803, in lxml.etree._parseDocument (src\lxml\lxml.etree.c:116182)
File "parser.pxi", line 1823, in lxml.etree._parseFilelikeDocument (src\lxml\lxml.etree.c:116474)
File "parser.pxi", line 1718, in lxml.etree._parseDocFromFilelike (src\lxml\lxml.etree.c:115235)
File "parser.pxi", line 1139, in lxml.etree._BaseParser._parseDocFromFilelike (src\lxml\lxml.etree.c:110109)
File "parser.pxi", line 573, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:103323)
File "parser.pxi", line 679, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:104936)
File "lxml.etree.pyx", line 324, in lxml.etree._ExceptionContext._raise_if_stored (src\lxml\lxml.etree.c:10656)
File "parser.pxi", line 362, in lxml.etree._FileReaderContext.copyToBuffer (src\lxml\lxml.etree.c:100828)
File "C:\Users\6930p\Anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 1789: character maps to <undefined>
答案 0 :(得分:3)
根据 pastebin 中的 html 文件做了一些整理工作之后,下面的代码可以找到 name 属性包含 'AuditFeesExpenses' 的 ix:nonFraction 标签:
输出:
import lxml.etree as et

def skip_to(fle, line):
    """Skip any leading lines of *fle* until one starting with *line*
    (e.g. "<?xml"), then parse from that position with lxml.

    Returns the lxml ElementTree.  Raises ValueError if the marker is never
    found (the original looped forever at EOF).
    """
    # Explicit utf8 avoids UnicodeDecodeError on platforms whose default
    # encoding (e.g. cp1252) cannot decode the file's bytes.
    with open(fle, encoding="utf8") as f:
        pos = 0
        cur_line = f.readline()
        while not cur_line.strip().startswith(line):
            pos = f.tell()
            cur_line = f.readline()
            if not cur_line:  # EOF reached without finding the marker
                raise ValueError("marker %r not found in %s" % (line, fle))
        f.seek(pos)
        return et.parse(f)
# Parse the sample html file with lxml, skipping everything before the
# "<?xml" prologue line.
xml = skip_to("/home/padraic/Downloads/sample_html_file.html","<?xml")
tree = xml.getroot()
# one mapping is None -> None: 'http://www.w3.org/1999/xhtml'
# Drop the default (None-keyed) namespace: lxml's xpath() rejects a None prefix.
nsmap = {k: v for k, v in tree.nsmap.items() if k}
# All ix:nonFraction elements whose name attribute mentions AuditFeesExpenses.
print(xml.xpath("//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]", namespaces=nsmap))
提取文字和名称:
[<Element {http://www.xbrl.org/2008/inlineXBRL}nonFraction at 0x7f5b9e91c560>, <Element {http://www.xbrl.org/2008/inlineXBRL}nonFraction at 0x7f5b9e91c5a8>]
哪个会给你:
# Select every ix:nonFraction whose name attribute contains
# 'AuditFeesExpenses', then report each element's name attribute and text.
fractions = xml.xpath("//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]", namespaces=nsmap)
for node in fractions:
    name_attr = node.get("name")
    print(name_attr)
    print(node.text)
此外,如果你只使用 ix 命名空间,也可以只提取该命名空间的映射:
ns19:AuditFeesExpenses
1,850
ns19:AuditFeesExpenses
2,400
所以完整的可用代码如下:
# Build a namespace map holding only the ix prefix taken from the document
# root, query for the matching ix:nonFraction elements, and print each
# element's name attribute followed by its text.
tree = xml.getroot()
nsmap = {"ix": tree.nsmap["ix"]}
fractions = xml.xpath("//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]", namespaces=nsmap)
for element in fractions:
    print(element.get("name"))
    print(element.text)
代替 os.chdir ,你也可以:
def trade_spider():
    """Walk every *.html file under the working directory and write to
    auditfeesexpenses.txt one "file| name| text|" record per matching
    ix:nonFraction tag whose name contains 'AuditFeesExpenses'.
    """
    os.chdir(r"C:\Users\Independent Auditors Report")
    with stdout2file("auditfeesexpenses.txt"):
        for file in glob.iglob('**/*.html', recursive=True):
            xml = skip_to(file, "<?xml")
            root = xml.getroot()
            # Guard: skip documents that never declare the ix namespace
            # instead of crashing with a KeyError on root.nsmap["ix"].
            if "ix" not in root.nsmap:
                continue
            nsmap = {"ix": root.nsmap["ix"]}
            fractions = xml.xpath(
                "//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]",
                namespaces=nsmap)
            for fraction in fractions:
                print(os.path.basename(file), end="| ")
                print(fraction.get("name"), end="| ")
                print(fraction.text, end="|")