我正在使用此代码:
import lxml.etree as et
import os
import glob
import contextlib
@contextlib.contextmanager
def stdout2file(fname):
    """Temporarily send everything written to ``sys.stdout`` into a file.

    Args:
        fname: Path of the file to (over)write; it is opened in ``'w'``
            mode as UTF-8.

    Yields:
        None. ``print`` output inside the ``with`` body lands in ``fname``.

    On exit -- including when the body raises -- the file is closed and
    ``sys.stdout`` is set back to the stream that was active before entry
    (not unconditionally to ``sys.__stdout__``), so nested redirections
    keep working.
    """
    # redirect_stdout remembers the previous stream and restores it on exit;
    # the enclosing open() guarantees the file is flushed and closed.
    with open(fname, 'w', encoding="utf-8") as f, contextlib.redirect_stdout(f):
        yield
def skip_to(fle, line):
    """Parse ``fle`` with lxml, starting at the first line that begins with ``line``.

    Args:
        fle: Path of the file to read.
        line: Prefix (anything ``str.startswith`` accepts) that marks the
            first line to hand to the parser.

    Returns:
        The result of ``et.parse`` on the file, read from the matching line
        onward.

    Notes:
        * The file is opened in text mode with the platform default
          encoding, so decoding errors surface while lxml reads from it.
        * Only the first line is stripped before the prefix test.
        * ``readline()`` returns ``""`` at end of file, so the search never
          terminates when no line matches.
    """
    with open(fle) as handle:
        offset = 0
        current = handle.readline().strip()
        while True:
            if current.startswith(line):
                break
            # Remember where the next line starts before consuming it.
            offset = handle.tell()
            current = handle.readline()
        # Rewind to the beginning of the matching line and parse from there.
        handle.seek(offset)
        return et.parse(handle)
def trade_spider():
    """Write the first matching ``ix:nonFraction`` element of each HTML file to a text file.

    Changes into the hard-coded working directory, redirects stdout to
    ``Test123.txt`` there, and for every ``*.html`` file found recursively
    prints one ``file| name| text|`` line for the first ``ix:nonFraction``
    element whose ``name`` attribute contains ``'ABC'``.

    Files whose root element does not declare an ``ix`` namespace prefix are
    skipped instead of aborting the whole run with ``KeyError``.
    """
    os.chdir(r"F:\ABC")
    with stdout2file("Test123.txt"):
        # Raw string: the same pattern as before, without the invalid "\*" escape.
        for file in glob.iglob(r'**\*.html', recursive=True):
            xml = skip_to(file, "<?xml")
            tree = xml.getroot()
            ix_uri = tree.nsmap.get("ix")
            if ix_uri is None:
                # No "ix" prefix declared, so the XPath below cannot match.
                continue
            fractions = xml.xpath(
                "//ix:nonFraction[contains(@name, 'ABC')]",
                namespaces={"ix": ix_uri},
            )
            # Only the first hit per file is reported.
            for fraction in fractions:
                print(
                    os.path.basename(file),
                    fraction.get("name"),
                    fraction.text,
                    sep="| ",
                    end="|\n",
                )
                break


if __name__ == "__main__":
    trade_spider()
我收到此错误消息:
C:\Users\Anaconda3\python.exe C:/Users/PycharmProjects/untitled/Versuch/lxmlparser.py
Traceback (most recent call last):
File "C:/PycharmProjects/untitled/Versuch/lxmlparser.py", line 42, in <module>
trade_spider()
File "C:/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 33, in trade_spider
xml = skip_to(file, "<?xml")
File "C:/6930p/PycharmProjects/untitled/Versuch/lxmlparser.py", line 26, in skip_to
return et.parse(f)
File "lxml.etree.pyx", line 3427, in lxml.etree.parse (src\lxml\lxml.etree.c:79720)
File "parser.pxi", line 1803, in lxml.etree._parseDocument (src\lxml\lxml.etree.c:116182)
File "parser.pxi", line 1823, in lxml.etree._parseFilelikeDocument (src\lxml\lxml.etree.c:116474)
File "parser.pxi", line 1718, in lxml.etree._parseDocFromFilelike (src\lxml\lxml.etree.c:115235)
File "parser.pxi", line 1139, in lxml.etree._BaseParser._parseDocFromFilelike (src\lxml\lxml.etree.c:110109)
File "parser.pxi", line 573, in lxml.etree._ParserContext._handleParseResultDoc (src\lxml\lxml.etree.c:103323)
File "parser.pxi", line 679, in lxml.etree._handleParseResult (src\lxml\lxml.etree.c:104936)
File "lxml.etree.pyx", line 324, in lxml.etree._ExceptionContext._raise_if_stored (src\lxml\lxml.etree.c:10656)
File "parser.pxi", line 362, in lxml.etree._FileReaderContext.copyToBuffer (src\lxml\lxml.etree.c:100828)
File "C:\6930p\Anaconda3\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8f in position 1789: character maps to <undefined>
补充说明:我的目录包含5个子文件夹,每个子文件夹又最多包含12个存放HTML文件的子文件夹。
如果我将os.chdir(r"F:\ABC\201X\XXX")
设置为目录 '201X' 中的各个子文件夹,则代码可以正常运行。但在以下情况下,它会给出上面提到的错误消息:
1.我将 os.chdir 设置为 r"F:\ABC\2012\October"
(October 是 lxml 解析器在子文件夹 2012 中解析的第一个子文件夹。对于所有其他子文件夹,代码都能正常运行!?)
2.我将 os.chdir 设置为 r"F:\ABC"
。由于我不想手动逐个设置所有子文件夹,我最初的意图是自动解析 ABC 中的所有子文件夹。我以为如果使用for file in glob.iglob('**/*.html', recursive=True):
它就会遍历我的目录 "ABC" 中包含的所有子文件夹?
之前有人遇到过这样的问题吗?
答案 0 :(得分:1)
好的,这是一个编码问题。将with open(fle) as f:
更改为with open(fle, encoding="utf-8") as f:
也就是改用 utf-8 编码之后,代码就可以正常运行了。
答案 1 :(得分:1)
UnicodeDecodeError 是编码问题
,将编码设置为 utf-8 即可解决;代码卡在循环中则是因为并非所有文件都以"<?xml"
开头,有些文件以<html ...
开头。下面修改过的函数可以解决这个问题,我还添加了一些调试日志,以便您可以看到哪些文件不包含任何数据。
import logging
import contextlib
# Module-level logger, named after this script's path (__file__).
logger = logging.getLogger(__file__)
# Route log records to the file debug.log.
logging.basicConfig(filename="debug.log")
# Let DEBUG-level messages from this logger through.
logger.setLevel(logging.DEBUG)
@contextlib.contextmanager
def stdout2file(fname):
    """Temporarily send everything written to ``sys.stdout`` into a file.

    Args:
        fname: Path of the file to (over)write; it is opened in ``'w'``
            mode as UTF-8.

    Yields:
        None. ``print`` output inside the ``with`` body lands in ``fname``.

    On exit -- including when the body raises -- the file is closed and
    ``sys.stdout`` is set back to the stream that was active before entry
    (not unconditionally to ``sys.__stdout__``), so nested redirections
    keep working.
    """
    # redirect_stdout remembers the previous stream and restores it on exit;
    # the enclosing open() guarantees the file is flushed and closed.
    with open(fname, 'w', encoding="utf-8") as f, contextlib.redirect_stdout(f):
        yield
def skip_to(fle, starts):
    """Parse ``fle`` with lxml, starting at the first line that begins with ``starts``.

    Lines are read until one begins with ``starts`` (leading whitespace is
    ignored on every line); the file is then rewound to the start of that
    line and the remainder is handed to ``et.parse``.

    Args:
        fle: Path of the file to read. It is decoded as UTF-8.
        starts: Prefix, or tuple of prefixes, as accepted by
            ``str.startswith``.

    Returns:
        The result of ``et.parse`` on the file, read from the matching line
        onward.

    Raises:
        ValueError: If the end of the file is reached without any line
            starting with ``starts``. Without this check the search would
            spin forever, because ``readline()`` keeps returning ``""``.
    """
    with open(fle, encoding="utf-8") as f:
        while True:
            # Remember where this line starts so we can rewind to it.
            pos = f.tell()
            cur_line = f.readline()
            if not cur_line:
                raise ValueError(
                    "no line starting with {!r} found in {}".format(starts, fle)
                )
            if cur_line.lstrip().startswith(starts):
                break
        f.seek(pos)
        return et.parse(f)
def trade_spider():
    """Write the first ``AuditFeesExpenses`` fact of each HTML file to a text file.

    Changes into the hard-coded working directory, redirects stdout to
    ``auditfeesexpenses.txt`` there, and for every ``*.html`` file directly
    in that directory prints one ``file| name| text|`` line for the first
    ``ix:nonFraction`` element whose ``name`` attribute contains
    ``'AuditFeesExpenses'``.

    Files with no matching element, or whose root element does not declare
    an ``ix`` namespace prefix, are recorded through ``logger.debug``
    instead of producing output (or aborting the run with ``KeyError``).
    """
    os.chdir(r"C:\Users\Independent Auditors Report")
    with stdout2file("auditfeesexpenses.txt"):
        for file in glob.iglob('./*.html'):
            xml = skip_to(file, ("<?xml", "<html"))
            tree = xml.getroot()
            ix_uri = tree.nsmap.get("ix")
            if ix_uri is None:
                # No "ix" prefix declared, so the XPath below cannot match.
                logger.debug("No 'ix' namespace declared in file %s", file)
                continue
            fractions = xml.xpath(
                "//ix:nonFraction[contains(@name, 'AuditFeesExpenses')]",
                namespaces={"ix": ix_uri},
            )
            if not fractions:
                logger.debug("Nothing found in file %s", file)
                continue
            # Only the first hit per file is reported.
            fraction = fractions[0]
            print(
                os.path.basename(file),
                fraction.get("name"),
                fraction.text,
                sep="| ",
                end="|\n",
            )


if __name__ == "__main__":
    trade_spider()
答案 2 :(得分:0)
最终代码:
import lxml.etree as et
import os
import glob
import logging
import contextlib
# Module-level logger, named after this script's path (__file__).
logger = logging.getLogger(__file__)
# Route log records to the file debug.log.
logging.basicConfig(filename="debug.log")
# Let DEBUG-level messages from this logger through.
logger.setLevel(logging.DEBUG)
@contextlib.contextmanager
def stdout2file(fname):
    """Temporarily send everything written to ``sys.stdout`` into a file.

    Args:
        fname: Path of the file to (over)write; it is opened in ``'w'``
            mode as UTF-8.

    Yields:
        None. ``print`` output inside the ``with`` body lands in ``fname``.

    On exit -- including when the body raises -- the file is closed and
    ``sys.stdout`` is set back to the stream that was active before entry
    (not unconditionally to ``sys.__stdout__``), so nested redirections
    keep working.
    """
    # redirect_stdout remembers the previous stream and restores it on exit;
    # the enclosing open() guarantees the file is flushed and closed.
    with open(fname, 'w', encoding="utf-8") as f, contextlib.redirect_stdout(f):
        yield
def skip_to(fle, starts):
    """Parse ``fle`` with lxml, starting at the first line that begins with ``starts``.

    Lines are read until one begins with ``starts`` (leading whitespace is
    ignored on every line); the file is then rewound to the start of that
    line and the remainder is handed to ``et.parse``.

    Args:
        fle: Path of the file to read. It is decoded as UTF-8.
        starts: Prefix, or tuple of prefixes, as accepted by
            ``str.startswith``.

    Returns:
        The result of ``et.parse`` on the file, read from the matching line
        onward.

    Raises:
        ValueError: If the end of the file is reached without any line
            starting with ``starts``. Without this check the search would
            spin forever, because ``readline()`` keeps returning ``""``.
    """
    # Spell the codec name canonically ("utf-8", not "utf=8").
    with open(fle, encoding="utf-8") as f:
        while True:
            # Remember where this line starts so we can rewind to it.
            pos = f.tell()
            cur_line = f.readline()
            if not cur_line:
                raise ValueError(
                    "no line starting with {!r} found in {}".format(starts, fle)
                )
            if cur_line.lstrip().startswith(starts):
                break
        f.seek(pos)
        return et.parse(f)
def trade_spider():
    """Write the first matching ``ix:nonNumeric`` fact of each HTML file to a text file.

    Changes into the hard-coded working directory, redirects stdout to
    ``Test123.txt`` there, and for every ``*.html`` file found in any nested
    subfolder prints one ``file| name| text|`` line for the first
    ``ix:nonNumeric`` element whose ``name`` attribute contains
    ``'NAMEATTRIBUTE'``. The text is the element's first descendant text
    node, or an empty string when it has none.

    Files with no matching element, or whose root element does not declare
    an ``ix`` namespace prefix, are recorded through ``logger.debug``
    instead of producing output (or aborting the run with ``KeyError``).
    """
    os.chdir(r"F:\XYZ")
    with stdout2file("Test123.txt"):
        # recursive=True is required for "**" to descend through all nested
        # subfolders; without it "**" behaves like a single "*".
        for file in glob.iglob('**/*.html', recursive=True):
            xml = skip_to(file, ("<?xml", "<html"))
            tree = xml.getroot()
            ix_uri = tree.nsmap.get("ix")
            if ix_uri is None:
                # No "ix" prefix declared, so the XPath below cannot match.
                logger.debug("No 'ix' namespace declared in file %s", file)
                continue
            fractions = xml.xpath(
                "//ix:nonNumeric[contains(@name, 'NAMEATTRIBUTE')]",
                namespaces={"ix": ix_uri},
            )
            if not fractions:
                logger.debug("Nothing found in file %s", file)
                continue
            # Only the first hit per file is reported.
            fraction = fractions[0]
            texts = fraction.xpath(".//text()")
            # Guard against elements without any text node (was IndexError).
            value = texts[0] if texts else ""
            print(
                os.path.basename(file),
                fraction.get("name"),
                value,
                sep="| ",
                end="|\n",
            )


if __name__ == "__main__":
    trade_spider()
非常感谢@Padraic Cunningham!
答案 3 :(得分:-1)
嗯,我在 Python 2 中处理包含西里尔字符的文件时,以及在 Python 3 中,都遇到过这种错误。
我看到你的情况也类似,也许下面的做法能帮到你:在文件顶部加入
#!/usr/bin/python
# -*- coding: utf-8 -*-