与以下两个问题相似:
QDir and QDirIterator ignore files with non-ASCII filenames
和
UnicodeEncodeError: 'latin-1' codec can't encode character
关于上面的第二个链接,我在下面添加了test0()。我的理解是utf-8正是我要找的解决方案,但遗憾的是,尝试对文件名进行编码时失败了。
def test0():
    """Probe a hard-coded file name that contains the lone surrogate U+DCB4.

    The name is first handed to ``test`` and is then encoded with the strict
    UTF-8 codec before being passed through ``QFile.decodeName``.
    """
    print("test0...using unicode literal")
    name = u"123c\udcb4.wav"
    test("test0b", name)
    # The strict UTF-8 codec rejects lone surrogates, so for this name the
    # encode call raises UnicodeEncodeError ("surrogates not allowed").
    encoded = name.encode('utf-8')
    print(encoded)
    decoded = QtCore.QFile.decodeName(encoded)
    print(decoded)
    # NOTE: per http://docs.python.org/release/3.0.1/howto/unicode.html,
    # open(name, 'w') followed by a write does reach (and overwrite) the
    # correct file on disk; that experiment is intentionally left disabled.
Test0结果......
test0...using unicode literal
test0b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test0b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test0b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test0b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
Traceback (most recent call last):
File "unicode.py", line 157, in <module>
test0()
File "unicode.py", line 42, in test0
n = name.encode('utf-8')
UnicodeEncodeError: 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
修改
进一步阅读 http://tools.ietf.org/html/rfc3629 后,我了解到“UTF-8的定义禁止编码U+D800到U+DFFF之间的字符编号”。既然utf-8不允许这些字符,那么应该如何处理这样命名的文件?Python可以创建这些文件并检测它们是否存在。所以这是否说明问题出在我对Qt API的用法上,或者出在Qt API本身?!
我正在努力解决Python3中正确处理unicode文件名的问题。最终,我正在研究一款基于Phonon的音乐播放器。我试图尽可能地将问题与之隔离开来。从下面的代码中你会看到我尝试了尽可能多的替代品。我最初的反应是这里有bug ......也许是我的...可能在一个或多个库中。任何帮助将不胜感激!
我有一个包含3个unicode文件名123 [abc] U.wav的目录。前两个文件处理得当......大多数......第三个文件123c是错误的。
from PyQt4 import QtGui, QtCore
import sys, os
def test(_name, _file):
    """Report whether ``_file`` exists according to four different APIs.

    The checks run in this order: ``QFile.exists``, ``QFileInfo.exists``,
    ``os.path.exists`` and ``os.path.isfile``.  One line is printed per
    check, followed by a blank line.

    Args:
        _name: Label printed at the start of every report line.
        _file: File name or path to probe.
    """

    def report(label, shown, found):
        # Printing the name can fail with UnicodeEncodeError; in that case
        # fall back to the error text plus the repr() of the input.
        try:
            print(_name, label, shown(), found)
        except UnicodeEncodeError as err:
            print(err, repr(_file), found)

    qfile = QtCore.QFile(_file)
    report("QFile.exists", qfile.fileName, qfile.exists())

    info = QtCore.QFileInfo(_file)
    report("QFileInfo.exists", info.fileName, info.exists())

    report("os.path.exists", lambda: _file, os.path.exists(_file))
    report("os.path.isfile", lambda: _file, os.path.isfile(_file))
    print()
def test1():
    """Probe the first command-line argument as reported by QApplication."""
    qt_args = QtGui.QApplication.arguments()
    print("test1...using QtGui.QApplication.arguments()")
    candidate = qt_args[1]
    test("test1", candidate)
def test2():
    """Probe the first command-line argument as reported by ``sys.argv``."""
    print("test2...using sys.argv")
    candidate = sys.argv[1]
    test("test2", candidate)
def test3():
    """Probe a file picked interactively through ``QFileDialog``."""
    print("test3...QtGui.QFileDialog.getOpenFileName()")
    chosen = QtGui.QFileDialog.getOpenFileName()
    test("test3", chosen)
def test4():
    """List this script's directory with ``QDir`` and probe the 123c entry.

    Every entry name is printed; the entry whose name equals
    ``123c<U+FFFD>.wav`` is additionally passed to ``test`` twice, once by
    bare file name and once by absolute path.
    """
    print("test4...QtCore.QDir().entryInfoList()")
    here = os.path.dirname(os.path.abspath(__file__))
    filters = (
        QtCore.QDir.AllEntries
        | QtCore.QDir.NoDotAndDotDot
        | QtCore.QDir.System
    )
    for entry in QtCore.QDir(here).entryInfoList(filters):
        print("test4", entry.fileName())
        # Comparing against u"123c\udcb4.wav" instead does not match here,
        # even though that is the name shown in the test2 error messages.
        is_target = u"123c\ufffd.wav" == entry.fileName()
        if not is_target:
            continue
        test("test4a", entry.fileName())
        test("test4b", entry.absoluteFilePath())
def test5():
    """List this script's directory with ``os.listdir`` and probe 123c.

    Every entry is printed by name and by the repr() of its full path; the
    entry whose name equals ``123c`` + U+DCB4 + ``.wav`` is additionally
    passed to ``test`` by bare name and by full path.
    """
    print("test5...os.listdir()")
    here = os.path.dirname(os.path.abspath(__file__))
    for entry in os.listdir(here):
        full_path = os.path.join(here, entry)
        try:
            print("test5", entry)
        except UnicodeEncodeError as err:
            print(err)
        print("test5", repr(full_path))
        # Comparing against u"123c\ufffd.wav" does not match here, even
        # though that comparison worked in test4.
        is_target = u"123c\udcb4.wav" == entry
        if not is_target:
            continue
        test("test5a", entry)
        test("test5b", full_path)
    print()
def test6():
    """Play a file chosen via ``QFileDialog`` with Phonon and log its states."""
    print("test6...Phonon and QtGui.QFileDialog.getOpenFileName()")
    from PyQt4.phonon import Phonon

    class Window(QtGui.QDialog):
        """Dialog that owns the Phonon media object and starts playback."""

        def __init__(self):
            QtGui.QDialog.__init__(self, None)
            self.mediaObject = Phonon.MediaObject(self)
            self.audioOutput = Phonon.AudioOutput(Phonon.MusicCategory, self)
            Phonon.createPath(self.mediaObject, self.audioOutput)
            self.mediaObject.stateChanged.connect(self.handleStateChanged)

            # Author's notes on where the name can come from:
            #   QFileDialog.getOpenFileName()  - works with python3,
            #                                    not for 123c
            #   QApplication.arguments()[1]    - works with python2 but not
            #                                    python3, not for 123c
            #   sys.argv[1]                    - works with python3 but not
            #                                    python2, not for 123c
            # Joining the name onto this script's directory was also tried.
            chosen = QtGui.QFileDialog.getOpenFileName()
            self.mediaObject.setCurrentSource(Phonon.MediaSource(chosen))
            self.mediaObject.play()

        def handleStateChanged(self, newstate, oldstate):
            """Print the current source's file name for selected states."""
            if newstate == Phonon.PlayingState:
                label = 'test6 playing: :'
            elif newstate == Phonon.StoppedState:
                label = 'test6 stopped: :'
            elif newstate == Phonon.ErrorState:
                label = 'test6 ERROR: could not play:'
            else:
                # Any other state is not reported.
                return
            print(label, self.mediaObject.currentSource().fileName())

    dialog = Window()
    dialog.resize(200, 100)
    # The dialog is run with exec_() rather than show().
    dialog.exec_()
def timerTick():
    """Timer callback: ask the QApplication to exit."""
    QtGui.QApplication.exit()
if __name__ == '__main__':
    # Run test1..test6 in order inside a QApplication, then leave the event
    # loop through a timer whose timeout is connected to timerTick.
    app = QtGui.QApplication(sys.argv)
    app.setApplicationName('unicode_test')

    for scenario in (test1, test2, test3, test4, test5, test6):
        scenario()

    timer = QtCore.QTimer()
    timer.timeout.connect(timerTick)
    timer.start(1)
    sys.exit(app.exec_())
使用123a测试结果...
python3 unicode.py 123a�.wav
test1...using QtGui.QApplication.arguments()
test1 QFile.exists unknown False
test1 QFileInfo.exists unknown False
test1 os.path.exists unknown False
test1 os.path.isfile unknown False
test2...using sys.argv
test2 QFile.exists 123a�.wav True
test2 QFileInfo.exists 123a�.wav True
test2 os.path.exists 123a�.wav True
test2 os.path.isfile 123a�.wav True
test3...QtGui.QFileDialog.getOpenFileName()
test3 QFile.exists /home/mememe/Desktop/test/unicode/123a�.wav True
test3 QFileInfo.exists 123a�.wav True
test3 os.path.exists /home/mememe/Desktop/test/unicode/123a�.wav True
test3 os.path.isfile /home/mememe/Desktop/test/unicode/123a�.wav True
test4...QtCore.QDir().entryInfoList()
test4 123a�.wav
test4 123bÆ.wav
test4 123c�.wav
test4a QFile.exists 123c�.wav False
test4a QFileInfo.exists 123c�.wav False
test4a os.path.exists 123c�.wav False
test4a os.path.isfile 123c�.wav False
test4b QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b QFileInfo.exists 123c�.wav False
test4b os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4 unicode.py
test5...os.listdir()
test5 unicode.py
test5 '/home/mememe/Desktop/test/unicode/unicode.py'
test5 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
test5 '/home/mememe/Desktop/test/unicode/123c\udcb4.wav'
test5a QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5a os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5 123bÆ.wav
test5 '/home/mememe/Desktop/test/unicode/123bÆ.wav'
test5 123a�.wav
test5 '/home/mememe/Desktop/test/unicode/123a�.wav'
test6...Phonon and QtGui.QFileDialog.getOpenFileName()
test6 stopped: : /home/mememe/Desktop/test/unicode/123a�.wav
test6 playing: : /home/mememe/Desktop/test/unicode/123a�.wav
test6 stopped: : /home/mememe/Desktop/test/unicode/123a�.wav
用123b测试结果......
python3 unicode.py 123bÆ.wav
test1...using QtGui.QApplication.arguments()
test1 QFile.exists 123b.wav False
test1 QFileInfo.exists 123b.wav False
test1 os.path.exists 123b.wav False
test1 os.path.isfile 123b.wav False
test2...using sys.argv
test2 QFile.exists 123bÆ.wav True
test2 QFileInfo.exists 123bÆ.wav True
test2 os.path.exists 123bÆ.wav True
test2 os.path.isfile 123bÆ.wav True
test3...QtGui.QFileDialog.getOpenFileName()
test3 QFile.exists /home/mememe/Desktop/test/unicode/123bÆ.wav True
test3 QFileInfo.exists 123bÆ.wav True
test3 os.path.exists /home/mememe/Desktop/test/unicode/123bÆ.wav True
test3 os.path.isfile /home/mememe/Desktop/test/unicode/123bÆ.wav True
test4...QtCore.QDir().entryInfoList()
test4 123a�.wav
test4 123bÆ.wav
test4 123c�.wav
test4a QFile.exists 123c�.wav False
test4a QFileInfo.exists 123c�.wav False
test4a os.path.exists 123c�.wav False
test4a os.path.isfile 123c�.wav False
test4b QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b QFileInfo.exists 123c�.wav False
test4b os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4 unicode.py
test5...os.listdir()
test5 unicode.py
test5 '/home/mememe/Desktop/test/unicode/unicode.py'
test5 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
test5 '/home/mememe/Desktop/test/unicode/123c\udcb4.wav'
test5a QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5a os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5 123bÆ.wav
test5 '/home/mememe/Desktop/test/unicode/123bÆ.wav'
test5 123a�.wav
test5 '/home/mememe/Desktop/test/unicode/123a�.wav'
test6...Phonon and QtGui.QFileDialog.getOpenFileName()
test6 stopped: : /home/mememe/Desktop/test/unicode/123bÆ.wav
test6 playing: : /home/mememe/Desktop/test/unicode/123bÆ.wav
test6 stopped: : /home/mememe/Desktop/test/unicode/123bÆ.wav
用123c测试结果......
python3 unicode.py 123c�.wav
test1...using QtGui.QApplication.arguments()
test1 QFile.exists unknown False
test1 QFileInfo.exists unknown False
test1 os.path.exists unknown False
test1 os.path.isfile unknown False
test2...using sys.argv
test2 QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test2 QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test2 os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test2 os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test3...QtGui.QFileDialog.getOpenFileName()
test3 QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test3 QFileInfo.exists 123c�.wav False
test3 os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test3 os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4...QtCore.QDir().entryInfoList()
test4 123a�.wav
test4 123bÆ.wav
test4 123c�.wav
test4a QFile.exists 123c�.wav False
test4a QFileInfo.exists 123c�.wav False
test4a os.path.exists 123c�.wav False
test4a os.path.isfile 123c�.wav False
test4b QFile.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b QFileInfo.exists 123c�.wav False
test4b os.path.exists /home/mememe/Desktop/test/unicode/123c�.wav False
test4b os.path.isfile /home/mememe/Desktop/test/unicode/123c�.wav False
test4 unicode.py
test5...os.listdir()
test5 unicode.py
test5 '/home/mememe/Desktop/test/unicode/unicode.py'
test5 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed
test5 '/home/mememe/Desktop/test/unicode/123c\udcb4.wav'
test5a QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' False
test5a os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5a os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '123c\udcb4.wav' True
test5b QFile.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b QFileInfo.exists 'utf-8' codec can't encode character '\udcb4' in position 4: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' False
test5b os.path.exists 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5b os.path.isfile 'utf-8' codec can't encode character '\udcb4' in position 38: surrogates not allowed '/home/mememe/Desktop/test/unicode/123c\udcb4.wav' True
test5 123bÆ.wav
test5 '/home/mememe/Desktop/test/unicode/123bÆ.wav'
test5 123a�.wav
test5 '/home/mememe/Desktop/test/unicode/123a�.wav'
test6...Phonon and QtGui.QFileDialog.getOpenFileName()
test6 stopped: : /home/mememe/Desktop/test/unicode/123c�.wav
有关测试结果的有趣事项......
我知道这里的信息很多......我只是想尽可能详尽。
那么,最后的问题是:在Python3中处理unicode文件名的正确方法是什么?
答案 0 :(得分:2)
你是对的,123c是错的。证据显示磁盘上的文件名包含无效的Unicode码位(invalid Unicode codepoint)U+DCB4。当Python试图打印那个字符时,它正确地报错说无法打印。当Qt在test4中处理该字符时,它同样无法处理,但它没有抛出错误,而是将其转换为Unicode替换字符(REPLACEMENT CHARACTER)U+FFFD。显然,新文件名不再与磁盘上的文件名匹配。
如果您自己进行转换并指定正确的错误处理,Python也可以在字符串中使用替换字符而不是抛出错误。我手边没有Python 3来测试它,但我认为它会起作用:
# The strict UTF-8 codec raises UnicodeEncodeError on a lone surrogate such
# as U+DCB4, so the 'replace' handler on decode() would never be reached.
# Encode with 'surrogateescape' to recover the original bytes first, then let
# decode() substitute U+FFFD for the bytes that are not valid UTF-8.
filename = filename.encode('utf-8', 'surrogateescape').decode('utf-8', 'replace')
答案 1 :(得分:1)
像“\udcb4”这样的码位来自代理转义(surrogate escape)。这是Python保留无法解释为有效UTF-8的字节的一种方式。使用errors="surrogateescape"编码回UTF-8时,代理码位会还原为去掉0xDC前缀的原始字节,因此“\udcb4”变为0xB4。代理转义使得处理文件名中的任何字节序列成为可能。但是你需要注意使用Unicode HOWTO(https://docs.python.org/3/howto/unicode.html)中记录的errors="surrogateescape"。
答案 2 :(得分:0)
Python2与Python3
python
Python 2.7.4 (default, Sep 26 2013, 03:20:56)
>>> import os
>>> os.listdir('.')
['unicode.py', '123c\xb4.wav', '123b\xc3\x86.wav', '123a\xef\xbf\xbd.wav']
>>> os.path.exists(u'123c\xb4.wav')
False
>>> os.path.exists('123c\xb4.wav')
True
>>> n ='123c\xb4.wav'
>>> print(n)
123c�.wav
>>> n =u'123c\xb4.wav'
>>> print(n)
123c´.wav
上面最后一行中的重音符号(´)正是我一直在寻找的字符!……相比之下,
用Python3列出同一目录时,显示的是一组不同的文件名:
python3
Python 3.3.1 (default, Sep 25 2013, 19:30:50)
>>> import os
>>> os.listdir('.')
['unicode.py', '123c\udcb4.wav', '123bÆ.wav', '123a�.wav']
>>> os.path.exists('123c\udcb4.wav')
True
这是Python3中的错误吗?