我目前正在尝试使用英文字母映射devnagari脚本。但有一段时间我遇到错误列表索引超出范围。我不想错过任何名单。这就是为什么我不想使用错误处理,除非有必要。您能否查看我的脚本并帮助解决为什么会出现此错误? 在我的单词文件中,我找到了哪个单词导致了错误但是然后如果我从该单词上下使用几个句子那么错误就不存在了。即我认为错误发生在特定长度的字符串上。
clean=[]
dafuq=[]
clean_list = []
replacements = {'अ':'A','आ':'AA', 'इ':'I', 'ई':'II', 'उ':'U','ऊ':'UU', 'ए':'E', 'ऐ':'AI',
'ओ':'O','औ':'OU', 'क':'KA', 'ख':'KHA', 'ग':'GA', 'घ':'GHA', 'ङ':'NGA',
'च':'CA','छ':'CHHA', 'ज':'JA', 'झ':'JHA','ञ':'NIA', 'ट':'TA', 'ठ':'THA',
'ड':'DHA','ढ':'DHHA', 'ण':'NAE', 'त':'TA', 'थ':'THA','द':'DA', 'ध':'DHA',
'न':'NA','प':'PA', 'फ':'FA', 'ब':'B', 'भ':'BHA', 'म':'MA','य':'YA', 'र':'RA',
'ल':'L','व':'WA', 'स':'SA', 'ष':'SHHA', 'श':'SHA', 'ह':'HA', '्':'A',
'ऋ':'RI', 'ॠ':'RI','ऌ':'LI','ॐ':'OMS', 'ः':' ', 'ँ':'U',
'ं':'M', 'ृ':'RI', 'ा':'AA', 'ी':'II', 'ि':'I', 'े':'E', 'ै':'AI',
'ो':'O','ौ':'OU','ु' :'U','ू':'UU' }
import unicodedata
from functools import reduce
def reducer(r, v):
if unicodedata.category(v) in ('Mc', 'Mn'):
r[-1] = r[-1] + v
else:
r.append(v)
return r
with open('words_original.txt', mode='r',encoding="utf-8") as f:
with open ('alphabeths.txt', mode='w+', encoding='utf-8') as d:
with open('only_words.txt', mode='w+', encoding="utf-8") as e:
chunk_size = 4096
f_chunk = f.read(chunk_size)
while len(f_chunk)>0:
for word in f_chunk.split():
for char in ['।', ',', '’', '‘', '?','#','1','2','3','4','0','5','6','7','8','9',
'१','२','३','४','५','.''६','७','८','९','०', '5','6','7','8','9','0','\ufeff']:
if char in word:
word = word.replace(char, '')
if word.strip():
clean_list.append(word)
f_chunk = f.read(chunk_size)
for clean_word in clean_list:
test_word= reduce(reducer,clean_word,[])
final_word= (''.join(test_word))
dafuq.append(final_word)
print (final_word)
f_chunk = f.read(chunk_size)
这是我在
上测试它的文件words_original.txt
stacktrace错误
Traceback (most recent call last):
File "C:\Users\KUSHAL\Desktop\EARTHQUAKE_PYTHON\test.py", line 82, in <module>
test_word= reduce(reducer,clean_word,[])
File "C:\Users\KUSHAL\Desktop\EARTHQUAKE_PYTHON\test.py", line 27, in reducer
r[-1] = r[-1] + v
IndexError: list index out of range
答案 0 :(得分:0)
问题在于一些unicode字符。删除它们后就可以了。