Question

我的老师给我们的任务是为除英语之外的任何语言编写拼写检查程序所以我拿荷兰语是因为它差不多接近英文字母..

import re, collections

def words(text): return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


NWORDS = train(words(open('dutch2.txt').read()))

alphabet = 'aäbßcdefghijklmnoöpqrstuüvwxyz'

def edits1(word):
   splits     = [(word[:i], word[i:]) for i in range(len(word) + 1)]
   deletes    = [a + b[1:] for a, b in splits if b]
   transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1]
   replaces   = [a + c + b[1:] for a, b in splits for c in alphabet if b]
   inserts    = [a + c + b     for a, b in splits for c in alphabet]
   return set(deletes + transposes + replaces + inserts)

def known_edits2(word):
    return set(e2 for e1 in edits1(word) for e2 in edits1(e1) if e2 in NWORDS)

def known(words): return set(w for w in words if w in NWORDS)

def correct(word):
    candidates = known([word]) or known(edits1(word)) or known_edits2(word) or [word]
    return max(candidates, key=NWORDS.get)

dutch2.txt有以下内容：当我运行它时，输出是

    *** Python 3.3.3 (v3.3.3:c3896275c0f6, Nov 18 2013, 21:18:40) [MSC v.1600 32 bit (Intel)] on win32. ***
>>> 
>>> correct("de")
'e'
>>>

这真的不对.. 其他字符的字母变化

import re, collections

def words(text): return re.findall('[a-z]+', text.lower())

def train(features):
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


NWORDS = train(words(open('dutch2.txt').read()))

alphabet = 'aÃ¤bÃŸcdefghijklmnoÃ¶pqrstuÃ¼vwxyz'

我该怎么做来解决角色的变化我尝试了很多，但我不能

如何让python不改变我的字典代码中的字符？

0 个答案: