以下代码来自一位好心人——它对英文文本效果很好,可以在大型文档中找到某个文本字符串,并给出匹配程度的置信度,
但我无法弄清楚如何让它同样适用于泰语字符。
#!/usr/bin/python
from difflib import SequenceMatcher as SM
from nltk.util import ngrams
import codecs
# Fuzzy-search a large text (the "haystack") for the window of words that is
# most similar to a search phrase (the "needle"), then print the best
# similarity ratio followed by the matching window.

# Decode both files explicitly so every string below is unicode.  Reading
# them as raw bytes only works for pure-ASCII text: joining byte strings with
# u" " forces an implicit ASCII decode under Python 2, which fails on
# non-ASCII input such as Thai.
# NOTE(review): assumes both files are saved as UTF-8 -- adjust if they use
# another encoding (e.g. TIS-620 / cp874 for Thai).
TEXT_ENCODING = 'utf-8'

with codecs.open('mainEN.txt', 'r', encoding=TEXT_ENCODING) as hay_file:
    hay = hay_file.read()
with codecs.open('searchEN.txt', 'r', encoding=TEXT_ENCODING) as needle_file:
    needle = needle_file.read()

# NOTE(review): windows are measured in whitespace-separated tokens.  Thai is
# normally written without spaces between words, so for Thai input these
# tokens are phrases rather than words; a proper word segmenter may be needed
# for good window granularity.
hay_words = hay.split()
needle_length = len(needle.split())

# Compare against windows 20% longer than the needle (rounded down),
# presumably to leave slack for extra words inside the matching passage.
window_size = needle_length + int(.2 * needle_length)

# SequenceMatcher caches its analysis of the *second* sequence, so the needle
# is set once here and only the first sequence is swapped inside the loop.
matcher = SM(None)
matcher.set_seq2(needle)

max_sim_val = 0
max_sim_string = u""
for ngram in ngrams(hay_words, window_size):
    hay_ngram = u" ".join(ngram)
    matcher.set_seq1(hay_ngram)
    similarity = matcher.ratio()
    # Strict '>' keeps the earliest window when several share the best score.
    if similarity > max_sim_val:
        max_sim_val = similarity
        max_sim_string = hay_ngram

# Single-argument print(...) is valid on both Python 2 and 3 and emits the
# same "<ratio> <text>" line as the old print statement.
print("%s %s" % (max_sim_val, max_sim_string))