我实现的 Good-Turing 平滑得到了下面的困惑度(perplexity)数值,但这些数值看起来并不正确。有人能直观地解释一下原因吗?我使用的是 NLTK 中的 1000 条电影评论。我的实现看起来是正确的(代码附在下面)。
1gram ppl: 1057.398218919647
2gram ppl: 3262.444941553032
3gram ppl: 68.10224173098685
4gram ppl: 4.542117543343882
5gram ppl: 1.7044134004884632
def good_turing_prob(ngram_occurences, freq_of_freq, total_ngram_count,
                     unseen_ngram_types=1):
    """Return the Good-Turing smoothed probability of a single n-gram.

    Args:
        ngram_occurences: How many times the n-gram was observed (0 when it
            was never seen).
        freq_of_freq: Mapping from a count ``c`` to ``N_c``, the number of
            distinct n-grams observed exactly ``c`` times.
        total_ngram_count: ``N``, the denominator applied to every estimate
            (presumably the total number of observed n-gram tokens; confirm
            against the caller).
        unseen_ngram_types: Number of distinct unseen n-grams that share the
            reserved mass ``N_1 / N``. The default of 1 returns the whole
            mass, which is only right when a single unseen event exists.
            Pass the real number of unseen types (for example
            ``V ** n - seen_types``); otherwise every unseen n-gram gets the
            full mass, the distribution sums to far more than 1, and
            perplexity looks unrealistically low for sparse high-order
            n-grams.

    Returns:
        The smoothed probability as a float.

    Raises:
        ValueError: If ``unseen_ngram_types`` is not positive.
    """
    if unseen_ngram_types <= 0:
        raise ValueError("unseen_ngram_types must be a positive number")

    # Unseen n-gram: Good-Turing reserves N_1 / N for *all* unseen events
    # together, so one event only receives its share of that mass.
    if ngram_occurences == 0:
        # No singletons means no mass is reserved; a missing key is N_1 == 0.
        singleton_types = freq_of_freq.get(1, 0)
        return singleton_types / total_ngram_count / unseen_ngram_types

    def nearest_freq(target):
        """Return N_target, or N of the closest observed count if absent."""
        if target in freq_of_freq:
            return freq_of_freq[target]
        closest = min(freq_of_freq, key=lambda count: abs(count - target))
        return freq_of_freq[closest]

    # Seen n-gram: adjusted count c* = (c + 1) * N_{c+1} / N_c.
    n_c_plus_1 = nearest_freq(ngram_occurences + 1)
    n_c = nearest_freq(ngram_occurences)
    good_turing_count = (ngram_occurences + 1) * (n_c_plus_1 / n_c)
    return good_turing_count / total_ngram_count