我想查看如何使用nltk HMM-Trainer对HMM进行无监督学习。 据我所知,具有监督和无监督训练的HMM应该比仅受监督的训练有效。 我得到了以下代码
#!/bin/python3
import nltk
from nltk.corpus import brown
from nltk.util import unique_list
def hmmTrainer(sents):
tag_set = unique_list(tag for sent in sents for (word,tag) in sent)
symbols = unique_list(word for sent in sents for (word,tag) in sent)
return nltk.tag.HiddenMarkovModelTrainer(tag_set, symbols)
if __name__ == '__main__':
corpus=brown
print(len(corpus.sents()))
> 57340
train= corpus.tagged_sents()[:10000]
devel=corpus.tagged_sents()[10000:11000]
test=corpus.tagged_sents()[11000:13000]
trainer=tag.hmmTrainer(train+devel)
hmm=trainer.train_supervised(train)
print(hmm.evaluate(test))
> 0.3531
tmp=[nltk.untag(sent) for sent in devel]
hmm=trainer.train(labeled_sequences=train,
unlabeled_sequences=[tmp])
print(hmm.evaluate(test))
> 0.0866
# this needs ~40h for training
train= corpus.tagged_sents()[:30000]
devel=corpus.tagged_sents()[30000:33000]
test=corpus.tagged_sents()[33000:36000]
trainer=tag.hmmTrainer(train+devel)
hmm=trainer.train_supervised(train)
print(hmm.evaluate(test))
> 0.59785
tmp=[nltk.untag(sent) for sent in devel]
hmm=trainer.train(labeled_sequences=train,
unlabeled_sequences=[tmp])
print(hmm.evaluate(test))
> 0.1057
我的错误是什么?