我想为德语文本实现一个朴素贝叶斯分类器。我使用的是 NLTK,代码基本上是从《NLTK Cookbook》中照搬的。我已经整理了一个小语料库:一个文件夹,里面有两个子文件夹,分别名为 neg 和 pos(与电影评论语料库的结构相同)。我使用的是词袋(bag-of-words)模型。
这是我处理语料库的方式:
def bag_of_words(words):
    """Build a bag-of-words feature dict from an iterable of tokens.

    Args:
        words: Iterable of hashable tokens. Duplicates collapse into a
            single key.

    Returns:
        A dict mapping every distinct token to ``True``.
    """
    return {word: True for word in words}
def bag_of_words_not_in_set(words, badwords):
    """Build a bag-of-words feature dict that leaves out unwanted tokens.

    Args:
        words: Iterable of hashable tokens to turn into features.
        badwords: Iterable of tokens to exclude. Matching is by exact
            equality, so it is case-sensitive.

    Returns:
        The result of ``bag_of_words`` applied to the distinct tokens of
        ``words`` that do not occur in ``badwords``.
    """
    # difference() accepts any iterable, so badwords does not have to be
    # turned into a set of its own first.
    return bag_of_words(set(words).difference(badwords))
def bag_of_words_without_stopwords(words):
    """Build a bag-of-words feature dict with German stopwords removed.

    Args:
        words: Iterable of tokens to turn into features.

    Returns:
        The result of ``bag_of_words_not_in_set`` for ``words`` with the
        list returned by ``stopwords.words("german")`` as the exclusion
        list.
    """
    # NOTE(review): the exclusion is an exact, case-sensitive match. If the
    # stopword list is all lowercase, capitalised tokens (e.g. at the start
    # of a sentence) will not be filtered -- confirm whether the tokens
    # should be lowercased first.
    badwords = stopwords.words("german")
    return bag_of_words_not_in_set(words, badwords)
def label_feats_from_corpus(corp, feature_detector=bag_of_words_without_stopwords):
    """Group per-file feature dicts by category label.

    Args:
        corp: Corpus object providing ``categories()``,
            ``fileids(categories=...)`` and ``words(fileids=...)``.
        feature_detector: Callable that receives the words of one file and
            returns that file's features.

    Returns:
        A ``collections.defaultdict(list)`` mapping each category label to
        a list with one feature result per file in that category.
    """
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        # One feature set per file, collected under the file's category.
        for fileid in corp.fileids(categories=[label]):
            feats = feature_detector(corp.words(fileids=[fileid]))
            label_feats[label].append(feats)
    return label_feats
def label_feats_from_corpus(corp, feature_detector=bag_of_words_without_stopwords):
    """Group per-file feature dicts by category label, printing each one.

    Debugging variant: for every file it prints ``<label> >> <fileid>``
    followed by the features extracted from that file.

    Args:
        corp: Corpus object providing ``categories()``,
            ``fileids(categories=...)`` and ``words(fileids=...)``.
        feature_detector: Callable that receives the words of one file and
            returns that file's features.

    Returns:
        A ``collections.defaultdict(list)`` mapping each category label to
        a list with one feature result per file in that category.
    """
    label_feats = collections.defaultdict(list)
    for label in corp.categories():
        for fileid in corp.fileids(categories=[label]):
            # The loop variable is `fileid`; the previous `fieldid` was an
            # undefined name and raised NameError on the first file.
            print(label + " >> " + fileid)
            feats = feature_detector(corp.words(fileids=[fileid]))
            print(feats)
            label_feats[label].append(feats)
    return label_feats
# Corpus reader over every *.txt file below D:/corpus/.
# NOTE(review): in a regex, `/*` means "zero or more slashes", not "a slash
# followed by anything". The capture group still stops at the first "/"
# because \w does not match it, so `neg/fdfdg.txt` yields `neg`. A file
# directly in the corpus root would presumably be categorised by its own
# name; confirm whether `(\w+)/.*` was intended.
reader = CategorizedPlaintextCorpusReader('D:/corpus/', r'.*\.txt', cat_pattern=r'(\w+)/*')
对于
# Debug output emitted for each file inside label_feats_from_corpus.
print(label + " >> " + fileid)
print(feats)
我得到了
neg >> neg/fdfdg.txt
{'autorisierten': True, 'durchführen': True, 'Sicherheit': True, 'Fachwerkstatt': True, 'Arbeiten': True, 'in': True, 'Lassen': True, 'im': True, 'Interesse': True}
neg >> neg/fdffdf.txt
{'Arbeiten': True, 'Fachkenntnisse': True, 'gekennzeichnet': True, 'Verständnis': True, 'technisches': True, 'erfordern': True, 'Symbol': True}
neg >> neg/fgfdgdgdg.txt
{'Arbeiten': True, 'Spezialwerkzeuge': True, 'notwendig': True}
neg >> neg/fgff.txt
{'aussetzen': True, 'niemals': True, 'Flüssigkeiten': True, 'Regen': True, 'Salzwasser': True, 'Feuchtigkeit': True, 'Nässe': True, 'Batterien': True, ').': True}
这说明特征提取是有效的,对吗?(暂且不论这些是否已经是最好的特征)
目前我只关心这一点:语料库是否被正确读取,以及是否得到了正确的词特征。
感谢您提供任何帮助和反馈!