我正在尝试把 Brown 语料库的类别(categories)用作分类任务的标签,但得到的准确率非常低。我尝试了不同的特征(features),例如停用词的出现频率。能帮我检查一下我的做法是否正确,或者代码里是否有问题吗?任何建议都不胜感激。
from collections import defaultdict
from nltk.corpus import brown,stopwords
import random
import nltk
dataset = []  # 500 (word-list, category) pairs from the Brown corpus
for category in brown.categories():
    for fileid in brown.fileids(category):
        # Lower-case once while loading, instead of building the dataset
        # and then making a second full pass over every document.
        words = [w.lower() for w in brown.words(fileids=fileid)]
        dataset.append((words, category))
def feature_extractor(text, bag):
    """Count how often each word of *bag* appears in *text*.

    Parameters
    ----------
    text : iterable of str
        Tokens of one document (lower-cased by the caller).
    bag : iterable of str
        The "bag of words" vocabulary whose frequencies are recorded.

    Returns
    -------
    collections.defaultdict(int)
        Mapping word -> count, containing only words seen in both
        *text* and *bag*.
    """
    # The call site passes a list; `word in list` is O(len(bag)) per
    # token. Building a set once makes every membership test O(1).
    vocabulary = set(bag)
    frec = defaultdict(int)
    for word in text:
        if word in vocabulary:
            frec[word] += 1
    return frec
# training & test 90%-10% naivebayes nltk
def train_and_test(featureset, n=90):
    """Train an NLTK Naive Bayes classifier and report held-out accuracy.

    Parameters
    ----------
    featureset : list of (dict, label) pairs
        Feature dicts as produced by ``feature_extractor``.
    n : int, optional
        Percentage of the data used for training (default 90).

    Returns
    -------
    float
        Accuracy of the classifier on the remaining (100-n)% split.
    """
    # Shuffle a copy: the original code shuffled the caller's list in
    # place, silently reordering it as a side effect.
    shuffled = featureset[:]
    random.shuffle(shuffled)
    split = len(shuffled) * n // 100
    train, test = shuffled[:split], shuffled[split:]
    classifier = nltk.NaiveBayesClassifier.train(train)
    return nltk.classify.accuracy(classifier, test)
# Stopwords as features
# Use a distinct name: the original rebound `stopwords`, shadowing the
# imported nltk.corpus.stopwords module. A set also gives O(1) lookups
# inside feature_extractor.
stop_words = set(stopwords.words("english"))  # ~150 English stopwords
featureset = [(feature_extractor(text, stop_words), category)
              for text, category in dataset]
print("Accuracy: ", train_and_test(featureset))  # around 0.25