Text classification with the Brown corpus in NLTK

Asked: 2016-05-25 11:07:50

Tags: python python-3.x nlp nltk

I am trying to classify documents by their Brown corpus category, but the accuracy score I get is very low. I have tried different features, such as the frequencies of stopwords. Can you check whether I am doing this right, or whether there is a problem with my code? Any suggestion is appreciated.

from collections import defaultdict
from nltk.corpus import brown, stopwords
import random
import nltk

dataset = []  # 500 (words, category) samples; Brown has 15 categories

# Pair each file's word list with its category label.
for category in brown.categories():
    for fileid in brown.fileids(category):
        dataset.append((brown.words(fileids=fileid), category))

# Lowercase every token.
dataset = [([w.lower() for w in text], category) for text, category in dataset]

def feature_extractor(text, bag):
    # Count how often each word from the bag (bag of words) occurs in the text.
    frec = defaultdict(int)
    for word in text:
        if word in bag:
            frec[word] += 1

    return frec
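
# Note (a hedged sketch, not part of the original question):
# nltk.NaiveBayesClassifier treats each feature value as a nominal label,
# so raw counts split every word into many sparse value bins. A common
# variant uses boolean presence instead; binary_feature_extractor is a
# hypothetical name.
def binary_feature_extractor(text, bag):
    words = set(text)
    return {word: (word in words) for word in bag}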

# training & test 90%-10% naivebayes nltk

def train_and_test(featureset,n=90):

    random.shuffle(featureset)
    split = int((len(featureset)*n)/100)
    train,test = featureset[:split],featureset[split:]
    classifier = nltk.NaiveBayesClassifier.train(train)
    accuracy= nltk.classify.accuracy(classifier, test)
    return accuracy

# Stopwords as features (renamed to avoid shadowing the stopwords module)
stop_words = stopwords.words("english")  # 153 words

featureset = [(feature_extractor(text, stop_words), category) for text, category in dataset]

print("Accuracy: ", train_and_test(featureset))  # around 0.25

0 Answers:

No answers yet.