I have read that improving feature selection will reduce my classifier's training time and also improve its accuracy, but I am not sure how to reduce the number of features. Should I count them and then keep only, say, the first 3000?
Here is my code:
import csv
import datetime
import pickle

import nltk
import ujson

def save_object(obj, filename):
    with open(filename, 'wb') as output:
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
    print "saved"
    ujson.dumps({"output": "obj"})

with open('neg5000.csv', 'rb') as f:
    reader = csv.reader(f)
    neg_tweets = list(reader)
    print "list ready"

with open('pos5000.csv', 'rb') as f:
    reader = csv.reader(f)
    pos_tweets = list(reader)
    print "list ready"

# Lower-case every word of at least three characters.
tweets = []
for (words, sentiment) in pos_tweets + neg_tweets:
    words_filtered = [e.lower() for e in words.split() if len(e) >= 3]
    tweets.append((words_filtered, sentiment))

def get_words_in_tweets(tweets):
    all_words = []
    for (words, sentiment) in tweets:
        all_words.extend(words)
    return all_words

def get_word_features(wordlist):
    # Keep the first 3000 words of the frequency distribution as features.
    wordlist = nltk.FreqDist(wordlist)
    word_features = list(wordlist.keys())[:3000]
    #word_features = wordlist.keys()
    return word_features

def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

#def extract_features(words):
#    return dict([(word, True) for word in words])

word_features = get_word_features(get_words_in_tweets(tweets))
training_set = nltk.classify.apply_features(extract_features, tweets)
save_object(word_features, 'wordf.save')
print 'features done'
print datetime.datetime.now()
classifier = nltk.NaiveBayesClassifier.train(training_set)
print 'training done'
print datetime.datetime.now()
save_object(classifier, 'classifier.save')

tweet = 'I love this car'
print classifier.classify(extract_features(tweet.split()))
Answer (score: 1)
There are many ways to do feature selection for a supervised classification problem (which is what Naive Bayes solves). I suggest going through the scikit-learn manual and trying everything listed there, because the choice of a particular method depends on the data you have.
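For intuition, here is a minimal standalone sketch of what such a selector does (the toy corpus, labels, and k=3 are made up purely for illustration): it scores every column of the document-term matrix against the labels and keeps only the top-scoring columns.

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Made-up toy corpus; substitute your own tweets and labels.
texts = ["I love this car", "I hate this movie",
         "what a great day", "this is awful"]
labels = [1, -1, 1, -1]

X = CountVectorizer().fit_transform(texts)  # sparse word-count matrix
# Keep the 3 words whose chi-squared score against the labels is highest.
X_reduced = SelectKBest(chi2, k=3).fit_transform(X, labels)
print(X_reduced.shape)  # same number of rows, only 3 feature columns left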
The easiest route is to switch to the scikit-learn implementation of Naive Bayes and use a Pipeline to chain feature selection and classifier training. See this tutorial for a code example.
Here is a version of your code using scikit-learn with SelectPercentile feature selection:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

def read_input(path):
    with open(path) as handle:
        # Split off the trailing ",label" and keep only the tweet text.
        lines = (line.rsplit(",", 1) for line in handle)
        return [text for text, label in lines]

# Assuming each line in ``neg5000.csv`` and ``pos5000.csv`` is a
# UTF-8-encoded tweet followed by a comma and its label.
neg_tweets = read_input("neg5000.csv")
pos_tweets = read_input("pos5000.csv")

X = np.append(neg_tweets, pos_tweets)
y = np.append(np.full(len(neg_tweets), -1, dtype=int),
              np.full(len(pos_tweets), 1, dtype=int))

p = Pipeline([
    ("vectorizer", CountVectorizer()),
    ("selector", SelectPercentile(percentile=20)),
    ("nb", MultinomialNB())
])

p.fit(X, y)
print(p.predict(["I love this car"]))
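To answer the "first 3000" part of the question directly: with this pipeline, the number of surviving features is just a hyperparameter. Below is a hedged sketch of two common variants, assuming the p, X, and y defined above; the chi2 scoring function and the candidate values for k are illustrative choices, not recommendations.

from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV

# Variant 1: keep a fixed number of features instead of a percentage.
# Note k must not exceed the vocabulary size produced by CountVectorizer.
p.set_params(selector=SelectKBest(chi2, k=3000))
p.fit(X, y)

# Variant 2: let cross-validation pick k instead of guessing it.
search = GridSearchCV(p, param_grid={"selector__k": [1000, 3000, 5000]}, cv=5)
search.fit(X, y)
print(search.best_params_)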