
时间:2016-01-23 06:17:22

标签: python matplotlib machine-learning scikit-learn

我正在使用 scikit-learn 中的 LinearSVC 训练垃圾邮件分类器，效果很好。我的实现如下：

import os

import numpy
from pandas import DataFrame, concat
from sklearn import svm
from sklearn.cross_validation import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.pipeline import Pipeline

# Line separator. It is used both to detect the blank line that ends
# the e-mail headers and to join the body lines back together.
NEWLINE = '\n'

# Class labels, named for convenience
HAM = 'ham'
SPAM = 'spam'

# Sources of all the e-mail files, as (directory, class label) pairs.
# Entries that are commented out are not loaded.
SOURCES = [
    ('C:/data/spam', SPAM),
    ('C:/data/easy_ham', HAM),
    ('C:/data/hard_ham', HAM),
    # ('C:/data/beck-s', HAM),
    # ('C:/data/farmer-d', HAM),
    # ('C:/data/kaminski-v', HAM),
    # ('C:/data/kitchen-l', HAM),
    # ('C:/data/lokay-m', HAM),
    # ('C:/data/williams-w3', HAM),
    ('C:/data/BG', SPAM),
    # ('C:/data/GP', SPAM),
    # ('C:/data/SH', SPAM)
]

# These 'cmds' files are not e-mail files, so we
# better skip them
SKIP_FILES = {'cmds'}

# path variable contains the path to files
def read_files(path):
    """Yield ``(file_path, content)`` for every e-mail file under *path*.

    The directory tree rooted at *path* is walked recursively. Files whose
    name is listed in ``SKIP_FILES`` are ignored. For each remaining file,
    everything up to and including the first blank line (the e-mail
    headers) is dropped; the lines after it are joined with ``NEWLINE``
    to form ``content``. Each line keeps its own line terminator, and a
    file without a blank line yields an empty ``content``.
    """
    # os.walk already descends into every sub-directory, so no explicit
    # recursion is needed here.
    for root, _dir_names, file_names in os.walk(path):
        for file_name in file_names:
            if file_name in SKIP_FILES:
                continue
            # join with the directory to get the full path of the file
            file_path = os.path.join(root, file_name)
            if not os.path.isfile(file_path):
                continue
            past_header, lines = False, []
            # open the file in latin-1, since some of the mail is not unicode;
            # the with-block guarantees the handle is closed again
            with open(file_path, encoding="latin-1") as f:
                for line in f:
                    if past_header:
                        # we are in the body: keep the line
                        lines.append(line)
                    elif line == NEWLINE:
                        # first blank line: the headers end here
                        past_header = True
            yield file_path, NEWLINE.join(lines)

# uses the pandas library to build a DataFrame
def build_data_frame(path, classification):
    """Build a DataFrame holding every e-mail found under *path*.

    Each row has a ``'text'`` column with the e-mail content produced by
    ``read_files`` and a ``'class'`` column set to *classification*.
    The row index is the path of the file the e-mail was read from.
    """
    rows = []
    index = []
    # read_files yields (file path, e-mail text) pairs
    for file_name, text in read_files(path):
        # store the text together with the class passed in by the caller
        rows.append({'text': text, 'class': classification})
        # one index label per row, so rows and index stay the same length
        index.append(file_name)

    # naming the columns keeps them present even when no file was found
    data_frame = DataFrame(rows, index=index, columns=['text', 'class'])
    return data_frame

# Start from an empty DataFrame so the result has the expected
# columns even when SOURCES is empty
frames = [DataFrame({'text': [], 'class': []})]

# for all the path and classification provided
# in sources, build a dataframe out of it
for path, classification in SOURCES:
    frames.append(build_data_frame(path, classification))

# Concatenate once at the end: appending to a DataFrame inside the loop
# copies all accumulated rows on every iteration, and DataFrame.append
# no longer exists in pandas 2.0 and later.
data = concat(frames)

#  The last thing we do is use DataFrame's reindex to shuffle
#  the whole dataset. Otherwise, we'd have contiguous blocks
#  of examples from each source. This is important for validating
#  prediction accuracy later.
data = data.reindex(numpy.random.permutation(data.index))

pipeline = Pipeline([
    # CountVectorizer vectorizes the input and converts
    # the collection of text documents to a matrix of
    # token counts.
    # We pass it an n-gram range of minimum 1 and maximum 2
    ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
    # Use Linear Support Vector Machine
    ('classifier', svm.LinearSVC()),
])

# 6-fold cross-validation over the shuffled data set
k_fold = KFold(n=len(data), n_folds=6)
scores = []
# running 2x2 confusion matrix, summed over all folds
confusion = numpy.array([[0, 0], [0, 0]])
for train_indices, test_indices in k_fold:
    train_text = data.iloc[train_indices]['text'].values
    train_y = data.iloc[train_indices]['class'].values.astype(str)

    test_text = data.iloc[test_indices]['text'].values
    test_y = data.iloc[test_indices]['class'].values.astype(str)

    pipeline.fit(train_text, train_y)
    predictions = pipeline.predict(test_text)

    # Fix the label order so the result is always 2x2; without it a fold
    # containing a single class gives a smaller matrix that would be
    # silently broadcast into the accumulator.
    confusion += confusion_matrix(test_y, predictions, labels=[HAM, SPAM])
    score = f1_score(test_y, predictions, pos_label=SPAM)
    # keep the per-fold score so the average below can be computed
    scores.append(score)

print('Total emails classified:', len(data))
print('Support Vector Machine Output : ')
print('Score:' + str((sum(scores) / len(scores)) * 100) + '%')
print('Confusion matrix:')
print(confusion)


接下来，我想按照下图所示的方式绘制 SVM 的结果，其中每个点代表一封电子邮件：（图：SVM Result）

但问题是，我的输入矩阵是高维的，因此无法直接使用 matplotlib.pyplot 绘制 SVM 的结果。如果有人能提供解决方法或相关资料，我将不胜感激。谢谢。

0 个答案:
