
时间:2018-01-23 22:59:03

标签: tensorflow machine-learning multilabel-classification softmax



  • 加载安然数据集
  • 获取前10名发件人的数据
  • 执行10个发件人的tf-idf矢量化
  • 将数据拆分为训练集和测试集
  • 定义模型 (tf.matmul(X_train_dataset, weights) + biases) 和损失函数 (tf.nn.softmax_cross_entropy_with_logits)
  • 训练模型
  • 输出测试精度



import tensorflow as tf
import matplotlib.pyplot as plt
import csv
import numpy as np
import os
import string
import requests
import io
import nltk
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from tensorflow.python.framework import ops
import pandas as pd
import re
import email


# --- Data preparation settings -------------------------------------------
# Number of sender classes kept (the most frequent senders).
num_labels = 10
# Upper bound on the vocabulary size handed to the TF-IDF vectorizer.
max_features = 1000

# --- Training settings -----------------------------------------------------
# Rows fed to the model per optimisation step.
batch_size = 200
# Total number of optimisation steps.
num_steps = 10000
# Step size used by the gradient-descent optimizer.
learning_rate = 0.05

def email_from_string(raw_email):
    """Parse a raw e-mail message string into a flat dictionary.

    Args:
        raw_email: The complete message text (headers, blank line, body).

    Returns:
        A dict holding one entry per header name (the value is whatever
        ``msg[name]`` yields for that name) plus a ``"content"`` entry with
        the payloads of all ``text/plain`` parts concatenated in walk order.
    """
    msg = email.message_from_string(raw_email)

    # Only plain-text parts contribute to the body; every other part type
    # visited by walk() is skipped.
    content = [
        part.get_payload()
        for part in msg.walk()
        if part.get_content_type() == 'text/plain'
    ]

    # Look each header up through msg[key] so the stored value is exactly
    # what the message object reports for that name.
    result = {key: msg[key] for key in msg.keys()}
    result["content"] = ''.join(content)

    return result

def content_to_wordlist(content, remove_stopwords=False):
    """Normalise text to lowercase ASCII words separated by single spaces.

    Every character that is not an ASCII letter is turned into a space, the
    text is lowercased, and runs of whitespace collapse to one space.

    Args:
        content: The text to normalise.
        remove_stopwords: Accepted for interface compatibility; this
            function does not read it.

    Returns:
        The normalised text as a single string.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", content)
    tokens = letters_only.lower().split()
    return ' '.join(tokens)

def setup_enron():
    """Load the Enron e-mails and build TF-IDF train/test sets for the top senders.

    Reads ``emails.csv`` from the working directory, keeps the messages whose
    ``file`` path contains ``sent``, restricts them to the ``num_labels`` most
    frequent senders, vectorises subject + body with TF-IDF (at most
    ``max_features`` terms) and randomly splits the rows 80/20.

    Returns:
        A tuple ``(texts_train, texts_test, train_y, test_y)``: the two
        feature matrices are row subsets of the TF-IDF output, and the two
        label arrays are float32 one-hot rows of width ``num_labels``.
        Row ``i`` of each label array belongs to row ``i`` of the matching
        feature matrix.
    """
    # Read the emails from csv
    enron_data = pd.read_csv("emails.csv", header=0, quoting=2)
    enron_sent = enron_data[enron_data["file"].str.contains('sent').tolist()]
    # The sender is everything in the path before the ".../<x>sent" component.
    enron_sent = enron_sent.assign(
        sender=enron_sent["file"].map(lambda x: re.search("(.*)/.*sent", x).group(1)).values)
    enron_sent = enron_sent.drop("file", axis=1)

    # Get the top senders based on the number of labels
    top_senders = enron_sent["sender"].value_counts().head(num_labels).index.values
    mapping = dict(zip(top_senders, range(num_labels)))
    enron_sent = enron_sent[enron_sent.sender.isin(top_senders)]

    enron_parsed = pd.DataFrame(list(map(email_from_string, enron_sent.message)))

    data = pd.DataFrame(list(map(content_to_wordlist,
                      enron_parsed[['Subject', 'content']].apply(lambda x: ' '.join(x), axis=1))),
                      columns = ["content"])

    # Every remaining row has a sender in `mapping`, so the lookup is total.
    data = data.assign(sender=enron_sent["sender"].map(mapping).values)

    # Perform tfidf vectorization
    tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=max_features)
    sparse_tfidf_texts = tfidf.fit_transform(data["content"].values.tolist())
    target_enron = data["sender"].values.astype(np.int64)

    # one hot vector (width follows num_labels rather than a literal 10)
    n_samples = sparse_tfidf_texts.shape[0]
    enron_y = np.zeros((n_samples, num_labels), dtype=np.float32)
    enron_y[np.arange(n_samples), target_enron] = 1.0

    # split the data sets into train and test
    train_indices = np.random.choice(n_samples, int(round(0.8 * n_samples)), replace=False)
    test_indices = np.setdiff1d(np.arange(n_samples), train_indices)
    texts_train = sparse_tfidf_texts[train_indices]
    texts_test = sparse_tfidf_texts[test_indices]
    # Index the labels with the SAME index arrays as the features. Filtering
    # enron_y by membership would return labels in ascending row order while
    # texts_train follows the shuffled order of train_indices, pairing each
    # feature row with another row's label.
    train_y = enron_y[train_indices]
    test_y = enron_y[test_indices]

    return texts_train, texts_test, train_y, test_y

def accuracy(predictions, labels):
    """Return the percentage of rows whose arg-max class matches the label.

    Args:
        predictions: 2-D array of per-class scores, one row per sample.
        labels: 2-D array with the same row count; the arg-max of each row
            is taken as the true class.

    Returns:
        The share of matching rows, scaled to the range 0-100.
    """
    predicted_classes = np.argmax(predictions, 1)
    true_classes = np.argmax(labels, 1)
    hits = (predicted_classes == true_classes).sum()
    total = predictions.shape[0]
    return (100.0 * hits) / total

def start_tensorflow(train_dataset, test_dataset, train_labels, test_labels):
    """Train a softmax (multinomial logistic) classifier and print progress.

    Builds a single linear layer ``matmul(X, weights) + biases`` trained with
    gradient descent on softmax cross-entropy. Every 500 steps it prints the
    batch loss, the batch accuracy and the test accuracy.

    Args:
        train_dataset: Training features; must support row slicing followed
            by ``.todense()`` and have ``max_features`` columns.
        test_dataset: Test features; must support ``.todense()``.
        train_labels: One-hot training labels with ``num_labels`` columns.
        test_labels: One-hot test labels with ``num_labels`` columns.

    Raises:
        ValueError: If there are fewer training rows than ``batch_size``.
    """
    n_train = train_labels.shape[0]
    if n_train < batch_size:
        raise ValueError(
            "need at least batch_size training rows to draw a batch")

    graph = tf.Graph()

    with graph.as_default():
        X_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, max_features))
        Y_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
        X_test_dataset = tf.constant(test_dataset.todense(), dtype=tf.float32)
        Y_test_dataset = tf.constant(test_labels, dtype=tf.float32)

        # Variables
        weights = tf.Variable(tf.truncated_normal([max_features, num_labels]))
        biases = tf.Variable(tf.zeros([num_labels]), dtype=tf.float32)

        # Training computation
        logits = tf.matmul(X_train_dataset, weights) + biases
        loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
                                labels=Y_train_dataset, logits=logits))

        # Optimizer
        optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

        # Predictions
        train_prediction = tf.nn.softmax(logits)
        test_prediction = tf.nn.softmax(tf.matmul(X_test_dataset, weights) + biases)

        # Variables must be initialised before any op that reads them runs.
        init = tf.global_variables_initializer()

        with tf.Session(graph=graph) as sess:
            sess.run(init)

            for step in range(num_steps):
                # randint's upper bound is exclusive; "+ 1" makes the last
                # full window (ending at the final row) reachable.
                offset = np.random.randint(0, n_train - batch_size + 1)
                # Generate a batch
                batch_data = train_dataset[offset:(offset + batch_size), :].todense()
                batch_labels = train_labels[offset:(offset + batch_size), :]

                feed_dict = {X_train_dataset: batch_data, Y_train_dataset: batch_labels}

                # The placeholders have no default value, so the batch has
                # to be supplied through feed_dict.
                _, batch_loss, predictions = sess.run(
                    [optimizer, loss, train_prediction], feed_dict=feed_dict)

                if step % 500 == 0:
                    print("Batch loss step: {0}: {1}".format(step, batch_loss))
                    print("Batch accuracy: {:.1f}%".format(accuracy(predictions, batch_labels)))
                    print("Test Accuracy: {:.1f}%".format(
                        accuracy(sess.run(test_prediction), test_labels)))

if __name__ == "__main__":
    # Script entry point: build the train/test split, then hand the four
    # returned pieces straight to the training routine in the same order.
    start_tensorflow(*setup_enron())


Generation # 500. Train Loss (Test Loss): 2.27 (2.28). Train Acc (Test Acc): 0.18 (0.19)
Generation # 1000. Train Loss (Test Loss): 2.27 (2.26). Train Acc (Test Acc): 0.18 (0.19)
Generation # 1500. Train Loss (Test Loss): 2.26 (2.24). Train Acc (Test Acc): 0.18 (0.19)
Generation # 2000. Train Loss (Test Loss): 2.22 (2.23). Train Acc (Test Acc): 0.21 (0.19)
Generation # 2500. Train Loss (Test Loss): 2.22 (2.22). Train Acc (Test Acc): 0.21 (0.19)
Generation # 3000. Train Loss (Test Loss): 2.21 (2.22). Train Acc (Test Acc): 0.20 (0.19)
Generation # 3500. Train Loss (Test Loss): 2.25 (2.21). Train Acc (Test Acc): 0.16 (0.19)
Generation # 4000. Train Loss (Test Loss): 2.19 (2.21). Train Acc (Test Acc): 0.24 (0.19)
Generation # 4500. Train Loss (Test Loss): 2.22 (2.21). Train Acc (Test Acc): 0.18 (0.19)
Generation # 5000. Train Loss (Test Loss): 2.18 (2.20). Train Acc (Test Acc): 0.22 (0.19)
Generation # 5500. Train Loss (Test Loss): 2.16 (2.20). Train Acc (Test Acc): 0.25 (0.19)
Generation # 6000. Train Loss (Test Loss): 2.23 (2.20). Train Acc (Test Acc): 0.18 (0.19)
Generation # 6500. Train Loss (Test Loss): 2.22 (2.20). Train Acc (Test Acc): 0.18 (0.19)
Generation # 7000. Train Loss (Test Loss): 2.21 (2.20). Train Acc (Test Acc): 0.16 (0.19)
Generation # 7500. Train Loss (Test Loss): 2.16 (2.20). Train Acc (Test Acc): 0.20 (0.19)
Generation # 8000. Train Loss (Test Loss): 2.25 (2.20). Train Acc (Test Acc): 0.13 (0.19)
Generation # 8500. Train Loss (Test Loss): 2.18 (2.20). Train Acc (Test Acc): 0.21 (0.19)
Generation # 9000. Train Loss (Test Loss): 2.22 (2.20). Train Acc (Test Acc): 0.16 (0.19)
Generation # 9500. Train Loss (Test Loss): 2.19 (2.20). Train Acc (Test Acc): 0.16 (0.19)
Generation # 10000. Train Loss (Test Loss): 2.27 (2.20). Train Acc (Test Acc): 0.16 (0.19)     

0 个答案:
