我对张量流和机器学习都很陌生,我正在使用安然数据集对十大发件人进行分类。我在kaggle中发现了一些使用scikit-learn的好例子,但是当我尝试使用张量流时,准确度就差得很大。
以下是我正在做的事情
我无法弄清楚我做错了什么。任何有关这方面的见解都会非常有用。
以下是整个代码,我使用https://github.com/jupyter/docker-stacks/tree/master/tensorflow-notebook运行它,加载enron的代码和清理enron数据集来自kaggle示例(https://www.kaggle.com/yaroshevskiy/enron-top-10-senders-classification)和培训数据集可以在https://www.kaggle.com/yaroshevskiy/enron-top-10-senders-classification/data找到。
import tensorflow as tf
import matplotlib.pyplot as plt
import csv
import numpy as np
import os
import string
import requests
import io
import nltk
from zipfile import ZipFile
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from tensorflow.python.framework import ops
import pandas as pd
import re
import email
ops.reset_default_graph()
num_labels = 10
batch_size = 200
max_features = 1000
learning_rate = 0.05
num_steps = 10000
def email_from_string(raw_email):
msg = email.message_from_string(raw_email)
content = []
for part in msg.walk():
if part.get_content_type() == 'text/plain':
content.append(part.get_payload())
result = {}
for key in msg.keys():
result[key] = msg[key]
result["content"] = ''.join(content)
return result
def content_to_wordlist( content, remove_stopwords=False ):
content = re.sub("[^a-zA-Z]"," ", content)
words = content.lower().split()
return ' '.join(words)
def setup_enron():
# Read the emails from csv
enron_data = pd.read_csv("emails.csv", header=0, quoting=2)
enron_sent = enron_data[enron_data["file"].str.contains('sent').tolist()]
enron_sent = enron_sent.assign(sender=enron_sent["file"].map(lambda x: re.search("(.*)/.*sent", x).group(1)).values)
enron_sent.drop("file", axis=1, inplace=True)
# Get the top senders based on the number of labels
top_senders = enron_sent["sender"].value_counts().head(num_labels).index.values
mapping = dict(zip(top_senders, range(num_labels)))
enron_sent = enron_sent[enron_sent.sender.isin(top_senders)]
enron_parsed = pd.DataFrame(list(map(email_from_string, enron_sent.message)))
data = pd.DataFrame(list(map(content_to_wordlist,
enron_parsed[['Subject', 'content']].apply(lambda x: ' '.join(x), axis=1))),
columns = ["content"])
data = data.assign(sender=enron_sent["sender"].values)
data = data.replace({'sender': mapping})
# Perform tfidf vectorization
tfidf = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english', max_features=max_features)
sparse_tfidf_texts = tfidf.fit_transform(data["content"].values.tolist())
text_enron = data["content"].values.tolist()
target_enron = data["sender"].values.tolist()
# one hot vector
enron_y = np.zeros((len(target_enron), 10), dtype=np.float32)
for i in range(len(target_enron)):
enron_y[i, target_enron[i]] = 1.0
# split the data sets into train and test
train_indices = np.random.choice(sparse_tfidf_texts.shape[0], round(0.8*sparse_tfidf_texts.shape[0]), replace=False)
test_indices = np.array(list(set(range(sparse_tfidf_texts.shape[0])) - set(train_indices)))
texts_train = sparse_tfidf_texts[train_indices]
texts_test = sparse_tfidf_texts[test_indices]
train_y = np.array([x for ix, x in enumerate(enron_y) if ix in train_indices])
test_y = np.array([x for ix, x in enumerate(enron_y) if ix in test_indices])
return texts_train, texts_test, train_y, test_y
def accuracy(predictions, labels):
correctly_predicted = np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
return (100.0 * correctly_predicted) / predictions.shape[0]
def start_tensorflow(train_dataset, test_dataset, train_labels, test_labels):
graph = tf.Graph()
with graph.as_default():
X_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, max_features))
Y_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
X_test_dataset = tf.constant(test_dataset.todense(), dtype=tf.float32)
Y_test_dataset = tf.constant(test_labels, dtype=tf.float32)
# Variables
weights = tf.Variable(tf.truncated_normal([max_features, num_labels]))
biases = tf.Variable(tf.zeros([num_labels]), dtype=tf.float32)
# Training computation
logits = tf.matmul(X_train_dataset, weights) + biases
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
labels=Y_train_dataset, logits=logits))
# Optimizer
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# Predictions
train_prediction = tf.nn.softmax(logits)
test_prediction = tf.nn.softmax(tf.matmul(X_test_dataset, weights) + biases)
with tf.Session(graph=graph) as sess:
tf.global_variables_initializer().run()
print("Initialized")
for step in range(num_steps):
offset = np.random.randint(0, train_labels.shape[0] - batch_size - 1)
# Generate a batch
batch_data = train_dataset[offset:(offset + batch_size), :].todense()
batch_labels = train_labels[offset:(offset + batch_size), :]
feed_dict = {X_train_dataset: batch_data, Y_train_dataset: batch_labels}
_, l, predictions = sess.run([optimizer, loss, train_prediction],
feed_dict=feed_dict)
if (step % 500 == 0):
print("Batch loss step: {0}: {1}".format(step, l))
print("Batch accuracy: {:.1f}%".format(accuracy(predictions, batch_labels)))
print("Test Accuracy: {:.1f}%".format(accuracy(test_prediction.eval(), test_labels)))
if __name__ == "__main__":
x_train, x_test, y_train, y_test = setup_enron()
start_tensorflow(x_train, x_test, y_train, y_test)
以下是输出
Generation # 500. Train Loss (Test Loss): 2.27 (2.28). Train Acc(Test Acc): 0.18 (0.19)
Generation # 1000. Train Loss (Test Loss): 2.27 (2.26). Train Acc(Test Acc): 0.18 (0.19)
Generation # 1500. Train Loss (Test Loss): 2.26 (2.24). Train Acc (Test Acc): 0.18 (0.19)
Generation # 2000. Train Loss (Test Loss): 2.22 (2.23). Train Acc (Test Acc): 0.21 (0.19)
Generation # 2500. Train Loss (Test Loss): 2.22 (2.22). Train Acc (Test Acc): 0.21 (0.19)
Generation # 3000. Train Loss (Test Loss): 2.21 (2.22). Train Acc (Test Acc): 0.20 (0.19)
Generation # 3500. Train Loss (Test Loss): 2.25 (2.21). Train Acc (Test Acc): 0.16 (0.19)
Generation # 4000. Train Loss (Test Loss): 2.19 (2.21). Train Acc (Test Acc): 0.24 (0.19)
Generation # 4500. Train Loss (Test Loss): 2.22 (2.21). Train Acc (Test Acc): 0.18 (0.19)
Generation # 5000. Train Loss (Test Loss): 2.18 (2.20). Train Acc (Test Acc): 0.22 (0.19)
Generation # 5500. Train Loss (Test Loss): 2.16 (2.20). Train Acc (Test Acc): 0.25 (0.19)
Generation # 6000. Train Loss (Test Loss): 2.23 (2.20). Train Acc (Test Acc): 0.18 (0.19)
Generation # 6500. Train Loss (Test Loss): 2.22 (2.20). Train Acc (Test Acc): 0.18 (0.19)
Generation # 7000. Train Loss (Test Loss): 2.21 (2.20). Train Acc (Test Acc): 0.16 (0.19)
Generation # 7500. Train Loss (Test Loss): 2.16 (2.20). Train Acc (Test Acc): 0.20 (0.19)
Generation # 8000. Train Loss (Test Loss): 2.25 (2.20). Train Acc (Test Acc): 0.13 (0.19)
Generation # 8500. Train Loss (Test Loss): 2.18 (2.20). Train Acc (Test Acc): 0.21 (0.19)
Generation # 9000. Train Loss (Test Loss): 2.22 (2.20). Train Acc (Test Acc): 0.16 (0.19)
Generation # 9500. Train Loss (Test Loss): 2.19 (2.20). Train Acc (Test Acc): 0.16 (0.19)
Generation # 10000. Train Loss (Test Loss): 2.27 (2.20). Train Acc (Test Acc): 0.16 (0.19)