Why am I getting a 100% error rate (RNN for spam)?

Time: 2017-04-18 21:38:36

Tags: tensorflow

I'm learning TensorFlow by modifying some examples I've found. To start, I took an RNN example and ran it against the "Spam" dataset from UCI.

My code and the sample dataset can be found here: https://trinket.io/python/c7d6b95452

When I run the code, I get a 100% error rate. I figured that even if this dataset isn't well suited to this particular model, I should at least get something better than that, so I don't think my choice of sample dataset is the problem. My Python code is below. If anyone can suggest how to modify it so the model works properly, I'd be very grateful! I'd also appreciate any general TensorFlow advice.
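For reference, the error op in the model compares tf.argmax(self.target, 1) against tf.argmax(self.prediction, 1), so it implicitly expects one-hot targets. A minimal sketch of that encoding, assuming the last CSV column parses as a 0/1 label (the helper below is hypothetical, not part of my script):

import numpy as np

# Hypothetical helper: one-hot encode integer labels so that
# tf.argmax(target, 1) recovers the true class index.
def one_hot(labels, num_classes=2):
    encoded = np.zeros((len(labels), num_classes), dtype=np.float32)
    for i, label in enumerate(labels):
        encoded[i, int(label)] = 1.0  # assumes each label parses as an int
    return encoded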

# Adapted from the example at:
# https://danijar.com/introduction-to-recurrent-networks-in-tensorflow/
import functools
import os
import random

import numpy as np
import tensorflow as tf
from tensorflow.python.ops import rnn, rnn_cell


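# Memoizing decorator: the wrapped method runs once on first access and the
# result is cached on the instance, so each graph node is only defined once.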
def lazy_property(function):
    attribute = '_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper


class SequenceClassification:

    def __init__(self, data, target, dropout, num_hidden=200, num_layers=3):
        self.data = data
        self.target = target
        self.dropout = dropout
        self._num_hidden = num_hidden
        self._num_layers = num_layers
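        # Touching the lazy properties here builds the graph ops up front.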
        self.prediction
        self.error
        self.optimize

    @lazy_property
    def prediction(self):
        # Recurrent network.
        network = rnn_cell.GRUCell(self._num_hidden)
        network = rnn_cell.DropoutWrapper(
            network, output_keep_prob=self.dropout)
        network = rnn_cell.MultiRNNCell([network] * self._num_layers)
        output, _ = tf.nn.dynamic_rnn(network, self.data, dtype=tf.float32)
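        # `output` has shape [batch, time, hidden]; transposing to
        # [time, batch, hidden] makes the last time step easy to gather.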
        # Select last output.
        output = tf.transpose(output, [1, 0, 2])
        last = tf.gather(output, int(output.get_shape()[0]) - 1)
        # Softmax layer.
        weight, bias = self._weight_and_bias(
            self._num_hidden, int(self.target.get_shape()[1]))
        prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy_property
    def cost(self):
        cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
        return cross_entropy
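    # Note: tf.log of a softmax output can underflow to -inf once predictions
    # saturate. A numerically stable alternative (assuming `logits` were the
    # pre-softmax activations) would be something like:
    #   tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(
    #       labels=self.target, logits=logits))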

    @lazy_property
    def optimize(self):
        learning_rate = 0.003
        optimizer = tf.train.RMSPropOptimizer(learning_rate)
        return optimizer.minimize(self.cost)

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    @staticmethod
    def _weight_and_bias(in_size, out_size):
        weight = tf.truncated_normal([in_size, out_size], stddev=0.01)
        bias = tf.constant(0.1, shape=[out_size])
        return tf.Variable(weight), tf.Variable(bias)


def main():
    sample_size = 10
    num_classes = 2  # spam or ham
    ##
    # import spam data
    ##
    spam_data = []
    spam_data_train = []
    spam_data_test = []
    data_dir = "."
    data_file = "spam.csv"
    with open(os.path.join(data_dir, data_file), "r") as file_handle:
        for row in file_handle:
            spam_data.append(row)
        spam_data = [line.rstrip().split(",") for line in spam_data if len(line) >= 1]
        random.shuffle(spam_data)
        spam_data_train = spam_data[0:int(len(spam_data) * .8)]
        spam_data_test = spam_data[int(len(spam_data) * .8):len(spam_data)]

    def next_train_batch(batch_size):
        a = random.sample(spam_data_train, batch_size)
        return [np.array([line[:-1] for line in a]),
                np.array([line[-1] for line in a])]

    def train_batch():
        return [np.array([line[:-1] for line in spam_data_train]),
                np.array([line[-1] for line in spam_data_train])]

    def next_test_batch(batch_size):
        a = random.sample(spam_data_test, batch_size)
        return [np.array([line[:-1] for line in a]),
                np.array([line[-1] for line in a])]

    def test_batch():
        return [np.array([line[:-1] for line in spam_data_test]),
                np.array([line[-1] for line in spam_data_test])]
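
    # Note: the rows above are still strings as read from the CSV; numpy keeps
    # them with a string dtype, and they are only coerced to float32 when fed
    # into the placeholders (this assumes every column, including the label,
    # contains numeric text).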

    t = train_batch()
    train_input = t[0]
    train_target = t[1]

    test = test_batch()
    test_input = test[0]
    test_target = test[1]

    training_data = tf.placeholder(tf.float32, [None, sample_size, len(train_input[0])], "training_data")
    training_target = tf.placeholder(tf.float32, [None, sample_size], "training_target")
    testing_data = tf.placeholder(tf.float32, [None, len(test_input), len(test_input[0])], "testing_data")
    testing_target = tf.placeholder(tf.float32, [None, len(test_target)], "testing_target")
    dropout = tf.placeholder(tf.float32)
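    # Note: each example is fed as one sequence of `sample_size` CSV rows
    # (one feature vector per time step), which is why the feed dicts below
    # wrap each batch in an extra list.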
    training_model = SequenceClassification(training_data, training_target, dropout)
    tf.get_variable_scope().reuse_variables()
    testing_model = SequenceClassification(testing_data, testing_target, dropout)
    sess = tf.Session()
    init = tf.initialize_all_variables()
    sess.run(init)
    for epoch in range(sample_size):
        for _ in range(100):
            sample = random.sample(range(len(train_input)), sample_size)
            batch_train = [train_input[i] for i in sample]
            batch_target = [train_target[i] for i in sample]
            sess.run(training_model.optimize, {
                training_data: [batch_train], training_target: [batch_target], dropout: 0.5})
        error = sess.run(testing_model.error, {
            testing_data: [test_input], testing_target: [test_target], dropout: 1.0})
        print('Epoch {:2d} error {:3.1f}%'.format(epoch + 1, 100 * error))


if __name__ == '__main__':
    main()

0 Answers:

No answers