CNN训练,训练期间的损失保持不变,其他一切似乎都是正确的

时间:2018-07-26 18:31:06

标签: python tensorflow

首先,几点:

这是一个二进制分类问题,我的最后一层是只有一个输出(没有Sigmoid激活)的完全连接的层,我使用tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits())作为损失。

我已经检查了logit,在每一步都打印了logits,它们正在更改,我还列出了所有可训练的参数,以防万一,所有必要的参数都已设置为可训练。只有损失保持不变。

该代码实际上取自其github页面上的stanford cs20si mnist示例(请检查您是否想要全面引入tensorflow,或者甚至跟上新的变化),也就是说,他们的示例非常完美,我即使比较tf图,它们看起来也一样。我的代码和代码之间的唯一区别是数据集api(通过从目录中读取图像文件获取输入),类数和丢失。

即是,这是代码:

import os
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import time

import tensorflow as tf
from data_pipeline2 import ImageDataGenerator
from model.ResNetBest import network as forward_pass
from model.ResNetBest import loss as lossfn
import utils

EPOCHS = 20


class ConvNet(object):
    def __init__(self):
        self.lr = 0.001
        self.batch_size = 2
        self.keep_prob = tf.constant(0.75)
        self.gstep = tf.Variable(0, dtype=tf.int32,
                                 trainable=False, name='global_step')
        self.n_classes = 1
        self.skip_step = 20
        # self.n_test = 10000
        self.training = True

    def get_data(self):
        with tf.name_scope('data'):
            train_data = ImageDataGenerator(directory="C:\\Users\\A123\\Desktop\\TRAIN",
                                    horizontal_flip=True, vertical_flip=True, rescale=True, normalize=True,
                                    color_jitter=True, epochs=1, batch_size=self.batch_size, num_cpus=20).dataset_pipeline()

            test_data = ImageDataGenerator(directory="C:\\Users\\A123\\Desktop\\TRAIN",
                                    horizontal_flip=False, vertical_flip=False, rescale=False, normalize=False,
                                    color_jitter=True, epochs=1, batch_size=self.batch_size, num_cpus=20).dataset_pipeline()

            iterator = tf.data.Iterator.from_structure(train_data.output_types,
                                                       train_data.output_shapes)
            self.img, self.label = iterator.get_next()
            self.img = tf.reshape(self.img, [-1, 389, 389, 3])

            self.train_init = iterator.make_initializer(train_data)  # initializer for train_data
            self.test_init = iterator.make_initializer(test_data)  # initializer for train_data

    def inference(self):
        self.logits = forward_pass.network(self.img)


    def loss(self):
        '''
        define loss function
        use softmax cross entropy with logits as the loss function
        compute mean cross entropy, softmax is applied internally
        '''
        #
        with tf.name_scope('loss'):
            # entropy = tf.nn.softmax_cross_entropy_with_logits(labels=self.label, logits=self.logits)
            # self.loss = tf.reduce_mean(entropy, name='loss')
            self.loss = lossfn.lossfn(self.logits, None, self.label)

    def optimize(self):
        '''
        Define training op
        using Adam Gradient Descent to minimize cost
        '''
        self.opt = tf.train.AdamOptimizer(self.lr).minimize(self.loss,
                                                            global_step=self.gstep)

    def summary(self):
        '''
        Create summaries to write on TensorBoard
        '''
        with tf.name_scope('summaries'):
            tf.summary.scalar('loss', self.loss)
            tf.summary.scalar('accuracy', self.accuracy)
            tf.summary.histogram('histogram loss', self.loss)

        with tf.name_scope('debug'):
            tf.summary.scalar('logits', tf.reduce_mean(self.logits))
        self.summary_op = tf.summary.merge_all()

    def eval(self):
        '''
        Count the number of right predictions in a batch
        '''
        with tf.name_scope('predict'):
            # preds = tf.nn.sigmoid(self.logits)
            preds = self.logits
            # correct_preds = tf.equal(tf.argmax(preds, 1), tf.argmax(self.label, 1))
            correct_preds = tf.equal(tf.cast(tf.round(preds), dtype=tf.int32), self.label)
            self.accuracy = tf.reduce_sum(tf.cast(correct_preds, tf.float32))

    def build(self):
        '''
        Build the computation graph
        '''
        self.get_data()
        self.inference()
        self.loss()
        self.optimize()
        self.eval()
        self.summary()

    def train_one_epoch(self, sess, saver, init, writer, epoch, step):
        start_time = time.time()
        sess.run(init)
        self.training = True
        total_loss = 0
        n_batches = 0
        try:
            while True:
                _, l, summaries, logits = sess.run([self.opt, self.loss, self.summary_op, self.logits])
                writer.add_summary(summaries, global_step=step)
                if (step + 1) % self.skip_step == 0:
                    print('Loss at step {0}: {1} and logits: {2}'.format(step, l, np.sum(logits)))
                    print(logits)
                step += 1
                total_loss += l
                n_batches += 1
        except tf.errors.OutOfRangeError:
            pass
        saver.save(sess, 'checkpoints/ResNet18/ResNet', step)
        print('Average loss at epoch {0}: {1}'.format(epoch, total_loss / n_batches))
        print('Took: {0} seconds'.format(time.time() - start_time))
        ############################## debug code ###########################################
        variables_names = [v.name for v in tf.trainable_variables()]
        values = sess.run(variables_names)
        for k, v in zip(variables_names, values):
            print("Variable: ", k)
            # print("Shape: ", v.shape)
            # print(v)

        return step

    def eval_once(self, sess, init, writer, epoch, step):
        start_time = time.time()
        sess.run(init)
        self.training = False
        total_correct_preds = 0
        n_step = 0
        try:
            while True:
                accuracy_batch, summaries = sess.run([self.accuracy, self.summary_op])
                writer.add_summary(summaries, global_step=step)
                total_correct_preds += accuracy_batch
                n_step += 1
        except tf.errors.OutOfRangeError:
            pass

        print('Accuracy at epoch {0}: {1} '.format(epoch, total_correct_preds / n_step))
        print('Took: {0} seconds'.format(time.time() - start_time))

    def train(self, n_epochs):
        '''
        The train function alternates between training one epoch and evaluating
        '''
        utils.safe_mkdir('checkpoints')
        utils.safe_mkdir('checkpoints/ResNet18')
        writer = tf.summary.FileWriter('./graphs/ResNet18', tf.get_default_graph())

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            ckpt = tf.train.get_checkpoint_state(os.path.dirname('checkpoints/ResNet18/checkpoint'))
            if ckpt and ckpt.model_checkpoint_path:
                saver.restore(sess, ckpt.model_checkpoint_path)

            step = self.gstep.eval()

            for epoch in range(n_epochs):
                step = self.train_one_epoch(sess, saver, self.train_init, writer, epoch, step)
                self.eval_once(sess, self.test_init, writer, epoch, step)
        writer.close()


if __name__ == '__main__':
    model = ConvNet()
    model.build()
    model.train(n_epochs=EPOCHS)

这是我的network.py文件(它不是resnet,我之前曾尝试过resnet,但是发生了同样的事情,所以我创建了一个简单的网络):

import tensorflow as tf

def conv_relu(inputs, filters, k_size, stride, padding, scope_name):
    '''
    A method that does convolution + relu on inputs
    '''
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE) as scope:
        in_channels = inputs.shape[-1]
        kernel = tf.get_variable('kernel',
                                 [k_size, k_size, in_channels, filters],
                                 initializer=tf.truncated_normal_initializer())
        biases = tf.get_variable('biases',
                                 [filters],
                                 initializer=tf.random_normal_initializer())
        conv = tf.nn.conv2d(inputs, kernel, strides=[1, stride, stride, 1], padding=padding)
    return tf.nn.relu(conv + biases, name=scope.name)


def maxpool(inputs, ksize, stride, padding='VALID', scope_name='pool'):
    '''A method that does max pooling on inputs'''
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE) as scope:
        pool = tf.nn.max_pool(inputs,
                              ksize=[1, ksize, ksize, 1],
                              strides=[1, stride, stride, 1],
                              padding=padding)
    return pool


def fully_connected(inputs, out_dim, scope_name='fc'):
    '''
    A fully connected linear layer on inputs
    '''
    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE) as scope:
        in_dim = inputs.shape[-1]
        w = tf.get_variable('weights', [in_dim, out_dim],
                            initializer=tf.truncated_normal_initializer())
        b = tf.get_variable('biases', [out_dim],
                            initializer=tf.constant_initializer(0.0))
        out = tf.matmul(inputs, w) + b
    return out

def network(data, labels_one_hot=None):
    # jsonFile = './model/ResNetBest/resnetV4_2Best.json'
    # with open(jsonFile, "r") as file:
    #     jsonDef = file.read()
    # model = tf.keras.models.model_from_json(jsonDef)
    conv1 = conv_relu(inputs=data,
                      filters=32,
                      k_size=5,
                      stride=1,
                      padding='SAME',
                      scope_name='conv1')
    pool1 = maxpool(conv1, 2, 2, 'VALID', 'pool1')
    conv2 = conv_relu(inputs=pool1,
                      filters=64,
                      k_size=5,
                      stride=1,
                      padding='SAME',
                      scope_name='conv2')
    pool2 = maxpool(conv2, 2, 2, 'VALID', 'pool2')
    feature_dim = pool2.shape[1] * pool2.shape[2] * pool2.shape[3]
    pool2 = tf.reshape(pool2, [-1, feature_dim])
    fc = fully_connected(pool2, 10, 'fc')
    dropout = tf.nn.dropout(tf.nn.relu(fc), 0.75, name='relu_dropout')
    logits = fully_connected(dropout, 1, 'logits')
    # return model(data)
    return logits

这是具有丢失功能的文件:

import tensorflow as tf

def binary_crossentropy(y, y_):
    cross_entropy = tf.reduce_sum(- y * tf.log(y_) - (1 - y) * tf.log(1 - y_), 1)
    loss = tf.reduce_mean(cross_entropy)
    return loss

def lossfn(net_out, data, labels):
    with tf.name_scope('cross_entropy'):
        # return tf.losses.sparse_softmax_cross_entropy(labels=labels, logits=net_out)
        # return tf.keras.losses.binary_crossentropy(tf.cast(labels, tf.float32), tf.squeeze(net_out))
        cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=tf.cast(labels, tf.float32), logits=tf.squeeze(net_out))
        # return binary_crossentropy(tf.squeeze(tf.cast(labels, tf.float32)), net_out)
        # return tf.losses.sigmoid_cross_entropy(tf.squeeze(labels), net_out)
        return tf.reduce_mean(cross_entropy, name='loss')

请帮助,这根本没有道理。

0 个答案:

没有答案