Question

我正在尝试用 Keras 复现我以前用 TensorFlow 编写的旧代码。由于某种原因，我的损失（loss）始终是 nan。我认为问题出在我所使用的损失函数上（Keras 中的 'categorical_crossentropy' 与 TensorFlow 中的 'tf.nn.softmax_cross_entropy_with_logits'）。

Keras代码：

import keras

from keras.models import Sequential
from keras.layers import Dropout, Dense, Activation
from keras.regularizers import l2
from keras.layers.normalization import BatchNormalization


# Keras items
from keras.optimizers import Adam, Nadam
from keras.activations import relu, elu
from keras.losses import binary_crossentropy, categorical_crossentropy
from keras import metrics

import pandas as pd
import numpy as np

# Load the feature table and the label table from two separate CSV files.
x_main = pd.read_csv("glioma DB X.csv")

y_main = pd.read_csv("glioma DB Y.csv")

from sklearn.model_selection import train_test_split

# Hold out 30% of the rows, then cut that hold-out in half: the result is a
# 70% train / 15% test / 15% validation split. No random_state is passed, so
# the split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(x_main, y_main, test_size=0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5)

# train shape
np.shape(x_train), np.shape(y_train)

# NOTE(review): the tuple below looks like the pasted notebook output of the
# expression above (132 training rows, 47 features, 1 label column) -- confirm.
((132, 47), (132, 1))

# Normalize training data; will want to have the same mu and sigma for test

def normalize_features(dataset):
    """Standardize every column of ``dataset``.

    Each column has its mean subtracted and is divided by its standard
    deviation.

    Returns:
        A tuple ``(normalized, norm_parameters)``. ``norm_parameters`` is a
        dict with the column-wise mean under ``'mu'`` and the column-wise
        standard deviation under ``'sigma'``, so the same transform can be
        re-applied to other data.
    """
    column_mean = np.mean(dataset, axis=0)
    column_std = np.std(dataset, axis=0)
    # The small constant keeps a constant column (std == 0) from dividing by zero.
    standardized = (dataset - column_mean) / (column_std + 1e-10)
    return standardized, {'mu': column_mean, 'sigma': column_std}

# Normalize the X data. The mean and standard deviation are computed on the
# TRAINING split only and then reused for the validation and test splits, so
# no statistics leak from held-out data into training.

x_train, norm_parameters = normalize_features(x_train)

x_val = (x_val-norm_parameters['mu'])/(norm_parameters['sigma']+1e-10)

x_test = (x_test-norm_parameters['mu'])/(norm_parameters['sigma']+1e-10)

# Hyper-parameters for the Keras model.
# NOTE(review): 'weight_regulizer' and 'optimizer' are not read anywhere in the
# visible script (the regularizer and optimizer are hard-coded further down).
params = {'lr': 0.001,
     'batch_size': 30,
     'epochs': 8000,
     'dropout': 0.5,
     'weight_regulizer':['l2'],
     'optimizer': 'adam',
     'losses': 'categorical_crossentropy',
     'activation':'relu',
     'last_activation': 'softmax'}

from keras.utils.np_utils import to_categorical

#categorical_labels = to_categorical(int_labels, num_classes=None)

if params['losses'] == 'categorical_crossentropy':
    # categorical_crossentropy needs one-hot targets.
    # NOTE(review): this assumes the raw labels are integers in [0, 4) --
    # confirm against the label CSV.
    y_train = to_categorical(y_train, num_classes=4)
    y_val = to_categorical(y_val, num_classes=4)
    y_test = to_categorical(y_test, num_classes=4)

    # NOTE(review): the model is only built inside this branch, so any other
    # value of params['losses'] leaves `model` undefined.
    model = Sequential()

    # layer 1
    # `kernel_regularizer` is the Keras 2 name of the legacy `W_regularizer`
    # argument; it matches the Keras 2 `kernel_initializer` used alongside it.
    model.add(Dense(30, input_dim=x_train.shape[1],
                    kernel_regularizer=l2(0.01),
                    kernel_initializer='he_uniform'))

    model.add(BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout']))

    # layer 2
    model.add(Dense(20, kernel_regularizer=l2(0.01),
                    kernel_initializer='he_uniform'))

    model.add(BatchNormalization(axis=-1, momentum=0.99, epsilon=0.001, center=True))
    model.add(Activation(params['activation']))
    model.add(Dropout(params['dropout']))

    # if we want to also test for number of layers and shapes, that's possible
    #hidden_layers(model, params, 1)

    # Last layer: one unit per class; the softmax turns the outputs into class
    # probabilities, which is what categorical_crossentropy expects.
    model.add(Dense(4, activation=params['last_activation'],
                    kernel_initializer='he_uniform'))

    # Use the `Adam` class imported at the top of the file instead of the
    # lowercase `keras.optimizers.adam` alias, which not every Keras release
    # provides.
    model.compile(loss=params['losses'],
                  optimizer=Adam(lr=params['lr']),
                  metrics=['categorical_accuracy'])

    # validation_data is documented as a tuple (x_val, y_val).
    history = model.fit(x_train, y_train,
                        validation_data=(x_val, y_val),
                        batch_size=params['batch_size'],
                        epochs=params['epochs'],
                        verbose=1)

下面是使用 TensorFlow 的可运行代码（它给出了一条漂亮的损失曲线，哈哈）：

# Hold out 30% of the rows, then cut that hold-out in half: the result is a
# 70% train / 15% test / 15% validation split.
# NOTE(review): X_main / Y_main are not defined in the visible snippet (the
# Keras script above uses lowercase x_main / y_main) -- presumably they are
# loaded elsewhere; confirm.
x_train, x_test, y_train, y_test = train_test_split(X_main, Y_main, test_size=0.3)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, test_size=0.5)

# ANOTHER OPTION IS TO USE SKLEARN sklearn.model_selection.ShuffleSplit
# look into stratification


# Normalize training data; will want to have the same mu and sigma for test

def normalize_features(dataset):
    """Return the column-standardized ``dataset`` and the statistics used.

    The second element of the returned tuple is a dict with keys ``'mu'``
    (column means) and ``'sigma'`` (column standard deviations); reuse it to
    apply the identical scaling to other data.
    """
    stats = {
        'mu': np.mean(dataset, axis=0),      # per-column mean
        'sigma': np.std(dataset, axis=0),    # per-column standard deviation
    }
    # Adding 1e-10 to sigma avoids a division by zero for constant columns.
    scaled = (dataset - stats['mu']) / (stats['sigma'] + 1e-10)
    return scaled, stats



# TRY LOG TRANSFORMATION LOG(1+X) to deal with outliers

# change ordinal to one hot vector
# to make label encoder
    # for c in x_train.columns[x_train.dtype == 'object']:
    # X[c] (which was copy of xtrain) X[c].factorize()[0]

# able to plot feature importance in random forest



# Normalize the X data. The mean and standard deviation come from the TRAINING
# split and are reused for the validation and test splits.

x_train, norm_parameters = normalize_features(x_train)

x_val = (x_val-norm_parameters['mu'])/(norm_parameters['sigma']+1e-10)

x_test = (x_test-norm_parameters['mu'])/(norm_parameters['sigma']+1e-10)

# Transpose everything so that each COLUMN is one example; the graph code
# below indexes examples along axis 1.
x_train = np.transpose(x_train)
x_val = np.transpose(x_val)
x_test = np.transpose(x_test)

y_train = np.transpose(y_train)
y_val = np.transpose(y_val)
y_test = np.transpose(y_test)


# Convert the pandas objects to plain NumPy arrays.
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0; both return the same ndarray.

x_train = x_train.values
x_val = x_val.values
x_test = x_test.values

y_train = y_train.values
y_val = y_val.values
y_test = y_test.values


# testing shape

#print(y_train.shape)
#print(y_val.shape)
#print(y_test.shape)
#
#print(x_train.shape)
#print(x_val.shape)
#print(x_test.shape)


# Convert integer class labels to one-hot columns, e.g. with C = 4 the label 3
# becomes the column [0 0 0 1].

def convert_to_one_hot(Y, C):
    """One-hot encode the integer labels in ``Y`` using ``C`` classes.

    ``Y`` is flattened first, so any array shape is accepted. The result has
    shape ``(C, number of labels)``: column ``j`` is the one-hot code of the
    ``j``-th label.
    """
    identity = np.eye(C)
    class_ids = Y.reshape(-1)
    # Row k of the identity matrix is the one-hot code of class k; picking one
    # row per label and transposing puts every example in its own column.
    return identity[class_ids].T

# One-hot encode the labels of every split (4 classes).
y_train, y_val, y_test = (
    convert_to_one_hot(labels, 4) for labels in (y_train, y_val, y_test)
)

# Report the data-set sizes; examples are counted along axis 1.
for caption, value in (
    ("number of training examples = ", x_train.shape[1]),
    ("number of test examples = ", x_test.shape[1]),
    ("X_train shape: ", x_train.shape),
    ("Y_train shape: ", y_train.shape),
    ("X_test shape: ", x_test.shape),
    ("Y_test shape: ", y_test.shape),
):
    print (caption + str(value))


# minibatches for later

def random_mini_batches(X, Y, mini_batch_size=64, seed=0):
    """
    Shuffle (X, Y) column-wise and split the result into mini-batches.

    Arguments:
    X -- input data, of shape (input size, number of examples)
    Y -- labels, of shape (label size, number of examples); column j of Y
         belongs to column j of X
    mini_batch_size -- size of the mini-batches, integer
    seed -- seed of the shuffle; the same seed always yields the same
            partition (pass None for a non-reproducible shuffle)

    Returns:
    mini_batches -- list of (mini_batch_X, mini_batch_Y) tuples. Every batch
                    has mini_batch_size columns except possibly the last one,
                    which holds the remaining examples. The list is empty when
                    there are no examples.
    """

    m = X.shape[1]                  # number of training examples

    # Step 1: Shuffle (X, Y) with the same permutation so the columns stay
    # paired. A local RandomState honours `seed` without disturbing the
    # global NumPy random state.
    rng = np.random.RandomState(seed)
    permutation = rng.permutation(m)
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation].reshape((Y.shape[0], m))

    # Step 2: Partition. Stepping by mini_batch_size and letting the slice run
    # past the end handles the shorter final batch without a special case.
    return [
        (shuffled_X[:, start:start + mini_batch_size],
         shuffled_Y[:, start:start + mini_batch_size])
        for start in range(0, m, mini_batch_size)
    ]









# starting TF graph


# Create X and Y placeholders

def create_xy_placeholder(n_x, n_y):
    """Create the float32 input placeholders of the graph.

    Returns a pair ``(X, Y)``: ``X`` has shape ``[n_x, None]`` and ``Y`` has
    shape ``[n_y, None]``. The second dimension is left open, so any number
    of columns can be fed.
    """
    features = tf.placeholder(tf.float32, shape=[n_x, None], name='X')
    labels = tf.placeholder(tf.float32, shape=[n_y, None], name='Y')
    return features, labels


# initialize parameters hidden layers

def initialize_parameters(n_x, scale, hidden_units):
    """Create one weight matrix and one bias vector per layer.

    Arguments:
    n_x -- number of input features
    scale -- strength of the L2 regularizer attached to every weight matrix
    hidden_units -- list with the number of units of each layer, in order

    Returns:
    parameters -- dict mapping "W1", "b1", "W2", "b2", ... to the variables.
                  "W<i>" has shape (units of layer i, units of layer i-1) and
                  "b<i>" has shape (units of layer i, 1).
    """
    layer_sizes = [n_x] + hidden_units
    l2_penalty = tf.contrib.layers.l2_regularizer(scale)
    parameters = {}

    # Walk over consecutive (input size, output size) pairs, numbering from 1.
    for layer, (fan_in, fan_out) in enumerate(zip(layer_sizes, layer_sizes[1:]), start=1):
        suffix = str(layer)
        with tf.variable_scope('hidden_parameters_' + suffix):
            # Only the weights carry the regularizer; the biases do not.
            weights = tf.get_variable("W" + suffix, [fan_out, fan_in],
                                      initializer=tf.contrib.layers.xavier_initializer(),
                                      regularizer=l2_penalty)
            bias = tf.get_variable("b" + suffix, [fan_out, 1],
                                   initializer=tf.constant_initializer(0.1))

            parameters["W" + suffix] = weights
            parameters["b" + suffix] = bias

    return parameters


# forward progression with batch norm and dropout

def forward_propagation(X, parameters, batch_norm=False, keep_prob=1):
    """Build the forward pass and return the logits of the last layer.

    Arguments:
    X -- input tensor; it is left-multiplied by "W1", so examples are columns
    parameters -- dict holding "W<i>" / "b<i>" for layers 1..L
    batch_norm -- batch normalization is added after every affine step
                  (including the last one) when this compares equal to True
    keep_prob -- when smaller than 1, dropout with this keep probability is
                 added after every hidden activation

    Returns:
    z -- pre-softmax output of the last layer; no activation is applied to it

    NOTE(review): tf.layers.batch_normalization is called without a `training`
    argument. The TF 1.x documentation lists its default as training=False
    (normalize with the moving statistics) -- confirm that is intended.
    """
    n_layers = int(len(parameters) / 2)     # every layer contributes one W and one b

    def affine(layer_id, inputs):
        # W.a + b, optionally followed by batch normalization over axis 0.
        w = parameters['W' + layer_id]
        b = parameters['b' + layer_id]
        pre_activation = tf.matmul(w, inputs) + b
        # Compared with `== True` rather than by truthiness, exactly as callers
        # have always observed it.
        if batch_norm == True:
            pre_activation = tf.layers.batch_normalization(pre_activation, momentum=0.99, axis=0)
        return pre_activation

    activations = X

    # Hidden layers 1 .. L-1: affine -> (batch norm) -> ReLU -> (dropout).
    for layer in range(1, n_layers):
        layer_id = str(layer)
        with tf.name_scope('forward_pass_' + layer_id):
            activations = tf.nn.relu(affine(layer_id, activations))

            if keep_prob < 1:
                activations = tf.nn.dropout(activations, keep_prob)

            tf.summary.histogram('act_' + layer_id, activations)

    # Last layer: only the affine step (and batch norm); the result is handed
    # to the cost function as logits.
    with tf.name_scope('forward_pass_' + str(n_layers)):
        z = affine(str(n_layers), activations)

    return z

# compute cost with option for l2 regularizatoin

def compute_cost(z, Y, parameters, l2_reg=False):
    """Build the loss, prediction and accuracy ops.

    Arguments:
    z -- logits, one column per example
    Y -- one-hot labels, one column per example
    parameters -- not used by this function; kept in the signature for callers
    l2_reg -- when it compares equal to True, the sum of everything in the
              graph's REGULARIZATION_LOSSES collection is added to the cost

    Returns:
    cost -- mean softmax cross-entropy (plus the optional regularization term)
    prediction -- argmax of z along its first axis
    accuracy -- fraction of columns whose argmax matches between z and Y
    """
    with tf.name_scope('cost'):
        # softmax_cross_entropy_with_logits wants the class dimension last,
        # hence the transposes.
        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(
            logits=tf.transpose(z),
            labels=tf.transpose(Y))
        cost = tf.reduce_mean(per_example_loss)

        if l2_reg == True:
            penalties = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
            cost = cost + tf.reduce_sum(penalties)

    with tf.name_scope('Pred/Accuracy'):
        prediction = tf.argmax(z)
        hits = tf.equal(tf.argmax(z), tf.argmax(Y))
        accuracy = tf.reduce_mean(tf.cast(hits, "float"))

    return cost, prediction, accuracy


# defining the model (need to add keep_prob for dropout)

def model(X_train, Y_train, X_test, Y_test,
          hidden_units=[30, 20, 4],            # hidden units/layers
          learning_rate = 0.0001,                       # Learning rate
          num_epochs = 10000, minibatch_size = 30,       # minibatch/ number epochs
          keep_prob=0.5,                                # dropout
          batch_norm=True,                              # batch normalization
          l2_reg=True, scale = 0.01,                    # L2 regularization/scale is lambda
          print_cost = True):
    """Build, train and evaluate the fully connected network.

    Arguments:
    X_train, X_test -- inputs of shape (number of features, number of examples)
    Y_train, Y_test -- one-hot labels of shape (number of classes, number of examples)
    hidden_units -- units per layer; the last entry is the number of classes.
                    The default list is never mutated here.
    learning_rate -- step size of the gradient-descent optimizer
    num_epochs -- number of passes over the training set
    minibatch_size -- number of examples per mini-batch
    keep_prob -- dropout keep probability (1 disables dropout)
    batch_norm -- whether batch normalization is added
    l2_reg, scale -- whether L2 regularization is added to the cost, and its strength
    print_cost -- print progress every 100 epochs and record the cost every 5

    Returns:
    parameters -- dict of the trained weight/bias values as NumPy arrays

    Relies on `tf`, `ops`, `plt` and `np` being imported at file level.
    """

    ops.reset_default_graph()                         # to be able to rerun the model without overwriting tf variables
    tf.set_random_seed(1)                             # to keep consistent results
    seed = 3                                          # to keep consistent results
    (n_x, m) = X_train.shape                          # (n_x: input size, m : number of examples in the train set)
    n_y = Y_train.shape[0]                            # n_y : output size
    costs = []                                        # To keep track of the cost

    # Create Placeholders of shape (n_x, n_y)
    X, Y = create_xy_placeholder(n_x, n_y)

    # Initialize parameters
    parameters = initialize_parameters(n_x, scale, hidden_units)

    # Forward propagation. The arguments are passed BY KEYWORD because
    # forward_propagation declares them as (batch_norm, keep_prob); the former
    # positional call (keep_prob, batch_norm) swapped them, so dropout was
    # never applied and batch norm was only enabled when keep_prob == 1.
    # NOTE: keep_prob is baked into the graph, so with keep_prob < 1 dropout is
    # also active in the accuracy evaluations at the end of this function.
    z = forward_propagation(X, parameters, batch_norm=batch_norm, keep_prob=keep_prob)

    # Cost function: Add cost function to tensorflow graph
    cost, prediction, accuracy = compute_cost(z, Y, parameters, l2_reg)

    # Built once here rather than inside the training loop, which would add a
    # new op to the graph every time progress is printed.
    true_class = tf.argmax(Y)

    # Backpropagation: plain gradient descent (an earlier comment mentioned
    # Adam, but the code has always used GradientDescentOptimizer).
    with tf.name_scope('optimizer'):

        optimizer = tf.train.GradientDescentOptimizer(learning_rate = learning_rate).minimize(cost)

    # Initialize all the variables
    init = tf.global_variables_initializer()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True

    # Start the session to compute the tensorflow graph
    with tf.Session(config=config) as sess:
        # Run the initialization
        sess.run(init)

        # Do the training loop
        for epoch in range(num_epochs):

            epoch_cost = 0.                       # Defines a cost related to an epoch
            seed = seed + 1                       # a new shuffle every epoch
            minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)

            # Average over the batches actually produced. This includes the
            # shorter final batch and cannot be zero while there is data,
            # unlike int(m / minibatch_size).
            num_minibatches = len(minibatches)

            for minibatch_X, minibatch_Y in minibatches:

                # IMPORTANT: The line that runs the graph on a minibatch.
                _ , minibatch_cost = sess.run([optimizer, cost],
                                              feed_dict = {X: minibatch_X, Y: minibatch_Y})

                epoch_cost += minibatch_cost / num_minibatches

            # Print the cost and the predictions on the last mini-batch
            if print_cost and epoch % 100 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
                last_batch = {X: minibatch_X, Y: minibatch_Y}
                print('prediction: ', prediction.eval(feed_dict=last_batch))
                print('correct: ', true_class.eval(feed_dict=last_batch))

            if print_cost and epoch % 5 == 0:
                costs.append(epoch_cost)

        # plot the cost
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('epochs (per fives)')          # costs are recorded every 5 epochs
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        # lets save the parameters in a variable
        parameters = sess.run(parameters)
        print ("Parameters have been trained!")

        # `accuracy` is the op returned by compute_cost; there is no need to
        # rebuild it here.
        print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
        print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test}))

        return parameters


# Train on the training split and report the accuracy on the test split.
# With keep_prob=1 no dropout is applied; the returned value holds the
# trained weights and biases.


parameters = model(x_train, y_train, x_test, y_test, keep_prob=1)

损失对比：Keras 的输出为 nan，而 TensorFlow 的输出不是 nan（可以正常工作）

0 个答案: