Why does my DQN training with TensorFlow get slower with every iteration?

Asked: 2019-02-01 12:31:11

Tags: python-3.x tensorflow

I am trying to implement a DQN with experience replay in TensorFlow. It seems to be working, i.e. my loss is decreasing. However, as the training loop runs, I notice that each training iteration gets slower and slower. It is as if my TensorFlow graph keeps growing and slows training down. I can't see what is wrong with my code myself. Can any TensorFlow expert out there point it out? I have made a scaled-down version of the code below that operates on random data but reproduces the same problem.
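
To sanity-check the suspicion that the graph itself is growing, a minimal check (just a sketch, assuming TF 1.x; it is not part of the reproduction below) would be to count the ops in the default graph once per training iteration:

import tensorflow as tf

# If this count keeps increasing between training iterations, new ops/variables
# are being added to the default graph inside the loop.
print('ops in graph:', len(tf.get_default_graph().get_operations()))

# Alternatively, tf.get_default_graph().finalize() makes any accidental op
# creation raise an error instead of silently growing the graph.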

import numpy as np
import tensorflow as tf

# Function which initializes tensorflow weights for feed-forward NN.
def InitWeights(LayerSizes):
    # Make tensorflow input/output placeholders    
    X = tf.placeholder(shape = (None,LayerSizes[0]),  dtype = tf.float32, name ='InputData')
    y = tf.placeholder(shape = (None,LayerSizes[-1]), dtype = tf.float32, name ='OutputData')
    # Initialize dictionaries for weights and biases.
    W = {}
    b = {}
    for ii in range(len(LayerSizes)-1):
        layername = f'layer{ii}'
        with tf.variable_scope(layername):
            ny = LayerSizes[ii]
            nx = LayerSizes[ii+1]
            # Weights (initialized with Xavier initialization).
            W['Weights_'+layername] = tf.get_variable(
                                name = 'Weights_'+layername,
                                shape = (ny, nx),
                                initializer = tf.contrib.layers.xavier_initializer(),
                                dtype = tf.float32
                                )
            # Bias (initialized with Xavier initialization).
            b['Bias_'+layername] = tf.get_variable(
                                name = 'Bias_'+layername,
                                shape = (nx,),
                                initializer = tf.contrib.layers.xavier_initializer(),
                                dtype = tf.float32
                                )
    return W, b, X, y

# Function which defines feed-forward neural network operation.
def FeedForward(X, W, b):
    a = X
    # Loop all layers of the network.
    for ii in range(len(W)):
        # Use name of each layer as index.
        layername = f'layer{ii}'
        # Weighted sum: z = input*W + b
        z = tf.add(tf.matmul(a, W['Weights_'+layername], name = 'WeightedSum_z_'+layername), b['Bias_'+layername])
        # Pass through activation fcn: a = h(z)
        if ii == len(W)-1:
            a = z
        else:
            a = tf.nn.relu(z, name = 'activation_a_'+layername)
    return a

# Function used for experience replay   
def ExperienceReplay(s, a, r, s_prime, gamma, TermState, X, y, yhat, yhatNN2, train_op, loss, sess):
    # Inputs:
    # s         - state(s)
    # a         - actions(s)
    # r         - rewards(s)
    # s_prime   - new state(s)
    # gamma     - discount factor
    # TermState - scalar of which action is terminating
    # X         - tensorflow placeholder for network inputs
    # y         - tensorflow placeholder for network outputs
    # yhat      - tensorflow operation for feed forward with NN 1
    # yhatNN2   - tensorflow operation for feed forward with NN 2
    # train_op  - tensorflow training operation
    # loss      - tensorflow fcn for calculating loss
    # sess      - tensorflow session

    # Forward pass through NN2 using s_prime to find max(Q(s',a',theta')).
    Q = sess.run(yhatNN2, feed_dict={X : s_prime}) 

    # Actions that NN1 thinks are best at state s_prime.
    a_argmax = np.argmax(sess.run(yhat, feed_dict={X : s_prime}), axis=1)
    # Values from NN2's opinion about the actions NN1 picked.
    Qm = np.zeros(len(r))
    for obs in range(len(r)):
        Qm[obs] = Q[obs,a_argmax[obs]]

    # First make all targets equal to NN1's approximation of Q (so the error is 0 in all unobserved cases) 
    Targets = sess.run(yhat, feed_dict={X : s})

    # If the action was experienced, change the target to either real reward or discounted future reward.
    for obs in range(len(r)):
        # If the action was episode-terminating, use only reward as target.
        if int(a[obs]) == TermState:
            Targets[obs,int(a[obs])] = r[obs]
        # Otherwise use discounted future reward.
        else:
            Targets[obs,int(a[obs])] = r[obs] + gamma*Qm[obs]
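
    # To spell out what the two loops above build: the regression target is the
    # double-DQN style target
    #   Targets[obs, a] = r[obs]                                          if a is the terminating action
    #   Targets[obs, a] = r[obs] + gamma * Q2(s', argmax_a' Q1(s', a'))   otherwise,
    # where Q1 is the online network (yhat) and Q2 is the copied network (yhatNN2).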

    # Gradient descent: one step on NN1 weights.
    sess.run(train_op, feed_dict={X : s, y : Targets})

    # Calculate the losses.
    loss_val = sess.run(loss, feed_dict={X : s, y : Targets})
    meanloss = np.mean(loss_val)

    return loss_val, meanloss


if __name__ == "__main__":

    #### Hyperparameter settings
    N         = 64     # Minibatch size during training
    gamma     = 0.99   # Discount rate
    C         = 100    # How many iterations between NN sync NN2 = NN1
    lr        = 1e-7   # Learning rate of NN during training
    nstates   = 256    # Number of possible states
    nactions  = 256    # Number of possible actions
    TermState = 255    # Which state ends episode
    """
    Initialize tensorflow session and create one NN with two sets of weights
    """
    # Initialize & configure action-value function Q with random weights theta.
    LayerSizes = [nstates, 1024, 1024, nactions]
    W, b, X, y = InitWeights(LayerSizes)

    # Define loss function to optimize. Here: quadratic loss fcn, (y - yhat)^2.
    yhat = FeedForward(X, W, b)
    loss = tf.reduce_sum(tf.square(y - yhat), axis=[0])

    # Define optimizer to use when minimizing loss function.
    all_variables = tf.trainable_variables()
    optimizer     = tf.train.AdamOptimizer(learning_rate = lr)
    train_op      = optimizer.minimize(loss, var_list = all_variables)

    # Initialize target action-value function Qhat with random weights theta_= theta.
    with tf.device('/gpu:0'):
        W2 = {}
        b2 = {}
        # Make hard copy of tensorflow Weights and biases
        for key in W:
            W2[key] = tf.Variable(W[key].initialized_value())
        for key in b:
            b2[key] = tf.Variable(b[key].initialized_value())
        yhatNN2 = FeedForward(X, W2, b2)

    # Start tf session and initialize variables.
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    ## Generate random data representing state transitions <s,a,r,s'>.
    # Random states
    Ds       = np.random.rand(100000,nstates)>0.5
    Ds       = Ds.astype(np.float32)
    # Random actions
    Da       = np.random.randint(0,nactions,(100000,1)).astype(np.float32)
    # Random rewards
    Dr       = np.random.rand(100000,1).astype(np.float32)
    # Random new states
    Ds_prime  = np.random.rand(100000,nstates)>0.5
    Ds_prime  = Ds_prime.astype(np.float32)

    """
    Pretrain the network and report the time every C iterations
    """
    import time
    t0 = time.time()
    for i in range(100000):
        # Randomly pick minibatch to use
        MemsToUse = np.random.choice(len(Dr), N)
        s      = Ds[MemsToUse,:]
        a      = Da[MemsToUse,0]
        r      = Dr[MemsToUse,0]
        sprime = Ds_prime[MemsToUse,:]

        # Experience replay.
        loss_val, meanloss = ExperienceReplay(s, a, r, sprime, gamma,TermState, X, y, yhat, yhatNN2, train_op, loss, sess)

        # Every C iterations, copy NN2 = NN1.
        if (i % C) == 0:
            t1 = time.time()
            print('iter: %i meanloss: %0.5f iteration took %0.2f s' %(i,meanloss,t1-t0))
            t0 = time.time()
            with tf.device('/gpu:0'):
                for key in W:
                    W2[key] = tf.Variable(W[key].initialized_value())
                for key in b:
                    b2[key] = tf.Variable(b[key].initialized_value())

Update: After timing the individual parts of the code, it appears to be the first line of my experience replay function:

Q = sess.run(yhatNN2, feed_dict={X : s_prime})

that is causing most, if not all, of the slowdown. I don't understand the logic behind why that would be, i.e. several feed-forward passes happen in the program, but only this one seems to cause a problem.
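
For reference, by "timing the individual parts" I mean wrapping the individual sess.run calls roughly like this (only a sketch; s, s_prime, Targets, etc. are the variables inside ExperienceReplay):

import time

t = time.time()
Q = sess.run(yhatNN2, feed_dict={X: s_prime})    # this is the call that keeps getting slower
print('NN2 forward pass: %0.4f s' % (time.time() - t))

t = time.time()
sess.run(train_op, feed_dict={X: s, y: Targets})
print('train_op step:    %0.4f s' % (time.time() - t))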

0 Answers:

No answers yet.