My simple reinforcement learning model is not learning, and I don't know why

Asked: 2021-02-10 16:15:11

Tags: python tensorflow reinforcement-learning

I am a beginner with TensorFlow and reinforcement learning programming. I wrote a simple program using a reinforcement learning algorithm; more precisely, I reworked an example program from the book "Hands-On Reinforcement Learning with Python" by Sudharsan Ravichandiran.

I started from the following example: https://github.com/sudharsan13296/Hands-On-Reinforcement-Learning-With-Python/blob/master/08.%20Atari%20Games%20with%20DQN/8.8%20Building%20an%20Agent%20to%20Play%20Atari%20Games.ipynb

This example builds an agent that plays the Atari game Pacman with a Deep Q Network. In my program I kept the DQN algorithm but changed the model and the state vector. The agent is now a car on a two-dimensional plane. It starts at the position x=1000, y=1000, and my idea is that the car has to drive to the position x=0, y=0. I set the reward to f = 1/d1 - 1/d0, where d0 = sqrt(x0^2 + y0^2) is the car's distance from the origin at its previous position (x0, y0) and d1 = sqrt(x1^2 + y1^2) is its distance at the next position (x1, y1). So the reward is positive whenever the car moves closer to (0, 0).
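
As a quick sanity check of this reward (the same quantity ModelStep computes in the code below; the positions here are chosen only for illustration), a step toward the origin gives a small positive value and a step away gives a small negative one:

    import math

    def step_reward(x0, y0, x1, y1):
        # Same quantity as in ModelStep: 1/NextDistance - 1/PreviousDistance.
        d0 = math.hypot(x0, y0)   # distance from the origin before the step
        d1 = math.hypot(x1, y1)   # distance from the origin after the step
        return 1.0 / d1 - 1.0 / d0

    # One unit step straight toward the origin: about +5e-7.
    print(step_reward(1000.0, 1000.0, 1000.0 - math.cos(math.pi/4), 1000.0 - math.sin(math.pi/4)))
    # One unit step straight away from the origin: about -5e-7.
    print(step_reward(1000.0, 1000.0, 1000.0 + math.cos(math.pi/4), 1000.0 + math.sin(math.pi/4)))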

The state vector has only 4 dimensions: [x, y, cos(fi), sin(fi)], where x, y are the car's coordinates and fi is its heading angle in the 2D plane. The car has three actions: it can turn right, turn left, or keep its current heading, as illustrated below.
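
To make the action encoding concrete, here is the mapping from the discrete action index to the steering change, using the same formula as n_to_float_action in the code below:

    # The three discrete actions map to a steering change per step of
    # -MaxAbsAction, 0 or +MaxAbsAction radians.
    MaxAbsAction, n_outputs = 0.1, 3
    for n in range(n_outputs):
        print(n, -MaxAbsAction + 2*MaxAbsAction/(n_outputs-1)*n)
    # output: 0 -0.1, 1 0.0, 2 0.1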

As you can see, this is a very simple model, but the DQN algorithm does not learn: it cannot find a good policy, and the car never drives to the position (0, 0).

I would like to ask reinforcement learning experts to run my simple program in Python and find the problem in this very simple program.

    import math
    import numpy as np
    import matplotlib.pyplot as plt

    import tensorflow as tf2
    import tensorflow.compat.v1 as tf
    from tensorflow.compat.v1.layers import dense
    from collections import deque, Counter

    # Car state: heading angle fi, position (xx, yy), constant speed V.
    fi = np.float(0)
    xx = np.float(0)
    yy = np.float(0)
    V = np.float(1)
    MaxAbsAction = 0.1   # maximum steering change per step (radians)

    N = 1000             # maximum number of steps per episode
    x_Data = np.zeros(N, dtype=np.float)   # trajectory history for plotting
    y_Data = np.zeros(N, dtype=np.float)
    i_D = int(0)

    def ModelReset():
        # Reset the environment: put the car at (1000, 1000) with heading fi = 0
        # and return the initial observation [x, y, cos(fi), sin(fi)].
        global fi, xx, yy, x_Data, y_Data, i_D

        fi = np.float(0)
        xx = np.float(1000)
        yy = np.float(1000)
        i_D = int(0)
        x_Data[i_D] = xx
        y_Data[i_D] = yy

        obs = np.array([xx,yy,math.cos(fi),math.sin(fi)],dtype=np.float)
        return obs

    def ModelStep(action):
        # Advance the simulation by one step: clip the steering action, update
        # the heading and position, and return (next_obs, reward, done).
        global fi, xx, yy, x_Data, y_Data
        global MaxAbsAction, i_D, V, N

        PreviousDistance = math.sqrt( xx*xx + yy*yy )

        if action > MaxAbsAction:
           action = MaxAbsAction

        if action < -MaxAbsAction:
           action = -MaxAbsAction

        fi += action
        xx += V * math.cos(fi)
        yy += V * math.sin(fi)
        i_D += 1
        x_Data[i_D] = xx
        y_Data[i_D] = yy

        NextDistance = math.sqrt( xx*xx + yy*yy )
        # Reward: change of the inverse distance to the origin (positive when
        # the car moves closer). The episode ends after N steps or when the car
        # is within 20 units of the origin.
        reward = (1/NextDistance - 1/PreviousDistance)
        next_obs = np.array([xx,yy,math.cos(fi),math.sin(fi)],dtype=np.float)
        done = i_D>=N-1 or NextDistance < 20
        return next_obs, reward, done

    n_outputs = 3   # number of discrete steering actions

    def n_to_float_action(n):
        # Map the discrete action index {0, 1, 2} to a steering change of
        # {-MaxAbsAction, 0, +MaxAbsAction} radians.
        global MaxAbsAction
        return -MaxAbsAction + 2*MaxAbsAction/(n_outputs-1)*n

    def Q_Network(X, name_scope):
        # A small fully connected Q-network: two hidden layers and a linear
        # output with one Q-value per discrete action. Returns a dict of the
        # scope's trainable variables (used for the weight-copy op) and the
        # output tensor.
        initializer = tf.keras.initializers.VarianceScaling()

        with tf.variable_scope(name_scope) as scope:
            fc1 = dense(X, 100, kernel_initializer=initializer,
                        activation=tf.keras.activations.sigmoid)

            fc2 = dense(fc1, 100, kernel_initializer=initializer,
                        activation=tf.keras.activations.relu)

            output = dense(fc2, n_outputs,
                           kernel_initializer=initializer)

            Q_vars = {v.name[len(scope.name):]: v for v in
                      tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                        scope=scope.name)}
            return Q_vars, output

    epsilon = 0.1
    eps_min = 0.01
    eps_max = 1
    eps_decay_steps = 5000000

    def epsilon_greedy(action, step):
        # Epsilon-greedy exploration: with probability epsilon (linearly
        # annealed from eps_max to eps_min over eps_decay_steps steps) pick a
        # random action, otherwise keep the greedy action.
        epsilon = max(eps_min, eps_max-(eps_max-eps_min)*step/eps_decay_steps)

        if np.random.rand(1) < epsilon:
            return np.random.randint(n_outputs), epsilon
        else:
            return action, epsilon
        
    def sample_memories(batch_size):
        # Sample a random minibatch of transitions
        # (obs, action, next_obs, reward, done) from the replay buffer.
        if exp_buffer_full:
            size_buff = exp_buffer_length
        else:
            size_buff = exp_buffer_pos

        perm_batch = np.random.permutation(size_buff)[:batch_size]
        mem = exp_buffer[perm_batch]
        return mem[:,0],mem[:,1],mem[:,2],mem[:,3],mem[:,4]

    num_episodes = 100000
    batch_size = 200
    learning_rate = 0.001
    X_shape = (None,4)        # batches of 4-dimensional state vectors
    discount_factor = 0.97

    global_step = 0
    copy_steps = 10000        # copy the trained weights every copy_steps steps
    steps_train = 40          # do one training step every steps_train steps
    start_steps = 2000        # warm-up steps before training starts
    logdir = 'logs'

    # Replay buffer: each row stores (obs, action, next_obs, reward, done).
    exp_buffer_length = 1000000
    exp_buffer_pos = 0
    exp_buffer_full = False
    exp_buffer = np.zeros(shape=(exp_buffer_length,5), dtype=object)

    tf.compat.v1.disable_eager_execution()

    # Placeholders for the state batch and a training-mode flag (the flag is
    # kept from the original example but is not connected to any layer here).
    X = tf.placeholder(tf.float32, shape=X_shape,name='X')
    in_training_mode = tf.placeholder(tf.bool,name='in_training_mode')

    # Two Q-networks with identical architecture.
    mainQ, mainQ_outputs = Q_Network(X,'mainQ')
    targetQ, targetQ_outputs = Q_Network(X,'targetQ')

    # Q-value of the action that was actually taken.
    X_action = tf.placeholder(tf.int32, shape=(None,),name='X_action')
    Q_action = tf.reduce_sum(
        targetQ_outputs * tf.one_hot(X_action, n_outputs),
        axis=-1, keepdims=True )

    # Op that copies the trained (targetQ) weights into the mainQ network,
    # which is used for action selection and for the bootstrap targets.
    copy_op = [tf.assign(main_name,targetQ[var_name])
               for var_name, main_name in mainQ.items() ]
    copy_target_to_main = tf.group(*copy_op)

    # Mean squared error between the TD target y and the predicted Q-value.
    y = tf.placeholder( tf.float32, shape=(None,1), name='y' )
    loss = tf.reduce_mean( tf.square(y-Q_action) )

    optimizer = tf.train.AdamOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

    loss_summary = tf.summary.scalar('LOSS',loss)
    merge_summary = tf.summary.merge_all()
    file_writer = tf.summary.FileWriter(logdir,tf.get_default_graph())

    train_loss = None

    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        init.run()

        for i in range(num_episodes):
            done = False
            obs = ModelReset()
            epoch = 0
            episodic_reward = 0
            action_counter = Counter()
            episodic_loss = []
            
            while not done:

                # Greedy action from the action-selection network, then
                # epsilon-greedy exploration on top of it.
                actions = mainQ_outputs.eval(
                    feed_dict={X:[obs], in_training_mode:False})

                action = np.argmax(actions,axis=-1)
                action_counter[str(action)] += 1

                action, epsilonn = epsilon_greedy(action, global_step)

                # Take one environment step and store the transition in the
                # replay buffer.
                next_obs, reward, done = ModelStep(n_to_float_action(action))

                exp_buffer[exp_buffer_pos,:] = np.array([obs, action, next_obs, reward, done],dtype=object)
                
                exp_buffer_pos += 1
                if exp_buffer_pos >= exp_buffer_length:
                    exp_buffer_pos = 0
                    exp_buffer_full = True            
                
                # Every steps_train environment steps (after a warm-up of
                # start_steps steps), sample a minibatch and do one gradient step.
                if global_step % steps_train == 0 and global_step > start_steps:
                    o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)

                    o_obs = [x for x in o_obs]
                    o_next_obs = [x for x in o_next_obs]

                    next_act = mainQ_outputs.eval(
                        feed_dict={X:o_next_obs,in_training_mode:False})

                    # TD target: reward plus the discounted maximum Q-value of
                    # the next state.
                    y_batch = o_rew + discount_factor * np.max(next_act,axis=-1)
                    
                    train_loss, _ = sess.run( [loss, training_op],
                        feed_dict={X:np.array(o_obs,dtype=np.float), 
                                   y:np.expand_dims(
                                      np.array(y_batch,dtype=np.float),axis=-1), 
                                   X_action:np.array(o_act,dtype=np.int32), 
                                   in_training_mode:True } )

                
                # Periodically copy the trained weights into the
                # action-selection network.
                if (global_step+1) % copy_steps == 0 and global_step > start_steps:
                    copy_target_to_main.run()
                    print('copy_target_to_main.run()')
                
                obs = next_obs
                epoch += 1
                global_step += 1
                episodic_reward += reward
            
            print('Episode', i, 'Reward', episodic_reward, 'epsilon', epsilonn,
                  'loss', train_loss )        
            
            # Plot the car's trajectory every 100 episodes.
            if (i+1) % 100 == 0:
                plt.plot(x_Data,y_Data)
                plt.show()

1 Answer:

Answer 0 (score: 0)

I found the problem in my simple program: I have to normalize the state vector and the reward so that the values lie in the interval [-1, 1], and I was not doing that. Once I added the normalization, my simple program started to learn well.
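
For illustration, here is a minimal sketch of the kind of normalization I mean; the scale constants POS_SCALE and REWARD_SCALE are assumptions chosen for this sketch, not the exact values from my program:

    # A minimal normalization sketch; POS_SCALE and REWARD_SCALE are
    # illustrative constants (assumptions), not the exact values of my fix.
    import numpy as np

    POS_SCALE = 1500.0      # roughly the largest |x| or |y| the car reaches
    REWARD_SCALE = 1.0e6    # a raw step reward is of order 1e-7..1e-6

    def normalize_obs(obs):
        # Scale x and y into roughly [-1, 1]; cos(fi) and sin(fi) already are.
        return np.array([obs[0] / POS_SCALE, obs[1] / POS_SCALE, obs[2], obs[3]])

    def normalize_reward(reward):
        return reward * REWARD_SCALE

    # Inside the training loop the normalized values would replace the raw ones:
    #   obs = normalize_obs(ModelReset())
    #   next_obs, reward, done = ModelStep(n_to_float_action(action))
    #   next_obs, reward = normalize_obs(next_obs), normalize_reward(reward)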