我是 tensorflow 和强化学习编程的初学者。我用强化学习算法制作了简单的程序。更准确地说,我从 Sudharsan Ravichandiran 的“Hands-On-Reinforcement-Learning-With-Python”一书中重新编写了示例程序。
此示例使用 Deep Q Network 构建一个代理来玩 Atari 游戏 Pacman。在我的程序中,我保留了 DQN 算法并更改了模型和状态向量。 现在代理是二维平面上的汽车。它从位置 x=1000, y=1000 开始运动。根据我的想法,汽车必须行驶到坐标 x=0, y=0 的位置。 我将奖励设置为函数 f=1/(x1^2+y1^2) - 1/(x0^2+y0^2),其中 (x0,y0) 是汽车的前一个位置,(x1,y1) – 汽车的下一个位置。因此,如果汽车行驶到位置 (0,0),则奖励会增加。
状态向量只有 4 维:[x,y,sin(fi),cos(fi)],其中 x,y 是汽车的两个坐标,fi 是汽车在二维平面中的角度。车有动作。可以右转、左转或同方向移动。
如您所见,这是一个非常简单的模型。但是 DQN 算法没有学习。它找不到好的策略,汽车不会行驶到位置 (0,0)。
我想请强化学习专家在python中运行我的简单程序并找出这个非常简单的程序中的问题。
import math
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf2
import tensorflow.compat.v1 as tf
from tensorflow.compat.v1.layers import dense
from collections import deque, Counter
# ---- Global simulation state for the 2-D car model ----
# BUG FIX: np.float was deprecated in NumPy 1.20 and removed in 1.24;
# plain Python float / dtype=float is the drop-in replacement.
fi = 0.0            # heading angle of the car (radians)
xx = 0.0            # current x coordinate
yy = 0.0            # current y coordinate
V = 1.0             # constant speed: distance travelled per step
MaxAbsAction = 0.1  # max |steering change| per step (radians)
N = 1000            # max steps per episode / size of the trajectory buffers
# Trajectory history of the current episode, for plotting.
x_Data = np.zeros(N, dtype=float)
y_Data = np.zeros(N, dtype=float)
i_D = 0             # index of the current step within the episode
def ModelReset():
    """Reset the car to its start pose and return the initial observation.

    The car starts at (1000, 1000) with heading fi = 0 and the trajectory
    buffers are rewound.  The observation vector is [x, y, cos(fi), sin(fi)].
    """
    global fi, xx, yy, x_Data, y_Data, i_D
    fi = 0.0
    xx = 1000.0
    yy = 1000.0
    i_D = 0
    x_Data[i_D] = xx
    y_Data[i_D] = yy
    # BUG FIX: dtype=np.float is removed in modern NumPy; use builtin float.
    obs = np.array([xx, yy, math.cos(fi), math.sin(fi)], dtype=float)
    return obs
def ModelStep(action):
    """Advance the car one simulation step.

    action: steering change in radians, clipped to
        [-MaxAbsAction, +MaxAbsAction].
    Returns (next_obs, reward, done).  The reward is
    1/d_next - 1/d_prev where d is the distance to the origin, so moving
    toward (0, 0) yields a positive reward.  The episode ends when the
    trajectory buffer is full or the car comes within 20 units of the origin.
    """
    global fi, xx, yy, x_Data, y_Data
    global MaxAbsAction, i_D, V, N
    PreviousDistance = math.hypot(xx, yy)
    # Clip the steering command to the allowed range.
    action = max(-MaxAbsAction, min(MaxAbsAction, action))
    fi += action
    xx += V * math.cos(fi)
    yy += V * math.sin(fi)
    i_D += 1
    x_Data[i_D] = xx
    y_Data[i_D] = yy
    NextDistance = math.hypot(xx, yy)
    reward = 1 / NextDistance - 1 / PreviousDistance
    # BUG FIX: dtype=np.float is removed in modern NumPy; use builtin float.
    next_obs = np.array([xx, yy, math.cos(fi), math.sin(fi)], dtype=float)
    done = i_D >= N - 1 or NextDistance < 20
    return next_obs, reward, done
# Number of discrete actions: turn left, go straight, turn right.
n_outputs = 3

def n_to_float_action(n):
    """Map a discrete action index n in {0, ..., n_outputs - 1} onto an
    evenly spaced steering value in [-MaxAbsAction, +MaxAbsAction]."""
    global MaxAbsAction
    step = 2 * MaxAbsAction / (n_outputs - 1)
    return n * step - MaxAbsAction
def Q_Network(X, name_scope) :
    """Build a fully connected Q-network: 4 -> 100 -> 100 -> n_outputs.

    X: placeholder of state vectors, shape (None, 4).
    name_scope: variable-scope name; isolates this network's variables and
        is stripped from the keys of the returned variable dict.
    Returns (Q_vars, output): a dict mapping scope-relative variable names
    to tf.Variable objects, and the (None, n_outputs) Q-value tensor.
    """
    initializer = tf.keras.initializers.VarianceScaling()
    with tf.variable_scope(name_scope) as scope:
        # NOTE(review): a sigmoid first layer may saturate for the raw,
        # un-normalized inputs (x, y start at 1000) fed by this model.
        fc1 = dense(X, 100, kernel_initializer=initializer,
                    activation=tf.keras.activations.sigmoid)
        fc2 = dense(fc1, 100, kernel_initializer=initializer,
                    activation=tf.keras.activations.relu)
        output = dense( fc2, n_outputs,
                        kernel_initializer=initializer )
        # Collect this scope's trainable variables keyed by their name with
        # the scope prefix stripped, so two networks built under different
        # scopes can be matched variable-by-variable (used by the copy ops).
        Q_vars = {v.name[len(scope.name):]: v for v in
                  tf.get_collection(key=tf.GraphKeys.TRAINABLE_VARIABLES,
                                    scope=scope.name)}
        return Q_vars, output
# Epsilon-greedy exploration schedule: epsilon decays linearly from
# eps_max to eps_min over eps_decay_steps environment steps.
epsilon = 0.1
eps_min = 0.01
eps_max = 1
eps_decay_steps = 5000000

def epsilon_greedy(action, step):
    """Return (chosen_action, epsilon) for the given global training step.

    With probability epsilon (linearly annealed in `step`) a uniformly
    random discrete action replaces the greedy `action`.
    """
    # BUG FIX: the original drew an unused random number
    # (p = np.random.random(1).squeeze()) before the decision below;
    # the dead draw is removed — the policy distribution is unchanged.
    epsilon = max(eps_min, eps_max - (eps_max - eps_min) * step / eps_decay_steps)
    if np.random.rand(1) < epsilon:
        return np.random.randint(n_outputs), epsilon
    else:
        return action, epsilon
def sample_memories(batch_size):
    """Draw a random mini-batch of transitions from the replay buffer.

    Returns five column arrays: obs, action, next_obs, reward, done.
    """
    # Only the filled portion of the ring buffer is eligible for sampling.
    count = exp_buffer_length if exp_buffer_full else exp_buffer_pos
    picks = np.random.permutation(count)[:batch_size]
    batch = exp_buffer[picks]
    return (batch[:, 0], batch[:, 1], batch[:, 2],
            batch[:, 3], batch[:, 4])
# ---- Training hyper-parameters ----
num_episodes = 100000    # episodes to run
batch_size = 200         # transitions per SGD update
learning_rate = 0.001    # Adam step size
X_shape = (None,4)       # state placeholder shape: [x, y, cos(fi), sin(fi)]
discount_factor = 0.97   # gamma in the one-step TD target
global_step = 0          # total environment steps taken so far
copy_steps = 10000       # steps between target/main network synchronisations
steps_train = 40         # run one training update every this many steps
start_steps = 2000       # pure experience-collection steps before training
logdir = 'logs'          # TensorBoard summary directory
# ---- Experience replay ring buffer ----
# Each row is [obs, action, next_obs, reward, done]; dtype=object because
# the columns mix arrays and scalars.
exp_buffer_length = 1000000
exp_buffer_pos = 0;      # next write position in the ring buffer
exp_buffer_full = False  # True once the buffer has wrapped around
exp_buffer = np.zeros(shape=(exp_buffer_length,5), dtype=object)
# ---- Build the TF1-style computation graph ----
tf.compat.v1.disable_eager_execution()
# State input; in_training_mode is declared and fed but not consumed by
# any layer in this graph (dense layers have no train/eval distinction).
X = tf.placeholder(tf.float32, shape=X_shape,name='X')
in_training_mode = tf.placeholder(tf.bool,name='in_training_mode')
# Two networks with matching variable layouts.  In this script mainQ
# selects actions / bootstrap values while targetQ is the one trained,
# then periodically copied into mainQ below.
# NOTE(review): 'maimQ' looks like a typo for 'mainQ'; harmless here since
# the string is only used as a scope label.
mainQ, mainQ_outputs = Q_Network(X,'maimQ')
targetQ, targetQ_outputs = Q_Network(X,'targetQ')
X_action = tf.placeholder(tf.int32, shape=(None,),name='X_action')
# Q-value of the action actually taken, shape (batch, 1).
# NOTE(review): keep_dims is the deprecated spelling of keepdims in TF1.
Q_action = tf.reduce_sum(
    targetQ_outputs * tf.one_hot(X_action, n_outputs),
    axis=-1, keep_dims=True )
# Ops that overwrite each mainQ variable with its targetQ counterpart,
# matched via the scope-relative names returned by Q_Network.
copy_op = [tf.assign(main_name,targetQ[var_name])
           for var_name, main_name in mainQ.items() ]
copy_target_to_main = tf.group(*copy_op)
# TD target placeholder and mean-squared TD error; Adam minimizes it.
y = tf.placeholder( tf.float32, shape=(None,1), name='y' )
loss = tf.reduce_mean( tf.square(y-Q_action) )
optimazer = tf.train.AdamOptimizer(learning_rate)  # sic: 'optimazer'
training_op = optimazer.minimize(loss)
loss_summary = tf.summary.scalar('LOSS',loss)
merge_summary = tf.summary.merge_all()
file_writer = tf.summary.FileWriter(logdir,tf.get_default_graph())
train_loss = None  # last mini-batch loss, printed per episode
init = tf.global_variables_initializer()
# ---- Main DQN training loop ----
with tf.Session() as sess:
    init.run()
    for i in range(num_episodes):
        done = False
        obs = ModelReset()
        epoch = 0
        episodic_reward = 0
        action_counter = Counter()
        episodic_loss = []
        while not done:
            # Greedy action from the online network...
            actions = mainQ_outputs.eval(
                feed_dict={X: [obs], in_training_mode: False})
            action = np.argmax(actions, axis=-1)
            action_counter[str(action)] += 1
            # ...possibly replaced by a random one (epsilon-greedy).
            action, epsilonn = epsilon_greedy(action, global_step)
            next_obs, reward, done = ModelStep(n_to_float_action(action))
            # Store the transition in the ring buffer.
            exp_buffer[exp_buffer_pos, :] = np.array(
                [obs, action, next_obs, reward, done], dtype=object)
            exp_buffer_pos += 1
            if exp_buffer_pos >= exp_buffer_length:
                exp_buffer_pos = 0
                exp_buffer_full = True
            # Periodic SGD update on a sampled mini-batch.
            if global_step % steps_train == 0 and global_step > start_steps:
                o_obs, o_act, o_next_obs, o_rew, o_done = sample_memories(batch_size)
                o_obs = [x for x in o_obs]
                o_next_obs = [x for x in o_next_obs]
                next_act = mainQ_outputs.eval(
                    feed_dict={X: o_next_obs, in_training_mode: False})
                # One-step TD target: r + gamma * max_a' Q(s', a').
                # NOTE(review): terminal transitions are not masked with
                # (1 - done), so terminal states still bootstrap.
                y_batch = o_rew + discount_factor * np.max(next_act, axis=-1)
                # BUG FIX: np.float was removed in NumPy 1.24; the builtin
                # float is the correct dtype for these conversions.
                train_loss, _ = sess.run(
                    [loss, training_op],
                    feed_dict={X: np.array(o_obs, dtype=float),
                               y: np.expand_dims(
                                   np.array(y_batch, dtype=float), axis=-1),
                               X_action: np.array(o_act, dtype=np.int32),
                               in_training_mode: True})
            # Periodically copy the trained (target) weights into mainQ.
            if (global_step + 1) % copy_steps == 0 and global_step > start_steps:
                copy_target_to_main.run()
                print('copy_target_to_main.run()')
            obs = next_obs
            epoch += 1
            global_step += 1
            episodic_reward += reward
        print('Episode', i, 'Reward', episodic_reward, 'epsilon', epsilonn,
              'loss', train_loss )
        # Plot the car's trajectory every 100 episodes.
        if (i + 1) % 100 == 0:
            plt.plot(x_Data, y_Data)
            plt.show()
答案 0（得分：0）：
我在我的简单程序中发现了问题。我必须规范化状态向量和奖励。所以这些值必须在区间 [-1, 1] 内。但我没有这样做。当我这样做时,我的简单程序开始运行良好。