I'm hoping for some help with a DDPG implementation. Everything runs, and the critic network's Q values are being created correctly, but the resulting actions always end up pinned at the maximum or minimum allowed value. That behaviour suggests something is wrong with my gradient computation, but I can't figure out what.
For testing purposes I'm using a very simple input that the agent should be able to learn, and for which I know the correct answer:
state | action | reward
A | 100 | 100
A | 100 | 100
A | 100 | 100
A | 100 | 100
A | 200 | 200
A | 200 | 0
A | 200 | 0
A | 200 | 0
Since 200 is only accepted one time in four, its expected value is 50, whereas 100 is always accepted, which means its expected value is 100. So, given that the state is always the same, the output should always be 100. I should point out that future rewards are currently being ignored; this is just a model that maximizes a single session.
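To make that arithmetic explicit, here is a quick sanity check of the empirical expected reward per action from the table above (plain Python, purely for illustration):

# (action, reward) pairs for state A, taken from the table above
samples = [(100, 100), (100, 100), (100, 100), (100, 100),
           (200, 200), (200, 0), (200, 0), (200, 0)]
for act in (100, 200):
    rewards = [r for a, r in samples if a == act]
    print(act, sum(rewards) / len(rewards))  # prints "100 100.0" and "200 50.0"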
The code is a bit long, so I've cut out the parts that aren't relevant:
def build_actor_net(s, scope, trainable, a_dim, act_max, act_min, features):
    with tf.variable_scope(scope):
        s = tf.feature_column.input_layer(features=features, feature_columns=s)
        init_w = tf.random_normal_initializer(0., 0.003)
        init_b = tf.constant_initializer(0.1)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        net = tf.layers.batch_normalization(s, fused=True)
        net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable, kernel_regularizer=regularizer)
        net = tf.layers.batch_normalization(net, fused=True)
        net = tf.layers.dense(net, 300, activation=tf.nn.relu, kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable, kernel_regularizer=regularizer)
        with tf.variable_scope('actor_action'):
            actions = tf.layers.dense(net, a_dim, activation=tf.nn.sigmoid, name='actions', trainable=trainable)
            # Scale the sigmoid output from [0, 1] into [act_min, act_max]
            scaled_a = tf.add(tf.multiply(actions, tf.subtract(act_max, act_min)), act_min, name='scaled_a')
    return scaled_a
def build_critic_net(s, a, scope, trainable, s_dim, a_dim, features):
    with tf.variable_scope(scope):
        s = tf.feature_column.input_layer(features=features, feature_columns=s)
        init_w = tf.random_normal_initializer(0., 0.003)
        init_b = tf.constant_initializer(0.1)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)
        net = tf.layers.batch_normalization(s, fused=True)
        net = tf.layers.dense(net, 400, activation=tf.nn.relu, kernel_initializer=init_w, bias_initializer=init_b, name='l1', trainable=trainable, kernel_regularizer=regularizer)
        # The action enters the critic at the second layer
        net = tf.layers.batch_normalization(net + a, fused=True)
        net = tf.layers.dense(net, 300, activation=tf.nn.relu, kernel_initializer=init_w, bias_initializer=init_b, name='l2', trainable=trainable, kernel_regularizer=regularizer)
        with tf.variable_scope('q'):
            q = tf.layers.dense(net, a_dim, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable, kernel_regularizer=regularizer)  # Q(s, a)
    return q
def model_fn(features, mode, params):
    state = ...  # (several state columns go here that don't matter; state_ below holds the equivalent next-state columns, elided the same way)
    reward = tf.feature_column.numeric_column('reward')
    action = tf.feature_column.numeric_column('action')
    # ---------------------- Build Actor Networks ---------------------------
    with tf.variable_scope('Actor'):
        act_a = build_actor_net(state, 'act_eval_net', True, params['a_dim'], params['act_max'], params['act_min'], features)
        act_a_ = build_actor_net(state_, 'act_target_net', True, params['a_dim'], params['act_max'], params['act_min'], features)
    # ---------------------- Build Critic Networks ---------------------------
    with tf.variable_scope('Critic'):
        crit_a = tf.feature_column.input_layer(features=features, feature_columns=[action])  # Interchange action and mult here
        crit_q = build_critic_net(state, crit_a, 'crit_eval_net', True, params['s_dim'], params['a_dim'], features)
        # Copy of the eval critic, fed with the actor's own output, used only for the actor update
        crit_actor_update = build_critic_net(state, act_a, 'crit_update_actor_net', True, params['s_dim'], params['a_dim'], features)
        crit_q_ = build_critic_net(state_, act_a_, 'crit_target_net', True, params['s_dim'], params['a_dim'], features)
    # ---------------------- Set up target, loss, and gradient --------------------
    with tf.variable_scope('target_q'):
        r = tf.feature_column.input_layer(features=features, feature_columns=[reward])
        crit_target_q = r  # + params['gamma'] * crit_q_  # (Session-max model)
    with tf.variable_scope('crit_loss'):
        crit_loss = tf.reduce_mean(tf.squared_difference(crit_target_q, crit_q))
    with tf.variable_scope('update'):
        act_e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/act_eval_net')
        act_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/act_target_net')
        crit_e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_eval_net')
        crit_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_target_net')
        crit_update_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_update_actor_net')
        # Keep the actor-update copy of the critic in sync with the eval critic
        crit_update_replace_op = [crit_update_params[i].assign(crit_e_params[i]) for i in range(len(crit_update_params))]
        with tf.control_dependencies(crit_update_replace_op):
            # Soft-update the target networks with rate tau
            crit_target_replace_op = [crit_t_params[i].assign(tf.multiply(crit_e_params[i], params['tau']) +
                                                              tf.multiply(crit_t_params[i], 1. - params['tau'])) for i in range(len(crit_t_params))]
        with tf.control_dependencies(crit_target_replace_op):
            act_target_replace_op = [act_t_params[i].assign(tf.multiply(act_e_params[i], params['tau']) +
                                                            tf.multiply(act_t_params[i], 1. - params['tau'])) for i in range(len(act_t_params))]
    with tf.variable_scope('C_train'):
        reg = tf.losses.get_regularization_loss()
        with tf.control_dependencies(act_target_replace_op):
            crit_train_op = tf.train.AdamOptimizer(params['clr']).minimize(loss=crit_loss, global_step=tf.train.get_global_step())
    with tf.variable_scope('a_grad'):
        with tf.control_dependencies([crit_train_op]):  # Not sure about this; may or may not be necessary
            with tf.control_dependencies([act_a]):
                a_grads = tf.gradients(crit_actor_update, act_a)[0]  # tensor of gradients of each sample (None, a_dim)
    with tf.variable_scope('A_train'):
        # Chain dQ/da through the actor's weights, then average over the batch
        act_policy_grads = tf.gradients(ys=act_a, xs=act_e_params, grad_ys=a_grads)
        actor_gradients = list(map(lambda x: tf.div(x, params['bsize']), act_policy_grads))
        with tf.control_dependencies([crit_train_op]):
            # Negative learning rate so apply_gradients performs gradient ascent on Q
            act_train_op = tf.train.AdamOptimizer(-params['alr']).apply_gradients(zip(actor_gradients, act_e_params))
    return tf.estimator.EstimatorSpec(mode=mode, loss=crit_loss, train_op=tf.group(crit_train_op, act_train_op))
There is also a main function that sets some parameters (assume everything referenced in the code above is defined there) and then calls the Estimator:
DDPG = tf.estimator.Estimator(model_fn=model_fn, params=params, model_dir=model_dir)
DDPG.train(input_fn=lambda: my_input_fn(path,True,args.maxe,args.batch,args.buffer,feature_names))
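For reference, the params dict just needs the keys used in the code above; the numbers below are placeholders for illustration (only the action bounds of 0 and 200 match the problem), not my actual settings:

params = {
    's_dim': 1,        # number of state features (placeholder)
    'a_dim': 1,        # one continuous action
    'act_min': 0.0,    # lower bound used to scale the actor's sigmoid output
    'act_max': 200.0,  # upper bound used to scale the actor's sigmoid output
    'gamma': 0.9,      # discount factor; unused here since future rewards are ignored
    'tau': 0.01,       # soft-update rate for the target networks (placeholder)
    'clr': 0.001,      # critic learning rate (placeholder)
    'alr': 0.0001,     # actor learning rate (placeholder)
    'bsize': 32,       # batch size the actor gradients are divided by (placeholder)
}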
I've also skipped the code for my_input_fn, but assume it works correctly.
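If it helps to reproduce this, a minimal stand-in for my_input_fn that just feeds the eight toy transitions above would look roughly like this (a simplified sketch, not my actual pipeline, which reads from path with shuffling and a buffer):

def toy_input_fn(batch_size=8):
    # The eight toy transitions from the table, as a dict of feature tensors.
    # 'state' is a single placeholder column here; the real data has several state columns.
    features = {
        'state':  [0.] * 8,
        'action': [100., 100., 100., 100., 200., 200., 200., 200.],
        'reward': [100., 100., 100., 100., 200., 0., 0., 0.],
    }
    dataset = tf.data.Dataset.from_tensor_slices(features)
    return dataset.repeat().batch(batch_size)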
I'm fairly sure there is a problem with my gradients somewhere, whether in the calculation of the gradients themselves, in how they are backpropagated, or in the train op. The end result is that every action the model ends up recommending is either the maximum allowed action (200) or the minimum allowed action (0), never the correct action (100). If anyone can shed some light on where I've gone wrong, I'd be very grateful.
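For reference, the update I'm trying to implement is the deterministic policy gradient: take dQ/da from the critic, evaluated at the actor's own output a = mu(s), and chain it through dmu/dtheta of the actor, then do gradient ascent. A stripped-down sketch of the two equivalent ways I understand this is usually written in TF1 (q_of_policy, mu, actor_params and actor_lr are placeholder names for Q(s, mu(s)), the actor output, the actor's weights and its learning rate, not my actual variables):

# 1) Explicit chain rule, which is what my code above attempts:
dq_da = tf.gradients(q_of_policy, mu)[0]                            # dQ/da, shape (batch, a_dim)
policy_grads = tf.gradients(ys=mu, xs=actor_params, grad_ys=dq_da)  # chain through the actor
ascend_op = tf.train.AdamOptimizer(-actor_lr).apply_gradients(zip(policy_grads, actor_params))

# 2) Equivalent loss form: maximize Q(s, mu(s)) over the actor's weights only
actor_loss = -tf.reduce_mean(q_of_policy)
actor_train_op = tf.train.AdamOptimizer(actor_lr).minimize(actor_loss, var_list=actor_params)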