DDPG (Actor-Critic) actions run to the minimum/maximum values

Time: 2018-09-20 20:17:14

Tags: python tensorflow reinforcement-learning

I'm hoping for some help with my DDPG implementation. Everything runs, and the critic network's Q-values are produced correctly, but the actions the actor outputs always end up pinned at either the maximum or the minimum allowed value. This behavior seems to suggest something is wrong with my gradient computation, but I can't work out what.

For testing purposes I'm using a very simple input that the agent should be able to learn, and for which I know the correct answer:

state | action | reward
  A   |  100   |   100
  A   |  100   |   100
  A   |  100   |   100
  A   |  100   |   100
  A   |  200   |   200
  A   |  200   |     0
  A   |  200   |     0
  A   |  200   |     0

An action of 200 only pays off once in four tries, so its expected value is 50, whereas an action of 100 is rewarded every time, so its expected value is 100. Given that the state is always the same, the output should therefore always be 100. I should note that future rewards are currently being ignored: this is just a model that maximizes the current session.
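Spelling the arithmetic out explicitly (these numbers come straight from the table above):

# Expected reward per action over the toy data above
rewards_100 = [100, 100, 100, 100]   # action 100 is rewarded every time
rewards_200 = [200, 0, 0, 0]         # action 200 pays off only once in four

print(sum(rewards_100) / len(rewards_100))  # 100.0 -> the action the actor should learn
print(sum(rewards_200) / len(rewards_200))  # 50.0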

The code is a bit long, so I've cut the irrelevant parts:

def build_actor_net(s, scope, trainable,a_dim,act_max,act_min,features):
    with tf.variable_scope(scope):
        s = tf.feature_column.input_layer(features=features, feature_columns=s)
        init_w = tf.random_normal_initializer(0., 0.003)
        init_b = tf.constant_initializer(0.1)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)

        net = tf.layers.batch_normalization(s,fused=True)
        net = tf.layers.dense(net, 400, activation=tf.nn.relu,kernel_initializer=init_w, bias_initializer=init_b, name='l1',trainable=trainable,kernel_regularizer=regularizer)
        net = tf.layers.batch_normalization(net,fused=True)
        net = tf.layers.dense(net, 300, activation=tf.nn.relu,kernel_initializer=init_w, bias_initializer=init_b, name='l2',trainable=trainable,kernel_regularizer=regularizer)
        with tf.variable_scope('actor_action'):
            actions = tf.layers.dense(net, a_dim, activation=tf.nn.sigmoid,name='actions', trainable=trainable)
            scaled_a = tf.add(tf.multiply(actions,tf.subtract(act_max,act_min)), act_min, name='scaled_a')
    return scaled_a

def build_critic_net(s, a, scope, trainable,s_dim,a_dim,features):
    with tf.variable_scope(scope):
        s = tf.feature_column.input_layer(features=features, feature_columns=s)
        init_w = tf.random_normal_initializer(0., 0.003)
        init_b = tf.constant_initializer(0.1)
        regularizer = tf.contrib.layers.l2_regularizer(scale=0.01)

        net = tf.layers.batch_normalization(s,fused=True)
        net = tf.layers.dense(net,400,activation=tf.nn.relu,kernel_initializer=init_w,bias_initializer=init_b,name='l1',trainable=trainable,kernel_regularizer=regularizer)
        net = tf.layers.batch_normalization(net+a,fused=True)
        net = tf.layers.dense(net, 300, activation=tf.nn.relu,kernel_initializer=init_w, bias_initializer=init_b, name='l2',trainable=trainable,kernel_regularizer=regularizer)

        with tf.variable_scope('q'):
            q = tf.layers.dense(net, a_dim, kernel_initializer=init_w, bias_initializer=init_b, trainable=trainable,kernel_regularizer=regularizer)# Q(s,a)
    return q

def model_fn(features, mode, params):
    state = (There are several state columns here that don't matter)
    reward     = tf.feature_column.numeric_column('reward')
    action     = tf.feature_column.numeric_column('action')

    # ---------------------- Build Actor Networks ---------------------------
    with tf.variable_scope('Actor'):
        act_a = build_actor_net(state, 'act_eval_net', True,params['a_dim'],params['act_max'],params['act_min'],features)

        act_a_ = build_actor_net(state_, 'act_target_net', True,params['a_dim'],params['act_max'],params['act_min'],features)

    # ---------------------- Build Critic Networks ---------------------------
    with tf.variable_scope('Critic'):
        crit_a = tf.feature_column.input_layer(features=features,feature_columns=[action]) #Interchange action and mult here

        crit_q = build_critic_net(state, crit_a, 'crit_eval_net', True,params['s_dim'],params['a_dim'],features)
        crit_actor_update = build_critic_net(state,act_a,'crit_update_actor_net',True,params['s_dim'],params['a_dim'],features)

        crit_q_ = build_critic_net(state_, act_a_, 'crit_target_net', True,params['s_dim'],params['a_dim'],features)

    # ---------------------- Set up target, loss, and gradient --------------------
    with tf.variable_scope('target_q'):
        r = tf.feature_column.input_layer(features=features, feature_columns=[reward])
        crit_target_q = r #+ params['gamma'] * crit_q_ #(Session-max model)

    with tf.variable_scope('crit_loss'):
        crit_loss = tf.reduce_mean(tf.squared_difference(crit_target_q, crit_q))

    with tf.variable_scope('update'):
        act_e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/act_eval_net')
        act_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Actor/act_target_net')
        crit_e_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_eval_net')
        crit_t_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_target_net')
        crit_update_params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope='Critic/crit_update_actor_net')

        crit_update_replace_op = [crit_update_params[i].assign(crit_e_params[i]) for i in range(len(crit_update_params))]
        with tf.control_dependencies(crit_update_replace_op):
            crit_target_replace_op = [crit_t_params[i].assign(tf.multiply(crit_e_params[i], params['tau']) +
                                              tf.multiply(crit_t_params[i], 1. - params['tau'])) for i in range(len(crit_t_params))]
        with tf.control_dependencies(crit_target_replace_op):
            act_target_replace_op =  [act_t_params[i].assign(tf.multiply(act_e_params[i], params['tau']) +
                                              tf.multiply(act_t_params[i], 1. - params['tau'])) for i in range(len(act_t_params))] 

    with tf.variable_scope('C_train'):
        reg = tf.losses.get_regularization_loss()
        with tf.control_dependencies(act_target_replace_op):
            crit_train_op = tf.train.AdamOptimizer(params['clr']).minimize(loss=crit_loss, global_step=tf.train.get_global_step())

    with tf.variable_scope('a_grad'):
        with tf.control_dependencies([crit_train_op]):  ### Not sure about this; may or may not be necessary
            with tf.control_dependencies([act_a]):
                a_grads = tf.gradients(crit_actor_update, act_a)[0]   # tensor of gradients of each sample (None, a_dim)

    with tf.variable_scope('A_train'):
        act_policy_grads = tf.gradients(ys=act_a, xs=act_e_params, grad_ys=a_grads)
        actor_gradients = list(map(lambda x: tf.div(x,params['bsize']),act_policy_grads))
        with tf.control_dependencies([crit_train_op]):
            act_train_op = tf.train.AdamOptimizer(-params['alr']).apply_gradients(zip(actor_gradients, act_e_params))

    return tf.estimator.EstimatorSpec(mode=mode,loss=crit_loss,train_op=tf.group(crit_train_op,act_train_op))

Additionally, there is a main function that sets up some parameters (assume everything referenced in the code above is defined) and then calls the Estimator:

DDPG = tf.estimator.Estimator(model_fn=model_fn, params=params, model_dir=model_dir)
DDPG.train(input_fn=lambda: my_input_fn(path,True,args.maxe,args.batch,args.buffer,feature_names))

I've also skipped the code for my_input_fn, but assume it works correctly.
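For reference, my_input_fn has roughly the following shape (just a sketch: the real version also manages the replay buffer, and the CSV parsing / header skip shown here are simplified placeholders):

import tensorflow as tf

def my_input_fn(path, shuffle, num_epochs, batch_size, buffer_size, feature_names):
    # Parse each CSV line into a dict of float features keyed by column name.
    def parse_line(line):
        defaults = [[0.0]] * len(feature_names)
        fields = tf.decode_csv(line, record_defaults=defaults)
        return dict(zip(feature_names, fields))

    dataset = tf.data.TextLineDataset(path).skip(1).map(parse_line)
    if shuffle:
        dataset = dataset.shuffle(buffer_size)
    dataset = dataset.repeat(num_epochs).batch(batch_size)
    return dataset.make_one_shot_iterator().get_next()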

I'm fairly sure something is wrong with my gradients somewhere, whether in the computation of the gradients themselves, in how they are backpropagated, or in the train op. The end result is that every action I end up recommending is either the maximum allowed action (200) or the minimum allowed action (0), rather than the correct action (100). If anyone can help shed light on my mistake, I'd greatly appreciate it.
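For what it's worth, the actor step I'm trying to express is the standard DDPG update: maximize Q(s, mu(s)) with respect to the actor's parameters only. An equivalent formulation I've been considering (sketch only, reusing the tensors already defined in model_fn above) would sidestep the manual gradient plumbing entirely:

# Equivalent actor update, using crit_actor_update and act_e_params from model_fn:
# maximizing Q(s, mu(s)) over the actor's variables is the same as minimizing -Q.
actor_loss = -tf.reduce_mean(crit_actor_update)
act_train_op = tf.train.AdamOptimizer(params['alr']).minimize(
    actor_loss, var_list=act_e_params)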

0 Answers:

There are no answers yet.