在Actor-Critic with Gaussian的代码中,
class PolicyEstimator():
    """
    Policy function approximator for a one-dimensional Gaussian policy.

    Inside ``scope`` the constructor builds two single-output linear layers
    on top of the ``state`` placeholder. One yields the mean ``mu``; the
    other, passed through a softplus plus a small offset, yields the
    standard deviation ``sigma``. ``action`` is a sample drawn from
    ``Normal(mu, sigma)``.

    Args:
        learning_rate: Not used in the portion of the constructor shown here.
        scope: Name of the variable scope that wraps every op created below.
    """
    def __init__(self, learning_rate=0.01, scope="policy_estimator"):
        with tf.variable_scope(scope):
            # Input state: a float32 vector of length 400.
            self.state = tf.placeholder(tf.float32, [400], "state")
            # Float32 placeholder with no fixed shape; not used in the code shown here.
            self.target = tf.placeholder(dtype=tf.float32, name="target")
            # Mean head: one linear layer (no activation) with zero-initialised
            # weights. expand_dims adds a leading axis, so the layer input is (1, 400).
            self.mu = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)
            # Drop the size-1 axes so mu is a scalar.
            self.mu = tf.squeeze(self.mu)
            # Standard-deviation head: same structure as the mean head.
            self.sigma = tf.contrib.layers.fully_connected(
                inputs=tf.expand_dims(self.state, 0),
                num_outputs=1,
                activation_fn=None,
                weights_initializer=tf.zeros_initializer)
            self.sigma = tf.squeeze(self.sigma)
            # softplus maps the raw output to a positive value; the 1e-5 offset
            # keeps sigma away from zero.
            self.sigma = tf.nn.softplus(self.sigma) + 1e-5
            # Gaussian over the action, parameterised by the two heads above.
            self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
            # Draw one sample from the distribution.
            # NOTE(review): _sample_n is underscore-prefixed (non-public by
            # convention); the public sample() may be preferable - confirm
            # against the TensorFlow version in use.
            self.action = self.normal_dist._sample_n(1)
初始化正态分布的实例
# Build the Gaussian from the mean (mu) and standard deviation (sigma) tensors.
self.normal_dist = tf.contrib.distributions.Normal(self.mu, self.sigma)
采样
# Request n=1 sample from the distribution.
self.action = self.normal_dist._sample_n(1)
由于env的维数为1,因此代码仅采样一个动作。但是,如果动作空间为40或更大,该如何采样该动作?
# NOTE(review): this only changes the sample count to n=40; presumably it
# yields 40 draws from the same scalar distribution rather than one
# 40-dimensional action - confirm.
self.action = self.normal_dist._sample_n(40)
我认为这意味着采样40个一维的动作,而不是采样一个40维的动作。
如何采样一个维度为40或更高的动作?
答案 0 :(得分:2)
要创建形状为 (40) 的动作向量,您需要让网络的最后一层输出形状为 (40) 的向量。因此,请更改:
# Original mean layer: a single output unit.
self.mu = tf.contrib.layers.fully_connected(
    inputs=tf.expand_dims(self.state, 0),
    num_outputs=1,
    activation_fn=None,
    weights_initializer=tf.zeros_initializer)
改为:
# Revised mean layer: 40 output units, i.e. one mean per action dimension.
self.mu = tf.contrib.layers.fully_connected(
    inputs=tf.expand_dims(self.state, 0),
    num_outputs=40,
    activation_fn=None,
    weights_initializer=tf.zeros_initializer)
这意味着 self.mu(被馈送到 tf.distributions.Normal)将是形状为 (40) 的向量。
您可以对 sigma 执行相同的操作,但是根据我的经验,最好将其视为可训练的参数,而不是网络的输出,例如:
import numpy as np
import tensorflow as tf
# Example: sample a 40-dimensional action from a Gaussian policy whose mean
# comes from a dense layer and whose log standard deviation is a free variable.
state_dim = 3 # 3-dimensional state
action_dim = 40 # 40-dimensional action
action_bound = 2 # The mean is scaled to lie between -2 and +2 (samples are not clipped)
# Define ops for actor/policy
# Batch of states: any number of rows, state_dim columns.
state = tf.placeholder(tf.float32, [None, state_dim])
# Dense layer which takes an input of shape 3 and outputs shape 40; tanh keeps
# each mean component in [-1, 1] before it is scaled by action_bound below.
mu = tf.layers.dense(state, action_dim, tf.nn.tanh, name='pi_mu')
# Learn log(sigma) rather than sigma so that exp() always gives a positive
# scale and NaNs are avoided (initialised to 0).
log_sigma = tf.get_variable(name="log_sigma", shape=action_dim, initializer=tf.zeros_initializer())
# Create a Gaussian over the 40 action components (initially sigma = exp(0) = 1)
dist = tf.distributions.Normal(loc=mu * action_bound, scale=tf.exp(log_sigma))
# dist.sample(1) adds a leading sample axis of size 1; squeezing axis 0 leaves
# one 40-D action per input state row, i.e. shape (batch, 40).
sample_op = tf.squeeze(dist.sample(1), axis=0)
# Start session
# NOTE(review): the session is never closed in this snippet.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
# Sample a 40D action for a single input state (batch of one row). The result
# is not stored; in an interactive session it is echoed.
sess.run(sample_op, feed_dict={state: np.array([[1, 0, -1]])})
输出:
array([[-0.12732446, -1.0969237 , 0.19172549, -0.53541076, -1.7409694 ,
-1.9716561 , -0.4621313 , 1.1770394 , -0.89807725, -0.428378 ,
0.43714064, 0.5723815 , -2.4273002 , -1.1083983 , -0.67126757,
1.4471897 , -1.9418054 , -0.3857537 , 0.3149717 , -0.5094094 ,
-0.9856905 , 1.1567912 , 0.37608355, -1.1339413 , 0.13634366,
-0.22886413, 1.2220807 , -0.9807693 , 1.5443543 , -0.01700211,
-0.30074215, 0.77911556, 1.0790621 , 1.4446486 , 0.11510286,
0.13127172, 0.9332013 , -0.22423705, 0.27746603, 0.03245509]],
dtype=float32)