I am using Actor-Critic to solve the Acrobot-v1 environment successfully. Once that run finishes, my program calls a function that runs Actor-Critic on the CartPole-v1 environment in the same TensorFlow session. Before that second run starts, I want to re-initialize only the weights of the output layer while keeping all other layers and variables unchanged. I haven't been able to find a way to do this online. How can I do it?
This is my actor:
def __init__(self, hidden_layers_size, gamma, learning_rate, input_size, num_of_actions):
    self.state = tf.placeholder(shape=(None, input_size), dtype=tf.float32, name='pg_states')
    self.R_t = tf.placeholder(shape=None, dtype=tf.float32, name='pg_q')
    self.action = tf.placeholder(shape=(num_of_actions,), dtype=tf.float32, name='pg_actions')
    _layer = self.state
    # Give each hidden layer a unique name; reusing "first_layer" would collide
    # if hidden_layers_size contains more than one entry.
    for i, l in enumerate(hidden_layers_size):
        _layer = tf.layers.dense(inputs=_layer, units=l, activation=tf.nn.relu,
                                 kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                 name="first_layer_%d" % i)
    # Output (logits) layer: this is the layer whose weights I want to re-initialize.
    self.last_layer = tf.layers.dense(inputs=_layer, units=num_of_actions, activation=None,  # linear activation
                                      kernel_initializer=tf.contrib.layers.xavier_initializer(),
                                      name="second_layer")
    self.actions_distribution = tf.nn.softmax(self.last_layer, name="action_distri")
    self.log_policy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.last_layer,
                                                                 labels=self.action, name="log")
    self.loss = tf.reduce_mean(self.R_t * self.log_policy, name="loss")
    self.optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(self.loss)
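For reference, what I have in mind is something like the sketch below: collect only the variables created under the output layer's name (I'm assuming the dense layer's kernel and bias end up under a scope called "second_layer", matching the name I passed to tf.layers.dense, and that `sess` is the session that is already running), then run an initializer op over just that list so every other variable keeps its trained value. I haven't verified that this is the right approach, which is why I'm asking:

    # Unverified sketch: re-initialize only the actor's output layer.
    # Assumes the variables live under the "second_layer" scope and that
    # `sess` is the existing tf.Session.
    output_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="second_layer")
    reset_output_op = tf.variables_initializer(output_vars)
    sess.run(reset_output_op)  # all other weights are left untouched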
This is my critic:
def __init__(self, state_size, action_size, learning_rate):
    self.state_size = state_size
    self.action_size = action_size
    self.learning_rate = learning_rate
    self.state = tf.placeholder(tf.float32, [None, state_size], name="value_state")
    self.R_t = tf.placeholder(tf.float32, name="total_rewards")
    layer_1 = tf.contrib.layers.fully_connected(self.state, 24, activation_fn=tf.nn.relu,
                                                weights_initializer=tf.contrib.layers.xavier_initializer(seed=0))
    hidden_layer = tf.contrib.layers.fully_connected(layer_1, 24, activation_fn=tf.nn.relu,
                                                     weights_initializer=tf.contrib.layers.xavier_initializer(seed=0))
    layer_2 = tf.contrib.layers.fully_connected(hidden_layer, 1, activation_fn=None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer(seed=0))
    self.output = tf.squeeze(layer_2, name="value_output")
    # Mean squared error between the predicted state value and the observed return
    self.loss = tf.reduce_mean(tf.squared_difference(self.output, self.R_t))
    self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
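The critic's fully_connected layers have no explicit names, so I assume I would first have to give its output layer an explicit scope before I could select and reset it the same way. Something like this (the scope name "critic_output" is just one I made up for illustration, and again I haven't verified it works):

    # Unverified sketch: name the critic's output layer so its variables can be
    # collected and re-initialized on their own, as with the actor above.
    layer_2 = tf.contrib.layers.fully_connected(hidden_layer, 1, activation_fn=None,
                                                weights_initializer=tf.contrib.layers.xavier_initializer(seed=0),
                                                scope="critic_output")  # hypothetical scope name
    critic_output_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="critic_output")
    sess.run(tf.variables_initializer(critic_output_vars))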