What is the difference between tf.keras.layers.Dense and tf.layers.Dense?

For some reason, my DQN only converges with tf.layers.Dense and never converges with tf.keras.layers.Dense. I expected them to be identical, since according to the documentation you are supposed to use tf.keras.layers.Dense instead of tf.layers.Dense.
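To make "identical" concrete, this is the kind of sanity check I would expect to pass. It is only a sketch: the explicit shared initializer, the toy shapes, the layer names and the fixed seed are mine and not part of my actual model.

import numpy as np
import tensorflow as tf

# Sketch: build one layer of each class with the same explicit initializer
# and compare the forward pass on the same random batch.
tf.reset_default_graph()
init = tf.glorot_uniform_initializer(seed=0)
x = tf.placeholder(tf.float32, shape=(None, 4))
y_old = tf.layers.Dense(8, activation=tf.nn.relu, kernel_initializer=init, name='old')(x)
y_new = tf.keras.layers.Dense(8, activation=tf.nn.relu, kernel_initializer=init, name='new')(x)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    batch = np.random.RandomState(0).randn(2, 4).astype(np.float32)
    out_old, out_new = sess.run([y_old, y_new], {x: batch})
    print(np.allclose(out_old, out_new))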
Some background:
I am trying to implement a distributional Q-learning model in TensorFlow. After getting stuck, I switched from tf.keras.layers.Dense to tf.layers.Dense, and for some reason the network now converges. That was the only change; I have double-checked it several times.
I am using Python 3.6 and TensorFlow 1.13.1.
Converges:
layer_in = tf.reshape(inputs, (tf.shape(inputs)[0], layer_in_size))
for layer_idx, out_size in enumerate(layer_sizes):
    with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
        layer_in = tf.layers.Dense(out_size, activation="relu")(layer_in)
Does not converge:
layer_in = tf.reshape(inputs, (tf.shape(inputs)[0], layer_in_size))
for layer_idx, out_size in enumerate(layer_sizes):
    with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
        layer_in = tf.keras.layers.Dense(out_size, activation="relu")(layer_in)
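To rule out an obvious structural difference, I also dumped the trainable variables created by each version. This is just a debugging sketch; the build_mlp helper and the toy input shape below are mine, not from the model:

import tensorflow as tf

def build_mlp(layer_cls, inputs, layer_sizes):
    # Same loop as above, parameterized over the Dense class being tested.
    layer_in = inputs
    for layer_idx, out_size in enumerate(layer_sizes):
        with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
            layer_in = layer_cls(out_size, activation="relu")(layer_in)
    return layer_in

for layer_cls in (tf.layers.Dense, tf.keras.layers.Dense):
    graph = tf.Graph()
    with graph.as_default():
        x = tf.placeholder(tf.float32, shape=(None, 16))
        build_mlp(layer_cls, x, [32, 32, 32, 32])
        print(layer_cls.__module__)
        for v in tf.trainable_variables():
            print('  ', v.name, v.shape)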
It is hard to build a minimal example for this, because I don't know where the problem occurs. So, for those interested, here is the full model code (it is based on anyrl's code); since I don't know where the issue arises, quite a lot of code may be involved:
import math

import tensorflow as tf

# take_vector_elems and _kl_divergence are helper functions from the anyrl
# codebase that this model is based on (not shown here).


class DistQNetwork:
    def __init__(self, num_actions, state_shape, name, num_atoms, min_val, max_val, dueling=False):
        """
        Create a distributional network.

        Args:
          num_actions: size of action space.
          state_shape: shape of a single observation.
          name: name for this model.
          num_atoms: number of distribution atoms.
          min_val: minimum atom value.
          max_val: maximum atom value.
          dueling: if True, use a separate baseline and
            per-action value function.
        """
        self.num_actions = num_actions
        self.state_shape = state_shape
        self.name = name
        self.dueling = dueling
        self.dist = ActionDist(num_atoms, min_val, max_val)
        old_vars = tf.trainable_variables()
        with tf.variable_scope(name):
            self.step_obs_ph = tf.placeholder(self.input_dtype,
                                              shape=(None,) + self.state_shape)
            self.step_base_out = self.base(self.step_obs_ph)
            log_probs = self.value_func(self.step_base_out)
            values = self.dist.mean(log_probs)
            self.step_outs = (values, log_probs)
            self.actions_out = tf.argmax(values, axis=1, output_type=tf.int32)
        # Only track the variables created by this network.
        self.variables = [v for v in tf.trainable_variables() if v not in old_vars]
        tf.get_default_session().run(tf.global_variables_initializer())

    def update_variables(self, other):
        assigns = []
        for w_agent, w_target in zip(other.variables, self.variables):
            assigns.append(tf.assign(w_target, w_agent, validate_shape=True))
        tf.get_default_session().run(assigns)

    def transition_loss(self, target_net, obses, actions, rews, new_obses, terminals, discounts):
        with tf.variable_scope(self.name, reuse=True):
            max_actions = tf.argmax(self.dist.mean(self.value_func(self.base(new_obses))),
                                    axis=1, output_type=tf.int32)
        with tf.variable_scope(target_net.name, reuse=True):
            target_preds = target_net.value_func(target_net.base(new_obses))
            target_preds = tf.where(terminals,
                                    tf.zeros_like(target_preds) - math.log(self.dist.num_atoms),
                                    target_preds)
        discounts = tf.where(terminals, tf.zeros_like(discounts), discounts)
        target_dists = self.dist.add_rewards(tf.exp(take_vector_elems(target_preds, max_actions)),
                                             rews, discounts)
        with tf.variable_scope(self.name, reuse=True):
            online_preds = self.value_func(self.base(obses))
            onlines = take_vector_elems(online_preds, actions)
            return _kl_divergence(tf.stop_gradient(target_dists), onlines)

    def value_func(self, feature_batch):
        """
        Go from a 2-D Tensor of feature vectors to a 3-D
        Tensor of predicted action distributions.

        Args:
          feature_batch: a batch of features from base().

        Returns:
          A Tensor of shape [batch x actions x atoms].
          All probabilities are computed in the log domain.
        """
        logits = tf.layers.Dense(self.num_actions * self.dist.num_atoms, activation="relu")(feature_batch)
        actions = tf.reshape(logits, (tf.shape(logits)[0], self.num_actions, self.dist.num_atoms))
        if not self.dueling:
            return tf.nn.log_softmax(actions)
        values = tf.expand_dims(tf.layers.Dense(self.dist.num_atoms, activation="relu")(feature_batch), axis=1)
        actions -= tf.reduce_mean(actions, axis=1, keepdims=True)
        return tf.nn.log_softmax(values + actions)

    @property
    def input_dtype(self):
        return tf.float32

    def base(self, obs_batch):
        return simple_mlp(obs_batch, [32, 32, 32, 32])

    def predict(self, states):
        sess = tf.get_default_session()
        return sess.run(self.actions_out, {self.step_obs_ph: states})

def product(vals):
    """
    Compute the product of values in a list-like object.
    """
    prod = 1
    for val in vals:
        prod *= val
    return prod


def simple_mlp(inputs, layer_sizes):
    """
    Apply a simple multi-layer perceptron model to the
    batch of inputs.

    Args:
      inputs: the batch of inputs. This may have any shape
        with at least two dimensions, provided all the
        sizes are known ahead of time besides the batch
        size.
      layer_sizes: a sequence of hidden layer sizes.
    """
    layer_in_size = product([x.value for x in inputs.get_shape()[1:]])
    layer_in = tf.reshape(inputs, (tf.shape(inputs)[0], layer_in_size))
    for layer_idx, out_size in enumerate(layer_sizes):
        with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
            layer_in = tf.layers.Dense(out_size, activation="relu")(layer_in)
    return layer_in

class ActionDist:
    """
    A discrete reward distribution.
    """
    def __init__(self, num_atoms, min_val, max_val):
        assert num_atoms >= 2
        assert max_val > min_val
        self.num_atoms = num_atoms
        self.min_val = min_val
        self.max_val = max_val
        self._delta = (self.max_val - self.min_val) / (self.num_atoms - 1)

    def atom_values(self):
        """Get the reward values for each atom."""
        return [self.min_val + i * self._delta for i in range(0, self.num_atoms)]

    def mean(self, log_probs):
        """Get the mean rewards for the distributions."""
        probs = tf.exp(log_probs)
        return tf.reduce_sum(probs * tf.constant(self.atom_values(), dtype=probs.dtype), axis=-1)

    def add_rewards(self, probs, rewards, discounts):
        """
        Compute new distributions after adding rewards to
        old distributions.

        Args:
          probs: a batch of probability vectors.
          rewards: a batch of rewards.
          discounts: the discount factors to apply to the
            distribution rewards.

        Returns:
          A new batch of probability vectors.
        """
        atom_rews = tf.tile(tf.constant([self.atom_values()], dtype=probs.dtype),
                            tf.stack([tf.shape(rewards)[0], 1]))
        fuzzy_idxs = tf.expand_dims(rewards, axis=1) + tf.expand_dims(discounts, axis=1) * atom_rews
        fuzzy_idxs = (fuzzy_idxs - self.min_val) / self._delta
        # If the position were exactly 0, rounding up
        # and subtracting 1 would cause problems.
        fuzzy_idxs = tf.clip_by_value(fuzzy_idxs, 1e-18, float(self.num_atoms - 1))
        indices_1 = tf.cast(tf.ceil(fuzzy_idxs) - 1, tf.int32)
        fracs_1 = tf.abs(tf.ceil(fuzzy_idxs) - fuzzy_idxs)
        indices_2 = indices_1 + 1
        fracs_2 = 1 - fracs_1
        res = tf.zeros_like(probs)
        for indices, fracs in [(indices_1, fracs_1), (indices_2, fracs_2)]:
            index_matrix = tf.expand_dims(tf.range(tf.shape(indices)[0], dtype=tf.int32), axis=1)
            index_matrix = tf.tile(index_matrix, (1, self.num_atoms))
            scatter_indices = tf.stack([index_matrix, indices], axis=-1)
            res = res + tf.scatter_nd(scatter_indices, probs * fracs, tf.shape(res))
        return res
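For anyone who wants to poke at the distribution code in isolation, here is a tiny usage sketch of ActionDist.add_rewards; the atom count, reward and discount values are made up. Each atom's value is shifted to reward + discount * atom and its mass is split between the two nearest atoms of the support, so each output row should still sum to 1.

# Toy check of the categorical projection in add_rewards: three atoms at
# 0.0, 1.0, 2.0, a single distribution, reward 0.5 and discount 1.0.
dist = ActionDist(num_atoms=3, min_val=0.0, max_val=2.0)
probs = tf.constant([[0.2, 0.5, 0.3]], dtype=tf.float32)
rewards = tf.constant([0.5], dtype=tf.float32)
discounts = tf.constant([1.0], dtype=tf.float32)
projected = dist.add_rewards(probs, rewards, discounts)

with tf.Session() as sess:
    # Shifted atom positions are clipped to the support before the mass
    # is scattered onto the neighbouring atoms.
    print(sess.run(projected))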