tf.keras.layers.Dense 和 tf.layers.Dense 有什么区别?出于某种原因,我的 DQN 只有在使用 tf.layers.Dense 时才会收敛,而使用 tf.keras.layers.Dense 时从不收敛。我原以为它们是相同的,因为根据文档,应该使用 tf.keras.layers.Dense 而不是 tf.layers.Dense。

一些背景:我正在尝试用 TensorFlow 实现分布(distributional)Q 学习模型。在卡住之后,我出于某种原因把 tf.keras.layers.Dense 换成了 tf.layers.Dense,网络随即开始收敛,而这是唯一的改动——我已经反复仔细检查过多次。

我正在使用python 3.6和TensorFlow 1.13.1


# Variant reported to converge: hidden layers are built with tf.layers.Dense.
# (Excerpt of simple_mlp; `inputs`, `layer_in_size` and `layer_sizes` come
# from the enclosing function.)
layer_in = tf.reshape(inputs, (tf.shape(inputs)[0], layer_in_size))
for layer_idx, out_size in enumerate(layer_sizes):
    with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
        layer_in = tf.layers.Dense(out_size, activation="relu")(layer_in)


# Variant reported NOT to converge: identical except that the layers are
# built with tf.keras.layers.Dense.
# NOTE(review): Keras layers are presumably not governed by
# tf.variable_scope(..., reuse=True), so re-running this code would create
# fresh weights instead of sharing them -- confirm against the TF 1.x docs.
layer_in = tf.reshape(inputs, (tf.shape(inputs)[0], layer_in_size))
for layer_idx, out_size in enumerate(layer_sizes):
    with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
        layer_in = tf.keras.layers.Dense(out_size, activation="relu")(layer_in)


class DistQNetwork:
    """
    A Q-network that predicts, for every action, a discrete distribution
    over returns (see ActionDist) instead of a single scalar value.
    """

    def __init__(self, num_actions, state_shape, name, num_atoms, min_val, max_val,dueling=False):
        """
        Create a distributional network.

        Args:
          num_actions: size of action space.
          state_shape: shape of a single observation, without the
            batch dimension (any sequence of ints).
          name: name for this model; also used as the variable scope.
          num_atoms: number of distribution atoms.
          min_val: minimum atom value.
          max_val: maximum atom value.
          dueling: if True, use a separate baseline and
            per-action value function.
        """
        self.num_actions = num_actions
        self.state_shape = state_shape
        self.name = name
        self.dueling = dueling
        self.dist = ActionDist(num_atoms, min_val, max_val)
        # Snapshot the existing trainables so that the variables created
        # below can be identified by difference.
        old_vars = tf.trainable_variables()
        with tf.variable_scope(name):
            # input_dtype is a plain method, so it has to be called here;
            # passing the bound method itself is not a valid dtype.
            self.step_obs_ph = tf.placeholder(self.input_dtype(),
                                              shape=(None,) + tuple(self.state_shape))
            self.step_base_out = self.base(self.step_obs_ph)
            log_probs = self.value_func(self.step_base_out)
            values = self.dist.mean(log_probs)
            self.step_outs = (values, log_probs)
            self.actions_out = tf.argmax(values, axis=1, output_type=tf.int32)
        self.variables = [v for v in tf.trainable_variables() if v not in old_vars]

    def update_variables(self, other):
        """
        Build the ops that copy the variables of another network into
        this one.

        Args:
          other: the network to copy from. Its `variables` list is
            paired positionally with this network's `variables`.

        Returns:
          A single op that performs every assignment when run.
        """
        assigns = [tf.assign(w_target, w_agent, validate_shape=True)
                   for w_agent, w_target in zip(other.variables, self.variables)]
        # The assign ops only take effect when they are run, so they must
        # be handed back to the caller rather than dropped.
        return tf.group(*assigns)

    def transition_loss(self, target_net, obses, actions, rews, new_obses, terminals, discounts):
        """
        Build the loss for a batch of transitions.

        The next action is chosen with this (online) network and is
        evaluated with `target_net`.

        Args:
          target_net: the network used to evaluate the next state.
          obses: batch of observations.
          actions: batch of action indices that were taken.
          rews: batch of rewards.
          new_obses: batch of next observations.
          terminals: boolean batch, True where the transition ended
            the episode.
          discounts: batch of discount factors.

        Returns:
          The result of _kl_divergence() between the (gradient-stopped)
          target distributions and the online log-predictions for the
          taken actions.
        """
        # reuse=True: the intent is to share the weights that __init__
        # created under the same scope instead of making new ones.
        with tf.variable_scope(self.name, reuse=True):
            max_actions = tf.argmax(self.dist.mean(self.value_func(self.base(new_obses))),
                                    axis=1, output_type=tf.int32)
        with tf.variable_scope(target_net.name, reuse=True):
            target_preds = target_net.value_func(target_net.base(new_obses))
            # For terminal transitions substitute -log(num_atoms), i.e. the
            # log of a uniform distribution; otherwise keep the prediction.
            target_preds = tf.where(terminals,
                                    tf.zeros_like(target_preds) - math.log(self.dist.num_atoms),
                                    target_preds)
        # A zero discount removes the next-state atoms from the shifted
        # atom positions computed in add_rewards(), leaving only the reward.
        discounts = tf.where(terminals, tf.zeros_like(discounts), discounts)
        target_dists = self.dist.add_rewards(tf.exp(take_vector_elems(target_preds, max_actions)),
                                             rews, discounts)
        with tf.variable_scope(self.name, reuse=True):
            online_preds = self.value_func(self.base(obses))
            onlines = take_vector_elems(online_preds, actions)
            return _kl_divergence(tf.stop_gradient(target_dists), onlines)

    def value_func(self, feature_batch):
        """
        Go from a 2-D Tensor of feature vectors to a 3-D
        Tensor of predicted action distributions.

        Args:
          feature_batch: a batch of features from base().

        Returns:
          A Tensor of shape [batch x actions x atoms].

        All probabilities are computed in the log domain.
        """
        # NOTE(review): the ReLU makes every logit fed to log_softmax
        # non-negative -- confirm that this is intended.
        logits = tf.layers.Dense(self.num_actions * self.dist.num_atoms, activation="relu")(feature_batch)
        actions = tf.reshape(logits, (tf.shape(logits)[0], self.num_actions, self.dist.num_atoms))
        if not self.dueling:
            return tf.nn.log_softmax(actions)
        # Dueling head: one baseline vector per sample, broadcast over the
        # action axis, plus per-action terms centred across actions.
        values = tf.expand_dims(tf.layers.Dense(self.dist.num_atoms, activation="relu")(feature_batch), axis=1)
        actions -= tf.reduce_mean(actions, axis=1, keepdims=True)
        return tf.nn.log_softmax(values + actions)

    def input_dtype(self):
        """Return the dtype used for the observation placeholder."""
        return tf.float32

    def base(self, obs_batch):
        """
        Turn a batch of observations into a batch of feature vectors
        using a four-layer MLP with 32 units per layer.
        """
        return simple_mlp(obs_batch, [32, 32, 32, 32])

    def predict(self, states):
        """
        Run the network in the default session and return the selected
        action index for every state in the batch.
        """
        sess = tf.get_default_session()
        return sess.run(self.actions_out, {self.step_obs_ph: states})

def product(vals):
    """
    Compute the product of values in a list-like object.

    Args:
      vals: an iterable of values that support `*`.

    Returns:
      The product of all the values, or 1 if `vals` is empty.
    """
    prod = 1
    for val in vals:
        prod *= val
    return prod

def simple_mlp(inputs, layer_sizes):
    """
    Apply a simple multi-layer perceptron model to the
    batch of inputs.

    Args:
      inputs: the batch of inputs. This may have any shape
        with at least two dimensions, provided all the
        sizes are known ahead of time besides the batch
        size.
      layer_sizes: a sequence of hidden layer sizes.

    Returns:
      The output of the last layer, or the flattened inputs if
      `layer_sizes` is empty.
    """
    # The non-batch sizes are read from the static shape, which is why they
    # have to be known at graph-construction time.
    layer_in_size = product([x.value for x in inputs.get_shape()[1:]])
    # Flatten to [batch x features]; the batch size is taken dynamically.
    layer_in = tf.reshape(inputs, (tf.shape(inputs)[0], layer_in_size))
    for layer_idx, out_size in enumerate(layer_sizes):
        with tf.variable_scope(None, default_name='layer_' + str(layer_idx)):
            layer_in = tf.layers.Dense(out_size, activation="relu")(layer_in)
    return layer_in

class ActionDist:
    """
    A discrete reward distribution.

    The support is `num_atoms` evenly spaced values from `min_val` to
    `max_val` inclusive.
    """

    def __init__(self, num_atoms, min_val, max_val):
        """
        Create a distribution description.

        Args:
          num_atoms: number of atoms; must be at least 2.
          min_val: value of the first atom.
          max_val: value of the last atom; must exceed min_val.
        """
        assert num_atoms >= 2
        assert max_val > min_val
        self.num_atoms = num_atoms
        self.min_val = min_val
        self.max_val = max_val
        # Spacing between two neighbouring atoms.
        self._delta = (self.max_val - self.min_val) / (self.num_atoms - 1)

    def atom_values(self):
        """Get the reward values for each atom."""
        return [self.min_val + i * self._delta for i in range(0, self.num_atoms)]

    def mean(self, log_probs):
        """Get the mean rewards for the distributions."""
        probs = tf.exp(log_probs)
        return tf.reduce_sum(probs * tf.constant(self.atom_values(), dtype=probs.dtype), axis=-1)

    def add_rewards(self, probs, rewards, discounts):
        """
        Compute new distributions after adding rewards to
        old distributions.

        Args:
          probs: a batch of probability vectors (not log
            probabilities), one entry per atom.
          rewards: a batch of rewards.
          discounts: the discount factors to apply to the
            distribution rewards.

        Returns:
          A new batch of probability vectors with the same shape as
          `probs`.
        """
        atom_rews = tf.tile(tf.constant([self.atom_values()], dtype=probs.dtype),
                            tf.stack([tf.shape(rewards)[0], 1]))

        # Shifted atom values, expressed as fractional atom positions.
        fuzzy_idxs = tf.expand_dims(rewards, axis=1) + tf.expand_dims(discounts, axis=1) * atom_rews
        fuzzy_idxs = (fuzzy_idxs - self.min_val) / self._delta

        # If the position were exactly 0, rounding up
        # and subtracting 1 would cause problems.
        fuzzy_idxs = tf.clip_by_value(fuzzy_idxs, 1e-18, float(self.num_atoms - 1))

        # Split every probability between the two neighbouring atoms in
        # proportion to how close the fractional position is to each.
        indices_1 = tf.cast(tf.ceil(fuzzy_idxs) - 1, tf.int32)
        fracs_1 = tf.abs(tf.ceil(fuzzy_idxs) - fuzzy_idxs)
        indices_2 = indices_1 + 1
        fracs_2 = 1 - fracs_1

        res = tf.zeros_like(probs)
        for indices, fracs in [(indices_1, fracs_1), (indices_2, fracs_2)]:
            # Pair each atom index with its row so the mass is scattered
            # within the same sample.
            index_matrix = tf.expand_dims(tf.range(tf.shape(indices)[0], dtype=tf.int32), axis=1)
            index_matrix = tf.tile(index_matrix, (1, self.num_atoms))
            scatter_indices = tf.stack([index_matrix, indices], axis=-1)
            res = res + tf.scatter_nd(scatter_indices, probs * fracs, tf.shape(res))

        return res

