I am currently working on a reinforcement learning project in TensorFlow. When I profiled a training run, I noticed that sampling actions alone takes more than 65% of the runtime.
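For context, this is roughly how such a per-call breakdown can be measured with Python's cProfile (only an illustration; collect_trajectory() is a placeholder for my rollout code, not an actual function from my project):

import cProfile
import pstats

# Placeholder: collect_trajectory() stands for the rollout code that calls
# policy.sample_action() once per environment step.
profiler = cProfile.Profile()
profiler.enable()
collect_trajectory()
profiler.disable()
pstats.Stats(profiler).sort_stats("cumulative").print_stats(20)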
Note: in reinforcement learning you have to sample an action at every step of the trajectory, which is why the sampling cannot be done in batches.
I run this graph on the GPU, so I suspect the overhead comes from the session.run() calls, since every call copies the data from RAM to the GPU.
My question is: is there a way to reduce the overhead of the many session.run() calls? Could I run the sample_op (the forward pass) on the CPU and the training on the GPU?
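What I have in mind is roughly the following split (just a sketch of the idea, not something I have tested; build_policy_network, build_loss, state_ph, action_ph, advantage_ph and lr are placeholders for my actual code, and the device strings assume a single GPU):

# Sketch: pin the sampling/forward subgraph to the CPU and the training
# subgraph to the GPU. The build_* helpers are placeholders.
with tf.variable_scope("Policy"):
    with tf.device("/cpu:0"):
        pi, mean_action = build_policy_network(state_ph)  # forward pass only
        sample_op = pi.sample()                           # per-step sampling
    with tf.device("/gpu:0"):
        loss = build_loss(pi, action_ph, advantage_ph)    # PPO surrogate loss
        optimize = tf.train.AdamOptimizer(lr).minimize(loss)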
Thanks a lot!
Here is the code of my RL policy:
import tensorflow as tf
import tensorflow_probability as tfp


class Policy:
    def __init__(self, sess, state_size, action_size, lr, alpha_entropy, epsilon):
        self.sess = sess
        self.action_size = action_size

        with tf.device("/cpu:0"):
            with tf.variable_scope("Policy"):
                self.state_ph = tf.placeholder(tf.float32, [None, state_size], name="state_ph")
                self.action_ph = tf.placeholder(tf.float32, [None, action_size], name="action_ph")
                self.advantage_ph = tf.placeholder(tf.float32, [None, 1], name="advantage_ph")
                with tf.variable_scope("pi"):
                    self.pi, self.mean_action = self._create_model(trainable=True)
                    self.pi_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="Policy/pi")
                    self.sample_op = self.pi.sample()

                with tf.variable_scope("old_pi"):
                    self.old_pi, _ = self._create_model(trainable=False)
                    self.old_pi_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="Policy/old_pi")

                with tf.variable_scope("loss"):
                    prob_ratio = self.pi.prob(self.action_ph) / self.old_pi.prob(self.action_ph)
                    surrogate = prob_ratio * self.advantage_ph
                    clipped_surrogate = tf.minimum(surrogate, tf.clip_by_value(prob_ratio, 1. - epsilon, 1. + epsilon) * self.advantage_ph)
                    self.pi_entropy = self.pi.entropy()
                    tf.summary.scalar("entropy", tf.reduce_mean(self.pi_entropy))
                    self.loss = -tf.reduce_mean(clipped_surrogate + alpha_entropy * self.pi_entropy)
                    tf.summary.scalar("objective", self.loss)

                with tf.variable_scope("training"):
                    self.gradients = tf.gradients(self.loss, self.pi_vars)
                    # self.gradients = [tf.clip_by_value(g, -1000, 1000) for g in self.gradients]
                    # self.gradients, _ = tf.clip_by_global_norm(self.gradients, GRADIENT_NORM)
                    grads = list(zip(self.gradients, self.pi_vars))  # materialize so it can be iterated again below
                    self.optimize = tf.train.AdamOptimizer(lr).apply_gradients(grads)
                    for g, v in grads:
                        tf.summary.histogram(v.name, g)
                with tf.variable_scope("update_old_policy"):
                    self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(self.pi_vars, self.old_pi_vars)]

                self.summary_op = tf.summary.merge_all(scope="Policy")
    def _create_model(self, trainable):
        layer_names = ["l1", "l2", "l3", "l4"]
        l1 = tf.layers.Dense(32, activation="relu", name=layer_names[0], trainable=trainable,
                             kernel_initializer=tf.initializers.he_normal())(self.state_ph)
        l2 = tf.layers.Dense(64, activation="relu", name=layer_names[1], trainable=trainable,
                             kernel_initializer=tf.initializers.he_normal())(l1)
        l3 = tf.layers.Dense(32, activation="relu", name=layer_names[2], trainable=trainable,
                             activity_regularizer=tf.contrib.layers.l2_regularizer(scale=0.001),
                             kernel_initializer=tf.initializers.he_normal())(l2)
        mu = tf.layers.Dense(self.action_size, activation="tanh", name=layer_names[3], trainable=trainable,
                             kernel_initializer=tf.initializers.he_normal())(l3)
        log_sigma = tf.Variable(initial_value=tf.fill((self.action_size,), 0.), trainable=trainable)
        distribution = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=tf.exp(log_sigma))

        tf.summary.histogram("log_sigma", log_sigma)
        tf.summary.histogram("mu", mu)
        for name in layer_names:
            with tf.variable_scope(name, reuse=True):
                tf.summary.histogram("kernel", tf.get_variable("kernel"))
                tf.summary.histogram("bias", tf.get_variable("bias"))

        return distribution, mu
    def sample_action(self, state):
        return self.sess.run([self.mean_action, self.sample_op], feed_dict={
            self.state_ph: state
        })

    def train(self, states, actions, advantages):
        _, summaries = self.sess.run([self.optimize, self.summary_op], feed_dict={
            self.state_ph: states,
            self.action_ph: actions,
            self.advantage_ph: advantages
        })
        return summaries

    def update_old_pi(self):
        self.sess.run([self.update_oldpi_op])
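For completeness, this is roughly how the policy is driven while collecting data, i.e. one session.run() per environment step (simplified sketch; env, horizon and the constructor arguments below are placeholders, not my real settings):

# Simplified sketch of the rollout loop; env is a Gym-style environment
# placeholder and horizon/hyperparameters are illustrative values.
sess = tf.Session()
policy = Policy(sess, state_size=8, action_size=2, lr=3e-4,
                alpha_entropy=0.01, epsilon=0.2)
sess.run(tf.global_variables_initializer())

state = env.reset()
for step in range(horizon):
    # One session.run() per environment step: this is the call that
    # dominates the profile.
    mean_action, sampled_action = policy.sample_action(state[None, :])
    state, reward, done, _ = env.step(sampled_action[0])
    if done:
        state = env.reset()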
Profiling results: