I am currently working on a reinforcement learning project in TensorFlow. When I profiled a training run, I noticed that sampling actions alone takes more than 65% of the runtime.
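For context, this is roughly how such a per-call breakdown can be measured with Python's cProfile (only an illustration; collect_trajectory() is a placeholder for my rollout code, not an actual function from my project):

import cProfile
import pstats

# Placeholder: collect_trajectory() stands for the rollout code that calls
# policy.sample_action() once per environment step.
profiler = cProfile.Profile()
profiler.enable()
collect_trajectory()
profiler.disable()
pstats.Stats(profiler).sort_stats("cumulative").print_stats(20)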
Note: in reinforcement learning you have to sample an action at every step of the trajectory, which is why the sampling cannot be done in batches.
I run this graph on the GPU, so I suspect the overhead comes from the session.run() calls, since every call copies the data from RAM to the GPU.
My question is: is there a way to reduce the overhead of the many session.run() calls? Could I run the sample_op (the forward pass) on the CPU and the training on the GPU?
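What I have in mind is roughly the following split (just a sketch of the idea, not something I have tested; build_policy_network, build_loss, state_ph, action_ph, advantage_ph and lr are placeholders for my actual code, and the device strings assume a single GPU):

# Sketch: pin the sampling/forward subgraph to the CPU and the training
# subgraph to the GPU. The build_* helpers are placeholders.
with tf.variable_scope("Policy"):
    with tf.device("/cpu:0"):
        pi, mean_action = build_policy_network(state_ph)  # forward pass only
        sample_op = pi.sample()                           # per-step sampling
    with tf.device("/gpu:0"):
        loss = build_loss(pi, action_ph, advantage_ph)    # PPO surrogate loss
        optimize = tf.train.AdamOptimizer(lr).minimize(loss)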
Thanks a lot!
Here is the code of my RL policy:
import tensorflow as tf
import tensorflow_probability as tfp


class Policy:
    def __init__(self, sess, state_size, action_size, lr, alpha_entropy, epsilon):
        self.sess = sess
        self.action_size = action_size

        with tf.device("/cpu:0"):
            with tf.variable_scope("Policy"):
                self.state_ph = tf.placeholder(tf.float32, [None, state_size], name="state_ph")
                self.action_ph = tf.placeholder(tf.float32, [None, action_size], name="action_ph")
                self.advantage_ph = tf.placeholder(tf.float32, [None, 1], name="advantage_ph")
                with tf.variable_scope("pi"):
                    self.pi, self.mean_action = self._create_model(trainable=True)
                    self.pi_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="Policy/pi")
                    self.sample_op = self.pi.sample()

                with tf.variable_scope("old_pi"):
                    self.old_pi, _ = self._create_model(trainable=False)
                    self.old_pi_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="Policy/old_pi")

                with tf.variable_scope("loss"):
                    prob_ratio = self.pi.prob(self.action_ph) / self.old_pi.prob(self.action_ph)
                    surrogate = prob_ratio * self.advantage_ph
                    clipped_surrogate = tf.minimum(surrogate, tf.clip_by_value(prob_ratio, 1. - epsilon, 1. + epsilon) * self.advantage_ph)
                    self.pi_entropy = self.pi.entropy()
                    tf.summary.scalar("entropy", tf.reduce_mean(self.pi_entropy))
                    self.loss = -tf.reduce_mean(clipped_surrogate + alpha_entropy * self.pi_entropy)
                    tf.summary.scalar("objective", self.loss)

                with tf.variable_scope("training"):
                    self.gradients = tf.gradients(self.loss, self.pi_vars)
                    # self.gradients = [tf.clip_by_value(g, -1000, 1000) for g in self.gradients]
                    # self.gradients, _ = tf.clip_by_global_norm(self.gradients, GRADIENT_NORM)
                    grads = list(zip(self.gradients, self.pi_vars))  # materialize so it can be iterated again below
                    self.optimize = tf.train.AdamOptimizer(lr).apply_gradients(grads)
                    for g, v in grads:
                        tf.summary.histogram(v.name, g)
                with tf.variable_scope("update_old_policy"):
                    self.update_oldpi_op = [oldp.assign(p) for p, oldp in zip(self.pi_vars, self.old_pi_vars)]

                self.summary_op = tf.summary.merge_all(scope="Policy")
    def _create_model(self, trainable):
        layer_names = ["l1", "l2", "l3", "l4"]
        l1 = tf.layers.Dense(32, activation="relu", name=layer_names[0], trainable=trainable,
                             kernel_initializer=tf.initializers.he_normal())(self.state_ph)
        l2 = tf.layers.Dense(64, activation="relu", name=layer_names[1], trainable=trainable,
                             kernel_initializer=tf.initializers.he_normal())(l1)
        l3 = tf.layers.Dense(32, activation="relu", name=layer_names[2], trainable=trainable,
                             activity_regularizer=tf.contrib.layers.l2_regularizer(scale=0.001),
                             kernel_initializer=tf.initializers.he_normal())(l2)
        mu = tf.layers.Dense(self.action_size, activation="tanh", name=layer_names[3], trainable=trainable,
                             kernel_initializer=tf.initializers.he_normal())(l3)
        log_sigma = tf.Variable(initial_value=tf.fill((self.action_size,), 0.), trainable=trainable)
        distribution = tfp.distributions.MultivariateNormalDiag(loc=mu, scale_diag=tf.exp(log_sigma))

        tf.summary.histogram("log_sigma", log_sigma)
        tf.summary.histogram("mu", mu)
        for name in layer_names:
            with tf.variable_scope(name, reuse=True):
                tf.summary.histogram("kernel", tf.get_variable("kernel"))
                tf.summary.histogram("bias", tf.get_variable("bias"))

        return distribution, mu
    def sample_action(self, state):
        return self.sess.run([self.mean_action, self.sample_op], feed_dict={
            self.state_ph: state
        })

    def train(self, states, actions, advantages):
        _, summaries = self.sess.run([self.optimize, self.summary_op], feed_dict={
            self.state_ph: states,
            self.action_ph: actions,
            self.advantage_ph: advantages
        })
        return summaries

    def update_old_pi(self):
        self.sess.run([self.update_oldpi_op])
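For completeness, this is roughly how the policy is driven while collecting data, i.e. one session.run() per environment step (simplified sketch; env, horizon and the constructor arguments below are placeholders, not my real settings):

# Simplified sketch of the rollout loop; env is a Gym-style environment
# placeholder and horizon/hyperparameters are illustrative values.
sess = tf.Session()
policy = Policy(sess, state_size=8, action_size=2, lr=3e-4,
                alpha_entropy=0.01, epsilon=0.2)
sess.run(tf.global_variables_initializer())

state = env.reset()
for step in range(horizon):
    # One session.run() per environment step: this is the call that
    # dominates the profile.
    mean_action, sampled_action = policy.sample_action(state[None, :])
    state, reward, done, _ = env.step(sampled_action[0])
    if done:
        state = env.reset()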
Profiling results: