The train_net method computes the gradients and updates the variables of the Policy model, using the loss

loss = -log_prob * R

How does this loss work?
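A likely reading, assuming this code implements REINFORCE (Monte-Carlo policy gradient), with log_prob the log-probability of the action actually taken and R the discounted return from that step onward: gradient descent on this loss performs gradient ascent on the policy-gradient estimate

\nabla_\theta J(\theta) \approx \sum_t \nabla_\theta \log \pi_\theta(a_t \mid s_t)\, R_t, \qquad R_t = r_t + \gamma R_{t+1},

so actions that were followed by high returns have their probability pushed up, and actions followed by low returns are made less likely.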
More generally, how should the loss function of a Policy be written in reinforcement learning? Here is the code:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, optimizers

learning_rate = 0.0002  # assumed value; the original snippet does not define it
gamma = 0.98            # discount factor; likewise assumed

class Policy(keras.Model):
    def __init__(self):
        super(Policy, self).__init__()
        self.data = []  # (reward, log_prob) pairs collected during one episode
        self.fc1 = layers.Dense(128, kernel_initializer="he_normal")
        self.fc2 = layers.Dense(2, kernel_initializer="he_normal")
        self.optimizer = optimizers.Adam(learning_rate=learning_rate)

    def call(self, inputs, training=False):
        # state -> action probabilities
        x = tf.nn.relu(self.fc1(inputs))
        x = tf.nn.softmax(self.fc2(x), axis=1)
        return x

    def put_data(self, item):
        # record a (reward, log_prob) tuple for the current step
        self.data.append(item)

    def train_net(self, tape):
        # Compute gradients and update the Policy variables.
        # `tape` must be a persistent GradientTape that recorded the episode.
        R = 0  # the return of the terminal state is 0
        for r, log_prob in self.data[::-1]:  # walk the episode backwards
            R = r + gamma * R  # discounted return at this step
            loss = -log_prob * R
            with tape.stop_recording():
                # one optimizer step per time step
                grads = tape.gradient(loss, self.trainable_variables)
                self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        self.data = []  # clear the trajectory
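For completeness, a minimal sketch of the driving loop, assuming a classic-API Gym CartPole-v1 environment (the environment, episode count, and loop structure are not part of the original snippet). The whole episode is rolled out inside a persistent GradientTape so that train_net can differentiate the recorded log-probabilities:

import gym  # assumed dependency; newer code would use gymnasium

env = gym.make("CartPole-v1")
pi = Policy()

for episode in range(500):
    s = env.reset()  # classic Gym API; gym>=0.26 returns (obs, info) instead
    with tf.GradientTape(persistent=True) as tape:
        for t in range(500):
            state = tf.constant(s, dtype=tf.float32)[None, :]  # add batch dim
            prob = pi(state)  # action probabilities, shape (1, 2)
            a = int(tf.random.categorical(tf.math.log(prob), 1)[0][0])
            s, r, done, info = env.step(a)  # classic 4-tuple step API
            # store the reward and the log-probability of the chosen action
            pi.put_data((r, tf.math.log(prob[0][a])))
            if done:
                break
        pi.train_net(tape)
    del tape  # release the persistent tape's resources

Note that train_net applies one optimizer step per time step of the episode; summing the per-step losses and applying a single gradient update is a common alternative with the same expected gradient.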