I am trying to convert this PyTorch script to TensorFlow 2, but when the gradients are computed it raises the error "No gradients provided for any variable" (the listed variables are the ones watched by the GradientTape). I think there must be a disconnection somewhere in the graph, but I can't find where it is. Here is a minimal reproducible example; after the code I've also added a small toy sketch of the kind of disconnection I mean:
import numpy as np
import random
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras.layers import Dense, Input
import gym

class ActorCritic(tf.keras.Model):
    def __init__(self, n_actions):
        super(ActorCritic, self).__init__()
        self.hiddenLogits = Dense(256, activation='tanh')
        self.hiddenValue = Dense(256, activation='tanh')
        self.outputLogits = Dense(n_actions, activation='softmax')
        self.outputValue = Dense(n_actions, activation='linear')

    def call(self, inputs):
        x = tf.convert_to_tensor(inputs)
        logits = self.hiddenLogits(x)
        value = self.hiddenValue(x)
        probas = self.outputLogits(logits)
        qvalues = self.outputValue(value)
        value = tf.reduce_sum(probas * qvalues, axis=-1)
        return tf.squeeze(probas), tf.squeeze(qvalues), value

def compute_acer_loss(policies, q_values, values, actions, rewards, retrace, masks,
                      behavior_policies, gamma=0.99, truncation_clip=10, entropy_weight=0.0001):
    loss = 0
    var_list = model.trainable_variables
    with tf.GradientTape() as tape:
        tape.watch(var_list)
        # walk the rollout backwards, accumulating the ACER actor/critic losses
        for step in reversed(range(len(rewards))):
            importance_weight = tf.stop_gradient(policies[step]) / tf.stop_gradient(behavior_policies[step])
            retrace = rewards[step] + gamma * retrace * masks[step]
            advantage = retrace - values[step]
            log_policy_action = tf.math.log(tf.gather(policies[step], actions[step], axis=1))
            truncated_importance_weight = tf.clip_by_value(tf.gather(importance_weight, actions[step], axis=1), -float('inf'), truncation_clip)
            actor_loss = -tf.reduce_mean(truncated_importance_weight * log_policy_action * tf.stop_gradient(advantage))
            correction_weight = tf.clip_by_value(1 - truncation_clip / importance_weight, 0, float('inf'))
            actor_loss -= tf.reduce_mean(tf.reduce_sum(correction_weight * tf.math.log(policies[step]) * tf.stop_gradient(q_values[step] - values[step]), axis=0))
            entropy = entropy_weight * -tf.reduce_mean(tf.reduce_sum(tf.math.log(policies[step]) * policies[step], axis=0))
            q_value = tf.gather(q_values[step], actions[step], axis=1)
            critic_loss = tf.reduce_mean((retrace - q_value) ** 2, axis=0)
            truncated_rho = tf.clip_by_value(tf.gather(importance_weight, actions[step], axis=1), -float('inf'), 1)
            retrace = truncated_rho * (retrace - tf.stop_gradient(q_value)) + tf.stop_gradient(values[step])
            loss += actor_loss + critic_loss - entropy
    grads = tape.gradient(loss, var_list)
    optimizer.apply_gradients(zip(grads, var_list))

env = gym.make("CartPole-v0")
model = ActorCritic(env.action_space.n)
optimizer = tf.keras.optimizers.Adam()

frame_idx = 0
max_frames = 10000
num_steps = 5

state = env.reset()
while frame_idx < max_frames:
    q_values = []
    values = []
    policies = []
    actions = []
    rewards = []
    masks = []

    for step in range(num_steps):
        policy, q_value, value = model(state[None, :])
        action = int(tfp.distributions.Multinomial(1, probs=policy).sample().numpy()[0])
        next_state, reward, done, _ = env.step(action)
        mask = 1 - done

        q_values.append(q_value)
        policies.append(policy)
        actions.append(action)
        rewards.append(reward)
        values.append(value)
        masks.append(mask)

        state = next_state
        if done:
            state = env.reset()

    _, _, retrace = model(state[None, :])
    retrace = tf.stop_gradient(retrace)
    compute_acer_loss(policies, q_values, values, actions, rewards, retrace, masks, policies)
    frame_idx += num_steps
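
To illustrate the kind of disconnection I mean, here is a toy sketch (the layer and inputs below are made up just for this illustration, they are not part of my script): if a model's forward pass runs while no GradientTape is active, there is nothing on the tape connecting the loss to the variables, so every gradient comes back as None and apply_gradients raises the same error I'm seeing:

import tensorflow as tf

layer = tf.keras.layers.Dense(1)
x = tf.ones((1, 3))
y = layer(x)                      # forward pass happens outside any tape

with tf.GradientTape() as tape:
    loss = tf.reduce_sum(y ** 2)  # only this arithmetic is recorded by the tape

grads = tape.gradient(loss, layer.trainable_variables)
print(grads)                      # [None, None]

opt = tf.keras.optimizers.Adam()
# raises ValueError: No gradients provided for any variable
opt.apply_gradients(zip(grads, layer.trainable_variables))

I suspect something along these lines is happening in my training loop, but I can't see where the graph actually gets cut.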