I am building a deep reinforcement learning agent (DDPG) that uses LSTMs for both the actor and critic networks. It fails to compute the gradients of the actor network: the gradient comes back as a list containing only None values.
Computing the gradient of the critic network works fine, but computing the gradient of the actor network fails.
Computing the gradients:
def learn(self, training=True):
    if self.memory.mem_count < self.batch_size:
        return

    states, prev_actions, actions, rewards, new_states = self.memory.buffer_memory(self.batch_size)

    states = tf.convert_to_tensor(states, dtype=tf.float32)
    new_states = tf.convert_to_tensor(new_states, dtype=tf.float32)
    rewards = tf.convert_to_tensor(rewards, dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.float32)
    prev_actions = tf.convert_to_tensor(prev_actions, dtype=tf.float32)

    ##### works fine for this #######
    with tf.GradientTape() as tape:
        target_actions = self.target_actor(new_states, prev_actions)
        critic_value_ = tf.squeeze(self.target_critic(new_states, target_actions, prev_actions), 1)
        critic_value = tf.squeeze(self.critic(states, actions, prev_actions), 1)
        target = rewards + self.gamma * critic_value_
        critic_loss = keras.losses.MSE(target, critic_value)

    critic_network_gradient = tape.gradient(critic_loss, self.critic.trainable_variables)
    print('critic loss', critic_loss)
    self.critic.optimizer.apply_gradients(zip(critic_network_gradient, self.critic.trainable_variables))

    ##### fail to apply gradient on this #####
    with tf.GradientTape() as tape:
        new_policy_actions = self.actor(states, prev_actions)
        actor_loss = -self.critic(states, new_policy_actions, prev_actions)
        actor_loss = tf.math.reduce_mean(actor_loss)

    actor_network_gradient = tape.gradient(actor_loss, self.actor.trainable_variables)
    print('gradient: ', actor_network_gradient)
    print('actor loss:', actor_loss)
    self.actor.optimizer.apply_gradients(zip(actor_network_gradient, self.actor.trainable_variables))
The networks:
import os
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Concatenate
from tensorflow.keras.layers import Input
class CriticNetwork(keras.Model):
    def __init__(self, period=64, n_coins=6, n_feature=3,
                 name='critic', chkpt_dir='tmp/ddpg'):
        super(CriticNetwork, self).__init__()
        self.period = period
        self.n_coins = n_coins
        self.n_feature = n_feature

        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            self.model_name + '_ddpg.h5')

        self.lstm1 = LSTM(units=self.period, return_sequences=True,
                          input_shape=(self.period, self.n_coins * self.n_feature))
        self.lstm2 = LSTM(units=self.period, return_sequences=True)
        self.lstm3 = LSTM(units=period)
        self.q = Dense(units=1, activation='relu')

    def call(self, state, actions, prev_actions):
        state = tf.reshape(state, (state.shape[0], self.period, self.n_coins * self.n_feature))

        # 1. LSTM for time series data
        action_value = self.lstm1(state)
        action_value = self.lstm2(action_value)
        action_value = self.lstm3(action_value)

        q = self.q(action_value)
        return q
class ActorNetwork(keras.Model):
    def __init__(self, period=64, n_coins=6, n_feature=3,
                 name='actor', chkpt_dir='tmp/ddpg'):
        super(ActorNetwork, self).__init__()
        self.period = period
        self.n_coins = n_coins
        self.n_feature = n_feature

        self.model_name = name
        self.checkpoint_dir = chkpt_dir
        self.checkpoint_file = os.path.join(self.checkpoint_dir,
                                            self.model_name + '_ddpg.h5')

        self.lstm1 = LSTM(units=self.period, return_sequences=True,
                          input_shape=(self.period, self.n_coins * self.n_feature))
        self.lstm2 = LSTM(units=self.period, return_sequences=True)
        self.lstm3 = LSTM(units=self.period, return_state=False)
        self.mu = Dense(units=self.n_coins, activation='softmax')

    def call(self, state, prev_actions):
        state = tf.reshape(state, (state.shape[0], self.period, self.n_coins * self.n_feature))

        portfolio = self.lstm1(state)
        portfolio = self.lstm2(portfolio)
        portfolio = self.lstm3(portfolio)
        portfolio = self.mu(portfolio)
        return portfolio
The error is as follows.
ValueError: No gradients provided for any variable: ['actor_network_4/lstm_24/lstm_cell_24/kernel:0', 'actor_network_4/lstm_24/lstm_cell_24/recurrent_kernel:0', 'actor_network_4/lstm_24/lstm_cell_24/bias:0', 'actor_network_4/lstm_25/lstm_cell_25/kernel:0', 'actor_network_4/lstm_25/lstm_cell_25/recurrent_kernel:0', 'actor_network_4/lstm_25/lstm_cell_25/bias:0', 'actor_network_4/lstm_26/lstm_cell_26/kernel:0', 'actor_network_4/lstm_26/lstm_cell_26/recurrent_kernel:0', 'actor_network_4/lstm_26/lstm_cell_26/bias:0', 'actor_network_4/dense_8/kernel:0', 'actor_network_4/dense_8/bias:0'].
From my understanding, I am computing the gradient of the actor network's trainable variables with respect to actor_loss. The actor network's output is fed into the critic network, and the loss is basically the mean of the critic's output.
I cannot see why the tape fails to track the trainable variables. Any help would be greatly appreciated.
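For reference, here is a minimal standalone toy of the pattern I believe should work (toy_actor, toy_critic_head and all shapes are made up for illustration, not my actual networks). In this toy, the tape does propagate gradients from the critic's output back through the actions to the actor's variables, which is what I expected to happen inside learn() as well:

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, LSTM

# Hypothetical toy networks, only to illustrate the gradient flow I expect.
toy_actor = keras.Sequential([LSTM(units=8), Dense(units=3, activation='softmax')])
toy_critic_head = Dense(units=1)  # stands in for the critic

states = tf.random.normal((4, 10, 5))  # (batch, time, features) -- made-up shapes

with tf.GradientTape() as tape:
    actions = toy_actor(states)        # actor output (the "policy actions")
    q = toy_critic_head(actions)       # critic consumes the actor's output
    loss = -tf.math.reduce_mean(q)

grads = tape.gradient(loss, toy_actor.trainable_variables)
print([g is None for g in grads])  # all False here, i.e. every actor variable gets a gradient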