I am writing code for a deep Q-learning model; it contains two classes, a DeepQNetwork class and an Agent class. I want to visualize the weights and biases of every layer and observe whether they change as the network learns. There are two networks, one that predicts the Q values of the current state (the q_eval network) and one that predicts the target Q values (the q_next network), and I would like to visualize the weights of each of them. There may also be a mistake in the network architecture, so if you could help me spot that as well, it would be great.
I have already tried tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES()), but it does not work the way I expect; maybe I am doing something wrong.
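For reference, this is roughly what that attempt looked like (just a minimal sketch, meant to be run after both networks have been built; 'q_eval' and 'q_next' are the variable-scope names I use in the classes below):

import tensorflow as tf

# Sketch of my attempt: collect the trainable variables of each network by scope
# and print their names, so that a histogram summary could later be attached to each one.
for scope in ('q_eval', 'q_next'):
    params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)
    for var in params:
        print(scope, var.name)  # e.g. q_eval/conv1/kernel:0, q_eval/conv1/bias:0

The full code for both classes is below.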
import os
import time
import numpy as np
import tensorflow as tf

class DeepQNetwork(object):
def __init__(self, lr, n_actions, name, fc1_dims=1024,
#input_dims=(210, 160, 4),
input_dims=(3, 4), chkpt_dir="tmp/dqn"):
self.lr = lr
self.name = name
self.n_actions = n_actions
self.fc1_dims = fc1_dims
self.chkpt_dir = chkpt_dir
self.input_dims = input_dims
self.sess = tf.Session()
self.build_network()
self.sess.run(tf.global_variables_initializer())
self.saver = tf.train.Saver()
self.checkpoint_file = os.path.join(chkpt_dir, "deepqnet.ckpt")
self.params = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
scope=self.name)
self.write_op = tf.summary.merge([self.accuracy_sum, self.loss_sum, self.summ])
self.writer = tf.summary.FileWriter("tmp/log_dir")
self.writer.add_graph(self.sess.graph)
# The list of values in the collection with the given name
# or an empty list if no value has been added to that collection.
# Trainable variables are the ones whose values are updated during optimisation.
def build_network(self):
with tf.variable_scope(self.name):
self.input = tf.placeholder(tf.float32, shape=[None, *self.input_dims],
name='inputs')
# The * unpacks input_dims, so the placeholder shape becomes [None, dim1, dim2, ...].
self.actions = tf.placeholder(tf.float32, shape=[None, self.n_actions],
name='action_taken')
self.q_target = tf.placeholder(tf.float32, shape=[None, self.n_actions],
name='q_target')
# The 1st dimension of the shape is set to None because we want to pass
# batches of stacked frames into the neural network.
conv1 = tf.layers.conv2d(inputs=self.input, filters=32,
kernel_size=(8, 8), strides=4, name='conv1',
kernel_initializer=tf.contrib.layers.variance_scaling_initializer(factor=2))
conv1_activated = tf.nn.relu(conv1)
conv2 = tf.layers.conv2d(inputs=conv1_activated, filters=64,
kernel_size=(4, 4), strides=2, name='conv2',
kernel_initializer=tf.contrib.layers.variance_scaling_initializer(factor=2))
conv2_activated = tf.nn.relu(conv2)
conv3 = tf.layers.conv2d(inputs=conv2_activated, filters=128,
kernel_size=(3, 3), strides=1, name='conv3',
kernel_initializer=tf.contrib.layers.variance_scaling_initializer(factor=2))
conv3_activated = tf.nn.relu(conv3)
flat = tf.contrib.layers.flatten(conv3_activated)
dense1 = tf.layers.dense(flat, units=self.fc1_dims, activation=tf.nn.relu,
kernel_initializer=tf.contrib.layers.variance_scaling_initializer(factor=2))
self.Q_values = tf.layers.dense(dense1, units=self.n_actions,
kernel_initializer=tf.contrib.layers.variance_scaling_initializer(factor=2))
self.q = tf.reduce_sum(tf.multiply(self.Q_values, self.actions))
self.accuracy_sum = tf.summary.scalar('Accuracy', self.q)
self.loss = tf.reduce_mean(tf.square(self.q - self.q_target))
self.loss_sum = tf.summary.scalar("Loss", self.loss)
self.train_op = tf.train.AdamOptimizer(self.lr).minimize(self.loss)
for var in tf.trainable_variables():
print(var.name[:-2])
self.summ = tf.summary.histogram(var.name[:-2], var)

class Agent(object):
    def __init__(self, alpha, gamma, mem_size, n_actions, epsilon, batch_size,
                 replace_target=10000, input_dims=(210, 160, 4),
                 q_next_dir="tmp/q_next", q_eval_dir="tmp/q_eval"):
        self.n_actions = n_actions
        self.action_space = [i for i in range(self.n_actions)]
        # For n_actions = 3, action_space is the list [0, 1, 2].
        self.gamma = gamma
        self.mem_size = mem_size
        self.mem_cntr = 0
        self.epsilon = epsilon
        self.batch_size = batch_size
        self.replace_target = replace_target
        self.q_next = DeepQNetwork(alpha, n_actions, input_dims=input_dims,
                                   name='q_next', chkpt_dir=q_next_dir)
        self.q_eval = DeepQNetwork(alpha, n_actions, input_dims=input_dims,
                                   name='q_eval', chkpt_dir=q_eval_dir)
def learn(self):
if self.mem_cntr % self.replace_target == 0:
self.update_graph()
# we update the graph after every K steps, so that the q_target is not fluctuating.
max_mem = self.mem_cntr if self.mem_cntr < self.mem_size else self.mem_size
batch = np.random.choice(max_mem, self.batch_size)
# batch holds batch_size indices sampled from np.arange(max_mem).
state_batch = self.state_memory[batch]
# Shape of the state batch is (batch_size, *input_dims),
# e.g. (32, 210, 160, 4).
action_batch = self.action_memory[batch]
action_values = np.array([0, 1, 2], dtype=np.int8)
action_indices = np.dot(action_batch, action_values)
reward_batch = self.reward_memory[batch]
new_state_batch = self.new_state_memory[batch]
terminal_batch = self.terminal_memory[batch]
q_eval = self.q_eval.sess.run(self.q_eval.Q_values,
feed_dict={self.q_eval.input: state_batch})
# It has shape (batch_size, n_actions).
# This gives the Q values for each action (3 actions here) for the current state batch, using the q_eval network.
q_next = self.q_next.sess.run(self.q_next.Q_values,
feed_dict={self.q_next.input: new_state_batch})
# This gives Q values for the next state using the q_next network.
q_target = q_eval.copy()
idx = np.arange(self.batch_size)
q_target[idx, action_indices] = reward_batch + \
self.gamma*np.max(q_next, axis=1)*terminal_batch
# axis=1 takes the maximum over the columns (the actions) for each row, i.e. for each sample in the batch.
#q_target = np.zeros(self.batch_size)
#q_target = reward_batch + self.gamma*np.max(q_next, axis =1)*terminal_batch
_ = self.q_eval.sess.run(self.q_eval.train_op,
feed_dict={self.q_eval.input: state_batch,
self.q_eval.actions: action_batch,
self.q_eval.q_target: q_target})
loss = self.q_eval.sess.run(self.q_eval.loss,
feed_dict={self.q_eval.input: state_batch,
self.q_eval.actions: action_batch,
self.q_eval.q_target: q_target})
summary = self.q_eval.sess.run(self.q_eval.write_op,
feed_dict={self.q_eval.input: state_batch,
self.q_eval.actions: action_batch,
self.q_eval.q_target: q_target,
self.q_next.input: new_state_batch})
self.q_eval.writer.add_summary(summary, time.time())
self.q_eval.writer.flush()
When I run this code, I can only see the bias of a single layer, namely the dense layer of the q_eval network.
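In case it is relevant, this is the kind of check I would eventually like to perform, to confirm whether the weights actually move between learning steps (only a sketch; it assumes an Agent instance called agent whose replay memory has already been filled, and it reuses the params list collected in __init__):

import numpy as np

# Hypothetical check (not part of the code above): compare q_eval's trainable
# variables before and after one learn() call to see whether any of them change.
before = agent.q_eval.sess.run(agent.q_eval.params)
agent.learn()
after = agent.q_eval.sess.run(agent.q_eval.params)
for var, b, a in zip(agent.q_eval.params, before, after):
    print(var.name, 'changed:', not np.allclose(b, a))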