我试图了解如何计算批次损失。我已经将DQN建模如下
class DQN:
def __init__(self, session, state_dim, action_dim, lr, nodes):
self.sess = session
self.s_dim = state_dim
self.a_dim = action_dim
self.learning_rate = lr
self.nodes = nodes
self.state = tf.placeholder("float", [None, self.s_dim], name="state_batch")
# one-hot encoded action
self.action = tf.placeholder("float", [None, self.a_dim], name="action_batch")
self.predicted_q_value = tf.placeholder("float", [None, 1], name="prediction_batch")
self.q_out = self.create_network()
self.loss = tf.reduce_mean(tf.square(self.predicted_q_value - tf.reduce_sum(self.q_out * self.action)))
self.optimize = tf.train.AdamOptimizer(self.learning_rate).minimize(self.loss)
def create_network(self):
h0 = tf.layers.dense(inputs=self.state, units=self.nodes, activation=tf.nn.relu)
h1 = tf.layers.dense(inputs=h0, units=self.nodes, activation=tf.nn.relu)
out = tf.layers.dense(inputs=h1, units=self.a_dim, activation=None)
return out
def train(self, state, action, predicted_q_value):
return self.sess.run([self.loss, self.optimize], feed_dict={
self.state: state,
self.action: action,
self.predicted_q_value: predicted_q_value
})
def predict(self, state):
return self.sess.run(self.q_out, feed_dict={
self.state: state
})
根据我的理解,损失将是批处理数据损失的平均值。但是,我看到总损失值正乘以批次大小的平方。
sess = tf.Session()
nw = DQN(sess, 3, 3, 0.0001, 64)
sess.run(tf.global_variables_initializer())
# batch size is 1
state_ip = [[1, 1, 1]]
action_ip = [[0, 1, 0]]
pred_val = [[0]]
print(nw.predict(state_ip))
loss, _ = nw.train(state_ip, action_ip, pred_val)
print(loss)
[[ 0.11640665 0.10434964 -0.31503427]]
0.010888848 # loss is as expected = (0 - 0.10434964)^2
如果我传递的批量大小为2的数据具有完全相同的值
state_ip = [[1, 1, 1], [1, 1, 1]]
action_ip = [[0, 1, 0], [0, 1, 0]]
pred_val = [[0], [0]]
print(nw.predict(state_ip))
loss, _ = nw.train(state_ip, action_ip, pred_val)
print(loss)
[[-0.28207895 -0.15026638 -0.0181574 ]
[-0.28207895 -0.15026638 -0.0181574 ]]
0.09031994 # loss = (0 - -0.15026638)^2 * 2^2
由于我使用tf.reduce_mean进行了损失,所以我期望损失是批处理数据损失的平均值。为什么要乘以批次大小的平方?我在这里缺少基本的东西吗?
答案 0 :(得分:0)
您的错误在于如何计算损耗,特别是tf.reduce_sum(self.q_out * self.action))
计算整个张量的全局值。逐步:
self.q_out * self.action
给您[[0,-0.15026638,0] [0,-0.15026638,0]]
tf.reduce_sum
给出2*-0.15026638=-0.30053276
0.30053276**2=0.09031994
您可能已经意识到,错误是在第2步中,因为您希望获得[-0.15026638,-0.15026638]
作为输出,而这可以通过axis
参数来实现。因此,计算损失的正确方法是:
self.loss = tf.reduce_mean(tf.square(
self.predicted_q_value - tf.reduce_sum(self.q_out * self.action, axis=1)
))