下面两段示例代码是否会导致不同的训练行为(无论使用何种损失函数或优化器)?
# first code: one convolution applied to both batches concatenated along
# the batch axis.
inputs1 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
full_inputs = tf.concat([inputs1, inputs2], axis=0)
with tf.variable_scope('convnet'):
    # kernel_size / num_outputs / stride are keyword arguments of the
    # tf.contrib.layers.conv2d layer (which creates its own weights);
    # tf.nn.conv2d takes an explicit filter tensor and rejects them.
    # The layer is fed the concatenated batch, not an undefined `inputs`.
    outputs = tf.contrib.layers.conv2d(
        full_inputs, kernel_size=[3, 3], num_outputs=1, stride=[1, 1],
        padding='VALID', data_format='NCHW')
# second code: the same convolution applied to each batch separately with
# shared weights, then the two results concatenated along the batch axis.
inputs1 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
inputs2 = tf.placeholder(shape=[16, 1, 32, 32], dtype=tf.float32)
# Not consumed below; kept so both snippets define the same names.
full_inputs = tf.concat([inputs1, inputs2], axis=0)
with tf.variable_scope('convnet'):
    # kernel_size / num_outputs / stride are keyword arguments of the
    # tf.contrib.layers.conv2d layer; tf.nn.conv2d rejects them.
    outputs1 = tf.contrib.layers.conv2d(
        inputs1, kernel_size=[3, 3], num_outputs=1, stride=[1, 1],
        padding='VALID', data_format='NCHW')
with tf.variable_scope('convnet', reuse=True):
    # reuse=True makes this call pick up the weights created just above.
    outputs2 = tf.contrib.layers.conv2d(
        inputs2, kernel_size=[3, 3], num_outputs=1, stride=[1, 1],
        padding='VALID', data_format='NCHW')
outputs = tf.concat([outputs1, outputs2], axis=0)
我正在尝试实现虚拟批量归一化(Virtual Batch Normalization)。我有两个实现,它们都大致借鉴了 improved-gan 仓库,但行为并不一致。这里展示的两个实现都经过了简化,只保留它们之间的不同之处。
@add_arg_scope
def vbn_single(x, epsilon=1e-5, scope=None):
    """Virtual batch norm using the first half of `x` as the reference batch.

    Mean and variance are computed over axes [0, 2, 3] of the first half of
    the batch and are then used to normalize the whole batch. A learned
    per-channel (axis 1) scale `gamma` and offset `beta` are applied.

    Args:
        x: Tensor of static rank 1, 2 or 4. Rank-1 and rank-2 inputs are
            expanded to rank 4 with singleton trailing dimensions for the
            computation.
        epsilon: Python float added to the variance before `tf.rsqrt`.
        scope: Optional variable scope name; 'VBN' is used as the default
            name when it is None.

    Returns:
        The normalized tensor, reshaped back to the rank of the input.

    Raises:
        AssertionError: if `epsilon` is not a float or the rank of `x` is
            not 1, 2 or 4.
    """
    assert isinstance(epsilon, float)
    orig_shape = x.get_shape().as_list()
    orig_rank = len(orig_shape)

    # Bring the input to rank 4. -1 keeps an unknown batch dimension
    # working, and the trailing dimensions are 1 (a 0 would describe an
    # empty tensor and make the reshape fail).
    if orig_rank == 2:
        x = tf.reshape(x, [-1, orig_shape[1], 1, 1])
    elif orig_rank == 1:
        x = tf.reshape(x, [-1, 1, 1, 1])
    elif orig_rank != 4:
        # Raised explicitly so the check survives `python -O`.
        raise AssertionError(orig_shape)
    num_channels = x.get_shape().as_list()[1]

    if orig_shape[0] is None:
        # The static batch size is unknown: compute the split at run time.
        half_size = tf.shape(x)[0] // 2
    else:
        half_size = orig_shape[0] // 2

    with tf.variable_scope(scope, 'VBN'):
        # -1 in `size` means "everything remaining along this axis".
        ref_half = tf.slice(x, [0, 0, 0, 0], [half_size, -1, -1, -1])
        gamma = tf.get_variable("gamma", [1, num_channels, 1, 1],
                                initializer=tf.constant_initializer(1.))
        beta = tf.get_variable("beta", [1, num_channels, 1, 1],
                               initializer=tf.constant_initializer(0.))
        ref_mean, ref_var = tf.nn.moments(ref_half, [0, 2, 3],
                                          keep_dims=True)
        inv_std = tf.rsqrt(ref_var + epsilon)
        coeff = inv_std * gamma
        out = (x * coeff) + (beta - ref_mean * coeff)

    if orig_rank != 4:
        # Undo the temporary expansion so callers get their rank back.
        out = tf.reshape(out, [-1] + orig_shape[1:])
    return out
# First implementation: the reference batch and the real inputs go through
# the network together as one concatenated batch.
inputs = tf.placeholder(shape=[32, 1, 256, 256], dtype=tf.float32)
# NOTE(review): tf.get_variable creates a trainable variable by default;
# confirm whether the reference batch is really meant to be trained.
reference_batch = tf.get_variable('reference_batch', initializer=reference_array)
# Reference examples come first: vbn_single takes its statistics from the
# first half of the batch.
full_inputs = tf.concat([reference_batch, inputs], axis=0)
L = []
with tf.variable_scope('convnet'):
    # Feed the concatenated batch; feeding `inputs` alone would leave the
    # reference batch out of the graph entirely.
    L.append(tf.contrib.layers.conv2d(full_inputs, [...],
                                      scope='Layer0'))
    L.append(vbn_single(L[-1], scope='Norm0'))
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer1'))
    L.append(vbn_single(L[-1], scope='Norm1'))
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer2'))
    L.append(vbn_single(L[-1], scope='Norm2'))
    L.append(tf.nn.relu(L[-1], name='Activ2'))
# Drop the reference half and keep only the activations of the real inputs.
shape = L[-1].get_shape().as_list()
half_size = shape[0] // 2
L.append(tf.slice(L[-1], [half_size, 0, 0, 0], [-1, -1, -1, -1]))
# Average over the two trailing axes, then classify.
L.append(tf.reduce_mean(L[-1], axis=[2, 3]))
L.append(tf.contrib.layers.fully_connected(L[-1], num_outputs=2))
# loss accuracy and optimizer
一切似乎都运行良好:验证和训练的准确率都在收敛,损失也在下降。
class Vbn_double(object):
    """Virtual batch normalization with an explicitly separate reference batch.

    The constructor receives the reference-batch activations, computes their
    mean and inverse standard deviation once, and exposes the normalized
    reference activations as `reference_output`. Calling the instance
    normalizes another tensor with those same statistics and the same
    `gamma` / `beta` variables.

    Args:
        x: Reference-batch tensor of static rank 1, 2 or 4.
        epsilon: Value added to the variance before `tf.rsqrt`.
        scope: Optional variable scope name; 'VBN' is used as the default
            name when it is None.
        data_format: 'NCHW' (default, channels on axis 1); any other value
            is treated as channels-last.

    Raises:
        AssertionError: if the rank of `x` is not 1, 2 or 4.
    """

    def __init__(self, x, epsilon=1e-5, scope=None, data_format='NCHW'):
        self.epsilon = epsilon
        self.scope = scope
        # Stored first: the reshape helper and _normalize both read it.
        self.data_format = data_format
        x, orig_shape = self._to_4d(x)
        # int, or None when the batch size is not statically known.
        self.batch_size = x.get_shape().as_list()[0]
        with tf.variable_scope(scope, 'VBN') as var_scope:
            # Keep the scope object itself so __call__ re-enters exactly
            # this scope, even when `scope` is None and the name was
            # generated, or when the caller is in a different outer scope.
            self._var_scope = var_scope
            self.mean, self.var = tf.nn.moments(x, self._reduce_axes(),
                                                keep_dims=True)
            self.inv_std = tf.rsqrt(self.var + epsilon)
            out = self._normalize(x, self.mean, self.inv_std)
        if orig_shape is not None:
            out = tf.reshape(out, [-1] + orig_shape[1:])
        self.reference_output = out

    def __call__(self, x):
        """Normalize `x` with the reference statistics and shared variables."""
        x, orig_shape = self._to_4d(x)
        with tf.variable_scope(self._var_scope, reuse=True):
            out = self._normalize(x, self.mean, self.inv_std)
        if orig_shape is not None:
            out = tf.reshape(out, [-1] + orig_shape[1:])
        return out

    def _reduce_axes(self):
        """Return the axes to average over: every axis except the channel."""
        return [0, 2, 3] if self.data_format == 'NCHW' else [0, 1, 2]

    def _to_4d(self, x):
        """Return `(x as rank 4, original shape)`; shape is None if unchanged."""
        shape = x.get_shape().as_list()
        rank = len(shape)
        if rank == 4:
            return x, None
        # -1 keeps an unknown batch size working; the added dimensions are
        # 1 (a 0 would describe an empty tensor and make the reshape fail).
        if rank == 2:
            if self.data_format == 'NCHW':
                new_shape = [-1, shape[1], 1, 1]
            else:
                new_shape = [-1, 1, 1, shape[1]]
        elif rank == 1:
            new_shape = [-1, 1, 1, 1]
        else:
            # Raised explicitly so the check survives `python -O`.
            raise AssertionError(shape)
        return tf.reshape(x, new_shape), shape

    def _normalize(self, x, mean, inv_std):
        """Apply `gamma * (x - mean) * inv_std + beta` with per-channel params."""
        shape = x.get_shape().as_list()
        assert len(shape) == 4
        if self.data_format == 'NCHW':
            param_shape = [1, shape[1], 1, 1]
        else:
            param_shape = [1, 1, 1, shape[3]]
        gamma = tf.get_variable("gamma", param_shape,
                                initializer=tf.constant_initializer(1.))
        beta = tf.get_variable("beta", param_shape,
                               initializer=tf.constant_initializer(0.))
        coeff = gamma * inv_std
        return (x * coeff) + (beta - mean * coeff)
# Second implementation: the reference batch goes through the network
# first to build the per-layer statistics, then the real inputs go through
# the same layers (reuse=True) and are normalized with those statistics.
inputs = tf.placeholder(shape=[32, 1, 256, 256], dtype=tf.float32)
reference_batch = tf.get_variable('reference_batch', initializer=reference_array)
L = []
vbn = {}
with tf.variable_scope('convnet'):
    # Reference pass: creates the layer weights and the VBN statistics.
    L.append(tf.contrib.layers.conv2d(reference_batch, [...],
                                      scope='Layer0'))
    vbn['Norm0'] = Vbn_double(L[-1], scope='Norm0')
    L.append(vbn['Norm0'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer1'))
    vbn['Norm1'] = Vbn_double(L[-1], scope='Norm1')
    L.append(vbn['Norm1'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer2'))
    vbn['Norm2'] = Vbn_double(L[-1], scope='Norm2')
    L.append(vbn['Norm2'].reference_output)
    L.append(tf.nn.relu(L[-1], name='Activ2'))
with tf.variable_scope('convnet', reuse=True):
    # Input pass: same weights, normalized with the reference statistics.
    L.append(tf.contrib.layers.conv2d(inputs, [...],
                                      scope='Layer0'))
    L.append(vbn['Norm0'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ0'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer1'))
    L.append(vbn['Norm1'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ1'))
    L.append(tf.contrib.layers.conv2d(L[-1], [...],
                                      scope='Layer2'))
    L.append(vbn['Norm2'](L[-1]))
    L.append(tf.nn.relu(L[-1], name='Activ2'))
# Outside the reuse=True scope: the fully connected layer has to create
# its variables, which a reusing scope would refuse to do.
L.append(tf.reduce_mean(L[-1], axis=[2, 3]))
L.append(tf.contrib.layers.fully_connected(L[-1], num_outputs=2))
# loss accuracy and optimizer
在这个实现中,只有训练过程收敛(但曲线与第一个实现略有不同),而验证损失不断上升,验证准确率始终停留在随机猜测的水平。
补充一个细节:我使用 GPU,TensorFlow 版本为 1.2.1,并启用了 XLA。有人知道我哪里做错了吗?
于是我尝试比较两个模型的输出,并查看梯度(使用 compute_gradients)。为了避免权重(以及梯度)被共享,我在两个不同的变量作用域中分别构建模型,并分别加载同一组权重(来自之前训练好的模型)。
如果我运行下面的代码,两个模型的输出是相同的:
# Fetch only the outputs of the two models in a single run call.
sess.run([model.outputs, model2.outputs])
但如果我同时按如下方式获取梯度(即 Optimizer.compute_gradients(loss) 返回的每个元组的第一个元素):
# Fetch both model outputs together with the gradient tensors in one run call.
sess.run([model.outputs, model2.outputs, grads])
模型的输出突然就不同了……在没有调用 apply_gradients 的情况下,仅仅是查看梯度怎么会改变模型输出?而且权重似乎也没有被改变,因为如果我运行:
# Evaluate the gradients first, then fetch the two model outputs in a
# separate run call.
sess.run(grads)
# The second fetch is the output of the second model (`model2.outputs`),
# matching the other run calls; `model.outputs2` was a typo.
sess.run([model.outputs, model2.outputs])
两个模型的输出仍然相同……
答案 0 :(得分:0)
好的,看来问题出在 XLA 上:禁用 XLA 之后,我得到了一致的结果。XLA 似乎无法正确处理第二个实现中的某些操作……
我稍后会在仓库中提交一个 issue,`compute_gradients` 会改变输出这一点尤其令人不安……