I am trying to train the VGGish model released by Google as an autoencoder. This is the architecture of the autoencoder:
# TF 1.x / contrib-style layers, as used by the VGGish release.
import tensorflow as tf
import tensorflow.contrib.layers as lays
import tensorflow.contrib.slim as slim

EMBEDDING_SIZE = 128  # bottleneck size; 128 matches the released VGGish embedding

# Input placeholder for the 96x64 patches, single channel.
inputs = tf.placeholder(tf.float32, shape=[None, 96, 64, 1], name='input_features')
net = inputs
print('Input {0}'.format(net))

with tf.variable_scope('Encoder'):
    with tf.variable_scope('Stage1'):
        net = lays.conv2d(net, 64, [3, 3], stride=2, padding='SAME')
        print('Stage1 {0}'.format(net))
    with tf.variable_scope('Stage2'):
        net = lays.conv2d(net, 128, [3, 3], stride=2, padding='SAME')
        print('Stage2 {0}'.format(net))
    with tf.variable_scope('Stage3'):
        net = lays.conv2d(net, 256, [3, 3], stride=1, padding='SAME')
        print('Stage3 {0}'.format(net))
        net = lays.conv2d(net, 256, [3, 3], stride=2, padding='SAME')
        print('Stage3 {0}'.format(net))
    with tf.variable_scope('Stage4'):
        net = lays.conv2d(net, 512, [3, 3], stride=1, padding='SAME')
        print('Stage4 {0}'.format(net))
        net = lays.conv2d(net, 512, [3, 3], stride=2, padding='SAME')
        print('Stage4 {0}'.format(net))
    with tf.variable_scope('Stage5'):
        net = slim.flatten(net)
        print('Stage5 {0}'.format(net))
        net = lays.fully_connected(net, 4096, scope='fc1_1')
        print('Stage5 {0}'.format(net))
        net = lays.fully_connected(net, 4096, scope='fc1_2')
        print('Stage5 {0}'.format(net))

with tf.variable_scope('EMBEDDING'):
    net = lays.fully_connected(net, EMBEDDING_SIZE, scope='fc2')
    print('EMBEDDING {0}'.format(net))

with tf.variable_scope('Decoder'):
    with tf.variable_scope('Stage5d'):
        net = lays.fully_connected(net, 4096, scope='fc1_2d')
        print('Stage5d {0}'.format(net))
        net = lays.fully_connected(net, 12288, scope='fc1_1d')  # 12288 = 6 * 4 * 512
        print('Stage5d {0}'.format(net))
        net = tf.reshape(net, [-1, 6, 4, 512])
        print('Stage5d {0}'.format(net))
    with tf.variable_scope('Stage4d'):
        net = lays.conv2d_transpose(net, 512, [3, 3], stride=1, padding='SAME')
        print('Stage4d {0}'.format(net))
        net = lays.conv2d_transpose(net, 256, [3, 3], stride=2, padding='SAME')
        print('Stage4d {0}'.format(net))
    with tf.variable_scope('Stage3d'):
        net = lays.conv2d_transpose(net, 256, [3, 3], stride=1, padding='SAME')
        print('Stage3d {0}'.format(net))
        net = lays.conv2d_transpose(net, 128, [3, 3], stride=2, padding='SAME')
        print('Stage3d {0}'.format(net))
    with tf.variable_scope('Stage2d'):
        net = lays.conv2d_transpose(net, 64, [3, 3], stride=2, padding='SAME')
        print('Stage2d {0}'.format(net))
    with tf.variable_scope('Stage1d'):
        net = lays.conv2d_transpose(net, 1, [3, 3], stride=2, padding='SAME', activation_fn=tf.nn.tanh)
        print('Stage1d {0}'.format(net))
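With a 96×64×1 input, the four stride-2 convolutions in the encoder bring the spatial dimensions down to 6×4 with 512 channels, which is why fc1_1d in the decoder outputs 12288 units (6 × 4 × 512) before the reshape to [-1, 6, 4, 512]; the four stride-2 transposed convolutions then upsample back to 96×64×1.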
I use the UrbanSound dataset and generate 96 × 64 MFCC features as input.
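For context, here is a minimal sketch of how 96×64 patches can be produced with the vggish_input module that ships with the VGGish release; it yields log-mel examples rather than MFCCs, so treat it only as an illustration of the framing, not my exact feature code (the file path is just a placeholder):

import numpy as np
import vggish_input  # part of the VGGish release (models/research/audioset)

# Converts one WAV file into a batch of 96x64 patches (96 frames x 64 bands),
# returned as an array of shape [num_examples, 96, 64].
examples = vggish_input.wavfile_to_examples('some_urbansound_clip.wav')  # placeholder path

# Add the channel dimension expected by the conv layers above.
batch = examples[:, :, :, np.newaxis].astype(np.float32)
print(batch.shape)  # (num_examples, 96, 64, 1)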
Here is how the cost evolves during training; it does not seem to converge to anything.
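The cost being plotted is the reconstruction error between the input patch and the decoder output; schematically it looks like the sketch below (MSE and Adam are assumed here for illustration, the exact loss/optimizer settings are not the point):

# Minimal training sketch, building on the graph above (assumed MSE loss + Adam).
loss = tf.reduce_mean(tf.square(net - inputs))  # net = Stage1d output, inputs = placeholder above
train_op = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(loss)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # for each mini-batch `batch` of [N, 96, 64, 1] patches:
    #     _, cost = sess.run([train_op, loss], feed_dict={inputs: batch})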
My question is: is it a good idea to start by training the VGGish model as an autoencoder in order to produce an embedding bottleneck?