我正在尝试改写我在这里(原文链接)找到的代码。
我想在文本数据上使用它。我有一个全球数据库和一个本地数据库,其中大约90%的记录可以互相匹配。我想用已匹配的记录训练一个孪生神经网络(Siamese network),然后将它应用于尚未匹配的记录,以便找到可能的匹配。
siamese.py
:
import tensorflow as tf
#flags = tf.app.flags
#FLAGS = flags.FLAGS
def mynet(input, reuse=False,
numFilter = 32,
convWindow = 2,
poolwindow = 2,
poolStrid = 2):
with tf.name_scope("model"):
with tf.variable_scope("conv1") as scope:
net = tf.layers.conv1d(input, numFilter, kernel_size = convWindow,
activation=tf.nn.relu, padding='SAME',reuse=reuse)
net = tf.layers.max_pooling1d(net, poolwindow, strides = poolStrid, padding='valid')
with tf.variable_scope("conv2") as scope:
net = tf.layers.conv1d(net, numFilter, kernel_size = convWindow,
activation=tf.nn.relu, padding='SAME',reuse=reuse)
net = tf.layers.max_pooling1d(net, poolwindow, strides = poolStrid, padding='valid')
<about 10 more of these layers>
net = tf.layers.flatten(net, name = 'flat')
return net
def contrastive_loss(model1, model2, y, margin):
    """Contrastive loss between two batches of embeddings.

    Computes ``mean(y * d**2 + (1 - y) * max(margin - d, 0)**2) / 2`` where
    ``d`` is the Euclidean distance between ``model1`` and ``model2`` taken
    along axis 1.

    Args:
        model1: Embeddings produced by the first tower.
        model2: Embeddings produced by the second tower.
        y: Pair labels. As the formula is written, ``y == 1`` penalises
            distance (the pair is pulled together) and ``y == 0`` penalises
            being closer than ``margin`` (the pair is pushed apart).
            NOTE(review): the distance keeps its reduced axis, so it has a
            trailing dimension of 1; presumably ``y`` is shaped (batch, 1)
            too -- a rank-1 ``y`` would broadcast to (batch, batch). Confirm.
        margin: Distance beyond which a ``y == 0`` pair adds no loss.

    Returns:
        Scalar tensor holding the loss averaged over all elements.
    """
    with tf.name_scope("contrastive-loss"):
        # NOTE(review): `keep_dims` is the older spelling of `keepdims`; it is
        # kept here so the block runs on the TF 1.x version the file targets.
        squared_distance = tf.reduce_sum(
            tf.square(model1 - model2), 1, keep_dims=True)
        # The gradient of sqrt is unbounded at 0, which yields NaN gradients
        # as soon as two embeddings coincide. Flooring the argument keeps the
        # gradient finite (it becomes zero below the floor).
        distance = tf.sqrt(tf.maximum(squared_distance, 1e-12))
        # Use the squared distance directly rather than squaring the root.
        similar_term = y * squared_distance
        dissimilar_term = (1 - y) * tf.square(tf.maximum(margin - distance, 0))
        return tf.reduce_mean(similar_term + dissimilar_term) / 2
train.py
:
<import a dataframe consisting of labels/strings, addresses and countries and company names>
<convert each entry into a 56-element list consisting of numbers that correspond to a bigram dictionary ('aa' = 1, 'ab' = 2, etc.); pad as necessary>
<mash up all the columns, so for every row, I get a single list consisting of those bigrams; something like [1,45,6,0,0,0]>
<split the data into 'left' and 'right', and give pairs a score of 0>
<shuffle up 'left' and randomly pair with 'right', and give pairs a score of 1>
# Build the two towers. The second call passes reuse=True so that it uses the
# convolution variables created by the first call.
# NOTE(review): the module is referenced as `siamese01`, while the model file
# above is labelled siamese.py -- confirm the import matches.
left_output = siamese01.mynet(left, reuse=False,convWindow=convWindow)
right_output = siamese01.mynet(right, reuse=True,convWindow=convWindow)
# NOTE(review): contrastive_loss multiplies the squared distance by `label`,
# so label == 1 pulls a pair together and label == 0 pushes it apart. The
# data-preparation notes above give true pairs a score of 0 and randomly
# shuffled pairs a score of 1 -- confirm the labels are not inverted.
loss = siamese01.contrastive_loss(left_output, right_output, label, margin)
# Step counter incremented by the optimizer; excluded from training.
global_step = tf.Variable(0, trainable=False)
train_step = tf.train.MomentumOptimizer(0.01, 0.99, use_nesterov=True).minimize(loss, global_step=global_step)
saver = tf.train.Saver()
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
# Set up TensorBoard: scalar summaries for the step counter and the loss.
#setup tensorboard
tf.summary.scalar('step', global_step)
tf.summary.scalar('loss', loss)
# for var in tf.trainable_variables():
# tf.summary.histogram(var.op.name, var)
merged = tf.summary.merge_all()
writer = tf.summary.FileWriter('train.log', sess.graph)
# Training loop: fetch a batch, run one optimizer step, log summary and loss.
#train iter
for i in range(train_iter):
b_l, b_r, b_sim = getDummydata(<helper function to provide nicely shaped data>)
FD = {left:b_l, right:b_r, label: b_sim}
_, l, summary_str = sess.run([train_step, loss, merged],feed_dict=FD)
writer.add_summary(summary_str, i)
print("\r#%d - Loss"%i, l)
# NOTE(review): the next statement is truncated in the source (no right-hand
# side).
b_l, b_r, b_sim =
# NOTE(review): indentation was lost, so it is unclear whether this save runs
# inside the loop or once after it -- confirm.
saver.save(sess, "model/model.ckpt") #save every epoch
当我运行时,我得到了
#0 - Loss 11.6008835
#0 - Loss 21.896631
#0 - Loss 0.19516087
#0 - Loss 0.6260054
#0 - Loss 1.6012161
#0 - Loss 2.767976
...
#0 - Loss 0.010000003
#0 - Loss 0.010000003
#0 - Loss 0.010000003
#0 - Loss 0.010000002
所以网络似乎正在收敛。
但是,当我将它应用于测试集中随机抽取的样本时,每个样本得到的输出都完全相同。也许是所有卷积滤波器的权重都变成了零?
为什么会这样?我可以尝试什么来解决它? 10个卷积够吗?
有些列只有两三个字符,但我把每一列都转换成了56个元素的列表。这样的填充是不是太多了?
我有大约6000个训练样本。我是否需要更多数据?需要多少?60k,还是600k?
答案 0 :(得分:0)
假设数据的表示形式合适(独热编码或词嵌入),我怀疑问题出在填充方式上(与其对每一列分别填充,不如先把所有列拼接起来再统一填充),或者是网络太深;建议先尝试一个更简单的网络作为基线。