我目前正在尝试受HiCE(https://github.com/acbull/HiCE)的启发,在Keras中建立一个模型:基本上是输入一组句子,先用一个自注意力(self-attention)模块为每个句子创建(基于单词的)句子表示,然后再用另一个自注意力模块把各个句子的表示组合起来(目标是输出一个词嵌入,用来估计词表外(OOV)单词的向量)。但是,我的模型表现非常差。经过一番调查,我发现问题出在第二个注意力模块上,但我始终找不出具体原因。有谁知道问题可能出在哪里吗?我使用的是这个Transformer实现来做注意力计算:https://github.com/CyberZHG/keras-transformer
这是我的模型的(简化)版本:
context_encoder:
word_embs = np.zeros((self.vocab_size+1, self.word_emb_dim)) #MASK
largest_ind = 0
for word in self.word2id_dict:
ind = self.word2id_dict[word]
#print(ind)
if ind > largest_ind:
largest_ind = ind
w_emb = self.word_embedding_model.wv[word]
word_embs[ind] = w_emb
emb_layer = EmbeddingRet(
input_dim=self.vocab_size+1,
output_dim=self.word_emb_dim,
mask_zero=True,
weights=[word_embs],
trainable=False,
name='Encoder-Token-Embedding',
)
emb = emb_layer(context)
emb2 = emb[0]
encoder_embed = emb2 #Removed positional embedding instead!
sa_context_encoder = transformer.get_encoders(self.encoder_num, encoder_embed, self.head_num, self.hidden_dim)
context_emb = SumInternal()(sa_context_encoder)
context_encoder = Model(inputs=[context], outputs=context_emb)
context_encoder.summary()
return context_encoder
完整模型:
contexts = Input(shape=(self.max_num_context, self.max_num_words_per_context, ), dtype=tf.int64)
context_embs = TimeDistributed(self.context_encoder)(contexts)
context_embs = ZeroVectorMasker()(context_embs) #added to rebuild mask
aggregator_encoder_out = transformer.get_encoders(self.encoder_num, context_embs, self.head_num, self.hidden_dim)
final_estimate = MeanInternal()(aggregator_encoder_out)
model = Model(inputs=[contexts], outputs=final_estimate)
model.compile(loss='mean_squared_error', optimizer='adam', metrics=[])
model.summary()
return model
自定义图层:
class SumInternal(Layer):
    """Sum the vectors of a sequence along axis 1, skipping masked steps.

    The layer consumes the incoming Keras mask and does not propagate one,
    so downstream layers see an unmasked tensor.
    """

    def __init__(self, **kwargs):
        super(SumInternal, self).__init__(**kwargs)

    def build(self, input_shape):
        # No trainable weights; only mark the layer as built.
        super(SumInternal, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The step axis is collapsed by call(), so there is nothing to mask.
        return None

    def call(self, x, mask=None):
        """Return the sum of `x` over axis 1.

        Args:
            x: input tensor; axis 1 is reduced (assumed (batch, steps, dim),
                consistent with compute_output_shape).
            mask: optional mask over the first two axes of `x`; steps whose
                mask value is False/0 do not contribute. When no mask
                reaches this layer, every step is summed.
        """
        if mask is not None:
            # Broadcast the mask over the feature axis and zero out the
            # masked steps before reducing.
            step_weights = tf.expand_dims(K.cast(mask, dtype="float32"), -1)
            x = x * step_weights
        return K.sum(x, axis=1, keepdims=False)

    def compute_output_shape(self, input_shape):
        # Axis 1 is reduced away: (batch, steps, dim) -> (batch, dim).
        return (input_shape[0], input_shape[2])
class MeanInternal(Layer):
    """Average the vectors of a sequence along axis 1, skipping masked steps.

    The layer consumes the incoming Keras mask and does not propagate one.
    """

    def __init__(self, **kwargs):
        super(MeanInternal, self).__init__(**kwargs)

    def build(self, input_shape):
        # No trainable weights; only mark the layer as built.
        super(MeanInternal, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # The step axis is collapsed by call(), so there is nothing to mask.
        return None

    def call(self, x, mask=None):
        """Return the mean of `x` over the unmasked steps of axis 1.

        Args:
            x: input tensor; axis 1 is reduced (assumed (batch, steps, dim),
                consistent with compute_output_shape).
            mask: optional mask over the first two axes of `x`. When no mask
                reaches this layer, a plain mean over axis 1 is returned.

        Returns:
            Tensor with axis 1 removed. A sample whose steps are all masked
            yields a zero vector instead of NaN.
        """
        if mask is None:
            return K.mean(x, axis=1, keepdims=False)

        step_weights = tf.expand_dims(K.cast(mask, dtype="float32"), -1)
        masked_vecs = x * step_weights
        total = K.sum(masked_vecs, axis=1, keepdims=False)

        valid_steps = tf.count_nonzero(step_weights, axis=1, dtype="float32")
        # Counts are whole numbers, so clamping at 1 only affects the
        # all-masked case, where 0/0 would otherwise put NaN into the loss.
        valid_steps = K.maximum(valid_steps, 1.0)
        return total / valid_steps

    def compute_output_shape(self, input_shape):
        # Axis 1 is reduced away: (batch, steps, dim) -> (batch, dim).
        return (input_shape[0], input_shape[2])
class ZeroVectorMasker(Layer):
    """Pass the input through unchanged and attach a mask for zero vectors.

    The generated mask is True for every position along axis 1 whose vector
    (reduced over axis 2) contains at least one non-zero entry, and False
    where the vector is entirely zero. Any incoming mask is ignored.
    """

    def __init__(self, **kwargs):
        super(ZeroVectorMasker, self).__init__(**kwargs)

    def build(self, input_shape):
        # No trainable weights; only mark the layer as built.
        super(ZeroVectorMasker, self).build(input_shape)

    def compute_mask(self, inputs, mask=None):
        # A position is kept when its vector has any non-zero component.
        # NOTE(review): a legitimately encoded vector that happens to be
        # exactly zero would be masked too -- confirm this is acceptable.
        nonzero_per_vector = tf.count_nonzero(inputs, axis=2)
        return tf.not_equal(nonzero_per_vector, 0)

    def call(self, x, mask=None):
        # Identity on the data; the new mask comes from compute_mask().
        return x

    def compute_output_shape(self, input_shape):
        return input_shape