I implemented a custom layer called "MultiHeadAttention". When I try to use it, training fails with:
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [128] vs. [128,256,256]
... (omitted) ... (training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape, training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
MultiHeadAttention code:
import tensorflow as tf
import keras.backend as K
from keras.layers import Layer


class MultiHeadAttention(Layer):
    def __init__(self, n_head: int, model_dim: int, **kwargs):
        self.n_head = n_head
        self.model_dim = model_dim
        self.dim_per_head = model_dim // n_head
        super(MultiHeadAttention, self).__init__(**kwargs)

    def build(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        self.query_kernel = self.add_weight(name='query_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)
        self.key_kernel = self.add_weight(name='key_kernel',
                                          shape=(input_shape[2], self.dim_per_head * self.n_head),
                                          initializer='uniform', trainable=True)
        self.value_kernel = self.add_weight(name='value_kernel',
                                            shape=(input_shape[2], self.dim_per_head * self.n_head),
                                            initializer='uniform', trainable=True)
        self.output_kernel = self.add_weight(name='output_kernel',
                                             shape=(self.dim_per_head * self.n_head, self.model_dim),
                                             initializer='uniform', trainable=True)
        self.output_bias = self.add_weight(name='output_bias',
                                           shape=(self.model_dim,),
                                           initializer='zeros', trainable=True)
        super(MultiHeadAttention, self).build(input_shape)

    def call(self, x):
        if isinstance(x, list):
            attn, attn_mask = x
            attn_mask = K.repeat_elements(attn_mask, self.n_head, 0)
        else:
            attn = x
            attn_mask = None
        query_big = K.dot(attn, self.query_kernel)
        key_big = K.dot(attn, self.key_kernel)
        value_big = K.dot(attn, self.value_kernel)  # batch, seq_len, hid * n_head

        def reshape1(x):
            s = list(x.shape)
            x = K.reshape(x, [-1, s[1], self.n_head, s[2] // self.n_head])
            x = K.permute_dimensions(x, [2, 0, 1, 3])
            x = K.reshape(x, [-1, s[1], s[2] // self.n_head])
            return x

        query_big = reshape1(query_big)
        key_big = reshape1(key_big)
        value_big = reshape1(value_big)
        # print(value_big.shape)
        result = scale_dot_product(query_big, key_big, value_big, attn_mask)  # n_head * batch, seq_len, hid

        def reshape2(x):
            s = list(x.shape)  # [n_head * batch_size, len_v, d_v]
            x = K.reshape(x, [self.n_head, -1, s[1], s[2]])
            x = K.permute_dimensions(x, [1, 2, 0, 3])
            x = K.reshape(x, [-1, s[1], self.n_head * s[2]])  # [batch_size, len_v, n_head * d_v]
            return x

        result = reshape2(result)
        result = K.dot(result, self.output_kernel) + self.output_bias
        return result

    def compute_output_shape(self, input_shape):
        if isinstance(input_shape, list):
            input_shape = input_shape[0]
        return (input_shape[0], input_shape[1], self.model_dim)

    def compute_mask(self, inputs, mask=None):
        return None
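For orientation, the following NumPy sketch traces what reshape1 does to one projection. The sizes are hypothetical (batch_size=16, seq_len=256, n_head=8, dim_per_head=25, i.e. model_dim=200 as in the example further down) and the snippet is illustrative only, not part of the layer:

import numpy as np

# Hypothetical sizes matching the example script below.
batch_size, seq_len, n_head, dim_per_head = 16, 256, 8, 25

x = np.random.rand(batch_size, seq_len, n_head * dim_per_head)  # output of K.dot(attn, kernel)
x = x.reshape(-1, seq_len, n_head, dim_per_head)                # (16, 256, 8, 25): split off the head axis
x = x.transpose(2, 0, 1, 3)                                     # (8, 16, 256, 25): heads first
x = x.reshape(-1, seq_len, dim_per_head)                        # (128, 256, 25): one slice per (head, sample)
print(x.shape)

reshape2 performs the inverse, folding the n_head leading groups back into the feature axis to give (batch_size, seq_len, n_head * dim_per_head).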
# Assumed constant: neg_inf is referenced below but not defined in the original snippet.
neg_inf = -1e9


def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    shape_list = list(value.shape)
    mul = K.batch_dot(query, K.permute_dimensions(key, (0, 2, 1)))
    if attn_mask is not None:
        attn_mask = K.cast(attn_mask, dtype=tf.float32)
        mul = attn_mask * mul + (1.0 - attn_mask) * neg_inf
    scale = mul / K.sqrt(K.cast(shape_list[-1], mul.dtype))
    softmax = K.softmax(scale)
    result = K.batch_dot(softmax, value)
    return result
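To see where the reported InvalidArgumentError comes from, here is a NumPy sketch of the shapes that reach the masking line mul = attn_mask * mul + .... The sizes are taken from the error message (batch_size=16, n_head=8, seq_len=256, so n_head * batch_size = 128) and the snippet is illustrative only:

import numpy as np

batch_size, n_head, seq_len = 16, 8, 256

# K.any(K.not_equal(t, 0), axis=-1) collapses the sequence axis, so the mask that
# reaches the layer has shape (batch_size,) rather than (batch_size, seq_len).
attn_mask = np.ones(batch_size)                          # (16,)
attn_mask = np.repeat(attn_mask, n_head, axis=0)         # (128,) after K.repeat_elements(..., n_head, 0)

# mul = K.batch_dot(query, key^T) has shape (n_head * batch_size, seq_len, seq_len).
mul = np.ones((n_head * batch_size, seq_len, seq_len))   # (128, 256, 256)

try:
    _ = attn_mask * mul   # broadcasting aligns 128 against the trailing 256 and fails
except ValueError as err:
    print(err)            # analogous to "Incompatible shapes: [128] vs. [128,256,256]"

This mismatch is what the answer below addresses.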
A minimal example:
import numpy as np
import keras.backend as K
from keras.optimizers import SGD
from keras import Input, Model, losses
from keras.layers import Embedding, Lambda, Dense
from MultiHeadAttention import MultiHeadAttention  # the custom layer above (module name assumed)

if __name__ == "__main__":
    max_len = 256
    word_dim = 200
    vacab_size = 10000
    input = Input(shape=(max_len,), name="Input-Sentence")
    word_embedding = Embedding(vacab_size, word_dim, input_length=max_len,
                               mask_zero=False, trainable=True)(input)
    inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1), name="Input_mask")(input)
    out = word_embedding
    # Something is wrong with the custom MultiHeadAttention layer.
    # If the line below is commented out, everything works.
    out = MultiHeadAttention(n_head=8, model_dim=word_dim)([out, inp_mask])
    out = Dense(2, activation="softmax")(out)
    model = Model(inputs=input, outputs=out)
    model.summary()
    model.compile(optimizer=SGD(), loss=losses.sparse_categorical_crossentropy)
    # example data
    data_num = 1024
    x = np.array(np.random.randint(0, vacab_size, (data_num, max_len)).tolist())
    y = np.array(np.random.randint(0, 2, (data_num, max_len, 1)).tolist())
    print(x.shape, y.shape)
    model.fit(x, y, epochs=24, batch_size=16)
keras == 2.2.4, tf == 1.13.1. Error message:
Traceback (most recent call last):
  File "D:\PyCharm Community Edition 2018.1.4\helpers\pydev\pydev_run_in_console.py", line 52, in run_file
    pydev_imports.execfile(file, globals, locals)  # execute the script
  File "D:\PyCharm Community Edition 2018.1.4\helpers\pydev\_pydev_imps\_pydev_execfile.py", line 18, in execfile
    exec(compile(contents + "\n", file, 'exec'), glob, loc)
  File "C:/Users/Administrator/PyProgram/InfosExtractor/code/BERT/MultiAttentionTest.py", line 30, in <module>
    model.fit(x, y, epochs=24, batch_size=16)
  File "D:\Anaconda3.7\lib\site-packages\keras\engine\training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "D:\Anaconda3.7\lib\site-packages\keras\engine\training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "D:\Anaconda3.7\lib\site-packages\keras\backend\tensorflow_backend.py", line 2675, in _call
    fetched = self._callable_fn(*array_vals)
  File "D:\Anaconda3.7\lib\site-packages\tensorflow\python\client\session.py", line 1454, in __call__
    self._session._session, self._handle, args, status, None)
  File "D:\Anaconda3.7\lib\site-packages\tensorflow\python\framework\errors_impl.py", line 519, in __exit__
    c_api.TF_GetCode(self.status.status))
tensorflow.python.framework.errors_impl.InvalidArgumentError: Incompatible shapes: [128] vs. [128,256,256]
	 [[Node: training/SGD/gradients/multi_head_attention_1/mul_1_grad/BroadcastGradientArgs = BroadcastGradientArgs[T=DT_INT32, _class=["loc:@training/SGD/gradients/multi_head_attention_1/mul_1_grad/Reshape_1"], _device="/job:localhost/replica:0/task:0/device:CPU:0"](training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape, training/SGD/gradients/multi_head_attention_1/mul_1_grad/Shape_1)]]
Answer (score: 0)
The shape of attn_mask does not match mul inside the scale_dot_product method, so I made a few changes:
First, add keepdims to "inp_mask": inp_mask = Lambda(lambda t: K.any(K.not_equal(t, 0), axis=-1, keepdims=True), name="Input_mask")(input). That still did not work.
Second, comment out the line attn_mask = K.repeat_elements(attn_mask, self.n_head, 0) and add a new helper called "reshape_mask":
def reshape_mask(mask, head_num):
    if mask is None:
        return mask
    seq_len = K.shape(mask)[1]
    mask = K.expand_dims(mask, axis=1)
    mask = K.tile(mask, [1, head_num, 1])
    return K.reshape(mask, (-1, seq_len))
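Assuming reshape_mask is called in place of the commented-out repeat_elements line, a short NumPy sketch (hypothetical sizes again, illustrative only) shows how it aligns the mask with the per-head attention logits:

import numpy as np

batch_size, n_head, seq_len = 16, 8, 256

mask = np.ones((batch_size, seq_len))     # token-level mask, shape (16, 256)
m = np.expand_dims(mask, axis=1)          # (16, 1, 256)
m = np.tile(m, (1, n_head, 1))            # (16, 8, 256): one copy per head
m = m.reshape(-1, seq_len)                # (128, 256)
print(m.shape)                            # matches the n_head * batch_size leading axis of the logits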
Third, rewrite the scale_dot_product method:
def scale_dot_product(query: tf.Tensor,
                      key: tf.Tensor,
                      value: tf.Tensor,
                      attn_mask=None):
    feature_dim = K.shape(query)[-1]
    e = K.batch_dot(query, key, axes=2) / K.sqrt(K.cast(feature_dim, dtype=K.floatx()))
    e = K.exp(e - K.max(e, axis=-1, keepdims=True))
    if attn_mask is not None:
        e *= K.cast(K.expand_dims(attn_mask, axis=-2), K.floatx())
    a = e / (K.sum(e, axis=-1, keepdims=True) + K.epsilon())
    v = K.batch_dot(a, value)
    return v
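In this rewritten version the mask is expanded over the query axis, so it broadcasts cleanly against the rank-3 logits. A NumPy sketch of just that broadcasting step (shapes assumed from the sketches above, illustrative only):

import numpy as np

e = np.random.rand(128, 256, 256)                 # (n_head * batch_size, seq_len, seq_len) logits
attn_mask = np.ones((128, 256))                   # mask produced by reshape_mask

masked = e * np.expand_dims(attn_mask, axis=-2)   # (128, 1, 256) broadcasts over the query axis
print(masked.shape)                               # (128, 256, 256): padded key positions are zeroed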
Cheers! The problem is solved!