I'm currently learning how to implement multi-head attention in PyTorch,
but I can't resolve a size mismatch when the input tensor has 4 dimensions.
I based the def and class code on http://nlp.seas.harvard.edu/2018/04/03/attention.html
Apologies for the inconvenience.
# attention def and class
import copy
import math

import torch
import torch.nn as nn
import torch.nn.functional as F

def clones(module, N):
    "Produce N identical layers."
    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

def attention(query, key, value, mask=None, dropout=None):
    "Compute 'Scaled Dot Product Attention'"
    d_k = query.size(-1)
    scores = torch.matmul(query, key.transpose(-2, -1)) \
             / math.sqrt(d_k)
    if mask is not None:
        scores = scores.masked_fill(mask == 0, -1e9)
    p_attn = F.softmax(scores, dim=-1)
    if dropout is not None:
        p_attn = dropout(p_attn)
    return torch.matmul(p_attn, value), p_attn
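For reference, a quick shape check of the attention helper above (the tensor sizes here are just illustrative). Inside MultiHeadedAttention it is called on tensors shaped [batch, heads, seq_len, d_k]:

# quick shape check for attention() (sizes are illustrative)
q = torch.randn(2, 4, 10, 16)   # [batch, heads, seq_len, d_k]
k = torch.randn(2, 4, 10, 16)
v = torch.randn(2, 4, 10, 16)
out, p_attn = attention(q, k, v)
print(out.size())     # torch.Size([2, 4, 10, 16])
print(p_attn.size())  # torch.Size([2, 4, 10, 10])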
# MultiHead Attention class
class MultiHeadedAttention(nn.Module):
    def __init__(self, h, d_model, dropout=0.1):
        "Take in model size and number of heads."
        super(MultiHeadedAttention, self).__init__()
        assert d_model % h == 0
        # We assume d_v always equals d_k
        self.d_k = d_model // h
        self.h = h
        self.linears = clones(nn.Linear(d_model, d_model), 4)
        self.attn = None
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, query, key, value, mask=None):
        "Implements Figure 2"
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        nbatches = query.size(0)

        # 1) Do all the linear projections in batch from d_model => h x d_k
        query, key, value = \
            [l(x).view(nbatches, -1, self.h, self.d_k).transpose(1, 2)
             for l, x in zip(self.linears, (query, key, value))]

        # 2) Apply attention on all the projected vectors in batch.
        x, self.attn = attention(query, key, value, mask=mask,
                                 dropout=self.dropout)

        # 3) "Concat" using a view and apply a final linear.
        x = x.transpose(1, 2).contiguous() \
             .view(nbatches, -1, self.h * self.d_k)
        return self.linears[-1](x)
# create test 4-dim tensor
X = torch.randn(10, 5, 64, 64)
X = X.view(X.shape[0], X.shape[1], X.shape[2] * X.shape[3])
# X: torch.Size([10, 5, 4096])

query_ = X.transpose(2, 1)
key_ = X
value_ = X
print("query:", query_.size())
print("key:", key_.size())
print("value:", value_.size())
# query: torch.Size([10, 4096, 5])
# key:   torch.Size([10, 5, 4096])
# value: torch.Size([10, 5, 4096])

multihead_testmodel = MultiHeadedAttention(h=4, d_model=4096, dropout=0.1)
# print(multihead_testmodel)
output = multihead_testmodel(query=query_, key=key_, value=value_)
print("model output:", output.size())
# RuntimeError: size mismatch, m1: [40960 x 5], m2: [4096 x 4096] at
# ../aten/src/TH/generic/THTensorMath.cpp:197
If the tensor is 3-dimensional instead, torch.randn(5, 64, 64), the same code runs without error.
X = torch.randn(5, 64, 64)
# X = X.view(X.shape[0], X.shape[1], X.shape[2] * X.shape[3])

query_ = X.transpose(2, 1)
key_ = X
value_ = X
print("query:", query_.size())
print("key:", key_.size())
print("value:", value_.size())
# query: torch.Size([5, 64, 64])
# key:   torch.Size([5, 64, 64])
# value: torch.Size([5, 64, 64])

multihead_model = MultiHeadedAttention(h=4, d_model=64, dropout=0.1)
temp_output = multihead_model(query=query_, key=key_, value=value_)
print(temp_output.size())
# torch.Size([5, 64, 64])
Answer 0 (score: 1)
It looks like the code expects query, key, and value to have the same dimensions, so it works if you drop the transpose:

query_ = X
key_ = X
value_ = X

You are right that a transpose is needed for attention to work, but the code already handles this by calling key.transpose(-2, -1) inside the attention implementation. With your manual transpose, the first nn.Linear(4096, 4096) receives query_ with last dimension 5 (flattened to [10 * 4096, 5] = [40960, 5]), which is exactly the [40960 x 5] vs [4096 x 4096] mismatch in the error message.
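Assuming the goal is simply to run the module on your 4-dim input, a minimal sketch of the fix, keeping all three inputs shaped [10, 5, 4096] with no manual transpose:

X = torch.randn(10, 5, 64, 64)
X = X.view(X.shape[0], X.shape[1], X.shape[2] * X.shape[3])  # [10, 5, 4096]

multihead_testmodel = MultiHeadedAttention(h=4, d_model=4096, dropout=0.1)
output = multihead_testmodel(query=X, key=X, value=X)
print(output.size())  # torch.Size([10, 5, 4096])

In other words, the inputs to forward() should all stay [batch, seq_len, d_model]; the relative transpose between query and key happens inside attention() via key.transpose(-2, -1).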