我参考了“Pytorch Batchnorm layer different from Keras Batchnorm”和“Pytorch Batchnorm implementation”这两个问题,但它们都没能解决我的问题。
我也阅读了维基百科上关于 Batchnorm 的条目,并查阅了 TensorFlow batchnorm 和 PyTorch 的源代码。
下面是我的测试代码:PyTorch 和 Keras 的结果相差 1e-2 到 1e-3 的数量级,这个误差很大。函数 bn0、bn1 的结果与 PyTorch 的结果接近,但仍不够精确。bn2 尝试遵循 TensorFlow batchnorm 中使用的公式。
卷积部分的结果一致,但我卡在了 batchnorm 层。我还对 PyTorch 使用了 eval() 和 no_grad(),对 Keras 模型使用了 model.predict,以确保两者都处于推理阶段。
TensorFlow 的实现看起来没有使用 1 / sqrt(var + eps),而是使用 sqrt(var + eps)。我尝试把 1 / running_var 赋给 keras.BN.moving_var,但仍然失败。
import numpy as np
import tensorflow as tf
import tensorflow.keras.layers as L
import torch
import torch.nn as nn
from tensorflow.keras import Model as KModel
def KM():
    """Build the Keras reference model: space-to-depth -> conv -> BN -> LeakyReLU.

    The input is a channels-last image with 3 channels and unspecified
    height/width. Every intermediate tensor is exposed as a model output so
    each stage can be compared against another implementation.

    Returns:
        A Keras ``Model`` whose outputs are
        ``[conv output, batch-norm output, activation output]``.
    """
    inputs = L.Input((None, None, 3))

    # Space-to-depth: take the four 2x2 sub-grids of the image and stack
    # them along the channel axis (3 channels -> 12 channels).
    sub_grids = [
        inputs[:, ::2, ::2, :],
        inputs[:, ::2, 1::2, :],
        inputs[:, 1::2, ::2, :],
        inputs[:, 1::2, 1::2, :],
    ]
    stacked = L.Concatenate(axis=-1)(sub_grids)

    conv_out = L.Conv2D(32, 3, 1, "same", use_bias=False)(stacked)

    # NOTE: BatchNormalization is created with its default epsilon. Per the
    # Keras docs that default is 1e-3, while torch.nn.BatchNorm2d defaults to
    # eps=1e-5 -- keep this in mind when comparing outputs across frameworks.
    bn_out = L.BatchNormalization()(conv_out)

    act_out = L.LeakyReLU(0.1)(bn_out)
    return KModel(inputs, [conv_out, bn_out, act_out])
class YM(nn.Module):
    """PyTorch model: space-to-depth -> 3x3 conv -> BatchNorm -> LeakyReLU.

    Expects a channels-first input with 3 channels. ``forward`` returns every
    intermediate tensor so each stage can be compared against another
    implementation.
    """

    def __init__(self):
        super().__init__()
        self.conv = nn.Conv2d(12, 32, 3, 1, 1, bias=False)
        self.bn = nn.BatchNorm2d(32)
        self.act = nn.LeakyReLU(0.1)

    def cat(self, x):
        """Stack the four 2x2 sub-grids of ``x`` along the channel axis.

        Args:
            x: Tensor indexed as (batch, channel, height, width).

        Returns:
            The four strided slices of ``x`` concatenated on dim 1.
        """
        return torch.cat(
            [
                x[:, :, ::2, ::2],
                x[:, :, ::2, 1::2],
                x[:, :, 1::2, ::2],
                x[:, :, 1::2, 1::2],
            ],
            dim=1,
        )

    def forward(self, x):
        """Run the network and return ``[conv, batch-norm, activation]`` outputs.

        Uses this instance's own layers (``self``) rather than a module-level
        global, so any ``YM`` instance works regardless of the variable name
        it is bound to.
        """
        y0 = self.conv(self.cat(x))
        y1 = self.bn(y0)
        y2 = self.act(y1)
        return [y0, y1, y2]
# Deterministic test image (NHWC for Keras) and its NCHW copy for PyTorch.
np.random.seed(0)
img = np.random.randint(0, 255, (1, 12, 14, 3)).astype(np.float32)
img_torch = torch.from_numpy(img.transpose(0, 3, 1, 2).astype(np.float32))

# Random conv weights and batch-norm parameters/statistics shared by both models.
w1 = np.random.rand(32, 12, 3, 3).astype(np.float32) * 0.1
bw1 = np.random.rand(32).astype(np.float32) * 0.1
bb1 = np.random.rand(32).astype(np.float32)
bm1 = np.random.rand(32).astype(np.float32)
bv1 = np.abs(np.random.rand(32).astype(np.float32)) * 0.1

ym = YM()
km = KM()

# Load the shared parameters into the PyTorch model.
ym.conv.weight = nn.Parameter(torch.from_numpy(w1))
ym.bn.weight = nn.Parameter(torch.from_numpy(bw1))
ym.bn.bias = nn.Parameter(torch.from_numpy(bb1))
ym.bn.running_mean = torch.from_numpy(bm1)
ym.bn.running_var = torch.from_numpy(bv1)

# Load the same parameters into the Keras model. The conv kernel is
# transposed from (out, in, kh, kw) to (kh, kw, in, out).
# NOTE(review): indices 6 and 7 are assumed to be the Conv2D and
# BatchNormalization layers of KM() -- verify against km.layers.
km.layers[6].set_weights([w1.transpose(2, 3, 1, 0)])
km.layers[7].set_weights([bw1, bb1, bm1, bv1])

# Inference mode on both sides.
ym.eval()
ym.bn.track_running_stats = True
with torch.no_grad():
    # Call the model instance directly; the original `Ym(ym, ...)` referenced
    # an undefined name and raised NameError.
    t0 = ym(img_torch / 255. - 0.5)
k0 = km.predict(img / 255. - 0.5)

for t_out, k_out in zip(t0, k0):
    print(t_out.shape, k_out.shape)

# Index into the output lists: 0 = conv, 1 = batch-norm, 2 = activation.
Key = 1
print(t0[Key][0, 0, :, :].detach().numpy())
print(k0[Key][0, :, :, 0])
>>>>>>>>>>>
[[ 0.71826 0.72964 0.73189 0.70224 0.74954 0.72928 0.7524]
[ 0.71305 0.68717 0.68581 0.7242 0.73491 0.71925 0.70781]
[ 0.70145 0.66769 0.6857 0.70804 0.73533 0.73165 0.72006]
[ 0.6758 0.69231 0.71173 0.71325 0.72097 0.71414 0.75782]
[ 0.68255 0.72283 0.71273 0.7226 0.71788 0.68119 0.72556]
[ 0.70452 0.68088 0.74389 0.73558 0.72853 0.7174 0.74389]]
[[ 0.71953 0.73082 0.73306 0.70365 0.75056 0.73046 0.75339]
[ 0.71437 0.6887 0.68736 0.72543 0.73605 0.72052 0.70918]
[ 0.70287 0.66939 0.68724 0.7094 0.73647 0.73282 0.72133]
[ 0.67743 0.6938 0.71306 0.71457 0.72223 0.71545 0.75877]
[ 0.68413 0.72407 0.71405 0.72384 0.71916 0.68278 0.72678]
[ 0.70592 0.68246 0.74495 0.73671 0.72972 0.71868 0.74496]]
# Largest element-wise difference between the two frameworks for the selected
# output. The PyTorch tensor is moved from NCHW to NHWC to line up with Keras.
tt = t0[Key].detach().numpy().transpose(0, 2, 3, 1)
kk = k0[Key]
# Print explicitly: a bare expression only echoes its value in a REPL/notebook.
print(np.abs(tt - kk).max())
>>>>>>>>>>
0.078752756
# Channel-0 batch-norm parameters and statistics, plus one sample of the
# PyTorch conv output (channel 0, position (0, 0)) to normalise by hand.
gamma = bw1[0]
beta = bb1[0]
mu = bm1[0]
var = bv1[0]
x_p = t0[0][0, 0, 0, 0]
print(gamma, beta, mu, var, x_p)

# Epsilon used by the hand-written formulas below.
eps = 1e-10
def bn0(x_p, mu, var, gamma, beta, eps=1e-10):
    """Batch-norm inference, textbook form: ``gamma * (x - mu) / sqrt(var + eps) + beta``.

    Args:
        x_p: Value(s) to normalise.
        mu: Mean used for centring.
        var: Variance used for scaling.
        gamma: Scale applied after normalisation.
        beta: Shift applied after scaling.
        eps: Stabiliser added to ``var`` before the square root. Exposed as a
            parameter so it can be set to the epsilon of the layer being
            reproduced; defaults to 1e-10.

    Returns:
        The normalised, scaled and shifted value(s).
    """
    xhat = (x_p - mu) / np.sqrt(var + eps)
    return xhat * gamma + beta
def bn1(x_p, mu, var, gamma, beta, eps=1e-10):
    """Batch-norm inference as a fused affine transform ``x * alpha + shift``.

    Algebraically the same as ``gamma * (x - mu) / sqrt(var + eps) + beta``,
    but with the scale and shift precomputed (the form the original author
    attributed to the PyTorch C++ implementation).

    Args:
        x_p: Value(s) to normalise.
        mu: Mean used for centring.
        var: Variance used for scaling.
        gamma: Scale applied after normalisation.
        beta: Shift applied after scaling.
        eps: Stabiliser added to ``var`` before the square root. Exposed as a
            parameter so it can be set to the epsilon of the layer being
            reproduced; defaults to 1e-10.

    Returns:
        The normalised, scaled and shifted value(s).
    """
    inv_std = 1 / np.sqrt(var + eps)
    alpha_d = gamma * inv_std
    beta_d = beta - mu * inv_std * gamma
    return x_p * alpha_d + beta_d
def bn2(x_p, mu, var, gamma, beta, eps=1e-10):
    """Experimental variant: ``gamma * (x - mu) * sqrt(var + eps) + beta``.

    Unlike the textbook formula this MULTIPLIES by ``sqrt(var + eps)``
    instead of dividing (the original author's reading of the TensorFlow
    C++ code). It therefore only approximates regular batch norm when the
    caller passes the reciprocal of the variance as ``var``.

    Args:
        x_p: Value(s) to transform.
        mu: Mean used for centring.
        var: Quantity placed under the square root (see note above).
        gamma: Scale applied after the multiplication.
        beta: Shift applied after scaling.
        eps: Stabiliser added to ``var`` before the square root; defaults to
            1e-10.

    Returns:
        The transformed value(s).
    """
    scale = np.sqrt(var + eps)
    xhat = (x_p - mu) * scale
    return xhat * gamma + beta
# Evaluate each hand-written formula on the same sample, in order bn0, bn1, bn2.
for bn_fn in (bn0, bn1, bn2):
    print(bn_fn(x_p, mu, var, gamma, beta))
# bn2 again, this time fed the reciprocal of the variance.
print(bn2(x_p, mu, 1 / var, gamma, beta))
>>>>>>>>
0.048011426 0.87305844 0.67954195 0.059197646 tensor(-0.26256)
tensor(0.68715)
tensor(0.68715)
tensor(0.86205)
tensor(0.68715)
答案 0 :(得分:0)
Keras 与 PyTorch 的 epsilon 默认值似乎不同:Keras 为 1e-3,PyTorch 为 1e-5。
TensorFlow 源代码中的输入“var”似乎在某处已经被替换成了 1 / moving_variance。
除了 batchnorm 之外,TensorFlow 与 PyTorch 的填充(padding)策略不同,也可能导致输出结果不一致。当步幅大于 1 时(把权重从 PyTorch 迁移到 TensorFlow),建议在 TensorFlow 中先用 ZeroPadding2D 显式指定填充量,然后再使用 padding 为 valid 的 Conv2D。
误差在整个网络中累积后可能很大:在一个约 70 层的小型网络中,最终激活函数之前的最大误差约为 0.6。