我正在尝试使用(几乎)完全矢量化的操作来实现自己的神经网络。相关的帖子有很多,但我似乎找不到一篇能同时满足以下全部三点的帖子:
我的主要问题是:在给定 dE/da(N x K)和 da/dz(N x K x K)的情况下,如何使用完全矢量化的操作得到 dE/dz(N x K)?也就是说,如何把 `dE_dz_test2` 矢量化?
我的第二个问题是:有没有更好的方法来编写 `softmax_derivative`?
我参考了这篇文章来逐个样本地计算梯度: http://saitcelebi.com/tut/output/part2.html ;并参考了这篇文章来弄清楚如何做反向传播: https://peterroelants.github.io/posts/neural-network-implementation-part04/
def one_hot_encode(y, n_classes):
    """Return the one-hot encoding of integer class labels.

    Args:
        y: 1-D sequence or array of N integer class indices.
        n_classes: Number of classes K (the width of the output).

    Returns:
        A float array of shape (N, K) holding 1 at ``[i, y[i]]`` and 0
        everywhere else.
    """
    labels = np.asarray(y)
    y_onehot = np.zeros((len(labels), n_classes))
    # An empty label list converts to a float array, which is not a valid
    # index; there is nothing to set in that case anyway.
    if labels.size:
        # One fancy-index assignment replaces the per-row Python loop.
        y_onehot[np.arange(len(labels)), labels] = 1
    return y_onehot
def cross_entropy_derivative(y_true, y_pred):
    """Return dE/da for the batch-averaged cross-entropy loss.

    Args:
        y_true: (N, K) array-like of target probabilities (e.g. one-hot rows).
        y_pred: (N, K) array-like of predicted probabilities.

    Returns:
        (N, K) array equal to ``-(y_true / y_pred) / N``, where N is
        ``len(y_true)``. Entries whose target is exactly 0 are returned as 0
        even when the prediction is also 0, instead of the NaN that a plain
        0 / 0 division would produce.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    N = len(y_true)
    # 0 / 0 is reported as an "invalid" operation; silence only that warning,
    # so a genuine x / 0 (target > 0, prediction == 0) still warns and
    # still yields inf.
    with np.errstate(invalid="ignore"):
        ratio = y_true / y_pred
    # A class with zero target weight contributes nothing to the loss, so its
    # gradient is 0 regardless of the predicted probability.
    ratio = np.where(y_true == 0, 0.0, ratio)
    return -ratio / N
def softmax(x):
    """Row-wise softmax activation (a).

    Args:
        x: (N, K) array of pre-activations, one sample per row.

    Returns:
        (N, K) array in which every row is non-negative and sums to 1.

    Reference:
        https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
    """
    # Shift each row by its OWN maximum. Shifting by the global maximum is
    # mathematically equivalent, but a row lying far below that maximum then
    # underflows to all zeros and the normalisation becomes 0 / 0 = NaN.
    shifted = x - np.max(x, axis=1, keepdims=True)
    exp = np.exp(shifted)
    return exp / np.sum(exp, axis=1, keepdims=True)
def softmax_derivative(Z):
    """Vectorised softmax Jacobian da/dz for a whole batch.

    Args:
        Z: (N, K) array of pre-activations.

    Returns:
        (N, K, K) array with ``out[n, i, j] = s[n, i] * (delta_ij - s[n, j])``
        where ``s = softmax(Z)``.

    Reference:
        http://saitcelebi.com/tut/output/part2.html
    """
    _, K = Z.shape
    s = softmax(Z)
    # Broadcasting builds the batch of Jacobians directly:
    #   s[:, :, None]  -> s[n, i] repeated along j
    #   s[:, None, :]  -> s[n, j] repeated along i
    # The (K, K) identity broadcasts over the batch axis, so it never has to
    # be materialised N times.
    return s[:, :, np.newaxis] * (np.eye(K) - s[:, np.newaxis, :])
def softmax_derivative_test(Z):
    """Reference (non-vectorised) softmax Jacobian da/dz.

    Fills the result one scalar at a time so that it can serve as a check for
    a vectorised implementation.

    Args:
        Z: (N, K) array of pre-activations.

    Returns:
        (N, K, K) array with ``out[n, i, j] = s[n, i] * (delta_ij - s[n, j])``
        where ``s = softmax(Z)``.

    Reference:
        http://saitcelebi.com/tut/output/part2.html
    """
    n_samples, n_classes = Z.shape
    jacobians = np.zeros((n_samples, n_classes, n_classes))
    identity = np.eye(n_classes)  # Kronecker delta lookup table
    probs = softmax(Z)
    # Visit every (sample, output class, input class) triple explicitly.
    for n, i, j in np.ndindex(n_samples, n_classes, n_classes):
        jacobians[n, i, j] = probs[n, i] * (identity[i, j] - probs[n, j])
    return jacobians
def dE_dz_test2(dE_da, da_dz):
    """Reference (per-sample loop) computation of dE/dz via the chain rule.

    Args:
        dE_da: (N, K) array, gradient of the loss w.r.t. the activations.
        da_dz: (N, K, K) array of per-sample Jacobians laid out as
            ``da_dz[n, i, j] = d a_i / d z_j`` (the layout produced by the
            softmax derivative functions in this file).

    Returns:
        (N, K) array with ``out[n, j] = sum_i dE_da[n, i] * da_dz[n, i, j]``.
    """
    N, K = dE_da.shape
    dE_dz = np.zeros((N, K))
    for n in range(N):
        # Row vector times Jacobian: the sum runs over the activation index i
        # (first Jacobian axis). Multiplying the other way round (J @ g) only
        # agrees when the Jacobian is symmetric, as it happens to be for
        # softmax.
        dE_dz[n, :] = dE_da[n] @ da_dz[n]
    return dE_dz
def some_type_of_matrix_multiplication_(dE_da, da_dz):
    """Vectorised chain rule: combine dE/da and da/dz into dE/dz.

    Args:
        dE_da: (N, K) array, gradient of the loss w.r.t. the activations.
        da_dz: (N, K, K) array of per-sample Jacobians laid out as
            ``da_dz[n, i, j] = d a_i / d z_j``.

    Returns:
        (N, K) array with ``out[n, j] = sum_i dE_da[n, i] * da_dz[n, i, j]``.
    """
    # Batched vector-Jacobian product: n is the shared batch axis, i is summed
    # out, j survives. No Python-level loop over samples is needed.
    return np.einsum('ni,nij->nj', dE_da, da_dz)
# Demo / self-check: build a random 4-class problem, then compare the
# vectorised gradients against the loop-based reference implementations.
X = np.random.rand(100, 2)
W = np.random.rand(2, 4)
y = np.random.randint(0, 4, size=100)
y = one_hot_encode(y, 4)
Z = X @ W
S = softmax(Z)
N, K = Z.shape

# da / dz for softmax
da_dz = softmax_derivative(Z)  # (100, 4, 4)
da_dz_test = softmax_derivative_test(Z)  # (100, 4, 4) - non vectorized implementation
print(np.isclose(da_dz, da_dz_test).all())  # equivalence test

dE_da = cross_entropy_derivative(y, S)  # (100, 4)
# The open question of the post: dE/dz from dE/da and da/dz without a loop.
dE_dz = some_type_of_matrix_multiplication_(dE_da, da_dz)
dE_dz_test = (S - y) / N  # (100, 4) If you combine dE/da and da/dz terms
# Store the result under its own name: assigning it back to ``dE_dz_test2``
# would replace the function with an array and break any later call.
dE_dz_loop = dE_dz_test2(dE_da, da_dz)
print(np.isclose(dE_dz_test, dE_dz_loop).all())  # equivalence test
True
True
答案 0 :(得分:1)
以下是使用 `np.einsum` 的方法:
def da_dz_pp(z, sm=None):
    """Softmax Jacobian da/dz for a batch.

    Args:
        z: Pre-activations; only used to compute ``softmax(z)`` when ``sm`` is
            not supplied.
        sm: Optional precomputed softmax output, one sample per row.

    Returns:
        Array ``res`` with ``res[n, i, j] = sm[n, i] * (delta_ij - sm[n, j])``.
    """
    probs = softmax(z) if sm is None else sm
    # Off-diagonal part: outer product sm_i * (-sm_j) for every sample.
    jac = probs[:, :, np.newaxis] * (-probs)[:, np.newaxis, :]
    # Diagonal part: add sm_i onto each sample's main diagonal.
    diag = np.arange(jac.shape[-1])
    jac[:, diag, diag] += probs
    return jac
def dE_dz_pp(y, z, sm=None):
    """Gradient of the loss w.r.t. the pre-activations, dE/dz.

    Chains ``cross_entropy_derivative`` (dE/da) with ``da_dz_pp`` (da/dz).

    Args:
        y: Targets passed to ``cross_entropy_derivative``.
        z: Pre-activations; used to compute ``softmax(z)`` when ``sm`` is not
            supplied.
        sm: Optional precomputed softmax output.

    Returns:
        Array ``out`` with ``out[n, k] = sum_j dE_da[n, j] * da_dz[n, j, k]``.
    """
    probs = sm
    if probs is None:
        probs = softmax(z)
    grad_activations = cross_entropy_derivative(y, probs)
    jacobians = da_dz_pp(z, probs)
    # Batched vector-Jacobian product; j (the activation index) is summed out.
    return np.einsum('ij,ijk->ik', grad_activations, jacobians)
似乎可以重现您的代码输出,并且速度更快。