I'm writing a multilayer perceptron from scratch, and it seems slower than it should be. The culprit appears to be my compute_gradients function, which according to my profiling accounts for most of the execution time. It looks like this:
def compute_gradients(X, Y, S1, H, P, W1, W2, lamb):
    # Y must be one-hot
    # Y and P must be (10, n)
    # X and H must be (3072, n)
    # P is the softmax layer
    if not (Y.shape[0] == 10 and P.shape[0] == 10 and Y.shape == P.shape):
        raise ValueError("Y and P must have shape (10, k). Y: {}, P: {}".format(Y.shape, P.shape))
    if not X.shape[0] == n_input:
        raise ValueError("X must have shape ({}, k), has {}".format(n_input, X.shape))
    if not H.shape[0] == n_hidden:
        raise ValueError("H must have shape ({}, k)".format(n_hidden))
    grad_W1 = np.zeros([n_hidden, n_input])
    grad_W2 = np.zeros([10, n_hidden])
    grad_b1 = np.zeros([n_hidden, 1])
    grad_b2 = np.zeros([10, 1])
    # Transpose so that each row is one sample, then loop over samples
    X, Y, S1, H, P = X.T, Y.T, S1.T, H.T, P.T
    for x, y, s1, h, p in zip(X, Y, S1, H, P):
        h = np.reshape(h, [1, n_hidden])
        y = np.reshape(y, [10, 1])
        p = np.reshape(p, [10, 1])
        # Second layer
        g = -(y - p).T
        grad_b2 += g.T
        grad_W2 += np.matmul(g.T, h)
        # First layer
        g = np.matmul(g, W2)
        # ReLU derivative: indicator of s1 > 0
        ind = np.zeros(h.shape[1])
        for i, val in enumerate(s1):
            if val > 0:
                ind[i] = 1
        diag = np.diag(ind)
        g = np.matmul(g, diag)
        grad_b1 += g.T
        grad_W1 += np.matmul(g.T, np.reshape(x, [1, n_input]))
    # Divide by batch size
    grad_b1 /= X.shape[0]
    grad_b2 /= X.shape[0]
    grad_W1 /= X.shape[0]
    grad_W2 /= X.shape[0]
    # Add regularization term
    grad_W1 += 2*lamb*W1
    grad_W2 += 2*lamb*W2
    return grad_W1, grad_W2, grad_b1, grad_b2
If X, Y, H, P are all 10 samples wide (n = 10), the computation takes roughly one second. Compared with friends doing the same task, that is far too slow, but I can't see any obvious inefficiency in the code. What can I do to speed up the computation?
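For reference, the same math written as batched matrix products (a sketch only, assuming the same (features, n) column layout and a ReLU hidden layer; not verified against the loop above):

import numpy as np

def compute_gradients_vectorized(X, Y, S1, H, P, W1, W2, lamb):
    # Same shape conventions as compute_gradients: columns are samples
    n = X.shape[1]
    # Second layer
    G = -(Y - P)                        # (10, n)
    grad_b2 = G.sum(axis=1, keepdims=True) / n
    grad_W2 = np.matmul(G, H.T) / n
    # First layer: back-propagate through W2 and mask with the ReLU derivative
    G = np.matmul(W2.T, G) * (S1 > 0)   # (n_hidden, n)
    grad_b1 = G.sum(axis=1, keepdims=True) / n
    grad_W1 = np.matmul(G, X.T) / n
    # Regularization
    grad_W1 += 2 * lamb * W1
    grad_W2 += 2 * lamb * W2
    return grad_W1, grad_W2, grad_b1, grad_b2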
Edit: the input data is the CIFAR dataset, loaded like this:
import os
import numpy as np

n_input = 3072   # CIFAR images are 32*32*3, flattened
n_hidden = 50    # placeholder; the real hidden size is defined elsewhere in my code

def one_hot(Y):
    # assume Y = [1, 4, 9, 0, ...]
    result = [None]*len(Y)
    for i, cls in enumerate(Y):
        onehot = {
            0: lambda: [1,0,0,0,0,0,0,0,0,0],
            1: lambda: [0,1,0,0,0,0,0,0,0,0],
            2: lambda: [0,0,1,0,0,0,0,0,0,0],
            3: lambda: [0,0,0,1,0,0,0,0,0,0],
            4: lambda: [0,0,0,0,1,0,0,0,0,0],
            5: lambda: [0,0,0,0,0,1,0,0,0,0],
            6: lambda: [0,0,0,0,0,0,1,0,0,0],
            7: lambda: [0,0,0,0,0,0,0,1,0,0],
            8: lambda: [0,0,0,0,0,0,0,0,1,0],
            9: lambda: [0,0,0,0,0,0,0,0,0,1],
        }[cls]()
        result[i] = onehot
    result = np.array(result).T
    return result
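An equivalent, more compact way to build the same (10, n) matrix, assuming the labels are integers 0-9, is to index an identity matrix:

def one_hot_compact(Y):
    # Same output as one_hot: a (10, n) matrix with a 1 in row Y[i] of column i
    Y = np.asarray(Y, dtype=int)
    return np.eye(10)[Y].T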
def unpickle(file):
    import pickle
    with open(file, "rb") as fo:
        d = pickle.load(fo, encoding="bytes")
    return d
names = ["data_batch_1",
"data_batch_2",
"data_batch_3",
"data_batch_4",
"data_batch_5",
]
# All data sets
dataset_large = {"data": np.zeros([0, 3072]), "labels": np.array([])}
validation_large = {}
## All data batches
for name in names[0:4]:
    raw = unpickle(os.path.join(path, name))
    dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"], axis = 0)
    dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"], axis = 0)
raw = unpickle(os.path.join(path, names[4]))
dataset_large["data"] = np.append(dataset_large["data"], raw[b"data"][0: -1000], axis = 0)
dataset_large["labels"] = np.append(dataset_large["labels"], raw[b"labels"][0: -1000], axis = 0)
validation_large["data"] = raw[b"data"][-1000: ]
validation_large["labels"] = raw[b"labels"][-1000: ]
# Make one-hot
dataset_large["labels"] = one_hot(dataset_large["labels"]).T
validation_large["labels"] = one_hot(validation_large["labels"]).T
# Normalize
dataset_large["data"] = dataset_large["data"]/255
validation_large["data"] = validation_large["data"]/255
The dataset can be found at https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz. Then run:
def evaluate_classifier(X, W1, W2, b1, b2):
    if not X.shape[0] == n_input:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not len(X.shape) == 2:
        raise ValueError("Wrong shape of X: {}".format(X.shape))
    if not W1.shape == (n_hidden, n_input):
        raise ValueError("Wrong shape of W1: {}".format(W1.shape))
    if not b1.shape == (n_hidden, 1):
        raise ValueError("Wrong shape of b1: {}".format(b1.shape))
    if not W2.shape == (10, n_hidden):
        raise ValueError("Wrong shape of W2: {}".format(W2.shape))
    if not b2.shape == (10, 1):
        raise ValueError("Wrong shape of b2: {}".format(b2.shape))
    s1, h = layer_1(X, W1, b1)
    p = layer_2(h, W2, b2)
    return s1, h, p
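layer_1 and layer_2 are not shown here; a minimal sketch of what they are assumed to compute (an affine map with ReLU for the hidden layer, an affine map with column-wise softmax for the output), consistent with the shape checks above:

def layer_1(X, W1, b1):
    # X: (n_input, n) -> s1, h: (n_hidden, n); ReLU activation
    s1 = np.matmul(W1, X) + b1
    return s1, np.maximum(s1, 0)

def layer_2(H, W2, b2):
    # H: (n_hidden, n) -> P: (10, n); numerically stable softmax over each column
    s2 = np.matmul(W2, H) + b2
    e = np.exp(s2 - s2.max(axis=0, keepdims=True))
    return e / e.sum(axis=0, keepdims=True)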
W1 = np.random.normal(0, 0.01, [n_hidden, n_input])
W2 = np.random.normal(0, 0.01, [10, n_hidden])
b1 = np.random.normal(0, 0.1, [n_hidden, 1])
b2 = np.random.normal(0, 0.1, [10, 1])
# Take the first 10 samples, transposed to the (features, n) layout expected above
X = dataset_large["data"][0:10].T
Y = dataset_large["labels"][0:10].T
S1, H, P = evaluate_classifier(X, W1, W2, b1, b2)
lamb = 0
compute_gradients(X, Y, S1, H, P, W1, W2, lamb)
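A small timing loop to measure the call directly (a sketch; numbers obviously vary by machine):

import time

start = time.perf_counter()
for _ in range(10):
    compute_gradients(X, Y, S1, H, P, W1, W2, lamb)
elapsed = time.perf_counter() - start
print("compute_gradients: {:.3f} s per call".format(elapsed / 10))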