我有一段类似于以下示例的代码,其中我想针对参数 w 最小化函数 my_cost。
但是,在运行代码时,与不使用 TensorFlow 实现的同一代码(通过显式定义给出成本梯度的函数)相比,在我看来它非常慢(例如慢 30 倍)。
我在以下示例代码中做错什么了吗? (也许我每次都不必要重新计算梯度图?)
我正在使用Python 3和TensorFlow 2.0.0。 Relevant Git
在下面的代码中,我以一个简单的虚拟成本函数为例来说明运行时的巨大差异。
带有Tensorflow的代码:
import numpy as np
import tensorflow as tf
import time
class ExampleTF:
    """Toy model that fits a weight vector ``w`` with TensorFlow's Adam.

    The instance holds a fixed random ``(n, m)`` float32 tensor ``Z`` and a
    trainable ``(m, 1)`` float32 variable ``w`` initialised to ones.
    """

    def __init__(self, n=100, m=10):
        """Draw a random ``(n, m)`` matrix ``Z`` and initialise ``w`` to ones."""
        Z = np.random.randn(n, m)
        self.Z = tf.convert_to_tensor(Z, dtype=tf.float32)
        self.w = tf.Variable(np.ones((m, 1)), dtype=tf.float32)

    # =====================================
    def cost(self, P):
        """Return the scalar dummy cost ``sum(Z @ w - P)``."""
        # This is a simple dummy cost function just as an example
        return tf.reduce_sum((self.Z @ self.w) - P)

    # =====================================
    def optimize_w(self, cost_func, parameters, lr=0.01, iterations=2000):
        """Apply ``iterations`` Adam steps that minimise ``cost_func``.

        Args:
            cost_func: Zero-argument callable returning the scalar loss.
            parameters: List of variables to update.
            lr: Learning rate passed to ``tf.optimizers.Adam``.
            iterations: Number of optimizer steps to run.

        A fresh Adam optimizer is created on every call, so its moment
        estimates do not carry over between calls.
        """
        optimizer = tf.optimizers.Adam(lr)

        # Calling optimizer.minimize() eagerly re-dispatches every op from
        # Python on each of the `iterations` steps. Wrapping a single step in
        # tf.function traces it once into a graph and reuses that graph for
        # the remaining steps. The Python loop stays outside the traced
        # function so the optimizer's variables are created on the first
        # call only, not inside graph control flow.
        @tf.function
        def train_step():
            optimizer.minimize(cost_func, var_list=parameters)

        for _ in range(iterations):
            train_step()

    # =====================================
    def update(self, P):
        """Convert ``P`` to a float32 tensor and optimise ``w`` against it."""
        P = tf.convert_to_tensor(P, dtype=tf.float32)
        self.optimize_w(
            cost_func=lambda: self.cost(P),
            parameters=[self.w],
        )
        #print("===> cost:", self.cost(P).numpy())
        #print("w:", self.w.numpy().reshape(-1)[:10])
    # =====================================
# Benchmark: time 50 successive update() calls, each on a fresh random target.
n, m = 10000, 100
ex_tf = ExampleTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump.
    start = time.perf_counter()
    ex_tf.update(P)
    elapsed = time.perf_counter() - start
    print("elapsed time:", elapsed)
不含Tensorflow的代码(仅numpy):
import numpy as np
import tensorflow as tf
import time
class ExampleNonTF:
    """NumPy-only toy model that fits ``w`` with a hand-written Adam loop.

    The instance holds a fixed random ``(n, m)`` array ``Z`` and an ``(m, 1)``
    weight array ``w`` initialised to ones.
    """

    def __init__(self, n=100, m=10):
        """Draw a random ``(n, m)`` matrix ``Z`` and initialise ``w`` to ones."""
        self.Z = np.random.randn(n, m)
        self.w = np.ones((m, 1))

    # =====================================
    def cost(self, P):
        """Return the scalar dummy cost ``sum(Z @ w - P)``."""
        # This is a simple dummy cost function just as an example
        return np.sum(self.Z @ self.w - P)

    # =====================================
    def gradient_cost(self, P):
        """Return the gradient of the dummy cost with respect to ``self.w``.

        ``P`` is accepted for interface symmetry with ``cost`` but is not
        used: the result is the column sums of ``Z`` reshaped like ``w``.
        """
        return np.sum(self.Z, axis=0).reshape(self.w.shape)

    # =====================================
    def optimize_w(self, P, lr=0.01, iterations=2000,
                   beta1=0.9, beta2=0.999, eps=1e-07):
        """Update ``self.w`` in place of the old array using the Adam rule.

        Args:
            P: Target array forwarded to ``gradient_cost``.
            lr: Step size.
            iterations: Number of Adam steps to run.
            beta1: Decay rate of the first-moment (mean) estimate.
            beta2: Decay rate of the second-moment (squared) estimate.
            eps: Constant added to the denominator to avoid division by zero.

        The moment estimates start from zero on every call.
        """
        avg_grad1 = 0
        avg_grad2 = 0
        for step in range(1, iterations + 1):
            grad = self.gradient_cost(P)
            avg_grad1 = beta1 * avg_grad1 + (1 - beta1) * grad
            avg_grad2 = beta2 * avg_grad2 + (1 - beta2) * (grad ** 2)
            # Bias correction compensates for the zero initialisation of the
            # running averages during the first steps.
            avg_grad1_corr = avg_grad1 / (1 - beta1 ** step)
            avg_grad2_corr = avg_grad2 / (1 - beta2 ** step)
            self.w = self.w - lr * (
                avg_grad1_corr / (np.sqrt(avg_grad2_corr) + eps)
            )

    # =====================================
    def update(self, P):
        """Run ``optimize_w`` on ``P`` with the default hyper-parameters."""
        self.optimize_w(P)
        #print("===> cost:", self.cost(P))
        #print("w:", self.w.reshape(-1)[:10])
    # =====================================
# Benchmark: time 50 successive update() calls, each on a fresh random target.
n, m = 10000, 100
ex_nontf = ExampleNonTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump.
    start = time.perf_counter()
    ex_nontf.update(P)
    elapsed = time.perf_counter() - start
    print("elapsed time:", elapsed)