Question

我有一个类似于以下代码的代码，其中我想针对参数my_cost最小化函数w。

但是，在运行代码时，与没有张量流实现的同一代码（通过显式定义给出成本梯度的函数）相比，在我看来这是非常慢的（例如慢30倍）。

我在以下示例代码中做错什么了吗？（也许我每次都不必要重新计算梯度图？）

我正在使用Python 3和TensorFlow 2.0.0。 Relevant Git

在下面的代码中，我以一个简单的虚拟成本函数为例来说明运行时的巨大差异。

带有Tensorflow的代码：

import numpy as np
import tensorflow as tf
import time

class ExampleTF:
    def __init__(self, n=100, m=10):
        Z = np.random.randn(n, m)
        self.Z = tf.convert_to_tensor(Z, dtype=tf.float32)
        self.w = tf.Variable(np.ones((m, 1)), dtype=tf.float32)

    # =====================================
    def cost(self, P):
        # This is a simple dummy cost function just as an example
        return tf.reduce_sum((self.Z @ self.w) - P)

    # =====================================
    def optimize_w(self, cost_func, parameters, lr=0.01, iterations=2000):
        optimizer = tf.optimizers.Adam(lr)
        for _ in range(iterations):
            optimizer.minimize(cost_func, var_list=parameters)

    # =====================================
    def update(self, P):
        P = tf.convert_to_tensor(P, dtype=tf.float32)

        self.optimize_w(
            cost_func = lambda: self.cost(P),
            parameters = [self.w]
        )

        #print("===> cost:", self.cost(P).numpy())
        #print("w:", self.w.numpy().reshape(-1)[:10])

# =====================================
n, m = 10000, 100
ex_tf = ExampleTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))

    start = time.time()
    ex_tf.update(P)
    elapsed = time.time() - start

    print("elapsed time:", elapsed)

不含Tensorflow的代码（仅numpy）：

import numpy as np
import tensorflow as tf
import time

class ExampleNonTF:
    def __init__(self, n=100, m=10):
        self.Z = np.random.randn(n, m)
        self.w = np.ones((m, 1))

    # =====================================
    def cost(self, P):
        # This is a simple dummy cost function just as an example
        return np.sum(self.Z @ self.w - P)

    # =====================================
    def gradient_cost(self, P):
        # This is the gradient of the dummy cost function with respect to self.w
        return np.sum(self.Z, axis=0).reshape(self.w.shape)

    # =====================================
    def optimize_w(self, P, lr=0.01, iterations=2000): # This is the ADAM optimizer
        avg_grad1 = 0; avg_grad2 = 0
        beta1 = 0.9; beta2 = 0.999; eps = 1e-07
        for itr in range(iterations):
            grad = self.gradient_cost(P)
            avg_grad1 = beta1 * avg_grad1 + (1 - beta1) * grad
            avg_grad2 = (beta2 * avg_grad2 + (1 - beta2) * (grad ** 2))
            avg_grad1_corr = avg_grad1 / (1 - beta1 ** (itr + 1))
            avg_grad2_corr = avg_grad2 / (1 - beta2 ** (itr + 1))
            self.w = self.w - lr * (avg_grad1_corr / (np.sqrt(avg_grad2_corr) + eps))

    # =====================================
    def update(self, P):
        self.optimize_w(P)

        #print("===> cost:", self.cost(P))
        #print("w:", self.w.reshape(-1)[:10])

# =====================================
n, m = 10000, 100
ex_nontf = ExampleNonTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))

    start = time.time()
    ex_nontf.update(P)
    elapsed = time.time() - start

    print("elapsed time:", elapsed)

最小化损失函数时，Tensorflow太慢

0 个答案: