Using a `tf.Tensor` as a Python `bool` is not allowed when using a custom loss function in Keras

Time: 2018-03-29 03:51:40

Tags: python tensorflow keras

OS: Ubuntu 16.04

Python 3.5

I am trying to implement a custom loss function in a program for the REINFORCE policy gradient algorithm, but I run into the following error:

2018-03-28 23:32:37.766405: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Traceback (most recent call last):
  File "reinforce.py", line 199, in <module>
    main(sys.argv)
  File "reinforce.py", line 191, in main
    reinforce.train(env)
  File "reinforce.py", line 57, in train
    self.model.compile(loss=custom_loss, optimizer='adam')
  File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 824, in compile
    **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 603, in compile
    loss = loss or {}
  File "/home/yuyangw/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 614, in __bool__
    raise TypeError("Using a `tf.Tensor` as a Python `bool` is not allowed. "
TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
Exception ignored in: <bound method BaseSession.__del__ of <tensorflow.python.client.session.Session object at 0x7f2a3346d748>>

The error occurs in the train method of the Reinforce class. There, I want to implement a custom loss function to carry out the policy gradient update, but I cannot get model.compile to accept and optimize the custom loss.

Here is the code:

import sys
import argparse
import numpy as np
import tensorflow as tf
import keras
import gym

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt


class Reinforce(object):
    # Implementation of the policy gradient method REINFORCE.

    def __init__(self, model, lr):
        self.testReward = []
        self.learning_rate = lr
        self.model = model
        '''
        self.model = keras.models.Sequential()
        self.model.add(keras.layers.Dense(16, input_shape=(8, ), activation='relu'))
        self.model.add(keras.layers.Dense(16, activation='relu'))
        self.model.add(keras.layers.Dense(16, activation='relu'))
        self.model.add(keras.layers.Dense(4, activation='softmax'))
        print(self.model.summary())
        '''
        # TODO: Define any training operations and optimizers here, initialize
        #       your variables, or alternately compile your model here.  

    def train(self, env, gamma=1.0):
        # Trains the model on a single episode using REINFORCE.
        states, hotKeyActions, rewards = self.generate_episode(env)
        #states.reverse()
        #hotKeyActions.reverse()
        rewards.reverse()
        policies = []
        Gt = 0

        for i in range(len(rewards)):
            Gt += gamma**i*rewards[i]*0.01

            stateFit = np.reshape(states[i], (1, 8))
            prediction = self.model.predict(stateFit)
            action = np.argmax(hotKeyActions[i])
            policy = prediction[0][action]
            policies.append(policy)
            #prob.append(prediction)

        Gt = tf.cast(Gt, tf.float32)
        custom_loss = tf.reduce_mean(tf.multiply(Gt, tf.log(policies)))
        #print(custom_loss)
        #train = tf.train.AdamOptimizer(self.learning_rate)
        #opt = train.minimize(custom_loss)
        #delta = train.compute_gradient(weights)

        self.model.compile(loss=custom_loss, optimizer='adam')
        self.model.fit(np.array(states), np.array(hotKeyActions), epochs=5, batch_size=32)

        return

    def generate_episode(self, env, render=False):
        # Generates an episode by executing the current policy in the given env.
        # Returns:
        # - a list of states, indexed by time step
        # - a list of actions, indexed by time step
        # - a list of rewards, indexed by time step
        # TODO: Implement this method.
        states = []
        actions = []
        rewards = []

        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        state_dimension = env.observation_space.shape[0]
        action_dimension = env.action_space.n
        done = False
        state = env.reset()

        while not done:
            stateFit = np.reshape(state, (1, state_dimension))
            prediction = self.model.predict(stateFit)
            actionFit = np.argmax(prediction)
            #print(actionFit)
            nextState, reward, done, __ = env.step(actionFit)

            hotKeyAction = np.zeros(action_dimension)
            hotKeyAction[actionFit] = 1

            states.append(state)
            actions.append(hotKeyAction)
            rewards.append(reward)

            state = nextState

        return states, actions, rewards

    def test(self, env, num_test=100, render=False):
        #self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        state_dimension = env.observation_space.shape[0]
        action_dimension = env.action_space.n
        rewards = []

        for i in range(num_test):
            done = False
            state = env.reset()
            totalReward = 0

            while not done:
                if render:
                    env.render()
                stateFit = np.reshape(state, (1, state_dimension))
                prediction = self.model.predict(stateFit)
                actionFit = np.argmax(prediction)
                nextState, reward, done, __ = env.step(actionFit)

                hotKeyAction = np.zeros(action_dimension)
                hotKeyAction[actionFit] = 1

                state = nextState
                totalReward += reward

            rewards.append(totalReward)

        rewards = np.array(rewards)
        mean = np.mean(rewards)
        std = np.std(rewards)
        self.testReward.append((mean, std))

        return

    def draw(self):
        x = range(len(self.testReward))
        y = [data[0] for data in self.testReward]
        error = [data[1] for data in self.testReward]

        plt.figure(figsize=(8,6))
        plt.errorbar(x, y, yerr = error)
        plt.title("Simplest errorbars, 0.2 in x, 0.4 in y")
        plt.xlabel('100 tests result after training 100 episodes')
        plt.ylabel(r'mean reward every 100 tests($\gamma = 1$)')
        plt.show()

        return

def parse_arguments():
    # Command-line flags are defined here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-config-path', dest='model_config_path',
                        type=str, default='LunarLander-v2-config.json',
                        help="Path to the model config file.")
    parser.add_argument('--num-episodes', dest='num_episodes', type=int,
                        default=50000, help="Number of episodes to train on.")
    parser.add_argument('--lr', dest='lr', type=float,
                        default=5e-4, help="The learning rate.")

    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    parser_group = parser.add_mutually_exclusive_group(required=False)
    parser_group.add_argument('--render', dest='render',
                              action='store_true',
                              help="Whether to render the environment.")
    parser_group.add_argument('--no-render', dest='render',
                              action='store_false',
                              help="Whether to render the environment.")
    parser.set_defaults(render=False)

    return parser.parse_args()

def main(args):
    # Parse command-line arguments.
    args = parse_arguments()
    model_config_path = args.model_config_path
    num_episodes = args.num_episodes
    lr = args.lr
    _render = args.render

    # Create the environment.
    env = gym.make('LunarLander-v2')

    # Load the policy model from file.
    with open(model_config_path, 'r') as f:
        model = keras.models.model_from_json(f.read())

    print(model.summary())
    #print(model.get_weights())
    # TODO: Train the model using REINFORCE and plot the learning curve.
    reinforce = Reinforce(model, lr)
    testFreq  = 20

    for i in range(num_episodes):
        reinforce.train(env)
        if (i+1)%testFreq == 0:
            reinforce.test(env, num_test=10, render=True)

    reinforce.draw()


if __name__ == '__main__':
    main(sys.argv)

Thanks!

1 answer:

Answer 0 (score: 0)

Have a look at how loss functions are defined in Keras. A loss function must be a callable that takes the labels as its first argument and the predictions as its second argument. Also, since you are using Keras, you will probably want to build such a custom loss out of Keras backend functions (although using tf functions should also work).
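
In the posted code, custom_loss is a tf.Tensor (the result of tf.reduce_mean), so when Keras evaluates `loss = loss or {}` during compile, it tries to use that tensor as a Python bool, which raises the TypeError in the traceback. As a minimal sketch of what a callable loss could look like for REINFORCE, assuming the discounted return is captured via a closure (the names make_reinforce_loss and discounted_return are illustrative, not from the original code):

from keras import backend as K

def make_reinforce_loss(discounted_return):
    # Keras calls the loss with (labels, predictions): here y_true is the
    # one-hot action that was taken and y_pred is the predicted action
    # distribution from the softmax output.
    def custom_loss(y_true, y_pred):
        # Probability assigned to the action that was actually taken.
        action_prob = K.sum(y_true * y_pred, axis=-1)
        # REINFORCE maximizes return * log(prob), so minimize the negative.
        return -discounted_return * K.mean(K.log(action_prob + K.epsilon()))
    return custom_loss

# Usage sketch: compile with the callable (not a tf.Tensor) and fit as usual.
# model.compile(loss=make_reinforce_loss(Gt), optimizer='adam')
# model.fit(np.array(states), np.array(hotKeyActions), epochs=5, batch_size=32)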