Operating system: Ubuntu 16.04
Python 3.5
I am trying to implement a custom loss function for a REINFORCE policy gradient program, but I run into the following error:
2018-03-28 23:32:37.766405: I tensorflow/core/platform/cpu_feature_guard.cc:140] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Traceback (most recent call last):
  File "reinforce.py", line 199, in <module>
    main(sys.argv)
  File "reinforce.py", line 191, in main
    reinforce.train(env)
  File "reinforce.py", line 57, in train
    self.model.compile(loss=custom_loss, optimizer='adam')
  File "/usr/local/lib/python3.5/dist-packages/keras/models.py", line 824, in compile
    **kwargs)
  File "/usr/local/lib/python3.5/dist-packages/keras/engine/training.py", line 603, in compile
    loss = loss or {}
  File "/home/yuyangw/.local/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 614, in __bool__
    raise TypeError("Using a `tf.Tensor` as a Python `bool` is not allowed. "
TypeError: Using a `tf.Tensor` as a Python `bool` is not allowed. Use `if t is not None:` instead of `if t:` to test if a tensor is defined, and use TensorFlow ops such as tf.cond to execute subgraphs conditioned on the value of a tensor.
Exception ignored in: <bound method BaseSession.__del__ of <tensorflow.python.client.session.Session object at 0x7f2a3346d748>>
The error occurs in the train method of the Reinforce class. There I want to build a custom log-probability loss for the policy gradient, but I cannot get model.compile to optimize this custom loss function.
Here is the code:
import sys
import argparse
import numpy as np
import tensorflow as tf
import keras
import gym
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
class Reinforce(object):
    # Implementation of the policy gradient method REINFORCE.

    def __init__(self, model, lr):
        self.testReward = []
        self.learning_rate = lr
        self.model = model
        '''
        self.model = keras.models.Sequential()
        self.model.add(keras.layers.Dense(16, input_shape=(8, ), activation='relu'))
        self.model.add(keras.layers.Dense(16, activation='relu'))
        self.model.add(keras.layers.Dense(16, activation='relu'))
        self.model.add(keras.layers.Dense(4, activation='softmax'))
        print(self.model.summary())
        '''
        # TODO: Define any training operations and optimizers here, initialize
        # your variables, or alternately compile your model here.

    def train(self, env, gamma=1.0):
        # Trains the model on a single episode using REINFORCE.
        states, hotKeyActions, rewards = self.generate_episode(env)
        #states.reverse()
        #hotKeyActions.reverse()
        rewards.reverse()
        policies = []
        Gt = 0
        for i in range(len(rewards)):
            Gt += gamma**i*rewards[i]*0.01
            stateFit = np.reshape(states[i], (1, 8))
            prediction = self.model.predict(stateFit)
            action = np.argmax(hotKeyActions[i])
            policy = prediction[0][action]
            policies.append(policy)
            #prob.append(prediction)
        Gt = tf.cast(Gt, tf.float32)
        custom_loss = tf.reduce_mean(tf.multiply(Gt, tf.log(policies)))
        #print(custom_loss)
        #train = tf.train.AdamOptimizer(self.learning_rate)
        #opt = train.minimize(custom_loss)
        #delta = train.compute_gradient(weights)
        self.model.compile(loss=custom_loss, optimizer='adam')
        self.model.fit(np.array(states), np.array(hotKeyActions), epochs=5, batch_size=32)
        return

    def generate_episode(self, env, render=False):
        # Generates an episode by executing the current policy in the given env.
        # Returns:
        # - a list of states, indexed by time step
        # - a list of actions, indexed by time step
        # - a list of rewards, indexed by time step
        # TODO: Implement this method.
        states = []
        actions = []
        rewards = []
        self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        state_dimension = env.observation_space.shape[0]
        action_dimension = env.action_space.n
        done = False
        state = env.reset()
        while not done:
            stateFit = np.reshape(state, (1, state_dimension))
            prediction = self.model.predict(stateFit)
            actionFit = np.argmax(prediction)
            #print(actionFit)
            nextState, reward, done, __ = env.step(actionFit)
            hotKeyAction = np.zeros(action_dimension)
            hotKeyAction[actionFit] = 1
            states.append(state)
            actions.append(hotKeyAction)
            rewards.append(reward)
            state = nextState
        return states, actions, rewards

    def test(self, env, num_test=100, render=False):
        #self.model.compile(loss='categorical_crossentropy', optimizer='adam')
        state_dimension = env.observation_space.shape[0]
        action_dimension = env.action_space.n
        rewards = []
        for i in range(num_test):
            done = False
            state = env.reset()
            totalReward = 0
            while not done:
                if render:
                    env.render()
                stateFit = np.reshape(state, (1, state_dimension))
                prediction = self.model.predict(stateFit)
                actionFit = np.argmax(prediction)
                nextState, reward, done, __ = env.step(actionFit)
                hotKeyAction = np.zeros(action_dimension)
                hotKeyAction[actionFit] = 1
                state = nextState
                totalReward += reward
            rewards.append(totalReward)
        rewards = np.array(rewards)
        mean = np.mean(rewards)
        std = np.std(rewards)
        self.testReward.append((mean, std))
        return
    def draw(self):
        x = range(len(self.testReward))
        y = [data[0] for data in self.testReward]
        error = [data[1] for data in self.testReward]
        plt.figure(figsize=(8, 6))
        #plt.bar(range(num_test), rewards)  # num_test and rewards are not defined in this scope
        plt.errorbar(x, y, yerr=error)
        plt.title("Simplest errorbars, 0.2 in x, 0.4 in y")
        plt.xlabel('100 tests result after training 100 episodes')
        plt.ylabel('mean reward every 100 tests ($\gamma = 1$)')
        plt.show()
        return
def parse_arguments():
    # Command-line flags are defined here.
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-config-path', dest='model_config_path',
                        type=str, default='LunarLander-v2-config.json',
                        help="Path to the model config file.")
    parser.add_argument('--num-episodes', dest='num_episodes', type=int,
                        default=50000, help="Number of episodes to train on.")
    parser.add_argument('--lr', dest='lr', type=float,
                        default=5e-4, help="The learning rate.")
    # https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    parser_group = parser.add_mutually_exclusive_group(required=False)
    parser_group.add_argument('--render', dest='render',
                              action='store_true',
                              help="Whether to render the environment.")
    parser_group.add_argument('--no-render', dest='render',
                              action='store_false',
                              help="Whether to render the environment.")
    parser.set_defaults(render=False)
    return parser.parse_args()
def main(args):
    # Parse command-line arguments.
    args = parse_arguments()
    model_config_path = args.model_config_path
    num_episodes = args.num_episodes
    lr = args.lr
    _render = args.render

    # Create the environment.
    env = gym.make('LunarLander-v2')

    # Load the policy model from file.
    with open(model_config_path, 'r') as f:
        model = keras.models.model_from_json(f.read())
    print(model.summary())
    #print(model.get_weights())

    # TODO: Train the model using REINFORCE and plot the learning curve.
    reinforce = Reinforce(model, lr)
    testFreq = 20
    for i in range(num_episodes):
        reinforce.train(env)
        if (i+1)%testFreq == 0:
            reinforce.test(env, num_test=10, render=True)
            reinforce.draw()


if __name__ == '__main__':
    main(sys.argv)
Thanks!
Answer (score: 0)
Have a look at how loss functions are defined in Keras. A loss must be a function whose first argument is the labels and whose second argument is the predictions. Also, since you are using Keras, you will probably want to build such a custom loss from Keras backend functions (although using tf functions should also work).
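For illustration, here is a minimal sketch of such a function, assuming a REINFORCE-style loss in which the discounted return G_t is folded into `y_true` by scaling the one-hot action vector. The name `reinforce_loss` and that target encoding are assumptions for the example, not code from the question:

    import keras.backend as K

    def reinforce_loss(y_true, y_pred):
        # y_pred: softmax action probabilities from the policy network
        # y_true: one-hot action vector scaled by the discounted return G_t,
        #         so this evaluates to -G_t * log(pi(a_t | s_t))
        # K.epsilon() guards against log(0)
        return -K.sum(y_true * K.log(y_pred + K.epsilon()), axis=-1)

    # Compile once (e.g. in __init__) instead of inside train():
    #   self.model.compile(loss=reinforce_loss, optimizer='adam')
    # and build the scaled targets before calling fit, for example:
    #   targets = np.array(hotKeyActions) * np.array(returns)[:, None]
    #   self.model.fit(np.array(states), targets, epochs=1, batch_size=32)

The key point is that `custom_loss` in your train method is already a concrete Tensor, while `model.compile` expects a callable (or the name of a built-in loss); that is why the `loss = loss or {}` check inside compile ends up calling `__bool__` on the tensor and raises the TypeError.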