I'm trying to train an online Proximal Policy Optimization (PPO) model with TensorFlow, but after a while the TensorFlow session starts returning NaNs. My agent then steps through the environment with these NaNs, and eventually the whole run falls apart.
A short excerpt from the console:
Action Taken [2. 1.3305835 0.9937418]
Observation [ 0.69689728 -0.46114012 -11.39961704 -0.05004346 -0.05004346
0.74720544 3.49857114 3.05071477 -1.10276782 -9.71530186]
Reward Gained -0.023699851569145534
Action Taken [2. 0.62562937 1.0081608 ]
Observation [ 0.71591491 -0.47488649 11.84026042 -0.05004346 -0.05004346 0.75886336
3.49857114 3.07180685 -1.12458586 -9.84382414]
Reward Gained -0.015462812448075767
Action Taken [nan nan nan]
Observation [ nan nan nan -0.05004346 -0.05004346 nan
nan nan nan nan]
Reward Gained nan
Action Taken [nan nan nan]
Observation [ nan nan nan -0.05004346 -0.05004346 nan
nan nan nan nan]
Reward Gained nan
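As an aside, a minimal guard like the sketch below (not part of the code above; the helper name is made up) stops the rollout at the first non-finite action, observation, or reward instead of letting NaNs keep propagating through env.step:

import numpy as np

def assert_finite(step_idx, action, observation, reward):
    # Raise as soon as the rollout produces its first non-finite value.
    for name, value in [("action", action), ("observation", observation), ("reward", reward)]:
        if not np.all(np.isfinite(value)):
            raise ValueError("Non-finite {} at step {}: {}".format(name, step_idx, value))

# Usage inside the episode loop, right after s_, r, done, _ = env.step(a):
# assert_finite(t, a, s_, r)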
My code [updated]:
import gym
import numpy as np
import tensorflow as tf
import rocket_lander_gym

EP_LEN = 200
GAMMA = 0.9
SL_LR = 1e-4
CR_LR = 1e-4
BATCH = 5
ACTOR_UPDATE_STEPS = 20
CRITIC_UPDATE_STEPS = 20
STATE_DIM, ACT_DIM = 10, 3
METHOD = [
    dict(name='kl_penalty', kl_target=0.01, lam=0.5),
    dict(name='clip', epsilon=0.2),
][1]
PRINT_DEBUG_MSG = True


class PPO:

    def __init__(self):
        self.tfsess = tf.Session()
        self.tf_state = tf.placeholder(tf.float32, [None, STATE_DIM], 'state')

        # Critic (value network)
        with tf.variable_scope('critic'):
            # Layers
            l1 = tf.layers.dense(self.tf_state, 100, tf.nn.relu)
            # Value
            self.value = tf.layers.dense(l1, 1)
            # Discounted reward: reward in the future
            self.tf_dreward = tf.placeholder(tf.float32, [None, 1], 'discounted_reward')
            # Advantage: determines the quality of an action
            self.advantage = self.tf_dreward - self.value
            # Loss function: minimize the advantage over time
            # The loss function is a mean squared error
            self.loss = tf.reduce_mean(tf.square(self.advantage))
            # Gradient descent using the Adam optimizer
            self.train_opt = tf.train.AdamOptimizer(CR_LR)
            gradients, variables = zip(*self.train_opt.compute_gradients(self.loss))
            gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
            self.train_opt = self.train_opt.apply_gradients(zip(gradients, variables))

        # Actor (policy network)
        pi, pi_params = self.tinynn('pi', trainable=True)
        old_pi, old_pi_params = self.tinynn('old_pi', trainable=False)

        # Sample actions from both the old and the new policy networks
        with tf.variable_scope('sample_action'):
            # Choose an action from the learnt distribution
            self.sample_operation = tf.squeeze(pi.sample(1), axis=0)
        with tf.variable_scope('update_old_pi'):
            # Copy the current policy parameters into the old policy
            self.update_old_pi_operation = [old_pi.assign(p) for p, old_pi in zip(pi_params, old_pi_params)]

        # Placeholders for the action and the advantage
        self.tf_action = tf.placeholder(tf.float32, [None, ACT_DIM], 'action')
        self.tf_advantage = tf.placeholder(tf.float32, [None, 1], 'advantage')

        # Compute the loss function
        with tf.variable_scope('loss'):
            with tf.variable_scope('surrogate'):
                ratio = pi.prob(self.tf_advantage) / old_pi.prob(self.tf_advantage)
                surrogate = ratio * self.tf_advantage
            # KL penalty
            if METHOD['name'] == 'kl_penalty':
                # Lambda
                self.tf_lambda = tf.placeholder(tf.float32, None, 'lambda')
                # Compute the KL divergence between the old and new policy
                kl = tf.contrib.distributions.kl_divergence(old_pi, pi)
                # Get the mean
                self.kl_mean = tf.reduce_mean(kl)
                # Compute the loss using the surrogate
                self.aloss = -(tf.reduce_mean(surrogate - self.tf_lambda * kl))
            else:
                self.aloss = -tf.reduce_mean(tf.minimum(surrogate, tf.clip_by_value(ratio, 1.-METHOD['epsilon'], 1.+METHOD['epsilon']) * self.tf_advantage))

        # Minimize the loss using gradient descent
        with tf.variable_scope('atrain'):
            self.atrain_operation = tf.train.AdamOptimizer(SL_LR)
            gradients, variables = zip(*self.atrain_operation.compute_gradients(self.aloss))
            gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
            self.atrain_operation = self.atrain_operation.apply_gradients(zip(gradients, variables))

        # Write the graph to disk
        tf.summary.FileWriter("log/", self.tfsess.graph)
        # Initialize the session
        self.tfsess.run(tf.global_variables_initializer())

    def update(self, state, action, reward):
        self.tfsess.run(self.update_old_pi_operation)
        advantage = self.tfsess.run(self.advantage, {self.tf_state: state, self.tf_dreward: reward})
        # Update actor (policy)
        if METHOD['name'] == 'kl_penalty':
            for _ in range(ACTOR_UPDATE_STEPS):
                _, kl = self.tfsess.run([self.atrain_operation, self.kl_mean], {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage, self.tf_lambda: METHOD['lam']})
                if kl > 4*METHOD['kl_target']:
                    break
            if kl < METHOD['kl_target'] / 1.5:
                # Adaptive lambda
                METHOD['lam'] /= 2
            elif kl > METHOD['kl_target'] * 1.5:
                METHOD['lam'] *= 2
            # Lambda might explode, so clip it
            METHOD['lam'] = np.clip(METHOD['lam'], 1e-4, 10)
        else:
            [self.tfsess.run(self.atrain_operation, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
        # Update critic (value)
        [self.tfsess.run(self.train_opt, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]

    def tinynn(self, name, trainable):
        with tf.variable_scope(name):
            l1 = tf.layers.dense(self.tf_state, 100, tf.nn.relu, trainable=trainable)
            mu = 2 * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, trainable=trainable)
            sigma = tf.layers.dense(l1, ACT_DIM, tf.nn.softplus, trainable=trainable)
            norm_dist = tf.distributions.Normal(loc=mu, scale=sigma)
        params = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope=name)
        return norm_dist, params

    def choose_action(self, state):
        state = state[np.newaxis, :]
        action = self.tfsess.run(self.sample_operation, {self.tf_state: state})[0]
        return np.clip(action, -1, 1)

    def get_value(self, state):
        if state.ndim < 2: state = state[np.newaxis, :]
        return self.tfsess.run(self.value, {self.tf_state: state})[0, 0]

    def train(self, env, ppo, epochs, render=True):
        # Rewards
        all_ep_r = []
        # Training loop
        for ep in range(epochs):
            # Initial state
            s = env.reset()
            # States, actions and rewards
            buffer_s, buffer_a, buffer_r = [], [], []
            # Initial reward
            ep_r = 0
            # For a single episode
            for t in range(EP_LEN):
                if render:
                    # Render the environment
                    env.render()
                # Choose the best action
                a = ppo.choose_action(s)
                # State, reward, done, info
                s_, r, done, _ = env.step(a)
                if PRINT_DEBUG_MSG:
                    print("Action Taken ", a)
                    print("Observation ", s_)
                    print("Reward Gained ", r, end='\n\n')
                # Add to buffers
                buffer_s.append(s)
                buffer_a.append(a)
                buffer_r.append((r+8)/8)  # normalize the reward; found to be useful
                s = s_
                # Total reward
                ep_r += r
                # Update PPO
                if (t+1) % BATCH == 0 or t == EP_LEN - 1:
                    # Get value
                    v_s_ = ppo.get_value(s_)
                    # Discounted reward
                    discounted_r = []
                    # Update rewards
                    for r in buffer_r[::-1]:
                        v_s_ = r + GAMMA * v_s_
                        discounted_r.append(v_s_)
                    discounted_r.reverse()
                    # Buffered states, actions and rewards
                    bs, ba, br = np.vstack(buffer_s), np.vstack(buffer_a), np.array(discounted_r)[:, np.newaxis]
                    buffer_s, buffer_a, buffer_r = [], [], []
                    ppo.update(bs, ba, br)
                # Check if done
                if done:
                    print("Simulation done.")
                    break
            # Append episode rewards
            if ep == 0: all_ep_r.append(ep_r)
            else: all_ep_r.append(all_ep_r[-1]*0.9 + ep_r*0.1)
        # Close the environment
        env.close()
        # Return all episode rewards
        return all_ep_r


if __name__ == '__main__':
    ppo = PPO()
    env = gym.make('RocketLander-v0')
    reward = ppo.train(env, ppo, 100)
    print(reward)
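Side note on the batch update above (this sketch is not part of the original code, and the numbers are made up): the loop over buffer_r[::-1] computes bootstrapped discounted returns R_t = r_t + GAMMA * R_{t+1}, seeded with the critic's value of the state after the batch. A standalone NumPy version of the same computation:

import numpy as np

GAMMA = 0.9
rewards = [1.0, 0.5, -0.2]   # buffer_r for one small batch (made-up values)
v_s_ = 2.0                   # critic's value estimate for the state after the batch

discounted = []
for r in reversed(rewards):  # walk backwards: R_t = r + GAMMA * R_{t+1}
    v_s_ = r + GAMMA * v_s_
    discounted.append(v_s_)
discounted.reverse()

print(np.round(discounted, 3))  # [2.746 1.94  1.6  ]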
Things I've tried:
Changing the BATCH value so that the PPO updates more quickly. I've been stuck on this problem for hours and can't find any solution online. I'm also new to this, so please forgive me if there are any silly mistakes.
Update: Traceback
Traceback (most recent call last):
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1334, in _do_call
return fn(*args)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1319, in _run_fn
options, feed_dict, fetch_list, target_list, run_metadata)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1407, in _call_tf_sessionrun
run_metadata)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Found Inf or NaN global norm. : Tensor had NaN values
[[{{node atrain/VerifyFinite/CheckNumerics}} = CheckNumerics[T=DT_FLOAT, message="Found Inf or NaN global norm.", _device="/job:localhost/replica:0/task:0/device:CPU:0"](atrain/global_norm/global_norm)]]
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "main.py", line 209, in <module>
reward = ppo.train(env, ppo, 100)
File "main.py", line 191, in train
ppo.update(bs, ba, br)
File "main.py", line 118, in update
[self.tfsess.run(self.atrain_operation, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
File "main.py", line 118, in <listcomp>
[self.tfsess.run(self.atrain_operation, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 929, in run
run_metadata_ptr)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1152, in _run
feed_dict_tensor, options, run_metadata)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1328, in _do_run
run_metadata)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 1348, in _do_call
raise type(e)(node_def, op, message)
tensorflow.python.framework.errors_impl.InvalidArgumentError: Found Inf or NaN global norm. : Tensor had NaN values
[[node atrain/VerifyFinite/CheckNumerics (defined at main.py:90) = CheckNumerics[T=DT_FLOAT, message="Found Inf or NaN global norm.", _device="/job:localhost/replica:0/task:0/device:CPU:0"](atrain/global_norm/global_norm)]]
Caused by op 'atrain/VerifyFinite/CheckNumerics', defined at:
File "main.py", line 207, in <module>
ppo = PPO()
File "main.py", line 90, in __init__
gradients, _ = tf.clip_by_global_norm(gradients, 1.0)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/clip_ops.py", line 265, in clip_by_global_norm
"Found Inf or NaN global norm.")
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/numerics.py", line 47, in verify_tensor_all_finite
verify_input = array_ops.check_numerics(t, message=msg)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/ops/gen_array_ops.py", line 817, in check_numerics
"CheckNumerics", tensor=tensor, message=message, name=name)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
op_def=op_def)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/util/deprecation.py", line 488, in new_func
return func(*args, **kwargs)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3274, in create_op
op_def=op_def)
File "/Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1770, in __init__
self._traceback = tf_stack.extract_stack()
InvalidArgumentError (see above for traceback): Found Inf or NaN global norm. : Tensor had NaN values
[[node atrain/VerifyFinite/CheckNumerics (defined at main.py:90) = CheckNumerics[T=DT_FLOAT, message="Found Inf or NaN global norm.", _device="/job:localhost/replica:0/task:0/device:CPU:0"](atrain/global_norm/global_norm)]]
Answer (score: 1):
For simplicity, I modified your code to run with Pendulum-v0 on Google Colab instead of the custom RocketLander-v0 environment.
Here are the modifications I made to run Pendulum-v0:
Removed the line: import rocket_lander_gym
Changed the line: STATE_DIM, ACT_DIM = 10, 3
to this: STATE_DIM, ACT_DIM = 3, 1
Changed the line: env = gym.make('RocketLander-v0')
to this: env = gym.make('Pendulum-v0')
After these minor but necessary modifications to run Pendulum-v0, your code still produced NaNs in the final print(reward) statement. This indicates that the problem is most likely in the code itself and is unlikely to be an issue with the game environment.
Output of the final print(reward) statement before the problem was fixed (NaNs all the way to the end of the output):
[-1239.414496251207, -1267.7001978172505, -1247.1635071416315, -1255.8660458301786, -1246.770645397439, -1259.1171723968932, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]
On closer inspection, I found the following issues and made the following changes to your code, which finally got rid of the NaNs.
(The actual problems causing the NaNs are in points 5 and 6: the magic number 2 that you use as a multiplier for mu does not match the clipping limits addressed in point 6.)
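To make that mismatch concrete (this check is mine, not part of the original answer): Pendulum-v0 actions live in a Box with bounds ±2, so a mu scaled by 2 can legitimately reach ±2 while choose_action was clipping every sampled action to [-1, 1]:

import gym

env = gym.make('Pendulum-v0')
# Pendulum-v0 uses a Box(-2.0, 2.0, (1,)) action space.
print(env.action_space.low, env.action_space.high)  # [-2.] [2.]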
1) Your probability ratio was wrong, so I changed this:
ratio = pi.prob(self.tf_advantage) / old_pi.prob(self.tf_advantage)
to this:
ratio = pi.prob(self.tf_action) / old_pi.prob(self.tf_action)
2) You have two self.train_opt assignments:
self.train_opt = tf.train.AdamOptimizer(CR_LR)
self.train_opt = self.train_opt.apply_gradients(zip(gradients, variables))
so I changed the second self.train_opt statement to:
self.ctrain_op = self.train_opt.apply_gradients(zip(gradients, variables))
3) self.atrain_operation is the optimizer, so I replaced this line:
self.atrain_operation = self.atrain_operation.apply_gradients(zip(gradients, variables))
with:
self.atrain_op = self.atrain_operation.apply_gradients(zip(gradients, variables))
4) Correspondingly, the commented-out lines in the update function are replaced as well:
#[self.tfsess.run(self.atrain_operation, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
[self.tfsess.run(self.atrain_op, {self.tf_state: state, self.tf_action: action, self.tf_advantage: advantage}) for _ in range(ACTOR_UPDATE_STEPS)]
#[self.tfsess.run(self.train_opt, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]
[self.tfsess.run(self.ctrain_op, {self.tf_state: state, self.tf_dreward: reward}) for _ in range(CRITIC_UPDATE_STEPS)]
5) In the tinynn function, instead of multiplying by 2:
mu = 2 * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, trainable=trainable)
use:
mu = self.env.action_space.high * tf.layers.dense(l1, ACT_DIM, tf.nn.tanh, name='mu', trainable=trainable)
6) And instead of using this in the choose_action function:
return np.clip(action, -1, 1)
use:
return np.clip(a, self.env.action_space.low, self.env.action_space.high)
7) I also pass env to PPO() so that tinynn can access the environment:
"""
if __name__ == '__main__':
    ppo = PPO()
    #env = gym.make('RocketLander-v0')
    env = gym.make('Pendulum-v0')
    reward = ppo.train(env, ppo, 100)
    print(reward)
"""
if __name__ == '__main__':
    #env = gym.make('RocketLander-v0')
    env = gym.make('Pendulum-v0')
    ppo = PPO(env)
    reward = ppo.train(env, ppo, 100)
    print(reward)
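One extra hardening step that is not part of the answer above: the direct probability ratio pi.prob(a) / old_pi.prob(a) can still blow up to inf or NaN when the old policy assigns near-zero density to an action. A common alternative is to form the ratio in log space; a minimal TF 1.x sketch with stand-in distributions (in the posted code, pi, old_pi and self.tf_action would take their place):

import tensorflow as tf

tf_action = tf.placeholder(tf.float32, [None, 3], 'action_demo')
pi = tf.distributions.Normal(loc=tf.zeros(3), scale=tf.ones(3))             # stand-in for the new policy
old_pi = tf.distributions.Normal(loc=tf.zeros(3), scale=2.0 * tf.ones(3))   # stand-in for the old policy

# exp(log pi(a) - log old_pi(a)) avoids dividing by a near-zero density,
# and bounding the exponent keeps the ratio finite.
log_ratio = pi.log_prob(tf_action) - old_pi.log_prob(tf_action)
ratio = tf.exp(tf.clip_by_value(log_ratio, -20.0, 20.0))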
After fixing the problems (no more NaNs), here is the output of the final print(reward) statement:
[-1076.4211985938728, -1089.7948555704293, -1115.6341917789869, -1147.7961139172062, -1162.9589624975872, -1193.6444573268725, -1214.9662239699737, -1219.295151702447, -1228.3773779343328, -1211.7559065793157, -1239.1770034164979, -1256.5497739717612, -1248.942050034072, -1251.5809026533057, -1246.350714892043, -1223.1414157442061, -1231.5288547710811, -1223.5475405502032, -1217.095971096193, -1215.639878904649, -1182.084416025169, -1174.3085216226718, -1176.5976104186886, -1188.5439312195451, -1160.6565487872776, -1132.5758139546506, -1148.7299082836548, -1149.1097155137375, -1124.4154423538491, -1100.4411098048593, -1081.2445587548245, -1035.7597376533809, -1039.5657416397464, -1046.8627585876952, -1007.554202371864, -997.4072232047926, -924.0742105089892, -872.5268280283873, -889.6594740458157, -929.8577808816676, -957.1616193294444, -887.3960001717214, -811.6005555799227, -769.4648914456843, -692.6909819129986, -623.7238271047137, -656.6829518032941, -629.9657550649539, -651.9125731231816, -678.5172027274579, -683.0097144683796, -640.7089935328387, -589.4306203212271, -556.3242756529115, -526.881331084439, -539.3604006694065, -511.27673189202727, -526.1856726355412, -512.7768642430646, -514.7892695498354, -527.2777710366902, -516.3731318862425, -504.3876365547384, -466.66983741261095, -446.0724507306932, -414.25670263412803, -449.7266236253488, -471.7990471628901, -492.56922815695845, -455.6665136249609, -436.67493361178475, -393.1425637497276, -445.3335873259794, -440.30325932671377, -437.07634044015583, -406.7068409952513, -379.062809279313, -444.46652386541916, -439.60389029825603, -422.0043960746679, -424.80904663279813, -486.0321568909586, -476.00519893661306, -493.3553901668465, -457.4723683354885, -450.83268159600254, -458.6995892890558, -514.3951245072926, -519.3061062950538, -507.1919061966863, -469.59914342990675, -422.66056322913045, -439.53868966691357, -395.9325190449425, -369.7488471733708, -398.1944563259144, -397.3649275140671, -401.18423175784426, -400.9083352836444, -374.0640183220304]