from flask_restful import Api
api = Api(app)
from flask_restful import Resource, reqparse
class MyResource(Resource):
def __init__(self, **kwargs):
assert db_session.is_active
def get(self, version):
parser = reqparse.RequestParser()
if version == 'v0':
#do something
else if version == 'v1':
#do another thing
#alert the user the version is incorrect
Traceback (most recent call last):
File "train.py", line 346, in <module>
File "train.py", line 313, in main
policy.update(observes, actions, advantages, logger) # update policy
File "/home/ryan/trpo_fractal8NN/trpo/policy.py", line 87, in update
old_means, old_logvars, old_logp])
File "/home/ryan/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training.py", line 973, in train_on_batch
class_weight=class_weight, reset_metrics=reset_metrics)
File "/home/ryan/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_v2_utils.py", line 264, in train_on_batch
File "/home/ryan/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_eager.py", line 311, in train_on_batch
File "/home/ryan/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/engine/training_eager.py", line 272, in _process_single_batch
model.optimizer.apply_gradients(zip(grads, trainable_weights))
File "/home/ryan/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py", line 427, in apply_gradients
grads_and_vars = _filter_grads(grads_and_vars)
File "/home/ryan/.local/lib/python3.6/site-packages/tensorflow_core/python/keras/optimizer_v2/optimizer_v2.py", line 1025, in _filter_grads
([v.name for _, v in grads_and_vars],))
ValueError: No gradients provided for any variable: ['dense_4/kernel:0', 'dense_4/bias:0', 'dense_5/kernel:0', 'dense_5/bias:0', 'dense_6/kernel:0', 'dense_6/bias:0', 'dense_7/kernel:0', 'dense_7/bias:0', 'Variable:0'].
NN Policy with KL Divergence Constraint
Written by Patrick Coady (pat-coady.github.io)
import tensorflow.keras.backend as K
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Layer, Input
from tensorflow.keras.optimizers import Adam
import numpy as np
class Policy(object):
def __init__(self, obs_dim, act_dim, kl_targ, hid1_mult, init_logvar):
obs_dim: num observation dimensions (int)
act_dim: num action dimensions (int)
kl_targ: target KL divergence between pi_old and pi_new
hid1_mult: size of first hidden layer, multiplier of obs_dim
init_logvar: natural log of initial policy variance
self.beta = 1.0 # dynamically adjusted D_KL loss multiplier
eta = 50 # multiplier for D_KL-kl_targ hinge-squared loss
self.kl_targ = kl_targ
self.epochs = 20
self.lr_multiplier = 1.0 # dynamically adjust lr when D_KL out of control
self.trpo = TRPO(obs_dim, act_dim, hid1_mult, kl_targ, init_logvar, eta)
self.policy = self.trpo.get_policy()
self.pol_model = self.policy.get_pol_model()
self.lr = self.policy.get_lr() # lr calculated based on size of PolicyNN
self.logprob_calc = LogProb()
self.run_trpo_once = 'run'
def sample(self, obs):
"""Draw sample from policy."""
act_means, act_logvars = self.policy.call_PolNN(obs)
act_stddevs = np.exp(act_logvars / 2)
return np.random.normal(act_means, act_stddevs).astype(np.float32)
def update(self, observes, actions, advantages, logger):
""" Update policy based on observations, actions and advantages
observes: observations, shape = (N, obs_dim)
actions: actions, shape = (N, act_dim)
advantages: advantages, shape = (N,)
logger: Logger object, see utils.py
if self.run_trpo_once == 'run':
# run to initialize loss in pol_model first
old_means = np.ones(len(observes))
old_means = old_means[:,np.newaxis]
old_logvars = [[-1.]]
old_logp = np.ones(len(observes))
old_logp = old_logp[:,np.newaxis]
kl, entropy = self.trpo.call_TRPO([observes, actions, advantages,
old_means, old_logvars, old_logp])
# then compile
self.pol_model.compile(optimizer=Adam(self.lr * self.lr_multiplier))
self.run_trpo_once == 'ran'
K.set_value(self.pol_model.optimizer.lr, self.lr * self.lr_multiplier)
# K.set_value(self.pol_model.beta, self.beta)
old_means, old_logvars = self.policy.call_PolNN(observes)
old_means = old_means.numpy()
old_logvars = old_logvars.numpy()
old_logp = self.logprob_calc.call_LogP([actions, old_means, old_logvars])
old_logp = old_logp.numpy()
loss, kl, entropy = 0, 0, 0
for e in range(self.epochs):
# observes - states
# actions - the 8x1 vector of values
# advantages - how much better a particular policy surface is from the old one
# old_means, old_logvars - parameters that specify the old policy surface
# old_logp - the old log probs for the actions
# train_on_batch - Single gradient update or model evaluation over one batch of samples
loss = self.pol_model.train_on_batch([observes, actions, advantages,
old_means, old_logvars, old_logp])
# predict_on_batch - Returns predictions for a single batch of samples - output of the model
kl, entropy = self.pol_model.predict_on_batch([observes, actions, advantages,
old_means, old_logvars, old_logp])
kl, entropy = np.mean(kl), np.mean(entropy)
if kl > self.kl_targ * 4: # early stopping if D_KL diverges badly
# TODO: too many "magic numbers" in next 8 lines of code, need to clean up
if kl > self.kl_targ * 2: # servo beta to reach D_KL target
self.beta = np.minimum(35, 1.5 * self.beta) # max clip beta
if self.beta > 30 and self.lr_multiplier > 0.1:
self.lr_multiplier /= 1.5 # if kl is too large, reduce the learning rate so that the new weights move at a slower rate
elif kl < self.kl_targ / 2:
self.beta = np.maximum(1 / 35, self.beta / 1.5) # min clip beta
if self.beta < (1 / 30) and self.lr_multiplier < 10:
self.lr_multiplier *= 1.5 # if kl is too small, increase the learning rate so that the new weights move at a faster rate
logger.log({'PolicyLoss': loss,
'PolicyEntropy': entropy,
'KL': kl,
'Beta': self.beta,
'_lr_multiplier': self.lr_multiplier})
class PolicyNN():
""" Neural net for policy approximation function.
Policy parameterized by Gaussian means and variances. NN outputs mean
action based on observation. Trainable variables hold log-variances
for each action dimension (i.e. variances not determined by NN).
def __init__(self, obs_dim, act_dim, hid1_mult, init_logvar, **kwargs):
super(PolicyNN, self).__init__(**kwargs)
self.obs_dim = obs_dim
self.act_dim = act_dim
self.hid1_mult = hid1_mult
self.batch_sz = 1
self.init_logvar = init_logvar
self.pol_model = self.build_model()
def build(self, input_shape):
self.batch_sz = input_shape[0]
def build_model(self):
obs = Input(shape=(self.obs_dim,), dtype='float32')
hid1_units = self.obs_dim * self.hid1_mult
hid3_units = self.act_dim * 10 # 10 empirically determined
hid2_units = int(np.sqrt(hid1_units * hid3_units))
self.lr = 9e-4 / np.sqrt(hid2_units) # 9e-4 empirically determined
# heuristic to set learning rate based on NN size (tuned on 'Hopper-v1')
self.dense1 = Dense(hid1_units, activation='tanh', input_shape=(self.obs_dim,))
self.dense2 = Dense(hid2_units, activation='tanh', input_shape=(hid1_units,))
self.dense3 = Dense(hid3_units, activation='tanh', input_shape=(hid2_units,))
self.dense4 = Dense(self.act_dim, input_shape=(hid3_units,))
y = self.dense1(obs)
y = self.dense2(y)
y = self.dense3(y)
means = self.dense4(y)
self.model = Model(inputs=obs, outputs=means)
# logvar_speed increases learning rate for log-variances.
# heuristic sets logvar_speed based on network size.
logvar_speed = (10 * hid3_units) // 48
self.logvars = self.model.add_weight(shape=(logvar_speed, self.act_dim),
trainable=True, initializer='zeros')
print('Policy Params -- h1: {}, h2: {}, h3: {}, lr: {:.3g}, logvar_speed: {}'
.format(hid1_units, hid2_units, hid3_units, self.lr, logvar_speed))
return self.model
def call_PolNN(self, inputs, **kwargs):
# y = self.dense1(inputs)
# y = self.dense2(y)
# y = self.dense3(y)
# means = self.dense4(y)
means = self.model(inputs)
logvars = K.sum(self.logvars, axis=0, keepdims=True) + self.init_logvar
logvars = K.tile(logvars, (self.batch_sz, 1))
return [means, logvars]
def get_pol_model(self):
return self.pol_model
def get_lr(self):
return self.lr
class KLEntropy():
Layer calculates:
1. KL divergence between old and new distributions
2. Entropy of present policy
def __init__(self, **kwargs):
super(KLEntropy, self).__init__(**kwargs)
self.act_dim = None
def build(self, input_shape):
self.act_dim = input_shape[0][1]
def call_KLE(self, inputs, **kwargs):
old_means, old_logvars, new_means, new_logvars = inputs
log_det_cov_old = K.sum(old_logvars, axis=-1, keepdims=True)
log_det_cov_new = K.sum(new_logvars, axis=-1, keepdims=True)
trace_old_new = K.sum(K.exp(old_logvars - new_logvars), axis=-1, keepdims=True)
kl = 0.5 * (log_det_cov_new - log_det_cov_old + trace_old_new +
K.sum(K.square(new_means - old_means) /
K.exp(new_logvars), axis=-1, keepdims=True) -
entropy = 0.5 * (np.float32(self.act_dim) * (np.log(2 * np.pi) + 1.0) +
K.sum(new_logvars, axis=-1, keepdims=True))
return [kl, entropy]
class LogProb():
"""Layer calculates log probabilities of a batch of actions."""
def __init__(self, **kwargs):
super(LogProb, self).__init__(**kwargs)
def call_LogP(self, inputs, **kwargs):
# actions - the actual values for actions
# act_means - the current guess for the mean of the Gaussian dist for actions at a specific state
# act_logvars - the current guess for the variance of the Gaussian dist for actions at a specific state
actions, act_means, act_logvars = inputs
logp = -0.5 * K.sum(act_logvars, axis=-1, keepdims=True)
logp += -0.5 * K.sum(K.square(actions - act_means) / K.exp(act_logvars),
axis=-1, keepdims=True)
return logp
class TRPO():
def __init__(self, obs_dim, act_dim, hid1_mult, kl_targ, init_logvar, eta, **kwargs):
super(TRPO, self).__init__(**kwargs)
self.kl_targ = kl_targ
self.eta = eta
self.policy = PolicyNN(obs_dim, act_dim, hid1_mult, init_logvar)
self.logprob = LogProb()
self.kl_entropy = KLEntropy()
self.pol_model = self.policy.get_pol_model()
# self.beta = self.pol_model.add_weight('beta', initializer='zeros', trainable=False)
self.beta = 1.0
def call_TRPO(self, inputs):
# obs - states
# act - action (vector of length 8)
# adv - advantage of the new policy surface compared to the old one
# old_means, old_logvars - the old policy surface (states X actions X probability of that state/action pair)
# old_logp - log probabilities from the old policy surface
obs, act, adv, old_means, old_logvars, old_logp = inputs
new_means, new_logvars = self.policy.call_PolNN(obs) # the new policy surface
new_logp = self.logprob.call_LogP([act, new_means, new_logvars]) # the log probabilities of the new actions
kl, entropy = self.kl_entropy.call_KLE([old_means, old_logvars, # kl is the distance from the old policy surface to the new policy surface
new_means, new_logvars])
loss1 = -K.mean(adv * K.exp(new_logp - old_logp)) # rewards for if there are advantages for the new policy surface
loss2 = K.mean(self.beta * kl) # we are putting higher cost on a higher change in the policy surface
# TODO - Take mean before or after hinge loss?
loss3 = self.eta * K.square(K.maximum(0.0, K.mean(kl) - 2.0 * self.kl_targ)) # more loss on the kl distance
self.pol_model.add_loss(lambda: loss1 + loss2 + loss3)
return [kl, entropy]
def get_policy(self):
return self.policy
我不确定,但是我想知道是否在add_loss中使用lambda术语作为" add_loss(lambda:loss1 + loss2 + loss3)"来避免此错误可能导致我的问题。
WARNING:tensorflow:Output output_1 missing from loss dictionary. We assume this was done on purpose. The fit and evaluate APIs will not be expecting any data to be passed to output_1.
WARNING:tensorflow:Output output_2 missing from loss dictionary. We assume this was done on purpose. The fit and evaluate APIs will not be expecting any data to be passed to output_2.