I have tried different hyperparameters and different numbers of layers and nodes, but even after 2000 episodes my model does not learn anything. I also tried the MountainCarContinuous-v0 environment, and that did not work either.
I have also tried different architectures and models from GitHub, but my model still does not learn anything.
import numpy as np
import tensorflow as tf
import random
import gym
import pylab
import sys
from keras.initializers import RandomUniform
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.layers import Dense, Input, Add, Concatenate, Flatten, GaussianNoise, Lambda
from keras import backend as K
from collections import deque
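# Share one TF1 session with Keras so raw TF ops (the actor's gradient update) and Keras models can coexist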
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
K.set_session(sess)
EPISODES = 100000
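
# Actor: deterministic policy mu(s) -> a, scaled to the action bound by the final Lambda layer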
class Actor(object):
    def __init__(self, sess, state_size, action_size, TAU, lr, action_bound=1, load=False):
        self.sess = sess
        self.TAU = TAU
        self.lr = lr
        self.load = load
        self.action_bound = action_bound
        self.model, self.weights, self.state = self.build_network(state_size, action_size)
        self.target_model, self.target_weights, self.target_state = self.build_network(state_size, action_size)
        # Deterministic policy gradient: ascend dQ/da * dmu/dtheta (hence the minus sign below)
        self.q_grads = tf.placeholder(tf.float32, [None, action_size])
        self.grads = tf.gradients(self.model.output, self.weights, -self.q_grads)
        self.optimize = tf.train.AdamOptimizer(lr).apply_gradients(zip(self.grads, self.weights))
        self.sess.run(tf.global_variables_initializer())
        if self.load:
            self.model.load_weights("./DDPG_Actor.h5")
            self.target_model.load_weights("./DDPG_Actor_target.h5")

    def train(self, state, grads):
        self.sess.run(self.optimize, feed_dict={self.state: state, self.q_grads: grads})

    def update(self):
        # Soft (Polyak) target update: target <- TAU*online + (1 - TAU)*target
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU*W[i] + (1 - self.TAU)*target_W[i]
        self.target_model.set_weights(target_W)

    def save(self):
        self.model.save_weights("./DDPG_Actor.h5")
        self.target_model.save_weights("./DDPG_Actor_target.h5")

    def build_network(self, state_size, action_size):
        input = Input(shape=[state_size])
        X = Dense(400, activation='relu', kernel_initializer='glorot_normal')(input)
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        output = Dense(action_size, activation='tanh', kernel_initializer='glorot_normal')(X)
        output = Lambda(lambda i: i*self.action_bound)(output)
        model = Model(inputs=input, outputs=output)
        return model, model.trainable_weights, input
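
# Critic: Q(s, a) network trained on TD targets; its gradient dQ/da is fed back to the actor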
class Critic(object):
    def __init__(self, sess, state_size, action_size, TAU, lr, load=False):
        self.sess = sess
        self.TAU = TAU
        self.lr = lr
        self.load = load
        self.optimizer = tf.train.AdamOptimizer(lr)
        self.model, self.state, self.action = self.build_network(state_size, action_size)
        self.target_model, self.target_state, self.target_action = self.build_network(state_size, action_size)
        # Gradient of Q with respect to the action input, used for the actor update
        self.q_grads = tf.gradients(self.model.output, self.action)
        self.sess.run(tf.global_variables_initializer())
        if self.load:
            self.model.load_weights("./DDPG_Critic.h5")
            self.target_model.load_weights("./DDPG_Critic_target.h5")

    def gradients(self, state, action):
        return self.sess.run(self.q_grads, feed_dict={self.state: state, self.action: action})[0]

    def save(self):
        self.model.save_weights("./DDPG_Critic.h5")
        self.target_model.save_weights("./DDPG_Critic_target.h5")

    def update(self):
        # Soft (Polyak) target update, same as the actor
        W, target_W = self.model.get_weights(), self.target_model.get_weights()
        for i in range(len(W)):
            target_W[i] = self.TAU*W[i] + (1 - self.TAU)*target_W[i]
        self.target_model.set_weights(target_W)

    def build_network(self, state_size, action_size):
        S = Input(shape=[state_size])
        A = Input(shape=[action_size])
        X1 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(S)
        X2 = Dense(400, activation='relu', kernel_initializer='glorot_normal')(A)
        X = Add()([X1, X2])
        X = Dense(300, activation='relu', kernel_initializer='glorot_normal')(X)
        # Q(s, a) is a single scalar, so the output layer has one unit
        output = Dense(1, activation='linear', kernel_initializer='glorot_normal')(X)
        model = Model(inputs=[S, A], outputs=output)
        model.compile(loss='mse', optimizer=Adam(lr=self.lr))
        return model, S, A
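
# DDPG agent: replay memory, Ornstein-Uhlenbeck exploration noise with a decaying scale, and soft target updates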
class DDPG(object):
    def __init__(self, sess, state_size, action_size, action_bound=1, memory_size=5000, batch_size=64, actor_lr=0.0001, critic_lr=0.001, gamma=0.99, TAU=0.001):
        self.sess = sess
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=memory_size)
        self.batch_size = batch_size
        self.actor_lr = actor_lr
        self.critic_lr = critic_lr
        self.gamma = gamma
        self.TAU = TAU
        self.train_start = 1000
        # Exploration noise scale and Ornstein-Uhlenbeck parameters
        self.epsilon = 1
        self.epsilon_min = 0.001
        self.mu = 0.0
        self.x = 0
        self.theta = 0.01
        self.sigma = 0.1
        self.epsilon_decay = (self.epsilon - self.epsilon_min) / 100000
        self.actor = Actor(sess, state_size, action_size, TAU, actor_lr, action_bound, load=False)
        self.critic = Critic(sess, state_size, action_size, TAU, critic_lr, load=False)

    def append(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))
        if self.epsilon > self.epsilon_min:
            self.epsilon -= self.epsilon_decay

    def OU(self):
        # One step of an Ornstein-Uhlenbeck process for temporally correlated exploration noise
        dx = self.theta*(self.mu - self.x) + self.sigma*np.random.randn(1)
        self.x += dx
        return self.x

    def get_action(self, state):
        action = self.actor.model.predict(state)[0]
        return action + self.OU()*self.epsilon

    def save(self):
        self.actor.save()
        self.critic.save()

    def train(self):
        if len(self.memory) < self.train_start:
            return
        batch_size = min(self.batch_size, len(self.memory))
        mini_batch = random.sample(self.memory, batch_size)
        states = np.asarray([e[0] for e in mini_batch])
        states = np.reshape(states, [batch_size, self.state_size])
        actions = np.asarray([e[1] for e in mini_batch])
        rewards = np.asarray([e[2] for e in mini_batch])
        next_states = np.asarray([e[3] for e in mini_batch])
        next_states = np.reshape(next_states, [batch_size, self.state_size])
        done = np.asarray([e[4] for e in mini_batch])
        # TD target: r + gamma * Q'(s', mu'(s')), computed with the target networks
        target = np.zeros_like(actions)
        target_q_values = self.critic.target_model.predict([next_states, self.actor.target_model.predict(next_states)])
        for i in range(len(mini_batch)):
            if done[i]:
                target[i] = rewards[i]
            else:
                target[i] = rewards[i] + self.gamma*target_q_values[i]
        loss = self.critic.model.train_on_batch([states, actions], target)
        # Actor update: follow dQ/da evaluated at the actor's own actions
        action_for_grad = self.actor.model.predict(states)
        q_grads = self.critic.gradients(states, action_for_grad)
        self.actor.train(states, q_grads)
        self.actor.update()
        self.critic.update()
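
# Training loop on Pendulum-v0 (3-dim observation, 1-dim action in [-2, 2]); the agent trains every 20 environment steps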
env = gym.make('Pendulum-v0')
state_size = env.observation_space.shape[0]
action_size = 1
action_bound = env.action_space.high
agent = DDPG(sess, state_size, action_size, action_bound)
scores, episodes = [], []
for e in range(EPISODES):
    done = False
    score = 0
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    step = 0
    while not done:
        action = agent.get_action(state)
        #print(action)
        next_state, reward, done, _ = env.step([action])
        next_state = np.reshape(next_state, [1, state_size])
        score += reward[0]
        agent.append(state, action, reward, next_state, done)
        state = next_state
        step += 1
        if step % 20 == 0:
            agent.train()
        if done:
            scores.append(score)
            episodes.append(e)
            pylab.plot(episodes, scores, 'b')
            pylab.savefig("./DDPG_Pendulum.png")
            print("episode:", e, " score:", score, " epsilon:", agent.epsilon)
            #if np.mean(scores[-min(10, len(scores)) :]) > -120 :
            #    sys.exit()
    if e % 50 == 0:
        agent.save()
I always get a total reward of around -1450 per episode.